diff --git a/Dockerfile b/Dockerfile index cb571bab..1c8b682d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -393,7 +393,7 @@ VOLUME "$DATA_DIR" EXPOSE 8000 HEALTHCHECK --interval=30s --timeout=20s --retries=15 \ - CMD curl --silent 'http://localhost:8000/health/' | grep -q 'OK' + CMD curl --silent 'http://admin.archivebox.localhost:8000/health/' | grep -q 'OK' ENTRYPOINT ["dumb-init", "--", "/app/bin/docker_entrypoint.sh"] CMD ["archivebox", "server", "--quick-init", "0.0.0.0:8000"] diff --git a/README.md b/README.md index 2da5f877..6a5117c0 100644 --- a/README.md +++ b/README.md @@ -104,7 +104,8 @@ archivebox init --setup curl -fsSL 'https://get.archivebox.io' | bash
-Open http://localhost:8000 to see your server's Web UI ➡️ +Open http://web.archivebox.localhost:8000 for the public UI and http://admin.archivebox.localhost:8000 for the admin UI ➡️
+Set LISTEN_HOST to change the base domain; web. and admin. subdomains are used automatically.
@@ -469,6 +470,7 @@ For more discussion on managed and paid hosting options see here: http://localhost:8000 to see your server's Web UI ➡️ +Open http://web.archivebox.localhost:8000 for the public UI and http://admin.archivebox.localhost:8000 for the admin UI ➡️
+Set LISTEN_HOST to change the base domain; web. and admin. subdomains are used automatically.

For more info, see our Usage: Web UI wiki. ➡️

diff --git a/archivebox/api/auth.py b/archivebox/api/auth.py index ae58e1e3..da537606 100644 --- a/archivebox/api/auth.py +++ b/archivebox/api/auth.py @@ -127,6 +127,20 @@ class UsernameAndPasswordAuth(UserPassAuthCheck, HttpBasicAuth): """Allow authenticating by passing username & password via HTTP Basic Authentication (not recommended)""" pass +class DjangoSessionAuth: + """Allow authenticating with existing Django session cookies (same-origin only).""" + def __call__(self, request: HttpRequest) -> Optional[AbstractBaseUser]: + return self.authenticate(request) + + def authenticate(self, request: HttpRequest, **kwargs) -> Optional[AbstractBaseUser]: + user = getattr(request, 'user', None) + if user and user.is_authenticated: + request._api_auth_method = self.__class__.__name__ + if not user.is_superuser: + raise HttpError(403, 'Valid session but User does not have permission (make sure user.is_superuser=True)') + return cast(AbstractBaseUser, user) + return None + ### Enabled Auth Methods API_AUTH_METHODS = [ @@ -134,5 +148,4 @@ API_AUTH_METHODS = [ BearerTokenAuth(), QueryParamTokenAuth(), # django_auth_superuser, # django admin cookie auth, not secure to use with csrf=False - UsernameAndPasswordAuth(), ] diff --git a/archivebox/api/middleware.py b/archivebox/api/middleware.py new file mode 100644 index 00000000..952503b1 --- /dev/null +++ b/archivebox/api/middleware.py @@ -0,0 +1,34 @@ +__package__ = 'archivebox.api' + +from django.http import HttpResponse + + +class ApiCorsMiddleware: + """Attach permissive CORS headers for API routes (token-based auth).""" + + def __init__(self, get_response): + self.get_response = get_response + + def __call__(self, request): + if request.path.startswith('/api/'): + if request.method == 'OPTIONS' and request.META.get('HTTP_ACCESS_CONTROL_REQUEST_METHOD'): + response = HttpResponse(status=204) + return self._add_cors_headers(request, response) + + response = self.get_response(request) + return self._add_cors_headers(request, 
response) + + return self.get_response(request) + + def _add_cors_headers(self, request, response): + origin = request.META.get('HTTP_ORIGIN') + if not origin: + return response + + response['Access-Control-Allow-Origin'] = '*' + response['Access-Control-Allow-Methods'] = 'GET, POST, PUT, PATCH, DELETE, OPTIONS' + response['Access-Control-Allow-Headers'] = ( + 'Authorization, X-ArchiveBox-API-Key, Content-Type, X-CSRFToken' + ) + response['Access-Control-Max-Age'] = '600' + return response diff --git a/archivebox/api/v1_core.py b/archivebox/api/v1_core.py index f49f05af..12f68509 100644 --- a/archivebox/api/v1_core.py +++ b/archivebox/api/v1_core.py @@ -188,6 +188,11 @@ class SnapshotSchema(Schema): return ArchiveResult.objects.none() +class SnapshotUpdateSchema(Schema): + status: str | None = None + retry_at: datetime | None = None + + class SnapshotFilterSchema(FilterSchema): id: Optional[str] = Field(None, q=['id__icontains', 'timestamp__startswith']) created_by_id: str = Field(None, q='crawl__created_by_id') @@ -225,6 +230,31 @@ def get_snapshot(request, snapshot_id: str, with_archiveresults: bool = True): return Snapshot.objects.get(Q(id__icontains=snapshot_id)) +@router.patch("/snapshot/{snapshot_id}", response=SnapshotSchema, url_name="patch_snapshot") +def patch_snapshot(request, snapshot_id: str, data: SnapshotUpdateSchema): + """Update a snapshot (e.g., set status=sealed to cancel queued work).""" + try: + snapshot = Snapshot.objects.get(Q(id__startswith=snapshot_id) | Q(timestamp__startswith=snapshot_id)) + except Snapshot.DoesNotExist: + snapshot = Snapshot.objects.get(Q(id__icontains=snapshot_id)) + + payload = data.dict(exclude_unset=True) + + if 'status' in payload: + if payload['status'] not in Snapshot.StatusChoices.values: + raise HttpError(400, f'Invalid status: {payload["status"]}') + snapshot.status = payload['status'] + if snapshot.status == Snapshot.StatusChoices.SEALED and 'retry_at' not in payload: + snapshot.retry_at = None + + if 
'retry_at' in payload: + snapshot.retry_at = payload['retry_at'] + + snapshot.save(update_fields=['status', 'retry_at', 'modified_at']) + request.with_archiveresults = False + return snapshot + + ### Tag ######################################################################### class TagSchema(Schema): diff --git a/archivebox/api/v1_crawls.py b/archivebox/api/v1_crawls.py index d450b766..36cf5f20 100644 --- a/archivebox/api/v1_crawls.py +++ b/archivebox/api/v1_crawls.py @@ -3,11 +3,13 @@ __package__ = 'archivebox.api' from uuid import UUID from typing import List from datetime import datetime +from django.utils import timezone from django.db.models import Q from django.contrib.auth import get_user_model from ninja import Router, Schema +from ninja.errors import HttpError from archivebox.core.models import Snapshot from archivebox.crawls.models import Crawl @@ -54,6 +56,11 @@ class CrawlSchema(Schema): return Snapshot.objects.none() +class CrawlUpdateSchema(Schema): + status: str | None = None + retry_at: datetime | None = None + + @router.get("/crawls", response=List[CrawlSchema], url_name="get_crawls") def get_crawls(request): return Crawl.objects.all().distinct() @@ -79,3 +86,32 @@ def get_crawl(request, crawl_id: str, as_rss: bool=False, with_snapshots: bool=F return crawl + +@router.patch("/crawl/{crawl_id}", response=CrawlSchema, url_name="patch_crawl") +def patch_crawl(request, crawl_id: str, data: CrawlUpdateSchema): + """Update a crawl (e.g., set status=sealed to cancel queued work).""" + crawl = Crawl.objects.get(id__icontains=crawl_id) + payload = data.dict(exclude_unset=True) + + if 'status' in payload: + if payload['status'] not in Crawl.StatusChoices.values: + raise HttpError(400, f'Invalid status: {payload["status"]}') + crawl.status = payload['status'] + if crawl.status == Crawl.StatusChoices.SEALED and 'retry_at' not in payload: + crawl.retry_at = None + + if 'retry_at' in payload: + crawl.retry_at = payload['retry_at'] + + 
crawl.save(update_fields=['status', 'retry_at', 'modified_at']) + + if payload.get('status') == Crawl.StatusChoices.SEALED: + Snapshot.objects.filter( + crawl=crawl, + status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED], + ).update( + status=Snapshot.StatusChoices.SEALED, + retry_at=None, + modified_at=timezone.now(), + ) + return crawl diff --git a/archivebox/cli/archivebox_persona.py b/archivebox/cli/archivebox_persona.py index 0eb21b86..4a53e513 100644 --- a/archivebox/cli/archivebox_persona.py +++ b/archivebox/cli/archivebox_persona.py @@ -15,6 +15,7 @@ Examples: # Create a new persona archivebox persona create work archivebox persona create --import=chrome personal + archivebox persona create --import=edge work # List all personas archivebox persona list @@ -34,6 +35,7 @@ import subprocess import tempfile from pathlib import Path from typing import Optional, Iterable +from collections import OrderedDict import rich_click as click from rich import print as rprint @@ -78,34 +80,6 @@ def get_chrome_user_data_dir() -> Optional[Path]: return None -def get_firefox_profile_dir() -> Optional[Path]: - """Get the default Firefox profile directory for the current platform.""" - system = platform.system() - home = Path.home() - - if system == 'Darwin': - profiles_dir = home / 'Library' / 'Application Support' / 'Firefox' / 'Profiles' - elif system == 'Linux': - profiles_dir = home / '.mozilla' / 'firefox' - elif system == 'Windows': - app_data = Path(os.environ.get('APPDATA', home / 'AppData' / 'Roaming')) - profiles_dir = app_data / 'Mozilla' / 'Firefox' / 'Profiles' - else: - return None - - if not profiles_dir.exists(): - return None - - # Find the default profile (usually ends with .default or .default-release) - for profile in profiles_dir.iterdir(): - if profile.is_dir() and ('default' in profile.name.lower()): - return profile - - # If no default found, return the first profile - profiles = [p for p in profiles_dir.iterdir() if p.is_dir()] - 
return profiles[0] if profiles else None - - def get_brave_user_data_dir() -> Optional[Path]: """Get the default Brave user data directory for the current platform.""" system = platform.system() @@ -134,25 +108,99 @@ def get_brave_user_data_dir() -> Optional[Path]: return None +def get_edge_user_data_dir() -> Optional[Path]: + """Get the default Edge user data directory for the current platform.""" + system = platform.system() + home = Path.home() + + if system == 'Darwin': + candidates = [ + home / 'Library' / 'Application Support' / 'Microsoft Edge', + ] + elif system == 'Linux': + candidates = [ + home / '.config' / 'microsoft-edge', + home / '.config' / 'microsoft-edge-beta', + home / '.config' / 'microsoft-edge-dev', + ] + elif system == 'Windows': + local_app_data = Path(os.environ.get('LOCALAPPDATA', home / 'AppData' / 'Local')) + candidates = [ + local_app_data / 'Microsoft' / 'Edge' / 'User Data', + ] + else: + candidates = [] + + for candidate in candidates: + if candidate.exists() and (candidate / 'Default').exists(): + return candidate + + return None + + BROWSER_PROFILE_FINDERS = { 'chrome': get_chrome_user_data_dir, 'chromium': get_chrome_user_data_dir, # Same locations - 'firefox': get_firefox_profile_dir, 'brave': get_brave_user_data_dir, + 'edge': get_edge_user_data_dir, } +CHROMIUM_BROWSERS = {'chrome', 'chromium', 'brave', 'edge'} + # ============================================================================= # Cookie Extraction via CDP # ============================================================================= +NETSCAPE_COOKIE_HEADER = [ + '# Netscape HTTP Cookie File', + '# https://curl.se/docs/http-cookies.html', + '# This file was generated by ArchiveBox persona cookie extraction', + '#', + '# Format: domain\\tincludeSubdomains\\tpath\\tsecure\\texpiry\\tname\\tvalue', + '', +] + + +def _parse_netscape_cookies(path: Path) -> "OrderedDict[tuple[str, str, str], tuple[str, str, str, str, str, str, str]]": + cookies = OrderedDict() + if not 
path.exists(): + return cookies + + for line in path.read_text().splitlines(): + if not line or line.startswith('#'): + continue + parts = line.split('\t') + if len(parts) < 7: + continue + domain, include_subdomains, cookie_path, secure, expiry, name, value = parts[:7] + key = (domain, cookie_path, name) + cookies[key] = (domain, include_subdomains, cookie_path, secure, expiry, name, value) + return cookies + + +def _write_netscape_cookies(path: Path, cookies: "OrderedDict[tuple[str, str, str], tuple[str, str, str, str, str, str, str]]") -> None: + lines = list(NETSCAPE_COOKIE_HEADER) + for cookie in cookies.values(): + lines.append('\t'.join(cookie)) + path.write_text('\n'.join(lines) + '\n') + + +def _merge_netscape_cookies(existing_file: Path, new_file: Path) -> None: + existing = _parse_netscape_cookies(existing_file) + new = _parse_netscape_cookies(new_file) + for key, cookie in new.items(): + existing[key] = cookie + _write_netscape_cookies(existing_file, existing) + + def extract_cookies_via_cdp(user_data_dir: Path, output_file: Path) -> bool: """ Launch Chrome with the given user data dir and extract cookies via CDP. Returns True if successful, False otherwise. 
""" - from archivebox.config.constants import CONSTANTS + from archivebox.config.common import STORAGE_CONFIG # Find the cookie extraction script chrome_plugin_dir = Path(__file__).parent.parent / 'plugins' / 'chrome' @@ -163,14 +211,21 @@ def extract_cookies_via_cdp(user_data_dir: Path, output_file: Path) -> bool: return False # Get node modules dir - node_modules_dir = CONSTANTS.LIB_DIR / 'npm' / 'node_modules' + node_modules_dir = STORAGE_CONFIG.LIB_DIR / 'npm' / 'node_modules' # Set up environment env = os.environ.copy() env['NODE_MODULES_DIR'] = str(node_modules_dir) env['CHROME_USER_DATA_DIR'] = str(user_data_dir) - env['COOKIES_OUTPUT_FILE'] = str(output_file) env['CHROME_HEADLESS'] = 'true' + output_path = output_file + temp_output = None + temp_dir = None + if output_file.exists(): + temp_dir = Path(tempfile.mkdtemp(prefix='ab_cookies_')) + temp_output = temp_dir / 'cookies.txt' + output_path = temp_output + env['COOKIES_OUTPUT_FILE'] = str(output_path) try: result = subprocess.run( @@ -182,6 +237,8 @@ def extract_cookies_via_cdp(user_data_dir: Path, output_file: Path) -> bool: ) if result.returncode == 0: + if temp_output and temp_output.exists(): + _merge_netscape_cookies(output_file, temp_output) return True else: rprint(f'[yellow]Cookie extraction failed: {result.stderr}[/yellow]', file=sys.stderr) @@ -196,6 +253,9 @@ def extract_cookies_via_cdp(user_data_dir: Path, output_file: Path) -> bool: except Exception as e: rprint(f'[yellow]Cookie extraction error: {e}[/yellow]', file=sys.stderr) return False + finally: + if temp_dir and temp_dir.exists(): + shutil.rmtree(temp_dir, ignore_errors=True) # ============================================================================= @@ -323,6 +383,9 @@ def create_personas( # Import browser profile if requested if import_from and source_profile_dir: + cookies_file = Path(persona.path) / 'cookies.txt' + + if import_from in CHROMIUM_BROWSERS: persona_chrome_dir = Path(persona.CHROME_USER_DATA_DIR) # Copy the browser 
profile @@ -349,7 +412,6 @@ def create_personas( rprint(f'[green]Copied browser profile to persona[/green]', file=sys.stderr) # Extract cookies via CDP - cookies_file = Path(persona.path) / 'cookies.txt' rprint(f'[dim]Extracting cookies via CDP...[/dim]', file=sys.stderr) if extract_cookies_via_cdp(persona_chrome_dir, cookies_file): @@ -589,7 +651,7 @@ def main(): @main.command('create') @click.argument('names', nargs=-1) -@click.option('--import', 'import_from', help='Import profile from browser (chrome, firefox, brave)') +@click.option('--import', 'import_from', help='Import profile from browser (chrome, chromium, brave, edge)') def create_cmd(names: tuple, import_from: Optional[str]): """Create Personas, optionally importing from a browser profile.""" sys.exit(create_personas(names, import_from=import_from)) diff --git a/archivebox/cli/archivebox_server.py b/archivebox/cli/archivebox_server.py index b9273e31..afc4542a 100644 --- a/archivebox/cli/archivebox_server.py +++ b/archivebox/cli/archivebox_server.py @@ -3,6 +3,9 @@ __package__ = 'archivebox.cli' from typing import Iterable +import os +import sys +import subprocess import rich_click as click from rich import print @@ -60,6 +63,26 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,), pass if run_in_debug: + os.environ['ARCHIVEBOX_RUNSERVER'] = '1' + if reload: + os.environ['ARCHIVEBOX_AUTORELOAD'] = '1' + os.environ['ARCHIVEBOX_ORCHESTRATOR_MANAGED_BY_WATCHER'] = '1' + from archivebox.config.common import STORAGE_CONFIG + pidfile = str(STORAGE_CONFIG.TMP_DIR / 'runserver.pid') + os.environ['ARCHIVEBOX_RUNSERVER_PIDFILE'] = pidfile + + from django.utils.autoreload import DJANGO_AUTORELOAD_ENV + is_reloader_child = os.environ.get(DJANGO_AUTORELOAD_ENV) == 'true' + if not is_reloader_child: + env = os.environ.copy() + env['ARCHIVEBOX_ORCHESTRATOR_WATCHER'] = '1' + subprocess.Popen( + [sys.executable, '-m', 'archivebox', 'manage', 'orchestrator_watch', f'--pidfile={pidfile}'], + env=env, + 
stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) + from django.core.management import call_command print('[green][+] Starting ArchiveBox webserver in DEBUG mode...[/green]') print(f' [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]') @@ -79,7 +102,6 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,), is_port_in_use, ) from archivebox.workers.orchestrator import Orchestrator - import sys # Check if port is already in use if is_port_in_use(host, int(port)): diff --git a/archivebox/config/common.py b/archivebox/config/common.py index edf7b602..c6359279 100644 --- a/archivebox/config/common.py +++ b/archivebox/config/common.py @@ -99,8 +99,11 @@ class ServerConfig(BaseConfigSet): SECRET_KEY: str = Field(default_factory=lambda: get_random_string(50, "abcdefghijklmnopqrstuvwxyz0123456789_")) BIND_ADDR: str = Field(default="127.0.0.1:8000") + LISTEN_HOST: str = Field(default="archivebox.localhost:8000") + ADMIN_BASE_URL: str = Field(default="") + ARCHIVE_BASE_URL: str = Field(default="") ALLOWED_HOSTS: str = Field(default="*") - CSRF_TRUSTED_ORIGINS: str = Field(default="http://localhost:8000,http://127.0.0.1:8000,http://0.0.0.0:8000") + CSRF_TRUSTED_ORIGINS: str = Field(default="http://admin.archivebox.localhost:8000") SNAPSHOTS_PER_PAGE: int = Field(default=40) PREVIEW_ORIGINALS: bool = Field(default=True) diff --git a/archivebox/config/constants.py b/archivebox/config/constants.py index 9e78d722..c1f6ae44 100644 --- a/archivebox/config/constants.py +++ b/archivebox/config/constants.py @@ -118,6 +118,10 @@ class ConstantsDict(Mapping): DEFAULT_CLI_COLORS: Dict[str, str] = DEFAULT_CLI_COLORS DISABLED_CLI_COLORS: Dict[str, str] = benedict({k: '' for k in DEFAULT_CLI_COLORS}) + # Hard safety limits (seconds) + MAX_HOOK_RUNTIME_SECONDS: int = 60 * 60 * 12 # 12 hours + MAX_SNAPSHOT_RUNTIME_SECONDS: int = 60 * 60 * 12 # 12 hours + 
ALLOWDENYLIST_REGEX_FLAGS: int = re.IGNORECASE | re.UNICODE | re.MULTILINE STATICFILE_EXTENSIONS: frozenset[str] = frozenset(( diff --git a/archivebox/core/admin_archiveresults.py b/archivebox/core/admin_archiveresults.py index b4c420b7..70353578 100644 --- a/archivebox/core/admin_archiveresults.py +++ b/archivebox/core/admin_archiveresults.py @@ -14,6 +14,7 @@ from archivebox.config.common import SERVER_CONFIG from archivebox.misc.paginators import AccelleratedPaginator from archivebox.base_models.admin import BaseModelAdmin from archivebox.hooks import get_plugin_icon +from archivebox.core.host_utils import build_snapshot_url from archivebox.core.models import ArchiveResult, Snapshot @@ -57,7 +58,11 @@ def render_archiveresults_list(archiveresults_qs, limit=50): # Build output link - use embed_path() which checks output_files first embed_path = result.embed_path() if hasattr(result, 'embed_path') else None - output_link = f'/{result.snapshot.archive_path}/{embed_path}' if embed_path and result.status == 'succeeded' else f'/{result.snapshot.archive_path}/' + snapshot_id = str(getattr(result, 'snapshot_id', '')) + if embed_path and result.status == 'succeeded': + output_link = build_snapshot_url(snapshot_id, embed_path) + else: + output_link = build_snapshot_url(snapshot_id, '') # Get version - try cmd_version field version = result.cmd_version if result.cmd_version else '-' @@ -252,7 +257,7 @@ class ArchiveResultInline(admin.TabularInline): class ArchiveResultAdmin(BaseModelAdmin): list_display = ('id', 'created_at', 'snapshot_info', 'tags_str', 'status', 'plugin_with_icon', 'cmd_str', 'output_str') sort_fields = ('id', 'created_at', 'plugin', 'status') - readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'plugin_with_icon') + readonly_fields = ('cmd', 'cmd_version', 'pwd', 'cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'plugin_with_icon') search_fields = ('id', 
'snapshot__url', 'plugin', 'output_str', 'cmd_version', 'cmd', 'snapshot__timestamp') autocomplete_fields = ['snapshot'] @@ -300,10 +305,11 @@ class ArchiveResultAdmin(BaseModelAdmin): description='Snapshot Info' ) def snapshot_info(self, result): + snapshot_id = str(result.snapshot_id) return format_html( - '[{}]   {}   {}
', - result.snapshot.archive_path, - str(result.snapshot.id)[:8], + '[{}]   {}   {}
', + build_snapshot_url(snapshot_id, "index.html"), + snapshot_id[:8], result.snapshot.bookmarked_at.strftime('%Y-%m-%d %H:%M'), result.snapshot.url[:128], ) @@ -335,10 +341,10 @@ class ArchiveResultAdmin(BaseModelAdmin): # Determine output link path - use embed_path() which checks output_files embed_path = result.embed_path() if hasattr(result, 'embed_path') else None output_path = embed_path if (result.status == 'succeeded' and embed_path) else 'index.html' + snapshot_id = str(result.snapshot_id) return format_html( - '↗️
{}
', - result.snapshot.archive_path, - output_path, + '↗️
{}
', + build_snapshot_url(snapshot_id, output_path), result.output_str, ) @@ -348,7 +354,11 @@ class ArchiveResultAdmin(BaseModelAdmin): '
{}

', result.output_str, ) - output_html += format_html('See result files ...
', str(result.snapshot.archive_path))
+        snapshot_id = str(result.snapshot_id)
+        output_html += format_html(
+            'See result files ...
',
+            build_snapshot_url(snapshot_id, "index.html"),
+        )
         embed_path = result.embed_path() if hasattr(result, 'embed_path') else ''
         path_from_embed = (snapshot_dir / (embed_path or ''))
         output_html += format_html('{}/{}

', str(snapshot_dir), str(embed_path)) diff --git a/archivebox/core/admin_snapshots.py b/archivebox/core/admin_snapshots.py index d75198ff..25c89e15 100644 --- a/archivebox/core/admin_snapshots.py +++ b/archivebox/core/admin_snapshots.py @@ -8,6 +8,8 @@ from django.contrib import admin, messages from django.urls import path from django.utils.html import format_html, mark_safe from django.utils import timezone +from django.db.models import Q, Sum, Count, Prefetch +from django.db.models.functions import Coalesce from django import forms from django.template import Template, RequestContext from django.contrib.admin.helpers import ActionForm @@ -18,11 +20,12 @@ from archivebox.misc.util import htmldecode, urldecode from archivebox.misc.paginators import AccelleratedPaginator from archivebox.misc.logging_util import printable_filesize from archivebox.search.admin import SearchResultsAdminMixin +from archivebox.core.host_utils import build_snapshot_url, build_web_url from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin from archivebox.workers.tasks import bg_archive_snapshots, bg_add -from archivebox.core.models import Tag, Snapshot +from archivebox.core.models import Tag, Snapshot, ArchiveResult from archivebox.core.admin_archiveresults import ArchiveResultInline, render_archiveresults_list from archivebox.core.widgets import TagEditorWidget, InlineTagEditorWidget @@ -36,7 +39,7 @@ class SnapshotActionForm(ActionForm): super().__init__(*args, **kwargs) # Define tags field in __init__ to avoid database access during app initialization self.fields['tags'] = forms.CharField( - label='Edit tags', + label='', required=False, widget=TagEditorWidget(), ) @@ -67,6 +70,19 @@ class SnapshotActionForm(ActionForm): # ) +class TagNameListFilter(admin.SimpleListFilter): + title = 'By tag name' + parameter_name = 'tag' + + def lookups(self, request, model_admin): + return [(str(tag.pk), tag.name) for tag in Tag.objects.order_by('name')] + + def queryset(self, 
request, queryset): + if self.value(): + return queryset.filter(tags__id=self.value()) + return queryset + + class SnapshotAdminForm(forms.ModelForm): """Custom form for Snapshot admin with tag editor widget.""" tags_editor = forms.CharField( @@ -117,11 +133,11 @@ class SnapshotAdminForm(forms.ModelForm): class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin): form = SnapshotAdminForm - list_display = ('created_at', 'title_str', 'status_with_progress', 'files', 'size_with_stats', 'health_display', 'url_str') - sort_fields = ('title_str', 'url_str', 'created_at', 'status', 'crawl') + list_display = ('created_at', 'preview_icon', 'title_str', 'tags_inline', 'status_with_progress', 'files', 'size_with_stats') + sort_fields = ('title_str', 'created_at', 'status', 'crawl') readonly_fields = ('admin_actions', 'status_info', 'imported_timestamp', 'created_at', 'modified_at', 'downloaded_at', 'output_dir', 'archiveresults_list') search_fields = ('id', 'url', 'timestamp', 'title', 'tags__name') - list_filter = ('created_at', 'downloaded_at', 'archiveresult__status', 'crawl__created_by', 'tags__name') + list_filter = ('created_at', 'downloaded_at', 'archiveresult__status', 'crawl__created_by', TagNameListFilter) fieldsets = ( ('URL', { @@ -163,7 +179,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin): ) ordering = ['-created_at'] - actions = ['add_tags', 'remove_tags', 'update_titles', 'update_snapshots', 'resnapshot_snapshot', 'overwrite_snapshots', 'delete_snapshots'] + actions = ['add_tags', 'remove_tags', 'update_snapshots', 'resnapshot_snapshot', 'overwrite_snapshots', 'delete_snapshots'] inlines = [] # Removed TagInline, using TagEditorWidget instead list_per_page = min(max(5, SERVER_CONFIG.SNAPSHOTS_PER_PAGE), 5000) @@ -182,6 +198,13 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin): self.message_user(request, f'Error occurred while loading the page: {str(e)} {request.GET} 
{request.POST}') return super().changelist_view(request, GLOBAL_CONTEXT) + def get_actions(self, request): + actions = super().get_actions(request) + if 'delete_selected' in actions: + func, name, _desc = actions['delete_selected'] + actions['delete_selected'] = (func, name, 'Delete') + return actions + def get_urls(self): urls = super().get_urls() @@ -196,6 +219,52 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin): # self.request = request # return super().get_queryset(request).prefetch_related('archiveresult_set').distinct() # .annotate(archiveresult_count=Count('archiveresult')) + def get_queryset(self, request): + self.request = request + ordering_fields = self._get_ordering_fields(request) + needs_size_sort = 'size_with_stats' in ordering_fields + needs_files_sort = 'files' in ordering_fields + needs_tags_sort = 'tags_inline' in ordering_fields + + prefetch_qs = ArchiveResult.objects.filter( + Q(status='succeeded') + ).only( + 'id', + 'snapshot_id', + 'plugin', + 'status', + 'output_size', + 'output_files', + 'output_str', + ) + + qs = ( + super() + .get_queryset(request) + .defer('config', 'notes') + .prefetch_related('tags') + .prefetch_related(Prefetch('archiveresult_set', queryset=prefetch_qs)) + ) + + if needs_size_sort: + qs = qs.annotate( + output_size_sum=Coalesce(Sum( + 'archiveresult__output_size', + filter=Q(archiveresult__status='succeeded'), + ), 0), + ) + + if needs_files_sort: + qs = qs.annotate( + ar_succeeded_count=Count( + 'archiveresult', + filter=Q(archiveresult__status='succeeded'), + ), + ) + if needs_tags_sort: + qs = qs.annotate(tag_count=Count('tags', distinct=True)) + + return qs @admin.display(description="Imported Timestamp") def imported_timestamp(self, obj): @@ -233,17 +302,19 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin): # ) def admin_actions(self, obj): + summary_url = build_web_url(f'/{obj.archive_path}') + results_url = 
build_web_url(f'/{obj.archive_path}/index.html#all') return format_html( '''
📄 Summary Page 📁 Result Files @@ -263,7 +334,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin): title="Get missing extractors" onmouseover="this.style.background='#d1fae5';" onmouseout="this.style.background='#ecfdf5';"> - ⬇️ Get Missing + ⬇️ Finish Tip: Action buttons link to the list view with this snapshot pre-selected. Select it and use the action dropdown to execute.

''', - obj.archive_path, - obj.archive_path, + summary_url, + results_url, obj.url, obj.pk, obj.pk, @@ -301,6 +372,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin): ) def status_info(self, obj): + favicon_url = build_snapshot_url(str(obj.id), 'favicon.ico') return format_html( ''' Archived: {} ({} files {})     @@ -310,7 +382,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin): '✅' if obj.is_archived else '❌', obj.num_outputs, self.size(obj) or '0kb', - f'/{obj.archive_path}/favicon.ico', + favicon_url, obj.extension or '-', ) @@ -323,7 +395,37 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin): ordering='title', ) def title_str(self, obj): - # Render inline tag editor widget + title_raw = (obj.title or '').strip() + url_raw = (obj.url or '').strip() + title_normalized = title_raw.lower() + url_normalized = url_raw.lower() + show_title = bool(title_raw) and title_normalized != 'pending...' and title_normalized != url_normalized + css_class = 'fetched' if show_title else 'pending' + + detail_url = build_web_url(f'/{obj.archive_path}/index.html') + title_html = '' + if show_title: + title_html = format_html( + '
' + '{}' + '', + detail_url, + css_class, + urldecode(htmldecode(title_raw))[:128], + ) + + return format_html( + '{}' + '
' + '{}' + '
', + title_html, + url_raw or obj.url, + (url_raw or obj.url)[:128], + ) + + @admin.display(description='Tags', ordering='tag_count') + def tags_inline(self, obj): widget = InlineTagEditorWidget(snapshot_id=str(obj.pk)) tags_html = widget.render( name=f'tags_{obj.pk}', @@ -331,28 +433,58 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin): attrs={'id': f'tags_{obj.pk}'}, snapshot_id=str(obj.pk), ) + return mark_safe(f'{tags_html}') - # Show title if available, otherwise show URL - display_text = obj.title or obj.url - css_class = 'fetched' if obj.title else 'pending' + @admin.display(description='Preview', empty_value='') + def preview_icon(self, obj): + results = self._get_prefetched_results(obj) + has_screenshot = False + has_favicon = False + if results is not None: + has_screenshot = any(r.plugin == 'screenshot' for r in results) + has_favicon = any(r.plugin == 'favicon' for r in results) + + if not has_screenshot and not has_favicon: + return None + + if has_screenshot: + img_url = build_snapshot_url(str(obj.id), 'screenshot/screenshot.png') + fallbacks = [ + build_snapshot_url(str(obj.id), 'screenshot.png'), + build_snapshot_url(str(obj.id), 'favicon/favicon.ico'), + build_snapshot_url(str(obj.id), 'favicon.ico'), + ] + img_alt = 'Screenshot' + preview_class = 'screenshot' + else: + img_url = build_snapshot_url(str(obj.id), 'favicon/favicon.ico') + fallbacks = [ + build_snapshot_url(str(obj.id), 'favicon.ico'), + ] + img_alt = 'Favicon' + preview_class = 'favicon' + + fallback_list = ','.join(fallbacks) + onerror_js = ( + "this.dataset.fallbacks && this.dataset.fallbacks.length ? 
" + "(this.src=this.dataset.fallbacks.split(',').shift(), " + "this.dataset.fallbacks=this.dataset.fallbacks.split(',').slice(1).join(',')) : " + "this.remove()" + ) return format_html( - '' - '' - '' - '' - '{}' - '', - obj.archive_path, - obj.archive_path, - obj.archive_path, - css_class, - urldecode(htmldecode(display_text))[:128] - ) + mark_safe(f' {tags_html}') + '{}', + img_url, + img_alt, + preview_class, + onerror_js, + fallback_list, + ) @admin.display( description='Files Saved', - # ordering='archiveresult_count', + ordering='ar_succeeded_count', ) def files(self, obj): # return '-' @@ -371,8 +503,8 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin): else: size_txt = mark_safe('...') return format_html( - '{}', - obj.archive_path, + '{}', + build_web_url(f'/{obj.archive_path}'), size_txt, ) @@ -382,7 +514,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin): ) def status_with_progress(self, obj): """Show status with progress bar for in-progress snapshots.""" - stats = obj.get_progress_stats() + stats = self._get_progress_stats(obj) # Status badge colors status_colors = { @@ -440,16 +572,13 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin): @admin.display( description='Size', + ordering='output_size_sum', ) def size_with_stats(self, obj): """Show archive size with output size from archive results.""" - stats = obj.get_progress_stats() - - # Use output_size from archive results if available, fallback to disk size + stats = self._get_progress_stats(obj) output_size = stats['output_size'] - archive_size = os.access(Path(obj.output_dir) / 'index.html', os.F_OK) and obj.archive_size - - size_bytes = output_size or archive_size or 0 + size_bytes = output_size or 0 if size_bytes: size_txt = printable_filesize(size_bytes) @@ -461,22 +590,76 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin): # Show hook statistics if stats['total'] > 0: 
return format_html( - '' + '' '{}' '
' '{}/{} hooks
', - obj.archive_path, + build_web_url(f'/{obj.archive_path}'), size_txt, stats['succeeded'], stats['total'], ) return format_html( - '{}', - obj.archive_path, + '{}', + build_web_url(f'/{obj.archive_path}'), size_txt, ) + def _get_progress_stats(self, obj): + results = self._get_prefetched_results(obj) + if results is None: + return obj.get_progress_stats() + + total = len(results) + succeeded = sum(1 for r in results if r.status == 'succeeded') + failed = sum(1 for r in results if r.status == 'failed') + running = sum(1 for r in results if r.status == 'started') + skipped = sum(1 for r in results if r.status == 'skipped') + pending = max(total - succeeded - failed - running - skipped, 0) + completed = succeeded + failed + skipped + percent = int((completed / total * 100) if total > 0 else 0) + is_sealed = obj.status not in (obj.StatusChoices.QUEUED, obj.StatusChoices.STARTED) + output_size = None + + if hasattr(obj, 'output_size_sum'): + output_size = obj.output_size_sum or 0 + else: + output_size = sum(r.output_size or 0 for r in results if r.status == 'succeeded') + + return { + 'total': total, + 'succeeded': succeeded, + 'failed': failed, + 'running': running, + 'pending': pending, + 'skipped': skipped, + 'percent': percent, + 'output_size': output_size or 0, + 'is_sealed': is_sealed, + } + + def _get_prefetched_results(self, obj): + if hasattr(obj, '_prefetched_objects_cache') and 'archiveresult_set' in obj._prefetched_objects_cache: + return obj.archiveresult_set.all() + return None + + def _get_ordering_fields(self, request): + ordering = request.GET.get('o') + if not ordering: + return set() + fields = set() + for part in ordering.split('.'): + if not part: + continue + try: + idx = abs(int(part)) - 1 + except ValueError: + continue + if 0 <= idx < len(self.list_display): + fields.add(self.list_display[idx]) + return fields + @admin.display( description='Original URL', ordering='url', @@ -524,20 +707,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, 
ConfigEditorMixin, BaseModelAdmin): # return super().changelist_view(request, extra_context=None) @admin.action( - description="ℹ️ Get Title" - ) - def update_titles(self, request, queryset): - count = queryset.count() - - # Queue snapshots for archiving via the state machine system - queued = bg_archive_snapshots(queryset, kwargs={"overwrite": True, "methods": ["title", "favicon"], "out_dir": DATA_DIR}) - messages.success( - request, - f"Queued {queued} snapshots for title/favicon update. The orchestrator will process them in the background.", - ) - - @admin.action( - description="⬇️ Get Missing" + description="⏯️ Finish" ) def update_snapshots(self, request, queryset): count = queryset.count() @@ -551,7 +721,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin): @admin.action( - description="🆕 Archive Again" + description="⬇️ Fresh" ) def resnapshot_snapshot(self, request, queryset): for snapshot in queryset: @@ -579,7 +749,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin): ) @admin.action( - description="☠️ Delete" + description="🗑️ Delete" ) def delete_snapshots(self, request, queryset): """Delete snapshots in a single transaction to avoid SQLite concurrency issues.""" diff --git a/archivebox/core/apps.py b/archivebox/core/apps.py index 4c0e438a..713d34d9 100644 --- a/archivebox/core/apps.py +++ b/archivebox/core/apps.py @@ -1,6 +1,9 @@ __package__ = 'archivebox.core' from django.apps import AppConfig +import os + +_ORCHESTRATOR_BOOTSTRAPPED = False class CoreConfig(AppConfig): @@ -10,6 +13,7 @@ class CoreConfig(AppConfig): def ready(self): """Register the archivebox.core.admin_site as the main django admin site""" import sys + from django.utils.autoreload import DJANGO_AUTORELOAD_ENV from archivebox.core.admin_site import register_admin_site register_admin_site() @@ -18,3 +22,45 @@ class CoreConfig(AppConfig): # Skip during makemigrations to avoid premature state machine access if 
'makemigrations' not in sys.argv: from archivebox.core import models # noqa: F401 + + pidfile = os.environ.get('ARCHIVEBOX_RUNSERVER_PIDFILE') + if pidfile: + should_write_pid = True + if os.environ.get('ARCHIVEBOX_AUTORELOAD') == '1': + should_write_pid = os.environ.get(DJANGO_AUTORELOAD_ENV) == 'true' + if should_write_pid: + try: + with open(pidfile, 'w') as handle: + handle.write(str(os.getpid())) + except Exception: + pass + + def _should_manage_orchestrator() -> bool: + if os.environ.get('ARCHIVEBOX_ORCHESTRATOR_MANAGED_BY_WATCHER') == '1': + return False + if os.environ.get('ARCHIVEBOX_ORCHESTRATOR_PROCESS') == '1': + return False + if os.environ.get('ARCHIVEBOX_RUNSERVER') == '1': + if os.environ.get('ARCHIVEBOX_AUTORELOAD') == '1': + return os.environ.get(DJANGO_AUTORELOAD_ENV) == 'true' + return True + + argv = ' '.join(sys.argv).lower() + if 'orchestrator' in argv: + return False + return 'daphne' in argv and '--reload' in sys.argv + + if _should_manage_orchestrator(): + global _ORCHESTRATOR_BOOTSTRAPPED + if _ORCHESTRATOR_BOOTSTRAPPED: + return + _ORCHESTRATOR_BOOTSTRAPPED = True + + from archivebox.machine.models import Process, Machine + from archivebox.workers.orchestrator import Orchestrator + + Process.cleanup_stale_running() + machine = Machine.current() + + if not Orchestrator.is_running(): + Orchestrator(exit_on_idle=False).start() diff --git a/archivebox/core/host_utils.py b/archivebox/core/host_utils.py new file mode 100644 index 00000000..2e723d05 --- /dev/null +++ b/archivebox/core/host_utils.py @@ -0,0 +1,189 @@ +from __future__ import annotations + +from __future__ import annotations + +import re +from urllib.parse import urlparse + +from archivebox.config.common import SERVER_CONFIG + + +_SNAPSHOT_ID_RE = re.compile(r"^[0-9a-fA-F-]{8,36}$") + + +def split_host_port(host: str) -> tuple[str, str | None]: + parsed = urlparse(f"//{host}") + hostname = (parsed.hostname or host or "").lower() + port = str(parsed.port) if parsed.port else None + 
return hostname, port + + +def _normalize_base_url(value: str | None) -> str: + if not value: + return "" + base = value.strip() + if not base: + return "" + if "://" not in base: + base = f"http://{base}" + parsed = urlparse(base) + if not parsed.netloc: + return "" + return f"{parsed.scheme}://{parsed.netloc}" + + +def normalize_base_url(value: str | None) -> str: + return _normalize_base_url(value) + + +def get_listen_host() -> str: + return (SERVER_CONFIG.LISTEN_HOST or "").strip() + + +def get_listen_parts() -> tuple[str, str | None]: + return split_host_port(get_listen_host()) + + +def _build_listen_host(subdomain: str | None) -> str: + host, port = get_listen_parts() + if not host: + return "" + full_host = f"{subdomain}.{host}" if subdomain else host + if port: + return f"{full_host}:{port}" + return full_host + + +def get_admin_host() -> str: + override = _normalize_base_url(SERVER_CONFIG.ADMIN_BASE_URL) + if override: + return urlparse(override).netloc.lower() + return _build_listen_host("admin") + + +def get_web_host() -> str: + override = _normalize_base_url(SERVER_CONFIG.ARCHIVE_BASE_URL) + if override: + return urlparse(override).netloc.lower() + return _build_listen_host("web") + +def get_api_host() -> str: + return _build_listen_host("api") + +def get_public_host() -> str: + return _build_listen_host("public") + + +def get_snapshot_host(snapshot_id: str) -> str: + return _build_listen_host(snapshot_id) + + +def get_original_host(domain: str) -> str: + return _build_listen_host(domain) + + +def is_snapshot_subdomain(subdomain: str) -> bool: + return bool(_SNAPSHOT_ID_RE.match(subdomain or "")) + + +def get_listen_subdomain(request_host: str) -> str: + req_host, req_port = split_host_port(request_host) + listen_host, listen_port = get_listen_parts() + if not listen_host: + return "" + if listen_port and req_port and listen_port != req_port: + return "" + if req_host == listen_host: + return "" + suffix = f".{listen_host}" + if 
req_host.endswith(suffix): + return req_host[: -len(suffix)] + return "" + + +def host_matches(request_host: str, target_host: str) -> bool: + if not request_host or not target_host: + return False + req_host, req_port = split_host_port(request_host) + target_host_only, target_port = split_host_port(target_host) + if req_host != target_host_only: + return False + if target_port and req_port and target_port != req_port: + return False + return True + + +def _scheme_from_request(request=None) -> str: + if request: + return request.scheme + return "http" + + +def _build_base_url_for_host(host: str, request=None) -> str: + if not host: + return "" + scheme = _scheme_from_request(request) + return f"{scheme}://{host}" + + +def get_admin_base_url(request=None) -> str: + override = _normalize_base_url(SERVER_CONFIG.ADMIN_BASE_URL) + if override: + return override + return _build_base_url_for_host(get_admin_host(), request=request) + + +def get_web_base_url(request=None) -> str: + override = _normalize_base_url(SERVER_CONFIG.ARCHIVE_BASE_URL) + if override: + return override + return _build_base_url_for_host(get_web_host(), request=request) + +def get_api_base_url(request=None) -> str: + return _build_base_url_for_host(get_api_host(), request=request) + + +# Backwards-compat aliases (archive == web) +def get_archive_base_url(request=None) -> str: + return get_web_base_url(request=request) + + +def get_snapshot_base_url(snapshot_id: str, request=None) -> str: + return _build_base_url_for_host(get_snapshot_host(snapshot_id), request=request) + + +def get_original_base_url(domain: str, request=None) -> str: + return _build_base_url_for_host(get_original_host(domain), request=request) + + +def build_admin_url(path: str = "", request=None) -> str: + return _build_url(get_admin_base_url(request), path) + + +def build_web_url(path: str = "", request=None) -> str: + return _build_url(get_web_base_url(request), path) + +def build_api_url(path: str = "", request=None) -> str: + 
return _build_url(get_api_base_url(request), path) + + +def build_archive_url(path: str = "", request=None) -> str: + return _build_url(get_archive_base_url(request), path) + + +def build_snapshot_url(snapshot_id: str, path: str = "", request=None) -> str: + return _build_url(get_snapshot_base_url(snapshot_id, request=request), path) + + +def build_original_url(domain: str, path: str = "", request=None) -> str: + return _build_url(get_original_base_url(domain, request=request), path) + + +def _build_url(base_url: str, path: str) -> str: + if not base_url: + if not path: + return "" + return path if path.startswith("/") else f"/{path}" + if not path: + return base_url + return f"{base_url}{path if path.startswith('/') else f'/{path}'}" diff --git a/archivebox/core/middleware.py b/archivebox/core/middleware.py index a5343196..2003b478 100644 --- a/archivebox/core/middleware.py +++ b/archivebox/core/middleware.py @@ -2,11 +2,33 @@ __package__ = 'archivebox.core' import ipaddress import re +from pathlib import Path from django.utils import timezone from django.contrib.auth.middleware import RemoteUserMiddleware +from django.contrib.auth.models import AnonymousUser from django.core.exceptions import ImproperlyConfigured +from django.shortcuts import redirect +from django.contrib.staticfiles import finders +from django.utils.http import http_date +from django.http import HttpResponseNotModified from archivebox.config.common import SERVER_CONFIG +from archivebox.config import VERSION +from archivebox.config.version import get_COMMIT_HASH +from archivebox.core.host_utils import ( + build_admin_url, + build_api_url, + build_web_url, + get_api_host, + get_admin_host, + get_listen_host, + get_listen_subdomain, + get_public_host, + get_web_host, + host_matches, + is_snapshot_subdomain, +) +from archivebox.core.views import SnapshotHostView, OriginalDomainHostView def detect_timezone(request, activate: bool=True): @@ -30,17 +52,112 @@ def TimezoneMiddleware(get_response): def 
CacheControlMiddleware(get_response): snapshot_path_re = re.compile(r"^/[^/]+/\\d{8}/[^/]+/[0-9a-fA-F-]{8,36}/") + static_cache_key = (get_COMMIT_HASH() or VERSION or "dev").strip() def middleware(request): response = get_response(request) + if request.path.startswith('/static/'): + rel_path = request.path[len('/static/'):] + static_path = finders.find(rel_path) + if static_path: + try: + mtime = Path(static_path).stat().st_mtime + except OSError: + mtime = None + etag = f'"{static_cache_key}:{int(mtime) if mtime else 0}"' + inm = request.META.get("HTTP_IF_NONE_MATCH") + if inm: + inm_list = [item.strip() for item in inm.split(",")] + if etag in inm_list or etag.strip('"') in [i.strip('"') for i in inm_list]: + not_modified = HttpResponseNotModified() + not_modified.headers["ETag"] = etag + not_modified.headers["Cache-Control"] = "public, max-age=31536000, immutable" + if mtime: + not_modified.headers["Last-Modified"] = http_date(mtime) + return not_modified + response.headers["ETag"] = etag + response.headers["Cache-Control"] = "public, max-age=31536000, immutable" + if mtime and not response.headers.get("Last-Modified"): + response.headers["Last-Modified"] = http_date(mtime) + return response + if '/archive/' in request.path or '/static/' in request.path or snapshot_path_re.match(request.path): - policy = 'public' if SERVER_CONFIG.PUBLIC_SNAPSHOTS else 'private' - response['Cache-Control'] = f'{policy}, max-age=60, stale-while-revalidate=300' - # print('Set Cache-Control header to', response['Cache-Control']) + if not response.get('Cache-Control'): + policy = 'public' if SERVER_CONFIG.PUBLIC_SNAPSHOTS else 'private' + response['Cache-Control'] = f'{policy}, max-age=60, stale-while-revalidate=300' + # print('Set Cache-Control header to', response['Cache-Control']) return response return middleware + +def HostRoutingMiddleware(get_response): + def middleware(request): + request_host = (request.get_host() or "").lower() + admin_host = get_admin_host() + web_host = 
get_web_host() + api_host = get_api_host() + public_host = get_public_host() + listen_host = get_listen_host() + subdomain = get_listen_subdomain(request_host) + + if host_matches(request_host, admin_host): + return get_response(request) + + if host_matches(request_host, api_host): + request.user = AnonymousUser() + request._cached_user = request.user + if request.path.startswith("/admin"): + target = build_admin_url(request.path, request=request) + if request.META.get("QUERY_STRING"): + target = f"{target}?{request.META['QUERY_STRING']}" + return redirect(target) + if not request.path.startswith("/api/"): + target_path = f"/api{request.path if request.path.startswith('/') else f'/{request.path}'}" + if request.META.get("QUERY_STRING"): + target_path = f"{target_path}?{request.META['QUERY_STRING']}" + return redirect(target_path) + return get_response(request) + + if host_matches(request_host, web_host): + request.user = AnonymousUser() + request._cached_user = request.user + if request.path.startswith("/admin"): + target = build_admin_url(request.path, request=request) + if request.META.get("QUERY_STRING"): + target = f"{target}?{request.META['QUERY_STRING']}" + return redirect(target) + return get_response(request) + + if host_matches(request_host, public_host): + request.user = AnonymousUser() + request._cached_user = request.user + return get_response(request) + + if subdomain: + if is_snapshot_subdomain(subdomain): + view = SnapshotHostView.as_view() + return view(request, snapshot_id=subdomain, path=request.path.lstrip("/")) + view = OriginalDomainHostView.as_view() + return view(request, domain=subdomain, path=request.path.lstrip("/")) + + if host_matches(request_host, listen_host): + target = build_web_url(request.path, request=request) + if request.META.get("QUERY_STRING"): + target = f"{target}?{request.META['QUERY_STRING']}" + return redirect(target) + + if admin_host or web_host: + target = build_web_url(request.path, request=request) + if target: + if 
request.META.get("QUERY_STRING"): + target = f"{target}?{request.META['QUERY_STRING']}" + return redirect(target) + + return get_response(request) + + return middleware + class ReverseProxyAuthMiddleware(RemoteUserMiddleware): header = 'HTTP_{normalized}'.format(normalized=SERVER_CONFIG.REVERSE_PROXY_USER_HEADER.replace('-', '_').upper()) diff --git a/archivebox/core/migrations/0031_add_archiveresult_snapshot_status_index.py b/archivebox/core/migrations/0031_add_archiveresult_snapshot_status_index.py new file mode 100644 index 00000000..cea2b04d --- /dev/null +++ b/archivebox/core/migrations/0031_add_archiveresult_snapshot_status_index.py @@ -0,0 +1,17 @@ +# Generated by Codex on 2026-01-21 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0030_alter_archiveresult_id'), + ] + + operations = [ + migrations.AddIndex( + model_name='archiveresult', + index=models.Index(fields=['snapshot', 'status'], name='archiveresult_snap_status_idx'), + ), + ] diff --git a/archivebox/core/models.py b/archivebox/core/models.py index e306fd64..b2c4d719 100755 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -1297,7 +1297,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea path = self.archive_path output = "" - output_template = '{}  ' + output_template = '{}' # Get all plugins from hooks system (sorted by numeric prefix) all_plugins = [get_plugin_name(e) for e in get_plugins()] @@ -1322,7 +1322,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea icon ) - return format_html('{}', mark_safe(output)) + return format_html('{}', mark_safe(output)) cache_result = cache.get(cache_key) if cache_result: @@ -1789,7 +1789,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea )['total_size'] or 0 # Check if sealed - is_sealed = self.status in (self.StatusChoices.SEALED, self.StatusChoices.FAILED, 
self.StatusChoices.BACKOFF) + is_sealed = self.status not in (self.StatusChoices.QUEUED, self.StatusChoices.STARTED) return { 'total': total, @@ -1992,6 +1992,14 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea size = sum(p.stat().st_size for p in abs_path.rglob('*') if p.is_file()) else: size = abs_path.stat().st_size + plugin_lower = (result.plugin or '').lower() + if plugin_lower in ('ytdlp', 'yt-dlp', 'youtube-dl'): + plugin_dir = snap_dir / result.plugin + if plugin_dir.exists(): + try: + size = sum(p.stat().st_size for p in plugin_dir.rglob('*') if p.is_file()) + except OSError: + pass outputs.append({ 'name': result.plugin, 'path': embed_path, @@ -2057,6 +2065,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea def to_dict(self, extended: bool = False) -> Dict[str, Any]: """Convert Snapshot to a dictionary (replacement for Link._asdict())""" from archivebox.misc.util import ts_to_date_str + from archivebox.core.host_utils import build_snapshot_url result = { 'TYPE': 'core.models.Snapshot', @@ -2078,6 +2087,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea 'is_static': self.is_static, 'is_archived': self.is_archived, 'archive_path': self.archive_path, + 'archive_url': build_snapshot_url(str(self.id), 'index.html'), 'output_dir': self.output_dir, 'link_dir': self.output_dir, # backwards compatibility alias 'archive_size': self.archive_size, @@ -2129,14 +2139,17 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea outputs_by_plugin = {out['name']: out for out in outputs} best_preview_path = 'about:blank' + best_result = {'path': 'about:blank', 'result': None} for plugin in preview_priority: out = outputs_by_plugin.get(plugin) if out and out.get('path'): best_preview_path = out['path'] + best_result = out break if best_preview_path == 'about:blank' and outputs: best_preview_path = outputs[0].get('path') or 'about:blank' + 
best_result = outputs[0] context = { **self.to_dict(extended=True), 'title': htmlencode(self.title or (self.base_url if self.is_archived else TITLE_LOADING_MSG)), @@ -2151,6 +2164,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea 'SAVE_ARCHIVE_DOT_ORG': SAVE_ARCHIVE_DOT_ORG, 'PREVIEW_ORIGINALS': SERVER_CONFIG.PREVIEW_ORIGINALS, 'best_preview_path': best_preview_path, + 'best_result': best_result, 'archiveresults': outputs, } rendered_html = render_to_string('snapshot.html', context) @@ -2326,6 +2340,9 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi app_label = 'core' verbose_name = 'Archive Result' verbose_name_plural = 'Archive Results Log' + indexes = [ + models.Index(fields=['snapshot', 'status'], name='archiveresult_snap_status_idx'), + ] def __str__(self): return f'[{self.id}] {self.snapshot.url[:64]} -> {self.plugin}' @@ -2487,6 +2504,20 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi plugin_lower = (plugin_name or '').lower() prefer_media = plugin_lower in ('ytdlp', 'yt-dlp', 'youtube-dl') + preferred_text = [] + if plugin_lower: + preferred_text.extend([ + f'{plugin_lower}.jsonl', + f'{plugin_lower}.json', + f'{plugin_lower}.txt', + f'{plugin_lower}.log', + ]) + preferred_text.extend(['index.jsonl', 'index.json']) + for name in preferred_text: + candidate = dir_path / name + if candidate.exists() and candidate.is_file(): + return candidate + if not prefer_media: for name in ('index.html', 'index.htm'): candidate = dir_path / name @@ -2504,6 +2535,8 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi if file_path.is_dir() or file_path.name.startswith('.'): continue ext = file_path.suffix.lstrip('.').lower() + if ext in ('pid', 'log', 'sh'): + continue if ext not in embeddable_exts: continue try: @@ -2547,20 +2580,44 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi # Fallback: treat 
output_str as a file path only if it exists on disk if self.output_str: try: - output_path = Path(self.output_str) + raw_output = str(self.output_str).strip() + if raw_output in ('.', './', ''): + best_file = self._find_best_output_file(plugin_dir, self.plugin) + if best_file: + return str(best_file.relative_to(snapshot_dir)) + output_path = None + else: + output_path = Path(raw_output) - if output_path.is_absolute(): + if output_path and output_path.is_absolute(): # If absolute and within snapshot dir, normalize to relative if snapshot_dir in output_path.parents and output_path.exists(): - return str(output_path.relative_to(snapshot_dir)) - else: + if output_path.is_file(): + return str(output_path.relative_to(snapshot_dir)) + if output_path.is_dir(): + best_file = self._find_best_output_file(output_path, self.plugin) + if best_file: + return str(best_file.relative_to(snapshot_dir)) + elif output_path: # If relative, prefer plugin-prefixed path, then direct path - if (plugin_dir / output_path).exists(): - return f'{self.plugin}/{output_path}' + plugin_candidate = plugin_dir / output_path + if plugin_candidate.exists(): + if plugin_candidate.is_file(): + return f'{self.plugin}/{output_path}' + if plugin_candidate.is_dir(): + best_file = self._find_best_output_file(plugin_candidate, self.plugin) + if best_file: + return str(best_file.relative_to(snapshot_dir)) if output_path.name in ('index.html', 'index.json') and output_path.parent == Path('.'): return None - if (snapshot_dir / output_path).exists(): - return str(output_path) + snapshot_candidate = snapshot_dir / output_path + if snapshot_candidate.exists(): + if snapshot_candidate.is_file(): + return str(output_path) + if snapshot_candidate.is_dir(): + best_file = self._find_best_output_file(snapshot_candidate, self.plugin) + if best_file: + return str(best_file.relative_to(snapshot_dir)) except Exception: pass @@ -2569,7 +2626,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi 
ignored = {'stdout.log', 'stderr.log', 'hook.pid', 'listener.pid', 'cmd.sh'} output_candidates = [ f for f in self.output_files.keys() - if Path(f).name not in ignored + if Path(f).name not in ignored and Path(f).suffix not in ('.pid', '.log', '.sh') ] first_file = output_candidates[0] if output_candidates else None if first_file and (plugin_dir / first_file).exists(): diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py index 16b6df0c..2dec9a03 100644 --- a/archivebox/core/settings.py +++ b/archivebox/core/settings.py @@ -12,6 +12,7 @@ import archivebox from archivebox.config import DATA_DIR, PACKAGE_DIR, ARCHIVE_DIR, CONSTANTS # noqa from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG, STORAGE_CONFIG # noqa +from archivebox.core.host_utils import normalize_base_url, get_admin_base_url, get_api_base_url IS_MIGRATING = "makemigrations" in sys.argv[:3] or "migrate" in sys.argv[:3] @@ -77,9 +78,11 @@ MIDDLEWARE = [ "django.middleware.security.SecurityMiddleware", "django.contrib.sessions.middleware.SessionMiddleware", "django.middleware.common.CommonMiddleware", + "archivebox.api.middleware.ApiCorsMiddleware", "django.middleware.csrf.CsrfViewMiddleware", "django.contrib.auth.middleware.AuthenticationMiddleware", "archivebox.core.middleware.ReverseProxyAuthMiddleware", + "archivebox.core.middleware.HostRoutingMiddleware", "django.contrib.messages.middleware.MessageMiddleware", "archivebox.core.middleware.CacheControlMiddleware", # Additional middlewares from plugins (if any) @@ -347,6 +350,14 @@ SECRET_KEY = SERVER_CONFIG.SECRET_KEY or get_random_string(50, "abcdefghijklmnop ALLOWED_HOSTS = SERVER_CONFIG.ALLOWED_HOSTS.split(",") CSRF_TRUSTED_ORIGINS = list(set(SERVER_CONFIG.CSRF_TRUSTED_ORIGINS.split(","))) +admin_base_url = normalize_base_url(get_admin_base_url()) +if admin_base_url and admin_base_url not in CSRF_TRUSTED_ORIGINS: + CSRF_TRUSTED_ORIGINS.append(admin_base_url) + +api_base_url = normalize_base_url(get_api_base_url()) +if 
api_base_url and api_base_url not in CSRF_TRUSTED_ORIGINS: + CSRF_TRUSTED_ORIGINS.append(api_base_url) + # automatically fix case when user sets ALLOWED_HOSTS (e.g. to archivebox.example.com) # but forgets to add https://archivebox.example.com to CSRF_TRUSTED_ORIGINS for hostname in ALLOWED_HOSTS: @@ -363,6 +374,7 @@ CSRF_COOKIE_SECURE = False SESSION_COOKIE_SECURE = False SESSION_COOKIE_HTTPONLY = True SESSION_COOKIE_DOMAIN = None +CSRF_COOKIE_DOMAIN = None SESSION_COOKIE_AGE = 1209600 # 2 weeks SESSION_EXPIRE_AT_BROWSER_CLOSE = False SESSION_SAVE_EVERY_REQUEST = False diff --git a/archivebox/core/templatetags/config_tags.py b/archivebox/core/templatetags/config_tags.py index 9921b1fb..94992075 100644 --- a/archivebox/core/templatetags/config_tags.py +++ b/archivebox/core/templatetags/config_tags.py @@ -15,6 +15,6 @@ def get_config(key: str) -> any: Usage: {% get_config "ARCHIVEDOTORG_ENABLED" as enabled %} """ try: - return _get_config(key) + return _get_config().get(key) except (KeyError, AttributeError): return None diff --git a/archivebox/core/templatetags/core_tags.py b/archivebox/core/templatetags/core_tags.py index bcf7f10d..e9a38023 100644 --- a/archivebox/core/templatetags/core_tags.py +++ b/archivebox/core/templatetags/core_tags.py @@ -9,10 +9,114 @@ from pathlib import Path from archivebox.hooks import ( get_plugin_icon, get_plugin_template, get_plugin_name, ) +from archivebox.core.host_utils import ( + get_admin_base_url, + get_web_base_url, + get_snapshot_base_url, + build_snapshot_url, +) register = template.Library() +_MEDIA_FILE_EXTS = { + '.mp4', '.webm', '.mkv', '.avi', '.mov', '.flv', '.wmv', '.m4v', '.mpg', '.mpeg', '.ts', '.m2ts', '.mts', + '.3gp', '.3g2', '.ogv', + '.mp3', '.m4a', '.aac', '.ogg', '.oga', '.opus', '.wav', '.flac', '.alac', '.aiff', '.wma', '.mka', '.ac3', '.eac3', '.dts', +} + + +def _count_media_files(result) -> int: + try: + output_files = getattr(result, 'output_files', None) or {} + except Exception: + output_files = {} + 
+ count_from_output = 0 + if output_files: + count_from_output = sum( + 1 + for path in output_files.keys() + if Path(path).suffix.lower() in _MEDIA_FILE_EXTS + ) + if count_from_output >= 2: + return count_from_output + + try: + plugin_dir = Path(result.snapshot_dir) / result.plugin + except Exception: + return 0 + + if not plugin_dir.exists(): + return 0 + + count = 0 + scanned = 0 + max_scan = 500 + for file_path in plugin_dir.rglob('*'): + if scanned >= max_scan: + break + scanned += 1 + if not file_path.is_file(): + continue + if file_path.suffix.lower() in _MEDIA_FILE_EXTS: + count += 1 + return max(count_from_output, count) + + +def _list_media_files(result) -> list[dict]: + media_files: list[dict] = [] + try: + plugin_dir = Path(result.snapshot_dir) / result.plugin + snapshot_dir = Path(result.snapshot_dir) + except Exception: + return media_files + + output_files = getattr(result, 'output_files', None) or {} + candidates: list[Path] = [] + if output_files: + for path in output_files.keys(): + rel_path = Path(path) + if rel_path.suffix.lower() in _MEDIA_FILE_EXTS: + candidates.append(rel_path) + + if not candidates and plugin_dir.exists(): + scanned = 0 + max_scan = 2000 + for file_path in plugin_dir.rglob('*'): + if scanned >= max_scan: + break + scanned += 1 + if not file_path.is_file(): + continue + if file_path.suffix.lower() in _MEDIA_FILE_EXTS: + try: + rel_path = file_path.relative_to(plugin_dir) + except ValueError: + continue + candidates.append(rel_path) + + for rel_path in candidates: + file_path = plugin_dir / rel_path + if not file_path.exists() or not file_path.is_file(): + continue + try: + size = file_path.stat().st_size + except OSError: + size = None + try: + href = str(file_path.relative_to(snapshot_dir)) + except ValueError: + href = str(Path(result.plugin) / rel_path) + media_files.append({ + 'name': file_path.name, + 'path': href, + 'size': size, + }) + + media_files.sort(key=lambda item: item['name'].lower()) + return media_files + 
@register.filter(name='split') def split(value, separator: str=','): return (value or '').split(separator) @@ -52,6 +156,28 @@ def url_replace(context, **kwargs): return dict_.urlencode() +@register.simple_tag(takes_context=True) +def admin_base_url(context) -> str: + return get_admin_base_url(request=context.get('request')) + + +@register.simple_tag(takes_context=True) +def web_base_url(context) -> str: + return get_web_base_url(request=context.get('request')) + + +@register.simple_tag(takes_context=True) +def snapshot_base_url(context, snapshot) -> str: + snapshot_id = getattr(snapshot, 'id', snapshot) + return get_snapshot_base_url(str(snapshot_id), request=context.get('request')) + + +@register.simple_tag(takes_context=True) +def snapshot_url(context, snapshot, path: str = "") -> str: + snapshot_id = getattr(snapshot, 'id', snapshot) + return build_snapshot_url(str(snapshot_id), path, request=context.get('request')) + + @register.simple_tag def plugin_icon(plugin: str) -> str: """ @@ -82,24 +208,41 @@ def plugin_card(context, result) -> str: template_str = get_plugin_template(plugin, 'card') # Use embed_path() for the display path - output_path = result.embed_path() if hasattr(result, 'embed_path') else '' + raw_output_path = result.embed_path() if hasattr(result, 'embed_path') else '' + output_url = build_snapshot_url( + str(getattr(result, 'snapshot_id', '')), + raw_output_path or '', + request=context.get('request'), + ) icon_html = get_plugin_icon(plugin) + plugin_lower = (plugin or '').lower() + media_file_count = _count_media_files(result) if plugin_lower in ('ytdlp', 'yt-dlp', 'youtube-dl') else 0 + media_files = _list_media_files(result) if plugin_lower in ('ytdlp', 'yt-dlp', 'youtube-dl') else [] + if media_files: + snapshot_id = str(getattr(result, 'snapshot_id', '')) + request = context.get('request') + for item in media_files: + path = item.get('path') or '' + item['url'] = build_snapshot_url(snapshot_id, path, request=request) if path else '' - 
output_lower = (output_path or '').lower() + output_lower = (raw_output_path or '').lower() text_preview_exts = ('.json', '.jsonl', '.txt', '.csv', '.tsv', '.xml', '.yml', '.yaml', '.md', '.log') force_text_preview = output_lower.endswith(text_preview_exts) # Create a mini template and render it with context try: - if template_str and output_path and str(output_path).strip() not in ('.', '/', './') and not force_text_preview: + if template_str and raw_output_path and str(raw_output_path).strip() not in ('.', '/', './') and not force_text_preview: tpl = template.Template(template_str) ctx = template.Context({ 'result': result, 'snapshot': result.snapshot, - 'output_path': output_path, + 'output_path': output_url, + 'output_path_raw': raw_output_path, 'plugin': plugin, 'plugin_icon': icon_html, + 'media_file_count': media_file_count, + 'media_files': media_files, }) rendered = tpl.render(ctx) # Only return non-empty content (strip whitespace to check) @@ -108,10 +251,10 @@ def plugin_card(context, result) -> str: except Exception: pass - if force_text_preview and output_path and str(output_path).strip() not in ('.', '/', './'): - output_file = Path(output_path) + if force_text_preview and raw_output_path and str(raw_output_path).strip() not in ('.', '/', './'): + output_file = Path(raw_output_path) if not output_file.is_absolute(): - output_file = Path(result.snapshot_dir) / output_path + output_file = Path(result.snapshot_dir) / raw_output_path try: output_file = output_file.resolve() snap_dir = Path(result.snapshot_dir).resolve() @@ -169,14 +312,20 @@ def plugin_full(context, result) -> str: if not template_str: return '' - output_path = result.embed_path() if hasattr(result, 'embed_path') else '' + raw_output_path = result.embed_path() if hasattr(result, 'embed_path') else '' + output_url = build_snapshot_url( + str(getattr(result, 'snapshot_id', '')), + raw_output_path or '', + request=context.get('request'), + ) try: tpl = template.Template(template_str) ctx = 
template.Context({ 'result': result, 'snapshot': result.snapshot, - 'output_path': output_path, + 'output_path': output_url, + 'output_path_raw': raw_output_path, 'plugin': plugin, }) rendered = tpl.render(ctx) @@ -198,3 +347,30 @@ def plugin_name(value: str) -> str: Usage: {{ result.plugin|plugin_name }} """ return get_plugin_name(value) + + +@register.filter +def plugin_display_name(value: str) -> str: + """ + Human-friendly plugin name overrides for UI display. + """ + name = get_plugin_name(value) + if name == 'merkletree': + return 'hashes' + return name + + +@register.simple_tag(takes_context=True) +def api_token(context) -> str: + """ + Return an API token string for the logged-in user, creating one if needed. + """ + from archivebox.api.auth import get_or_create_api_token + + request = context.get('request') + user = getattr(request, 'user', None) + if not user or not user.is_authenticated: + return '' + + token = get_or_create_api_token(user) + return token.token if token else '' diff --git a/archivebox/core/urls.py b/archivebox/core/urls.py index 708705a6..92f106e1 100644 --- a/archivebox/core/urls.py +++ b/archivebox/core/urls.py @@ -8,7 +8,7 @@ from django.views.generic.base import RedirectView from archivebox.misc.serve_static import serve_static from archivebox.core.admin_site import archivebox_admin -from archivebox.core.views import HomepageView, SnapshotView, SnapshotPathView, PublicIndexView, AddView, HealthCheckView, live_progress_view +from archivebox.core.views import HomepageView, SnapshotView, SnapshotPathView, PublicIndexView, AddView, WebAddView, HealthCheckView, live_progress_view from archivebox.workers.views import JobsDashboardView @@ -29,11 +29,15 @@ urlpatterns = [ path('docs/', RedirectView.as_view(url='https://github.com/ArchiveBox/ArchiveBox/wiki'), name='Docs'), path('public/', PublicIndexView.as_view(), name='public-index'), + path('public.html', RedirectView.as_view(url='/public/'), name='public-index-html'), path('archive/', 
RedirectView.as_view(url='/')), path('archive/<path:path>', SnapshotView.as_view(), name='Snapshot'), + re_path(r'^web/(?P<url>(?!\d{4}(?:\d{2})?(?:\d{2})?(?:/|$)).+)$', WebAddView.as_view(), name='web-add'), re_path(r'^(?P<username>[^/]+)/(?P<date>\d{4}(?:\d{2})?(?:\d{2})?)/(?P<url>https?://.*)$', SnapshotPathView.as_view(), name='snapshot-path-url'), re_path(r'^(?P<username>[^/]+)/(?P<date>\d{4}(?:\d{2})?(?:\d{2})?)/(?P<domain>[^/]+)(?:/(?P<snapshot_id>[0-9a-fA-F-]{8,36})(?:/(?P<path>.*))?)?$', SnapshotPathView.as_view(), name='snapshot-path'), + re_path(r'^(?P<username>[^/]+)/(?P<url>https?://.*)$', SnapshotPathView.as_view(), name='snapshot-path-url-nodate'), + re_path(r'^(?P<username>[^/]+)/(?P<domain>[^/]+)(?:/(?P<snapshot_id>[0-9a-fA-F-]{8,36})(?:/(?P<path>.*))?)?$', SnapshotPathView.as_view(), name='snapshot-path-nodate'), path('admin/core/snapshot/add/', RedirectView.as_view(url='/add/')), path('add/', AddView.as_view(), name='add'), diff --git a/archivebox/core/views.py b/archivebox/core/views.py index 4dd7afea..42ec421c 100644 --- a/archivebox/core/views.py +++ b/archivebox/core/views.py @@ -1,13 +1,16 @@ __package__ = 'archivebox.core' import os +import posixpath +from glob import glob, escape from django.utils import timezone import inspect from typing import Callable, get_type_hints from pathlib import Path +from urllib.parse import urlparse from django.shortcuts import render, redirect -from django.http import HttpRequest, HttpResponse, Http404 +from django.http import HttpRequest, HttpResponse, Http404, HttpResponseForbidden from django.utils.html import format_html, mark_safe from django.views import View from django.views.generic.list import ListView @@ -31,6 +34,21 @@ from archivebox.misc.logging_util import printable_filesize from archivebox.search import query_search_index from archivebox.core.models import Snapshot +from archivebox.core.host_utils import build_snapshot_url + + +def _files_index_target(snapshot: Snapshot, archivefile: str | None) -> str: + target = archivefile or '' + if target == 'index.html': + target = '' + fullpath = Path(snapshot.output_dir) / target
if fullpath.is_file(): + target = str(Path(target).parent) + if target == '.': + target = '' + return target + + from archivebox.core.forms import AddLinkForm from archivebox.crawls.models import Crawl from archivebox.hooks import get_enabled_plugins, get_plugin_name @@ -86,13 +104,95 @@ class SnapshotView(View): def render_live_index(request, snapshot): TITLE_LOADING_MSG = 'Not yet archived...' - outputs = snapshot.discover_outputs() + hidden_card_plugins = {'archivedotorg', 'favicon', 'title'} + outputs = [ + out for out in snapshot.discover_outputs() + if (out.get('size') or 0) > 0 and out.get('name') not in hidden_card_plugins + ] archiveresults = {out['name']: out for out in outputs} snap_dir = Path(snapshot.output_dir) - # Get available extractor plugins from hooks (sorted by numeric prefix for ordering) # Convert to base names for display ordering all_plugins = [get_plugin_name(e) for e in get_enabled_plugins()] + accounted_entries: set[str] = set() + for output in outputs: + output_name = output.get('name') or '' + if output_name: + accounted_entries.add(output_name) + output_path = output.get('path') or '' + if not output_path: + continue + parts = Path(output_path).parts + if parts: + accounted_entries.add(parts[0]) + + ignore_names = { + '.DS_Store', + 'index.html', + 'index.json', + 'index.jsonl', + 'favicon.ico', + } + ignored_suffixes = {'.log', '.pid', '.sh'} + max_loose_scan = 300 + + def has_meaningful_files(dir_path: Path) -> bool: + scanned = 0 + for file_path in dir_path.rglob('*'): + scanned += 1 + if scanned > max_loose_scan: + return True + if file_path.is_dir() or file_path.name.startswith('.'): + continue + if file_path.suffix.lower() in ignored_suffixes: + continue + try: + if file_path.stat().st_size == 0: + continue + except OSError: + continue + return True + return False + + unaccounted_entries = [] + if snap_dir.exists(): + for entry in snap_dir.iterdir(): + name = entry.name + if name.startswith('.') or name in ignore_names or name 
in accounted_entries: + continue + is_dir = entry.is_dir() + is_meaningful = False + size = None + if is_dir: + is_meaningful = has_meaningful_files(entry) + elif entry.is_file(): + if entry.suffix.lower() not in ignored_suffixes: + try: + size = entry.stat().st_size + is_meaningful = size > 0 + except OSError: + size = None + is_meaningful = False + + unaccounted_entries.append({ + 'name': name, + 'path': name, + 'is_dir': is_dir, + 'size': size, + 'is_meaningful': is_meaningful, + }) + + unaccounted_entries.sort(key=lambda item: item['name'].lower()) + loose_items = [item for item in unaccounted_entries if item['is_meaningful']] + failed_exclude_suffixes = {'.json', '.jsonl', '.sh', '.log'} + failed_items = [ + item for item in unaccounted_entries + if not item['is_meaningful'] + and not ( + not item['is_dir'] + and Path(item['name']).suffix.lower() in failed_exclude_suffixes + ) + ] preview_priority = [ 'singlefile', 'screenshot', @@ -111,12 +211,48 @@ class SnapshotView(View): break snapshot_info = snapshot.to_dict(extended=True) + related_snapshots_qs = SnapshotView.find_snapshots_for_url(snapshot.url) + related_snapshots = list( + related_snapshots_qs.exclude(id=snapshot.id).order_by('-bookmarked_at', '-created_at', '-timestamp')[:25] + ) + related_years_map: dict[int, list[Snapshot]] = {} + for snap in [snapshot, *related_snapshots]: + snap_dt = snap.bookmarked_at or snap.created_at or snap.downloaded_at + if not snap_dt: + continue + related_years_map.setdefault(snap_dt.year, []).append(snap) + related_years = [] + for year, snaps in related_years_map.items(): + snaps_sorted = sorted( + snaps, + key=lambda s: (s.bookmarked_at or s.created_at or s.downloaded_at or timezone.now()), + reverse=True, + ) + related_years.append({ + 'year': year, + 'latest': snaps_sorted[0], + 'snapshots': snaps_sorted, + }) + related_years.sort(key=lambda item: item['year'], reverse=True) try: warc_path = 'warc/' + list(Path(snap_dir).glob('warc/*.warc.*'))[0].name except 
IndexError: warc_path = 'warc/' + ordered_outputs = sorted( + archiveresults.values(), + key=lambda r: all_types.index(r['name']) if r['name'] in all_types else -r['size'], + ) + non_compact_outputs = [ + out for out in ordered_outputs + if not out.get('is_compact') and not out.get('is_metadata') + ] + compact_outputs = [ + out for out in ordered_outputs + if out.get('is_compact') or out.get('is_metadata') + ] + context = { **snapshot_info, 'title': htmlencode( @@ -131,9 +267,13 @@ class SnapshotView(View): 'oldest_archive_date': ts_to_date_str(snapshot.oldest_archive_date), 'warc_path': warc_path, 'PREVIEW_ORIGINALS': SERVER_CONFIG.PREVIEW_ORIGINALS, - 'archiveresults': sorted(archiveresults.values(), key=lambda r: all_types.index(r['name']) if r['name'] in all_types else -r['size']), + 'archiveresults': [*non_compact_outputs, *compact_outputs], 'best_result': best_result, 'snapshot': snapshot, # Pass the snapshot object for template tags + 'related_snapshots': related_snapshots, + 'related_years': related_years, + 'loose_items': loose_items, + 'failed_items': failed_items, } return render(template_name='core/snapshot_live.html', request=request, context=context) @@ -168,13 +308,20 @@ class SnapshotView(View): target_path = f'{target_path}?{query}' return redirect(target_path) - if archivefile == 'index.html': + if request.GET.get('files'): + target_path = _files_index_target(snapshot, archivefile) + response = serve_static_with_byterange_support( + request, target_path, document_root=snapshot.output_dir, show_indexes=True, + ) + elif archivefile == 'index.html': # if they requested snapshot index, serve live rendered template instead of static html response = self.render_live_index(request, snapshot) else: - response = serve_static_with_byterange_support( - request, archivefile, document_root=snapshot.output_dir, show_indexes=True, - ) + target = build_snapshot_url(str(snapshot.id), archivefile, request=request) + query = request.META.get('QUERY_STRING') + if 
query: + target = f'{target}?{query}' + return redirect(target) response["Link"] = f'<{snapshot.url}>; rel="canonical"' return response except Snapshot.DoesNotExist: @@ -328,13 +475,16 @@ class SnapshotView(View): class SnapshotPathView(View): """Serve snapshots by the new URL scheme: /////...""" - def get(self, request, username: str, date: str, domain: str | None = None, snapshot_id: str | None = None, path: str = "", url: str | None = None): + def get(self, request, username: str, date: str | None = None, domain: str | None = None, snapshot_id: str | None = None, path: str = "", url: str | None = None): if not request.user.is_authenticated and not SERVER_CONFIG.PUBLIC_SNAPSHOTS: return redirect(f'/admin/login/?next={request.path}') if username == 'system': return redirect(request.path.replace('/system/', '/web/', 1)) + if date and domain and domain == date: + raise Http404 + requested_url = url if not requested_url and domain and domain.startswith(('http://', 'https://')): requested_url = domain @@ -358,19 +508,20 @@ class SnapshotPathView(View): else: qs = Snapshot.objects.filter(crawl__created_by__username=username_lookup) - try: - if len(date) == 4: - qs = qs.filter(created_at__year=int(date)) - elif len(date) == 6: - qs = qs.filter(created_at__year=int(date[:4]), created_at__month=int(date[4:6])) - elif len(date) == 8: - qs = qs.filter( - created_at__year=int(date[:4]), - created_at__month=int(date[4:6]), - created_at__day=int(date[6:8]), - ) - except ValueError: - pass + if date: + try: + if len(date) == 4: + qs = qs.filter(created_at__year=int(date)) + elif len(date) == 6: + qs = qs.filter(created_at__year=int(date[:4]), created_at__month=int(date[4:6])) + elif len(date) == 8: + qs = qs.filter( + created_at__year=int(date[:4]), + created_at__month=int(date[4:6]), + created_at__day=int(date[6:8]), + ) + except ValueError: + pass if requested_url: snapshot = qs.order_by('-created_at', '-bookmarked_at', '-timestamp').first() @@ -401,7 +552,10 @@ class 
SnapshotPathView(View): ) canonical_base = snapshot.url_path - requested_base = f'{username}/{date}/{domain or url or ""}' + if date: + requested_base = f'{username}/{date}/{domain or url or ""}' + else: + requested_base = f'{username}/{domain or url or ""}' if snapshot_id: requested_base = f'{requested_base}/{snapshot_id}' if canonical_base != requested_base: @@ -412,6 +566,18 @@ class SnapshotPathView(View): return redirect(target) archivefile = path or "index.html" + if archivefile != "index.html" and not request.GET.get('files'): + target = build_snapshot_url(str(snapshot.id), archivefile, request=request) + query = request.META.get('QUERY_STRING') + if query: + target = f'{target}?{query}' + return redirect(target) + + if request.GET.get('files'): + target_path = _files_index_target(snapshot, archivefile) + return serve_static_with_byterange_support( + request, target_path, document_root=snapshot.output_dir, show_indexes=True, + ) if archivefile == "index.html": return SnapshotView.render_live_index(request, snapshot) @@ -421,6 +587,202 @@ class SnapshotPathView(View): ) +def _safe_archive_relpath(path: str) -> str | None: + if not path: + return "" + cleaned = posixpath.normpath(path) + cleaned = cleaned.lstrip("/") + if cleaned.startswith("..") or "/../" in f"/{cleaned}/": + return None + return cleaned + + +def _latest_response_match(domain: str, rel_path: str) -> tuple[Path, Path] | None: + if not domain or not rel_path: + return None + domain = domain.split(":", 1)[0].lower() + # TODO: optimize by querying output_files in DB instead of globbing filesystem + data_root = DATA_DIR / "users" + escaped_domain = escape(domain) + escaped_path = escape(rel_path) + pattern = str(data_root / "*" / "snapshots" / "*" / escaped_domain / "*" / "responses" / escaped_domain / escaped_path) + matches = glob(pattern) + if not matches: + return None + + def sort_key(match_path: str) -> tuple[str, str]: + parts = Path(match_path).parts + date_str = "" + try: + idx = 
parts.index("snapshots") + date_str = parts[idx + 1] + except Exception: + date_str = "" + return (date_str, match_path) + + best = max(matches, key=sort_key) + best_path = Path(best) + parts = best_path.parts + try: + responses_idx = parts.index("responses") + except ValueError: + return None + responses_root = Path(*parts[: responses_idx + 1]) + rel_to_root = Path(*parts[responses_idx + 1 :]) + return responses_root, rel_to_root + + +def _latest_responses_root(domain: str) -> Path | None: + if not domain: + return None + domain = domain.split(":", 1)[0].lower() + data_root = DATA_DIR / "users" + escaped_domain = escape(domain) + pattern = str(data_root / "*" / "snapshots" / "*" / escaped_domain / "*" / "responses" / escaped_domain) + matches = glob(pattern) + if not matches: + return None + + def sort_key(match_path: str) -> tuple[str, str]: + parts = Path(match_path).parts + date_str = "" + try: + idx = parts.index("snapshots") + date_str = parts[idx + 1] + except Exception: + date_str = "" + return (date_str, match_path) + + best = max(matches, key=sort_key) + return Path(best) + + +def _serve_responses_path(request, responses_root: Path, rel_path: str, show_indexes: bool): + candidates: list[str] = [] + rel_path = rel_path or "" + if rel_path.endswith("/"): + rel_path = f"{rel_path}index.html" + if "." 
not in Path(rel_path).name: + candidates.append(f"{rel_path.rstrip('/')}/index.html") + candidates.append(rel_path) + + for candidate in candidates: + try: + return serve_static_with_byterange_support( + request, + candidate, + document_root=str(responses_root), + show_indexes=show_indexes, + ) + except Http404: + pass + + if rel_path.endswith("index.html"): + rel_dir = rel_path[: -len("index.html")] + try: + return serve_static_with_byterange_support( + request, + rel_dir, + document_root=str(responses_root), + show_indexes=True, + ) + except Http404: + return None + return None + + +class SnapshotHostView(View): + """Serve snapshot directory contents on ./.""" + + def get(self, request, snapshot_id: str, path: str = ""): + if not request.user.is_authenticated and not SERVER_CONFIG.PUBLIC_SNAPSHOTS: + return HttpResponseForbidden("Public snapshots are disabled.") + snapshot = None + if snapshot_id: + try: + snapshot = Snapshot.objects.get(pk=snapshot_id) + except Snapshot.DoesNotExist: + try: + snapshot = Snapshot.objects.get(id__startswith=snapshot_id) + except Snapshot.DoesNotExist: + snapshot = None + except Snapshot.MultipleObjectsReturned: + snapshot = Snapshot.objects.filter(id__startswith=snapshot_id).first() + + if not snapshot: + raise Http404 + + rel_path = path or "" + show_indexes = bool(request.GET.get("files")) + if not rel_path or rel_path.endswith("/"): + if show_indexes: + rel_path = rel_path.rstrip("/") + else: + rel_path = f"{rel_path}index.html" + rel_path = _safe_archive_relpath(rel_path) + if rel_path is None: + raise Http404 + + try: + return serve_static_with_byterange_support( + request, + rel_path, + document_root=snapshot.output_dir, + show_indexes=show_indexes, + ) + except Http404: + pass + + # Fallback to responses// + host = urlparse(snapshot.url).hostname or snapshot.domain + responses_root = Path(snapshot.output_dir) / "responses" / host + if responses_root.exists(): + response = _serve_responses_path(request, responses_root, 
rel_path, show_indexes) + if response is not None: + return response + + raise Http404 + + +class OriginalDomainHostView(View): + """Serve responses from the most recent snapshot when using ./.""" + + def get(self, request, domain: str, path: str = ""): + if not request.user.is_authenticated and not SERVER_CONFIG.PUBLIC_SNAPSHOTS: + return HttpResponseForbidden("Public snapshots are disabled.") + rel_path = path or "" + if not rel_path or rel_path.endswith("/"): + rel_path = f"{rel_path}index.html" + rel_path = _safe_archive_relpath(rel_path) + if rel_path is None: + raise Http404 + + domain = domain.lower() + match = _latest_response_match(domain, rel_path) + if not match and "." not in Path(rel_path).name: + index_path = f"{rel_path.rstrip('/')}/index.html" + match = _latest_response_match(domain, index_path) + if not match and "." not in Path(rel_path).name: + html_path = f"{rel_path}.html" + match = _latest_response_match(domain, html_path) + + show_indexes = bool(request.GET.get("files")) + if match: + responses_root, rel_to_root = match + response = _serve_responses_path(request, responses_root, str(rel_to_root), show_indexes) + if response is not None: + return response + + # If no direct match, try serving directory index from latest responses root + responses_root = _latest_responses_root(domain) + if responses_root: + response = _serve_responses_path(request, responses_root, rel_path, show_indexes) + if response is not None: + return response + + raise Http404 + + class PublicIndexView(ListView): template_name = 'public_index.html' model = Snapshot @@ -508,7 +870,7 @@ class AddView(UserPassesTestMixin, FormView): 'available_tags': list(Tag.objects.all().order_by('name').values_list('name', flat=True)), } - def form_valid(self, form): + def _create_crawl_from_form(self, form, *, created_by_id=None) -> Crawl: urls = form.cleaned_data["url"] print(f'[+] Adding URL: {urls}') @@ -522,13 +884,21 @@ class AddView(UserPassesTestMixin, FormView): update = 
form.cleaned_data.get("update", False) index_only = form.cleaned_data.get("index_only", False) notes = form.cleaned_data.get("notes", "") - custom_config = form.cleaned_data.get("config", {}) + custom_config = form.cleaned_data.get("config") or {} from archivebox.config.permissions import HOSTNAME + if created_by_id is None: + if self.request.user.is_authenticated: + created_by_id = self.request.user.pk + else: + from archivebox.base_models.models import get_or_create_system_user_pk + created_by_id = get_or_create_system_user_pk() + + created_by_name = self.request.user.username if self.request.user.is_authenticated else 'web' # 1. save the provided urls to sources/2024-11-05__23-59-59__web_ui_add_by_user_.txt - sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__web_ui_add_by_user_{self.request.user.pk}.txt' + sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__web_ui_add_by_user_{created_by_id}.txt' sources_file.write_text(urls if isinstance(urls, str) else '\n'.join(urls)) # 2. create a new Crawl with the URLs from the file @@ -552,8 +922,8 @@ class AddView(UserPassesTestMixin, FormView): max_depth=depth, tags_str=tag, notes=notes, - label=f'{self.request.user.username}@{HOSTNAME}{self.request.path} {timestamp}', - created_by_id=self.request.user.pk, + label=f'{created_by_name}@{HOSTNAME}{self.request.path} {timestamp}', + created_by_id=created_by_id, config=config ) @@ -566,7 +936,7 @@ class AddView(UserPassesTestMixin, FormView): is_enabled=True, label=crawl.label, notes=f"Auto-created from add page. 
{notes}".strip(), - created_by_id=self.request.user.pk, + created_by_id=created_by_id, ) crawl.schedule = crawl_schedule crawl.save(update_fields=['schedule']) @@ -576,7 +946,13 @@ class AddView(UserPassesTestMixin, FormView): # from archivebox.crawls.actors import CrawlActor # from archivebox.core.actors import SnapshotActor, ArchiveResultActor + return crawl + def form_valid(self, form): + crawl = self._create_crawl_from_form(form) + + urls = form.cleaned_data["url"] + schedule = form.cleaned_data.get("schedule", "").strip() rough_url_count = urls.count('://') # Build success message with schedule link if created @@ -593,6 +969,74 @@ class AddView(UserPassesTestMixin, FormView): return redirect(crawl.admin_change_url) +class WebAddView(AddView): + def _latest_snapshot_for_url(self, requested_url: str): + return SnapshotView.find_snapshots_for_url(requested_url).order_by( + '-created_at', '-bookmarked_at', '-timestamp' + ).first() + + def _normalize_add_url(self, requested_url: str) -> str: + if requested_url.startswith(('http://', 'https://')): + return requested_url + return f'https://{requested_url}' + + def dispatch(self, request, *args, **kwargs): + requested_url = urldecode(kwargs.get('url', '') or '') + if requested_url: + snapshot = self._latest_snapshot_for_url(requested_url) + if snapshot: + return redirect(f'/{snapshot.url_path}') + + if not self.test_func(): + return HttpResponse( + format_html( + ( + '



' + 'No Snapshots match the given url: {}


' + 'Return to the Main Index' + '
' + ), + requested_url or '', + ), + content_type="text/html", + status=404, + ) + + return super().dispatch(request, *args, **kwargs) + + def get(self, request, url: str): + requested_url = urldecode(url) + if not requested_url: + raise Http404 + + snapshot = self._latest_snapshot_for_url(requested_url) + if snapshot: + return redirect(f'/{snapshot.url_path}') + + add_url = self._normalize_add_url(requested_url) + defaults_form = self.form_class() + form_data = { + 'url': add_url, + 'depth': defaults_form.fields['depth'].initial or '0', + 'persona': defaults_form.fields['persona'].initial or 'Default', + 'config': {}, + } + if defaults_form.fields['update'].initial: + form_data['update'] = 'on' + if defaults_form.fields['overwrite'].initial: + form_data['overwrite'] = 'on' + if defaults_form.fields['index_only'].initial: + form_data['index_only'] = 'on' + + form = self.form_class(data=form_data) + if not form.is_valid(): + return self.form_invalid(form) + + crawl = self._create_crawl_from_form(form) + snapshot = Snapshot.from_json({'url': add_url, 'tags': form.cleaned_data.get('tag', '')}, overrides={'crawl': crawl}) + return redirect(f'/{snapshot.url_path}') + + class HealthCheckView(View): """ A Django view that renders plain text "OK" for service discovery tools @@ -617,11 +1061,19 @@ def live_progress_view(request): from archivebox.workers.orchestrator import Orchestrator from archivebox.crawls.models import Crawl from archivebox.core.models import Snapshot, ArchiveResult + from archivebox.machine.models import Process, Machine from django.db.models import Case, When, Value, IntegerField # Get orchestrator status orchestrator_running = Orchestrator.is_running() total_workers = Orchestrator().get_total_worker_count() if orchestrator_running else 0 + machine = Machine.current() + orchestrator_proc = Process.objects.filter( + machine=machine, + process_type=Process.TypeChoices.ORCHESTRATOR, + status=Process.StatusChoices.RUNNING, + 
).order_by('-started_at').first() + orchestrator_pid = orchestrator_proc.pid if orchestrator_proc else None # Get model counts by status crawls_pending = Crawl.objects.filter(status=Crawl.StatusChoices.QUEUED).count() @@ -653,24 +1105,47 @@ def live_progress_view(request): ext = embed.lower().split('.')[-1] if '.' in embed else '' is_embeddable = ext in ('png', 'jpg', 'jpeg', 'gif', 'webp', 'svg', 'ico', 'pdf', 'html') if is_embeddable or ar.plugin in ('screenshot', 'favicon', 'dom'): + archive_path = embed or '' recent_thumbnails.append({ 'id': str(ar.id), 'plugin': ar.plugin, 'snapshot_id': str(ar.snapshot_id), 'snapshot_url': ar.snapshot.url[:60] if ar.snapshot else '', 'embed_path': embed, - 'archive_path': f'/{ar.snapshot.archive_path}/{embed}' if ar.snapshot else '', + 'archive_path': archive_path, + 'archive_url': build_snapshot_url(str(ar.snapshot_id), archive_path, request=request) if archive_path else '', 'end_ts': ar.end_ts.isoformat() if ar.end_ts else None, }) # Build hierarchical active crawls with nested snapshots and archive results from django.db.models import Prefetch + running_workers = Process.objects.filter( + machine=machine, + process_type=Process.TypeChoices.WORKER, + status=Process.StatusChoices.RUNNING, + ) + crawl_worker_pids: dict[str, int] = {} + snapshot_worker_pids: dict[str, int] = {} + for proc in running_workers: + env = proc.env or {} + if not isinstance(env, dict): + continue + if proc.worker_type == 'crawl': + crawl_id = env.get('CRAWL_ID') + if crawl_id: + crawl_worker_pids[str(crawl_id)] = proc.pid + elif proc.worker_type == 'snapshot': + snapshot_id = env.get('SNAPSHOT_ID') + if snapshot_id: + snapshot_worker_pids[str(snapshot_id)] = proc.pid + active_crawls_qs = Crawl.objects.filter( status__in=[Crawl.StatusChoices.QUEUED, Crawl.StatusChoices.STARTED] ).prefetch_related( 'snapshot_set', 'snapshot_set__archiveresult_set', + 'snapshot_set__archiveresult_set__process', ).distinct().order_by('-modified_at')[:10] active_crawls = 
[] @@ -710,8 +1185,9 @@ def live_progress_view(request): failed_plugins = sum(1 for ar in snapshot_results if ar.status == ArchiveResult.StatusChoices.FAILED) pending_plugins = sum(1 for ar in snapshot_results if ar.status == ArchiveResult.StatusChoices.QUEUED) - # Calculate snapshot progress - snapshot_progress = int(((completed_plugins + failed_plugins) / total_plugins) * 100) if total_plugins > 0 else 0 + # Calculate snapshot progress using per-plugin progress + now = timezone.now() + plugin_progress_values: list[int] = [] # Get all extractor plugins for this snapshot (already prefetched, sort in Python) # Order: started first, then queued, then completed @@ -724,14 +1200,42 @@ def live_progress_view(request): } return (status_order.get(ar.status, 4), ar.plugin) - all_plugins = [ - { + all_plugins = [] + for ar in sorted(snapshot_results, key=plugin_sort_key): + status = ar.status + progress_value = 0 + if status in ( + ArchiveResult.StatusChoices.SUCCEEDED, + ArchiveResult.StatusChoices.FAILED, + ArchiveResult.StatusChoices.SKIPPED, + ): + progress_value = 100 + elif status == ArchiveResult.StatusChoices.STARTED: + started_at = ar.start_ts or (ar.process.started_at if ar.process_id and ar.process else None) + timeout = ar.timeout or 120 + if started_at and timeout: + elapsed = max(0.0, (now - started_at).total_seconds()) + progress_value = int(min(99, max(1, (elapsed / float(timeout)) * 100))) + else: + progress_value = 1 + else: + progress_value = 0 + + plugin_progress_values.append(progress_value) + + plugin_payload = { 'id': str(ar.id), 'plugin': ar.plugin, - 'status': ar.status, + 'status': status, } - for ar in sorted(snapshot_results, key=plugin_sort_key) - ] + if ar.process_id and ar.process and ar.process.status == Process.StatusChoices.RUNNING: + plugin_payload['pid'] = ar.process.pid + if status == ArchiveResult.StatusChoices.STARTED: + plugin_payload['progress'] = progress_value + plugin_payload['timeout'] = ar.timeout or 120 + 
all_plugins.append(plugin_payload) + + snapshot_progress = int(sum(plugin_progress_values) / total_plugins) if total_plugins > 0 else 0 active_snapshots_for_crawl.append({ 'id': str(snapshot.id), @@ -744,6 +1248,7 @@ def live_progress_view(request): 'failed_plugins': failed_plugins, 'pending_plugins': pending_plugins, 'all_plugins': all_plugins, + 'worker_pid': snapshot_worker_pids.get(str(snapshot.id)), }) # Check if crawl can start (for debugging stuck crawls) @@ -772,10 +1277,12 @@ def live_progress_view(request): 'urls_preview': urls_preview, 'retry_at_future': retry_at_future, 'seconds_until_retry': seconds_until_retry, + 'worker_pid': crawl_worker_pids.get(str(crawl.id)), }) return JsonResponse({ 'orchestrator_running': orchestrator_running, + 'orchestrator_pid': orchestrator_pid, 'total_workers': total_workers, 'crawls_pending': crawls_pending, 'crawls_started': crawls_started, diff --git a/archivebox/core/widgets.py b/archivebox/core/widgets.py index 433f5c93..bbbceaa7 100644 --- a/archivebox/core/widgets.py +++ b/archivebox/core/widgets.py @@ -1,8 +1,11 @@ __package__ = 'archivebox.core' import json +import re +import hashlib from django import forms from django.utils.html import escape +from django.utils.safestring import mark_safe class TagEditorWidget(forms.Widget): @@ -27,6 +30,23 @@ class TagEditorWidget(forms.Widget): """Escape HTML entities in value.""" return escape(str(value)) if value else '' + def _normalize_id(self, value): + """Normalize IDs for HTML + JS usage (letters, digits, underscore; JS-safe start).""" + normalized = re.sub(r'[^A-Za-z0-9_]', '_', str(value)) + if not normalized or not re.match(r'[A-Za-z_]', normalized): + normalized = f't_{normalized}' + return normalized + + def _tag_style(self, value): + """Compute a stable pastel color style for a tag value.""" + tag = (value or '').strip().lower() + digest = hashlib.md5(tag.encode('utf-8')).hexdigest() + hue = int(digest[:4], 16) % 360 + bg = f'hsl({hue}, 70%, 92%)' + border = 
f'hsl({hue}, 60%, 82%)' + fg = f'hsl({hue}, 35%, 28%)' + return f'--tag-bg: {bg}; --tag-border: {border}; --tag-fg: {fg};' + def render(self, name, value, attrs=None, renderer=None): """ Render the tag editor widget. @@ -67,13 +87,14 @@ class TagEditorWidget(forms.Widget): elif isinstance(value, str): tags = sorted([t.strip() for t in value.split(',') if t.strip()]) - widget_id = attrs.get('id', name) if attrs else name + widget_id_raw = attrs.get('id', name) if attrs else name + widget_id = self._normalize_id(widget_id_raw) # Build pills HTML pills_html = '' for tag in tags: pills_html += f''' - + {self._escape(tag)} @@ -92,6 +113,7 @@ class TagEditorWidget(forms.Widget): placeholder="Add tag..." autocomplete="off" onkeydown="handleTagKeydown_{widget_id}(event)" + onkeypress="if(event.key==='Enter' || event.keyCode===13){{event.preventDefault(); event.stopPropagation();}}" oninput="fetchTagAutocomplete_{widget_id}(this.value)" > @@ -112,6 +134,47 @@ class TagEditorWidget(forms.Widget): document.getElementById('{widget_id}').value = currentTags_{widget_id}.join(','); }}; + function computeTagStyle_{widget_id}(tagName) {{ + var hash = 0; + var name = String(tagName || '').toLowerCase(); + for (var i = 0; i < name.length; i++) {{ + hash = (hash * 31 + name.charCodeAt(i)) % 360; + }} + var bg = 'hsl(' + hash + ', 70%, 92%)'; + var border = 'hsl(' + hash + ', 60%, 82%)'; + var fg = 'hsl(' + hash + ', 35%, 28%)'; + return {{ bg: bg, border: border, fg: fg }}; + }} + + function applyTagStyle_{widget_id}(el, tagName) {{ + var colors = computeTagStyle_{widget_id}(tagName); + el.style.setProperty('--tag-bg', colors.bg); + el.style.setProperty('--tag-border', colors.border); + el.style.setProperty('--tag-fg', colors.fg); + }} + + function getApiKey() {{ + return (window.ARCHIVEBOX_API_KEY || '').trim(); + }} + + function buildApiUrl(path) {{ + var apiKey = getApiKey(); + if (!apiKey) return path; + var sep = path.indexOf('?') !== -1 ? 
'&' : '?'; + return path + sep + 'api_key=' + encodeURIComponent(apiKey); + }} + + function buildApiHeaders() {{ + var headers = {{ + 'Content-Type': 'application/json', + }}; + var apiKey = getApiKey(); + if (apiKey) headers['X-ArchiveBox-API-Key'] = apiKey; + var csrfToken = getCSRFToken(); + if (csrfToken) headers['X-CSRFToken'] = csrfToken; + return headers; + }} + window.addTag_{widget_id} = function(tagName) {{ tagName = tagName.trim(); if (!tagName) return; @@ -139,12 +202,9 @@ class TagEditorWidget(forms.Widget): document.getElementById('{widget_id}_input').value = ''; // Create tag via API if it doesn't exist (fire and forget) - fetch('/api/v1/core/tags/create/', {{ + fetch(buildApiUrl('/api/v1/core/tags/create/'), {{ method: 'POST', - headers: {{ - 'Content-Type': 'application/json', - 'X-CSRFToken': getCSRFToken() - }}, + headers: buildApiHeaders(), body: JSON.stringify({{ name: tagName }}) }}).catch(function(err) {{ console.log('Tag creation note:', err); @@ -166,6 +226,7 @@ class TagEditorWidget(forms.Widget): var pill = document.createElement('span'); pill.className = 'tag-pill'; pill.setAttribute('data-tag', tag); + applyTagStyle_{widget_id}(pill, tag); var tagText = document.createTextNode(tag); pill.appendChild(tagText); @@ -195,14 +256,16 @@ class TagEditorWidget(forms.Widget): var input = event.target; var value = input.value.trim(); - if (event.key === 'Enter' || event.key === ' ' || event.key === ',') {{ + if (event.key === 'Enter' || event.keyCode === 13 || event.key === ' ' || event.key === ',') {{ event.preventDefault(); + event.stopPropagation(); if (value) {{ // Handle comma-separated values value.split(',').forEach(function(tag) {{ addTag_{widget_id}(tag.trim()); }}); }} + return false; }} else if (event.key === 'Backspace' && !value && currentTags_{widget_id}.length > 0) {{ // Remove last tag on backspace when input is empty var lastTag = currentTags_{widget_id}.pop(); @@ -222,7 +285,7 @@ class TagEditorWidget(forms.Widget): return; }} - 
fetch('/api/v1/core/tags/autocomplete/?q=' + encodeURIComponent(query)) + fetch(buildApiUrl('/api/v1/core/tags/autocomplete/?q=' + encodeURIComponent(query))) .then(function(response) {{ return response.json(); }}) .then(function(data) {{ var datalist = document.getElementById('{widget_id}_datalist'); @@ -261,7 +324,7 @@ class TagEditorWidget(forms.Widget): ''' - return html + return mark_safe(html) class InlineTagEditorWidget(TagEditorWidget): @@ -295,20 +358,23 @@ class InlineTagEditorWidget(TagEditorWidget): tag_data.sort(key=lambda x: x['name'].lower()) tags = [t['name'] for t in tag_data] - widget_id = f"inline_tags_{snapshot_id}" if snapshot_id else (attrs.get('id', name) if attrs else name) + widget_id_raw = f"inline_tags_{snapshot_id}" if snapshot_id else (attrs.get('id', name) if attrs else name) + widget_id = self._normalize_id(widget_id_raw) # Build pills HTML with filter links pills_html = '' for td in tag_data: pills_html += f''' - + {self._escape(td['name'])} ''' + tags_json = escape(json.dumps(tag_data)) + html = f''' - + {pills_html} @@ -318,195 +384,10 @@ class InlineTagEditorWidget(TagEditorWidget): list="{widget_id}_datalist" placeholder="+" autocomplete="off" - onkeydown="handleInlineTagKeydown_{widget_id}(event)" - oninput="fetchInlineTagAutocomplete_{widget_id}(this.value)" - onfocus="this.placeholder='add tag...'" - onblur="this.placeholder='+'" + data-inline-tag-input="1" > - - ''' - return html + return mark_safe(html) diff --git a/archivebox/hooks.py b/archivebox/hooks.py index 08cedf0f..b8429c11 100644 --- a/archivebox/hooks.py +++ b/archivebox/hooks.py @@ -62,6 +62,7 @@ import json import signal import time import subprocess +from functools import lru_cache from pathlib import Path from typing import List, Dict, Any, Optional, TypedDict @@ -255,6 +256,7 @@ def run_hook( records = process.get_records() # Get parsed JSONL output """ from archivebox.machine.models import Process, Machine + from archivebox.config.constants import CONSTANTS 
import time import sys start_time = time.time() @@ -264,6 +266,8 @@ def run_hook( plugin_name = script.parent.name plugin_config = get_plugin_special_config(plugin_name, config) timeout = plugin_config['timeout'] + if timeout: + timeout = min(int(timeout), int(CONSTANTS.MAX_HOOK_RUNTIME_SECONDS)) # Get current machine machine = Machine.current() @@ -568,6 +572,7 @@ def run_hooks( return results +@lru_cache(maxsize=1) def get_plugins() -> List[str]: """ Get list of available plugins by discovering Snapshot hooks. @@ -988,6 +993,8 @@ def get_plugin_template(plugin: str, template_name: str, fallback: bool = True) Template content as string, or None if not found and fallback=False. """ base_name = get_plugin_name(plugin) + if base_name in ('yt-dlp', 'youtube-dl'): + base_name = 'ytdlp' for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR): if not base_dir.exists(): @@ -1011,6 +1018,7 @@ def get_plugin_template(plugin: str, template_name: str, fallback: bool = True) return None +@lru_cache(maxsize=None) def get_plugin_icon(plugin: str) -> str: """ Get the icon for a plugin from its icon.html template. 
diff --git a/archivebox/machine/models.py b/archivebox/machine/models.py index a47f32ea..6f57cd0b 100755 --- a/archivebox/machine/models.py +++ b/archivebox/machine/models.py @@ -1685,8 +1685,11 @@ class Process(models.Model): TimeoutError if process doesn't exit in time """ import time + from archivebox.config.constants import CONSTANTS timeout = timeout or self.timeout + if self.process_type == self.TypeChoices.HOOK: + timeout = min(int(timeout), int(CONSTANTS.MAX_HOOK_RUNTIME_SECONDS)) start = time.time() while True: diff --git a/archivebox/misc/serve_static.py b/archivebox/misc/serve_static.py index 8df249e1..76bc74e8 100644 --- a/archivebox/misc/serve_static.py +++ b/archivebox/misc/serve_static.py @@ -1,3 +1,6 @@ +import html +import json +import re import os import stat import posixpath @@ -10,6 +13,267 @@ from django.http import StreamingHttpResponse, Http404, HttpResponse, HttpRespon from django.utils._os import safe_join from django.utils.http import http_date from django.utils.translation import gettext as _ +from archivebox.config.common import SERVER_CONFIG + + +_HASHES_CACHE: dict[Path, tuple[float, dict[str, str]]] = {} + + +def _load_hash_map(snapshot_dir: Path) -> dict[str, str] | None: + hashes_path = snapshot_dir / 'hashes' / 'hashes.json' + if not hashes_path.exists(): + return None + try: + mtime = hashes_path.stat().st_mtime + except OSError: + return None + + cached = _HASHES_CACHE.get(hashes_path) + if cached and cached[0] == mtime: + return cached[1] + + try: + data = json.loads(hashes_path.read_text(encoding='utf-8')) + except Exception: + return None + + file_map = {str(entry.get('path')): entry.get('hash') for entry in data.get('files', []) if entry.get('path')} + _HASHES_CACHE[hashes_path] = (mtime, file_map) + return file_map + + +def _hash_for_path(document_root: Path, rel_path: str) -> str | None: + file_map = _load_hash_map(document_root) + if not file_map: + return None + return file_map.get(rel_path) + + +def _cache_policy() -> 
str: + return 'public' if SERVER_CONFIG.PUBLIC_SNAPSHOTS else 'private' + + +# Ensure common web types are mapped consistently across platforms. +mimetypes.add_type("text/html", ".html") +mimetypes.add_type("text/html", ".htm") +mimetypes.add_type("text/css", ".css") +mimetypes.add_type("application/javascript", ".js") +mimetypes.add_type("application/json", ".json") +mimetypes.add_type("application/x-ndjson", ".jsonl") +mimetypes.add_type("text/markdown", ".md") +mimetypes.add_type("text/yaml", ".yml") +mimetypes.add_type("text/yaml", ".yaml") +mimetypes.add_type("text/csv", ".csv") +mimetypes.add_type("text/tab-separated-values", ".tsv") +mimetypes.add_type("application/xml", ".xml") +mimetypes.add_type("image/svg+xml", ".svg") + +try: + import markdown as _markdown +except Exception: + _markdown = None + +MARKDOWN_INLINE_LINK_RE = re.compile(r'\[([^\]]+)\]\(([^)\s]+(?:\([^)]*\)[^)\s]*)*)\)') +MARKDOWN_INLINE_IMAGE_RE = re.compile(r'!\[([^\]]*)\]\(([^)]+)\)') +MARKDOWN_BOLD_RE = re.compile(r'\*\*([^*]+)\*\*') +MARKDOWN_ITALIC_RE = re.compile(r'(?]*>') +HTML_BODY_RE = re.compile(r']*>(.*)', flags=re.IGNORECASE | re.DOTALL) + + +def _extract_markdown_candidate(text: str) -> str: + candidate = text + body_match = HTML_BODY_RE.search(candidate) + if body_match: + candidate = body_match.group(1) + candidate = re.sub(r'^\s*]*>', '', candidate, flags=re.IGNORECASE) + candidate = re.sub(r'

\s*$', '', candidate, flags=re.IGNORECASE) + return candidate.strip() + + +def _looks_like_markdown(text: str) -> bool: + lower = text.lower() + if "" in lower: + return False + md_markers = 0 + md_markers += len(re.findall(r'^\s{0,3}#{1,6}\s+\S', text, flags=re.MULTILINE)) + md_markers += len(re.findall(r'^\s*[-*+]\s+\S', text, flags=re.MULTILINE)) + md_markers += len(re.findall(r'^\s*\d+\.\s+\S', text, flags=re.MULTILINE)) + md_markers += text.count('[TOC]') + md_markers += len(MARKDOWN_INLINE_LINK_RE.findall(text)) + md_markers += text.count('\n---') + text.count('\n***') + return md_markers >= 6 + + +def _render_markdown_fallback(text: str) -> str: + if _markdown is not None and not HTML_TAG_RE.search(text): + try: + return _markdown.markdown( + text, + extensions=["extra", "toc", "sane_lists"], + output_format="html5", + ) + except Exception: + pass + + lines = text.splitlines() + headings = [] + + def slugify(value: str) -> str: + slug = re.sub(r'[^A-Za-z0-9]+', '-', value).strip('-') + return slug or "section" + + for raw_line in lines: + heading_match = re.match(r'^\s{0,3}(#{1,6})\s+(.*)$', raw_line) + if heading_match: + level = len(heading_match.group(1)) + content = heading_match.group(2).strip() + headings.append((level, content, slugify(content))) + + html_lines = [] + in_code = False + in_ul = False + in_ol = False + in_blockquote = False + + def render_inline(markup: str) -> str: + content = MARKDOWN_INLINE_IMAGE_RE.sub(r'\1', markup) + content = MARKDOWN_INLINE_LINK_RE.sub(r'\1', content) + content = MARKDOWN_BOLD_RE.sub(r'\1', content) + content = MARKDOWN_ITALIC_RE.sub(r'\1', content) + return content + + def close_lists(): + nonlocal in_ul, in_ol + if in_ul: + html_lines.append("") + in_ul = False + if in_ol: + html_lines.append("") + in_ol = False + + for raw_line in lines: + line = raw_line.rstrip("\n") + stripped = line.strip() + + if stripped.startswith("```"): + if in_code: + html_lines.append("
") + in_code = False + else: + close_lists() + if in_blockquote: + html_lines.append("") + in_blockquote = False + html_lines.append("
")
+                in_code = True
+            continue
+
+        if in_code:
+            html_lines.append(html.escape(line))
+            continue
+
+        if not stripped:
+            close_lists()
+            if in_blockquote:
+                html_lines.append("")
+                in_blockquote = False
+            html_lines.append("
") + continue + + heading_match = re.match(r'^\s*((?:<[^>]+>\s*)*)(#{1,6})\s+(.*)$', line) + if heading_match: + close_lists() + if in_blockquote: + html_lines.append("") + in_blockquote = False + leading_tags = heading_match.group(1).strip() + level = len(heading_match.group(2)) + content = heading_match.group(3).strip() + if leading_tags: + html_lines.append(leading_tags) + html_lines.append(f"{render_inline(content)}") + continue + + if stripped in ("---", "***"): + close_lists() + html_lines.append("
") + continue + + if stripped.startswith("> "): + if not in_blockquote: + close_lists() + html_lines.append("
") + in_blockquote = True + content = stripped[2:] + html_lines.append(render_inline(content)) + continue + else: + if in_blockquote: + html_lines.append("
") + in_blockquote = False + + ul_match = re.match(r'^\s*[-*+]\s+(.*)$', line) + if ul_match: + if in_ol: + html_lines.append("") + in_ol = False + if not in_ul: + html_lines.append("
    ") + in_ul = True + html_lines.append(f"
  • {render_inline(ul_match.group(1))}
  • ") + continue + + ol_match = re.match(r'^\s*\d+\.\s+(.*)$', line) + if ol_match: + if in_ul: + html_lines.append("
") + in_ul = False + if not in_ol: + html_lines.append("
    ") + in_ol = True + html_lines.append(f"
  1. {render_inline(ol_match.group(1))}
  2. ") + continue + + close_lists() + + # Inline conversions (leave raw HTML intact) + if stripped == "[TOC]": + toc_items = [] + for level, title, slug in headings: + toc_items.append( + f'
  3. {title}
  4. ' + ) + html_lines.append( + '' + ) + continue + + html_lines.append(f"

    {render_inline(line)}

    ") + + close_lists() + if in_blockquote: + html_lines.append("") + if in_code: + html_lines.append("
") + + return "\n".join(html_lines) + + +def _render_markdown_document(markdown_text: str) -> str: + body = _render_markdown_fallback(markdown_text) + wrapped = ( + "" + "" + "" + "" + f"{body}" + "" + ) + return wrapped def serve_static_with_byterange_support(request, path, document_root=None, show_indexes=False): @@ -28,18 +292,101 @@ def serve_static_with_byterange_support(request, path, document_root=None, show_ if not os.access(fullpath, os.R_OK): raise Http404(_("“%(path)s” does not exist") % {"path": fullpath}) - # Respect the If-Modified-Since header. statobj = fullpath.stat() - if not static.was_modified_since(request.META.get("HTTP_IF_MODIFIED_SINCE"), statobj.st_mtime): - return HttpResponseNotModified() + document_root = Path(document_root) if document_root else None + rel_path = path + etag = None + if document_root: + file_hash = _hash_for_path(document_root, rel_path) + if file_hash: + etag = f'"{file_hash}"' + + if etag: + inm = request.META.get("HTTP_IF_NONE_MATCH") + if inm: + inm_list = [item.strip() for item in inm.split(",")] + if etag in inm_list or etag.strip('"') in [i.strip('"') for i in inm_list]: + not_modified = HttpResponseNotModified() + not_modified.headers["ETag"] = etag + not_modified.headers["Cache-Control"] = f"{_cache_policy()}, max-age=31536000, immutable" + not_modified.headers["Last-Modified"] = http_date(statobj.st_mtime) + return not_modified content_type, encoding = mimetypes.guess_type(str(fullpath)) content_type = content_type or "application/octet-stream" - + # Add charset for text-like types (best guess), but don't override the type. + is_text_like = ( + content_type.startswith("text/") + or content_type in { + "application/json", + "application/javascript", + "application/xml", + "application/x-ndjson", + "image/svg+xml", + } + ) + if is_text_like and "charset=" not in content_type: + content_type = f"{content_type}; charset=utf-8" + + # Respect the If-Modified-Since header for non-markdown responses. 
+ if not (content_type.startswith("text/plain") or content_type.startswith("text/html")): + if not static.was_modified_since(request.META.get("HTTP_IF_MODIFIED_SINCE"), statobj.st_mtime): + return HttpResponseNotModified() + + # Heuristic fix: some archived HTML outputs (e.g. mercury content.html) + # are stored with HTML-escaped markup or markdown sources. If so, render sensibly. + if content_type.startswith("text/plain") or content_type.startswith("text/html"): + try: + max_unescape_size = 10 * 1024 * 1024 # 10MB cap to avoid heavy memory use + if statobj.st_size <= max_unescape_size: + raw = fullpath.read_bytes() + decoded = raw.decode("utf-8", errors="replace") + escaped_count = decoded.count("<") + decoded.count(">") + tag_count = decoded.count("<") + if escaped_count and escaped_count > tag_count * 2: + decoded = html.unescape(decoded) + markdown_candidate = _extract_markdown_candidate(decoded) + if _looks_like_markdown(markdown_candidate): + wrapped = _render_markdown_document(markdown_candidate) + response = HttpResponse(wrapped, content_type="text/html; charset=utf-8") + response.headers["Last-Modified"] = http_date(statobj.st_mtime) + if etag: + response.headers["ETag"] = etag + response.headers["Cache-Control"] = f"{_cache_policy()}, max-age=31536000, immutable" + else: + response.headers["Cache-Control"] = f"{_cache_policy()}, max-age=60, stale-while-revalidate=300" + response.headers["Content-Disposition"] = f'inline; filename="{fullpath.name}"' + if encoding: + response.headers["Content-Encoding"] = encoding + return response + if escaped_count and escaped_count > tag_count * 2: + response = HttpResponse(decoded, content_type=content_type) + response.headers["Last-Modified"] = http_date(statobj.st_mtime) + if etag: + response.headers["ETag"] = etag + response.headers["Cache-Control"] = f"{_cache_policy()}, max-age=31536000, immutable" + else: + response.headers["Cache-Control"] = f"{_cache_policy()}, max-age=60, stale-while-revalidate=300" + 
response.headers["Content-Disposition"] = f'inline; filename="{fullpath.name}"' + if encoding: + response.headers["Content-Encoding"] = encoding + return response + except Exception: + pass + # setup resposne object ranged_file = RangedFileReader(open(fullpath, "rb")) response = StreamingHttpResponse(ranged_file, content_type=content_type) response.headers["Last-Modified"] = http_date(statobj.st_mtime) + if etag: + response.headers["ETag"] = etag + response.headers["Cache-Control"] = f"{_cache_policy()}, max-age=31536000, immutable" + else: + response.headers["Cache-Control"] = f"{_cache_policy()}, max-age=60, stale-while-revalidate=300" + if is_text_like: + response.headers["Content-Disposition"] = f'inline; filename="{fullpath.name}"' + if content_type.startswith("image/"): + response.headers["Cache-Control"] = "public, max-age=604800, immutable" # handle byte-range requests by serving chunk of file if stat.S_ISREG(statobj.st_mode): diff --git a/archivebox/plugins/accessibility/on_Snapshot__39_accessibility.js b/archivebox/plugins/accessibility/on_Snapshot__39_accessibility.js index 4a99028a..7b73a422 100755 --- a/archivebox/plugins/accessibility/on_Snapshot__39_accessibility.js +++ b/archivebox/plugins/accessibility/on_Snapshot__39_accessibility.js @@ -26,6 +26,7 @@ const PLUGIN_NAME = 'accessibility'; const OUTPUT_DIR = '.'; const OUTPUT_FILE = 'accessibility.json'; const CHROME_SESSION_DIR = '../chrome'; +const CHROME_SESSION_REQUIRED_ERROR = 'No Chrome session found (chrome plugin must run first)'; // Parse command line arguments function parseArgs() { @@ -76,6 +77,27 @@ function getCdpUrl() { return null; } +function assertChromeSession() { + const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt'); + const targetIdFile = path.join(CHROME_SESSION_DIR, 'target_id.txt'); + const pidFile = path.join(CHROME_SESSION_DIR, 'chrome.pid'); + if (!fs.existsSync(cdpFile) || !fs.existsSync(targetIdFile) || !fs.existsSync(pidFile)) { + throw new 
Error(CHROME_SESSION_REQUIRED_ERROR); + } + try { + const pid = parseInt(fs.readFileSync(pidFile, 'utf8').trim(), 10); + if (!pid || Number.isNaN(pid)) throw new Error('Invalid pid'); + process.kill(pid, 0); + } catch (e) { + throw new Error(CHROME_SESSION_REQUIRED_ERROR); + } + const cdpUrl = getCdpUrl(); + if (!cdpUrl) { + throw new Error(CHROME_SESSION_REQUIRED_ERROR); + } + return cdpUrl; +} + // Extract accessibility info async function extractAccessibility(url) { // Output directory is current directory (hook already runs in output dir) @@ -85,10 +107,7 @@ async function extractAccessibility(url) { try { // Connect to existing Chrome session - const cdpUrl = getCdpUrl(); - if (!cdpUrl) { - return { success: false, error: 'No Chrome session found (chrome plugin must run first)' }; - } + const cdpUrl = assertChromeSession(); browser = await puppeteer.connect({ browserWSEndpoint: cdpUrl, @@ -226,13 +245,10 @@ async function main() { } // Check if Chrome session exists, then wait for page load - const cdpUrl = getCdpUrl(); - if (cdpUrl) { - // Wait for page to be fully loaded - const pageLoaded = await waitForChromeTabLoaded(60000); - if (!pageLoaded) { - throw new Error('Page not loaded after 60s (chrome_navigate must complete first)'); - } + assertChromeSession(); + const pageLoaded = await waitForChromeTabLoaded(60000); + if (!pageLoaded) { + throw new Error('Page not loaded after 60s (chrome_navigate must complete first)'); } const result = await extractAccessibility(url); diff --git a/archivebox/plugins/accessibility/tests/test_accessibility.py b/archivebox/plugins/accessibility/tests/test_accessibility.py index addd51df..cccfa215 100644 --- a/archivebox/plugins/accessibility/tests/test_accessibility.py +++ b/archivebox/plugins/accessibility/tests/test_accessibility.py @@ -47,7 +47,6 @@ class TestAccessibilityPlugin(TestCase): self.assertTrue(ACCESSIBILITY_HOOK.exists(), f"Hook not found: {ACCESSIBILITY_HOOK}") -@pytest.mark.skipif(not chrome_available(), 
reason="Chrome not installed") class TestAccessibilityWithChrome(TestCase): """Integration tests for accessibility plugin with Chrome.""" @@ -109,9 +108,7 @@ class TestAccessibilityWithChrome(TestCase): self.assertIn('headings', accessibility_data, f"Missing headings: {accessibility_data}") self.assertIn('url', accessibility_data, f"Missing url: {accessibility_data}") - except RuntimeError as e: - if 'Chrome' in str(e) or 'CDP' in str(e): - self.skipTest(f"Chrome session setup failed: {e}") + except RuntimeError: raise def test_accessibility_disabled_skips(self): diff --git a/archivebox/plugins/apt/tests/test_apt_provider.py b/archivebox/plugins/apt/tests/test_apt_provider.py index 430fde24..c8b7934e 100644 --- a/archivebox/plugins/apt/tests/test_apt_provider.py +++ b/archivebox/plugins/apt/tests/test_apt_provider.py @@ -70,9 +70,9 @@ class TestAptProviderHook(TestCase): self.assertEqual(result.returncode, 0) @pytest.mark.skipif(not is_linux(), reason="apt only available on Linux") - @pytest.mark.skipif(not apt_available(), reason="apt not installed") def test_hook_detects_apt(self): """Hook should detect apt binary when available.""" + assert apt_available(), "apt not installed" result = subprocess.run( [ sys.executable, str(INSTALL_HOOK), @@ -112,12 +112,12 @@ class TestAptProviderHook(TestCase): @pytest.mark.skipif(not is_linux(), reason="apt only available on Linux") -@pytest.mark.skipif(not apt_available(), reason="apt not installed") class TestAptProviderSystemBinaries(TestCase): """Test apt provider with system binaries.""" def test_detect_existing_binary(self): """apt provider should detect already-installed system binaries.""" + assert apt_available(), "apt not installed" # Check for a binary that's almost certainly installed (like 'ls' or 'bash') result = subprocess.run( [ diff --git a/archivebox/plugins/archivedotorg/on_Snapshot__13_archivedotorg.bg.py b/archivebox/plugins/archivedotorg/on_Snapshot__08_archivedotorg.bg.py similarity index 100% rename 
from archivebox/plugins/archivedotorg/on_Snapshot__13_archivedotorg.bg.py rename to archivebox/plugins/archivedotorg/on_Snapshot__08_archivedotorg.bg.py diff --git a/archivebox/plugins/chrome/chrome_utils.js b/archivebox/plugins/chrome/chrome_utils.js index df43115f..e0e42a7e 100755 --- a/archivebox/plugins/chrome/chrome_utils.js +++ b/archivebox/plugins/chrome/chrome_utils.js @@ -18,6 +18,8 @@ const { finished } = require('stream/promises'); const execAsync = promisify(exec); +const CHROME_SESSION_REQUIRED_ERROR = 'No Chrome session found (chrome plugin must run first)'; + // ============================================================================ // Environment helpers // ============================================================================ @@ -373,6 +375,7 @@ async function launchChromium(options = {}) { outputDir = 'chrome', userDataDir = getEnv('CHROME_USER_DATA_DIR'), resolution = getEnv('CHROME_RESOLUTION') || getEnv('RESOLUTION', '1440,2000'), + userAgent = getEnv('CHROME_USER_AGENT') || getEnv('USER_AGENT', ''), headless = getEnvBool('CHROME_HEADLESS', true), sandbox = getEnvBool('CHROME_SANDBOX', true), checkSsl = getEnvBool('CHROME_CHECK_SSL_VALIDITY', getEnvBool('CHECK_SSL_VALIDITY', true)), @@ -450,17 +453,17 @@ async function launchChromium(options = {}) { const extraArgs = getEnvArray('CHROME_ARGS_EXTRA', []); // Build dynamic Chrome arguments (these must be computed at runtime) + const inDocker = getEnvBool('IN_DOCKER', false); const dynamicArgs = [ // Remote debugging setup `--remote-debugging-port=${debugPort}`, '--remote-debugging-address=127.0.0.1', // Sandbox settings (disable in Docker) - ...(sandbox ? [] : ['--no-sandbox', '--disable-setuid-sandbox']), + ...(sandbox ? [] : (inDocker ? 
['--no-sandbox', '--disable-setuid-sandbox'] : [])), // Docker-specific workarounds '--disable-dev-shm-usage', - '--disable-gpu', // Window size `--window-size=${width},${height}`, @@ -468,6 +471,9 @@ async function launchChromium(options = {}) { // User data directory (for persistent sessions with persona) ...(userDataDir ? [`--user-data-dir=${userDataDir}`] : []), + // User agent + ...(userAgent ? [`--user-agent=${userAgent}`] : []), + // Headless mode ...(headless ? ['--headless=new'] : []), @@ -1387,6 +1393,18 @@ function findChromium() { return null; } +/** + * Find Chromium binary path only (never Chrome/Brave/Edge). + * Prefers CHROME_BINARY if set, then Chromium. + * + * @returns {string|null} - Absolute path or command name to browser binary + */ +function findAnyChromiumBinary() { + const chromiumBinary = findChromium(); + if (chromiumBinary) return chromiumBinary; + return null; +} + // ============================================================================ // Shared Extension Installer Utilities // ============================================================================ @@ -1658,13 +1676,13 @@ async function connectToPage(options = {}) { // Wait for chrome session to be ready const sessionReady = await waitForChromeSession(chromeSessionDir, timeoutMs); if (!sessionReady) { - throw new Error(`Chrome session not ready after ${timeoutMs/1000}s (chrome plugin must run first)`); + throw new Error(CHROME_SESSION_REQUIRED_ERROR); } // Read session files const cdpUrl = readCdpUrl(chromeSessionDir); if (!cdpUrl) { - throw new Error('No Chrome session found (cdp_url.txt missing)'); + throw new Error(CHROME_SESSION_REQUIRED_ERROR); } const targetId = readTargetId(chromeSessionDir); @@ -1749,6 +1767,7 @@ module.exports = { installPuppeteerCore, // Chromium binary finding findChromium, + findAnyChromiumBinary, // Extension utilities getExtensionId, loadExtensionManifest, diff --git a/archivebox/plugins/chrome/extract_cookies.js 
b/archivebox/plugins/chrome/extract_cookies.js index 2a330152..c23515dc 100644 --- a/archivebox/plugins/chrome/extract_cookies.js +++ b/archivebox/plugins/chrome/extract_cookies.js @@ -23,7 +23,7 @@ if (process.env.NODE_MODULES_DIR) { const fs = require('fs'); const path = require('path'); const { - findChromium, + findAnyChromiumBinary, launchChromium, killChrome, getEnv, @@ -109,9 +109,9 @@ async function main() { process.exit(1); } - const binary = findChromium(); + const binary = findAnyChromiumBinary(); if (!binary) { - console.error('ERROR: Chromium binary not found'); + console.error('ERROR: Chromium-based browser binary not found'); process.exit(1); } diff --git a/archivebox/plugins/chrome/on_Crawl__90_chrome_launch.bg.js b/archivebox/plugins/chrome/on_Crawl__90_chrome_launch.bg.js index 17185786..b5cb9822 100644 --- a/archivebox/plugins/chrome/on_Crawl__90_chrome_launch.bg.js +++ b/archivebox/plugins/chrome/on_Crawl__90_chrome_launch.bg.js @@ -31,12 +31,15 @@ if (process.env.NODE_MODULES_DIR) { const fs = require('fs'); const path = require('path'); +const http = require('http'); const puppeteer = require('puppeteer'); const { findChromium, launchChromium, killChrome, getEnv, + getEnvBool, + getExtensionId, writePidWithMtime, getExtensionsDir, } = require('./chrome_utils.js'); @@ -154,6 +157,84 @@ async function importCookiesFromFile(browser, cookiesFile, userDataDir) { console.error(`[+] Imported ${imported}/${cookies.length} cookies`); } +function getPortFromCdpUrl(cdpUrl) { + if (!cdpUrl) return null; + const match = cdpUrl.match(/:(\d+)\/devtools\//); + return match ? 
match[1] : null; +} + +async function fetchDevtoolsTargets(cdpUrl) { + const port = getPortFromCdpUrl(cdpUrl); + if (!port) return []; + + const urlPath = '/json/list'; + return new Promise((resolve, reject) => { + const req = http.get( + { hostname: '127.0.0.1', port, path: urlPath }, + (res) => { + let data = ''; + res.on('data', (chunk) => (data += chunk)); + res.on('end', () => { + try { + const targets = JSON.parse(data); + resolve(Array.isArray(targets) ? targets : []); + } catch (e) { + reject(e); + } + }); + } + ); + req.on('error', reject); + }); +} + +async function discoverExtensionTargets(cdpUrl, installedExtensions) { + const builtinIds = [ + 'nkeimhogjdpnpccoofpliimaahmaaome', + 'fignfifoniblkonapihmkfakmlgkbkcf', + 'ahfgeienlihckogmohjhadlkjgocpleb', + 'mhjfbmdgcfjbbpaeojofohoefgiehjai', + ]; + + let targets = []; + for (let i = 0; i < 10; i += 1) { + try { + targets = await fetchDevtoolsTargets(cdpUrl); + if (targets.length > 0) break; + } catch (e) { + // Ignore and retry + } + await new Promise(r => setTimeout(r, 500)); + } + + const customExtTargets = targets.filter(t => { + const url = t.url || ''; + if (!url.startsWith('chrome-extension://')) return false; + const extId = url.split('://')[1].split('/')[0]; + return !builtinIds.includes(extId); + }); + + console.error(`[+] Found ${customExtTargets.length} custom extension target(s) via /json/list`); + + for (const target of customExtTargets) { + const url = target.url || ''; + const extId = url.split('://')[1].split('/')[0]; + console.error(`[+] Extension target: ${extId} (${target.type || 'unknown'})`); + } + + const runtimeIds = new Set(customExtTargets.map(t => (t.url || '').split('://')[1].split('/')[0])); + for (const ext of installedExtensions) { + if (ext.id) { + ext.loaded = runtimeIds.has(ext.id); + } + } + + if (customExtTargets.length === 0 && installedExtensions.length > 0) { + console.error(`[!] Warning: No custom extensions detected. 
Extension loading may have failed.`); + console.error(`[!] Make sure you are using Chromium, not Chrome (Chrome 137+ removed --load-extension support)`); + } +} + // Parse command line arguments function parseArgs() { const args = {}; @@ -257,6 +338,17 @@ async function main() { console.error(`[+] Found ${installedExtensions.length} extension(s) to load`); } + // Ensure extension IDs are available without chrome://extensions + for (const ext of installedExtensions) { + if (!ext.id && ext.unpacked_path) { + try { + ext.id = getExtensionId(ext.unpacked_path); + } catch (e) { + console.error(`[!] Failed to compute extension id for ${ext.name}: ${e.message}`); + } + } + } + // Note: PID file is written by run_hook() with hook-specific name // Snapshot.cleanup() kills all *.pid processes when done if (!fs.existsSync(OUTPUT_DIR)) { @@ -280,131 +372,31 @@ async function main() { chromePid = result.pid; const cdpUrl = result.cdpUrl; - // Connect puppeteer for extension verification - console.error(`[*] Connecting puppeteer to CDP...`); - const browser = await puppeteer.connect({ - browserWSEndpoint: cdpUrl, - defaultViewport: null, - }); - browserInstance = browser; - - // Import cookies into Chrome profile at crawl start - await importCookiesFromFile(browser, cookiesFile, userDataDir); - - // Get actual extension IDs from chrome://extensions page + // Discover extension targets at launch (no chrome://extensions) if (extensionPaths.length > 0) { await new Promise(r => setTimeout(r, 2000)); + console.error('[*] Discovering extension targets via devtools /json/list...'); + await discoverExtensionTargets(cdpUrl, installedExtensions); + } + + // Only connect to CDP when cookies import is needed to reduce crash risk. 
+ if (cookiesFile) { + console.error(`[*] Connecting puppeteer to CDP for cookie import...`); + const browser = await puppeteer.connect({ + browserWSEndpoint: cdpUrl, + defaultViewport: null, + }); + browserInstance = browser; + + // Import cookies into Chrome profile at crawl start + await importCookiesFromFile(browser, cookiesFile, userDataDir); try { - const extPage = await browser.newPage(); - await extPage.goto('chrome://extensions', { waitUntil: 'domcontentloaded', timeout: 10000 }); - await new Promise(r => setTimeout(r, 2000)); - - // Parse extension info from the page - const extensionsFromPage = await extPage.evaluate(() => { - const extensions = []; - // Extensions manager uses shadow DOM - const manager = document.querySelector('extensions-manager'); - if (!manager || !manager.shadowRoot) return extensions; - - const itemList = manager.shadowRoot.querySelector('extensions-item-list'); - if (!itemList || !itemList.shadowRoot) return extensions; - - const items = itemList.shadowRoot.querySelectorAll('extensions-item'); - for (const item of items) { - const id = item.getAttribute('id'); - const nameEl = item.shadowRoot?.querySelector('#name'); - const name = nameEl?.textContent?.trim() || ''; - if (id && name) { - extensions.push({ id, name }); - } - } - return extensions; - }); - - console.error(`[*] Found ${extensionsFromPage.length} extension(s) on chrome://extensions`); - for (const e of extensionsFromPage) { - console.error(` - ${e.id}: "${e.name}"`); - } - - // Match extensions by name (strict matching) - for (const ext of installedExtensions) { - // Read the extension's manifest to get its display name - const manifestPath = path.join(ext.unpacked_path, 'manifest.json'); - if (fs.existsSync(manifestPath)) { - const manifest = JSON.parse(fs.readFileSync(manifestPath, 'utf-8')); - let manifestName = manifest.name || ''; - - // Resolve message placeholder (e.g., __MSG_extName__) - if (manifestName.startsWith('__MSG_') && manifestName.endsWith('__')) { 
- const msgKey = manifestName.slice(6, -2); // Extract key from __MSG_key__ - const defaultLocale = manifest.default_locale || 'en'; - const messagesPath = path.join(ext.unpacked_path, '_locales', defaultLocale, 'messages.json'); - if (fs.existsSync(messagesPath)) { - try { - const messages = JSON.parse(fs.readFileSync(messagesPath, 'utf-8')); - if (messages[msgKey] && messages[msgKey].message) { - manifestName = messages[msgKey].message; - } - } catch (e) { - console.error(`[!] Failed to read messages.json: ${e.message}`); - } - } - } - - console.error(`[*] Looking for match: ext.name="${ext.name}" manifest.name="${manifestName}"`); - - // Find matching extension from page by exact name match first - let match = extensionsFromPage.find(e => e.name === manifestName); - - // If no exact match, try case-insensitive exact match - if (!match) { - match = extensionsFromPage.find(e => - e.name.toLowerCase() === manifestName.toLowerCase() - ); - } - - if (match) { - ext.id = match.id; - console.error(`[+] Matched extension: ${ext.name} (${manifestName}) -> ${match.id}`); - } else { - console.error(`[!] No match found for: ${ext.name} (${manifestName})`); - } - } - } - - await extPage.close(); - } catch (e) { - console.error(`[!] 
Failed to get extensions from chrome://extensions: ${e.message}`); - } - - // Fallback: check browser targets - const targets = browser.targets(); - const builtinIds = [ - 'nkeimhogjdpnpccoofpliimaahmaaome', - 'fignfifoniblkonapihmkfakmlgkbkcf', - 'ahfgeienlihckogmohjhadlkjgocpleb', - 'mhjfbmdgcfjbbpaeojofohoefgiehjai', - ]; - const customExtTargets = targets.filter(t => { - const url = t.url(); - if (!url.startsWith('chrome-extension://')) return false; - const extId = url.split('://')[1].split('/')[0]; - return !builtinIds.includes(extId); - }); - - console.error(`[+] Found ${customExtTargets.length} custom extension target(s)`); - - for (const target of customExtTargets) { - const url = target.url(); - const extId = url.split('://')[1].split('/')[0]; - console.error(`[+] Extension target: ${extId} (${target.type()})`); - } - - if (customExtTargets.length === 0 && extensionPaths.length > 0) { - console.error(`[!] Warning: No custom extensions detected. Extension loading may have failed.`); - console.error(`[!] Make sure you are using Chromium, not Chrome (Chrome 137+ removed --load-extension support)`); - } + browser.disconnect(); + } catch (e) {} + browserInstance = null; + } else { + console.error('[*] Skipping puppeteer CDP connection (no cookies to import)'); } // Write extensions metadata with actual IDs diff --git a/archivebox/plugins/chrome/on_Snapshot__10_chrome_tab.bg.js b/archivebox/plugins/chrome/on_Snapshot__10_chrome_tab.bg.js index ca8e8232..4f3c6594 100755 --- a/archivebox/plugins/chrome/on_Snapshot__10_chrome_tab.bg.js +++ b/archivebox/plugins/chrome/on_Snapshot__10_chrome_tab.bg.js @@ -2,9 +2,8 @@ /** * Create a Chrome tab for this snapshot in the shared crawl Chrome session. * - * If a crawl-level Chrome session exists (from on_Crawl__90_chrome_launch.bg.js), - * this connects to it and creates a new tab. Otherwise, falls back to launching - * its own Chrome instance. 
+ * Connects to the crawl-level Chrome session (from on_Crawl__90_chrome_launch.bg.js) + * and creates a new tab. This hook does NOT launch its own Chrome instance. * * Usage: on_Snapshot__10_chrome_tab.bg.js --url= --snapshot-id= --crawl-id= * Output: Creates chrome/ directory under snapshot output dir with: @@ -15,11 +14,7 @@ * * Environment variables: * CRAWL_OUTPUT_DIR: Crawl output directory (to find crawl's Chrome session) - * CHROME_BINARY: Path to Chromium binary (for fallback) - * CHROME_RESOLUTION: Page resolution (default: 1440,2000) - * CHROME_USER_AGENT: User agent string (optional) - * CHROME_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: true) - * CHROME_HEADLESS: Run in headless mode (default: true) + * CHROME_BINARY: Path to Chromium binary (optional, for version info) * * This is a background hook that stays alive until SIGTERM so the tab * can be closed cleanly at the end of the snapshot run. @@ -27,24 +22,18 @@ const fs = require('fs'); const path = require('path'); -const { spawn } = require('child_process'); +const { execSync } = require('child_process'); // Add NODE_MODULES_DIR to module resolution paths if set if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR); const puppeteer = require('puppeteer'); -const { - findChromium, - getEnv, - getEnvBool, - parseResolution, - findFreePort, - waitForDebugPort, -} = require('./chrome_utils.js'); +const { getEnv, getEnvInt } = require('./chrome_utils.js'); // Extractor metadata const PLUGIN_NAME = 'chrome_tab'; const OUTPUT_DIR = '.'; // Hook already runs in chrome/ output directory const CHROME_SESSION_DIR = '.'; +const CHROME_SESSION_REQUIRED_ERROR = 'No Chrome session found (chrome plugin must run first)'; let finalStatus = 'failed'; let finalOutput = ''; @@ -118,61 +107,75 @@ process.on('SIGTERM', () => cleanup('SIGTERM')); process.on('SIGINT', () => cleanup('SIGINT')); // Try to find the crawl's Chrome session -function findCrawlChromeSession() { 
+function getCrawlChromeSession() { // Use CRAWL_OUTPUT_DIR env var set by get_config() in configset.py const crawlOutputDir = getEnv('CRAWL_OUTPUT_DIR', ''); - if (!crawlOutputDir) return null; + if (!crawlOutputDir) { + throw new Error(CHROME_SESSION_REQUIRED_ERROR); + } const crawlChromeDir = path.join(crawlOutputDir, 'chrome'); const cdpFile = path.join(crawlChromeDir, 'cdp_url.txt'); const pidFile = path.join(crawlChromeDir, 'chrome.pid'); - if (fs.existsSync(cdpFile) && fs.existsSync(pidFile)) { - try { - const cdpUrl = fs.readFileSync(cdpFile, 'utf-8').trim(); - const pid = parseInt(fs.readFileSync(pidFile, 'utf-8').trim(), 10); - - // Verify the process is still running - try { - process.kill(pid, 0); // Signal 0 = check if process exists - return { cdpUrl, pid }; - } catch (e) { - // Process not running - return null; - } - } catch (e) { - return null; - } + if (!fs.existsSync(cdpFile)) { + throw new Error(CHROME_SESSION_REQUIRED_ERROR); + } + if (!fs.existsSync(pidFile)) { + throw new Error(CHROME_SESSION_REQUIRED_ERROR); } - return null; + const cdpUrl = fs.readFileSync(cdpFile, 'utf-8').trim(); + const pid = parseInt(fs.readFileSync(pidFile, 'utf-8').trim(), 10); + if (!cdpUrl) { + throw new Error(CHROME_SESSION_REQUIRED_ERROR); + } + if (!pid || Number.isNaN(pid)) { + throw new Error(CHROME_SESSION_REQUIRED_ERROR); + } + + // Verify the process is still running + try { + process.kill(pid, 0); // Signal 0 = check if process exists + } catch (e) { + throw new Error(CHROME_SESSION_REQUIRED_ERROR); + } + + return { cdpUrl, pid }; +} + +async function waitForCrawlChromeSession(timeoutMs, intervalMs = 250) { + const startTime = Date.now(); + let lastError = null; + + while (Date.now() - startTime < timeoutMs) { + try { + return getCrawlChromeSession(); + } catch (e) { + lastError = e; + } + await new Promise(resolve => setTimeout(resolve, intervalMs)); + } + + if (lastError) { + throw lastError; + } + throw new Error(CHROME_SESSION_REQUIRED_ERROR); } // 
Create a new tab in an existing Chrome session async function createTabInExistingChrome(cdpUrl, url, pid) { - const resolution = getEnv('CHROME_RESOLUTION') || getEnv('RESOLUTION', '1440,2000'); - const userAgent = getEnv('CHROME_USER_AGENT') || getEnv('USER_AGENT', ''); - const { width, height } = parseResolution(resolution); - console.log(`[*] Connecting to existing Chrome session: ${cdpUrl}`); // Connect Puppeteer to the running Chrome const browser = await puppeteer.connect({ browserWSEndpoint: cdpUrl, - defaultViewport: { width, height }, + defaultViewport: null, }); // Create a new tab for this snapshot const page = await browser.newPage(); - // Set viewport - await page.setViewport({ width, height }); - - // Set user agent if specified - if (userAgent) { - await page.setUserAgent(userAgent); - } - // Get the page target ID const target = page.target(); const targetId = target._targetId; @@ -189,112 +192,6 @@ async function createTabInExistingChrome(cdpUrl, url, pid) { return { success: true, output: OUTPUT_DIR, cdpUrl, targetId, pid }; } -// Fallback: Launch a new Chrome instance for this snapshot -async function launchNewChrome(url, binary) { - const resolution = getEnv('CHROME_RESOLUTION') || getEnv('RESOLUTION', '1440,2000'); - const userAgent = getEnv('CHROME_USER_AGENT') || getEnv('USER_AGENT', ''); - const checkSsl = getEnvBool('CHROME_CHECK_SSL_VALIDITY', getEnvBool('CHECK_SSL_VALIDITY', true)); - const headless = getEnvBool('CHROME_HEADLESS', true); - - const { width, height } = parseResolution(resolution); - - // Find a free port for Chrome DevTools - const debugPort = await findFreePort(); - console.log(`[*] Launching new Chrome on port: ${debugPort}`); - - // Build Chrome arguments - const chromeArgs = [ - `--remote-debugging-port=${debugPort}`, - '--remote-debugging-address=127.0.0.1', - '--no-sandbox', - '--disable-setuid-sandbox', - '--disable-dev-shm-usage', - '--disable-gpu', - '--disable-sync', - '--no-first-run', - 
'--no-default-browser-check', - '--disable-default-apps', - '--disable-infobars', - '--disable-blink-features=AutomationControlled', - '--disable-component-update', - '--disable-domain-reliability', - '--disable-breakpad', - '--disable-background-networking', - '--disable-background-timer-throttling', - '--disable-backgrounding-occluded-windows', - '--disable-renderer-backgrounding', - '--disable-ipc-flooding-protection', - '--password-store=basic', - '--use-mock-keychain', - '--font-render-hinting=none', - '--force-color-profile=srgb', - `--window-size=${width},${height}`, - ...(headless ? ['--headless=new'] : []), - ...(checkSsl ? [] : ['--ignore-certificate-errors']), - 'about:blank', - ]; - - // Launch Chrome as a detached process (since no crawl-level Chrome exists) - const chromeProcess = spawn(binary, chromeArgs, { - detached: true, - stdio: ['ignore', 'ignore', 'ignore'], - }); - chromeProcess.unref(); - - const chromePid = chromeProcess.pid; - console.log(`[*] Launched Chrome (PID: ${chromePid}), waiting for debug port...`); - - // Write PID immediately for cleanup - fs.writeFileSync(path.join(OUTPUT_DIR, 'chrome.pid'), String(chromePid)); - - try { - // Wait for Chrome to be ready - const versionInfo = await waitForDebugPort(debugPort, 30000); - console.log(`[+] Chrome ready: ${versionInfo.Browser}`); - - const wsUrl = versionInfo.webSocketDebuggerUrl; - fs.writeFileSync(path.join(OUTPUT_DIR, 'cdp_url.txt'), wsUrl); - - // Connect Puppeteer to get page info - const browser = await puppeteer.connect({ - browserWSEndpoint: wsUrl, - defaultViewport: { width, height }, - }); - - let pages = await browser.pages(); - let page = pages[0]; - - if (!page) { - page = await browser.newPage(); - } - - await page.setViewport({ width, height }); - - if (userAgent) { - await page.setUserAgent(userAgent); - } - - const target = page.target(); - const targetId = target._targetId; - - fs.writeFileSync(path.join(OUTPUT_DIR, 'chrome.pid'), String(chromePid)); - 
fs.writeFileSync(path.join(OUTPUT_DIR, 'target_id.txt'), targetId); - fs.writeFileSync(path.join(OUTPUT_DIR, 'url.txt'), url); - - browser.disconnect(); - - return { success: true, output: OUTPUT_DIR, cdpUrl: wsUrl, targetId, pid: chromePid }; - - } catch (e) { - try { - process.kill(chromePid, 'SIGTERM'); - } catch (killErr) { - // Ignore - } - return { success: false, error: `${e.name}: ${e.message}` }; - } -} - async function main() { const args = parseArgs(); const url = args.url; @@ -312,33 +209,21 @@ async function main() { let version = ''; try { - const binary = findChromium(); - if (!binary) { - console.error('ERROR: Chromium binary not found'); - console.error('DEPENDENCY_NEEDED=chromium'); - console.error('BIN_PROVIDERS=puppeteer,env,playwright,apt,brew'); - console.error('INSTALL_HINT=npx @puppeteer/browsers install chromium@latest'); - process.exit(1); - } - // Get Chrome version try { - const { execSync } = require('child_process'); - version = execSync(`"${binary}" --version`, { encoding: 'utf8', timeout: 5000 }).trim().slice(0, 64); + const binary = getEnv('CHROME_BINARY', '').trim(); + if (binary) { + version = execSync(`"${binary}" --version`, { encoding: 'utf8', timeout: 5000 }).trim().slice(0, 64); + } } catch (e) { version = ''; } - // Try to use existing crawl Chrome session - const crawlSession = findCrawlChromeSession(); - let result; - - if (crawlSession) { - console.log(`[*] Found existing Chrome session from crawl ${crawlId}`); - result = await createTabInExistingChrome(crawlSession.cdpUrl, url, crawlSession.pid); - } else { - result = { success: false, error: 'No crawl Chrome session found (CRAWL_OUTPUT_DIR missing or chrome not running)' }; - } + // Try to use existing crawl Chrome session (wait for readiness) + const timeoutSeconds = getEnvInt('CHROME_TAB_TIMEOUT', getEnvInt('CHROME_TIMEOUT', getEnvInt('TIMEOUT', 60))); + const crawlSession = await waitForCrawlChromeSession(timeoutSeconds * 1000); + console.log(`[*] Found existing 
Chrome session from crawl ${crawlId}`); + const result = await createTabInExistingChrome(crawlSession.cdpUrl, url, crawlSession.pid); if (result.success) { status = 'succeeded'; diff --git a/archivebox/plugins/chrome/on_Snapshot__11_chrome_wait.js b/archivebox/plugins/chrome/on_Snapshot__11_chrome_wait.js index 219b58b9..dae2a3db 100644 --- a/archivebox/plugins/chrome/on_Snapshot__11_chrome_wait.js +++ b/archivebox/plugins/chrome/on_Snapshot__11_chrome_wait.js @@ -21,6 +21,7 @@ const { } = require('./chrome_utils.js'); const CHROME_SESSION_DIR = '.'; +const CHROME_SESSION_REQUIRED_ERROR = 'No Chrome session found (chrome plugin must run first)'; function parseArgs() { const args = {}; @@ -50,7 +51,7 @@ async function main() { const ready = await waitForChromeSession(CHROME_SESSION_DIR, timeoutMs); if (!ready) { - const error = `Chrome session not ready after ${timeoutSeconds}s (cdp_url.txt/target_id.txt missing)`; + const error = CHROME_SESSION_REQUIRED_ERROR; console.error(`[chrome_wait] ERROR: ${error}`); console.log(JSON.stringify({ type: 'ArchiveResult', status: 'failed', output_str: error })); process.exit(1); @@ -59,7 +60,7 @@ async function main() { const cdpUrl = readCdpUrl(CHROME_SESSION_DIR); const targetId = readTargetId(CHROME_SESSION_DIR); if (!cdpUrl || !targetId) { - const error = 'Chrome session files incomplete (cdp_url.txt/target_id.txt missing)'; + const error = CHROME_SESSION_REQUIRED_ERROR; console.error(`[chrome_wait] ERROR: ${error}`); console.log(JSON.stringify({ type: 'ArchiveResult', status: 'failed', output_str: error })); process.exit(1); diff --git a/archivebox/plugins/chrome/on_Snapshot__30_chrome_navigate.js b/archivebox/plugins/chrome/on_Snapshot__30_chrome_navigate.js index 242c9853..33c515ec 100644 --- a/archivebox/plugins/chrome/on_Snapshot__30_chrome_navigate.js +++ b/archivebox/plugins/chrome/on_Snapshot__30_chrome_navigate.js @@ -24,6 +24,7 @@ const puppeteer = require('puppeteer'); const PLUGIN_NAME = 'chrome_navigate'; const 
CHROME_SESSION_DIR = '.'; const OUTPUT_DIR = '.'; +const CHROME_SESSION_REQUIRED_ERROR = 'No Chrome session found (chrome plugin must run first)'; function parseArgs() { const args = {}; @@ -175,13 +176,13 @@ async function main() { // Wait for chrome tab to be open (up to 60s) const tabOpen = await waitForChromeTabOpen(60000); if (!tabOpen) { - console.error('ERROR: Chrome tab not open after 60s (chrome_tab must run first)'); + console.error(`ERROR: ${CHROME_SESSION_REQUIRED_ERROR}`); process.exit(1); } const cdpUrl = getCdpUrl(); if (!cdpUrl) { - console.error('ERROR: Chrome CDP URL not found (chrome tab not initialized)'); + console.error(`ERROR: ${CHROME_SESSION_REQUIRED_ERROR}`); process.exit(1); } diff --git a/archivebox/plugins/chrome/tests/chrome_test_helpers.py b/archivebox/plugins/chrome/tests/chrome_test_helpers.py index 8be2bb3c..3e37ce26 100644 --- a/archivebox/plugins/chrome/tests/chrome_test_helpers.py +++ b/archivebox/plugins/chrome/tests/chrome_test_helpers.py @@ -229,6 +229,33 @@ def get_extensions_dir() -> str: return str(Path(data_dir) / 'personas' / persona / 'chrome_extensions') +def link_puppeteer_cache(lib_dir: Path) -> None: + """Best-effort symlink from system Puppeteer cache into test lib_dir. + + Avoids repeated Chromium downloads across tests by reusing the + default Puppeteer cache directory. + """ + cache_dir = lib_dir / 'puppeteer' + cache_dir.mkdir(parents=True, exist_ok=True) + + candidates = [ + Path.home() / 'Library' / 'Caches' / 'puppeteer', + Path.home() / '.cache' / 'puppeteer', + ] + for src_root in candidates: + if not src_root.exists(): + continue + for item in src_root.iterdir(): + dst = cache_dir / item.name + if dst.exists(): + continue + try: + os.symlink(item, dst, target_is_directory=item.is_dir()) + except Exception: + # Best-effort only; if symlink fails, leave as-is. + pass + + def find_chromium(data_dir: Optional[str] = None) -> Optional[str]: """Find the Chromium binary path. 
@@ -632,9 +659,8 @@ def setup_test_env(tmpdir: Path) -> dict: tmpdir: Base temporary directory for the test Returns: - Environment dict with all paths set, or pytest.skip() if Chrome install fails + Environment dict with all paths set. """ - import pytest # Determine machine type (matches archivebox.config.paths.get_machine_type()) machine = platform.machine().lower() @@ -688,7 +714,7 @@ def setup_test_env(tmpdir: Path) -> dict: try: install_chromium_with_hooks(env) except RuntimeError as e: - pytest.skip(str(e)) + raise RuntimeError(str(e)) return env @@ -873,6 +899,7 @@ def chrome_session( lib_dir = data_dir / 'lib' / machine_type npm_dir = lib_dir / 'npm' node_modules_dir = npm_dir / 'node_modules' + puppeteer_cache_dir = lib_dir / 'puppeteer' # Create lib structure for puppeteer installation node_modules_dir.mkdir(parents=True, exist_ok=True) @@ -893,8 +920,12 @@ def chrome_session( 'NODE_PATH': str(node_modules_dir), 'NPM_BIN_DIR': str(npm_dir / '.bin'), 'CHROME_HEADLESS': 'true', + 'PUPPETEER_CACHE_DIR': str(puppeteer_cache_dir), }) + # Reuse system Puppeteer cache to avoid redundant Chromium downloads + link_puppeteer_cache(lib_dir) + # Install Chromium via npm + puppeteer hooks using normal Binary flow install_chromium_with_hooks(env) diff --git a/archivebox/plugins/chrome/tests/test_chrome.py b/archivebox/plugins/chrome/tests/test_chrome.py index 554a2539..33d328c9 100644 --- a/archivebox/plugins/chrome/tests/test_chrome.py +++ b/archivebox/plugins/chrome/tests/test_chrome.py @@ -125,10 +125,10 @@ def ensure_chromium_and_puppeteer_installed(tmp_path_factory): try: chromium_binary = install_chromium_with_hooks(env) except RuntimeError as e: - pytest.skip(str(e)) + raise RuntimeError(str(e)) if not chromium_binary: - pytest.skip("Chromium not found after install") + raise RuntimeError("Chromium not found after install") os.environ['CHROME_BINARY'] = chromium_binary for key in ('NODE_MODULES_DIR', 'NODE_PATH', 'PATH'): diff --git 
a/archivebox/plugins/consolelog/tests/test_consolelog.py b/archivebox/plugins/consolelog/tests/test_consolelog.py index 7d590aaa..ab851d15 100644 --- a/archivebox/plugins/consolelog/tests/test_consolelog.py +++ b/archivebox/plugins/consolelog/tests/test_consolelog.py @@ -13,27 +13,18 @@ import tempfile import time from pathlib import Path -import pytest from django.test import TestCase # Import chrome test helpers sys.path.insert(0, str(Path(__file__).parent.parent.parent / 'chrome' / 'tests')) from chrome_test_helpers import ( chrome_session, - get_test_env, + CHROME_NAVIGATE_HOOK, get_plugin_dir, get_hook_script, ) -def chrome_available() -> bool: - """Check if Chrome/Chromium is available.""" - for name in ['chromium', 'chromium-browser', 'google-chrome', 'chrome']: - if shutil.which(name): - return True - return False - - # Get the path to the consolelog hook PLUGIN_DIR = get_plugin_dir(__file__) CONSOLELOG_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_consolelog.*') @@ -48,7 +39,6 @@ class TestConsolelogPlugin(TestCase): self.assertTrue(CONSOLELOG_HOOK.exists(), f"Hook not found: {CONSOLELOG_HOOK}") -@pytest.mark.skipif(not chrome_available(), reason="Chrome not installed") class TestConsolelogWithChrome(TestCase): """Integration tests for consolelog plugin with Chrome.""" @@ -62,68 +52,75 @@ class TestConsolelogWithChrome(TestCase): def test_consolelog_captures_output(self): """Consolelog hook should capture console output from page.""" - test_url = 'https://example.com' + test_url = 'data:text/html,' snapshot_id = 'test-consolelog-snapshot' - try: - with chrome_session( - self.temp_dir, - crawl_id='test-consolelog-crawl', - snapshot_id=snapshot_id, - test_url=test_url, - navigate=True, - timeout=30, - ) as (chrome_process, chrome_pid, snapshot_chrome_dir, env): - # Use the environment from chrome_session (already has CHROME_HEADLESS=true) + with chrome_session( + self.temp_dir, + crawl_id='test-consolelog-crawl', + snapshot_id=snapshot_id, + 
test_url=test_url, + navigate=False, + timeout=30, + ) as (chrome_process, chrome_pid, snapshot_chrome_dir, env): + console_dir = snapshot_chrome_dir.parent / 'consolelog' + console_dir.mkdir(exist_ok=True) + # Run consolelog hook with the active Chrome session (background hook) + result = subprocess.Popen( + ['node', str(CONSOLELOG_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'], + cwd=str(console_dir), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + env=env + ) - # Run consolelog hook with the active Chrome session (background hook) - result = subprocess.Popen( - ['node', str(CONSOLELOG_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'], - cwd=str(snapshot_chrome_dir), - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - env=env - ) + nav_result = subprocess.run( + ['node', str(CHROME_NAVIGATE_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'], + cwd=str(snapshot_chrome_dir), + capture_output=True, + text=True, + timeout=120, + env=env + ) + self.assertEqual(nav_result.returncode, 0, f"Navigation failed: {nav_result.stderr}") - # Check for output file - console_output = snapshot_chrome_dir / 'console.jsonl' + # Check for output file + console_output = console_dir / 'console.jsonl' - # Allow it to run briefly, then terminate (background hook) - time.sleep(3) - if result.poll() is None: - result.terminate() - try: - stdout, stderr = result.communicate(timeout=5) - except subprocess.TimeoutExpired: - result.kill() - stdout, stderr = result.communicate() - else: + # Allow it to run briefly, then terminate (background hook) + for _ in range(10): + if console_output.exists() and console_output.stat().st_size > 0: + break + time.sleep(1) + if result.poll() is None: + result.terminate() + try: + stdout, stderr = result.communicate(timeout=5) + except subprocess.TimeoutExpired: + result.kill() stdout, stderr = result.communicate() + else: + stdout, stderr = result.communicate() - # At minimum, verify no crash 
- self.assertNotIn('Traceback', stderr) + # At minimum, verify no crash + self.assertNotIn('Traceback', stderr) - # If output file exists, verify it's valid JSONL - if console_output.exists(): - with open(console_output) as f: - content = f.read().strip() - if content: - for line in content.split('\n'): - if line.strip(): - try: - record = json.loads(line) - # Verify structure - self.assertIn('timestamp', record) - self.assertIn('type', record) - except json.JSONDecodeError: - pass # Some lines may be incomplete - - except RuntimeError as e: - if 'Chrome' in str(e) or 'CDP' in str(e): - self.skipTest(f"Chrome session setup failed: {e}") - raise + # If output file exists, verify it's valid JSONL and has output + if console_output.exists(): + with open(console_output) as f: + content = f.read().strip() + self.assertTrue(content, "Console output should not be empty") + for line in content.split('\n'): + if line.strip(): + try: + record = json.loads(line) + # Verify structure + self.assertIn('timestamp', record) + self.assertIn('type', record) + except json.JSONDecodeError: + pass # Some lines may be incomplete if __name__ == '__main__': diff --git a/archivebox/plugins/dns/tests/test_dns.py b/archivebox/plugins/dns/tests/test_dns.py new file mode 100644 index 00000000..ac10a478 --- /dev/null +++ b/archivebox/plugins/dns/tests/test_dns.py @@ -0,0 +1,126 @@ +""" +Tests for the DNS plugin. + +Tests the real DNS hook with an actual URL to verify +DNS resolution capture. 
+""" + +import json +import shutil +import subprocess +import sys +import tempfile +import time +from pathlib import Path + +from django.test import TestCase + +# Import chrome test helpers +sys.path.insert(0, str(Path(__file__).parent.parent.parent / 'chrome' / 'tests')) +from chrome_test_helpers import ( + chrome_session, + CHROME_NAVIGATE_HOOK, + get_plugin_dir, + get_hook_script, +) + + +# Get the path to the DNS hook +PLUGIN_DIR = get_plugin_dir(__file__) +DNS_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_dns.*') + + +class TestDNSPlugin(TestCase): + """Test the DNS plugin.""" + + def test_dns_hook_exists(self): + """DNS hook script should exist.""" + self.assertIsNotNone(DNS_HOOK, "DNS hook not found in plugin directory") + self.assertTrue(DNS_HOOK.exists(), f"Hook not found: {DNS_HOOK}") + + +class TestDNSWithChrome(TestCase): + """Integration tests for DNS plugin with Chrome.""" + + def setUp(self): + """Set up test environment.""" + self.temp_dir = Path(tempfile.mkdtemp()) + + def tearDown(self): + """Clean up.""" + shutil.rmtree(self.temp_dir, ignore_errors=True) + + def test_dns_records_captured(self): + """DNS hook should capture DNS records from a real URL.""" + test_url = 'https://example.com' + snapshot_id = 'test-dns-snapshot' + + with chrome_session( + self.temp_dir, + crawl_id='test-dns-crawl', + snapshot_id=snapshot_id, + test_url=test_url, + navigate=False, + timeout=30, + ) as (_process, _pid, snapshot_chrome_dir, env): + dns_dir = snapshot_chrome_dir.parent / 'dns' + dns_dir.mkdir(exist_ok=True) + + result = subprocess.Popen( + ['node', str(DNS_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'], + cwd=str(dns_dir), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + env=env + ) + + nav_result = subprocess.run( + ['node', str(CHROME_NAVIGATE_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'], + cwd=str(snapshot_chrome_dir), + capture_output=True, + text=True, + timeout=120, + env=env + ) + 
self.assertEqual(nav_result.returncode, 0, f"Navigation failed: {nav_result.stderr}")
+
+            dns_output = dns_dir / 'dns.jsonl'
+            for _ in range(30):
+                if dns_output.exists() and dns_output.stat().st_size > 0:
+                    break
+                time.sleep(1)
+
+            if result.poll() is None:
+                result.terminate()
+                try:
+                    stdout, stderr = result.communicate(timeout=5)
+                except subprocess.TimeoutExpired:
+                    result.kill()
+                    stdout, stderr = result.communicate()
+            else:
+                stdout, stderr = result.communicate()
+
+            self.assertNotIn('Traceback', stderr)
+
+            self.assertTrue(dns_output.exists(), "dns.jsonl not created")
+            content = dns_output.read_text().strip()
+            self.assertTrue(content, "DNS output should not be empty")
+
+            records = []
+            for line in content.split('\n'):
+                line = line.strip()
+                if not line:
+                    continue
+                try:
+                    records.append(json.loads(line))
+                except json.JSONDecodeError:
+                    pass
+
+            self.assertTrue(records, "No DNS records parsed")
+            has_ip_record = any(r.get('hostname') and r.get('ip') for r in records)
+            self.assertTrue(has_ip_record, f"No DNS record with hostname + ip: {records}")
+
+
+if __name__ == '__main__':
+    import pytest; pytest.main([__file__, '-v'])
diff --git a/archivebox/plugins/dom/on_Snapshot__53_dom.js b/archivebox/plugins/dom/on_Snapshot__53_dom.js
index f62662f8..db8a2420 100644
--- a/archivebox/plugins/dom/on_Snapshot__53_dom.js
+++ b/archivebox/plugins/dom/on_Snapshot__53_dom.js
@@ -2,19 +2,12 @@
 /**
  * Dump the DOM of a URL using Chrome/Puppeteer.
  *
- * If a Chrome session exists (from chrome plugin), connects to it via CDP.
- * Otherwise launches a new Chrome instance.
+ * Requires a Chrome session (from chrome plugin) and connects to it via CDP.
* * Usage: on_Snapshot__53_dom.js --url= --snapshot-id= * Output: Writes dom/output.html * * Environment variables: - * CHROME_BINARY: Path to Chrome/Chromium binary - * CHROME_TIMEOUT: Timeout in seconds (default: 60) - * CHROME_RESOLUTION: Page resolution (default: 1440,2000) - * CHROME_USER_AGENT: User agent string (optional) - * CHROME_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: true) - * CHROME_HEADLESS: Run in headless mode (default: true) * DOM_ENABLED: Enable DOM extraction (default: true) */ @@ -24,11 +17,7 @@ const path = require('path'); if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR); const { - findChromium, - getEnv, getEnvBool, - getEnvInt, - parseResolution, parseArgs, readCdpUrl, } = require('../chrome/chrome_utils.js'); @@ -86,81 +75,30 @@ async function waitForChromeTabLoaded(timeoutMs = 60000) { } async function dumpDom(url) { - const timeout = (getEnvInt('CHROME_TIMEOUT') || getEnvInt('TIMEOUT', 60)) * 1000; - const resolution = getEnv('CHROME_RESOLUTION') || getEnv('RESOLUTION', '1440,2000'); - const userAgent = getEnv('CHROME_USER_AGENT') || getEnv('USER_AGENT', ''); - const checkSsl = getEnvBool('CHROME_CHECK_SSL_VALIDITY', getEnvBool('CHECK_SSL_VALIDITY', true)); - const headless = getEnvBool('CHROME_HEADLESS', true); - - const { width, height } = parseResolution(resolution); - // Output directory is current directory (hook already runs in output dir) const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE); let browser = null; let page = null; - let connectedToSession = false; try { - // Try to connect to existing Chrome session + // Connect to existing Chrome session (required) const cdpUrl = readCdpUrl(CHROME_SESSION_DIR); - if (cdpUrl) { - try { - browser = await puppeteer.connect({ - browserWSEndpoint: cdpUrl, - defaultViewport: { width, height }, - }); - connectedToSession = true; - - // Get existing pages or create new one - const pages = await browser.pages(); - page = 
pages.find(p => p.url().startsWith('http')) || pages[0]; - - if (!page) { - page = await browser.newPage(); - } - - // Set viewport on the page - await page.setViewport({ width, height }); - - } catch (e) { - console.error(`Failed to connect to CDP session: ${e.message}`); - browser = null; - } + if (!cdpUrl) { + return { success: false, error: 'No Chrome session found (chrome plugin must run first)' }; } - // Fall back to launching new browser - if (!browser) { - const executablePath = findChromium(); - if (!executablePath) { - return { success: false, error: 'Chrome binary not found' }; - } + browser = await puppeteer.connect({ + browserWSEndpoint: cdpUrl, + defaultViewport: null, + }); - browser = await puppeteer.launch({ - executablePath, - headless: headless ? 'new' : false, - args: [ - '--no-sandbox', - '--disable-setuid-sandbox', - '--disable-dev-shm-usage', - '--disable-gpu', - `--window-size=${width},${height}`, - ...(checkSsl ? [] : ['--ignore-certificate-errors']), - ], - defaultViewport: { width, height }, - }); + // Get existing pages or create new one + const pages = await browser.pages(); + page = pages.find(p => p.url().startsWith('http')) || pages[0]; + if (!page) { page = await browser.newPage(); - - // Navigate to URL (only if we launched fresh browser) - if (userAgent) { - await page.setUserAgent(userAgent); - } - - await page.goto(url, { - waitUntil: 'networkidle2', - timeout, - }); } // Get the full DOM content @@ -176,9 +114,8 @@ async function dumpDom(url) { } catch (e) { return { success: false, error: `${e.name}: ${e.message}` }; } finally { - // Only close browser if we launched it (not if we connected to session) - if (browser && !connectedToSession) { - await browser.close(); + if (browser) { + browser.disconnect(); } } } @@ -206,14 +143,15 @@ async function main() { process.exit(0); } - // Only wait for page load if using shared Chrome session const cdpUrl = readCdpUrl(CHROME_SESSION_DIR); - if (cdpUrl) { - // Wait for page to be fully 
loaded - const pageLoaded = await waitForChromeTabLoaded(60000); - if (!pageLoaded) { - throw new Error('Page not loaded after 60s (chrome_navigate must complete first)'); - } + if (!cdpUrl) { + throw new Error('No Chrome session found (chrome plugin must run first)'); + } + + // Wait for page to be fully loaded + const pageLoaded = await waitForChromeTabLoaded(60000); + if (!pageLoaded) { + throw new Error('Page not loaded after 60s (chrome_navigate must complete first)'); } const result = await dumpDom(url); diff --git a/archivebox/plugins/dom/tests/test_dom.py b/archivebox/plugins/dom/tests/test_dom.py index 7312a72f..2d98d873 100644 --- a/archivebox/plugins/dom/tests/test_dom.py +++ b/archivebox/plugins/dom/tests/test_dom.py @@ -28,6 +28,7 @@ from archivebox.plugins.chrome.tests.chrome_test_helpers import ( LIB_DIR, NODE_MODULES_DIR, PLUGINS_ROOT, + chrome_session, ) @@ -61,15 +62,19 @@ def test_extracts_dom_from_example_com(): with tempfile.TemporaryDirectory() as tmpdir: tmpdir = Path(tmpdir) - # Run DOM extraction hook - result = subprocess.run( - ['node', str(DOM_HOOK), f'--url={TEST_URL}', '--snapshot-id=test789'], - cwd=tmpdir, - capture_output=True, - text=True, - timeout=120 - , - env=get_test_env()) + with chrome_session(tmpdir, test_url=TEST_URL) as (_process, _pid, snapshot_chrome_dir, env): + dom_dir = snapshot_chrome_dir.parent / 'dom' + dom_dir.mkdir(exist_ok=True) + + # Run DOM extraction hook + result = subprocess.run( + ['node', str(DOM_HOOK), f'--url={TEST_URL}', '--snapshot-id=test789'], + cwd=dom_dir, + capture_output=True, + text=True, + timeout=120, + env=env + ) assert result.returncode == 0, f"Extraction failed: {result.stderr}" @@ -90,7 +95,7 @@ def test_extracts_dom_from_example_com(): assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}" # Verify filesystem output (hook writes directly to working dir) - dom_file = tmpdir / 'output.html' + dom_file = dom_dir / 'output.html' assert dom_file.exists(), 
f"output.html not created. Files: {list(tmpdir.iterdir())}" # Verify HTML content contains REAL example.com text diff --git a/archivebox/plugins/favicon/on_Snapshot__11_favicon.bg.py b/archivebox/plugins/favicon/on_Snapshot__11_favicon.bg.py index cb62dfe3..fc4604f4 100644 --- a/archivebox/plugins/favicon/on_Snapshot__11_favicon.bg.py +++ b/archivebox/plugins/favicon/on_Snapshot__11_favicon.bg.py @@ -128,8 +128,6 @@ def main(url: str, snapshot_id: str): success, output, error = get_favicon(url) if success: status = 'succeeded' - elif error == 'No favicon found': - status = 'skipped' else: status = 'failed' @@ -148,7 +146,7 @@ def main(url: str, snapshot_id: str): } print(json.dumps(result)) - sys.exit(0 if status in ('succeeded', 'skipped') else 1) + sys.exit(0 if status == 'succeeded' else 1) if __name__ == '__main__': diff --git a/archivebox/plugins/favicon/templates/card.html b/archivebox/plugins/favicon/templates/card.html index 8555e174..c5df1617 100644 --- a/archivebox/plugins/favicon/templates/card.html +++ b/archivebox/plugins/favicon/templates/card.html @@ -3,7 +3,7 @@ {% if output_path %} Favicon {% endif %} diff --git a/archivebox/plugins/forumdl/on_Crawl__25_forumdl_install.py b/archivebox/plugins/forumdl/on_Crawl__25_forumdl_install.py index 73a72a24..b30ca715 100755 --- a/archivebox/plugins/forumdl/on_Crawl__25_forumdl_install.py +++ b/archivebox/plugins/forumdl/on_Crawl__25_forumdl_install.py @@ -48,7 +48,9 @@ def main(): 'pip': { 'packages': [ '--no-deps', + '--prefer-binary', 'forum-dl', + 'chardet==5.2.0', 'pydantic', 'pydantic-core', 'typing-extensions', diff --git a/archivebox/plugins/forumdl/tests/test_forumdl.py b/archivebox/plugins/forumdl/tests/test_forumdl.py index f965d898..18a692c9 100644 --- a/archivebox/plugins/forumdl/tests/test_forumdl.py +++ b/archivebox/plugins/forumdl/tests/test_forumdl.py @@ -13,6 +13,7 @@ Tests verify: """ import json +import os import subprocess import sys import tempfile @@ -28,6 +29,7 @@ TEST_URL = 
'https://example.com' # Module-level cache for binary path _forumdl_binary_path = None +_forumdl_lib_root = None def get_forumdl_binary_path(): """Get the installed forum-dl binary path from cache or by running installation.""" @@ -50,11 +52,48 @@ def get_forumdl_binary_path(): except Exception: pass - # If not found, try to install via pip - pip_hook = PLUGINS_ROOT / 'pip' / 'on_Binary__install_using_pip_provider.py' + # If not found, try to install via pip using the crawl hook overrides + pip_hook = PLUGINS_ROOT / 'pip' / 'on_Binary__11_pip_install.py' + crawl_hook = PLUGIN_DIR / 'on_Crawl__25_forumdl_install.py' if pip_hook.exists(): binary_id = str(uuid.uuid4()) machine_id = str(uuid.uuid4()) + overrides = None + + if crawl_hook.exists(): + crawl_result = subprocess.run( + [sys.executable, str(crawl_hook)], + capture_output=True, + text=True, + timeout=30, + ) + for crawl_line in crawl_result.stdout.strip().split('\n'): + if crawl_line.strip().startswith('{'): + try: + crawl_record = json.loads(crawl_line) + if crawl_record.get('type') == 'Binary' and crawl_record.get('name') == 'forum-dl': + overrides = crawl_record.get('overrides') + break + except json.JSONDecodeError: + continue + + # Create a persistent temp LIB_DIR for the pip provider + import platform + global _forumdl_lib_root + if not _forumdl_lib_root: + _forumdl_lib_root = tempfile.mkdtemp(prefix='forumdl-lib-') + machine = platform.machine().lower() + system = platform.system().lower() + if machine in ('arm64', 'aarch64'): + machine = 'arm64' + elif machine in ('x86_64', 'amd64'): + machine = 'x86_64' + machine_type = f"{machine}-{system}" + lib_dir = Path(_forumdl_lib_root) / 'lib' / machine_type + lib_dir.mkdir(parents=True, exist_ok=True) + env = os.environ.copy() + env['LIB_DIR'] = str(lib_dir) + env['DATA_DIR'] = str(Path(_forumdl_lib_root) / 'data') cmd = [ sys.executable, str(pip_hook), @@ -62,12 +101,15 @@ def get_forumdl_binary_path(): '--machine-id', machine_id, '--name', 'forum-dl' ] + 
if overrides: + cmd.append(f'--overrides={json.dumps(overrides)}') install_result = subprocess.run( cmd, capture_output=True, text=True, - timeout=300 + timeout=300, + env=env, ) # Parse Binary from pip installation @@ -212,8 +254,7 @@ def test_real_forum_url(): import os binary_path = get_forumdl_binary_path() - if not binary_path: - pytest.skip("forum-dl binary not available") + assert binary_path, "forum-dl binary not available" assert Path(binary_path).is_file(), f"Binary must be a valid file: {binary_path}" with tempfile.TemporaryDirectory() as tmpdir: diff --git a/archivebox/plugins/git/tests/test_git.py b/archivebox/plugins/git/tests/test_git.py index 7701039a..c7449495 100644 --- a/archivebox/plugins/git/tests/test_git.py +++ b/archivebox/plugins/git/tests/test_git.py @@ -19,7 +19,7 @@ import pytest PLUGIN_DIR = Path(__file__).parent.parent GIT_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_git.*'), None) -TEST_URL = 'https://github.com/example/repo.git' +TEST_URL = 'https://github.com/ArchiveBox/abx-pkg.git' def test_hook_script_exists(): assert GIT_HOOK.exists() @@ -31,10 +31,7 @@ def test_verify_deps_with_abx_pkg(): git_binary = Binary(name='git', binproviders=[AptProvider(), BrewProvider(), EnvProvider()]) git_loaded = git_binary.load() - if git_loaded and git_loaded.abspath: - assert True, "git is available" - else: - pass + assert git_loaded and git_loaded.abspath, "git is required for git plugin tests" def test_reports_missing_git(): with tempfile.TemporaryDirectory() as tmpdir: @@ -48,9 +45,7 @@ def test_reports_missing_git(): assert 'DEPENDENCY_NEEDED' in combined or 'git' in combined.lower() or 'ERROR=' in combined def test_handles_non_git_url(): - pass - if not shutil.which('git'): - pass + assert shutil.which('git'), "git binary not available" with tempfile.TemporaryDirectory() as tmpdir: result = subprocess.run( @@ -83,8 +78,7 @@ def test_real_git_repo(): """Test that git can clone a real GitHub repository.""" import os - if not 
shutil.which('git'): - pytest.skip("git binary not available") + assert shutil.which('git'), "git binary not available" with tempfile.TemporaryDirectory() as tmpdir: tmpdir = Path(tmpdir) diff --git a/archivebox/plugins/merkletree/config.json b/archivebox/plugins/hashes/config.json similarity index 78% rename from archivebox/plugins/merkletree/config.json rename to archivebox/plugins/hashes/config.json index 6070a026..b57db14a 100644 --- a/archivebox/plugins/merkletree/config.json +++ b/archivebox/plugins/hashes/config.json @@ -3,13 +3,13 @@ "type": "object", "additionalProperties": false, "properties": { - "MERKLETREE_ENABLED": { + "HASHES_ENABLED": { "type": "boolean", "default": true, - "x-aliases": ["SAVE_MERKLETREE", "USE_MERKLETREE"], + "x-aliases": ["SAVE_HASHES", "USE_HASHES"], "description": "Enable merkle tree hash generation" }, - "MERKLETREE_TIMEOUT": { + "HASHES_TIMEOUT": { "type": "integer", "default": 30, "minimum": 5, diff --git a/archivebox/plugins/merkletree/on_Snapshot__93_merkletree.py b/archivebox/plugins/hashes/on_Snapshot__93_hashes.py similarity index 84% rename from archivebox/plugins/merkletree/on_Snapshot__93_merkletree.py rename to archivebox/plugins/hashes/on_Snapshot__93_hashes.py index 164a0f6a..2738d85f 100755 --- a/archivebox/plugins/merkletree/on_Snapshot__93_merkletree.py +++ b/archivebox/plugins/hashes/on_Snapshot__93_hashes.py @@ -1,16 +1,16 @@ #!/usr/bin/env python3 """ -Create a Merkle tree of all archived outputs. +Create a hashed Merkle tree of all archived outputs. This plugin runs after all extractors complete (priority 93) and generates -a cryptographic Merkle tree of all files in the snapshot directory. +a cryptographic Merkle hash tree of all files in the snapshot directory. 
-Output: merkletree.json containing root_hash, tree structure, file list, metadata +Output: hashes.json containing root_hash, tree structure, file list, metadata -Usage: on_Snapshot__93_merkletree.py --url= --snapshot-id= +Usage: on_Snapshot__93_hashes.py --url= --snapshot-id= Environment variables: - SAVE_MERKLETREE: Enable merkle tree generation (default: true) + SAVE_HASHES: Enable hash merkle tree generation (default: true) DATA_DIR: ArchiveBox data directory ARCHIVE_DIR: Archive output directory """ @@ -45,7 +45,7 @@ def sha256_data(data: bytes) -> str: def collect_files(snapshot_dir: Path, exclude_dirs: Optional[List[str]] = None) -> List[Tuple[Path, str, int]]: """Recursively collect all files in snapshot directory.""" - exclude_dirs = exclude_dirs or ['merkletree', '.git', '__pycache__'] + exclude_dirs = exclude_dirs or ['hashes', '.git', '__pycache__'] files = [] for root, dirs, filenames in os.walk(snapshot_dir): @@ -94,8 +94,8 @@ def build_merkle_tree(file_hashes: List[str]) -> Tuple[str, List[List[str]]]: return root_hash, tree_levels -def create_merkle_tree(snapshot_dir: Path) -> Dict[str, Any]: - """Create a complete Merkle tree of all files in snapshot directory.""" +def create_hashes(snapshot_dir: Path) -> Dict[str, Any]: + """Create a complete Merkle hash tree of all files in snapshot directory.""" files = collect_files(snapshot_dir) file_hashes = [file_hash for _, file_hash, _ in files] root_hash, tree_levels = build_merkle_tree(file_hashes) @@ -132,14 +132,14 @@ def main(url: str, snapshot_id: str): try: # Check if enabled - save_merkletree = os.getenv('MERKLETREE_ENABLED', 'true').lower() in ('true', '1', 'yes', 'on') + save_hashes = os.getenv('HASHES_ENABLED', 'true').lower() in ('true', '1', 'yes', 'on') - if not save_merkletree: + if not save_hashes: status = 'skipped' - click.echo(json.dumps({'status': status, 'output': 'MERKLETREE_ENABLED=false'})) + click.echo(json.dumps({'status': status, 'output': 'HASHES_ENABLED=false'})) sys.exit(0) - 
# Working directory is the extractor output dir (e.g., /merkletree/) + # Working directory is the extractor output dir (e.g., /hashes/) # Parent is the snapshot directory output_dir = Path.cwd() snapshot_dir = output_dir.parent @@ -149,17 +149,17 @@ def main(url: str, snapshot_id: str): # Ensure output directory exists output_dir.mkdir(exist_ok=True) - output_path = output_dir / 'merkletree.json' + output_path = output_dir / 'hashes.json' # Generate Merkle tree - merkle_data = create_merkle_tree(snapshot_dir) + merkle_data = create_hashes(snapshot_dir) # Write output with open(output_path, 'w', encoding='utf-8') as f: json.dump(merkle_data, f, indent=2) status = 'succeeded' - output = 'merkletree.json' + output = 'hashes.json' root_hash = merkle_data['root_hash'] file_count = merkle_data['metadata']['file_count'] diff --git a/archivebox/plugins/hashes/templates/icon.html b/archivebox/plugins/hashes/templates/icon.html new file mode 100644 index 00000000..211930f0 --- /dev/null +++ b/archivebox/plugins/hashes/templates/icon.html @@ -0,0 +1 @@ + diff --git a/archivebox/plugins/merkletree/tests/test_merkletree.py b/archivebox/plugins/hashes/tests/test_hashes.py similarity index 71% rename from archivebox/plugins/merkletree/tests/test_merkletree.py rename to archivebox/plugins/hashes/tests/test_hashes.py index ebdd5808..0eb7d7f1 100644 --- a/archivebox/plugins/merkletree/tests/test_merkletree.py +++ b/archivebox/plugins/hashes/tests/test_hashes.py @@ -1,5 +1,5 @@ """ -Tests for the merkletree plugin. +Tests for the hashes plugin. Tests the real merkle tree generation with actual files. 
""" @@ -15,27 +15,27 @@ import pytest from django.test import TestCase -# Get the path to the merkletree hook +# Get the path to the hashes hook PLUGIN_DIR = Path(__file__).parent.parent -MERKLETREE_HOOK = PLUGIN_DIR / 'on_Snapshot__93_merkletree.py' +HASHES_HOOK = PLUGIN_DIR / 'on_Snapshot__93_hashes.py' -class TestMerkletreePlugin(TestCase): - """Test the merkletree plugin.""" +class TestHashesPlugin(TestCase): + """Test the hashes plugin.""" - def test_merkletree_hook_exists(self): - """Merkletree hook script should exist.""" - self.assertTrue(MERKLETREE_HOOK.exists(), f"Hook not found: {MERKLETREE_HOOK}") + def test_hashes_hook_exists(self): + """Hashes hook script should exist.""" + self.assertTrue(HASHES_HOOK.exists(), f"Hook not found: {HASHES_HOOK}") - def test_merkletree_generates_tree_for_files(self): - """Merkletree hook should generate merkle tree for files in snapshot directory.""" + def test_hashes_generates_tree_for_files(self): + """Hashes hook should generate merkle tree for files in snapshot directory.""" with tempfile.TemporaryDirectory() as temp_dir: # Create a mock snapshot directory structure snapshot_dir = Path(temp_dir) / 'snapshot' snapshot_dir.mkdir() - # Create output directory for merkletree - output_dir = snapshot_dir / 'merkletree' + # Create output directory for hashes + output_dir = snapshot_dir / 'hashes' output_dir.mkdir() # Create some test files @@ -48,11 +48,11 @@ class TestMerkletreePlugin(TestCase): # Run the hook from the output directory env = os.environ.copy() - env['MERKLETREE_ENABLED'] = 'true' + env['HASHES_ENABLED'] = 'true' result = subprocess.run( [ - sys.executable, str(MERKLETREE_HOOK), + sys.executable, str(HASHES_HOOK), '--url=https://example.com', '--snapshot-id=test-snapshot', ], @@ -67,8 +67,8 @@ class TestMerkletreePlugin(TestCase): self.assertEqual(result.returncode, 0, f"Hook failed: {result.stderr}") # Check output file exists - output_file = output_dir / 'merkletree.json' - 
self.assertTrue(output_file.exists(), "merkletree.json not created") + output_file = output_dir / 'hashes.json' + self.assertTrue(output_file.exists(), "hashes.json not created") # Parse and verify output with open(output_file) as f: @@ -87,20 +87,20 @@ class TestMerkletreePlugin(TestCase): self.assertGreater(data['metadata']['file_count'], 0) self.assertGreater(data['metadata']['total_size'], 0) - def test_merkletree_skips_when_disabled(self): - """Merkletree hook should skip when MERKLETREE_ENABLED=false.""" + def test_hashes_skips_when_disabled(self): + """Hashes hook should skip when HASHES_ENABLED=false.""" with tempfile.TemporaryDirectory() as temp_dir: snapshot_dir = Path(temp_dir) / 'snapshot' snapshot_dir.mkdir() - output_dir = snapshot_dir / 'merkletree' + output_dir = snapshot_dir / 'hashes' output_dir.mkdir() env = os.environ.copy() - env['MERKLETREE_ENABLED'] = 'false' + env['HASHES_ENABLED'] = 'false' result = subprocess.run( [ - sys.executable, str(MERKLETREE_HOOK), + sys.executable, str(HASHES_HOOK), '--url=https://example.com', '--snapshot-id=test-snapshot', ], @@ -115,20 +115,20 @@ class TestMerkletreePlugin(TestCase): self.assertEqual(result.returncode, 0) self.assertIn('skipped', result.stdout) - def test_merkletree_handles_empty_directory(self): - """Merkletree hook should handle empty snapshot directory.""" + def test_hashes_handles_empty_directory(self): + """Hashes hook should handle empty snapshot directory.""" with tempfile.TemporaryDirectory() as temp_dir: snapshot_dir = Path(temp_dir) / 'snapshot' snapshot_dir.mkdir() - output_dir = snapshot_dir / 'merkletree' + output_dir = snapshot_dir / 'hashes' output_dir.mkdir() env = os.environ.copy() - env['MERKLETREE_ENABLED'] = 'true' + env['HASHES_ENABLED'] = 'true' result = subprocess.run( [ - sys.executable, str(MERKLETREE_HOOK), + sys.executable, str(HASHES_HOOK), '--url=https://example.com', '--snapshot-id=test-snapshot', ], @@ -143,7 +143,7 @@ class TestMerkletreePlugin(TestCase): 
self.assertEqual(result.returncode, 0, f"Hook failed: {result.stderr}") # Check output file exists - output_file = output_dir / 'merkletree.json' + output_file = output_dir / 'hashes.json' self.assertTrue(output_file.exists()) with open(output_file) as f: diff --git a/archivebox/plugins/headers/on_Snapshot__27_headers.bg.js b/archivebox/plugins/headers/on_Snapshot__27_headers.bg.js new file mode 100644 index 00000000..7ca72994 --- /dev/null +++ b/archivebox/plugins/headers/on_Snapshot__27_headers.bg.js @@ -0,0 +1,247 @@ +#!/usr/bin/env node +/** + * Capture original request + response headers for the main navigation. + * + * This hook sets up CDP listeners BEFORE chrome_navigate loads the page, + * then waits for navigation to complete. It records the first top-level + * request headers and the corresponding response headers (with :status). + * + * Usage: on_Snapshot__27_headers.bg.js --url= --snapshot-id= + * Output: Writes headers.json + */ + +const fs = require('fs'); +const path = require('path'); + +// Add NODE_MODULES_DIR to module resolution paths if set +if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR); + +const puppeteer = require('puppeteer-core'); + +// Import shared utilities from chrome_utils.js +const { + getEnvBool, + getEnvInt, + parseArgs, + connectToPage, + waitForPageLoaded, +} = require('../chrome/chrome_utils.js'); + +const PLUGIN_NAME = 'headers'; +const OUTPUT_DIR = '.'; +const OUTPUT_FILE = 'headers.json'; +const CHROME_SESSION_DIR = '../chrome'; +const CHROME_SESSION_REQUIRED_ERROR = 'No Chrome session found (chrome plugin must run first)'; + +let browser = null; +let page = null; +let client = null; +let shuttingDown = false; +let headersWritten = false; + +let requestId = null; +let requestUrl = null; +let requestHeaders = null; +let responseHeaders = null; +let responseStatus = null; +let responseStatusText = null; +let responseUrl = null; +let originalUrl = null; + +function getFinalUrl() { + const 
finalUrlFile = path.join(CHROME_SESSION_DIR, 'final_url.txt'); + if (fs.existsSync(finalUrlFile)) { + return fs.readFileSync(finalUrlFile, 'utf8').trim(); + } + return page ? page.url() : null; +} + +function writeHeadersFile() { + if (headersWritten) return; + if (!responseHeaders) return; + + const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE); + const responseHeadersWithStatus = { + ...(responseHeaders || {}), + }; + + if (responseStatus !== null && responseStatus !== undefined && + responseHeadersWithStatus[':status'] === undefined) { + responseHeadersWithStatus[':status'] = String(responseStatus); + } + + const record = { + url: requestUrl || originalUrl, + final_url: getFinalUrl(), + status: responseStatus !== undefined ? responseStatus : null, + request_headers: requestHeaders || {}, + response_headers: responseHeadersWithStatus, + headers: responseHeadersWithStatus, // backwards compatibility + }; + + if (responseStatusText) { + record.statusText = responseStatusText; + } + if (responseUrl) { + record.response_url = responseUrl; + } + + fs.writeFileSync(outputPath, JSON.stringify(record, null, 2)); + headersWritten = true; +} + +async function setupListener(url) { + const timeout = getEnvInt('HEADERS_TIMEOUT', getEnvInt('TIMEOUT', 30)) * 1000; + const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt'); + const targetIdFile = path.join(CHROME_SESSION_DIR, 'target_id.txt'); + const pidFile = path.join(CHROME_SESSION_DIR, 'chrome.pid'); + + if (!fs.existsSync(cdpFile) || !fs.existsSync(targetIdFile) || !fs.existsSync(pidFile)) { + throw new Error(CHROME_SESSION_REQUIRED_ERROR); + } + try { + const pid = parseInt(fs.readFileSync(pidFile, 'utf8').trim(), 10); + if (!pid || Number.isNaN(pid)) throw new Error('Invalid pid'); + process.kill(pid, 0); + } catch (e) { + throw new Error(CHROME_SESSION_REQUIRED_ERROR); + } + + const { browser, page } = await connectToPage({ + chromeSessionDir: CHROME_SESSION_DIR, + timeoutMs: timeout, + puppeteer, + }); + + client 
= await page.target().createCDPSession(); + await client.send('Network.enable'); + + client.on('Network.requestWillBeSent', (params) => { + try { + if (requestId && !responseHeaders && params.redirectResponse && params.requestId === requestId) { + responseHeaders = params.redirectResponse.headers || {}; + responseStatus = params.redirectResponse.status || null; + responseStatusText = params.redirectResponse.statusText || null; + responseUrl = params.redirectResponse.url || null; + writeHeadersFile(); + } + + if (requestId) return; + if (params.type && params.type !== 'Document') return; + if (!params.request || !params.request.url) return; + if (!params.request.url.startsWith('http')) return; + + requestId = params.requestId; + requestUrl = params.request.url; + requestHeaders = params.request.headers || {}; + } catch (e) { + // Ignore errors + } + }); + + client.on('Network.responseReceived', (params) => { + try { + if (!requestId || params.requestId !== requestId || responseHeaders) return; + const response = params.response || {}; + responseHeaders = response.headers || {}; + responseStatus = response.status || null; + responseStatusText = response.statusText || null; + responseUrl = response.url || null; + writeHeadersFile(); + } catch (e) { + // Ignore errors + } + }); + + return { browser, page }; +} + +function emitResult(status = 'succeeded', outputStr = OUTPUT_FILE) { + if (shuttingDown) return; + shuttingDown = true; + + console.log(JSON.stringify({ + type: 'ArchiveResult', + status, + output_str: outputStr, + })); +} + +async function handleShutdown(signal) { + console.error(`\nReceived ${signal}, emitting final results...`); + if (!headersWritten) { + writeHeadersFile(); + } + if (headersWritten) { + emitResult('succeeded', OUTPUT_FILE); + } else { + emitResult('failed', 'No headers captured'); + } + + if (browser) { + try { + browser.disconnect(); + } catch (e) {} + } + process.exit(headersWritten ? 
0 : 1); +} + +async function main() { + const args = parseArgs(); + const url = args.url; + const snapshotId = args.snapshot_id; + + if (!url || !snapshotId) { + console.error('Usage: on_Snapshot__27_headers.bg.js --url= --snapshot-id='); + process.exit(1); + } + + originalUrl = url; + + if (!getEnvBool('HEADERS_ENABLED', true)) { + console.error('Skipping (HEADERS_ENABLED=False)'); + console.log(JSON.stringify({type: 'ArchiveResult', status: 'skipped', output_str: 'HEADERS_ENABLED=False'})); + process.exit(0); + } + + try { + // Set up listeners BEFORE navigation + const connection = await setupListener(url); + browser = connection.browser; + page = connection.page; + + // Register signal handlers for graceful shutdown + process.on('SIGTERM', () => handleShutdown('SIGTERM')); + process.on('SIGINT', () => handleShutdown('SIGINT')); + + // Wait for chrome_navigate to complete (non-fatal) + try { + const timeout = getEnvInt('HEADERS_TIMEOUT', getEnvInt('TIMEOUT', 30)) * 1000; + await waitForPageLoaded(CHROME_SESSION_DIR, timeout * 4, 200); + } catch (e) { + console.error(`WARN: ${e.message}`); + } + + // Keep alive until SIGTERM + await new Promise(() => {}); + return; + + } catch (e) { + const errorMessage = (e && e.message) + ? `${e.name || 'Error'}: ${e.message}` + : String(e || 'Unknown error'); + console.error(`ERROR: ${errorMessage}`); + + console.log(JSON.stringify({ + type: 'ArchiveResult', + status: 'failed', + output_str: errorMessage, + })); + process.exit(1); + } +} + +main().catch(e => { + console.error(`Fatal error: ${e.message}`); + process.exit(1); +}); diff --git a/archivebox/plugins/headers/on_Snapshot__55_headers.js b/archivebox/plugins/headers/on_Snapshot__55_headers.js deleted file mode 100644 index 098b95e7..00000000 --- a/archivebox/plugins/headers/on_Snapshot__55_headers.js +++ /dev/null @@ -1,161 +0,0 @@ -#!/usr/bin/env node -/** - * Extract HTTP response headers for a URL. 
- * - * If a Chrome session exists (from chrome plugin), reads the captured - * response headers from chrome plugin/response_headers.json. - * Otherwise falls back to making an HTTP HEAD request. - * - * Usage: on_Snapshot__55_headers.js --url= --snapshot-id= - * Output: Writes headers/headers.json - * - * Environment variables: - * TIMEOUT: Timeout in seconds (default: 30) - * USER_AGENT: User agent string (optional) - * CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: true) - */ - -const fs = require('fs'); -const path = require('path'); -const https = require('https'); -const http = require('http'); -// Add NODE_MODULES_DIR to module resolution paths if set -if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR); - -const { - getEnv, - getEnvBool, - getEnvInt, - parseArgs, -} = require('../chrome/chrome_utils.js'); - -// Extractor metadata -const PLUGIN_NAME = 'headers'; -const OUTPUT_DIR = '.'; -const OUTPUT_FILE = 'headers.json'; -const CHROME_SESSION_DIR = '../chrome'; -const CHROME_HEADERS_FILE = 'response_headers.json'; - -// Get headers from chrome plugin if available -function getHeadersFromChromeSession() { - const headersFile = path.join(CHROME_SESSION_DIR, CHROME_HEADERS_FILE); - if (fs.existsSync(headersFile)) { - try { - const data = JSON.parse(fs.readFileSync(headersFile, 'utf8')); - return data; - } catch (e) { - return null; - } - } - return null; -} - -// Fetch headers via HTTP HEAD request (fallback) -function fetchHeaders(url) { - return new Promise((resolve, reject) => { - const timeout = getEnvInt('TIMEOUT', 30) * 1000; - const userAgent = getEnv('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)'); - const checkSsl = getEnvBool('CHECK_SSL_VALIDITY', getEnvBool('CHECK_SSL_VALIDITY', true)); - - const parsedUrl = new URL(url); - const client = parsedUrl.protocol === 'https:' ? 
https : http; - - const options = { - method: 'HEAD', - hostname: parsedUrl.hostname, - port: parsedUrl.port || (parsedUrl.protocol === 'https:' ? 443 : 80), - path: parsedUrl.pathname + parsedUrl.search, - headers: { 'User-Agent': userAgent }, - timeout, - rejectUnauthorized: checkSsl, - }; - - const req = client.request(options, (res) => { - resolve({ - url: url, - status: res.statusCode, - statusText: res.statusMessage, - headers: res.headers, - }); - }); - - req.on('error', reject); - req.on('timeout', () => { - req.destroy(); - reject(new Error('Request timeout')); - }); - - req.end(); - }); -} - -async function extractHeaders(url) { - // Output directory is current directory (hook already runs in output dir) - const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE); - - // Try Chrome session first - const chromeHeaders = getHeadersFromChromeSession(); - if (chromeHeaders && chromeHeaders.headers) { - fs.writeFileSync(outputPath, JSON.stringify(chromeHeaders, null, 2), 'utf8'); - return { success: true, output: outputPath, method: 'chrome', status: chromeHeaders.status }; - } - - // Fallback to HTTP HEAD request - try { - const headers = await fetchHeaders(url); - fs.writeFileSync(outputPath, JSON.stringify(headers, null, 2), 'utf8'); - return { success: true, output: outputPath, method: 'http', status: headers.status }; - } catch (e) { - return { success: false, error: e.message }; - } -} - -async function main() { - const args = parseArgs(); - const url = args.url; - const snapshotId = args.snapshot_id; - - if (!url || !snapshotId) { - console.error('Usage: on_Snapshot__55_headers.js --url= --snapshot-id='); - process.exit(1); - } - - const startTs = new Date(); - let status = 'failed'; - let output = null; - let error = ''; - - try { - const result = await extractHeaders(url); - - if (result.success) { - status = 'succeeded'; - output = result.output; - console.log(`Headers extracted (${result.method}): HTTP ${result.status}`); - } else { - status = 'failed'; 
- error = result.error; - } - } catch (e) { - error = `${e.name}: ${e.message}`; - status = 'failed'; - } - - const endTs = new Date(); - - if (error) console.error(`ERROR: ${error}`); - - // Output clean JSONL (no RESULT_JSON= prefix) - console.log(JSON.stringify({ - type: 'ArchiveResult', - status, - output_str: output || error || '', - })); - - process.exit(status === 'succeeded' ? 0 : 1); -} - -main().catch(e => { - console.error(`Fatal error: ${e.message}`); - process.exit(1); -}); diff --git a/archivebox/plugins/headers/tests/test_headers.py b/archivebox/plugins/headers/tests/test_headers.py index 0930737c..09ec86fb 100644 --- a/archivebox/plugins/headers/tests/test_headers.py +++ b/archivebox/plugins/headers/tests/test_headers.py @@ -7,23 +7,68 @@ Tests verify: 2. Node.js is available 3. Headers extraction works for real example.com 4. Output JSON contains actual HTTP headers -5. HTTP fallback works correctly -6. Config options work (TIMEOUT, USER_AGENT) +5. Config options work (TIMEOUT, USER_AGENT) """ import json import shutil import subprocess import tempfile +import time from pathlib import Path import pytest +from archivebox.plugins.chrome.tests.chrome_test_helpers import ( + CHROME_NAVIGATE_HOOK, + get_test_env, + chrome_session, +) PLUGIN_DIR = Path(__file__).parent.parent HEADERS_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_headers.*'), None) TEST_URL = 'https://example.com' +def normalize_root_url(url: str) -> str: + return url.rstrip('/') + +def run_headers_capture(headers_dir, snapshot_chrome_dir, env, url, snapshot_id): + hook_proc = subprocess.Popen( + ['node', str(HEADERS_HOOK), f'--url={url}', f'--snapshot-id={snapshot_id}'], + cwd=headers_dir, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + env=env, + ) + + nav_result = subprocess.run( + ['node', str(CHROME_NAVIGATE_HOOK), f'--url={url}', f'--snapshot-id={snapshot_id}'], + cwd=snapshot_chrome_dir, + capture_output=True, + text=True, + timeout=120, + env=env, + ) + + 
headers_file = headers_dir / 'headers.json' + for _ in range(60): + if headers_file.exists() and headers_file.stat().st_size > 0: + break + time.sleep(1) + + if hook_proc.poll() is None: + hook_proc.terminate() + try: + stdout, stderr = hook_proc.communicate(timeout=5) + except subprocess.TimeoutExpired: + hook_proc.kill() + stdout, stderr = hook_proc.communicate() + else: + stdout, stderr = hook_proc.communicate() + + return hook_proc.returncode, stdout, stderr, nav_result, headers_file + def test_hook_script_exists(): """Verify hook script exists.""" @@ -66,21 +111,25 @@ def test_extracts_headers_from_example_com(): with tempfile.TemporaryDirectory() as tmpdir: tmpdir = Path(tmpdir) - # Run headers extraction - result = subprocess.run( - ['node', str(HEADERS_HOOK), f'--url={TEST_URL}', '--snapshot-id=test789'], - cwd=tmpdir, - capture_output=True, - text=True, - timeout=60 - , - env=get_test_env()) + with chrome_session(tmpdir, test_url=TEST_URL, navigate=False) as (_process, _pid, snapshot_chrome_dir, env): + headers_dir = snapshot_chrome_dir.parent / 'headers' + headers_dir.mkdir(exist_ok=True) - assert result.returncode == 0, f"Extraction failed: {result.stderr}" + result = run_headers_capture( + headers_dir, + snapshot_chrome_dir, + env, + TEST_URL, + 'test789', + ) + + hook_code, stdout, stderr, nav_result, headers_file = result + assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}" + assert hook_code == 0, f"Extraction failed: {stderr}" # Parse clean JSONL output result_json = None - for line in result.stdout.strip().split('\n'): + for line in stdout.strip().split('\n'): line = line.strip() if line.startswith('{'): pass @@ -96,28 +145,36 @@ def test_extracts_headers_from_example_com(): assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}" # Verify output file exists (hook writes to current directory) - headers_file = tmpdir / 'headers.json' assert headers_file.exists(), "headers.json not created" # Verify 
headers JSON contains REAL example.com response headers_data = json.loads(headers_file.read_text()) assert 'url' in headers_data, "Should have url field" - assert headers_data['url'] == TEST_URL, f"URL should be {TEST_URL}" + assert normalize_root_url(headers_data['url']) == normalize_root_url(TEST_URL), f"URL should be {TEST_URL}" assert 'status' in headers_data, "Should have status field" assert headers_data['status'] in [200, 301, 302], \ f"Should have valid HTTP status, got {headers_data['status']}" + assert 'request_headers' in headers_data, "Should have request_headers field" + assert isinstance(headers_data['request_headers'], dict), "Request headers should be a dict" + + assert 'response_headers' in headers_data, "Should have response_headers field" + assert isinstance(headers_data['response_headers'], dict), "Response headers should be a dict" + assert len(headers_data['response_headers']) > 0, "Response headers dict should not be empty" + assert 'headers' in headers_data, "Should have headers field" assert isinstance(headers_data['headers'], dict), "Headers should be a dict" - assert len(headers_data['headers']) > 0, "Headers dict should not be empty" # Verify common HTTP headers are present - headers_lower = {k.lower(): v for k, v in headers_data['headers'].items()} + headers_lower = {k.lower(): v for k, v in headers_data['response_headers'].items()} assert 'content-type' in headers_lower or 'content-length' in headers_lower, \ "Should have at least one common HTTP header" + assert headers_data['response_headers'].get(':status') == str(headers_data['status']), \ + "Response headers should include :status pseudo header" + def test_headers_output_structure(): """Test that headers plugin produces correctly structured output.""" @@ -128,21 +185,25 @@ def test_headers_output_structure(): with tempfile.TemporaryDirectory() as tmpdir: tmpdir = Path(tmpdir) - # Run headers extraction against real example.com - result = subprocess.run( - ['node', 
str(HEADERS_HOOK), f'--url={TEST_URL}', '--snapshot-id=testformat'], - cwd=tmpdir, - capture_output=True, - text=True, - timeout=60 - , - env=get_test_env()) + with chrome_session(tmpdir, test_url=TEST_URL, navigate=False) as (_process, _pid, snapshot_chrome_dir, env): + headers_dir = snapshot_chrome_dir.parent / 'headers' + headers_dir.mkdir(exist_ok=True) - assert result.returncode == 0, f"Extraction failed: {result.stderr}" + result = run_headers_capture( + headers_dir, + snapshot_chrome_dir, + env, + TEST_URL, + 'testformat', + ) + + hook_code, stdout, stderr, nav_result, headers_file = result + assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}" + assert hook_code == 0, f"Extraction failed: {stderr}" # Parse clean JSONL output result_json = None - for line in result.stdout.strip().split('\n'): + for line in stdout.strip().split('\n'): line = line.strip() if line.startswith('{'): pass @@ -158,27 +219,30 @@ def test_headers_output_structure(): assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}" # Verify output structure - output_headers_file = tmpdir / 'headers.json' - assert output_headers_file.exists(), "Output headers.json not created" + assert headers_file.exists(), "Output headers.json not created" - output_data = json.loads(output_headers_file.read_text()) + output_data = json.loads(headers_file.read_text()) # Verify all required fields are present assert 'url' in output_data, "Output should have url field" assert 'status' in output_data, "Output should have status field" + assert 'request_headers' in output_data, "Output should have request_headers field" + assert 'response_headers' in output_data, "Output should have response_headers field" assert 'headers' in output_data, "Output should have headers field" # Verify data types assert isinstance(output_data['status'], int), "Status should be integer" + assert isinstance(output_data['request_headers'], dict), "Request headers should be dict" + assert 
isinstance(output_data['response_headers'], dict), "Response headers should be dict" assert isinstance(output_data['headers'], dict), "Headers should be dict" # Verify example.com returns expected headers - assert output_data['url'] == TEST_URL + assert normalize_root_url(output_data['url']) == normalize_root_url(TEST_URL) assert output_data['status'] in [200, 301, 302] -def test_falls_back_to_http_when_chrome_unavailable(): - """Test that headers plugin falls back to HTTP HEAD when chrome unavailable.""" +def test_fails_without_chrome_session(): + """Test that headers plugin fails when chrome session is missing.""" if not shutil.which('node'): pass @@ -186,8 +250,6 @@ def test_falls_back_to_http_when_chrome_unavailable(): with tempfile.TemporaryDirectory() as tmpdir: tmpdir = Path(tmpdir) - # Don't create chrome directory - force HTTP fallback - # Run headers extraction result = subprocess.run( ['node', str(HEADERS_HOOK), f'--url={TEST_URL}', '--snapshot-id=testhttp'], @@ -198,34 +260,8 @@ def test_falls_back_to_http_when_chrome_unavailable(): , env=get_test_env()) - assert result.returncode == 0, f"Extraction failed: {result.stderr}" - - # Parse clean JSONL output - result_json = None - for line in result.stdout.strip().split('\n'): - line = line.strip() - if line.startswith('{'): - pass - try: - record = json.loads(line) - if record.get('type') == 'ArchiveResult': - result_json = record - break - except json.JSONDecodeError: - pass - - assert result_json, "Should have ArchiveResult JSONL output" - assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}" - - # Verify output exists and has real HTTP headers - output_headers_file = tmpdir / 'headers.json' - assert output_headers_file.exists(), "Output headers.json not created" - - output_data = json.loads(output_headers_file.read_text()) - assert output_data['url'] == TEST_URL - assert output_data['status'] in [200, 301, 302] - assert isinstance(output_data['headers'], dict) - assert 
len(output_data['headers']) > 0 + assert result.returncode != 0, "Should fail without chrome session" + assert 'No Chrome session found (chrome plugin must run first)' in (result.stdout + result.stderr) def test_config_timeout_honored(): @@ -239,20 +275,26 @@ def test_config_timeout_honored(): # Set very short timeout (but example.com should still succeed) import os - env = os.environ.copy() - env['TIMEOUT'] = '5' + env_override = os.environ.copy() + env_override['TIMEOUT'] = '5' - result = subprocess.run( - ['node', str(HEADERS_HOOK), f'--url={TEST_URL}', '--snapshot-id=testtimeout'], - cwd=tmpdir, - capture_output=True, - text=True, - env=env, - timeout=30 - ) + with chrome_session(tmpdir, test_url=TEST_URL, navigate=False) as (_process, _pid, snapshot_chrome_dir, env): + headers_dir = snapshot_chrome_dir.parent / 'headers' + headers_dir.mkdir(exist_ok=True) + env.update(env_override) + + result = run_headers_capture( + headers_dir, + snapshot_chrome_dir, + env, + TEST_URL, + 'testtimeout', + ) # Should complete (success or fail, but not hang) - assert result.returncode in (0, 1), "Should complete without hanging" + hook_code, _stdout, _stderr, nav_result, _headers_file = result + assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}" + assert hook_code in (0, 1), "Should complete without hanging" def test_config_user_agent(): @@ -266,23 +308,29 @@ def test_config_user_agent(): # Set custom user agent import os - env = os.environ.copy() - env['USER_AGENT'] = 'TestBot/1.0' + env_override = os.environ.copy() + env_override['USER_AGENT'] = 'TestBot/1.0' - result = subprocess.run( - ['node', str(HEADERS_HOOK), f'--url={TEST_URL}', '--snapshot-id=testua'], - cwd=tmpdir, - capture_output=True, - text=True, - env=env, - timeout=60 - ) + with chrome_session(tmpdir, test_url=TEST_URL, navigate=False) as (_process, _pid, snapshot_chrome_dir, env): + headers_dir = snapshot_chrome_dir.parent / 'headers' + headers_dir.mkdir(exist_ok=True) + 
env.update(env_override) + + result = run_headers_capture( + headers_dir, + snapshot_chrome_dir, + env, + TEST_URL, + 'testua', + ) # Should succeed (example.com doesn't block) - if result.returncode == 0: + hook_code, stdout, _stderr, nav_result, _headers_file = result + assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}" + if hook_code == 0: # Parse clean JSONL output result_json = None - for line in result.stdout.strip().split('\n'): + for line in stdout.strip().split('\n'): line = line.strip() if line.startswith('{'): pass @@ -307,20 +355,23 @@ def test_handles_https_urls(): with tempfile.TemporaryDirectory() as tmpdir: tmpdir = Path(tmpdir) - result = subprocess.run( - ['node', str(HEADERS_HOOK), '--url=https://example.org', '--snapshot-id=testhttps'], - cwd=tmpdir, - capture_output=True, - text=True, - timeout=60 - , - env=get_test_env()) + with chrome_session(tmpdir, test_url='https://example.org', navigate=False) as (_process, _pid, snapshot_chrome_dir, env): + headers_dir = snapshot_chrome_dir.parent / 'headers' + headers_dir.mkdir(exist_ok=True) + result = run_headers_capture( + headers_dir, + snapshot_chrome_dir, + env, + 'https://example.org', + 'testhttps', + ) - if result.returncode == 0: - output_headers_file = tmpdir / 'headers.json' - if output_headers_file.exists(): - output_data = json.loads(output_headers_file.read_text()) - assert output_data['url'] == 'https://example.org' + hook_code, _stdout, _stderr, nav_result, headers_file = result + assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}" + if hook_code == 0: + if headers_file.exists(): + output_data = json.loads(headers_file.read_text()) + assert normalize_root_url(output_data['url']) == normalize_root_url('https://example.org') assert output_data['status'] in [200, 301, 302] @@ -333,21 +384,24 @@ def test_handles_404_gracefully(): with tempfile.TemporaryDirectory() as tmpdir: tmpdir = Path(tmpdir) - result = subprocess.run( - ['node', 
str(HEADERS_HOOK), '--url=https://example.com/nonexistent-page-404', '--snapshot-id=test404'], - cwd=tmpdir, - capture_output=True, - text=True, - timeout=60 - , - env=get_test_env()) + with chrome_session(tmpdir, test_url='https://example.com/nonexistent-page-404', navigate=False) as (_process, _pid, snapshot_chrome_dir, env): + headers_dir = snapshot_chrome_dir.parent / 'headers' + headers_dir.mkdir(exist_ok=True) + result = run_headers_capture( + headers_dir, + snapshot_chrome_dir, + env, + 'https://example.com/nonexistent-page-404', + 'test404', + ) # May succeed or fail depending on server behavior # If it succeeds, verify 404 status is captured - if result.returncode == 0: - output_headers_file = tmpdir / 'headers.json' - if output_headers_file.exists(): - output_data = json.loads(output_headers_file.read_text()) + hook_code, _stdout, _stderr, nav_result, headers_file = result + assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}" + if hook_code == 0: + if headers_file.exists(): + output_data = json.loads(headers_file.read_text()) assert output_data['status'] == 404, "Should capture 404 status" diff --git a/archivebox/plugins/infiniscroll/on_Snapshot__45_infiniscroll.js b/archivebox/plugins/infiniscroll/on_Snapshot__45_infiniscroll.js index 3003d370..8275d61c 100755 --- a/archivebox/plugins/infiniscroll/on_Snapshot__45_infiniscroll.js +++ b/archivebox/plugins/infiniscroll/on_Snapshot__45_infiniscroll.js @@ -42,6 +42,7 @@ const puppeteer = require('puppeteer-core'); const PLUGIN_NAME = 'infiniscroll'; const CHROME_SESSION_DIR = '../chrome'; +const CHROME_SESSION_REQUIRED_ERROR = 'No Chrome session found (chrome plugin must run first)'; function parseArgs() { const args = {}; @@ -330,7 +331,7 @@ async function main() { const cdpUrl = getCdpUrl(); if (!cdpUrl) { - console.error('ERROR: Chrome CDP URL not found (chrome plugin must run first)'); + console.error(CHROME_SESSION_REQUIRED_ERROR); process.exit(1); } @@ -363,10 +364,6 @@ async 
function main() { page = pages[pages.length - 1]; } - // Set viewport to ensure proper page rendering - const resolution = getEnv('CHROME_RESOLUTION', '1440,2000').split(',').map(x => parseInt(x.trim(), 10)); - await page.setViewport({ width: resolution[0] || 1440, height: resolution[1] || 2000 }); - console.error(`Starting infinite scroll on ${url}`); // Expand
and comments before scrolling (if enabled) diff --git a/archivebox/plugins/infiniscroll/tests/test_infiniscroll.py b/archivebox/plugins/infiniscroll/tests/test_infiniscroll.py index 1248518a..a2c1cb58 100644 --- a/archivebox/plugins/infiniscroll/tests/test_infiniscroll.py +++ b/archivebox/plugins/infiniscroll/tests/test_infiniscroll.py @@ -79,10 +79,12 @@ def test_fails_gracefully_without_chrome_session(): """Test that hook fails gracefully when no chrome session exists.""" with tempfile.TemporaryDirectory() as tmpdir: tmpdir = Path(tmpdir) + infiniscroll_dir = tmpdir / 'snapshot' / 'infiniscroll' + infiniscroll_dir.mkdir(parents=True, exist_ok=True) result = subprocess.run( ['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=test-no-chrome'], - cwd=tmpdir, + cwd=infiniscroll_dir, capture_output=True, text=True, env=get_test_env(), diff --git a/archivebox/plugins/istilldontcareaboutcookies/tests/test_istilldontcareaboutcookies.py b/archivebox/plugins/istilldontcareaboutcookies/tests/test_istilldontcareaboutcookies.py index 7fdc1c4a..1371b5c7 100644 --- a/archivebox/plugins/istilldontcareaboutcookies/tests/test_istilldontcareaboutcookies.py +++ b/archivebox/plugins/istilldontcareaboutcookies/tests/test_istilldontcareaboutcookies.py @@ -16,6 +16,7 @@ import pytest from archivebox.plugins.chrome.tests.chrome_test_helpers import ( setup_test_env, + get_test_env, launch_chromium_session, kill_chromium_session, CHROME_LAUNCH_HOOK, @@ -291,8 +292,7 @@ const puppeteer = require('puppeteer-core'); result = subprocess.run( ['node', str(script_path)], - cwd=str(tmpdir, - env=get_test_env()), + cwd=str(tmpdir), capture_output=True, text=True, env=env, @@ -444,8 +444,7 @@ const puppeteer = require('puppeteer-core'); result = subprocess.run( ['node', str(script_path)], - cwd=str(script_dir, - env=get_test_env()), + cwd=str(script_dir), capture_output=True, text=True, env=env, @@ -539,7 +538,7 @@ def test_hides_cookie_consent_on_filmin(): print(f"HTML has cookie 
keywords: {baseline_result.get('has_cookie_keyword_in_html')}") print(f"HTML snippet: {baseline_result.get('html_snippet', '')[:200]}") - pytest.skip( + pytest.fail( f"Cannot test extension: no cookie consent visible in baseline on {TEST_URL}. " f"Elements found: {len(baseline_result['elements_found'])}. " f"The site may have changed or cookie consent may be region-specific." @@ -559,8 +558,7 @@ def test_hides_cookie_consent_on_filmin(): result = subprocess.run( ['node', str(INSTALL_SCRIPT)], - cwd=str(tmpdir, - env=get_test_env()), + cwd=str(tmpdir), capture_output=True, text=True, env=env_with_ext, diff --git a/archivebox/plugins/mercury/on_Snapshot__57_mercury.py b/archivebox/plugins/mercury/on_Snapshot__57_mercury.py index b131c14c..1af0bdb6 100644 --- a/archivebox/plugins/mercury/on_Snapshot__57_mercury.py +++ b/archivebox/plugins/mercury/on_Snapshot__57_mercury.py @@ -15,11 +15,13 @@ Environment variables: Note: Requires postlight-parser: npm install -g @postlight/parser """ +import html import json import os import subprocess import sys from pathlib import Path +from urllib.parse import urlparse import rich_click as click @@ -115,13 +117,39 @@ def extract_mercury(url: str, binary: str) -> tuple[bool, str | None, str]: # Save HTML content and metadata html_content = html_json.pop('content', '') + # Some sources return HTML-escaped markup inside the content blob. + # If it looks heavily escaped, unescape once so it renders properly. 
+ if html_content: + escaped_count = html_content.count('<') + html_content.count('>') + tag_count = html_content.count('<') + if escaped_count and escaped_count > tag_count * 2: + html_content = html.unescape(html_content) (output_dir / 'content.html').write_text(html_content, encoding='utf-8') # Save article metadata metadata = {k: v for k, v in text_json.items() if k != 'content'} (output_dir / 'article.json').write_text(json.dumps(metadata, indent=2), encoding='utf-8') - return True, OUTPUT_DIR, '' + # Link images/ to responses capture (if available) + try: + hostname = urlparse(url).hostname or '' + if hostname: + responses_images = (output_dir / '..' / 'responses' / 'image' / hostname / 'images').resolve() + link_path = output_dir / 'images' + if responses_images.exists() and responses_images.is_dir(): + if link_path.exists() or link_path.is_symlink(): + if link_path.is_symlink() or link_path.is_file(): + link_path.unlink() + else: + # Don't remove real directories + responses_images = None + if responses_images: + rel_target = os.path.relpath(str(responses_images), str(output_dir)) + link_path.symlink_to(rel_target) + except Exception: + pass + + return True, 'content.html', '' except subprocess.TimeoutExpired: return False, None, f'Timed out after {timeout} seconds' diff --git a/archivebox/plugins/merkletree/templates/icon.html b/archivebox/plugins/merkletree/templates/icon.html deleted file mode 100644 index b8d3579c..00000000 --- a/archivebox/plugins/merkletree/templates/icon.html +++ /dev/null @@ -1 +0,0 @@ - diff --git a/archivebox/plugins/modalcloser/on_Snapshot__15_modalcloser.bg.js b/archivebox/plugins/modalcloser/on_Snapshot__15_modalcloser.bg.js index 38b2a604..7f9e664b 100644 --- a/archivebox/plugins/modalcloser/on_Snapshot__15_modalcloser.bg.js +++ b/archivebox/plugins/modalcloser/on_Snapshot__15_modalcloser.bg.js @@ -237,7 +237,7 @@ async function main() { const cdpUrl = readCdpUrl(CHROME_SESSION_DIR); if (!cdpUrl) { - console.error('ERROR: 
Chrome CDP URL not found (chrome plugin must run first)'); + console.error('No Chrome session found (chrome plugin must run first)'); process.exit(1); } diff --git a/archivebox/plugins/modalcloser/tests/test_modalcloser.py b/archivebox/plugins/modalcloser/tests/test_modalcloser.py index b66d20d2..53c62479 100644 --- a/archivebox/plugins/modalcloser/tests/test_modalcloser.py +++ b/archivebox/plugins/modalcloser/tests/test_modalcloser.py @@ -81,10 +81,12 @@ def test_fails_gracefully_without_chrome_session(): """Test that hook fails gracefully when no chrome session exists.""" with tempfile.TemporaryDirectory() as tmpdir: tmpdir = Path(tmpdir) + modalcloser_dir = tmpdir / 'snapshot' / 'modalcloser' + modalcloser_dir.mkdir(parents=True, exist_ok=True) result = subprocess.run( ['node', str(MODALCLOSER_HOOK), f'--url={TEST_URL}', '--snapshot-id=test-no-chrome'], - cwd=tmpdir, + cwd=modalcloser_dir, capture_output=True, text=True, env=get_test_env(), diff --git a/archivebox/plugins/npm/tests/test_npm_provider.py b/archivebox/plugins/npm/tests/test_npm_provider.py index 5492738a..9f00d9d7 100644 --- a/archivebox/plugins/npm/tests/test_npm_provider.py +++ b/archivebox/plugins/npm/tests/test_npm_provider.py @@ -91,9 +91,9 @@ class TestNpmProviderHook(TestCase): self.assertIn('npm provider not allowed', result.stderr) self.assertEqual(result.returncode, 0) - @pytest.mark.skipif(not npm_available(), reason="npm not installed") def test_hook_creates_npm_prefix(self): """Hook should create npm prefix directory.""" + assert npm_available(), "npm not installed" env = os.environ.copy() env['LIB_DIR'] = str(self.lib_dir) diff --git a/archivebox/plugins/parse_dom_outlinks/on_Snapshot__75_parse_dom_outlinks.js b/archivebox/plugins/parse_dom_outlinks/on_Snapshot__75_parse_dom_outlinks.js index e900d9b5..3076fe61 100755 --- a/archivebox/plugins/parse_dom_outlinks/on_Snapshot__75_parse_dom_outlinks.js +++ b/archivebox/plugins/parse_dom_outlinks/on_Snapshot__75_parse_dom_outlinks.js @@ 
-81,7 +81,7 @@ function getCdpUrl() { } // Extract outlinks -async function extractOutlinks(url) { +async function extractOutlinks(url, snapshotId, crawlId, depth) { // Output directory is current directory (hook already runs in output dir) const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE); @@ -253,7 +253,7 @@ async function main() { } } - const result = await extractOutlinks(url); + const result = await extractOutlinks(url, snapshotId, crawlId, depth); if (result.success) { status = 'succeeded'; diff --git a/archivebox/plugins/parse_dom_outlinks/tests/test_parse_dom_outlinks.py b/archivebox/plugins/parse_dom_outlinks/tests/test_parse_dom_outlinks.py index cf6df8ed..6f45eb4b 100644 --- a/archivebox/plugins/parse_dom_outlinks/tests/test_parse_dom_outlinks.py +++ b/archivebox/plugins/parse_dom_outlinks/tests/test_parse_dom_outlinks.py @@ -47,7 +47,6 @@ class TestParseDomOutlinksPlugin(TestCase): self.assertTrue(OUTLINKS_HOOK.exists(), f"Hook not found: {OUTLINKS_HOOK}") -@pytest.mark.skipif(not chrome_available(), reason="Chrome not installed") class TestParseDomOutlinksWithChrome(TestCase): """Integration tests for parse_dom_outlinks plugin with Chrome.""" @@ -112,9 +111,7 @@ class TestParseDomOutlinksWithChrome(TestCase): # example.com has at least one link (to iana.org) self.assertIsInstance(outlinks_data['hrefs'], list) - except RuntimeError as e: - if 'Chrome' in str(e) or 'CDP' in str(e): - self.skipTest(f"Chrome session setup failed: {e}") + except RuntimeError: raise diff --git a/archivebox/plugins/pdf/on_Snapshot__52_pdf.js b/archivebox/plugins/pdf/on_Snapshot__52_pdf.js index 05648a81..d46a3779 100644 --- a/archivebox/plugins/pdf/on_Snapshot__52_pdf.js +++ b/archivebox/plugins/pdf/on_Snapshot__52_pdf.js @@ -2,19 +2,12 @@ /** * Print a URL to PDF using Chrome/Puppeteer. * - * If a Chrome session exists (from chrome plugin), connects to it via CDP. - * Otherwise launches a new Chrome instance. 
+ * Requires a Chrome session (from chrome plugin) and connects to it via CDP. * * Usage: on_Snapshot__52_pdf.js --url= --snapshot-id= * Output: Writes pdf/output.pdf * * Environment variables: - * CHROME_BINARY: Path to Chrome/Chromium binary - * CHROME_TIMEOUT: Timeout in seconds (default: 60) - * CHROME_RESOLUTION: Page resolution (default: 1440,2000) - * CHROME_USER_AGENT: User agent string (optional) - * CHROME_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: true) - * CHROME_HEADLESS: Run in headless mode (default: true) * PDF_ENABLED: Enable PDF generation (default: true) */ @@ -24,11 +17,7 @@ const path = require('path'); if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR); const { - findChromium, - getEnv, getEnvBool, - getEnvInt, - parseResolution, parseArgs, readCdpUrl, } = require('../chrome/chrome_utils.js'); @@ -86,81 +75,30 @@ async function waitForChromeTabLoaded(timeoutMs = 60000) { } async function printToPdf(url) { - const timeout = (getEnvInt('CHROME_TIMEOUT') || getEnvInt('TIMEOUT', 60)) * 1000; - const resolution = getEnv('CHROME_RESOLUTION') || getEnv('RESOLUTION', '1440,2000'); - const userAgent = getEnv('CHROME_USER_AGENT') || getEnv('USER_AGENT', ''); - const checkSsl = getEnvBool('CHROME_CHECK_SSL_VALIDITY', getEnvBool('CHECK_SSL_VALIDITY', true)); - const headless = getEnvBool('CHROME_HEADLESS', true); - - const { width, height } = parseResolution(resolution); - // Output directory is current directory (hook already runs in output dir) const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE); let browser = null; let page = null; - let connectedToSession = false; try { - // Try to connect to existing Chrome session + // Connect to existing Chrome session (required) const cdpUrl = readCdpUrl(CHROME_SESSION_DIR); - if (cdpUrl) { - try { - browser = await puppeteer.connect({ - browserWSEndpoint: cdpUrl, - defaultViewport: { width, height }, - }); - connectedToSession = true; - - // Get existing 
pages or create new one - const pages = await browser.pages(); - page = pages.find(p => p.url().startsWith('http')) || pages[0]; - - if (!page) { - page = await browser.newPage(); - } - - // Set viewport on the page - await page.setViewport({ width, height }); - - } catch (e) { - console.error(`Failed to connect to CDP session: ${e.message}`); - browser = null; - } + if (!cdpUrl) { + return { success: false, error: 'No Chrome session found (chrome plugin must run first)' }; } - // Fall back to launching new browser - if (!browser) { - const executablePath = findChromium(); - if (!executablePath) { - return { success: false, error: 'Chrome binary not found' }; - } + browser = await puppeteer.connect({ + browserWSEndpoint: cdpUrl, + defaultViewport: null, + }); - browser = await puppeteer.launch({ - executablePath, - headless: headless ? 'new' : false, - args: [ - '--no-sandbox', - '--disable-setuid-sandbox', - '--disable-dev-shm-usage', - '--disable-gpu', - `--window-size=${width},${height}`, - ...(checkSsl ? 
[] : ['--ignore-certificate-errors']), - ], - defaultViewport: { width, height }, - }); + // Get existing pages or create new one + const pages = await browser.pages(); + page = pages.find(p => p.url().startsWith('http')) || pages[0]; + if (!page) { page = await browser.newPage(); - - // Navigate to URL (only if we launched fresh browser) - if (userAgent) { - await page.setUserAgent(userAgent); - } - - await page.goto(url, { - waitUntil: 'networkidle2', - timeout, - }); } // Print to PDF @@ -185,9 +123,8 @@ async function printToPdf(url) { } catch (e) { return { success: false, error: `${e.name}: ${e.message}` }; } finally { - // Only close browser if we launched it (not if we connected to session) - if (browser && !connectedToSession) { - await browser.close(); + if (browser) { + browser.disconnect(); } } } @@ -215,14 +152,15 @@ async function main() { process.exit(0); } - // Only wait for page load if using shared Chrome session const cdpUrl = readCdpUrl(CHROME_SESSION_DIR); - if (cdpUrl) { - // Wait for page to be fully loaded - const pageLoaded = await waitForChromeTabLoaded(60000); - if (!pageLoaded) { - throw new Error('Page not loaded after 60s (chrome_navigate must complete first)'); - } + if (!cdpUrl) { + throw new Error('No Chrome session found (chrome plugin must run first)'); + } + + // Wait for page to be fully loaded + const pageLoaded = await waitForChromeTabLoaded(60000); + if (!pageLoaded) { + throw new Error('Page not loaded after 60s (chrome_navigate must complete first)'); } const result = await printToPdf(url); diff --git a/archivebox/plugins/pdf/tests/test_pdf.py b/archivebox/plugins/pdf/tests/test_pdf.py index 8751faef..f9388129 100644 --- a/archivebox/plugins/pdf/tests/test_pdf.py +++ b/archivebox/plugins/pdf/tests/test_pdf.py @@ -29,6 +29,7 @@ from archivebox.plugins.chrome.tests.chrome_test_helpers import ( LIB_DIR, NODE_MODULES_DIR, PLUGINS_ROOT, + chrome_session, ) @@ -62,15 +63,19 @@ def test_extracts_pdf_from_example_com(): with 
tempfile.TemporaryDirectory() as tmpdir: tmpdir = Path(tmpdir) - # Run PDF extraction hook - result = subprocess.run( - ['node', str(PDF_HOOK), f'--url={TEST_URL}', '--snapshot-id=test789'], - cwd=tmpdir, - capture_output=True, - text=True, - timeout=120 - , - env=get_test_env()) + with chrome_session(tmpdir, test_url=TEST_URL) as (_process, _pid, snapshot_chrome_dir, env): + pdf_dir = snapshot_chrome_dir.parent / 'pdf' + pdf_dir.mkdir(exist_ok=True) + + # Run PDF extraction hook + result = subprocess.run( + ['node', str(PDF_HOOK), f'--url={TEST_URL}', '--snapshot-id=test789'], + cwd=pdf_dir, + capture_output=True, + text=True, + timeout=120, + env=env + ) # Parse clean JSONL output (hook might fail due to network issues) result_json = None @@ -98,7 +103,7 @@ def test_extracts_pdf_from_example_com(): assert result.returncode == 0, f"Should exit 0 on success: {result.stderr}" # Verify filesystem output (hook writes to current directory) - pdf_file = tmpdir / 'output.pdf' + pdf_file = pdf_dir / 'output.pdf' assert pdf_file.exists(), "output.pdf not created" # Verify file is valid PDF @@ -117,7 +122,7 @@ def test_config_save_pdf_false_skips(): with tempfile.TemporaryDirectory() as tmpdir: tmpdir = Path(tmpdir) - env = os.environ.copy() + env = get_test_env() env['PDF_ENABLED'] = 'False' result = subprocess.run( @@ -140,50 +145,46 @@ def test_config_save_pdf_false_skips(): def test_reports_missing_chrome(): - """Test that script reports error when Chrome is not found.""" + """Test that script reports error when Chrome session is missing.""" import os with tempfile.TemporaryDirectory() as tmpdir: tmpdir = Path(tmpdir) - - # Set CHROME_BINARY to nonexistent path - env = os.environ.copy() - env['CHROME_BINARY'] = '/nonexistent/chrome' + env = get_test_env() + pdf_dir = tmpdir / 'snapshot' / 'pdf' + pdf_dir.mkdir(parents=True, exist_ok=True) result = subprocess.run( ['node', str(PDF_HOOK), f'--url={TEST_URL}', '--snapshot-id=test123'], - cwd=tmpdir, + cwd=pdf_dir, 
capture_output=True, text=True, env=env, timeout=30 ) - # Should fail and report missing Chrome - if result.returncode != 0: - combined = result.stdout + result.stderr - assert 'chrome' in combined.lower() or 'browser' in combined.lower() or 'ERROR=' in combined + assert result.returncode != 0, "Should fail without shared Chrome session" + combined = result.stdout + result.stderr + assert 'chrome session' in combined.lower() or 'chrome plugin' in combined.lower() -def test_config_timeout_honored(): - """Test that CHROME_TIMEOUT config is respected.""" - import os - +def test_runs_with_shared_chrome_session(): + """Test that PDF hook completes when shared Chrome session is available.""" with tempfile.TemporaryDirectory() as tmpdir: tmpdir = Path(tmpdir) - # Set very short timeout - env = os.environ.copy() - env['CHROME_TIMEOUT'] = '5' + with chrome_session(tmpdir, test_url=TEST_URL) as (_process, _pid, snapshot_chrome_dir, env): + pdf_dir = snapshot_chrome_dir.parent / 'pdf' + pdf_dir.mkdir(exist_ok=True) - result = subprocess.run( - ['node', str(PDF_HOOK), f'--url={TEST_URL}', '--snapshot-id=testtimeout'], - cwd=tmpdir, - capture_output=True, - text=True, - env=env, - timeout=30 - ) + result = subprocess.run( + ['node', str(PDF_HOOK), f'--url={TEST_URL}', '--snapshot-id=testtimeout'], + cwd=pdf_dir, + capture_output=True, + text=True, + env=env, + timeout=30 + ) # Should complete (success or fail, but not hang) assert result.returncode in (0, 1), "Should complete without hanging" diff --git a/archivebox/plugins/pip/tests/test_pip_provider.py b/archivebox/plugins/pip/tests/test_pip_provider.py index 4a4fe610..d24c7e64 100644 --- a/archivebox/plugins/pip/tests/test_pip_provider.py +++ b/archivebox/plugins/pip/tests/test_pip_provider.py @@ -142,13 +142,14 @@ class TestPipProviderIntegration(TestCase): import shutil shutil.rmtree(self.temp_dir, ignore_errors=True) - @pytest.mark.skipif( - subprocess.run([sys.executable, '-m', 'pip', '--version'], - 
capture_output=True).returncode != 0, - reason="pip not available" - ) def test_hook_finds_pip_installed_binary(self): """Hook should find binaries installed via pip.""" + pip_check = subprocess.run( + [sys.executable, '-m', 'pip', '--version'], + capture_output=True, + text=True, + ) + assert pip_check.returncode == 0, "pip not available" env = os.environ.copy() env['DATA_DIR'] = self.temp_dir diff --git a/archivebox/plugins/puppeteer/tests/test_puppeteer.py b/archivebox/plugins/puppeteer/tests/test_puppeteer.py index 5d230a7d..a35db7a1 100644 --- a/archivebox/plugins/puppeteer/tests/test_puppeteer.py +++ b/archivebox/plugins/puppeteer/tests/test_puppeteer.py @@ -46,8 +46,8 @@ def test_crawl_hook_emits_puppeteer_binary(): assert 'npm' in binaries[0].get('binproviders', ''), "puppeteer should be installable via npm provider" -@pytest.mark.skipif(shutil.which('npm') is None, reason='npm is required for puppeteer installation') def test_puppeteer_installs_chromium(): + assert shutil.which('npm'), "npm is required for puppeteer installation" with tempfile.TemporaryDirectory() as tmpdir: tmpdir = Path(tmpdir) lib_dir = tmpdir / 'lib' / 'arm64-darwin' diff --git a/archivebox/plugins/readability/on_Snapshot__56_readability.py b/archivebox/plugins/readability/on_Snapshot__56_readability.py index e02e24e6..2c083fb6 100644 --- a/archivebox/plugins/readability/on_Snapshot__56_readability.py +++ b/archivebox/plugins/readability/on_Snapshot__56_readability.py @@ -22,6 +22,7 @@ import subprocess import sys import tempfile from pathlib import Path +from urllib.parse import urlparse import rich_click as click @@ -135,6 +136,24 @@ def extract_readability(url: str, binary: str) -> tuple[bool, str | None, str]: (output_dir / 'content.txt').write_text(text_content, encoding='utf-8') (output_dir / 'article.json').write_text(json.dumps(result_json, indent=2), encoding='utf-8') + # Link images/ to responses capture (if available) + try: + hostname = urlparse(url).hostname or '' + if 
hostname: + responses_images = (output_dir / '..' / 'responses' / 'image' / hostname / 'images').resolve() + link_path = output_dir / 'images' + if responses_images.exists() and responses_images.is_dir(): + if link_path.exists() or link_path.is_symlink(): + if link_path.is_symlink() or link_path.is_file(): + link_path.unlink() + else: + responses_images = None + if responses_images: + rel_target = os.path.relpath(str(responses_images), str(output_dir)) + link_path.symlink_to(rel_target) + except Exception: + pass + return True, OUTPUT_FILE, '' except subprocess.TimeoutExpired: diff --git a/archivebox/plugins/redirects/on_Snapshot__25_redirects.bg.js b/archivebox/plugins/redirects/on_Snapshot__25_redirects.bg.js index 66aac407..96defe1b 100755 --- a/archivebox/plugins/redirects/on_Snapshot__25_redirects.bg.js +++ b/archivebox/plugins/redirects/on_Snapshot__25_redirects.bg.js @@ -38,6 +38,7 @@ let originalUrl = ''; let finalUrl = ''; let page = null; let browser = null; +let initialRecorded = false; async function setupRedirectListener() { const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE); @@ -62,6 +63,20 @@ async function setupRedirectListener() { client.on('Network.requestWillBeSent', (params) => { const { requestId, request, redirectResponse } = params; + if (!initialRecorded && request.url && request.url.startsWith('http')) { + const initialEntry = { + timestamp: new Date().toISOString(), + from_url: null, + to_url: request.url, + status: null, + type: 'initial', + request_id: requestId, + }; + redirectChain.push(initialEntry); + fs.appendFileSync(outputPath, JSON.stringify(initialEntry) + '\n'); + initialRecorded = true; + } + if (redirectResponse) { // This is a redirect const redirectEntry = { diff --git a/archivebox/plugins/redirects/tests/test_redirects.py b/archivebox/plugins/redirects/tests/test_redirects.py index 452c5dd6..c26ac273 100644 --- a/archivebox/plugins/redirects/tests/test_redirects.py +++ 
b/archivebox/plugins/redirects/tests/test_redirects.py @@ -48,7 +48,6 @@ class TestRedirectsPlugin(TestCase): self.assertTrue(REDIRECTS_HOOK.exists(), f"Hook not found: {REDIRECTS_HOOK}") -@pytest.mark.skipif(not chrome_available(), reason="Chrome not installed") class TestRedirectsWithChrome(TestCase): """Integration tests for redirects plugin with Chrome.""" @@ -142,9 +141,7 @@ class TestRedirectsWithChrome(TestCase): self.assertNotIn('Traceback', stderr) self.assertNotIn('Error:', stderr) - except RuntimeError as e: - if 'Chrome' in str(e) or 'CDP' in str(e): - self.skipTest(f"Chrome session setup failed: {e}") + except RuntimeError: raise diff --git a/archivebox/plugins/responses/on_Snapshot__24_responses.bg.js b/archivebox/plugins/responses/on_Snapshot__24_responses.bg.js index c7dd6491..7f4587c1 100755 --- a/archivebox/plugins/responses/on_Snapshot__24_responses.bg.js +++ b/archivebox/plugins/responses/on_Snapshot__24_responses.bg.js @@ -39,7 +39,7 @@ let responseCount = 0; let shuttingDown = false; // Resource types to capture (by default, capture everything) -const DEFAULT_TYPES = ['script', 'stylesheet', 'font', 'image', 'media', 'xhr', 'websocket']; +const DEFAULT_TYPES = ['document', 'script', 'stylesheet', 'font', 'image', 'media', 'xhr', 'websocket']; function getExtensionFromMimeType(mimeType) { const mimeMap = { @@ -176,11 +176,17 @@ async function setupListener() { const hostname = urlObj.hostname; const pathname = urlObj.pathname || '/'; const filename = path.basename(pathname) || 'index' + (extension ? '.' + extension : ''); - const dirPath = path.dirname(pathname); + const dirPathRaw = path.dirname(pathname); + const dirPath = dirPathRaw === '.' ? 
'' : dirPathRaw.replace(/^\/+/, ''); const symlinkDir = path.join(OUTPUT_DIR, resourceType, hostname, dirPath); const symlinkPath = path.join(symlinkDir, filename); await createSymlink(uniquePath, symlinkPath); + + // Also create a site-style symlink without resource type for easy browsing + const siteDir = path.join(OUTPUT_DIR, hostname, dirPath); + const sitePath = path.join(siteDir, filename); + await createSymlink(uniquePath, sitePath); } catch (e) { // URL parsing or symlink creation failed, skip } diff --git a/archivebox/plugins/responses/tests/test_responses.py b/archivebox/plugins/responses/tests/test_responses.py index 82a5fa77..b6404dcd 100644 --- a/archivebox/plugins/responses/tests/test_responses.py +++ b/archivebox/plugins/responses/tests/test_responses.py @@ -13,27 +13,18 @@ import tempfile import time from pathlib import Path -import pytest from django.test import TestCase # Import chrome test helpers sys.path.insert(0, str(Path(__file__).parent.parent.parent / 'chrome' / 'tests')) from chrome_test_helpers import ( chrome_session, - get_test_env, + CHROME_NAVIGATE_HOOK, get_plugin_dir, get_hook_script, ) -def chrome_available() -> bool: - """Check if Chrome/Chromium is available.""" - for name in ['chromium', 'chromium-browser', 'google-chrome', 'chrome']: - if shutil.which(name): - return True - return False - - # Get the path to the responses hook PLUGIN_DIR = get_plugin_dir(__file__) RESPONSES_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_responses.*') @@ -48,7 +39,6 @@ class TestResponsesPlugin(TestCase): self.assertTrue(RESPONSES_HOOK.exists(), f"Hook not found: {RESPONSES_HOOK}") -@pytest.mark.skipif(not chrome_available(), reason="Chrome not installed") class TestResponsesWithChrome(TestCase): """Integration tests for responses plugin with Chrome.""" @@ -65,68 +55,72 @@ class TestResponsesWithChrome(TestCase): test_url = 'https://example.com' snapshot_id = 'test-responses-snapshot' - try: - with chrome_session( - self.temp_dir, - 
crawl_id='test-responses-crawl', - snapshot_id=snapshot_id, - test_url=test_url, - navigate=True, - timeout=30, - ) as (chrome_process, chrome_pid, snapshot_chrome_dir, env): - # Use the environment from chrome_session (already has CHROME_HEADLESS=true) + with chrome_session( + self.temp_dir, + crawl_id='test-responses-crawl', + snapshot_id=snapshot_id, + test_url=test_url, + navigate=False, + timeout=30, + ) as (chrome_process, chrome_pid, snapshot_chrome_dir, env): + responses_dir = snapshot_chrome_dir.parent / 'responses' + responses_dir.mkdir(exist_ok=True) + # Run responses hook with the active Chrome session (background hook) + result = subprocess.Popen( + ['node', str(RESPONSES_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'], + cwd=str(responses_dir), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + env=env + ) - # Run responses hook with the active Chrome session (background hook) - result = subprocess.Popen( - ['node', str(RESPONSES_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'], - cwd=str(snapshot_chrome_dir), - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - env=env - ) + nav_result = subprocess.run( + ['node', str(CHROME_NAVIGATE_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'], + cwd=str(snapshot_chrome_dir), + capture_output=True, + text=True, + timeout=120, + env=env + ) + self.assertEqual(nav_result.returncode, 0, f"Navigation failed: {nav_result.stderr}") - # Check for output directory and index file - index_output = snapshot_chrome_dir / 'index.jsonl' + # Check for output directory and index file + index_output = responses_dir / 'index.jsonl' - # Wait briefly for background hook to write output - for _ in range(10): - if index_output.exists() and index_output.stat().st_size > 0: - break - time.sleep(1) + # Wait briefly for background hook to write output + for _ in range(30): + if index_output.exists() and index_output.stat().st_size > 0: + break + time.sleep(1) - # Verify hook 
ran (may keep running waiting for cleanup signal) - if result.poll() is None: - result.terminate() - try: - stdout, stderr = result.communicate(timeout=5) - except subprocess.TimeoutExpired: - result.kill() - stdout, stderr = result.communicate() - else: + # Verify hook ran (may keep running waiting for cleanup signal) + if result.poll() is None: + result.terminate() + try: + stdout, stderr = result.communicate(timeout=5) + except subprocess.TimeoutExpired: + result.kill() stdout, stderr = result.communicate() - self.assertNotIn('Traceback', stderr) + else: + stdout, stderr = result.communicate() + self.assertNotIn('Traceback', stderr) - # If index file exists, verify it's valid JSONL - if index_output.exists(): - with open(index_output) as f: - content = f.read().strip() - if content: - for line in content.split('\n'): - if line.strip(): - try: - record = json.loads(line) - # Verify structure - self.assertIn('url', record) - self.assertIn('resourceType', record) - except json.JSONDecodeError: - pass # Some lines may be incomplete - - except RuntimeError as e: - if 'Chrome' in str(e) or 'CDP' in str(e): - self.skipTest(f"Chrome session setup failed: {e}") - raise + # If index file exists, verify it's valid JSONL + if index_output.exists(): + with open(index_output) as f: + content = f.read().strip() + self.assertTrue(content, "Responses output should not be empty") + for line in content.split('\n'): + if line.strip(): + try: + record = json.loads(line) + # Verify structure + self.assertIn('url', record) + self.assertIn('resourceType', record) + except json.JSONDecodeError: + pass # Some lines may be incomplete if __name__ == '__main__': diff --git a/archivebox/plugins/screenshot/on_Snapshot__51_screenshot.js b/archivebox/plugins/screenshot/on_Snapshot__51_screenshot.js index 76390846..34cd7a44 100644 --- a/archivebox/plugins/screenshot/on_Snapshot__51_screenshot.js +++ b/archivebox/plugins/screenshot/on_Snapshot__51_screenshot.js @@ -9,7 +9,6 @@ * Output: Writes 
screenshot/screenshot.png * * Environment variables: - * CHROME_RESOLUTION: Screenshot resolution (default: 1440,2000) * SCREENSHOT_ENABLED: Enable screenshot capture (default: true) */ @@ -34,9 +33,10 @@ function flushCoverageAndExit(exitCode) { const { getEnv, getEnvBool, - parseResolution, parseArgs, - readCdpUrl, + connectToPage, + waitForPageLoaded, + readTargetId, } = require('../chrome/chrome_utils.js'); // Check if screenshot is enabled BEFORE requiring puppeteer @@ -75,77 +75,58 @@ function hasStaticFileOutput() { return false; } -// Wait for chrome tab to be fully loaded -async function waitForChromeTabLoaded(timeoutMs = 10000) { - const navigationFile = path.join(CHROME_SESSION_DIR, 'navigation.json'); - const startTime = Date.now(); - - while (Date.now() - startTime < timeoutMs) { - if (fs.existsSync(navigationFile)) { - return true; - } - // Wait 100ms before checking again - await new Promise(resolve => setTimeout(resolve, 100)); - } - - return false; -} - async function takeScreenshot(url) { - const resolution = getEnv('CHROME_RESOLUTION', '1440,2000'); - const { width, height } = parseResolution(resolution); - // Output directory is current directory (hook already runs in output dir) const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE); // Wait for chrome_navigate to complete (writes navigation.json) const timeoutSeconds = parseInt(getEnv('SCREENSHOT_TIMEOUT', '10'), 10); const timeoutMs = timeoutSeconds * 1000; - const pageLoaded = await waitForChromeTabLoaded(timeoutMs); - if (!pageLoaded) { - throw new Error(`Page not loaded after ${timeoutSeconds}s (chrome_navigate must complete first)`); + const navigationFile = path.join(CHROME_SESSION_DIR, 'navigation.json'); + if (!fs.existsSync(navigationFile)) { + await waitForPageLoaded(CHROME_SESSION_DIR, timeoutMs); } - // Connect to existing Chrome session (required - no fallback) - const cdpUrl = readCdpUrl(CHROME_SESSION_DIR); - if (!cdpUrl) { + const cdpFile = path.join(CHROME_SESSION_DIR, 
'cdp_url.txt'); + const targetFile = path.join(CHROME_SESSION_DIR, 'target_id.txt'); + if (!fs.existsSync(cdpFile)) { throw new Error('No Chrome session found (chrome plugin must run first)'); } - - // Read target_id.txt to get the specific tab for this snapshot - const targetIdFile = path.join(CHROME_SESSION_DIR, 'target_id.txt'); - if (!fs.existsSync(targetIdFile)) { + if (!fs.existsSync(targetFile)) { throw new Error('No target_id.txt found (chrome_tab must run first)'); } - const targetId = fs.readFileSync(targetIdFile, 'utf8').trim(); + const cdpUrl = fs.readFileSync(cdpFile, 'utf8').trim(); + if (!cdpUrl.startsWith('ws://') && !cdpUrl.startsWith('wss://')) { + throw new Error('Invalid CDP URL in cdp_url.txt'); + } - const browser = await puppeteer.connect({ - browserWSEndpoint: cdpUrl, - defaultViewport: { width, height }, + const { browser, page } = await connectToPage({ + chromeSessionDir: CHROME_SESSION_DIR, + timeoutMs, + puppeteer, }); try { - // Get the specific page for this snapshot by target ID - const targets = await browser.targets(); - const target = targets.find(t => t._targetId === targetId); - if (!target) { - throw new Error(`Target ${targetId} not found in Chrome session`); + const expectedTargetId = readTargetId(CHROME_SESSION_DIR); + if (!expectedTargetId) { + throw new Error('No target_id.txt found (chrome_tab must run first)'); + } + const actualTargetId = page.target()._targetId; + if (actualTargetId !== expectedTargetId) { + throw new Error(`Target ${expectedTargetId} not found in Chrome session`); } - const page = await target.page(); - if (!page) { - throw new Error(`Could not get page for target ${targetId}`); - } - - // Set viewport on the page - await page.setViewport({ width, height }); - - // Take screenshot (Puppeteer throws on failure) - await page.screenshot({ - path: outputPath, - fullPage: true, + const captureTimeoutMs = Math.max(timeoutMs, 10000); + const timeoutPromise = new Promise((_, reject) => { + setTimeout(() => 
reject(new Error('Screenshot capture timed out')), captureTimeoutMs); }); + await page.bringToFront(); + await Promise.race([ + page.screenshot({ path: outputPath, fullPage: true }), + timeoutPromise, + ]); + return outputPath; } finally { @@ -188,6 +169,7 @@ async function main() { status: 'succeeded', output_str: outputPath, })); + flushCoverageAndExit(0); } main().catch(e => { diff --git a/archivebox/plugins/screenshot/templates/card.html b/archivebox/plugins/screenshot/templates/card.html index 5d49374d..83cc2adc 100644 --- a/archivebox/plugins/screenshot/templates/card.html +++ b/archivebox/plugins/screenshot/templates/card.html @@ -2,7 +2,7 @@ Screenshot of page
📷 Screenshot
diff --git a/archivebox/plugins/screenshot/templates/full.html b/archivebox/plugins/screenshot/templates/full.html index b5f8901a..62226828 100644 --- a/archivebox/plugins/screenshot/templates/full.html +++ b/archivebox/plugins/screenshot/templates/full.html @@ -1,8 +1,7 @@ - -
+ +
Screenshot of page + style="width: auto; max-width: 100%; height: auto; display: block;">
diff --git a/archivebox/plugins/screenshot/tests/test_screenshot.py b/archivebox/plugins/screenshot/tests/test_screenshot.py index 9627ec02..ddc466d3 100644 --- a/archivebox/plugins/screenshot/tests/test_screenshot.py +++ b/archivebox/plugins/screenshot/tests/test_screenshot.py @@ -112,27 +112,7 @@ def test_screenshot_with_chrome_session(): assert screenshot_file.exists() and screenshot_file.stat().st_size > 1000 assert screenshot_file.read_bytes()[:8] == b'\x89PNG\r\n\x1a\n' - # Scenario 2: Custom resolution - screenshot_dir2 = snapshot_chrome_dir.parent / 'screenshot2' - screenshot_dir2.mkdir() - env['CHROME_RESOLUTION'] = '800,600' - - result = subprocess.run( - ['node', str(SCREENSHOT_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'], - cwd=str(screenshot_dir2), - capture_output=True, - text=True, - timeout=30, - env=env - ) - - assert result.returncode == 0 - screenshot_file2 = screenshot_dir2 / 'screenshot.png' - assert screenshot_file2.exists() - file_size = screenshot_file2.stat().st_size - assert 500 < file_size < 100000, f"800x600 screenshot size unexpected: {file_size}" - - # Scenario 3: Wrong target ID (error case) + # Scenario 2: Wrong target ID (error case) screenshot_dir3 = snapshot_chrome_dir.parent / 'screenshot3' screenshot_dir3.mkdir() (snapshot_chrome_dir / 'target_id.txt').write_text('nonexistent-target-id') @@ -149,9 +129,7 @@ def test_screenshot_with_chrome_session(): assert result.returncode != 0 assert 'target' in result.stderr.lower() and 'not found' in result.stderr.lower() - except RuntimeError as e: - if 'Chrome' in str(e) or 'CDP' in str(e): - pytest.skip(f"Chrome session setup failed: {e}") + except RuntimeError: raise @@ -362,30 +340,6 @@ def test_missing_snapshot_id_argument(): assert 'Usage:' in result.stderr or 'snapshot' in result.stderr.lower() -def test_invalid_resolution_format(): - """Test that invalid CHROME_RESOLUTION format is handled gracefully.""" - with tempfile.TemporaryDirectory() as tmpdir: - data_dir = 
Path(tmpdir) - snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-badres' - screenshot_dir = snapshot_dir / 'screenshot' - screenshot_dir.mkdir(parents=True) - - env = get_test_env() - # Invalid resolution formats to test parseResolution error handling - for bad_resolution in ['invalid', '1440', '1440x2000', 'abc,def']: - env['CHROME_RESOLUTION'] = bad_resolution - result = subprocess.run( - ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-badres'], - cwd=str(screenshot_dir), - capture_output=True, - text=True, - timeout=120, - env=env - ) - # Should either fail gracefully or fall back to default - # (depending on implementation - script should not crash with uncaught error) - assert result.returncode in (0, 1), f"Script should handle bad resolution: {bad_resolution}" - def test_no_cdp_url_fails(): """Test error when chrome dir exists but no cdp_url.txt.""" with tempfile.TemporaryDirectory() as tmpdir: diff --git a/archivebox/plugins/search_backend_ripgrep/search.py b/archivebox/plugins/search_backend_ripgrep/search.py index 171b60bb..dd94f153 100644 --- a/archivebox/plugins/search_backend_ripgrep/search.py +++ b/archivebox/plugins/search_backend_ripgrep/search.py @@ -18,6 +18,8 @@ import shutil from pathlib import Path from typing import List, Iterable +from django.conf import settings + def get_env(name: str, default: str = '') -> str: return os.environ.get(name, default).strip() @@ -51,6 +53,12 @@ def _get_archive_dir() -> Path: data_dir = os.environ.get('DATA_DIR', '').strip() if data_dir: return Path(data_dir) / 'archive' + settings_archive_dir = getattr(settings, 'ARCHIVE_DIR', None) + if settings_archive_dir: + return Path(settings_archive_dir) + settings_data_dir = getattr(settings, 'DATA_DIR', None) + if settings_data_dir: + return Path(settings_data_dir) / 'archive' return Path.cwd() / 'archive' diff --git a/archivebox/plugins/search_backend_ripgrep/tests/test_ripgrep_detection.py 
b/archivebox/plugins/search_backend_ripgrep/tests/test_ripgrep_detection.py index 60eb6e3a..26b3f118 100644 --- a/archivebox/plugins/search_backend_ripgrep/tests/test_ripgrep_detection.py +++ b/archivebox/plugins/search_backend_ripgrep/tests/test_ripgrep_detection.py @@ -25,9 +25,7 @@ def test_ripgrep_hook_detects_binary_from_path(): """Test that ripgrep hook finds binary using abx-pkg when env var is just a name.""" hook_path = Path(__file__).parent.parent / 'on_Crawl__50_ripgrep_install.py' - # Skip if rg is not installed - if not shutil.which('rg'): - pass + assert shutil.which('rg'), "ripgrep not installed" # Set SEARCH_BACKEND_ENGINE to enable the hook env = os.environ.copy() @@ -78,8 +76,7 @@ def test_ripgrep_hook_handles_absolute_path(): hook_path = Path(__file__).parent.parent / 'on_Crawl__50_ripgrep_install.py' rg_path = shutil.which('rg') - if not rg_path: - pytest.skip("ripgrep not installed") + assert rg_path, "ripgrep not installed" env = os.environ.copy() env['SEARCH_BACKEND_ENGINE'] = 'ripgrep' @@ -214,8 +211,7 @@ def test_ripgrep_only_detected_when_backend_enabled(): import sys from pathlib import Path - if not shutil.which('rg'): - pytest.skip("ripgrep not installed") + assert shutil.which('rg'), "ripgrep not installed" hook_path = Path(__file__).parent.parent / 'on_Crawl__50_ripgrep_install.py' diff --git a/archivebox/plugins/search_backend_ripgrep/tests/test_ripgrep_search.py b/archivebox/plugins/search_backend_ripgrep/tests/test_ripgrep_search.py index 1f0ce7fa..8c1f957a 100644 --- a/archivebox/plugins/search_backend_ripgrep/tests/test_ripgrep_search.py +++ b/archivebox/plugins/search_backend_ripgrep/tests/test_ripgrep_search.py @@ -151,7 +151,6 @@ class TestRipgrepSearch(TestCase): results = search('test') self.assertEqual(results, []) - @pytest.mark.skipif(not shutil.which('rg'), reason="ripgrep not installed") def test_search_single_match(self): """search should find matching snapshot.""" results = search('Python programming') @@ -160,7 
+159,6 @@ class TestRipgrepSearch(TestCase): self.assertNotIn('snap-002', results) self.assertNotIn('snap-003', results) - @pytest.mark.skipif(not shutil.which('rg'), reason="ripgrep not installed") def test_search_multiple_matches(self): """search should find all matching snapshots.""" # 'guide' appears in snap-002 (JavaScript guide) and snap-003 (Archiving Guide) @@ -170,7 +168,6 @@ class TestRipgrepSearch(TestCase): self.assertIn('snap-003', results) self.assertNotIn('snap-001', results) - @pytest.mark.skipif(not shutil.which('rg'), reason="ripgrep not installed") def test_search_case_insensitive_by_default(self): """search should be case-sensitive (ripgrep default).""" # By default rg is case-sensitive @@ -181,13 +178,11 @@ class TestRipgrepSearch(TestCase): self.assertIsInstance(results_upper, list) self.assertIsInstance(results_lower, list) - @pytest.mark.skipif(not shutil.which('rg'), reason="ripgrep not installed") def test_search_no_results(self): """search should return empty list for no matches.""" results = search('xyznonexistent123') self.assertEqual(results, []) - @pytest.mark.skipif(not shutil.which('rg'), reason="ripgrep not installed") def test_search_regex(self): """search should support regex patterns.""" results = search('(Python|JavaScript)') @@ -195,7 +190,6 @@ class TestRipgrepSearch(TestCase): self.assertIn('snap-001', results) self.assertIn('snap-002', results) - @pytest.mark.skipif(not shutil.which('rg'), reason="ripgrep not installed") def test_search_distinct_snapshots(self): """search should return distinct snapshot IDs.""" # Query matches both files in snap-001 @@ -212,7 +206,6 @@ class TestRipgrepSearch(TestCase): search('test') self.assertIn('ripgrep binary not found', str(context.exception)) - @pytest.mark.skipif(not shutil.which('rg'), reason="ripgrep not installed") def test_search_with_custom_args(self): """search should use custom RIPGREP_ARGS.""" with patch.dict(os.environ, {'RIPGREP_ARGS': '["-i"]'}): # Case insensitive @@ 
-220,7 +213,6 @@ class TestRipgrepSearch(TestCase): # With -i flag, should find regardless of case self.assertIn('snap-001', results) - @pytest.mark.skipif(not shutil.which('rg'), reason="ripgrep not installed") def test_search_timeout(self): """search should handle timeout gracefully.""" with patch.dict(os.environ, {'RIPGREP_TIMEOUT': '1'}): @@ -285,19 +277,16 @@ class TestRipgrepSearchIntegration(TestCase): else: file_path.write_text(content) - @pytest.mark.skipif(not shutil.which('rg'), reason="ripgrep not installed") def test_search_archivebox(self): """Search for archivebox should find documentation snapshot.""" results = search('archivebox') self.assertIn('1704067200.123456', results) - @pytest.mark.skipif(not shutil.which('rg'), reason="ripgrep not installed") def test_search_python(self): """Search for python should find Python news snapshot.""" results = search('Python') self.assertIn('1704153600.654321', results) - @pytest.mark.skipif(not shutil.which('rg'), reason="ripgrep not installed") def test_search_pip_install(self): """Search for installation command.""" results = search('pip install') diff --git a/archivebox/plugins/seo/on_Snapshot__38_seo.js b/archivebox/plugins/seo/on_Snapshot__38_seo.js index e7e905f0..cc107d64 100755 --- a/archivebox/plugins/seo/on_Snapshot__38_seo.js +++ b/archivebox/plugins/seo/on_Snapshot__38_seo.js @@ -21,86 +21,37 @@ const path = require('path'); if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR); const puppeteer = require('puppeteer-core'); +// Import shared utilities from chrome_utils.js +const { + getEnvBool, + getEnvInt, + parseArgs, + connectToPage, + waitForPageLoaded, +} = require('../chrome/chrome_utils.js'); + // Extractor metadata const PLUGIN_NAME = 'seo'; const OUTPUT_DIR = '.'; const OUTPUT_FILE = 'seo.json'; const CHROME_SESSION_DIR = '../chrome'; -// Parse command line arguments -function parseArgs() { - const args = {}; - process.argv.slice(2).forEach(arg => { - if 
(arg.startsWith('--')) { - const [key, ...valueParts] = arg.slice(2).split('='); - args[key.replace(/-/g, '_')] = valueParts.join('=') || true; - } - }); - return args; -} - -// Get environment variable with default -function getEnv(name, defaultValue = '') { - return (process.env[name] || defaultValue).trim(); -} - -function getEnvBool(name, defaultValue = false) { - const val = getEnv(name, '').toLowerCase(); - if (['true', '1', 'yes', 'on'].includes(val)) return true; - if (['false', '0', 'no', 'off'].includes(val)) return false; - return defaultValue; -} - -// Wait for chrome tab to be fully loaded -async function waitForChromeTabLoaded(timeoutMs = 60000) { - const navigationFile = path.join(CHROME_SESSION_DIR, 'navigation.json'); - const startTime = Date.now(); - - while (Date.now() - startTime < timeoutMs) { - if (fs.existsSync(navigationFile)) { - return true; - } - // Wait 100ms before checking again - await new Promise(resolve => setTimeout(resolve, 100)); - } - - return false; -} - -// Get CDP URL from chrome plugin -function getCdpUrl() { - const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt'); - if (fs.existsSync(cdpFile)) { - return fs.readFileSync(cdpFile, 'utf8').trim(); - } - return null; -} - // Extract SEO metadata async function extractSeo(url) { // Output directory is current directory (hook already runs in output dir) const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE); - + const timeout = getEnvInt('SEO_TIMEOUT', getEnvInt('TIMEOUT', 30)) * 1000; let browser = null; try { - // Connect to existing Chrome session - const cdpUrl = getCdpUrl(); - if (!cdpUrl) { - return { success: false, error: 'No Chrome session found (chrome plugin must run first)' }; - } - - browser = await puppeteer.connect({ - browserWSEndpoint: cdpUrl, + // Connect to existing Chrome session and get target page + const connection = await connectToPage({ + chromeSessionDir: CHROME_SESSION_DIR, + timeoutMs: timeout, + puppeteer, }); - - // Get the page - const pages = 
await browser.pages(); - const page = pages.find(p => p.url().startsWith('http')) || pages[0]; - - if (!page) { - return { success: false, error: 'No page found in Chrome session' }; - } + browser = connection.browser; + const page = connection.page; // Extract all meta tags const seoData = await page.evaluate(() => { @@ -179,15 +130,8 @@ async function main() { process.exit(0); } - // Check if Chrome session exists, then wait for page load - const cdpUrl = getCdpUrl(); - if (cdpUrl) { - // Wait for page to be fully loaded - const pageLoaded = await waitForChromeTabLoaded(60000); - if (!pageLoaded) { - throw new Error('Page not loaded after 60s (chrome_navigate must complete first)'); - } - } + const timeout = getEnvInt('SEO_TIMEOUT', getEnvInt('TIMEOUT', 30)) * 1000; + await waitForPageLoaded(CHROME_SESSION_DIR, timeout * 4, 200); const result = await extractSeo(url); diff --git a/archivebox/plugins/seo/tests/test_seo.py b/archivebox/plugins/seo/tests/test_seo.py index 63233b16..d0e2f09f 100644 --- a/archivebox/plugins/seo/tests/test_seo.py +++ b/archivebox/plugins/seo/tests/test_seo.py @@ -6,33 +6,24 @@ meta tag extraction. 
""" import json -import shutil import subprocess import sys import tempfile +import shutil from pathlib import Path -import pytest from django.test import TestCase # Import chrome test helpers sys.path.insert(0, str(Path(__file__).parent.parent.parent / 'chrome' / 'tests')) from chrome_test_helpers import ( chrome_session, - get_test_env, + CHROME_NAVIGATE_HOOK, get_plugin_dir, get_hook_script, ) -def chrome_available() -> bool: - """Check if Chrome/Chromium is available.""" - for name in ['chromium', 'chromium-browser', 'google-chrome', 'chrome']: - if shutil.which(name): - return True - return False - - # Get the path to the SEO hook PLUGIN_DIR = get_plugin_dir(__file__) SEO_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_seo.*') @@ -47,7 +38,6 @@ class TestSEOPlugin(TestCase): self.assertTrue(SEO_HOOK.exists(), f"Hook not found: {SEO_HOOK}") -@pytest.mark.skipif(not chrome_available(), reason="Chrome not installed") class TestSEOWithChrome(TestCase): """Integration tests for SEO plugin with Chrome.""" @@ -64,71 +54,75 @@ class TestSEOWithChrome(TestCase): test_url = 'https://example.com' snapshot_id = 'test-seo-snapshot' - try: - with chrome_session( - self.temp_dir, - crawl_id='test-seo-crawl', - snapshot_id=snapshot_id, - test_url=test_url, - navigate=True, - timeout=30, - ) as (chrome_process, chrome_pid, snapshot_chrome_dir, env): - # Use the environment from chrome_session (already has CHROME_HEADLESS=true) + with chrome_session( + self.temp_dir, + crawl_id='test-seo-crawl', + snapshot_id=snapshot_id, + test_url=test_url, + navigate=False, + timeout=30, + ) as (chrome_process, chrome_pid, snapshot_chrome_dir, env): + seo_dir = snapshot_chrome_dir.parent / 'seo' + seo_dir.mkdir(exist_ok=True) + nav_result = subprocess.run( + ['node', str(CHROME_NAVIGATE_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'], + cwd=str(snapshot_chrome_dir), + capture_output=True, + text=True, + timeout=120, + env=env + ) + self.assertEqual(nav_result.returncode, 0, 
f"Navigation failed: {nav_result.stderr}") - # Run SEO hook with the active Chrome session - result = subprocess.run( - ['node', str(SEO_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'], - cwd=str(snapshot_chrome_dir), - capture_output=True, - text=True, - timeout=60, - env=env - ) + # Run SEO hook with the active Chrome session + result = subprocess.run( + ['node', str(SEO_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'], + cwd=str(seo_dir), + capture_output=True, + text=True, + timeout=60, + env=env + ) - # Check for output file - seo_output = snapshot_chrome_dir / 'seo.json' + # Check for output file + seo_output = seo_dir / 'seo.json' - seo_data = None + seo_data = None - # Try parsing from file first - if seo_output.exists(): - with open(seo_output) as f: + # Try parsing from file first + if seo_output.exists(): + with open(seo_output) as f: + try: + seo_data = json.load(f) + except json.JSONDecodeError: + pass + + # Try parsing from stdout if not in file + if not seo_data: + for line in result.stdout.split('\n'): + line = line.strip() + if line.startswith('{'): try: - seo_data = json.load(f) + record = json.loads(line) + # SEO data typically has title, description, or og: tags + if any(key in record for key in ['title', 'description', 'og:title', 'canonical']): + seo_data = record + break except json.JSONDecodeError: - pass + continue - # Try parsing from stdout if not in file - if not seo_data: - for line in result.stdout.split('\n'): - line = line.strip() - if line.startswith('{'): - try: - record = json.loads(line) - # SEO data typically has title, description, or og: tags - if any(key in record for key in ['title', 'description', 'og:title', 'canonical']): - seo_data = record - break - except json.JSONDecodeError: - continue + # Verify hook ran successfully + self.assertEqual(result.returncode, 0, f"Hook failed: {result.stderr}") + self.assertNotIn('Traceback', result.stderr) + self.assertNotIn('Error:', result.stderr) - # Verify 
hook ran successfully - self.assertEqual(result.returncode, 0, f"Hook failed: {result.stderr}") - self.assertNotIn('Traceback', result.stderr) - self.assertNotIn('Error:', result.stderr) + # example.com has a title, so we MUST get SEO data + self.assertIsNotNone(seo_data, "No SEO data extracted from file or stdout") - # example.com has a title, so we MUST get SEO data - self.assertIsNotNone(seo_data, "No SEO data extracted from file or stdout") - - # Verify we got some SEO data - has_seo_data = any(key in seo_data for key in ['title', 'description', 'og:title', 'canonical', 'meta']) - self.assertTrue(has_seo_data, f"No SEO data extracted: {seo_data}") - - except RuntimeError as e: - if 'Chrome' in str(e) or 'CDP' in str(e): - self.skipTest(f"Chrome session setup failed: {e}") - raise + # Verify we got some SEO data + has_seo_data = any(key in seo_data for key in ['title', 'description', 'og:title', 'canonical', 'meta']) + self.assertTrue(has_seo_data, f"No SEO data extracted: {seo_data}") if __name__ == '__main__': diff --git a/archivebox/plugins/singlefile/on_Snapshot__50_singlefile.py b/archivebox/plugins/singlefile/on_Snapshot__50_singlefile.py index 3590c793..4d91e0e7 100644 --- a/archivebox/plugins/singlefile/on_Snapshot__50_singlefile.py +++ b/archivebox/plugins/singlefile/on_Snapshot__50_singlefile.py @@ -9,12 +9,12 @@ Environment variables: SINGLEFILE_ENABLED: Enable SingleFile archiving (default: True) SINGLEFILE_BINARY: Path to SingleFile binary (default: single-file) SINGLEFILE_NODE_BINARY: Path to Node.js binary (x-fallback: NODE_BINARY) - SINGLEFILE_CHROME_BINARY: Path to Chrome binary (x-fallback: CHROME_BINARY) + SINGLEFILE_CHROME_BINARY: Path to Chrome binary (x-fallback: CHROME_BINARY) [unused; shared Chrome session required] SINGLEFILE_TIMEOUT: Timeout in seconds (x-fallback: TIMEOUT) SINGLEFILE_USER_AGENT: User agent string (x-fallback: USER_AGENT) SINGLEFILE_COOKIES_FILE: Path to cookies file (x-fallback: COOKIES_FILE) 
SINGLEFILE_CHECK_SSL_VALIDITY: Whether to verify SSL certs (x-fallback: CHECK_SSL_VALIDITY) - SINGLEFILE_CHROME_ARGS: Chrome command-line arguments (x-fallback: CHROME_ARGS) + SINGLEFILE_CHROME_ARGS: Chrome command-line arguments (x-fallback: CHROME_ARGS) [unused; shared Chrome session required] SINGLEFILE_ARGS: Default SingleFile arguments (JSON array) SINGLEFILE_ARGS_EXTRA: Extra arguments to append (JSON array) """ @@ -138,8 +138,7 @@ def save_singlefile(url: str, binary: str) -> tuple[bool, str | None, str]: """ Archive URL using SingleFile. - If a Chrome session exists (from chrome plugin), connects to it via CDP. - Otherwise launches a new Chrome instance. + Requires a Chrome session (from chrome plugin) and connects to it via CDP. Returns: (success, output_path, error_message) """ @@ -151,8 +150,7 @@ def save_singlefile(url: str, binary: str) -> tuple[bool, str | None, str]: cookies_file = get_env('SINGLEFILE_COOKIES_FILE') or get_env('COOKIES_FILE', '') singlefile_args = get_env_array('SINGLEFILE_ARGS', []) singlefile_args_extra = get_env_array('SINGLEFILE_ARGS_EXTRA', []) - chrome_args = get_env_array('SINGLEFILE_CHROME_ARGS') or get_env_array('CHROME_ARGS', []) - chrome = get_env('SINGLEFILE_CHROME_BINARY') or get_env('CHROME_BINARY', '') + # Chrome args/binary are intentionally ignored because we require a shared Chrome session cmd = [binary, *singlefile_args] @@ -176,14 +174,8 @@ def save_singlefile(url: str, binary: str) -> tuple[bool, str | None, str]: if cdp_remote_url: print(f'[singlefile] Using existing Chrome session: {cdp_remote_url}', file=sys.stderr) cmd.extend(['--browser-server', cdp_remote_url]) - elif chrome: - print(f'[singlefile] Launching Chrome binary: {chrome}', file=sys.stderr) - cmd.extend(['--browser-executable-path', chrome]) - - # Pass Chrome arguments (only when launching a new browser) - if chrome_args and not cdp_remote_url: - # SingleFile expects --browser-args as a JSON array string - cmd.extend(['--browser-args', 
json.dumps(chrome_args)]) + else: + return False, None, 'No Chrome session found (chrome plugin must run first)' # SSL handling if not check_ssl: @@ -267,8 +259,8 @@ def save_singlefile_with_extension(url: str, timeout: int) -> tuple[bool, str | # Only attempt if chrome session exists cdp_url = get_cdp_url(wait_seconds=min(5, max(1, timeout // 10))) if not cdp_url: - print('[singlefile] No chrome session (cdp_url.txt missing)', file=sys.stderr) - return False, None, 'No Chrome session available' + print('[singlefile] No Chrome session found (chrome plugin must run first)', file=sys.stderr) + return False, None, 'No Chrome session found (chrome plugin must run first)' if not EXTENSION_SAVE_SCRIPT.exists(): print(f'[singlefile] Missing helper script: {EXTENSION_SAVE_SCRIPT}', file=sys.stderr) diff --git a/archivebox/plugins/singlefile/tests/test_singlefile.py b/archivebox/plugins/singlefile/tests/test_singlefile.py index dd533e3c..8de0a163 100644 --- a/archivebox/plugins/singlefile/tests/test_singlefile.py +++ b/archivebox/plugins/singlefile/tests/test_singlefile.py @@ -59,27 +59,71 @@ def test_verify_deps_with_abx_pkg(): def test_singlefile_cli_archives_example_com(): - """Test that singlefile CLI archives example.com and produces valid HTML.""" + """Test that singlefile archives example.com and produces valid HTML.""" with tempfile.TemporaryDirectory() as tmpdir: tmpdir = Path(tmpdir) - env = get_test_env() - env['SINGLEFILE_ENABLED'] = 'true' + data_dir = tmpdir / 'data' + extensions_dir = data_dir / 'personas' / 'Default' / 'chrome_extensions' + downloads_dir = data_dir / 'personas' / 'Default' / 'chrome_downloads' + user_data_dir = data_dir / 'personas' / 'Default' / 'chrome_user_data' + extensions_dir.mkdir(parents=True, exist_ok=True) + downloads_dir.mkdir(parents=True, exist_ok=True) + user_data_dir.mkdir(parents=True, exist_ok=True) + + env_install = os.environ.copy() + env_install.update({ + 'DATA_DIR': str(data_dir), + 'CHROME_EXTENSIONS_DIR': 
str(extensions_dir), + 'CHROME_DOWNLOADS_DIR': str(downloads_dir), + }) - # Run singlefile snapshot hook result = subprocess.run( - [sys.executable, str(SNAPSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=test789'], - cwd=tmpdir, + ['node', str(INSTALL_SCRIPT)], capture_output=True, text=True, - env=env, - timeout=120 + env=env_install, + timeout=120, ) + assert result.returncode == 0, f"Extension install failed: {result.stderr}" + + old_env = os.environ.copy() + os.environ['CHROME_USER_DATA_DIR'] = str(user_data_dir) + os.environ['CHROME_DOWNLOADS_DIR'] = str(downloads_dir) + os.environ['CHROME_EXTENSIONS_DIR'] = str(extensions_dir) + try: + with chrome_session( + tmpdir=tmpdir, + crawl_id='singlefile-cli-crawl', + snapshot_id='singlefile-cli-snap', + test_url=TEST_URL, + navigate=True, + timeout=30, + ) as (_chrome_proc, _chrome_pid, snapshot_chrome_dir, env): + env['SINGLEFILE_ENABLED'] = 'true' + env['CHROME_EXTENSIONS_DIR'] = str(extensions_dir) + env['CHROME_DOWNLOADS_DIR'] = str(downloads_dir) + + singlefile_output_dir = snapshot_chrome_dir.parent / 'singlefile' + singlefile_output_dir.mkdir(parents=True, exist_ok=True) + + # Run singlefile snapshot hook + result = subprocess.run( + [sys.executable, str(SNAPSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=test789'], + cwd=singlefile_output_dir, + capture_output=True, + text=True, + env=env, + timeout=120, + ) + finally: + os.environ.clear() + os.environ.update(old_env) assert result.returncode == 0, f"Hook execution failed: {result.stderr}" # Verify output file exists - output_file = tmpdir / 'singlefile.html' + output_file = singlefile_output_dir / 'singlefile.html' assert output_file.exists(), f"singlefile.html not created. 
stdout: {result.stdout}, stderr: {result.stderr}" # Verify it contains real HTML diff --git a/archivebox/plugins/ssl/on_Snapshot__23_ssl.bg.js b/archivebox/plugins/ssl/on_Snapshot__23_ssl.bg.js index 59740e5c..6559d9fd 100755 --- a/archivebox/plugins/ssl/on_Snapshot__23_ssl.bg.js +++ b/archivebox/plugins/ssl/on_Snapshot__23_ssl.bg.js @@ -34,18 +34,26 @@ const CHROME_SESSION_DIR = '../chrome'; let browser = null; let page = null; +let client = null; let sslCaptured = false; let shuttingDown = false; async function setupListener(url) { const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE); const timeout = getEnvInt('SSL_TIMEOUT', 30) * 1000; + let targetHost = null; // Only extract SSL for HTTPS URLs if (!url.startsWith('https://')) { throw new Error('URL is not HTTPS'); } + try { + targetHost = new URL(url).host; + } catch (e) { + targetHost = null; + } + // Connect to Chrome page using shared utility const { browser, page } = await connectToPage({ chromeSessionDir: CHROME_SESSION_DIR, @@ -53,54 +61,54 @@ async function setupListener(url) { puppeteer, }); - // Set up listener to capture SSL details during navigation - page.on('response', async (response) => { + client = await page.target().createCDPSession(); + await client.send('Network.enable'); + + client.on('Network.responseReceived', (params) => { try { - const request = response.request(); + if (sslCaptured) return; + if (params.type && params.type !== 'Document') return; + const response = params.response || {}; + const responseUrl = response.url || ''; + if (!responseUrl.startsWith('http')) return; - // Only capture the main navigation request - if (!request.isNavigationRequest() || request.frame() !== page.mainFrame()) { - return; + if (targetHost) { + try { + const responseHost = new URL(responseUrl).host; + if (responseHost !== targetHost) return; + } catch (e) { + // Ignore URL parse errors, fall through + } } - // Only capture if it's for our target URL - if 
(!response.url().startsWith(url.split('?')[0])) { - return; - } - - // Get security details from the response - const securityDetails = response.securityDetails(); - let sslInfo = {}; + const securityDetails = response.securityDetails || null; + let sslInfo = { url: responseUrl }; if (securityDetails) { - sslInfo.protocol = securityDetails.protocol(); - sslInfo.subjectName = securityDetails.subjectName(); - sslInfo.issuer = securityDetails.issuer(); - sslInfo.validFrom = securityDetails.validFrom(); - sslInfo.validTo = securityDetails.validTo(); - sslInfo.certificateId = securityDetails.subjectName(); - sslInfo.securityState = 'secure'; + sslInfo.protocol = securityDetails.protocol; + sslInfo.subjectName = securityDetails.subjectName; + sslInfo.issuer = securityDetails.issuer; + sslInfo.validFrom = securityDetails.validFrom; + sslInfo.validTo = securityDetails.validTo; + sslInfo.certificateId = securityDetails.subjectName; + sslInfo.securityState = response.securityState || 'secure'; sslInfo.schemeIsCryptographic = true; - const sanList = securityDetails.sanList(); + const sanList = securityDetails.sanList; if (sanList && sanList.length > 0) { sslInfo.subjectAlternativeNames = sanList; } - } else if (response.url().startsWith('https://')) { - // HTTPS URL but no security details means something went wrong - sslInfo.securityState = 'unknown'; + } else if (responseUrl.startsWith('https://')) { + sslInfo.securityState = response.securityState || 'unknown'; sslInfo.schemeIsCryptographic = true; sslInfo.error = 'No security details available'; } else { - // Non-HTTPS URL sslInfo.securityState = 'insecure'; sslInfo.schemeIsCryptographic = false; } - // Write output directly to file fs.writeFileSync(outputPath, JSON.stringify(sslInfo, null, 2)); sslCaptured = true; - } catch (e) { // Ignore errors } diff --git a/archivebox/plugins/ssl/tests/test_ssl.py b/archivebox/plugins/ssl/tests/test_ssl.py index 5dfa17df..6f8375c1 100644 --- a/archivebox/plugins/ssl/tests/test_ssl.py 
+++ b/archivebox/plugins/ssl/tests/test_ssl.py @@ -13,26 +13,18 @@ import tempfile import time from pathlib import Path -import pytest from django.test import TestCase # Import chrome test helpers sys.path.insert(0, str(Path(__file__).parent.parent.parent / 'chrome' / 'tests')) from chrome_test_helpers import ( chrome_session, + CHROME_NAVIGATE_HOOK, get_plugin_dir, get_hook_script, ) -def chrome_available() -> bool: - """Check if Chrome/Chromium is available.""" - for name in ['chromium', 'chromium-browser', 'google-chrome', 'chrome']: - if shutil.which(name): - return True - return False - - # Get the path to the SSL hook PLUGIN_DIR = get_plugin_dir(__file__) SSL_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_ssl.*') @@ -47,7 +39,6 @@ class TestSSLPlugin(TestCase): self.assertTrue(SSL_HOOK.exists(), f"Hook not found: {SSL_HOOK}") -@pytest.mark.skipif(not chrome_available(), reason="Chrome not installed") class TestSSLWithChrome(TestCase): """Integration tests for SSL plugin with Chrome.""" @@ -64,88 +55,92 @@ class TestSSLWithChrome(TestCase): test_url = 'https://example.com' snapshot_id = 'test-ssl-snapshot' - try: - with chrome_session( - self.temp_dir, - crawl_id='test-ssl-crawl', - snapshot_id=snapshot_id, - test_url=test_url, - navigate=True, - timeout=30, - ) as (chrome_process, chrome_pid, snapshot_chrome_dir, env): - # Use the environment from chrome_session (already has CHROME_HEADLESS=true) + with chrome_session( + self.temp_dir, + crawl_id='test-ssl-crawl', + snapshot_id=snapshot_id, + test_url=test_url, + navigate=False, + timeout=30, + ) as (chrome_process, chrome_pid, snapshot_chrome_dir, env): + ssl_dir = snapshot_chrome_dir.parent / 'ssl' + ssl_dir.mkdir(exist_ok=True) + # Run SSL hook with the active Chrome session (background hook) + result = subprocess.Popen( + ['node', str(SSL_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'], + cwd=str(ssl_dir), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + env=env + ) - # 
Run SSL hook with the active Chrome session (background hook) - result = subprocess.Popen( - ['node', str(SSL_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'], - cwd=str(snapshot_chrome_dir), - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - env=env - ) + nav_result = subprocess.run( + ['node', str(CHROME_NAVIGATE_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'], + cwd=str(snapshot_chrome_dir), + capture_output=True, + text=True, + timeout=120, + env=env + ) + self.assertEqual(nav_result.returncode, 0, f"Navigation failed: {nav_result.stderr}") - # Allow it to run briefly, then terminate (background hook) - time.sleep(3) - if result.poll() is None: - result.terminate() - try: - stdout, stderr = result.communicate(timeout=5) - except subprocess.TimeoutExpired: - result.kill() - stdout, stderr = result.communicate() - else: + # Check for output file + ssl_output = ssl_dir / 'ssl.jsonl' + for _ in range(30): + if ssl_output.exists() and ssl_output.stat().st_size > 0: + break + time.sleep(1) + + if result.poll() is None: + result.terminate() + try: + stdout, stderr = result.communicate(timeout=5) + except subprocess.TimeoutExpired: + result.kill() stdout, stderr = result.communicate() + else: + stdout, stderr = result.communicate() - # Check for output file - ssl_output = snapshot_chrome_dir / 'ssl.jsonl' + ssl_data = None - ssl_data = None + # Try parsing from file first + if ssl_output.exists(): + with open(ssl_output) as f: + content = f.read().strip() + if content.startswith('{'): + try: + ssl_data = json.loads(content) + except json.JSONDecodeError: + pass - # Try parsing from file first - if ssl_output.exists(): - with open(ssl_output) as f: - for line in f: - line = line.strip() - if line.startswith('{'): - try: - ssl_data = json.loads(line) - break - except json.JSONDecodeError: - continue + # Try parsing from stdout if not in file + if not ssl_data: + for line in stdout.split('\n'): + line = line.strip() + if 
line.startswith('{'): + try: + record = json.loads(line) + if 'protocol' in record or 'issuer' in record or record.get('type') == 'SSL': + ssl_data = record + break + except json.JSONDecodeError: + continue - # Try parsing from stdout if not in file - if not ssl_data: - for line in stdout.split('\n'): - line = line.strip() - if line.startswith('{'): - try: - record = json.loads(line) - if 'protocol' in record or 'issuer' in record or record.get('type') == 'SSL': - ssl_data = record - break - except json.JSONDecodeError: - continue + # Verify hook ran successfully + self.assertNotIn('Traceback', stderr) + self.assertNotIn('Error:', stderr) - # Verify hook ran successfully - self.assertNotIn('Traceback', stderr) - self.assertNotIn('Error:', stderr) + # example.com uses HTTPS, so we MUST get SSL certificate data + self.assertIsNotNone(ssl_data, "No SSL data extracted from HTTPS URL") - # example.com uses HTTPS, so we MUST get SSL certificate data - self.assertIsNotNone(ssl_data, "No SSL data extracted from HTTPS URL") - - # Verify we got certificate info - self.assertIn('protocol', ssl_data, f"SSL data missing protocol: {ssl_data}") - self.assertTrue( - ssl_data['protocol'].startswith('TLS') or ssl_data['protocol'].startswith('SSL'), - f"Unexpected protocol: {ssl_data['protocol']}" - ) - - except RuntimeError as e: - if 'Chrome' in str(e) or 'CDP' in str(e): - self.skipTest(f"Chrome session setup failed: {e}") - raise + # Verify we got certificate info + self.assertIn('protocol', ssl_data, f"SSL data missing protocol: {ssl_data}") + self.assertTrue( + ssl_data['protocol'].startswith('TLS') or ssl_data['protocol'].startswith('SSL'), + f"Unexpected protocol: {ssl_data['protocol']}" + ) if __name__ == '__main__': diff --git a/archivebox/plugins/staticfile/on_Snapshot__26_staticfile.bg.js b/archivebox/plugins/staticfile/on_Snapshot__26_staticfile.bg.js index 33531d93..984e15c7 100644 --- a/archivebox/plugins/staticfile/on_Snapshot__26_staticfile.bg.js +++ 
b/archivebox/plugins/staticfile/on_Snapshot__26_staticfile.bg.js @@ -149,6 +149,17 @@ function getFilenameFromUrl(url) { } } +function normalizeUrl(url) { + try { + const parsed = new URL(url); + let path = parsed.pathname || ''; + if (path === '/') path = ''; + return `${parsed.origin}${path}`; + } catch (e) { + return url; + } +} + async function setupStaticFileListener() { const timeout = getEnvInt('STATICFILE_TIMEOUT', 30) * 1000; @@ -174,7 +185,7 @@ async function setupStaticFileListener() { const status = response.status(); // Only process the main document response - if (url !== originalUrl) return; + if (normalizeUrl(url) !== normalizeUrl(originalUrl)) return; if (status < 200 || status >= 300) return; firstResponseHandled = true; @@ -313,6 +324,19 @@ async function main() { // Wait for chrome_navigate to complete (non-fatal) try { await waitForPageLoaded(CHROME_SESSION_DIR, timeout * 4, 500); + if (!detectedContentType && page) { + try { + const inferred = await page.evaluate(() => document.contentType || ''); + if (inferred) { + detectedContentType = inferred.split(';')[0].trim(); + if (isStaticContentType(detectedContentType)) { + isStaticFile = true; + } + } + } catch (e) { + // Best-effort only + } + } } catch (e) { console.error(`WARN: ${e.message}`); } diff --git a/archivebox/plugins/staticfile/tests/test_staticfile.py b/archivebox/plugins/staticfile/tests/test_staticfile.py index b99be87c..f40b0677 100644 --- a/archivebox/plugins/staticfile/tests/test_staticfile.py +++ b/archivebox/plugins/staticfile/tests/test_staticfile.py @@ -48,7 +48,6 @@ class TestStaticfilePlugin(TestCase): self.assertTrue(STATICFILE_HOOK.exists(), f"Hook not found: {STATICFILE_HOOK}") -@pytest.mark.skipif(not chrome_available(), reason="Chrome not installed") class TestStaticfileWithChrome(TestCase): """Integration tests for staticfile plugin with Chrome.""" @@ -116,9 +115,7 @@ class TestStaticfileWithChrome(TestCase): except json.JSONDecodeError: continue - except 
RuntimeError as e: - if 'Chrome' in str(e) or 'CDP' in str(e): - self.skipTest(f"Chrome session setup failed: {e}") + except RuntimeError: raise diff --git a/archivebox/plugins/title/on_Snapshot__54_title.js b/archivebox/plugins/title/on_Snapshot__54_title.js index cfad4add..af89e779 100644 --- a/archivebox/plugins/title/on_Snapshot__54_title.js +++ b/archivebox/plugins/title/on_Snapshot__54_title.js @@ -2,22 +2,27 @@ /** * Extract the title of a URL. * - * If a Chrome session exists (from chrome plugin), connects to it via CDP + * Requires a Chrome session (from chrome plugin) and connects to it via CDP * to get the page title (which includes JS-rendered content). - * Otherwise falls back to fetching the URL and parsing HTML. * * Usage: on_Snapshot__10_title.js --url= --snapshot-id= * Output: Writes title/title.txt * * Environment variables: - * TIMEOUT: Timeout in seconds (default: 30) - * USER_AGENT: User agent string (optional) + * TITLE_TIMEOUT: Timeout in seconds (default: 30) */ const fs = require('fs'); const path = require('path'); -const https = require('https'); -const http = require('http'); +const puppeteer = require('puppeteer-core'); + +// Import shared utilities from chrome_utils.js +const { + getEnvInt, + parseArgs, + connectToPage, + waitForPageLoaded, +} = require('../chrome/chrome_utils.js'); // Extractor metadata const PLUGIN_NAME = 'title'; @@ -25,189 +30,47 @@ const OUTPUT_DIR = '.'; const OUTPUT_FILE = 'title.txt'; const CHROME_SESSION_DIR = '../chrome'; -// Parse command line arguments -function parseArgs() { - const args = {}; - process.argv.slice(2).forEach(arg => { - if (arg.startsWith('--')) { - const [key, ...valueParts] = arg.slice(2).split('='); - args[key.replace(/-/g, '_')] = valueParts.join('=') || true; - } - }); - return args; -} - -// Get environment variable with default -function getEnv(name, defaultValue = '') { - return (process.env[name] || defaultValue).trim(); -} - -function getEnvInt(name, defaultValue = 0) { - const 
val = parseInt(getEnv(name, String(defaultValue)), 10); - return isNaN(val) ? defaultValue : val; -} - -// Wait for chrome tab to be fully loaded -async function waitForChromeTabLoaded(timeoutMs = 60000) { - const navigationFile = path.join(CHROME_SESSION_DIR, 'navigation.json'); - const startTime = Date.now(); - - while (Date.now() - startTime < timeoutMs) { - if (fs.existsSync(navigationFile)) { - return true; - } - // Wait 100ms before checking again - await new Promise(resolve => setTimeout(resolve, 100)); - } - - return false; -} - -// Get CDP URL from chrome plugin if available -function getCdpUrl() { - const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt'); - if (fs.existsSync(cdpFile)) { - return fs.readFileSync(cdpFile, 'utf8').trim(); - } - return null; -} - -// Extract title from HTML -function extractTitleFromHtml(html) { - // Try tag - const titleMatch = html.match(/<title[^>]*>([^<]+)<\/title>/i); - if (titleMatch) { - return titleMatch[1].trim(); - } - - // Try og:title - const ogMatch = html.match(/<meta[^>]+property=["']og:title["'][^>]+content=["']([^"']+)["']/i); - if (ogMatch) { - return ogMatch[1].trim(); - } - - // Try twitter:title - const twitterMatch = html.match(/<meta[^>]+name=["']twitter:title["'][^>]+content=["']([^"']+)["']/i); - if (twitterMatch) { - return twitterMatch[1].trim(); - } - - return null; -} - -// Fetch URL and extract title (fallback method) -function fetchTitle(url) { - return new Promise((resolve, reject) => { - const timeout = getEnvInt('TIMEOUT', 30) * 1000; - const userAgent = getEnv('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)'); - - const client = url.startsWith('https') ? 
https : http; - - const req = client.get(url, { - headers: { 'User-Agent': userAgent }, - timeout, - }, (res) => { - // Handle redirects - if (res.statusCode >= 300 && res.statusCode < 400 && res.headers.location) { - fetchTitle(res.headers.location).then(resolve).catch(reject); - return; - } - - let data = ''; - res.on('data', chunk => { - data += chunk; - // Only need first 64KB to find title - if (data.length > 65536) { - req.destroy(); - } - }); - res.on('end', () => { - const title = extractTitleFromHtml(data); - if (title) { - resolve(title); - } else { - reject(new Error('No title found in HTML')); - } - }); - }); - - req.on('error', reject); - req.on('timeout', () => { - req.destroy(); - reject(new Error('Request timeout')); - }); - }); -} - -// Get title using Puppeteer CDP connection -async function getTitleFromCdp(cdpUrl) { - // Wait for page to be fully loaded - const pageLoaded = await waitForChromeTabLoaded(60000); - if (!pageLoaded) { - throw new Error('Page not loaded after 60s (chrome_navigate must complete first)'); - } - - const puppeteer = require('puppeteer-core'); - - const browser = await puppeteer.connect({ - browserWSEndpoint: cdpUrl, - }); +async function extractTitle(url) { + // Output directory is current directory (hook already runs in output dir) + const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE); + const timeoutMs = getEnvInt('TITLE_TIMEOUT', getEnvInt('TIMEOUT', 30)) * 1000; + let browser = null; try { - // Get existing pages - const pages = await browser.pages(); - const page = pages.find(p => p.url().startsWith('http')) || pages[0]; + const connection = await connectToPage({ + chromeSessionDir: CHROME_SESSION_DIR, + timeoutMs, + puppeteer, + }); + browser = connection.browser; + const page = connection.page; - if (!page) { - throw new Error('No page found in Chrome session'); - } + await waitForPageLoaded(CHROME_SESSION_DIR, timeoutMs * 4, 200); // Get title from page - const title = await page.title(); + let title = await 
page.title(); if (!title) { // Try getting from DOM directly - const domTitle = await page.evaluate(() => { + title = await page.evaluate(() => { return document.title || document.querySelector('meta[property="og:title"]')?.content || document.querySelector('meta[name="twitter:title"]')?.content || document.querySelector('h1')?.textContent?.trim(); }); - return domTitle; } - return title; - } finally { - // Disconnect without closing browser - browser.disconnect(); - } -} - -async function extractTitle(url) { - // Output directory is current directory (hook already runs in output dir) - const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE); - - // Try Chrome session first - const cdpUrl = getCdpUrl(); - if (cdpUrl) { - try { - const title = await getTitleFromCdp(cdpUrl); - if (title) { - fs.writeFileSync(outputPath, title, 'utf8'); - return { success: true, output: outputPath, title, method: 'cdp' }; - } - } catch (e) { - console.error(`CDP title extraction failed: ${e.message}, falling back to HTTP`); + if (title) { + fs.writeFileSync(outputPath, title, 'utf8'); + return { success: true, output: outputPath, title, method: 'cdp' }; } - } - - // Fallback to HTTP fetch - try { - const title = await fetchTitle(url); - fs.writeFileSync(outputPath, title, 'utf8'); - return { success: true, output: outputPath, title, method: 'http' }; + return { success: false, error: 'No title found in Chrome session' }; } catch (e) { return { success: false, error: e.message }; + } finally { + if (browser) { + browser.disconnect(); + } } } diff --git a/archivebox/plugins/title/tests/test_title.py b/archivebox/plugins/title/tests/test_title.py index 91b548d6..78b2ffbd 100644 --- a/archivebox/plugins/title/tests/test_title.py +++ b/archivebox/plugins/title/tests/test_title.py @@ -7,8 +7,7 @@ Tests verify: 3. Title extraction works for real example.com 4. Output file contains actual page title 5. Handles various title sources (<title>, og:title, twitter:title) -6. 
Config options work (TIMEOUT, USER_AGENT) -7. Fallback to HTTP when chrome not available +6. Config options work (TITLE_TIMEOUT) """ import json @@ -23,6 +22,9 @@ from archivebox.plugins.chrome.tests.chrome_test_helpers import ( get_plugin_dir, get_hook_script, parse_jsonl_output, + get_test_env, + chrome_session, + CHROME_NAVIGATE_HOOK, ) @@ -30,6 +32,25 @@ PLUGIN_DIR = get_plugin_dir(__file__) TITLE_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_title.*') TEST_URL = 'https://example.com' +def run_title_capture(title_dir, snapshot_chrome_dir, env, url, snapshot_id): + nav_result = subprocess.run( + ['node', str(CHROME_NAVIGATE_HOOK), f'--url={url}', f'--snapshot-id={snapshot_id}'], + cwd=str(snapshot_chrome_dir), + capture_output=True, + text=True, + timeout=120, + env=env, + ) + result = subprocess.run( + ['node', str(TITLE_HOOK), f'--url={url}', f'--snapshot-id={snapshot_id}'], + cwd=title_dir, + capture_output=True, + text=True, + timeout=60, + env=env, + ) + return nav_result, result + def test_hook_script_exists(): """Verify hook script exists.""" @@ -46,15 +67,18 @@ def test_extracts_title_from_example_com(): with tempfile.TemporaryDirectory() as tmpdir: tmpdir = Path(tmpdir) - # Run title extraction - result = subprocess.run( - ['node', str(TITLE_HOOK), f'--url={TEST_URL}', '--snapshot-id=test789'], - cwd=tmpdir, - capture_output=True, - text=True, - timeout=60 - , - env=get_test_env()) + with chrome_session(tmpdir, test_url=TEST_URL, navigate=False) as (_process, _pid, snapshot_chrome_dir, env): + title_dir = snapshot_chrome_dir.parent / 'title' + title_dir.mkdir(exist_ok=True) + + nav_result, result = run_title_capture( + title_dir, + snapshot_chrome_dir, + env, + TEST_URL, + 'test789', + ) + assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}" assert result.returncode == 0, f"Extraction failed: {result.stderr}" @@ -76,7 +100,7 @@ def test_extracts_title_from_example_com(): assert result_json['status'] == 'succeeded', f"Should 
succeed: {result_json}" # Verify output file exists (hook writes to current directory) - title_file = tmpdir / 'title.txt' + title_file = title_dir / 'title.txt' assert title_file.exists(), "title.txt not created" # Verify title contains REAL example.com title @@ -88,56 +112,33 @@ def test_extracts_title_from_example_com(): assert 'example domain' in title_text.lower(), f"Expected 'Example Domain', got: {title_text}" -def test_falls_back_to_http_when_chrome_unavailable(): - """Test that title plugin falls back to HTTP when chrome unavailable.""" +def test_fails_without_chrome_session(): + """Test that title plugin fails when chrome session is missing.""" if not shutil.which('node'): pass with tempfile.TemporaryDirectory() as tmpdir: tmpdir = Path(tmpdir) - - # Don't create chrome directory - force HTTP fallback + title_dir = tmpdir / 'snapshot' / 'title' + title_dir.mkdir(parents=True, exist_ok=True) # Run title extraction result = subprocess.run( ['node', str(TITLE_HOOK), f'--url={TEST_URL}', '--snapshot-id=testhttp'], - cwd=tmpdir, + cwd=title_dir, capture_output=True, text=True, - timeout=60 - , - env=get_test_env()) + timeout=60, + env=get_test_env(), + ) - assert result.returncode == 0, f"Extraction failed: {result.stderr}" - - # Parse clean JSONL output - result_json = None - for line in result.stdout.strip().split('\n'): - line = line.strip() - if line.startswith('{'): - pass - try: - record = json.loads(line) - if record.get('type') == 'ArchiveResult': - result_json = record - break - except json.JSONDecodeError: - pass - - assert result_json, "Should have ArchiveResult JSONL output" - assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}" - - # Verify output exists and has real title (hook writes to current directory) - output_title_file = tmpdir / 'title.txt' - assert output_title_file.exists(), "Output title.txt not created" - - title_text = output_title_file.read_text().strip() - assert 'example' in title_text.lower() + assert 
result.returncode != 0, f"Should fail without chrome session: {result.stderr}" + assert 'No Chrome session found (chrome plugin must run first)' in (result.stdout + result.stderr) def test_config_timeout_honored(): - """Test that TIMEOUT config is respected.""" + """Test that TITLE_TIMEOUT config is respected.""" if not shutil.which('node'): pass @@ -147,65 +148,27 @@ def test_config_timeout_honored(): # Set very short timeout (but example.com should still succeed) import os - env = os.environ.copy() - env['TIMEOUT'] = '5' + env_override = os.environ.copy() + env_override['TITLE_TIMEOUT'] = '5' - result = subprocess.run( - ['node', str(TITLE_HOOK), f'--url={TEST_URL}', '--snapshot-id=testtimeout'], - cwd=tmpdir, - capture_output=True, - text=True, - env=env, - timeout=30 - ) + with chrome_session(tmpdir, test_url=TEST_URL, navigate=False) as (_process, _pid, snapshot_chrome_dir, env): + title_dir = snapshot_chrome_dir.parent / 'title' + title_dir.mkdir(exist_ok=True) + env.update(env_override) + + nav_result, result = run_title_capture( + title_dir, + snapshot_chrome_dir, + env, + TEST_URL, + 'testtimeout', + ) + assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}" # Should complete (success or fail, but not hang) assert result.returncode in (0, 1), "Should complete without hanging" -def test_config_user_agent(): - """Test that USER_AGENT config is used.""" - - if not shutil.which('node'): - pass - - with tempfile.TemporaryDirectory() as tmpdir: - tmpdir = Path(tmpdir) - - # Set custom user agent - import os - env = os.environ.copy() - env['USER_AGENT'] = 'TestBot/1.0' - - result = subprocess.run( - ['node', str(TITLE_HOOK), f'--url={TEST_URL}', '--snapshot-id=testua'], - cwd=tmpdir, - capture_output=True, - text=True, - env=env, - timeout=60 - ) - - # Should succeed (example.com doesn't block) - if result.returncode == 0: - # Parse clean JSONL output - result_json = None - for line in result.stdout.strip().split('\n'): - line = line.strip() 
- if line.startswith('{'): - pass - try: - record = json.loads(line) - if record.get('type') == 'ArchiveResult': - result_json = record - break - except json.JSONDecodeError: - pass - - assert result_json, "Should have ArchiveResult JSONL output" - assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}" - - def test_handles_https_urls(): """Test that HTTPS URLs work correctly.""" @@ -215,18 +178,22 @@ def test_handles_https_urls(): with tempfile.TemporaryDirectory() as tmpdir: tmpdir = Path(tmpdir) - result = subprocess.run( - ['node', str(TITLE_HOOK), '--url=https://example.org', '--snapshot-id=testhttps'], - cwd=tmpdir, - capture_output=True, - text=True, - timeout=60 - , - env=get_test_env()) + with chrome_session(tmpdir, test_url='https://example.org', navigate=False) as (_process, _pid, snapshot_chrome_dir, env): + title_dir = snapshot_chrome_dir.parent / 'title' + title_dir.mkdir(exist_ok=True) + + nav_result, result = run_title_capture( + title_dir, + snapshot_chrome_dir, + env, + 'https://example.org', + 'testhttps', + ) + assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}" if result.returncode == 0: # Hook writes to current directory - output_title_file = tmpdir / 'title.txt' + output_title_file = title_dir / 'title.txt' if output_title_file.exists(): title_text = output_title_file.read_text().strip() assert len(title_text) > 0, "Title should not be empty" @@ -246,14 +213,23 @@ def test_handles_404_gracefully(): with tempfile.TemporaryDirectory() as tmpdir: tmpdir = Path(tmpdir) - result = subprocess.run( - ['node', str(TITLE_HOOK), '--url=https://example.com/nonexistent-page-404', '--snapshot-id=test404'], - cwd=tmpdir, - capture_output=True, - text=True, - timeout=60 - , - env=get_test_env()) + with chrome_session(tmpdir, test_url='https://example.com/nonexistent-page-404', navigate=False) as ( + _process, + _pid, + snapshot_chrome_dir, + env, + ): + title_dir = snapshot_chrome_dir.parent / 'title' + 
title_dir.mkdir(exist_ok=True) + + nav_result, result = run_title_capture( + title_dir, + snapshot_chrome_dir, + env, + 'https://example.com/nonexistent-page-404', + 'test404', + ) + assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}" # May succeed or fail depending on server behavior # example.com returns "Example Domain" even for 404s @@ -269,20 +245,29 @@ def test_handles_redirects(): with tempfile.TemporaryDirectory() as tmpdir: tmpdir = Path(tmpdir) - # http://example.com redirects to https://example.com - result = subprocess.run( - ['node', str(TITLE_HOOK), '--url=http://example.com', '--snapshot-id=testredirect'], - cwd=tmpdir, - capture_output=True, - text=True, - timeout=60 - , - env=get_test_env()) + with chrome_session(tmpdir, test_url='http://example.com', navigate=False) as ( + _process, + _pid, + snapshot_chrome_dir, + env, + ): + title_dir = snapshot_chrome_dir.parent / 'title' + title_dir.mkdir(exist_ok=True) + + # http://example.com redirects to https://example.com + nav_result, result = run_title_capture( + title_dir, + snapshot_chrome_dir, + env, + 'http://example.com', + 'testredirect', + ) + assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}" # Should succeed and follow redirect if result.returncode == 0: # Hook writes to current directory - output_title_file = tmpdir / 'title.txt' + output_title_file = title_dir / 'title.txt' if output_title_file.exists(): title_text = output_title_file.read_text().strip() assert 'example' in title_text.lower() diff --git a/archivebox/plugins/twocaptcha/on_Crawl__95_twocaptcha_config.js b/archivebox/plugins/twocaptcha/on_Crawl__95_twocaptcha_config.js index 3fe8a10a..2dd2002f 100755 --- a/archivebox/plugins/twocaptcha/on_Crawl__95_twocaptcha_config.js +++ b/archivebox/plugins/twocaptcha/on_Crawl__95_twocaptcha_config.js @@ -174,7 +174,7 @@ async function configure2Captcha() { // Connect to the existing Chrome session via CDP const cdpFile = 
path.join(CHROME_SESSION_DIR, 'cdp_url.txt'); if (!fs.existsSync(cdpFile)) { - return { success: false, error: 'CDP URL not found - chrome plugin must run first' }; + return { success: false, error: 'No Chrome session found (chrome plugin must run first)' }; } const cdpUrl = fs.readFileSync(cdpFile, 'utf-8').trim(); diff --git a/archivebox/plugins/twocaptcha/tests/test_twocaptcha.py b/archivebox/plugins/twocaptcha/tests/test_twocaptcha.py index 5738cc05..4569cb49 100644 --- a/archivebox/plugins/twocaptcha/tests/test_twocaptcha.py +++ b/archivebox/plugins/twocaptcha/tests/test_twocaptcha.py @@ -44,7 +44,7 @@ class TestTwoCaptcha: def setup(self): self.api_key = os.environ.get('TWOCAPTCHA_API_KEY') or os.environ.get('API_KEY_2CAPTCHA') if not self.api_key: - pytest.skip("TWOCAPTCHA_API_KEY required") + pytest.fail("TWOCAPTCHA_API_KEY required") def test_install_and_load(self): """Extension installs and loads in Chromium.""" diff --git a/archivebox/plugins/ublock/tests/test_ublock.py b/archivebox/plugins/ublock/tests/test_ublock.py index debea7f3..a3ab08a8 100644 --- a/archivebox/plugins/ublock/tests/test_ublock.py +++ b/archivebox/plugins/ublock/tests/test_ublock.py @@ -14,6 +14,7 @@ import pytest from archivebox.plugins.chrome.tests.chrome_test_helpers import ( setup_test_env, + get_test_env, launch_chromium_session, kill_chromium_session, CHROME_LAUNCH_HOOK, @@ -283,8 +284,7 @@ const puppeteer = require('puppeteer-core'); result = subprocess.run( ['node', str(script_path)], - cwd=str(script_dir, - env=get_test_env()), + cwd=str(script_dir), capture_output=True, text=True, env=env, @@ -301,11 +301,10 @@ const puppeteer = require('puppeteer-core'); return json.loads(output_lines[-1]) -# Test URL: Yahoo has many ads that uBlock should block +# Test URL: Yahoo has many ads that uBlock should block (no mocks) TEST_URL = 'https://www.yahoo.com/' -@pytest.mark.timeout(15) def test_extension_loads_in_chromium(): """Verify uBlock extension loads in Chromium by visiting its 
dashboard page. @@ -519,15 +518,15 @@ const puppeteer = require('puppeteer-core'); pass -def test_blocks_ads_on_test_page(): - """Live test: verify uBlock Origin blocks ads on a test page. +def test_blocks_ads_on_yahoo_com(): + """Live test: verify uBlock Origin blocks ads on yahoo.com (real network). This test runs TWO browser sessions: 1. WITHOUT extension - verifies ads are NOT blocked (baseline) 2. WITH extension - verifies ads ARE blocked This ensures we're actually testing the extension's effect, not just - that a test page happens to show ads as blocked. + that a test page happens to show ads as blocked. No mocks are used. """ import time @@ -581,20 +580,15 @@ def test_blocks_ads_on_test_page(): # Verify baseline shows ads ARE visible (not blocked) if baseline_result['adElementsFound'] == 0: - pytest.skip( - f"Cannot test extension: no ad elements found on {TEST_URL}. " - f"The page may have changed or loaded differently." + pytest.fail( + f"Baseline must find ad elements on {TEST_URL}, but found none. " + f"This test requires a real ad-heavy page." ) if baseline_result['adElementsVisible'] == 0: - print(f"\nWARNING: Baseline shows 0 visible ads despite finding {baseline_result['adElementsFound']} elements!") - print("This suggests either:") - print(" - There's another ad blocker interfering") - print(" - Network-level ad blocking is in effect") - - pytest.skip( - f"Cannot test extension: baseline shows no visible ads " - f"despite finding {baseline_result['adElementsFound']} ad elements." + pytest.fail( + f"Baseline must have visible ads on {TEST_URL}, but none were visible. " + f"This likely means another ad blocker is active or network-level blocking is in effect." 
) print(f"\n✓ Baseline confirmed: {baseline_result['adElementsVisible']} visible ads without extension") @@ -713,6 +707,10 @@ const puppeteer = require('{env_base['NODE_MODULES_DIR']}/puppeteer-core'); f"With extension: {ext_result['adElementsVisible']} visible ads\n" \ f"Expected fewer ads with extension." + # Ensure uBlock actually blocks at least some ad/track requests + assert ext_result['blockedRequests'] > 0, \ + "uBlock should block at least one ad/track request on yahoo.com" + # Extension should block at least 20% of ads (was consistently blocking 5-13% without proper init time) assert reduction_percent >= 20, \ f"uBlock should block at least 20% of ads.\n" \ diff --git a/archivebox/plugins/ytdlp/templates/card.html b/archivebox/plugins/ytdlp/templates/card.html index 1694ceae..6fe32098 100644 --- a/archivebox/plugins/ytdlp/templates/card.html +++ b/archivebox/plugins/ytdlp/templates/card.html @@ -1,14 +1,17 @@ -<!-- YT-DLP thumbnail - shows video/audio player or placeholder --> -<div class="extractor-thumbnail ytdlp-thumbnail" style="width: 100%; height: 100px; overflow: hidden; background: #1a1a1a; display: flex; align-items: center; justify-content: center;"> - <video src="{{ output_path }}" - style="width: 100%; height: 100px; object-fit: contain;" - poster="" - preload="metadata" - muted - onerror="this.style.display='none'; this.nextElementSibling.style.display='flex';"> - </video> - <div style="display: none; flex-direction: column; align-items: center; color: #888; font-size: 12px;"> - <span style="font-size: 32px;">🎬</span> - <span>YT-DLP</span> +<!-- YT-DLP output list --> +{% if media_files %} + <div class="loose-items" style="pointer-events: auto;"> + {% for file in media_files %} + <a href="{{ file.url|default:file.path|urlencode }}" target="preview" + title="{{ file.name }}"> + 📄 {{ file.name }} + </a> + {% endfor %} </div> -</div> +{% else %} + <div class="thumbnail-compact" data-plugin="ytdlp" data-compact="1"> + <span 
class="thumbnail-compact-icon">🎬</span> + <span class="thumbnail-compact-label">YT-DLP</span> + <span class="thumbnail-compact-meta">media</span> + </div> +{% endif %} diff --git a/archivebox/templates/admin/actions.html b/archivebox/templates/admin/actions.html new file mode 100644 index 00000000..cd481a58 --- /dev/null +++ b/archivebox/templates/admin/actions.html @@ -0,0 +1,31 @@ +{% load i18n %} +<div class="actions"> + <div class="actions-left"> + {% block actions %} + {% block actions-form %} + {% for field in action_form %} + {% if field.name == "tags" %} + <span class="actions-tags">{{ field }}</span> + {% else %} + {% if field.label %}<label>{{ field.label }} {{ field }}</label>{% else %}{{ field }}{% endif %} + {% endif %} + {% endfor %} + {% endblock %} + {% block actions-submit %} + <button type="submit" class="button" name="index" value="{{ action_index|default:0 }}">{% translate "Run" %}</button> + {% endblock %} + {% block actions-counter %} + {% if actions_selection_counter %} + <span class="action-counter" data-actions-icnt="{{ cl.result_list|length }}">{{ selection_note }}</span> + {% if cl.result_count != cl.result_list|length %} + <span class="all hidden">{{ selection_note_all }}</span> + <span class="question hidden"> + <a role="button" href="#" title="{% translate "Click here to select the objects across all pages" %}">{% blocktranslate with cl.result_count as total_count %}Select all {{ total_count }} {{ module_name }}{% endblocktranslate %}</a> + </span> + <span class="clear hidden"><a role="button" href="#">{% translate "Clear selection" %}</a></span> + {% endif %} + {% endif %} + {% endblock %} + {% endblock %} + </div> +</div> diff --git a/archivebox/templates/admin/base.html b/archivebox/templates/admin/base.html index c6270ed9..86bd85c8 100644 --- a/archivebox/templates/admin/base.html +++ b/archivebox/templates/admin/base.html @@ -1,4 +1,4 @@ -{% load i18n static tz %} +{% load i18n static tz core_tags %} {% get_current_language as 
LANGUAGE_CODE %} {% get_current_language_bidi as LANGUAGE_BIDI %} @@ -12,6 +12,10 @@ {% endblock %} <link rel="stylesheet" type="text/css" href="{% block stylesheet %}{% static "admin/css/base.css" %}{% endblock %}"> + {% api_token as api_token %} + <script> + window.ARCHIVEBOX_API_KEY = "{{ api_token|escapejs }}"; + </script> {% block extrastyle %} <style> #upgrade-banner { @@ -55,8 +59,8 @@ } /* Main form container - flexbox grid */ - #content-main form > div, - #content form > div { + body:not(.change-list) #content-main form > div, + body:not(.change-list) #content form > div { display: flex; flex-wrap: wrap; gap: 20px; @@ -909,8 +913,8 @@ } /* Toolbar / search bar */ - #toolbar { - padding: 16px; + #changelist #toolbar { + padding: 12px 16px; background: #fff; border-bottom: 1px solid #e2e8f0; display: flex; @@ -926,6 +930,21 @@ flex: 0 1 auto; max-width: 500px; } + body.change-list #toolbar form > div { + display: flex !important; + align-items: center; + gap: 8px; + flex-wrap: nowrap !important; + white-space: nowrap; + } + body.change-list #toolbar label { + margin: 0; + display: inline-flex; + align-items: center; + } + body.change-list #toolbar input[type="submit"] { + margin: 0; + } #searchbar { flex: 1; @@ -961,6 +980,36 @@ letter-spacing: 0.025em; margin: 0; border-bottom: 1px solid #e2e8f0; + display: flex; + align-items: center; + justify-content: space-between; + gap: 8px; + } + + #changelist-filter .filter-toggle { + border: 1px solid #e2e8f0; + background: #ffffff; + color: #64748b; + font-size: 11px; + padding: 4px 8px; + border-radius: 999px; + cursor: pointer; + text-transform: none; + letter-spacing: normal; + } + + #changelist-filter .filter-toggle:hover { + background: #f1f5f9; + color: #334155; + } + + .filter-toggle-floating { + position: static; + box-shadow: none; + padding: 2px 6px; + font-size: 11px; + line-height: 1.2; + height: 20px; } #changelist-filter h3 { @@ -1004,15 +1053,62 @@ font-weight: 500; } + body.filters-collapsed 
#changelist-filter { + display: none !important; + } + + body.filters-collapsed.change-list .results, + body.filters-collapsed.change-list .paginator, + body.filters-collapsed.change-list #toolbar, + body.filters-collapsed.change-list div.xfull, + body.filters-collapsed.change-list #changelist .changelist-form-container, + body.filters-collapsed.change-list #changelist-form, + body.filters-collapsed.change-list #result_list { + margin-right: 0 !important; + width: 100% !important; + } + + body.filters-collapsed.change-list #changelist .changelist-form-container > div { + max-width: 100% !important; + } + /* Actions bar */ - .actions { + body.change-list #changelist .actions { padding: 12px 16px; background: #f8fafc; - border-bottom: 1px solid #e2e8f0; + border-bottom: 0; + display: flex !important; + align-items: center; + gap: 8px; + flex-wrap: nowrap !important; + overflow-x: auto; + } + body.change-list #changelist { + border: 0 !important; + } + body.change-list #changelist .actions .button, + body.change-list #changelist .actions select, + body.change-list #changelist .actions label { + line-height: 1.5rem; + height: 1.5rem; + display: inline-flex; + align-items: center; + } + body.change-list #changelist .actions-left { display: flex; align-items: center; - gap: 12px; - flex-wrap: wrap; + gap: 8px; + flex-wrap: nowrap !important; + flex: 1 1 auto; + min-width: 0; + white-space: nowrap; + } + body.change-list #changelist .actions-right { + display: flex; + align-items: center; + gap: 8px; + margin-left: auto; + flex: 0 0 auto; } .actions label { @@ -1098,22 +1194,23 @@ align-items: center; gap: 4px; padding: 4px 8px 4px 10px; - background: linear-gradient(135deg, #3b82f6 0%, #2563eb 100%); - color: #fff; + background: var(--tag-bg, #e2e8f0); + color: var(--tag-fg, #1e293b); font-size: 13px; font-weight: 500; border-radius: 16px; white-space: nowrap; transition: all 0.15s ease; -webkit-font-smoothing: antialiased; + border: 1px solid var(--tag-border, #cbd5e1); 
} .tag-pill:hover { - background: linear-gradient(135deg, #2563eb 0%, #1d4ed8 100%); + filter: brightness(0.98); } .tag-pill a.tag-link { - color: #fff; + color: inherit; text-decoration: none; } @@ -1130,10 +1227,10 @@ height: 16px; padding: 0; margin: 0; - background: rgba(255, 255, 255, 0.2); - border: none; + background: rgba(15, 23, 42, 0.08); + border: 1px solid rgba(15, 23, 42, 0.12); border-radius: 50%; - color: #fff; + color: inherit; font-size: 14px; font-weight: 600; line-height: 1; @@ -1143,7 +1240,7 @@ } .tag-remove-btn:hover { - background: rgba(255, 255, 255, 0.4); + background: rgba(15, 23, 42, 0.18); opacity: 1; } @@ -1196,29 +1293,94 @@ font-size: 12px; } - .tag-inline-input-sm { - width: 24px; - min-width: 24px; - max-width: 100px; - padding: 2px 4px; - border: none; + #content .tag-editor-inline input.tag-inline-input-sm { + width: 22px; + min-width: 22px; + max-width: 140px; + height: 22px; + padding: 0 6px; + border: 1px solid #e2e8f0; outline: none; - font-size: 11px; + font-size: 12px; font-family: inherit; - background: transparent; - color: #64748b; - transition: width 0.15s ease; + background: #f1f5f9; + color: #94a3b8; + border-radius: 999px; + text-align: center; + cursor: text; + transition: width 0.15s ease, color 0.15s ease, border-color 0.15s ease, background 0.15s ease; } - .tag-inline-input-sm:focus { - width: 80px; + #content .tag-editor-inline input.tag-inline-input-sm:focus { + width: 120px; color: #1e293b; + border-color: #94a3b8; + background: #ffffff; + text-align: left; } - .tag-inline-input-sm::placeholder { + #content .tag-editor-inline input.tag-inline-input-sm::placeholder { color: #94a3b8; } + /* Actions bar tag editor (compact to avoid crowding buttons) */ + body.change-list #changelist .actions .tag-editor-container { + padding: 2px 6px; + min-height: 24px; + height: 24px; + width: 160px; + max-width: 160px; + flex: 0 0 160px; + flex-wrap: nowrap; + overflow-x: auto; + overflow-y: hidden; + gap: 4px; + } + 
body.change-list #changelist .actions-tags { + display: none; + align-items: center; + } + + /* Ensure changelist filter sidebar is visible */ + body.change-list #changelist .changelist-form-container { + display: flex; + align-items: flex-start; + width: 100%; + gap: 20px; + flex-wrap: nowrap; + } + body.change-list #changelist-filter { + flex: 0 0 260px; + max-width: 260px; + display: block; + margin: 0; + order: 2; + align-self: flex-start; + } + body.change-list #changelist .changelist-form-container > div { + flex: 1 1 auto; + min-width: 0; + order: 1; + max-width: calc(100% - 280px); + } + + .actions .tag-pills { + gap: 4px; + flex-wrap: nowrap; + } + + .actions .tag-pill { + padding: 1px 6px 1px 8px; + font-size: 10px; + } + + .actions .tag-inline-input { + min-width: 40px; + padding: 0; + font-size: 11px; + } + + /* Container in list view title column */ .tags-inline-editor { display: inline; @@ -1497,6 +1659,12 @@ console.log('Converted', buttons.children().length, 'admin actions from dropdown to buttons') jQuery('select[multiple]').select2(); } + function updateTagWidgetVisibility() { + const tagContainer = document.querySelector('.actions-tags'); + if (!tagContainer) return; + const checked = document.querySelectorAll('#changelist-form input.action-select:checked').length; + tagContainer.style.display = checked > 0 ? 
'inline-flex' : 'none'; + } function fixInlineAddRow() { $('#id_snapshottag-MAX_NUM_FORMS').val('1000') $('.add-row').show() @@ -1536,11 +1704,87 @@ } $(document).ready(function() { fix_actions() + updateTagWidgetVisibility() + const form = document.querySelector('#changelist-form') + if (form) { + form.addEventListener('change', updateTagWidgetVisibility) + } fixInlineAddRow() setupSnapshotGridListToggle() setTimeOffset() selectSnapshotIfHotlinked() }) </script> + <script> + (function() { + if (!document.body.classList.contains('change-list')) return; + var filter = document.getElementById('changelist-filter'); + if (!filter) return; + var header = filter.querySelector('h2'); + if (!header) return; + + var toggle = document.getElementById('changelist-filter-toggle'); + if (!toggle) { + toggle = document.createElement('button'); + toggle.type = 'button'; + toggle.id = 'changelist-filter-toggle'; + toggle.className = 'filter-toggle'; + toggle.setAttribute('aria-expanded', 'true'); + toggle.dataset.showLabel = '{% translate "Filters" %}'; + toggle.dataset.hideLabel = '{% translate "Hide" %}'; + toggle.textContent = toggle.dataset.hideLabel; + header.appendChild(toggle); + } + + var storageKey = 'admin-filters-collapsed'; + var changelist = document.getElementById('changelist'); + var hadFiltered = changelist && changelist.classList.contains('filtered'); + + var floating = document.getElementById('changelist-filter-float-toggle'); + if (!floating) { + floating = document.createElement('button'); + floating.type = 'button'; + floating.id = 'changelist-filter-float-toggle'; + floating.className = 'filter-toggle filter-toggle-floating'; + floating.textContent = toggle.dataset.showLabel; + } + + var actionsRight = document.querySelector('#changelist .actions .actions-right'); + var actionsBar = document.querySelector('#changelist .actions'); + if (actionsRight) { + actionsRight.appendChild(floating); + } else if (actionsBar) { + actionsBar.appendChild(floating); + } + + 
function applyState() { + var collapsed = localStorage.getItem(storageKey) === 'true'; + document.body.classList.toggle('filters-collapsed', collapsed); + filter.style.display = collapsed ? 'none' : ''; + toggle.textContent = collapsed ? toggle.dataset.showLabel : toggle.dataset.hideLabel; + toggle.setAttribute('aria-expanded', collapsed ? 'false' : 'true'); + floating.style.display = collapsed ? 'inline-flex' : 'none'; + if (changelist) { + if (collapsed) { + changelist.classList.remove('filtered'); + } else if (hadFiltered) { + changelist.classList.add('filtered'); + } + } + } + + function toggleFilters() { + var collapsed = !document.body.classList.contains('filters-collapsed'); + localStorage.setItem(storageKey, collapsed ? 'true' : 'false'); + applyState(); + } + + toggle.addEventListener('click', toggleFilters); + floating.addEventListener('click', toggleFilters); + + applyState(); + })(); + </script> + <script src="{% static 'admin-inline-tags.js' %}"></script> </body> </html> diff --git a/archivebox/templates/admin/private_index.html b/archivebox/templates/admin/private_index.html index b60f3a3e..370343e6 100644 --- a/archivebox/templates/admin/private_index.html +++ b/archivebox/templates/admin/private_index.html @@ -78,7 +78,19 @@ {% block filters %} {% if cl.has_filters %} <div id="changelist-filter"> - <h2>{% translate 'Filter' %}</h2> + <h2> + {% translate 'Filter' %} + <button + type="button" + id="changelist-filter-toggle" + class="filter-toggle" + aria-expanded="true" + data-show-label="{% translate 'Filters' %}" + data-hide-label="{% translate 'Hide' %}" + > + {% translate 'Hide' %} + </button> + </h2> {% if cl.has_active_filters %}<h3 id="changelist-filter-clear"> <a href="{{ cl.clear_all_filters_qs }}">✖ {% translate "Clear all filters" %}</a> </h3>{% endif %} @@ -88,4 +100,28 @@ {% endblock %} </div> </div> + {% if cl.has_filters %} + <script> + (function() { + var storageKey = 'admin-filters-collapsed'; + var toggle = 
document.getElementById('changelist-filter-toggle'); + if (!toggle) return; + + function applyState() { + var collapsed = localStorage.getItem(storageKey) === 'true'; + document.body.classList.toggle('filters-collapsed', collapsed); + toggle.textContent = collapsed ? toggle.dataset.showLabel : toggle.dataset.hideLabel; + toggle.setAttribute('aria-expanded', collapsed ? 'false' : 'true'); + } + + toggle.addEventListener('click', function() { + var collapsed = !document.body.classList.contains('filters-collapsed'); + localStorage.setItem(storageKey, collapsed ? 'true' : 'false'); + applyState(); + }); + + applyState(); + })(); + </script> + {% endif %} {% endblock %} diff --git a/archivebox/templates/admin/private_index_grid.html b/archivebox/templates/admin/private_index_grid.html index b60f3a3e..370343e6 100644 --- a/archivebox/templates/admin/private_index_grid.html +++ b/archivebox/templates/admin/private_index_grid.html @@ -78,7 +78,19 @@ {% block filters %} {% if cl.has_filters %} <div id="changelist-filter"> - <h2>{% translate 'Filter' %}</h2> + <h2> + {% translate 'Filter' %} + <button + type="button" + id="changelist-filter-toggle" + class="filter-toggle" + aria-expanded="true" + data-show-label="{% translate 'Filters' %}" + data-hide-label="{% translate 'Hide' %}" + > + {% translate 'Hide' %} + </button> + </h2> {% if cl.has_active_filters %}<h3 id="changelist-filter-clear"> <a href="{{ cl.clear_all_filters_qs }}">✖ {% translate "Clear all filters" %}</a> </h3>{% endif %} @@ -88,4 +100,28 @@ {% endblock %} </div> </div> + {% if cl.has_filters %} + <script> + (function() { + var storageKey = 'admin-filters-collapsed'; + var toggle = document.getElementById('changelist-filter-toggle'); + if (!toggle) return; + + function applyState() { + var collapsed = localStorage.getItem(storageKey) === 'true'; + document.body.classList.toggle('filters-collapsed', collapsed); + toggle.textContent = collapsed ? 
toggle.dataset.showLabel : toggle.dataset.hideLabel; + toggle.setAttribute('aria-expanded', collapsed ? 'false' : 'true'); + } + + toggle.addEventListener('click', function() { + var collapsed = !document.body.classList.contains('filters-collapsed'); + localStorage.setItem(storageKey, collapsed ? 'true' : 'false'); + applyState(); + }); + + applyState(); + })(); + </script> + {% endif %} {% endblock %} diff --git a/archivebox/templates/admin/progress_monitor.html b/archivebox/templates/admin/progress_monitor.html index 266afb70..5fc449e6 100644 --- a/archivebox/templates/admin/progress_monitor.html +++ b/archivebox/templates/admin/progress_monitor.html @@ -130,6 +130,29 @@ color: #c9d1d9; border-color: #8b949e; } + #progress-monitor .cancel-item-btn { + background: transparent; + border: 1px solid #30363d; + color: #f85149; + cursor: pointer; + padding: 2px 6px; + border-radius: 6px; + font-size: 11px; + line-height: 1; + transition: all 0.2s; + flex-shrink: 0; + } + #progress-monitor .cancel-item-btn:hover { + background: rgba(248, 81, 73, 0.12); + border-color: #f85149; + color: #ff7b72; + } + #progress-monitor .cancel-item-btn.is-busy { + opacity: 0.6; + cursor: wait; + border-color: #6e7681; + color: #6e7681; + } /* Tree Container */ #progress-monitor .tree-container { @@ -161,14 +184,21 @@ gap: 12px; padding: 10px 14px; background: rgba(0,0,0,0.2); - cursor: pointer; - text-decoration: none; - color: inherit; } #progress-monitor .crawl-header:hover { background: rgba(88, 166, 255, 0.1); } - #progress-monitor a.crawl-header:visited { + #progress-monitor .crawl-header-link { + display: flex; + align-items: center; + gap: 12px; + flex: 1; + min-width: 0; + cursor: pointer; + text-decoration: none; + color: inherit; + } + #progress-monitor a.crawl-header-link:visited { color: inherit; } #progress-monitor .crawl-icon { @@ -256,14 +286,21 @@ align-items: center; gap: 10px; padding: 8px 12px; - cursor: pointer; - text-decoration: none; - color: inherit; } 
#progress-monitor .snapshot-header:hover { background: rgba(88, 166, 255, 0.05); } - #progress-monitor a.snapshot-header:visited { + #progress-monitor .snapshot-header-link { + display: flex; + align-items: center; + gap: 10px; + flex: 1; + min-width: 0; + cursor: pointer; + text-decoration: none; + color: inherit; + } + #progress-monitor a.snapshot-header-link:visited { color: inherit; } #progress-monitor .snapshot-icon { @@ -342,7 +379,6 @@ } #progress-monitor .extractor-badge.started .progress-fill { background: rgba(210, 153, 34, 0.3); - width: 50%; animation: progress-pulse 1.5s ease-in-out infinite; } @keyframes progress-pulse { @@ -518,6 +554,25 @@ letter-spacing: 0.5px; flex-shrink: 0; } + #progress-monitor .pid-label { + display: inline-flex; + align-items: center; + gap: 4px; + padding: 2px 6px; + border-radius: 999px; + font-size: 10px; + font-weight: 600; + color: #8b949e; + background: rgba(148, 163, 184, 0.12); + border: 1px solid rgba(148, 163, 184, 0.2); + font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace; + letter-spacing: 0.2px; + white-space: nowrap; + } + #progress-monitor .pid-label.compact { + padding: 1px 5px; + font-size: 9px; + } </style> @@ -527,6 +582,7 @@ <div class="orchestrator-status"> <span class="status-dot stopped" id="orchestrator-dot"></span> <span id="orchestrator-text">Stopped</span> + <span class="pid-label compact" id="orchestrator-pid" style="display:none;"></span> </div> <div class="stats"> <div class="stat"> @@ -572,12 +628,32 @@ const thumbnailStrip = document.getElementById('thumbnail-strip'); let pollInterval = null; + let pollDelayMs = 1000; + let idleTicks = 0; let isCollapsed = localStorage.getItem('progress-monitor-collapsed') === 'true'; let knownThumbnailIds = new Set(); // Baselines for resettable counters let succeededBaseline = parseInt(localStorage.getItem('progress-succeeded-baseline') || '0'); let failedBaseline = 
parseInt(localStorage.getItem('progress-failed-baseline') || '0'); + + function getApiKey() { + return (window.ARCHIVEBOX_API_KEY || '').trim(); + } + + function buildApiUrl(path) { + const apiKey = getApiKey(); + if (!apiKey) return path; + const sep = path.includes('?') ? '&' : '?'; + return `${path}${sep}api_key=${encodeURIComponent(apiKey)}`; + } + + function buildApiHeaders() { + const headers = { 'Content-Type': 'application/json' }; + const apiKey = getApiKey(); + if (apiKey) headers['X-ArchiveBox-API-Key'] = apiKey; + return headers; + } let lastSucceeded = 0; let lastFailed = 0; @@ -620,6 +696,7 @@ return icons[plugin] || '📄'; } + function renderThumbnail(thumb, isNew) { const ext = (thumb.embed_path || '').toLowerCase().split('.').pop(); const isImage = ['png', 'jpg', 'jpeg', 'gif', 'webp', 'svg', 'ico'].includes(ext); @@ -630,9 +707,10 @@ item.title = `${thumb.plugin}: ${thumb.snapshot_url}`; item.dataset.id = thumb.id; - if (isImage && thumb.archive_path) { + const archiveUrl = thumb.archive_url || thumb.archive_path; + if (isImage && archiveUrl) { item.innerHTML = ` - <img src="${thumb.archive_path}" alt="${thumb.plugin}" loading="lazy" onerror="this.parentElement.innerHTML='<div class=\\'thumbnail-fallback\\'>${getPluginIcon(thumb.plugin)}</div><span class=\\'thumbnail-plugin\\'>${thumb.plugin}</span>'"> + <img src="${archiveUrl}" alt="${thumb.plugin}" loading="lazy" onerror="this.parentElement.innerHTML='<div class=\\'thumbnail-fallback\\'>${getPluginIcon(thumb.plugin)}</div><span class=\\'thumbnail-plugin\\'>${thumb.plugin}</span>'"> <span class="thumbnail-plugin">${thumb.plugin}</span> `; } else { @@ -685,13 +763,19 @@ extractor.status === 'failed' ? '✗' : extractor.status === 'backoff' ? '⌛' : extractor.status === 'skipped' ? '⇢' : '○'; + const progress = typeof extractor.progress === 'number' + ? Math.max(0, Math.min(100, extractor.progress)) + : null; + const progressStyle = progress !== null ? 
` style="width: ${progress}%;"` : ''; + const pidHtml = extractor.pid ? `<span class="pid-label compact">pid ${extractor.pid}</span>` : ''; return ` <span class="extractor-badge ${extractor.status || 'queued'}"> - <span class="progress-fill"></span> + <span class="progress-fill"${progressStyle}></span> <span class="badge-content"> <span class="badge-icon">${icon}</span> <span>${extractor.plugin || 'unknown'}</span> + ${pidHtml} </span> </span> `; @@ -700,6 +784,11 @@ function renderSnapshot(snapshot, crawlId) { const statusIcon = snapshot.status === 'started' ? '↻' : '📄'; const adminUrl = `/admin/core/snapshot/${snapshot.id || 'unknown'}/change/`; + const canCancel = snapshot.status === 'queued'; + const cancelBtn = canCancel + ? `<button class="cancel-item-btn" data-cancel-type="snapshot" data-snapshot-id="${snapshot.id}" data-label="✕" title="Cancel snapshot">✕</button>` + : ''; + const snapshotPidHtml = snapshot.worker_pid ? `<span class="pid-label compact">pid ${snapshot.worker_pid}</span>` : ''; let extractorHtml = ''; if (snapshot.all_plugins && snapshot.all_plugins.length > 0) { @@ -716,18 +805,22 @@ return ` <div class="snapshot-item"> - <a class="snapshot-header" href="${adminUrl}"> - <span class="snapshot-icon">${statusIcon}</span> - <div class="snapshot-info"> - <div class="snapshot-url">${formatUrl(snapshot.url)}</div> - <div class="snapshot-meta"> - ${(snapshot.total_plugins || 0) > 0 - ? `${snapshot.completed_plugins || 0}/${snapshot.total_plugins || 0} extractors${(snapshot.failed_plugins || 0) > 0 ? ` <span style="color:#f85149">(${snapshot.failed_plugins} failed)</span>` : ''}` - : 'Waiting for extractors...'} + <div class="snapshot-header"> + <a class="snapshot-header-link" href="${adminUrl}"> + <span class="snapshot-icon">${statusIcon}</span> + <div class="snapshot-info"> + <div class="snapshot-url">${formatUrl(snapshot.url)}</div> + <div class="snapshot-meta"> + ${(snapshot.total_plugins || 0) > 0 + ? 
`${snapshot.completed_plugins || 0}/${snapshot.total_plugins || 0} extractors${(snapshot.failed_plugins || 0) > 0 ? ` <span style="color:#f85149">(${snapshot.failed_plugins} failed)</span>` : ''}` + : 'Waiting for extractors...'} + </div> + </div> - </div> - <span class="status-badge ${snapshot.status || 'unknown'}">${snapshot.status || 'unknown'}</span> - </a> + ${snapshotPidHtml} + <span class="status-badge ${snapshot.status || 'unknown'}">${snapshot.status || 'unknown'}</span> + </a> + ${cancelBtn} + </div> <div class="snapshot-progress"> <div class="progress-bar-container"> <div class="progress-bar snapshot ${snapshot.status === 'started' && (snapshot.progress || 0) === 0 ? 'indeterminate' : ''}" @@ -742,6 +835,11 @@ function renderCrawl(crawl) { const statusIcon = crawl.status === 'started' ? '↻' : '🔍'; const adminUrl = `/admin/crawls/crawl/${crawl.id || 'unknown'}/change/`; + const canCancel = crawl.status === 'queued' || crawl.status === 'started'; + const cancelBtn = canCancel + ? `<button class="cancel-item-btn" data-cancel-type="crawl" data-crawl-id="${crawl.id}" data-label="✕" title="Cancel crawl">✕</button>` + : ''; + const crawlPidHtml = crawl.worker_pid ? `<span class="pid-label compact">pid ${crawl.worker_pid}</span>` : ''; let snapshotsHtml = ''; if (crawl.active_snapshots && crawl.active_snapshots.length > 0) { @@ -760,7 +858,7 @@ // Queued but retry_at is in future (was claimed by worker, will retry) warningHtml = ` <div style="padding: 8px 14px; background: rgba(88, 166, 255, 0.1); border-top: 1px solid #58a6ff; color: #58a6ff; font-size: 11px;"> - 🔄 Retrying in ${crawl.seconds_until_retry || 0}s...${crawl.urls_preview ? ` (${crawl.urls_preview})` : ''} + 🔄 Retrying in ${crawl.seconds_until_retry || 0}s...${crawl.urls_preview ?
` (${crawl.urls_preview})` : ''} </div> `; } else if (crawl.status === 'queued' && crawl.total_snapshots === 0) { @@ -784,19 +882,23 @@ return ` <div class="crawl-item" data-crawl-id="${crawl.id || 'unknown'}"> - <a class="crawl-header" href="${adminUrl}"> - <span class="crawl-icon">${statusIcon}</span> - <div class="crawl-info"> - <div class="crawl-label">${crawl.label || '(no label)'}</div> - <div class="crawl-meta">${metaText}</div> - </div> - <div class="crawl-stats"> - <span style="color:#3fb950">${crawl.completed_snapshots || 0} done</span> - <span style="color:#d29922">${crawl.started_snapshots || 0} active</span> - <span style="color:#8b949e">${crawl.pending_snapshots || 0} pending</span> - </div> - <span class="status-badge ${crawl.status || 'unknown'}">${crawl.status || 'unknown'}</span> - </a> + <div class="crawl-header"> + <a class="crawl-header-link" href="${adminUrl}"> + <span class="crawl-icon">${statusIcon}</span> + <div class="crawl-info"> + <div class="crawl-label">${crawl.label || '(no label)'}</div> + <div class="crawl-meta">${metaText}</div> + </div> + <div class="crawl-stats"> + <span style="color:#3fb950">${crawl.completed_snapshots || 0} done</span> + <span style="color:#d29922">${crawl.started_snapshots || 0} active</span> + <span style="color:#8b949e">${crawl.pending_snapshots || 0} pending</span> + </div> + ${crawlPidHtml} + <span class="status-badge ${crawl.status || 'unknown'}">${crawl.status || 'unknown'}</span> + </a> + ${cancelBtn} + </div> <div class="crawl-progress"> <div class="progress-bar-container"> <div class="progress-bar crawl ${crawl.status === 'started' && (crawl.progress || 0) === 0 ? 
'indeterminate' : ''}" @@ -820,11 +922,26 @@ data.crawls_pending > 0 || data.crawls_started > 0 || data.snapshots_pending > 0 || data.snapshots_started > 0 || data.archiveresults_pending > 0 || data.archiveresults_started > 0; + if (!hasActivity && !isCollapsed) { + setCollapsedState(true, false); + } + if (hasActivity) { + idleTicks = 0; + if (pollDelayMs !== 1000) { + setPollingDelay(1000); + } + } else { + idleTicks += 1; + if (idleTicks > 5 && pollDelayMs !== 10000) { + setPollingDelay(10000); + } + } // Update orchestrator status - show "Running" only when there's actual activity // Don't distinguish between "Stopped" and "Idle" since orchestrator starts/stops frequently const dot = document.getElementById('orchestrator-dot'); const text = document.getElementById('orchestrator-text'); + const pidEl = document.getElementById('orchestrator-pid'); const hasWorkers = data.total_workers > 0; if (hasWorkers || hasActivity) { @@ -838,6 +955,14 @@ text.textContent = 'Idle'; } + if (data.orchestrator_pid) { + pidEl.textContent = `pid ${data.orchestrator_pid}`; + pidEl.style.display = 'inline-flex'; + } else { + pidEl.textContent = ''; + pidEl.style.display = 'none'; + } + // Pulse the dot to show we got fresh data dot.classList.add('flash'); setTimeout(() => dot.classList.remove('flash'), 300); @@ -909,7 +1034,7 @@ function startPolling() { if (pollInterval) return; fetchProgress(); - pollInterval = setInterval(fetchProgress, 1000); // Poll every 1 second + pollInterval = setInterval(fetchProgress, pollDelayMs); } function stopPolling() { @@ -919,10 +1044,19 @@ } } - // Collapse toggle - collapseBtn.addEventListener('click', function() { - isCollapsed = !isCollapsed; - localStorage.setItem('progress-monitor-collapsed', isCollapsed); + function setPollingDelay(ms) { + pollDelayMs = ms; + if (pollInterval) { + clearInterval(pollInterval); + pollInterval = setInterval(fetchProgress, pollDelayMs); + } + } + + function setCollapsedState(collapsed, persist = true) { + isCollapsed =
collapsed; + if (persist) { + localStorage.setItem('progress-monitor-collapsed', isCollapsed); + } if (isCollapsed) { monitor.classList.add('collapsed'); collapseBtn.textContent = 'Expand'; @@ -930,12 +1064,92 @@ monitor.classList.remove('collapsed'); collapseBtn.textContent = 'Details'; } + } + + function setCancelButtonState(btn, busy) { + if (!btn) return; + const label = btn.dataset.label || '✕'; + btn.disabled = !!busy; + btn.classList.toggle('is-busy', !!busy); + btn.textContent = busy ? '…' : label; + } + + function cancelCrawl(crawlId, btn) { + if (!crawlId) return; + if (!getApiKey()) { + console.warn('API key unavailable for this session.'); + setCancelButtonState(btn, false); + return; + } + setCancelButtonState(btn, true); + + fetch(buildApiUrl(`/api/v1/crawls/crawl/${crawlId}`), { + method: 'PATCH', + headers: buildApiHeaders(), + body: JSON.stringify({ status: 'sealed', retry_at: null }), + }) + .then(response => response.json()) + .then(data => { + if (data.error) { + console.error('Cancel crawl error:', data.error); + } + fetchProgress(); + }) + .catch(error => { + console.error('Cancel crawl failed:', error); + setCancelButtonState(btn, false); + }); + } + + function cancelSnapshot(snapshotId, btn) { + if (!snapshotId) return; + if (!getApiKey()) { + console.warn('API key unavailable for this session.'); + setCancelButtonState(btn, false); + return; + } + setCancelButtonState(btn, true); + + fetch(buildApiUrl(`/api/v1/core/snapshot/${snapshotId}`), { + method: 'PATCH', + headers: buildApiHeaders(), + body: JSON.stringify({ status: 'sealed', retry_at: null }), + }) + .then(response => response.json()) + .then(data => { + if (data.error) { + console.error('Cancel snapshot error:', data.error); + } + fetchProgress(); + }) + .catch(error => { + console.error('Cancel snapshot failed:', error); + setCancelButtonState(btn, false); + }); + } + + // Collapse toggle + collapseBtn.addEventListener('click', function() { + setCollapsedState(!isCollapsed); + }); 
+ + crawlTree.addEventListener('click', function(event) { + const btn = event.target.closest('.cancel-item-btn'); + if (!btn) return; + event.preventDefault(); + event.stopPropagation(); + + const cancelType = btn.dataset.cancelType; + if (cancelType === 'crawl') { + cancelCrawl(btn.dataset.crawlId, btn); + } else if (cancelType === 'snapshot') { + cancelSnapshot(btn.dataset.snapshotId, btn); + } }); // Apply initial state if (isCollapsed) { - monitor.classList.add('collapsed'); - collapseBtn.textContent = 'Expand'; + setCollapsedState(true, false); } // Start polling when page loads diff --git a/archivebox/templates/admin/snapshots_grid.html b/archivebox/templates/admin/snapshots_grid.html index bf115e8e..3e312338 100644 --- a/archivebox/templates/admin/snapshots_grid.html +++ b/archivebox/templates/admin/snapshots_grid.html @@ -180,7 +180,7 @@ <input type="checkbox" name="_selected_action" value="{{obj.pk}}"/> </label> </div> - <a href="/{{obj.archive_path}}/index.html" class="card-thumbnail {% if not obj.thumbnail_url %}missing{% endif %}"> + <a href="{% snapshot_base_url obj %}/index.html" class="card-thumbnail {% if not obj.thumbnail_url %}missing{% endif %}"> <img src="{{obj.thumbnail_url|default:'/static/spinner.gif' }}" alt="{{obj.title|default:'Not yet archived...'}}" /> </a> <div class="card-footer"> @@ -194,10 +194,10 @@ </div> {% endif %} <div class="card-title" title="{{obj.title}}"> - <a href="/{{obj.archive_path}}/index.html"> + <a href="{% snapshot_base_url obj %}/index.html"> <h4> {% if obj.is_archived %} - <img src="/{{obj.archive_path}}/favicon.ico" onerror="this.style.display='none'" class="link-favicon" decoding="async"/> + <img src="{% snapshot_base_url obj %}/favicon.ico" onerror="this.style.display='none'" class="link-favicon" decoding="async"/> {% else %} <img src="{% static 'spinner.gif' %}" onerror="this.style.display='none'" class="link-favicon" decoding="async"/> {% endif %} diff --git a/archivebox/templates/core/index_row.html 
b/archivebox/templates/core/index_row.html index 82e28a44..0b4aa265 100644 --- a/archivebox/templates/core/index_row.html +++ b/archivebox/templates/core/index_row.html @@ -6,12 +6,12 @@ </td> <td class="title-col" style="opacity: {% if link.title %}1{% else %}0.3{% endif %}" title="{{link.title|default:'Not yet archived...'}}"> {% if link.is_archived %} - <a href="/{{link.archive_path}}/index.html"><img src="/{{link.archive_path}}/favicon.ico" onerror="this.style.display='none'" class="link-favicon" decoding="async"></a> + <a href="{% web_base_url %}/{{link.archive_path}}/index.html"><img src="{% snapshot_url link 'favicon.ico' %}" onerror="this.style.display='none'" class="link-favicon" decoding="async"></a> {% else %} - <a href="/{{link.archive_path}}/index.html"><img src="{% static 'spinner.gif' %}" onerror="this.style.display='none'" class="link-favicon" decoding="async" style="height: 15px"></a> + <a href="{% web_base_url %}/{{link.archive_path}}/index.html"><img src="{% static 'spinner.gif' %}" onerror="this.style.display='none'" class="link-favicon" decoding="async" style="height: 15px"></a> {% endif %} - <a href="/{{link.archive_path}}/index.html" title="{{link.title|default:'Not yet archived...'}}"> + <a href="{% web_base_url %}/{{link.archive_path}}/index.html" title="{{link.title|default:'Not yet archived...'}}"> <span data-title-for="{{link.url}}" data-archived="{{link.is_archived}}"> {{link.title|default:'Loading...'|truncatechars:128}} </span> @@ -29,7 +29,7 @@ {% if link.icons %} {{link.icons}}  <small style="float:right; opacity: 0.5">{{link.num_outputs}}</small> {% else %} - <a href="/{{link.archive_path}}/index.html"> + <a href="{% web_base_url %}/{{link.archive_path}}/index.html"> 📄   {{link.num_outputs}} <img src="{% static 'spinner.gif' %}" onerror="this.style.display='none'" class="files-spinner" decoding="async" style="height: 15px"/> </a> diff --git a/archivebox/templates/core/snapshot.html b/archivebox/templates/core/snapshot.html index 
048f4f12..6adbf7c4 100644 --- a/archivebox/templates/core/snapshot.html +++ b/archivebox/templates/core/snapshot.html @@ -97,6 +97,25 @@ vertical-align: -2px; margin-right: 4px; } + .header-archivebox { + display: inline-flex; + align-items: center; + gap: 6px; + white-space: nowrap; + } + .header-top .col-lg-2 { + white-space: nowrap; + } + .favicon { + width: 20px; + height: 20px; + max-width: 30px; + max-height: 30px; + object-fit: contain; + border-radius: 3px; + background: rgba(0,0,0,0.06); + vertical-align: -4px; + } .header-toggle { line-height: 14px; font-size: 70px; @@ -312,6 +331,9 @@ height: 138px; min-height: 138px; max-height: 138px; + display: flex; + flex-direction: column; + align-items: stretch; } .thumb-card:has([data-compact]) { height: 46px; @@ -327,6 +349,25 @@ text-overflow: ellipsis; background-color: #1a1a1a; color: #d3d3d3; + flex: 0 0 auto; + position: relative; + } + .thumb-actions { + position: absolute; + top: 2px; + right: 6px; + display: flex; + gap: 6px; + font-size: 12px; + line-height: 1; + opacity: 0.7; + } + .thumb-actions a { + text-decoration: none; + color: inherit; + } + .thumb-actions a:hover { + opacity: 1; } .thumb-card .thumb-body h4 { font-size: 1.1em; @@ -340,6 +381,59 @@ .thumb-card iframe.card-img-top { display: block; width: 100%; + flex: 1 1 auto; + min-height: 0; + } + .thumb-card .thumbnail-wrapper { + display: flex; + align-items: stretch; + width: 100% !important; + min-width: 0; + max-width: 100%; + box-sizing: border-box; + } + .thumb-card .thumbnail-wrapper > *, + .thumb-card iframe.card-img-top { + width: 100% !important; + height: 100%; + object-fit: cover; + max-width: 100%; + } + .thumb-card .card-img-top { + width: 100% !important; + max-width: 100% !important; + height: 100% !important; + margin: 0 !important; + transform: none !important; + opacity: 1 !important; + } + .thumb-card .thumbnail-wrapper img, + .thumb-card .thumbnail-wrapper video, + .thumb-card .thumbnail-wrapper canvas { + width: 100% 
!important; + height: 100% !important; + max-width: 100% !important; + object-fit: cover; + transform: none !important; + margin: 0 !important; + } + .thumb-card .thumbnail-wrapper iframe, + .thumb-card .thumbnail-wrapper object, + .thumb-card .thumbnail-wrapper embed { + width: 405% !important; + height: 405% !important; + transform: scale(0.25); + transform-origin: 0 0; + margin: 0 !important; + border: 0 !important; + } + .thumb-card iframe.card-img-top { + width: 405% !important; + height: 405% !important; + transform: scale(0.25); + transform-origin: 0 0; + margin: 0 !important; + border: 0 !important; } .thumb-card:has([data-compact]) .thumbnail-wrapper, .thumb-card:has([data-compact]) .thumbnail-wrapper.compact { @@ -362,6 +456,11 @@ text-overflow: ellipsis; white-space: nowrap; } + .thumb-card:has([data-compact]) .thumbnail-text-header, + .thumb-card:has([data-compact]) .thumbnail-compact-icon, + .thumb-card:has([data-compact]) .thumbnail-compact-label { + display: none; + } .thumb-card.selected-card { border: 2px solid orange; box-shadow: 0px -6px 13px 1px rgba(0,0,0,0.05); @@ -413,7 +512,8 @@ } .screenshot { background-color: #333; - transform: none; + transform: scale(1.05); + transform-origin: top center; width: 100%; min-height: 100px; max-height: 100px; @@ -521,12 +621,12 @@ <div class="row nav"> <div class="col-lg-2" style="line-height: 50px; vertical-align: middle"> <a href="../../index.html" class="header-archivebox" title="Go to Main Index..."> - <img src="../../static/archive.png" alt="Archive Icon"> + <img src="/static/archive.png" alt="Archive Icon"> ArchiveBox </a> </div> <div class="col-lg-8"> - <img src="favicon.ico" onerror="this.style.opacity=0" alt="Favicon"> + <img src="{% snapshot_url snapshot 'favicon/favicon.ico' %}" onerror="this.style.opacity=0" alt="Favicon" class="favicon">    {{title|safe}}    @@ -581,14 +681,14 @@ </div> <div class="col-lg-4"> <div class="info-chunk"> - <h5>🗃  Snapshot: <a 
href="/admin/core/snapshot/{{snapshot_id}}/change/"><code style="color: rgba(255,255,255,0.6); font-weight: 200; font-size: 12px; background-color: #1a1a1a"><b>[{{timestamp}}]</b> <small>{{snapshot_id|truncatechars:24}}</small></code></a></h5> - <a href="index.json" title="JSON summary of archived link.">JSON</a> | - <a href="warc/" title="Any WARC archives for the page">WARC</a> | - <a href="media/" title="Audio, Video, and Subtitle files.">Media</a> | - <a href="git/" title="Any git repos at the url">Git</a> | - <a href="/admin/core/snapshot/?q={{snapshot_id}}" title="Go to the Snapshot admin to update, overwrite, or delete this Snapshot">Actions</a> | - <a href="/admin/core/snapshot/{{snapshot_id}}/change/" title="Edit this snapshot in the Admin UI">Admin</a> | - <a href="." title="Webserver-provided index of files directory.">See all files...</a><br/> + <h5>🗃  Snapshot: <a href="{% admin_base_url %}/admin/core/snapshot/{{snapshot_id|default:id}}/change/"><code style="color: rgba(255,255,255,0.6); font-weight: 200; font-size: 12px; background-color: #1a1a1a"><b>[{{timestamp}}]</b> <small>{{snapshot_id|default:id|truncatechars:24}}</small></code></a></h5> + <a href="{% snapshot_url snapshot 'index.json' %}" title="JSON summary of archived link.">JSON</a> | + <a href="{% snapshot_url snapshot 'warc/' %}" title="Any WARC archives for the page">WARC</a> | + <a href="{% snapshot_url snapshot 'media/' %}" title="Audio, Video, and Subtitle files.">Media</a> | + <a href="{% snapshot_url snapshot 'git/' %}" title="Any git repos at the url">Git</a> | + <a href="{% admin_base_url %}/admin/core/snapshot/?q={{snapshot_id|default:id}}" title="Go to the Snapshot admin to update, overwrite, or delete this Snapshot">Actions</a> | + <a href="{% admin_base_url %}/admin/core/snapshot/{{snapshot_id|default:id}}/change/" title="Edit this snapshot in the Admin UI">Admin</a> | + <a href="{% snapshot_base_url snapshot %}/?files=1" title="Webserver-provided index of files directory.">See 
all files...</a><br/> </div> </div> </div> @@ -596,19 +696,39 @@ {% for result_info in archiveresults %} {% if result_info.result %} {% plugin_card result_info.result as thumbnail_html %} - <div class="thumb-card{% if forloop.first %} selected-card{% endif %}"> - <div class="thumbnail-wrapper"> - {{ thumbnail_html }} - </div> + {% with display_path=result_info.path|default:result_info.result.embed_path display_url='' %} + {% if display_path %}{% snapshot_url snapshot display_path as display_url %}{% endif %} + <div class="thumb-card{% if forloop.first %} selected-card{% endif %}"{% if display_url %} data-preview-url="{{display_url}}"{% endif %}> + {% with plugin_base=result_info.name|plugin_name %} + {% if plugin_base != 'ytdlp' and plugin_base != 'yt-dlp' and plugin_base != 'youtube-dl' %} + <div class="thumbnail-wrapper"> + {{ thumbnail_html }} + </div> + {% endif %} + {% endwith %} <div class="thumb-body"> - <a href="{{ result_info.path }}" title="Open in new tab..." target="_blank" rel="noopener"> + <div class="thumb-actions"> + <a href="{% snapshot_url snapshot result_info.name %}/?files=1" data-no-preview="1" title="Open output folder" target="_blank" rel="noopener">📁</a> + {% if display_path %} + <a href="{{display_url}}" data-no-preview="1" title="Download output file" download>⬇️</a> + {% endif %} + </div> + <a href="{{ display_url }}" title="Open in new tab..." 
target="_blank" rel="noopener"> <p class="card-text"><code>{{ result_info.path }}</code></p> </a> - <a href="{{ result_info.path }}" target="preview"> - <h4 class="card-title">{{ result_info.name|title }}</h4> - </a> - </div> + <a href="{{ display_url }}" target="preview"> + <h4 class="card-title">{{ result_info.name|plugin_display_name|title }}</h4> + </a> + {% if result_info.result %} + {% with plugin_base=result_info.name|plugin_name %} + {% if plugin_base == 'ytdlp' or plugin_base == 'yt-dlp' or plugin_base == 'youtube-dl' %} + {% plugin_card result_info.result %} + {% endif %} + {% endwith %} + {% endif %} + </div> </div> + {% endwith %} {% endif %} {% endfor %} @@ -629,7 +749,13 @@ </div> </div> </header> - <iframe sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" class="full-page-iframe" src="{{best_preview_path}}" name="preview"></iframe> + {% if best_result.result %} + <div id="main-frame-wrapper" class="full-page-wrapper"> + {% plugin_full best_result.result %} + </div> + {% else %} + <iframe id="main-frame" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" class="full-page-iframe" src="{{best_preview_path}}" name="preview"></iframe> + {% endif %} <script> /*! 
jQuery v3.2.1 -ajax,-ajax/jsonp,-ajax/load,-ajax/parseXML,-ajax/script,-ajax/var/location,-ajax/var/nonce,-ajax/var/rquery,-ajax/xhr,-manipulation/_evalUrl,-event/ajax,-effects,-effects/Tween,-effects/animatedSelector | (c) JS Foundation and other contributors | jquery.org/license */ @@ -647,31 +773,99 @@ return link.pathname.split('/').filter(a => a.length).slice(-1)[0].toLowerCase() } - // show selected file in iframe when preview card is clicked - jQuery('.thumb-card').on('click', function(e) { - jQuery('.selected-card').removeClass('selected-card') - jQuery(e.target).closest('.thumb-card').addClass('selected-card') + function tryCenterImageFrame(frame) { + try { + const doc = frame.contentDocument || frame.contentWindow.document + if (!doc || !doc.body || !doc.images || doc.images.length !== 1) { + return + } + const img = doc.images[0] + doc.documentElement.style.height = '100%' + doc.body.style.height = '100%' + doc.documentElement.style.width = '100%' + doc.body.style.width = '100%' + doc.body.style.margin = '0' + doc.body.style.display = 'flex' + doc.body.style.alignItems = 'flex-start' + doc.body.style.justifyContent = 'center' + doc.body.style.background = '#222' + img.style.maxWidth = '100%' + img.style.width = 'auto' + img.style.height = 'auto' + img.style.maxHeight = 'none' + img.style.display = 'block' + } catch (err) {} + } - const link = e.target.closest('a[target=preview]') || e.currentTarget.querySelector('a[target=preview]') || e.currentTarget.querySelector('a') - if (!link || !link.href || link.href.endsWith('#')) { + function attachPreviewFrameHandlers(frame) { + if (frame.src.endsWith('.pdf')) { + frame.removeAttribute('sandbox') + frame.src = frame.src + } + frame.onload = function() { + if (this.src.includes('.pdf')) { + this.removeAttribute('sandbox') + } + tryCenterImageFrame(this) + } + } + + // show selected file in iframe when preview card is clicked + function ensureMainFrame() { + let frame = 
document.querySelector('.full-page-iframe') + if (!frame) { + const wrapper = document.getElementById('main-frame-wrapper') + frame = document.createElement('iframe') + frame.id = 'main-frame' + frame.name = 'preview' + frame.className = 'full-page-iframe' + frame.sandbox = "allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" + if (wrapper) { + wrapper.innerHTML = '' + wrapper.appendChild(frame) + wrapper.classList.remove('full-page-wrapper') + } + attachPreviewFrameHandlers(frame) + } + return frame + } + + function handleThumbCardClick(card, e) { + const targetEl = e.target.nodeType === Node.ELEMENT_NODE ? e.target : e.target.parentElement + if (targetEl && targetEl.closest('[data-no-preview]')) { return true } - const iframe = jQuery('.full-page-iframe')[0] + jQuery('.selected-card').removeClass('selected-card') + jQuery(card).closest('.thumb-card').addClass('selected-card') + + const link = (targetEl && targetEl.closest('a[target=preview]')) || card.querySelector('a[target=preview]') || card.querySelector('a') + const previewUrl = card.dataset.previewUrl + const target = (link && link.href) ? 
link.href : (previewUrl || '') + if (!target || target.endsWith('#')) { + return true + } + e.preventDefault() + const iframe = ensureMainFrame() if (!iframe) { return true } - if (link.href.endsWith('.pdf')) { + if (target.endsWith('.pdf')) { iframe.removeAttribute('sandbox') } else { iframe.sandbox = "allow-same-origin allow-scripts allow-forms allow-top-navigation-by-user-activation" } - window.location.hash = getPreviewTypeFromPath(link) - iframe.src = link.href + if (link) { + window.location.hash = getPreviewTypeFromPath(link) + } + iframe.src = target return true + } + + jQuery('.thumb-card').on('click', function(e) { + return handleThumbCardClick(this, e) }) - jQuery('.thumb-card a[target=preview]').on('click', function(e) { - e.preventDefault() - return false + jQuery('.thumb-card a').on('click', function(e) { + return handleThumbCardClick(this.closest('.thumb-card'), e) }) function hideSnapshotHeader() { @@ -737,10 +931,7 @@ // un-sandbox iframes showing pdfs (required to display pdf viewer) jQuery('iframe').map(function() { - if (this.src.endsWith('.pdf')) { - this.removeAttribute('sandbox') - this.src = this.src - } + attachPreviewFrameHandlers(this) }) // hide all preview iframes on small screens @@ -749,8 +940,10 @@ } var pdf_frame = document.querySelector('.pdf-frame'); - pdf_frame.onload = function () { - pdf_frame.contentWindow.scrollTo(0, 400); + if (pdf_frame) { + pdf_frame.onload = function () { + pdf_frame.contentWindow.scrollTo(0, 400); + } } </script> </body> diff --git a/archivebox/templates/core/snapshot_live.html b/archivebox/templates/core/snapshot_live.html index 22db0359..6e9756b0 100644 --- a/archivebox/templates/core/snapshot_live.html +++ b/archivebox/templates/core/snapshot_live.html @@ -47,37 +47,127 @@ margin: 0px; text-align: center; color: #f6f6f6; - font-size: calc(10px + 0.84vw); + font-size: calc(10px + 0.44vw); font-weight: 200; padding: 3px 4px; background-color: #aa1e55; } - .header-top .nav { + .header-top .header-nav { + 
display: grid; + grid-template-columns: auto minmax(0, 1fr) auto auto; + align-items: start; + gap: 16px; width: 100%; } - .nav > div { + .header-top .header-col { min-height: 30px; line-height: 1.2; + min-width: 0; + } + .header-top .header-left { + white-space: nowrap; + } + .header-top .header-main { + display: flex; + flex-direction: column; + align-items: flex-start; + gap: 4px; + min-width: 0; + } + .header-top .header-meta { + display: flex; + flex-direction: column; + align-items: flex-end; + gap: 4px; + min-width: 0; + } + .header-top .header-right { + text-align: right; + white-space: nowrap; + padding-right: 10px; + } + .header-right .header-date { + text-align: center; + } + .snapshot-variants { + position: relative; + display: inline-block; + white-space: nowrap; + } + .snapshot-variants summary { + list-style: none; + cursor: pointer; + color: #f6f6f6 !important; + } + .snapshot-variants summary:hover { + color: #f6f6f6 !important; + } + .header-top .snapshot-variants summary { + color: #f6f6f6 !important; + } + .snapshot-variants summary::-webkit-details-marker { + display: none; + } + .snapshot-variants-list { + position: absolute; + right: 0; + top: calc(100% + 6px); + background: rgba(18, 18, 18, 0.95); + border: 1px solid rgba(255,255,255,0.15); + border-radius: 8px; + min-width: 260px; + max-width: 420px; + max-height: 240px; + overflow: auto; + box-shadow: 0 6px 20px rgba(0,0,0,0.3); + z-index: 50; + padding: 6px; + } + .snapshot-variants-list a { + display: block; + padding: 6px 8px; + color: #f6f6f6; + font-size: 12px; + line-height: 1.3; + border-radius: 6px; + } + .snapshot-variants-list a:hover { + background: rgba(255,255,255,0.08); + color: #fff; + } + .header-top .snapshot-variants-list a { + color: #f6f6f6 !important; + } + .header-top .snapshot-variants-list a:hover { + color: #fff !important; + } + .year-variants summary { + list-style: none; + cursor: pointer; + } + .year-variants summary::-webkit-details-marker { + display: none; } 
.header-top .header-url { - display: inline-block; width: 100%; background-color: rgb(216, 216, 235, 0.05); - text-align: center; + text-align: left; line-height: 1.3; font-family: monospace; - white-space: nowrap; font-weight: 200; - display: block; - margin-top: -1px; + margin-top: 0; font-size: 23px; opacity: 0.8; border-radius: 0px 0px 8px 8px; } .header-top .header-url a.header-url-text { + display: block; color: #f6f6f6; user-select: all; + overflow: hidden; text-overflow: ellipsis; + white-space: nowrap; + padding: 2px 10px; } .header-top .header-url a.header-url-text:hover { color: rgb(144, 161, 255); @@ -90,13 +180,53 @@ text-decoration: none; color: rgba(0,0,0,0.9); } - .header-top .header-title { + .header-title-line { color: rgba(0,0,0,0.6); + display: flex; + align-items: center; + gap: 6px; + min-width: 0; + width: 100%; + } + .header-title-text { + display: inline-block; + max-width: 100%; + overflow: hidden; + text-overflow: ellipsis; + white-space: nowrap; + text-align: left; + } + .header-tags { + display: flex; + flex-wrap: wrap; + justify-content: flex-end; + gap: 4px; + } + .header-badges { + display: flex; + flex-wrap: wrap; + justify-content: flex-end; + align-items: center; + gap: 6px; + } + .header-year-badges { + display: flex; + flex-wrap: wrap; + justify-content: flex-end; + align-items: center; + gap: 6px; + margin-top: 4px; } .header-top .favicon { - height: 24px; - vertical-align: -5px; - margin-right: 4px; + width: 20px; + height: 20px; + max-width: 30px; + max-height: 30px; + vertical-align: -4px; + margin-right: 6px; + object-fit: contain; + border-radius: 3px; + background: rgba(255,255,255,0.08); } .header-top .col-lg-4 { text-align: center; @@ -111,6 +241,16 @@ margin-top: -4px; margin-bottom: 2px; } + .header-archivebox { + display: inline-flex; + align-items: center; + gap: 6px; + white-space: nowrap; + } + .header-right .header-date { + display: inline-block; + white-space: nowrap; + } .header-archivebox img:hover { opacity: 
0.5; } @@ -129,6 +269,48 @@ vertical-align: -12px; margin-left: 4px; } + @media(max-width: 900px) { + .header-top .header-nav { + grid-template-columns: 1fr; + gap: 8px; + } + .header-top .header-left, + .header-top .header-main, + .header-top .header-meta, + .header-top .header-right { + width: 100%; + text-align: left; + align-items: flex-start; + } + .header-archivebox img { + margin-left: 0; + } + } + @media(max-width: 600px) { + .header-top { + font-size: 14px; + } + .header-top .header-url { + font-size: 16px; + } + .header-title-text, + .header-top .header-url a.header-url-text { + white-space: normal; + overflow: hidden; + display: -webkit-box; + -webkit-line-clamp: 2; + -webkit-box-orient: vertical; + } + .header-badges, + .header-tags, + .header-year-badges { + justify-content: flex-start; + } + .header-toggle { + font-size: 46px; + vertical-align: -6px; + } + } .info-row { margin-top: 2px; @@ -340,6 +522,9 @@ .iframe-large { height: calc(100vh - 70px); } + .preview-hidden { + display: none !important; + } img.external { height: 30px; margin-right: -10px; @@ -353,7 +538,8 @@ } .screenshot { background-color: #333; - transform: none; + transform: scale(1.05); + transform-origin: top center; width: 100%; min-height: 100px; max-height: 100px; @@ -402,6 +588,7 @@ max-height: 138px; display: flex; flex-direction: column; + align-items: stretch; } .thumb-card:has([data-compact]) { height: 46px; @@ -418,6 +605,24 @@ color: #222; background-color: #f6f6f6; flex: 0 0 auto; + position: relative; + } + .thumb-actions { + position: absolute; + top: 2px; + right: 6px; + display: flex; + gap: 6px; + font-size: 12px; + line-height: 1; + opacity: 0.7; + } + .thumb-actions a { + text-decoration: none; + color: inherit; + } + .thumb-actions a:hover { + opacity: 1; } .thumb-card .thumb-body h4 { font-size: 0.8em; @@ -436,11 +641,57 @@ flex: 1 1 auto; min-height: 0; } + .thumb-card .thumbnail-wrapper { + display: flex; + align-items: stretch; + width: 100% !important; + 
min-width: 0; + max-width: 100%; + box-sizing: border-box; + height: auto !important; + } .thumb-card .thumbnail-wrapper > *, .thumb-card iframe.card-img-top { - width: 100%; + width: 100% !important; height: 100%; object-fit: cover; + max-width: 100%; + } + .thumb-card .card-img-top { + width: 100% !important; + max-width: 100% !important; + height: 100% !important; + margin: 0 !important; + transform: none !important; + opacity: 1 !important; + } + .thumb-card .thumbnail-wrapper img, + .thumb-card .thumbnail-wrapper video, + .thumb-card .thumbnail-wrapper canvas { + width: 100% !important; + height: 100% !important; + max-width: 100% !important; + object-fit: cover; + transform: none !important; + margin: 0 !important; + } + .thumb-card .thumbnail-wrapper iframe, + .thumb-card .thumbnail-wrapper object, + .thumb-card .thumbnail-wrapper embed { + width: 405% !important; + height: 405% !important; + transform: scale(0.25); + transform-origin: 0 0; + margin: 0 !important; + border: 0 !important; + } + .thumb-card iframe.card-img-top { + width: 405% !important; + height: 405% !important; + transform: scale(0.25); + transform-origin: 0 0; + margin: 0 !important; + border: 0 !important; } .thumb-card:has([data-compact]) .thumbnail-wrapper, .thumb-card:has([data-compact]) .thumbnail-wrapper.compact { @@ -463,10 +714,41 @@ text-overflow: ellipsis; white-space: nowrap; } + .thumb-card:has([data-compact]) .thumbnail-text-header, + .thumb-card:has([data-compact]) .thumbnail-compact-icon, + .thumb-card:has([data-compact]) .thumbnail-compact-label { + display: none; + } .thumb-card.selected-card { border: 2px solid orange; box-shadow: 0px -6px 13px 1px rgba(0,0,0,0.05); } + .loose-items { + display: flex; + flex-wrap: wrap; + gap: 4px 8px; + font-size: 12px; + line-height: 1.2; + max-height: 84px; + overflow: auto; + } + .loose-items a { + color: #333; + text-decoration: none; + background: rgba(0, 0, 0, 0.04); + padding: 2px 6px; + border-radius: 6px; + } + .loose-items 
a:hover { + background: rgba(0, 0, 0, 0.08); + } + .failed-items a { + color: #b91c1c; + background: rgba(185, 28, 28, 0.08); + } + .failed-items a:hover { + background: rgba(185, 28, 28, 0.16); + } .header-bottom { border-top: 1px solid rgba(170, 30, 85, 0.9); padding-bottom: 1px; @@ -617,62 +899,108 @@ </head> <body> <header> - <div class="header-top container-fluid"> - <div class="row nav"> - <div class="col-lg-2" style="line-height: 58px; vertical-align: middle"> - <a href="../../index.html" class="header-archivebox" title="Go to Main Index..."> - <img src="../../static/archive.png" alt="Archive Icon"> + <div class="header-top"> + <div class="header-nav"> + <div class="header-col header-left" style="line-height: 58px; vertical-align: middle"> + <a href="/" class="header-archivebox" title="Go to Main Index..."> + {% web_base_url as web_base %} + <img src="{% if web_base %}//{{ web_base|cut:'http://'|cut:'https://' }}/static/archive.png{% else %}{% static 'archive.png' %}{% endif %}" alt="Archive Icon"> ArchiveBox </a> </div> - <div class="col-lg-8"> + <div class="header-col header-main"> <div class="header-url"> <a class="header-url-text" href="{{url}}" title="Open original URL in new window..." 
target="_blank" rel="noreferrer"> {{url}} </a> </div> - <div class="badge badge-{{status_color}}" style="float: left"> - <a href="/admin/core/snapshot/?q={{snapshot_id}}" title="Click to see options to pull, re-snapshot, or delete this Snapshot"> - {{status|upper}} - </a> + <div class="header-title-line header-toggle-trigger"> + <img src="{% snapshot_url snapshot 'favicon/favicon.ico' %}" onerror="this.style.opacity=0" alt="Favicon" class="favicon"/> + <span class="header-title-text">{{title|truncatechars:120|safe}}</span> + <a href="#" class="header-toggle header-toggle-trigger">▾</a> </div> - <div class="badge badge-default" style="float: left; font-weight: 200"> - {{num_outputs}} - {% if num_failures %} - + {{num_failures}} <small>errors</small> - {% endif %} - </div> - <div class="badge badge-info" style="float: right"> - <a href="/admin/core/snapshot/{{snapshot_id}}/change/" title="Click to edit this Snapshot in the Admin UI"> - {{size}} - </a> - </div> - <div class="badge badge-default" style="float: right"> - <a href="/admin/core/snapshot/{{snapshot_id}}/change/" title="Click to edit this Snapshot in the Admin UI"> - {{extension}} - </a> - </div> - <small class="header-title header-toggle-trigger"> - <img src="favicon.ico" onerror="this.style.opacity=0" alt="Favicon" class="favicon"/> - {{title|truncatechars:120|safe}} <a href="#" class="header-toggle header-toggle-trigger">▾</a> - <br/> - {% for tag in tags_str|split:',' %} - <div class="badge badge-default tag" style="word-break: break-all;">{{tag}}</div> - {% endfor %} - </small> </div> - <div class="col-lg-2" style="padding-top: 4px"> - <a href="/{{archive_path}}/index.html" title="Date Added: {{bookmarked_date}} | First Archived: {{oldest_archive_date|default:downloaded_datestr}} | Last Checked: {{downloaded_datestr}} (UTC)"> - {{oldest_archive_date|default:downloaded_datestr|default:bookmarked_date}} - </a> + <div class="header-col header-meta"> + <div class="header-badges"> + <div class="badge 
badge-default" style="font-weight: 200"> + {{num_outputs}} + {% if num_failures %} + + {{num_failures}} <small>errors</small> + {% endif %} + </div> + <div class="badge badge-info"> + <a href="{% admin_base_url %}/admin/core/snapshot/{{snapshot_id|default:id}}/change/" title="Click to edit this Snapshot in the Admin UI"> + {{size}} + </a> + </div> + <div class="badge badge-default"> + <a href="{% admin_base_url %}/admin/core/snapshot/{{snapshot_id|default:id}}/change/" title="Click to edit this Snapshot in the Admin UI"> + ✏️ + </a> + </div> + {% for tag in tags_str|split:',' %} + {% if tag %} + <div class="badge badge-default tag" style="word-break: break-all;">{{tag}}</div> + {% endif %} + {% endfor %} + <div class="badge badge-{{status_color}}"> + <a href="{% admin_base_url %}/admin/core/snapshot/?q={{snapshot_id|default:id}}" title="Click to see options to pull, re-snapshot, or delete this Snapshot"> + {{status|upper}} + </a> + </div> + </div> + {% if related_years %} + <div class="header-year-badges"> + {% for entry in related_years %} + {% if entry.snapshots|length > 1 %} + <details class="snapshot-variants year-variants"> + <summary class="badge badge-default">{{ entry.year }}</summary> + <div class="snapshot-variants-list"> + {% for snap in entry.snapshots %} + <a href="{% web_base_url %}/{{ snap.archive_path }}/index.html" title="{{ snap.url }}"> + {{ snap.bookmarked_at|default:snap.created_at|default:snap.downloaded_at|date:"Y-m-d H:i:s" }}   📁 {{ snap.num_outputs }} + </a> + {% endfor %} + </div> + </details> + {% else %} + <div class="badge badge-default"> + <a href="{% web_base_url %}/{{ entry.latest.archive_path }}/index.html" title="{{ entry.latest.url }}"> + {{ entry.year }} + </a> + </div> + {% endif %} + {% endfor %} + </div> + {% endif %} + </div> + <div class="header-col header-right" style="padding-top: 4px"> + {% if related_snapshots %} + <details class="snapshot-variants"> + <summary class="header-date" title="Click to see other snapshots for 
this URL"> + {{oldest_archive_date|default:downloaded_datestr|default:bookmarked_date}} + </summary> + <div class="snapshot-variants-list"> + {% for snap in related_snapshots %} + <a href="{% web_base_url %}/{{ snap.archive_path }}/index.html" title="{{ snap.url }}"> + {{ snap.bookmarked_at|default:snap.created_at|default:snap.downloaded_at|date:"Y-m-d H:i:s" }}   📁 {{ snap.num_outputs }} + </a> + {% endfor %} + </div> + </details> + {% else %} + <a class="header-date" href="{% web_base_url %}/{{archive_path}}/index.html" title="Date Added: {{bookmarked_date}} | First Archived: {{oldest_archive_date|default:downloaded_datestr}} | Last Checked: {{downloaded_datestr}} (UTC)"> + {{oldest_archive_date|default:downloaded_datestr|default:bookmarked_date}} + </a> + {% endif %} <br/> <div class="external-links"> - ↗️   - <a href="./index.json" title="Get the Snapshot details as a JSON file" target="_blank">JSON</a>  |  🗃️ - <a href="{{warc_path}}" title="Download the ArchiveBox-generated WARC file" target="_blank">WARC</a>  |  + 📁   + <a href="{% snapshot_base_url snapshot %}/?files=1" title="Browse files for this snapshot" target="_blank">FILES</a>  |  🗃️ + <a href="{% snapshot_url snapshot warc_path %}" title="Download the ArchiveBox-generated WARC file" target="_blank">WARC</a>  |  <a href="https://web.archive.org/web/{{url}}" title="Search for a copy of the URL saved in Archive.org" target="_blank" rel="noreferrer">🏛️ Archive.org</a> <!--<a href="https://archive.md/{{url}}" title="Search for a copy of the URL saved in Archive.today" target="_blank" rel="noreferrer">Archive.today</a>  |  --> - <!--<a href="https://ghostarchive.org/search?term={{url|urlencode}}" title="Search for a copy of the URL saved in GhostArchive.org" target="_blank" rel="noreferrer">More...</a>--> + <!--<a href="https://ghostarchive.org/search?term={{url}}" title="Search for a copy of the URL saved in GhostArchive.org" target="_blank" rel="noreferrer">More...</a>--> </div> </div> </div> @@ -682,22 
+1010,42 @@ {% for result in archiveresults %} - <div class="thumb-card{% if forloop.first %} selected-card{% endif %}"> - {% with display_path=result.path %} + {% with display_path=result.path|default:result.result.embed_path display_url='' %} + {% if display_path %}{% snapshot_url snapshot display_path as display_url %}{% endif %} + <div class="thumb-card{% if forloop.first %} selected-card{% endif %}"{% if display_url %} data-preview-url="{{display_url}}"{% endif %}> <div class="thumb-body"> + <div class="thumb-actions"> + <a href="{% snapshot_url snapshot result.name %}/?files=1" data-no-preview="1" title="Open output folder" target="_blank" rel="noopener">📁</a> + {% if display_path %} + <a href="{{display_url}}" data-no-preview="1" title="Download output file" download>⬇️</a> + {% endif %} + </div> {% if display_path %} - <a href="{{display_path|urlencode}}" target="preview" title="./{{display_path}} (downloaded {{result.ts}})"> - <h4>{% plugin_icon result.name %} {{result.name|plugin_name|truncatechars:20}} <small>({{result.size|filesizeformat}})</small></h4> + <a href="{{display_url}}" target="preview" title="./{{display_path}} (downloaded {{result.ts}})"> + <h4>{% plugin_icon result.name %} {{result.name|plugin_name|truncatechars:20}}{% if result.size %} <small>({{result.size|filesizeformat}})</small>{% endif %}</h4> </a> {% else %} - <h4>{% plugin_icon result.name %} {{result.name|plugin_name|truncatechars:20}} <small>({{result.size|filesizeformat}})</small></h4> + <h4>{% plugin_icon result.name %} {{result.name|plugin_name|truncatechars:20}}{% if result.size %} <small>({{result.size|filesizeformat}})</small>{% endif %}</h4> + {% endif %} + {% if result.result %} + {% with plugin_base=result.name|plugin_name %} + {% if plugin_base == 'ytdlp' or plugin_base == 'yt-dlp' or plugin_base == 'youtube-dl' %} + {% plugin_card result.result %} + {% endif %} + {% endwith %} {% endif %} </div> {% if result.result and display_path %} - {# Use plugin-specific card 
template when ArchiveResult is available #} - <div class="card-img-top thumbnail-wrapper"> - {% plugin_card result.result %} - </div> + {% with plugin_base=result.name|plugin_name %} + {% if plugin_base != 'ytdlp' and plugin_base != 'yt-dlp' and plugin_base != 'youtube-dl' %} + {# Use plugin-specific card template when ArchiveResult is available #} + <div class="card-img-top thumbnail-wrapper"> + {% plugin_card result.result %} + </div> + {% else %} + {# YT-DLP renders its file list in the body #} + {% endif %} + {% endwith %} {% elif result.is_metadata and display_path %} <div class="card-img-top thumbnail-wrapper compact"> <div class="thumbnail-compact" data-plugin="{{result.name}}"> @@ -708,11 +1056,49 @@ </div> {% elif display_path %} {# Fall back to generic iframe for filesystem-discovered files #} - <iframe class="card-img-top" src="{{display_path|urlencode}}?autoplay=0" allow="autoplay 'none'; fullscreen 'none'; navigation-override 'none'; " sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no" loading="lazy"></iframe> + <iframe class="card-img-top" src="{{display_url}}?autoplay=0" allow="autoplay 'none'; fullscreen 'none'; navigation-override 'none'; " sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no" loading="lazy"></iframe> {% endif %} - {% endwith %} </div> + {% endwith %} {% endfor %} + {% if loose_items %} + <div class="thumb-card"> + <div class="thumb-body"> + <div class="thumb-actions"> + <a href="{% snapshot_base_url snapshot %}/?files=1" data-no-preview="1" title="Browse all snapshot files" target="_blank" rel="noopener">📁</a> + </div> + <h4>📦 Other files</h4> + <div class="loose-items"> + {% for item in loose_items %} + {% if item.is_dir %} + <a href="{% snapshot_url snapshot item.path %}/?files=1" data-no-preview="1" target="_blank" rel="noopener">📁 {{item.name}}</a> + {% else %} + <a href="{% snapshot_url snapshot item.path %}" 
data-no-preview="1" target="_blank" rel="noopener">📄 {{item.name}}</a> + {% endif %} + {% endfor %} + </div> + </div> + </div> + {% endif %} + {% if failed_items %} + <div class="thumb-card"> + <div class="thumb-body"> + <div class="thumb-actions"> + <a href="{% snapshot_base_url snapshot %}/?files=1" data-no-preview="1" title="Browse all snapshot files" target="_blank" rel="noopener">📁</a> + </div> + <h4>⚠️ Failed</h4> + <div class="loose-items failed-items"> + {% for item in failed_items %} + {% if item.is_dir %} + <a href="{% snapshot_url snapshot item.path %}/?files=1" data-no-preview="1" target="_blank" rel="noopener">📁 {{item.name}}</a> + {% else %} + <a href="{% snapshot_url snapshot item.path %}" data-no-preview="1" target="_blank" rel="noopener">📄 {{item.name}}</a> + {% endif %} + {% endfor %} + </div> + </div> + </div> + {% endif %} </div> </div> </header> @@ -722,11 +1108,14 @@ {% if best_result.result %} {# Use plugin-specific fullscreen template when ArchiveResult is available #} <div id="main-frame-wrapper" class="full-page-wrapper"> - {% plugin_full best_result.result %} + <div id="plugin-full-wrapper"> + {% plugin_full best_result.result %} + </div> + <iframe id="main-frame" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" class="full-page-iframe preview-hidden" src="{% if best_result.path %}{% snapshot_url snapshot best_result.path %}{% else %}about:blank{% endif %}" name="preview"></iframe> </div> {% else %} {# Fall back to generic iframe #} - <iframe id="main-frame" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" class="full-page-iframe" src="{{best_result.path|default:'about:blank'|urlencode}}" name="preview"></iframe> + <iframe id="main-frame" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" class="full-page-iframe" src="{% if best_result.path %}{% snapshot_url snapshot best_result.path %}{% else 
%}about:blank{% endif %}" name="preview"></iframe> {% endif %} @@ -734,22 +1123,53 @@ <script src="{% static 'jquery.min.js' %}" type="text/javascript"></script> <script> - // un-sandbox iframes showing pdfs (required to display pdf viewer) - jQuery('iframe').map(function() { - if (this.src.endsWith('.pdf')) { - this.removeAttribute('sandbox') - this.src = this.src + '#toolbar=0' + const snapshotBaseUrl = "{% snapshot_base_url snapshot %}"; + + function tryCenterImageFrame(frame) { + try { + const doc = frame.contentDocument || frame.contentWindow.document + if (!doc || !doc.body || !doc.images || doc.images.length !== 1) { + return + } + const img = doc.images[0] + doc.documentElement.style.height = '100%' + doc.body.style.height = '100%' + doc.documentElement.style.width = '100%' + doc.body.style.width = '100%' + doc.body.style.margin = '0' + doc.body.style.display = 'flex' + doc.body.style.alignItems = 'flex-start' + doc.body.style.justifyContent = 'center' + doc.body.style.background = '#222' + img.style.maxWidth = '100%' + img.style.width = 'auto' + img.style.height = 'auto' + img.style.maxHeight = 'none' + img.style.display = 'block' + } catch (err) {} + } + + function attachPreviewFrameHandlers(frame) { + if (frame.src.endsWith('.pdf')) { + frame.removeAttribute('sandbox') + frame.src = frame.src + '#toolbar=0' } - this.onload = function() { + frame.onload = function() { if (this.src.includes('.pdf')) { this.removeAttribute('sandbox') this.src = this.src.split('?autoplay=')[0] + '#toolbar=0' } + tryCenterImageFrame(this) try { // doesnt work if frame origin rules prevent accessing its DOM via JS this.contentWindow.scrollTo(0, 0); } catch(err) {} } + } + + // un-sandbox iframes showing pdfs (required to display pdf viewer) + jQuery('iframe').map(function() { + attachPreviewFrameHandlers(this) }) function getPreviewTypeFromPath(link) { @@ -759,6 +1179,14 @@ return link.getAttribute('href') } + function resolvePreviewUrl(raw) { + if (!raw) return '' + if 
(raw.startsWith('http://') || raw.startsWith('https://')) return raw + if (raw.startsWith('//')) return window.location.protocol + raw + if (!snapshotBaseUrl) return raw + return snapshotBaseUrl + (raw.startsWith('/') ? raw : `/${raw}`) + } + function ensureMainFrame() { let frame = document.getElementById('main-frame') if (!frame) { @@ -773,34 +1201,55 @@ wrapper.appendChild(frame) wrapper.classList.remove('full-page-wrapper') } + attachPreviewFrameHandlers(frame) } + const pluginWrapper = document.getElementById('plugin-full-wrapper') + if (pluginWrapper) { + pluginWrapper.classList.add('preview-hidden') + } + frame.classList.remove('preview-hidden') return frame } + function handleCardClick(card, event) { + const targetEl = event.target.nodeType === Node.ELEMENT_NODE ? event.target : event.target.parentElement + if (targetEl && targetEl.closest('[data-no-preview]')) { + return + } + const link = (targetEl && targetEl.closest('a[target=preview]')) || card.querySelector('a[target=preview]') || card.querySelector('a') + const previewUrl = card.dataset.previewUrl + const rawTarget = (link ? 
link.getAttribute('href') : '') || previewUrl || '' + const target = resolvePreviewUrl(rawTarget) + if (!target || target.endsWith('#')) { + return + } + event.preventDefault() + + jQuery('.selected-card').removeClass('selected-card') + jQuery(card).closest('.thumb-card').addClass('selected-card') + + const iframe_elem = ensureMainFrame() + if (target.endsWith('.pdf')) { + iframe_elem.removeAttribute('sandbox') + } else { + iframe_elem.sandbox = "allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" + } + if (link) { + window.location.hash = getPreviewTypeFromPath(link) + } + + iframe_elem.src = target + } + for (const card of [...document.querySelectorAll('.thumb-card')]) { card.addEventListener('click', function(event) { - const link = event.target.closest('a[target=preview]') || event.currentTarget.querySelector('a[target=preview]') || event.currentTarget.querySelector('a') - if (!link) { - return - } - const target = link.href - if (!target || target.endsWith('#')) { - return - } - - jQuery('.selected-card').removeClass('selected-card') - jQuery(event.currentTarget).closest('.thumb-card').addClass('selected-card') - - const iframe_elem = ensureMainFrame() - if (target.endsWith('.pdf')) { - iframe_elem.removeAttribute('sandbox') - } else { - iframe_elem.sandbox = "allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" - } - window.location.hash = getPreviewTypeFromPath(link) - - iframe_elem.src = target + handleCardClick(card, event) }) + for (const link of card.querySelectorAll('a')) { + link.addEventListener('click', function(event) { + handleCardClick(card, event) + }) + } } diff --git a/archivebox/templates/static/admin-inline-tags.js b/archivebox/templates/static/admin-inline-tags.js new file mode 100644 index 00000000..d25aba13 --- /dev/null +++ b/archivebox/templates/static/admin-inline-tags.js @@ -0,0 +1,258 @@ +(function() { + function computeTagStyle(tagName) { + var hash = 0; + var name = 
String(tagName || '').toLowerCase(); + for (var i = 0; i < name.length; i++) { + hash = (hash * 31 + name.charCodeAt(i)) % 360; + } + return { + bg: 'hsl(' + hash + ', 70%, 92%)', + border: 'hsl(' + hash + ', 60%, 82%)', + fg: 'hsl(' + hash + ', 35%, 28%)' + }; + } + + function applyTagStyle(el, tagName) { + var colors = computeTagStyle(tagName); + el.style.setProperty('--tag-bg', colors.bg); + el.style.setProperty('--tag-border', colors.border); + el.style.setProperty('--tag-fg', colors.fg); + } + + function getApiKey() { + return (window.ARCHIVEBOX_API_KEY || '').trim(); + } + + function buildApiUrl(path) { + var apiKey = getApiKey(); + if (!apiKey) return path; + var sep = path.indexOf('?') !== -1 ? '&' : '?'; + return path + sep + 'api_key=' + encodeURIComponent(apiKey); + } + + function getCSRFToken() { + var cookies = document.cookie.split(';'); + for (var i = 0; i < cookies.length; i++) { + var cookie = cookies[i].trim(); + if (cookie.startsWith('csrftoken=')) { + return cookie.substring('csrftoken='.length); + } + } + var input = document.querySelector('input[name="csrfmiddlewaretoken"]'); + return input ? 
input.value : ''; + } + + function buildApiHeaders() { + var headers = { + 'Content-Type': 'application/json' + }; + var apiKey = getApiKey(); + if (apiKey) headers['X-ArchiveBox-API-Key'] = apiKey; + var csrfToken = getCSRFToken(); + if (csrfToken) headers['X-CSRFToken'] = csrfToken; + return headers; + } + + function parseTags(el) { + if (el._tagData) return el._tagData; + var raw = el.dataset.tags || '[]'; + try { + el._tagData = JSON.parse(raw); + } catch (e) { + el._tagData = []; + } + return el._tagData; + } + + function setTags(el, tags) { + el._tagData = tags; + el.dataset.tags = JSON.stringify(tags); + } + + function rebuildPills(el) { + var tags = parseTags(el); + var container = el.querySelector('.tag-pills-inline'); + if (!container) return; + container.innerHTML = ''; + tags.forEach(function(td) { + var pill = document.createElement('span'); + pill.className = 'tag-pill'; + pill.setAttribute('data-tag', td.name); + pill.setAttribute('data-tag-id', td.id); + applyTagStyle(pill, td.name); + + var link = document.createElement('a'); + link.href = '/admin/core/snapshot/?tags__id__exact=' + td.id; + link.className = 'tag-link'; + link.textContent = td.name; + pill.appendChild(link); + + var removeBtn = document.createElement('button'); + removeBtn.type = 'button'; + removeBtn.className = 'tag-remove-btn'; + removeBtn.setAttribute('data-tag-id', td.id); + removeBtn.setAttribute('data-tag-name', td.name); + removeBtn.innerHTML = '×'; + pill.appendChild(removeBtn); + + container.appendChild(pill); + }); + } + + function addTag(el, tagName) { + tagName = String(tagName || '').trim(); + if (!tagName) return; + + var tags = parseTags(el); + var exists = tags.some(function(t) { + return t.name.toLowerCase() === tagName.toLowerCase(); + }); + if (exists) return; + + var snapshotId = el.dataset.snapshotId || ''; + fetch(buildApiUrl('/api/v1/core/tags/add-to-snapshot/'), { + method: 'POST', + headers: buildApiHeaders(), + body: JSON.stringify({ + snapshot_id: 
snapshotId, + tag_name: tagName + }) + }) + .then(function(response) { return response.json(); }) + .then(function(data) { + if (data.success) { + tags.push({ id: data.tag_id, name: data.tag_name }); + tags.sort(function(a, b) { return a.name.toLowerCase().localeCompare(b.name.toLowerCase()); }); + setTags(el, tags); + rebuildPills(el); + } + }) + .catch(function(err) { + console.error('Error adding tag:', err); + }); + } + + function removeTag(el, tagId) { + var snapshotId = el.dataset.snapshotId || ''; + fetch(buildApiUrl('/api/v1/core/tags/remove-from-snapshot/'), { + method: 'POST', + headers: buildApiHeaders(), + body: JSON.stringify({ + snapshot_id: snapshotId, + tag_id: tagId + }) + }) + .then(function(response) { return response.json(); }) + .then(function(data) { + if (data.success) { + var tags = parseTags(el).filter(function(t) { return t.id !== tagId; }); + setTags(el, tags); + rebuildPills(el); + } + }) + .catch(function(err) { + console.error('Error removing tag:', err); + }); + } + + var autocompleteTimers = new WeakMap(); + + function fetchAutocomplete(el, query, datalist) { + if (!datalist) return; + var existing = autocompleteTimers.get(el); + if (existing) window.clearTimeout(existing); + + var timer = window.setTimeout(function() { + if (!query || query.length < 1) { + datalist.innerHTML = ''; + return; + } + + fetch(buildApiUrl('/api/v1/core/tags/autocomplete/?q=' + encodeURIComponent(query))) + .then(function(response) { return response.json(); }) + .then(function(data) { + datalist.innerHTML = ''; + (data.tags || []).forEach(function(tag) { + var option = document.createElement('option'); + option.value = tag.name; + datalist.appendChild(option); + }); + }) + .catch(function(err) { + console.log('Autocomplete error:', err); + }); + }, 150); + + autocompleteTimers.set(el, timer); + } + + function handleContainerClick(event) { + var target = event.target; + var container = target.closest('.tag-editor-inline'); + if (!container) return; + + if 
(target.classList.contains('tag-remove-btn')) { + event.stopPropagation(); + event.preventDefault(); + var tagId = parseInt(target.getAttribute('data-tag-id'), 10); + if (tagId) removeTag(container, tagId); + return; + } + + if (!target.classList.contains('tag-link')) { + var input = container.querySelector('input.tag-inline-input-sm'); + if (input) input.focus(); + } + } + + function handleInputKeydown(event) { + var input = event.target; + if (!input || !input.matches('input.tag-inline-input-sm')) return; + var container = input.closest('.tag-editor-inline'); + if (!container) return; + + var value = input.value.trim(); + if (event.key === 'Enter' || event.keyCode === 13 || event.key === ' ' || event.key === ',') { + event.preventDefault(); + if (value) { + value.split(',').forEach(function(tag) { addTag(container, tag.trim()); }); + input.value = ''; + } + } + } + + function handleInputEvent(event) { + var input = event.target; + if (!input || !input.matches('input.tag-inline-input-sm')) return; + var container = input.closest('.tag-editor-inline'); + if (!container) return; + var datalist = container.querySelector('datalist'); + fetchAutocomplete(container, input.value, datalist); + } + + function handleInputFocus(event) { + var input = event.target; + if (!input || !input.matches('input.tag-inline-input-sm')) return; + input.placeholder = 'add tag...'; + } + + function handleInputBlur(event) { + var input = event.target; + if (!input || !input.matches('input.tag-inline-input-sm')) return; + input.placeholder = '+'; + } + + function init() { + document.addEventListener('click', handleContainerClick); + document.addEventListener('keydown', handleInputKeydown); + document.addEventListener('input', handleInputEvent); + document.addEventListener('focusin', handleInputFocus); + document.addEventListener('focusout', handleInputBlur); + } + + if (document.readyState === 'loading') { + document.addEventListener('DOMContentLoaded', init); + } else { + init(); + } +})(); 
diff --git a/archivebox/templates/static/admin.css b/archivebox/templates/static/admin.css index 0afdfe72..0326eade 100755 --- a/archivebox/templates/static/admin.css +++ b/archivebox/templates/static/admin.css @@ -332,6 +332,31 @@ body.model-snapshot.change-list #content .object-tools { padding-right: 6px; } +#content img.snapshot-preview { + width: 30px; + height: 30px; + max-width: 30px; + max-height: 30px; + object-fit: contain; + border-radius: 4px; + display: block; + margin: 0 auto; +} + +#content img.snapshot-preview.screenshot { + width: 100px; + height: 100px; + max-width: 100px; + max-height: 100px; + object-fit: cover; +} + +#content th.field-preview_icon, +#content td.field-preview_icon { + width: 100px; + max-width: 100px; +} + #content td, #content th { vertical-align: middle; padding: 4px; @@ -353,11 +378,142 @@ body.model-snapshot.change-list #content .object-tools { #content th.field-title_str { min-width: 300px; + padding-left: 2px; + padding-right: 2px; +} + +#content td.field-title_str { + padding-left: 2px; + padding-right: 2px; +} + +#content th.field-preview_icon, +#content td.field-preview_icon { + padding-left: 2px; + padding-right: 2px; +} + +#content th.field-created_at, +#content td.field-created_at { + padding-left: 2px; + padding-right: 2px; +} + +#content th.column-action-checkbox, +#content th.action-checkbox-column, +#content td.action-checkbox { + padding-left: 2px; + padding-right: 2px; +} + +#content th.field-preview_icon, +#content td.field-preview_icon { + padding-left: 2px; + padding-right: 2px; +} + +#content th.field-created_at, +#content td.field-created_at { + padding-left: 2px; + padding-right: 2px; +} + +#content th.column-action-checkbox, +#content th.action-checkbox-column, +#content td.action-checkbox { + padding-left: 2px; + padding-right: 2px; +} + +#content th.field-status_with_progress, +#content td.field-status_with_progress { + padding-left: 2px; + padding-right: 2px; +} + +#content th.field-size_with_stats, 
+#content td.field-size_with_stats { + padding-left: 2px; + padding-right: 2px; +} + +#content th.field-files, +#content td.field-files { + padding-left: 2px; + padding-right: 2px; +} + +#content th.field-files, +#content td.field-files { + padding-left: 2px; + padding-right: 2px; +} + +#content th.field-size_with_stats, +#content td.field-size_with_stats { + padding-left: 2px; + padding-right: 2px; +} + +#content th.field-status_with_progress, +#content td.field-status_with_progress { + padding-left: 2px; + padding-right: 2px; +} + +#content th.field-tags_inline, +#content td.field-tags_inline { + max-width: 220px; + width: 220px; + padding-left: 2px; + padding-right: 2px; +} + +#content td.field-tags_inline .tag-pills-inline { + flex-wrap: wrap; +} + +#content td.field-tags_inline .tag-editor-inline { + max-width: 220px; +} + +#content th.field-tags_inline, +#content td.field-tags_inline { + max-width: 220px; + width: 220px; + padding-left: 2px; + padding-right: 2px; +} + +#content td.field-tags_inline .tag-pills-inline { + flex-wrap: wrap; +} + +#content td.field-tags_inline .tag-editor-inline { + max-width: 220px; } #content td.field-files { white-space: nowrap; } +#content td.field-files .files-icons a { + display: inline-flex; + align-items: center; + justify-content: center; + padding: 0; + margin: 0; + line-height: 1; + width: 16px; + height: 16px; + min-width: 16px; +} +#content td.field-files .files-icons svg, +#content td.field-files .files-icons img { + display: block; + margin: 0; + width: 16px; + height: 16px; +} #content td.field-files .exists-True { opacity: 1; } @@ -406,7 +562,7 @@ body.model-snapshot.change-list #content .object-tools { .files-icons { display: inline-flex; flex-wrap: wrap; - gap: 4px; + gap: 2px; vertical-align: middle; } @@ -418,20 +574,20 @@ body.model-snapshot.change-list #content .object-tools { } .files-icons .abx-output-icon { - width: 18px; - height: 18px; + width: 16px; + height: 16px; display: inline-flex; align-items: 
center; justify-content: center; - border-radius: 4px; + border-radius: 0; color: #1f2937; - background: rgba(15, 23, 42, 0.08); - box-shadow: inset 0 0 0 1px rgba(15, 23, 42, 0.08); + background: transparent; + box-shadow: none; } .files-icons .abx-output-icon svg { - width: 14px; - height: 14px; + width: 16px; + height: 16px; display: block; } @@ -454,6 +610,28 @@ body.model-snapshot.change-list #content .object-tools { border-radius: 4px; } +body.filters-collapsed #content #changelist-filter { + display: none !important; +} + +body.filters-collapsed .change-list .filtered .results, +body.filters-collapsed .change-list .filtered .paginator, +body.filters-collapsed .filtered #toolbar, +body.filters-collapsed .filtered div.xfull { + margin-right: 0 !important; +} + +body.filters-collapsed #content #changelist-filter { + display: none !important; +} + +body.filters-collapsed .change-list .filtered .results, +body.filters-collapsed .change-list .filtered .paginator, +body.filters-collapsed .filtered #toolbar, +body.filters-collapsed .filtered div.xfull { + margin-right: 0 !important; +} + #result_list tbody td.field-extractor { font-weight: 800; font-variant: small-caps; diff --git a/archivebox/tests/conftest.py b/archivebox/tests/conftest.py index ed2e5316..cc757609 100644 --- a/archivebox/tests/conftest.py +++ b/archivebox/tests/conftest.py @@ -1,8 +1,10 @@ """archivebox/tests/conftest.py - Pytest fixtures for CLI tests.""" import os +import shutil import sys import subprocess +import textwrap from pathlib import Path from typing import List, Dict, Any, Optional, Tuple @@ -104,6 +106,234 @@ def initialized_archive(isolated_data_dir): return isolated_data_dir +# ============================================================================= +# CWD-based CLI Helpers (no DATA_DIR env) +# ============================================================================= + +def run_archivebox_cmd_cwd( + args: List[str], + cwd: Path, + stdin: Optional[str] = None, + timeout: 
int = 60, + env: Optional[Dict[str, str]] = None, +) -> Tuple[str, str, int]: + """ + Run archivebox command via subprocess using cwd as DATA_DIR (no DATA_DIR env). + Returns (stdout, stderr, returncode). + """ + cmd = [sys.executable, '-m', 'archivebox'] + args + + base_env = os.environ.copy() + base_env.pop('DATA_DIR', None) + base_env['USE_COLOR'] = 'False' + base_env['SHOW_PROGRESS'] = 'False' + + if env: + base_env.update(env) + + result = subprocess.run( + cmd, + input=stdin, + capture_output=True, + text=True, + cwd=cwd, + env=base_env, + timeout=timeout, + ) + + return result.stdout, result.stderr, result.returncode + + +def run_python_cwd( + script: str, + cwd: Path, + timeout: int = 60, +) -> Tuple[str, str, int]: + base_env = os.environ.copy() + base_env.pop('DATA_DIR', None) + result = subprocess.run( + [sys.executable, '-'], + input=script, + capture_output=True, + text=True, + cwd=cwd, + env=base_env, + timeout=timeout, + ) + return result.stdout, result.stderr, result.returncode + +def _get_machine_type() -> str: + import platform + + os_name = platform.system().lower() + arch = platform.machine().lower() + in_docker = os.environ.get('IN_DOCKER', '').lower() in ('1', 'true', 'yes') + suffix = '-docker' if in_docker else '' + return f'{arch}-{os_name}{suffix}' + +def _find_cached_chromium(lib_dir: Path) -> Optional[Path]: + candidates = [ + lib_dir / 'puppeteer', + lib_dir / 'npm' / 'node_modules' / 'puppeteer' / '.local-chromium', + ] + for base in candidates: + if not base.exists(): + continue + for path in base.rglob('Chromium.app/Contents/MacOS/Chromium'): + return path + for path in base.rglob('chrome-linux/chrome'): + return path + for path in base.rglob('chrome-linux64/chrome'): + return path + return None + +def _find_system_browser() -> Optional[Path]: + candidates = [ + Path('/Applications/Chromium.app/Contents/MacOS/Chromium'), + Path('/usr/bin/chromium'), + Path('/usr/bin/chromium-browser'), + ] + for candidate in candidates: + if 
candidate.exists(): + return candidate + return None + +def _ensure_puppeteer(shared_lib: Path) -> None: + npm_prefix = shared_lib / 'npm' + node_modules = npm_prefix / 'node_modules' + puppeteer_dir = node_modules / 'puppeteer' + if puppeteer_dir.exists(): + return + npm_prefix.mkdir(parents=True, exist_ok=True) + env = os.environ.copy() + env['PUPPETEER_SKIP_DOWNLOAD'] = '1' + subprocess.run( + ['npm', 'install', 'puppeteer'], + cwd=str(npm_prefix), + env=env, + check=True, + capture_output=True, + text=True, + timeout=600, + ) + + +@pytest.fixture(scope="class") +def real_archive_with_example(tmp_path_factory, request): + """ + Initialize archive and add https://example.com using chrome+responses only. + Uses cwd for DATA_DIR and symlinks lib dir to a shared cache. + """ + tmp_path = tmp_path_factory.mktemp("archivebox_data") + if getattr(request, "cls", None) is not None: + request.cls.data_dir = tmp_path + + stdout, stderr, returncode = run_archivebox_cmd_cwd( + ['init', '--quick'], + cwd=tmp_path, + timeout=120, + ) + assert returncode == 0, f"archivebox init failed: {stderr}" + + stdout, stderr, returncode = run_archivebox_cmd_cwd( + [ + 'config', + '--set', + 'LISTEN_HOST=archivebox.localhost:8000', + 'PUBLIC_INDEX=True', + 'PUBLIC_SNAPSHOTS=True', + 'PUBLIC_ADD_VIEW=True', + ], + cwd=tmp_path, + ) + assert returncode == 0, f"archivebox config failed: {stderr}" + + machine_type = _get_machine_type() + shared_root = Path(__file__).resolve().parents[3] / 'tmp' / 'test_lib_cache' + shared_lib = shared_root / machine_type + shared_lib.mkdir(parents=True, exist_ok=True) + + lib_target = tmp_path / 'lib' / machine_type + if lib_target.exists() and not lib_target.is_symlink(): + shutil.rmtree(lib_target) + if not lib_target.exists(): + lib_target.parent.mkdir(parents=True, exist_ok=True) + lib_target.symlink_to(shared_lib, target_is_directory=True) + + _ensure_puppeteer(shared_lib) + cached_chromium = _find_cached_chromium(shared_lib) + if cached_chromium: + 
browser_binary = cached_chromium + else: + browser_binary = _find_system_browser() + if browser_binary: + chromium_link = shared_lib / 'chromium-bin' + if not chromium_link.exists(): + chromium_link.symlink_to(browser_binary) + browser_binary = chromium_link + + if browser_binary: + stdout, stderr, returncode = run_archivebox_cmd_cwd( + [f'config', '--set', f'CHROME_BINARY={browser_binary}'], + cwd=tmp_path, + ) + assert returncode == 0, f"archivebox config CHROME_BINARY failed: {stderr}" + script = textwrap.dedent(f"""\ + import os + os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.core.settings') + import django + django.setup() + from django.utils import timezone + from archivebox.machine.models import Binary, Machine + machine = Machine.current() + Binary.objects.filter(machine=machine, name='chromium').update( + status='installed', + abspath='{browser_binary}', + binprovider='env', + retry_at=timezone.now(), + ) + Binary.objects.update_or_create( + machine=machine, + name='chromium', + defaults={{ + 'status': 'installed', + 'abspath': '{browser_binary}', + 'binprovider': 'env', + 'retry_at': timezone.now(), + }}, + ) + print('OK') + """ + ) + stdout, stderr, returncode = run_python_cwd(script, cwd=tmp_path, timeout=60) + assert returncode == 0, f"Register chromium binary failed: {stderr}" + + add_env = { + 'CHROME_ENABLED': 'True', + 'RESPONSES_ENABLED': 'True', + 'DOM_ENABLED': 'False', + 'SHOW_PROGRESS': 'False', + 'USE_COLOR': 'False', + 'CHROME_HEADLESS': 'True', + 'CHROME_PAGELOAD_TIMEOUT': '45', + 'CHROME_TIMEOUT': '60', + 'RESPONSES_TIMEOUT': '30', + } + if browser_binary: + add_env['CHROME_BINARY'] = str(browser_binary) + if cached_chromium: + add_env['PUPPETEER_CACHE_DIR'] = str(shared_lib / 'puppeteer') + stdout, stderr, returncode = run_archivebox_cmd_cwd( + ['add', '--depth=0', '--plugins=chrome,responses', 'https://example.com'], + cwd=tmp_path, + timeout=600, + env=add_env, + ) + assert returncode == 0, f"archivebox add failed: 
{stderr}" + + return tmp_path + + # ============================================================================= # Output Assertions # ============================================================================= diff --git a/archivebox/tests/test_savepagenow.py b/archivebox/tests/test_savepagenow.py new file mode 100644 index 00000000..ad2df04b --- /dev/null +++ b/archivebox/tests/test_savepagenow.py @@ -0,0 +1,252 @@ +"""Integration tests for /web/https://... shortcut (Save Page Now).""" + +import os +import subprocess +import sys +import textwrap +from pathlib import Path + +from archivebox.tests.conftest import create_test_url + + +def _run_savepagenow_script(initialized_archive: Path, request_url: str, expected_url: str, *, login: bool, public_add_view: bool): + project_root = Path(__file__).resolve().parents[2] + script = textwrap.dedent( + f""" + import os + + os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.core.settings') + + from archivebox.config.django import setup_django + setup_django() + + from django.test import Client + from django.contrib.auth import get_user_model + from archivebox.core.models import Snapshot + + client = Client() + if {login!r}: + user = get_user_model().objects.create_user(username='tester', password='pw') + client.force_login(user) + + target_url = {request_url!r} + + resp = client.get('/web/' + target_url, HTTP_HOST='web.archivebox.localhost:8000') + assert resp.status_code == 302, resp.status_code + + snapshot = Snapshot.objects.filter(url={expected_url!r}).order_by('-created_at').first() + if snapshot is None: + raise AssertionError( + "snapshot not created; status=%s location=%s count=%s" + % ( + resp.status_code, + resp.get('Location'), + Snapshot.objects.count(), + ) + ) + assert resp['Location'] == f"/{{snapshot.url_path}}" + + resp2 = client.get('/web/' + target_url, HTTP_HOST='web.archivebox.localhost:8000') + assert resp2.status_code == 302, resp2.status_code + assert 
Snapshot.objects.filter(url={expected_url!r}).count() == 1 + assert resp2['Location'] == f"/{{snapshot.url_path}}" + """ + ) + + env = { + **os.environ, + 'DATA_DIR': str(initialized_archive), + 'USE_COLOR': 'False', + 'SHOW_PROGRESS': 'False', + 'PUBLIC_ADD_VIEW': 'True' if public_add_view else 'False', + 'SAVE_ARCHIVEDOTORG': 'False', + 'SAVE_TITLE': 'False', + 'SAVE_FAVICON': 'False', + 'SAVE_WGET': 'False', + 'SAVE_WARC': 'False', + 'SAVE_PDF': 'False', + 'SAVE_SCREENSHOT': 'False', + 'SAVE_DOM': 'False', + 'SAVE_SINGLEFILE': 'False', + 'SAVE_READABILITY': 'False', + 'SAVE_MERCURY': 'False', + 'SAVE_GIT': 'False', + 'SAVE_YTDLP': 'False', + 'SAVE_HEADERS': 'False', + 'SAVE_HTMLTOTEXT': 'False', + } + + return subprocess.run( + [sys.executable, '-c', script], + cwd=project_root, + env=env, + text=True, + capture_output=True, + timeout=60, + ) + + +def _run_savepagenow_not_found_script(initialized_archive: Path, request_url: str): + project_root = Path(__file__).resolve().parents[2] + script = textwrap.dedent( + f""" + import os + + os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.core.settings') + + from archivebox.config.django import setup_django + setup_django() + + from django.test import Client + from archivebox.core.models import Snapshot + + client = Client() + target_url = {request_url!r} + + resp = client.get('/web/' + target_url, HTTP_HOST='web.archivebox.localhost:8000') + assert resp.status_code == 404, resp.status_code + assert Snapshot.objects.count() == 0 + """ + ) + + env = { + **os.environ, + 'DATA_DIR': str(initialized_archive), + 'USE_COLOR': 'False', + 'SHOW_PROGRESS': 'False', + 'PUBLIC_ADD_VIEW': 'False', + 'SAVE_ARCHIVEDOTORG': 'False', + 'SAVE_TITLE': 'False', + 'SAVE_FAVICON': 'False', + 'SAVE_WGET': 'False', + 'SAVE_WARC': 'False', + 'SAVE_PDF': 'False', + 'SAVE_SCREENSHOT': 'False', + 'SAVE_DOM': 'False', + 'SAVE_SINGLEFILE': 'False', + 'SAVE_READABILITY': 'False', + 'SAVE_MERCURY': 'False', + 'SAVE_GIT': 'False', + 
'SAVE_YTDLP': 'False', + 'SAVE_HEADERS': 'False', + 'SAVE_HTMLTOTEXT': 'False', + } + + return subprocess.run( + [sys.executable, '-c', script], + cwd=project_root, + env=env, + text=True, + capture_output=True, + timeout=60, + ) + + +def _run_savepagenow_existing_snapshot_script(initialized_archive: Path, request_url: str, stored_url: str): + project_root = Path(__file__).resolve().parents[2] + script = textwrap.dedent( + f""" + import os + + os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.core.settings') + + from archivebox.config.django import setup_django + setup_django() + + from django.test import Client + from archivebox.core.models import Snapshot + from archivebox.crawls.models import Crawl + from archivebox.base_models.models import get_or_create_system_user_pk + + target_url = {request_url!r} + stored_url = {stored_url!r} + created_by_id = get_or_create_system_user_pk() + crawl = Crawl.objects.create(urls=stored_url, created_by_id=created_by_id) + snapshot = Snapshot.objects.create(url=stored_url, crawl=crawl) + + client = Client() + resp = client.get('/web/' + target_url, HTTP_HOST='web.archivebox.localhost:8000') + assert resp.status_code == 302, resp.status_code + assert resp['Location'] == f"/{{snapshot.url_path}}" + """ + ) + + env = { + **os.environ, + 'DATA_DIR': str(initialized_archive), + 'USE_COLOR': 'False', + 'SHOW_PROGRESS': 'False', + 'PUBLIC_ADD_VIEW': 'False', + 'SAVE_ARCHIVEDOTORG': 'False', + 'SAVE_TITLE': 'False', + 'SAVE_FAVICON': 'False', + 'SAVE_WGET': 'False', + 'SAVE_WARC': 'False', + 'SAVE_PDF': 'False', + 'SAVE_SCREENSHOT': 'False', + 'SAVE_DOM': 'False', + 'SAVE_SINGLEFILE': 'False', + 'SAVE_READABILITY': 'False', + 'SAVE_MERCURY': 'False', + 'SAVE_GIT': 'False', + 'SAVE_YTDLP': 'False', + 'SAVE_HEADERS': 'False', + 'SAVE_HTMLTOTEXT': 'False', + } + + return subprocess.run( + [sys.executable, '-c', script], + cwd=project_root, + env=env, + text=True, + capture_output=True, + timeout=60, + ) + + +def 
test_web_add_creates_and_reuses_snapshot_logged_in(initialized_archive): + """/web/https://... should work for authenticated users even when public add is off.""" + url = create_test_url(domain='example.com', path='savepagenow-auth') + request_url = url.replace('https://', '') + result = _run_savepagenow_script(initialized_archive, request_url, url, login=True, public_add_view=False) + assert result.returncode == 0, ( + "SavePageNow shortcut (logged-in) test failed.\n" + f"stdout:\n{result.stdout}\n" + f"stderr:\n{result.stderr}" + ) + + +def test_web_add_creates_and_reuses_snapshot_public(initialized_archive): + """/web/https://... should work when PUBLIC_ADD_VIEW is enabled without login.""" + url = create_test_url(domain='example.com', path='savepagenow-public') + request_url = url.replace('https://', '') + result = _run_savepagenow_script(initialized_archive, request_url, url, login=False, public_add_view=True) + assert result.returncode == 0, ( + "SavePageNow shortcut (public add) test failed.\n" + f"stdout:\n{result.stdout}\n" + f"stderr:\n{result.stderr}" + ) + + +def test_web_add_requires_login_when_public_off(initialized_archive): + """/web/https://... should 404 for new URLs when PUBLIC_ADD_VIEW is false and not logged in.""" + url = create_test_url(domain='example.com', path='savepagenow-404') + request_url = url.replace('https://', '') + result = _run_savepagenow_not_found_script(initialized_archive, request_url) + assert result.returncode == 0, ( + "SavePageNow shortcut (no public add) test failed.\n" + f"stdout:\n{result.stdout}\n" + f"stderr:\n{result.stderr}" + ) + + +def test_web_add_redirects_existing_snapshot_when_public_off(initialized_archive): + """/web/https://... 
should redirect to existing snapshot even when public add is off and not logged in.""" + url = create_test_url(domain='example.com', path='savepagenow-existing') + request_url = url.replace('https://', '') + result = _run_savepagenow_existing_snapshot_script(initialized_archive, request_url, url) + assert result.returncode == 0, ( + "SavePageNow shortcut (existing snapshot) test failed.\n" + f"stdout:\n{result.stdout}\n" + f"stderr:\n{result.stderr}" + ) diff --git a/archivebox/tests/test_urls.py b/archivebox/tests/test_urls.py new file mode 100644 index 00000000..094481a2 --- /dev/null +++ b/archivebox/tests/test_urls.py @@ -0,0 +1,357 @@ +import os +import sys +import subprocess +import textwrap +from pathlib import Path + +import pytest + + +REPO_ROOT = Path(__file__).resolve().parents[3] + + +def _merge_pythonpath(env: dict[str, str]) -> dict[str, str]: + env.pop("DATA_DIR", None) + pythonpath = env.get("PYTHONPATH", "") + if pythonpath: + env["PYTHONPATH"] = f"{REPO_ROOT}{os.pathsep}{pythonpath}" + else: + env["PYTHONPATH"] = str(REPO_ROOT) + return env + + +def _run_python(script: str, cwd: Path, timeout: int = 60) -> subprocess.CompletedProcess: + env = _merge_pythonpath(os.environ.copy()) + return subprocess.run( + [sys.executable, "-"], + cwd=cwd, + env=env, + input=script, + capture_output=True, + text=True, + timeout=timeout, + ) + + +def _build_script(body: str) -> str: + prelude = textwrap.dedent( + """ + import os + from pathlib import Path + + os.environ.setdefault("DJANGO_SETTINGS_MODULE", "archivebox.core.settings") + import django + django.setup() + + from django.test import Client + from django.contrib.auth import get_user_model + + from archivebox.core.models import Snapshot, ArchiveResult + from archivebox.config.common import SERVER_CONFIG + from archivebox.core.host_utils import ( + get_admin_host, + get_api_host, + get_web_host, + get_snapshot_host, + get_original_host, + get_listen_subdomain, + split_host_port, + host_matches, + 
is_snapshot_subdomain, + ) + + def response_body(resp): + if getattr(resp, "streaming", False): + return b"".join(resp.streaming_content) + return resp.content + + def ensure_admin_user(): + User = get_user_model() + admin, _ = User.objects.get_or_create( + username="testadmin", + defaults={"email": "admin@example.com", "is_staff": True, "is_superuser": True}, + ) + admin.set_password("testpassword") + admin.save() + return admin + + def get_snapshot(): + snapshot = Snapshot.objects.order_by("-created_at").first() + assert snapshot is not None + return snapshot + + def get_snapshot_files(snapshot): + output_rel = None + for output in snapshot.discover_outputs(): + candidate = output.get("path") + if not candidate: + continue + if candidate.startswith("responses/"): + continue + if Path(snapshot.output_dir, candidate).is_file(): + output_rel = candidate + break + if output_rel is None: + fallback = Path(snapshot.output_dir, "index.jsonl") + if fallback.exists(): + output_rel = "index.jsonl" + assert output_rel is not None + + responses_root = Path(snapshot.output_dir) / "responses" / snapshot.domain + assert responses_root.exists() + response_file = None + response_rel = None + for candidate in responses_root.rglob("*"): + if not candidate.is_file(): + continue + rel = candidate.relative_to(responses_root) + if not (Path(snapshot.output_dir) / rel).exists(): + response_file = candidate + response_rel = str(rel) + break + if response_file is None: + response_file = next(p for p in responses_root.rglob("*") if p.is_file()) + response_rel = str(response_file.relative_to(responses_root)) + response_output_path = Path(snapshot.output_dir) / response_rel + return output_rel, response_file, response_rel, response_output_path + """ + ) + return prelude + "\n" + textwrap.dedent(body) + + +@pytest.mark.usefixtures("real_archive_with_example") +class TestUrlRouting: + data_dir: Path + + def _run(self, body: str, timeout: int = 120) -> None: + script = _build_script(body) + 
result = _run_python(script, cwd=self.data_dir, timeout=timeout) + assert result.returncode == 0, result.stderr + assert "OK" in result.stdout + + def test_host_utils_and_public_redirect(self) -> None: + self._run( + """ + snapshot = get_snapshot() + snapshot_id = str(snapshot.id) + domain = snapshot.domain + + web_host = get_web_host() + admin_host = get_admin_host() + api_host = get_api_host() + snapshot_host = get_snapshot_host(snapshot_id) + original_host = get_original_host(domain) + base_host = SERVER_CONFIG.LISTEN_HOST + + host_only, port = split_host_port(base_host) + assert host_only == "archivebox.localhost" + assert port == "8000" + assert web_host == "web.archivebox.localhost:8000" + assert admin_host == "admin.archivebox.localhost:8000" + assert api_host == "api.archivebox.localhost:8000" + assert snapshot_host == f"{snapshot_id}.archivebox.localhost:8000" + assert original_host == f"{domain}.archivebox.localhost:8000" + assert get_listen_subdomain(web_host) == "web" + assert get_listen_subdomain(admin_host) == "admin" + assert get_listen_subdomain(api_host) == "api" + assert get_listen_subdomain(snapshot_host) == snapshot_id + assert get_listen_subdomain(original_host) == domain + assert get_listen_subdomain(base_host) == "" + assert host_matches(web_host, get_web_host()) + assert is_snapshot_subdomain(snapshot_id) + + client = Client() + resp = client.get("/public.html", HTTP_HOST=web_host) + assert resp.status_code in (301, 302) + assert resp["Location"].endswith("/public/") + + resp = client.get("/public/", HTTP_HOST=base_host) + assert resp.status_code in (301, 302) + assert resp["Location"].startswith(f"http://{web_host}/public/") + + resp = client.get("/", HTTP_HOST=api_host) + assert resp.status_code in (301, 302) + assert resp["Location"].startswith("/api/") + + print("OK") + """ + ) + + def test_web_admin_routing(self) -> None: + self._run( + """ + ensure_admin_user() + client = Client() + web_host = get_web_host() + admin_host = 
get_admin_host() + + resp = client.get("/add/", HTTP_HOST=web_host) + assert resp.status_code == 200 + + resp = client.get("/admin/login/", HTTP_HOST=web_host) + assert resp.status_code in (301, 302) + assert admin_host in resp["Location"] + + resp = client.get("/admin/login/", HTTP_HOST=admin_host) + assert resp.status_code == 200 + + print("OK") + """ + ) + + def test_snapshot_routing_and_hosts(self) -> None: + self._run( + """ + snapshot = get_snapshot() + output_rel, response_file, response_rel, response_output_path = get_snapshot_files(snapshot) + snapshot_id = str(snapshot.id) + snapshot_host = get_snapshot_host(snapshot_id) + original_host = get_original_host(snapshot.domain) + web_host = get_web_host() + + client = Client() + + snapshot_path = f"/{snapshot.url_path}/" + resp = client.get(snapshot_path, HTTP_HOST=web_host) + assert resp.status_code == 200 + + resp = client.get(f"/web/{snapshot.domain}", HTTP_HOST=web_host) + assert resp.status_code in (301, 302) + assert resp["Location"].endswith(f"/{snapshot.url_path}") + + resp = client.get(f"/{snapshot.url_path}", HTTP_HOST=web_host) + assert resp.status_code == 200 + + date_segment = snapshot.url_path.split("/")[1] + resp = client.get(f"/web/{date_segment}/{date_segment}/{snapshot_id}/", HTTP_HOST=web_host) + assert resp.status_code == 404 + + resp = client.get(f"/{snapshot.url_path}/{output_rel}", HTTP_HOST=web_host) + assert resp.status_code in (301, 302) + assert snapshot_host in resp["Location"] + + resp = client.get(f"/{output_rel}", HTTP_HOST=snapshot_host) + assert resp.status_code == 200 + assert response_body(resp) == Path(snapshot.output_dir, output_rel).read_bytes() + + resp = client.get(f"/{response_rel}", HTTP_HOST=snapshot_host) + assert resp.status_code == 200 + snapshot_body = response_body(resp) + if response_output_path.exists(): + assert snapshot_body == response_output_path.read_bytes() + else: + assert snapshot_body == response_file.read_bytes() + + resp = 
client.get(f"/{response_rel}", HTTP_HOST=original_host) + assert resp.status_code == 200 + assert response_body(resp) == response_file.read_bytes() + + print("OK") + """ + ) + + def test_template_and_admin_links(self) -> None: + self._run( + """ + ensure_admin_user() + snapshot = get_snapshot() + snapshot.write_html_details() + snapshot_id = str(snapshot.id) + snapshot_host = get_snapshot_host(snapshot_id) + admin_host = get_admin_host() + web_host = get_web_host() + + client = Client() + + resp = client.get("/public/", HTTP_HOST=web_host) + assert resp.status_code == 200 + public_html = response_body(resp).decode("utf-8", "ignore") + assert "http://web.archivebox.localhost:8000" in public_html + + resp = client.get(f"/{snapshot.url_path}/index.html", HTTP_HOST=web_host) + assert resp.status_code == 200 + live_html = response_body(resp).decode("utf-8", "ignore") + assert f"http://{snapshot_host}/" in live_html + assert "http://web.archivebox.localhost:8000" in live_html + + static_html = Path(snapshot.output_dir, "index.html").read_text(encoding="utf-8", errors="ignore") + assert f"http://{snapshot_host}/" in static_html + + client.login(username="testadmin", password="testpassword") + resp = client.get(f"/admin/core/snapshot/{snapshot_id}/change/", HTTP_HOST=admin_host) + assert resp.status_code == 200 + admin_html = response_body(resp).decode("utf-8", "ignore") + assert f"http://web.archivebox.localhost:8000/{snapshot.archive_path}" in admin_html + assert f"http://{snapshot_host}/" in admin_html + + result = ArchiveResult.objects.filter(snapshot=snapshot).first() + assert result is not None + resp = client.get(f"/admin/core/archiveresult/{result.id}/change/", HTTP_HOST=admin_host) + assert resp.status_code == 200 + ar_html = response_body(resp).decode("utf-8", "ignore") + assert f"http://{snapshot_host}/" in ar_html + + print("OK") + """ + ) + + def test_api_available_on_admin_and_api_hosts(self) -> None: + self._run( + """ + client = Client() + admin_host = 
get_admin_host() + api_host = get_api_host() + + resp = client.get("/api/v1/docs", HTTP_HOST=admin_host) + assert resp.status_code == 200 + + resp = client.get("/api/v1/docs", HTTP_HOST=api_host) + assert resp.status_code == 200 + + print("OK") + """ + ) + + def test_api_post_with_token_on_admin_and_api_hosts(self) -> None: + self._run( + """ + ensure_admin_user() + from archivebox.api.auth import get_or_create_api_token + + token = get_or_create_api_token(get_user_model().objects.get(username="testadmin")) + assert token is not None + + client = Client() + admin_host = get_admin_host() + api_host = get_api_host() + + payload = '{"name": "apitest-tag"}' + headers = {"HTTP_X_ARCHIVEBOX_API_KEY": token.token} + + resp = client.post( + "/api/v1/core/tags/create/", + data=payload, + content_type="application/json", + HTTP_HOST=admin_host, + **headers, + ) + assert resp.status_code == 200 + data = resp.json() + assert data.get("success") is True + assert data.get("tag_name") == "apitest-tag" + + resp = client.post( + "/api/v1/core/tags/create/", + data=payload, + content_type="application/json", + HTTP_HOST=api_host, + **headers, + ) + assert resp.status_code == 200 + data = resp.json() + assert data.get("success") is True + assert data.get("tag_name") == "apitest-tag" + + print("OK") + """ + ) diff --git a/archivebox/workers/management/commands/orchestrator_watch.py b/archivebox/workers/management/commands/orchestrator_watch.py new file mode 100644 index 00000000..e0a6edf3 --- /dev/null +++ b/archivebox/workers/management/commands/orchestrator_watch.py @@ -0,0 +1,79 @@ +from django.core.management.base import BaseCommand + + +class Command(BaseCommand): + help = "Watch the runserver autoreload PID file and restart orchestrator on reloads." 
+ + def add_arguments(self, parser): + parser.add_argument( + "--pidfile", + default=None, + help="Path to runserver pidfile to watch", + ) + parser.add_argument( + "--interval", + type=float, + default=1.0, + help="Polling interval in seconds", + ) + + def handle(self, *args, **kwargs): + import os + import time + from archivebox.config.common import STORAGE_CONFIG + from archivebox.machine.models import Process, Machine + from archivebox.workers.orchestrator import Orchestrator + + os.environ['ARCHIVEBOX_ORCHESTRATOR_WATCHER'] = '1' + + pidfile = kwargs.get("pidfile") or os.environ.get("ARCHIVEBOX_RUNSERVER_PIDFILE") + if not pidfile: + pidfile = str(STORAGE_CONFIG.TMP_DIR / "runserver.pid") + + interval = max(0.2, float(kwargs.get("interval", 1.0))) + + last_pid = None + + def restart_orchestrator(): + Process.cleanup_stale_running() + machine = Machine.current() + + running = Process.objects.filter( + machine=machine, + status=Process.StatusChoices.RUNNING, + process_type__in=[ + Process.TypeChoices.ORCHESTRATOR, + Process.TypeChoices.WORKER, + Process.TypeChoices.HOOK, + ], + ) + for proc in running: + try: + if proc.process_type == Process.TypeChoices.HOOK: + proc.kill_tree(graceful_timeout=0.5) + else: + proc.terminate(graceful_timeout=1.0) + except Exception: + continue + + if not Orchestrator.is_running(): + Orchestrator(exit_on_idle=False).start() + + while True: + try: + if os.path.exists(pidfile): + with open(pidfile, "r") as handle: + pid = handle.read().strip() or None + else: + pid = None + + if pid and pid != last_pid: + restart_orchestrator() + last_pid = pid + elif not Orchestrator.is_running(): + Orchestrator(exit_on_idle=False).start() + + except Exception: + pass + + time.sleep(interval) diff --git a/archivebox/workers/orchestrator.py b/archivebox/workers/orchestrator.py index 64f92824..6465ef88 100644 --- a/archivebox/workers/orchestrator.py +++ b/archivebox/workers/orchestrator.py @@ -42,6 +42,8 @@ from .worker import Worker, BinaryWorker, 
CrawlWorker def _run_orchestrator_process(exit_on_idle: bool) -> None: """Top-level function for multiprocessing (must be picklable).""" + import os + os.environ['ARCHIVEBOX_ORCHESTRATOR_PROCESS'] = '1' from archivebox.config.django import setup_django setup_django() orchestrator = Orchestrator(exit_on_idle=exit_on_idle) @@ -80,6 +82,7 @@ class Orchestrator: self.pid_file = None self.idle_count: int = 0 self._last_cleanup_time: float = 0.0 # For throttling cleanup_stale_running() + self._last_hard_timeout_check: float = 0.0 # Throttle hard timeout enforcement # In foreground mode (exit_on_idle=True), limit to 1 CrawlWorker if self.exit_on_idle: @@ -255,10 +258,6 @@ class Orchestrator: pid = WorkerClass.start(parent=self.db_process, crawl_id=self.crawl_id) print(f'[yellow]DEBUG: Spawned {WorkerClass.name} worker with PID={pid}[/yellow]') - if self.exit_on_idle: - # Foreground runs have MAX_CRAWL_WORKERS=1; avoid blocking startup on registration. - return pid - # CRITICAL: Block until worker registers itself in Process table # This prevents race condition where orchestrator spawns multiple workers # before any of them finish on_startup() and register @@ -333,6 +332,8 @@ class Orchestrator: queue_sizes = {} + self._enforce_hard_timeouts() + # Check Binary queue machine = Machine.current() binary_queue = Binary.objects.filter( @@ -359,6 +360,22 @@ class Orchestrator: status__in=Crawl.FINAL_STATES ) + # Prevent duplicate CrawlWorkers for the same crawl (even across orchestrators) + from archivebox.machine.models import Process + running_crawl_ids: set[str] = set() + running_crawl_workers = Process.objects.filter( + process_type=Process.TypeChoices.WORKER, + worker_type='crawl', + status=Process.StatusChoices.RUNNING, + ).values_list('env', flat=True) + for env in running_crawl_workers: + if isinstance(env, dict): + crawl_id = env.get('CRAWL_ID') + if crawl_id: + running_crawl_ids.add(str(crawl_id)) + if running_crawl_ids: + crawl_queue = 
crawl_queue.exclude(id__in=running_crawl_ids) + # Apply crawl_id filter if set if self.crawl_id: crawl_queue = crawl_queue.filter(id=self.crawl_id) @@ -379,6 +396,156 @@ class Orchestrator: return queue_sizes + def _enforce_hard_timeouts(self) -> None: + """Force-kill and seal hooks/archiveresults/snapshots that exceed hard limits.""" + import time + from datetime import timedelta + from archivebox.config.constants import CONSTANTS + from archivebox.machine.models import Process + from archivebox.core.models import Snapshot, ArchiveResult + from archivebox.crawls.models import Crawl + + throttle_seconds = 30 + now_ts = time.time() + if now_ts - self._last_hard_timeout_check < throttle_seconds: + return + self._last_hard_timeout_check = now_ts + + now = timezone.now() + + # Hard limit for hook processes / archiveresults + hook_cutoff = now - timedelta(seconds=CONSTANTS.MAX_HOOK_RUNTIME_SECONDS) + overdue_hooks = Process.objects.filter( + process_type=Process.TypeChoices.HOOK, + status=Process.StatusChoices.RUNNING, + started_at__lt=hook_cutoff, + ).select_related('archiveresult') + + for proc in overdue_hooks: + try: + proc.kill_tree(graceful_timeout=0.0) + except Exception: + pass + + ar = getattr(proc, 'archiveresult', None) + if ar and ar.status == ArchiveResult.StatusChoices.STARTED: + ar.status = ArchiveResult.StatusChoices.FAILED + ar.end_ts = now + ar.retry_at = None + ar.save(update_fields=['status', 'end_ts', 'retry_at', 'modified_at']) + + # Hard limit for snapshots + snapshot_cutoff = now - timedelta(seconds=CONSTANTS.MAX_SNAPSHOT_RUNTIME_SECONDS) + overdue_snapshots = Snapshot.objects.filter( + status=Snapshot.StatusChoices.STARTED, + modified_at__lt=snapshot_cutoff, + ) + + overdue_snapshot_ids = {str(s.id) for s in overdue_snapshots} + if overdue_snapshot_ids: + running_snapshot_workers = Process.objects.filter( + process_type=Process.TypeChoices.WORKER, + worker_type='snapshot', + status=Process.StatusChoices.RUNNING, + ) + for proc in 
running_snapshot_workers: + env = proc.env or {} + if isinstance(env, dict) and str(env.get('SNAPSHOT_ID', '')) in overdue_snapshot_ids: + try: + proc.terminate(graceful_timeout=1.0) + except Exception: + pass + + for snapshot in overdue_snapshots: + running_hooks = Process.objects.filter( + archiveresult__snapshot=snapshot, + process_type=Process.TypeChoices.HOOK, + status=Process.StatusChoices.RUNNING, + ).distinct() + for process in running_hooks: + try: + process.kill_tree(graceful_timeout=0.0) + except Exception: + continue + + snapshot.archiveresult_set.filter( + status__in=[ArchiveResult.StatusChoices.QUEUED, ArchiveResult.StatusChoices.STARTED], + ).update( + status=ArchiveResult.StatusChoices.FAILED, + end_ts=now, + retry_at=None, + modified_at=now, + ) + + snapshot.cleanup() + snapshot.status = Snapshot.StatusChoices.SEALED + snapshot.retry_at = None + snapshot.save(update_fields=['status', 'retry_at', 'modified_at']) + + crawl = snapshot.crawl + if crawl and crawl.is_finished(): + crawl.status = crawl.StatusChoices.SEALED + crawl.retry_at = None + crawl.save(update_fields=['status', 'retry_at', 'modified_at']) + + # Reconcile snapshot/crawl state with running archiveresults + started_snapshot_ids = list( + ArchiveResult.objects.filter( + status=ArchiveResult.StatusChoices.STARTED, + ).values_list('snapshot_id', flat=True).distinct() + ) + if started_snapshot_ids: + Snapshot.objects.filter( + id__in=started_snapshot_ids, + ).exclude( + status=Snapshot.StatusChoices.SEALED, + ).exclude( + status=Snapshot.StatusChoices.STARTED, + ).update( + status=Snapshot.StatusChoices.STARTED, + retry_at=None, + modified_at=now, + ) + + Crawl.objects.filter( + snapshot_set__id__in=started_snapshot_ids, + status=Crawl.StatusChoices.QUEUED, + ).distinct().update( + status=Crawl.StatusChoices.STARTED, + retry_at=None, + modified_at=now, + ) + + # If a snapshot is sealed, any still-started archiveresults should be failed + sealed_snapshot_ids = list( + 
Snapshot.objects.filter(status=Snapshot.StatusChoices.SEALED).values_list('id', flat=True) + ) + if sealed_snapshot_ids: + started_ars = ArchiveResult.objects.filter( + snapshot_id__in=sealed_snapshot_ids, + status=ArchiveResult.StatusChoices.STARTED, + ).select_related('process') + for ar in started_ars: + if ar.process_id and ar.process and ar.process.status == Process.StatusChoices.RUNNING: + try: + ar.process.kill_tree(graceful_timeout=0.0) + except Exception: + pass + ar.status = ArchiveResult.StatusChoices.FAILED + ar.end_ts = now + ar.retry_at = None + ar.save(update_fields=['status', 'end_ts', 'retry_at', 'modified_at']) + + # Clear queued/started snapshots that belong to sealed crawls + Snapshot.objects.filter( + crawl__status=Crawl.StatusChoices.SEALED, + status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED], + ).update( + status=Snapshot.StatusChoices.SEALED, + retry_at=None, + modified_at=now, + ) + def _claim_crawl(self, crawl) -> bool: """Atomically claim a crawl using optimistic locking.""" from archivebox.crawls.models import Crawl diff --git a/archivebox/workers/supervisord_util.py b/archivebox/workers/supervisord_util.py index fb1f50ac..f4d7aa02 100644 --- a/archivebox/workers/supervisord_util.py +++ b/archivebox/workers/supervisord_util.py @@ -32,7 +32,8 @@ _supervisord_proc = None ORCHESTRATOR_WORKER = { "name": "worker_orchestrator", - "command": "archivebox run", # runs forever by default + # Use Django management command to avoid stdin/TTY ambiguity in `archivebox run`. 
+ "command": "archivebox manage orchestrator", "autostart": "true", "autorestart": "true", "stdout_logfile": "logs/worker_orchestrator.log", diff --git a/archivebox/workers/worker.py b/archivebox/workers/worker.py index 0bfed22b..ce10f8ab 100644 --- a/archivebox/workers/worker.py +++ b/archivebox/workers/worker.py @@ -436,6 +436,7 @@ class CrawlWorker(Worker): super().__init__(**kwargs) self.crawl_id = crawl_id self.crawl = None + self.crawl_config = None def get_model(self): from archivebox.crawls.models import Crawl @@ -446,7 +447,9 @@ class CrawlWorker(Worker): super().on_startup() from archivebox.crawls.models import Crawl + from archivebox.config.configset import get_config self.crawl = Crawl.objects.get(id=self.crawl_id) + self.crawl_config = get_config(crawl=self.crawl) def runloop(self) -> None: """Run crawl state machine, spawn SnapshotWorkers.""" @@ -484,6 +487,12 @@ class CrawlWorker(Worker): # Now spawn SnapshotWorkers and monitor progress while True: + self.crawl.refresh_from_db() + if self.crawl.status == Crawl.StatusChoices.SEALED: + print(f'🛑 Crawl {self.crawl_id} was sealed, stopping workers', file=sys.stderr) + self._terminate_running_snapshot_workers() + break + # Check if crawl is done if self._is_crawl_finished(): print(f'🔄 Crawl finished, sealing...', file=sys.stderr) @@ -589,6 +598,22 @@ class CrawlWorker(Worker): thread = threading.Thread(target=pipe_worker_stderr, daemon=True) thread.start() + def _terminate_running_snapshot_workers(self) -> None: + """Terminate any running SnapshotWorkers for this crawl.""" + from archivebox.machine.models import Process + + running_workers = Process.objects.filter( + process_type=Process.TypeChoices.WORKER, + worker_type='snapshot', + parent_id=self.db_process.id, + status=Process.StatusChoices.RUNNING, + ) + for proc in running_workers: + try: + proc.terminate(graceful_timeout=1.0) + except Exception: + continue + def _is_crawl_finished(self) -> bool: """Check if all snapshots are sealed.""" from pathlib 
import Path @@ -684,19 +709,29 @@ class SnapshotWorker(Worker): from archivebox.core.models import Snapshot self.snapshot = Snapshot.objects.get(id=self.snapshot_id) + if self.snapshot.status == Snapshot.StatusChoices.SEALED: + return + # Use state machine to transition queued -> started (triggers enter_started()) self.snapshot.sm.tick() self.snapshot.refresh_from_db() + self.snapshot_started_at = self.snapshot.modified_at or self.snapshot.created_at def runloop(self) -> None: """Execute all hooks sequentially.""" from archivebox.hooks import discover_hooks, is_background_hook - from archivebox.core.models import ArchiveResult + from archivebox.core.models import ArchiveResult, Snapshot from archivebox.config.configset import get_config self.on_startup() try: + if self.snapshot.status == Snapshot.StatusChoices.SEALED: + return + if self._snapshot_exceeded_hard_timeout(): + self._seal_snapshot_due_to_timeout() + return + # Get merged config (includes env vars passed via Process.env, snapshot.config, defaults, etc.) 
config = get_config(snapshot=self.snapshot, crawl=self.snapshot.crawl) @@ -706,6 +741,13 @@ class SnapshotWorker(Worker): # Execute each hook sequentially for hook_path in hooks: + self.snapshot.refresh_from_db() + if self.snapshot.status == Snapshot.StatusChoices.SEALED: + break + if self._snapshot_exceeded_hard_timeout(): + self._seal_snapshot_due_to_timeout() + return + hook_name = hook_path.name plugin = self._extract_plugin_name(hook_path, hook_name) is_background = is_background_hook(hook_name) @@ -756,9 +798,10 @@ class SnapshotWorker(Worker): # All hooks launched (or completed) - terminate bg hooks and seal self._finalize_background_hooks() - # This triggers enter_sealed() which calls cleanup() and checks parent crawl sealing - self.snapshot.sm.seal() - self.snapshot.refresh_from_db() + if self.snapshot.status != Snapshot.StatusChoices.SEALED: + # This triggers enter_sealed() which calls cleanup() and checks parent crawl sealing + self.snapshot.sm.seal() + self.snapshot.refresh_from_db() except Exception as e: # Mark snapshot as sealed even on error (still triggers cleanup) @@ -771,17 +814,34 @@ class SnapshotWorker(Worker): def _run_hook(self, hook_path: Path, ar: Any, config: dict) -> Any: """Fork and run a hook using Process model, return Process.""" - from archivebox.hooks import run_hook + from archivebox.hooks import run_hook, get_plugin_special_config + from archivebox.config.constants import CONSTANTS # Create output directory output_dir = ar.create_output_dir() + timeout = None + try: + plugin_name = hook_path.parent.name + plugin_config = get_plugin_special_config(plugin_name, config) + timeout = plugin_config.get('timeout') + except Exception: + timeout = None + + if getattr(self, 'snapshot_started_at', None): + remaining = max(1, int(CONSTANTS.MAX_SNAPSHOT_RUNTIME_SECONDS - (timezone.now() - self.snapshot_started_at).total_seconds())) + if timeout: + timeout = min(int(timeout), remaining) + else: + timeout = remaining + # Run hook using 
Process.launch() - returns Process model directly # Pass self.db_process as parent to track SnapshotWorker -> Hook hierarchy process = run_hook( script=hook_path, output_dir=output_dir, config=config, + timeout=timeout, parent=self.db_process, url=str(self.snapshot.url), snapshot_id=str(self.snapshot.id), @@ -872,6 +932,44 @@ class SnapshotWorker(Worker): # Remove completed hook from tracking self.background_processes.pop(hook_name, None) + def _snapshot_exceeded_hard_timeout(self) -> bool: + from archivebox.config.constants import CONSTANTS + + if not getattr(self, 'snapshot_started_at', None): + return False + return (timezone.now() - self.snapshot_started_at).total_seconds() > CONSTANTS.MAX_SNAPSHOT_RUNTIME_SECONDS + + def _seal_snapshot_due_to_timeout(self) -> None: + from archivebox.core.models import ArchiveResult + from archivebox.machine.models import Process + + now = timezone.now() + + running_hooks = Process.objects.filter( + archiveresult__snapshot=self.snapshot, + process_type=Process.TypeChoices.HOOK, + status=Process.StatusChoices.RUNNING, + ).distinct() + for process in running_hooks: + try: + process.kill_tree(graceful_timeout=0.0) + except Exception: + continue + + self.snapshot.archiveresult_set.filter( + status__in=[ArchiveResult.StatusChoices.QUEUED, ArchiveResult.StatusChoices.STARTED], + ).update( + status=ArchiveResult.StatusChoices.FAILED, + end_ts=now, + retry_at=None, + modified_at=now, + ) + + self.snapshot.cleanup() + self.snapshot.status = self.snapshot.StatusChoices.SEALED + self.snapshot.retry_at = None + self.snapshot.save(update_fields=['status', 'retry_at', 'modified_at']) + def on_shutdown(self, error: BaseException | None = None) -> None: """ Terminate all background Snapshot hooks when snapshot finishes. 
diff --git a/docker-compose.yml b/docker-compose.yml index d650371f..76b237ea 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -5,7 +5,7 @@ # docker compose run archivebox config --set SAVE_ARCHIVEDOTORG=False # docker compose run archivebox add --depth=1 'https://news.ycombinator.com' # docker compose run -T archivebox add < bookmarks.txt -# docker compose up -d && open 'https://localhost:8000' +# docker compose up -d && open 'http://web.archivebox.localhost:8000' # docker compose run archivebox help # Documentation: # https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#docker-compose @@ -21,8 +21,9 @@ services: environment: # - ADMIN_USERNAME=admin # creates an admin user on first run with the given user/pass combo # - ADMIN_PASSWORD=SomeSecretPassword + - LISTEN_HOST=archivebox.localhost:8000 - ALLOWED_HOSTS=* # set this to the hostname(s) you're going to serve the site from! - - CSRF_TRUSTED_ORIGINS=http://localhost:8000 # you MUST set this to the server's URL for admin login and the REST API to work + - CSRF_TRUSTED_ORIGINS=http://admin.archivebox.localhost:8000 # MUST match the admin UI URL for login/API to work - PUBLIC_INDEX=True # set to False to prevent anonymous users from viewing snapshot list - PUBLIC_SNAPSHOTS=True # set to False to prevent anonymous users from viewing snapshot content - PUBLIC_ADD_VIEW=False # set to True to allow anonymous users to submit new URLs to archive