diff --git a/Dockerfile b/Dockerfile
index cb571bab..1c8b682d 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -393,7 +393,7 @@ VOLUME "$DATA_DIR"
EXPOSE 8000
HEALTHCHECK --interval=30s --timeout=20s --retries=15 \
- CMD curl --silent 'http://localhost:8000/health/' | grep -q 'OK'
+ CMD curl --silent --resolve admin.archivebox.localhost:8000:127.0.0.1 'http://admin.archivebox.localhost:8000/health/' | grep -q 'OK'
ENTRYPOINT ["dumb-init", "--", "/app/bin/docker_entrypoint.sh"]
CMD ["archivebox", "server", "--quick-init", "0.0.0.0:8000"]
diff --git a/README.md b/README.md
index 2da5f877..6a5117c0 100644
--- a/README.md
+++ b/README.md
@@ -104,7 +104,8 @@ archivebox init --setup
curl -fsSL 'https://get.archivebox.io' | bash
-Open http://localhost:8000 to see your server's Web UI ➡️
+Open http://web.archivebox.localhost:8000 for the public UI and http://admin.archivebox.localhost:8000 for the admin UI ➡️
+Set LISTEN_HOST to change the base domain; web. and admin. subdomains are used automatically.
@@ -469,6 +470,7 @@ For more discussion on managed and paid hosting options see here:
-Open http://localhost:8000 to see your server's Web UI ➡️
+Open http://web.archivebox.localhost:8000 for the public UI and http://admin.archivebox.localhost:8000 for the admin UI ➡️
+Set LISTEN_HOST to change the base domain; web. and admin. subdomains are used automatically.
For more info, see our Usage: Web UI wiki. ➡️
diff --git a/archivebox/api/auth.py b/archivebox/api/auth.py
index ae58e1e3..da537606 100644
--- a/archivebox/api/auth.py
+++ b/archivebox/api/auth.py
@@ -127,6 +127,20 @@ class UsernameAndPasswordAuth(UserPassAuthCheck, HttpBasicAuth):
"""Allow authenticating by passing username & password via HTTP Basic Authentication (not recommended)"""
pass
+class DjangoSessionAuth:
+ """Allow authenticating with existing Django session cookies (same-origin only)."""
+ def __call__(self, request: HttpRequest) -> Optional[AbstractBaseUser]:
+ return self.authenticate(request)
+
+ def authenticate(self, request: HttpRequest, **kwargs) -> Optional[AbstractBaseUser]:
+ user = getattr(request, 'user', None)
+ if user and user.is_authenticated:
+ request._api_auth_method = self.__class__.__name__
+ if not user.is_superuser:
+ raise HttpError(403, 'Valid session but User does not have permission (make sure user.is_superuser=True)')
+ return cast(AbstractBaseUser, user)
+ return None
+
### Enabled Auth Methods
API_AUTH_METHODS = [
@@ -134,5 +148,4 @@ API_AUTH_METHODS = [
BearerTokenAuth(),
QueryParamTokenAuth(),
# django_auth_superuser, # django admin cookie auth, not secure to use with csrf=False
- UsernameAndPasswordAuth(),
]
diff --git a/archivebox/api/middleware.py b/archivebox/api/middleware.py
new file mode 100644
index 00000000..952503b1
--- /dev/null
+++ b/archivebox/api/middleware.py
@@ -0,0 +1,34 @@
+__package__ = 'archivebox.api'
+
+from django.http import HttpResponse
+
+
+class ApiCorsMiddleware:
+ """Attach permissive CORS headers for API routes (token-based auth)."""
+
+ def __init__(self, get_response):
+ self.get_response = get_response
+
+ def __call__(self, request):
+ if request.path.startswith('/api/'):
+ if request.method == 'OPTIONS' and request.META.get('HTTP_ACCESS_CONTROL_REQUEST_METHOD'):
+ response = HttpResponse(status=204)
+ return self._add_cors_headers(request, response)
+
+ response = self.get_response(request)
+ return self._add_cors_headers(request, response)
+
+ return self.get_response(request)
+
+ def _add_cors_headers(self, request, response):
+ origin = request.META.get('HTTP_ORIGIN')
+ if not origin:
+ return response
+
+ response['Access-Control-Allow-Origin'] = '*'
+ response['Access-Control-Allow-Methods'] = 'GET, POST, PUT, PATCH, DELETE, OPTIONS'
+ response['Access-Control-Allow-Headers'] = (
+ 'Authorization, X-ArchiveBox-API-Key, Content-Type, X-CSRFToken'
+ )
+ response['Access-Control-Max-Age'] = '600'
+ return response
diff --git a/archivebox/api/v1_core.py b/archivebox/api/v1_core.py
index f49f05af..12f68509 100644
--- a/archivebox/api/v1_core.py
+++ b/archivebox/api/v1_core.py
@@ -188,6 +188,11 @@ class SnapshotSchema(Schema):
return ArchiveResult.objects.none()
+class SnapshotUpdateSchema(Schema):
+ status: str | None = None
+ retry_at: datetime | None = None
+
+
class SnapshotFilterSchema(FilterSchema):
id: Optional[str] = Field(None, q=['id__icontains', 'timestamp__startswith'])
created_by_id: str = Field(None, q='crawl__created_by_id')
@@ -225,6 +230,31 @@ def get_snapshot(request, snapshot_id: str, with_archiveresults: bool = True):
return Snapshot.objects.get(Q(id__icontains=snapshot_id))
+@router.patch("/snapshot/{snapshot_id}", response=SnapshotSchema, url_name="patch_snapshot")
+def patch_snapshot(request, snapshot_id: str, data: SnapshotUpdateSchema):
+ """Update a snapshot (e.g., set status=sealed to cancel queued work)."""
+ try:
+ snapshot = Snapshot.objects.get(Q(id__startswith=snapshot_id) | Q(timestamp__startswith=snapshot_id))
+ except Snapshot.DoesNotExist:
+ snapshot = Snapshot.objects.get(Q(id__icontains=snapshot_id))
+
+ payload = data.dict(exclude_unset=True)
+
+ if 'status' in payload:
+ if payload['status'] not in Snapshot.StatusChoices.values:
+ raise HttpError(400, f'Invalid status: {payload["status"]}')
+ snapshot.status = payload['status']
+ if snapshot.status == Snapshot.StatusChoices.SEALED and 'retry_at' not in payload:
+ snapshot.retry_at = None
+
+ if 'retry_at' in payload:
+ snapshot.retry_at = payload['retry_at']
+
+ snapshot.save(update_fields=['status', 'retry_at', 'modified_at'])
+ request.with_archiveresults = False
+ return snapshot
+
+
### Tag #########################################################################
class TagSchema(Schema):
diff --git a/archivebox/api/v1_crawls.py b/archivebox/api/v1_crawls.py
index d450b766..36cf5f20 100644
--- a/archivebox/api/v1_crawls.py
+++ b/archivebox/api/v1_crawls.py
@@ -3,11 +3,13 @@ __package__ = 'archivebox.api'
from uuid import UUID
from typing import List
from datetime import datetime
+from django.utils import timezone
from django.db.models import Q
from django.contrib.auth import get_user_model
from ninja import Router, Schema
+from ninja.errors import HttpError
from archivebox.core.models import Snapshot
from archivebox.crawls.models import Crawl
@@ -54,6 +56,11 @@ class CrawlSchema(Schema):
return Snapshot.objects.none()
+class CrawlUpdateSchema(Schema):
+ status: str | None = None
+ retry_at: datetime | None = None
+
+
@router.get("/crawls", response=List[CrawlSchema], url_name="get_crawls")
def get_crawls(request):
return Crawl.objects.all().distinct()
@@ -79,3 +86,32 @@ def get_crawl(request, crawl_id: str, as_rss: bool=False, with_snapshots: bool=F
return crawl
+
+@router.patch("/crawl/{crawl_id}", response=CrawlSchema, url_name="patch_crawl")
+def patch_crawl(request, crawl_id: str, data: CrawlUpdateSchema):
+ """Update a crawl (e.g., set status=sealed to cancel queued work)."""
+ crawl = Crawl.objects.get(id__icontains=crawl_id)
+ payload = data.dict(exclude_unset=True)
+
+ if 'status' in payload:
+ if payload['status'] not in Crawl.StatusChoices.values:
+ raise HttpError(400, f'Invalid status: {payload["status"]}')
+ crawl.status = payload['status']
+ if crawl.status == Crawl.StatusChoices.SEALED and 'retry_at' not in payload:
+ crawl.retry_at = None
+
+ if 'retry_at' in payload:
+ crawl.retry_at = payload['retry_at']
+
+ crawl.save(update_fields=['status', 'retry_at', 'modified_at'])
+
+ if payload.get('status') == Crawl.StatusChoices.SEALED:
+ Snapshot.objects.filter(
+ crawl=crawl,
+ status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED],
+ ).update(
+ status=Snapshot.StatusChoices.SEALED,
+ retry_at=None,
+ modified_at=timezone.now(),
+ )
+ return crawl
diff --git a/archivebox/cli/archivebox_persona.py b/archivebox/cli/archivebox_persona.py
index 0eb21b86..4a53e513 100644
--- a/archivebox/cli/archivebox_persona.py
+++ b/archivebox/cli/archivebox_persona.py
@@ -15,6 +15,7 @@ Examples:
# Create a new persona
archivebox persona create work
archivebox persona create --import=chrome personal
+ archivebox persona create --import=edge work
# List all personas
archivebox persona list
@@ -34,6 +35,7 @@ import subprocess
import tempfile
from pathlib import Path
from typing import Optional, Iterable
+from collections import OrderedDict
import rich_click as click
from rich import print as rprint
@@ -78,34 +80,6 @@ def get_chrome_user_data_dir() -> Optional[Path]:
return None
-def get_firefox_profile_dir() -> Optional[Path]:
- """Get the default Firefox profile directory for the current platform."""
- system = platform.system()
- home = Path.home()
-
- if system == 'Darwin':
- profiles_dir = home / 'Library' / 'Application Support' / 'Firefox' / 'Profiles'
- elif system == 'Linux':
- profiles_dir = home / '.mozilla' / 'firefox'
- elif system == 'Windows':
- app_data = Path(os.environ.get('APPDATA', home / 'AppData' / 'Roaming'))
- profiles_dir = app_data / 'Mozilla' / 'Firefox' / 'Profiles'
- else:
- return None
-
- if not profiles_dir.exists():
- return None
-
- # Find the default profile (usually ends with .default or .default-release)
- for profile in profiles_dir.iterdir():
- if profile.is_dir() and ('default' in profile.name.lower()):
- return profile
-
- # If no default found, return the first profile
- profiles = [p for p in profiles_dir.iterdir() if p.is_dir()]
- return profiles[0] if profiles else None
-
-
def get_brave_user_data_dir() -> Optional[Path]:
"""Get the default Brave user data directory for the current platform."""
system = platform.system()
@@ -134,25 +108,99 @@ def get_brave_user_data_dir() -> Optional[Path]:
return None
+def get_edge_user_data_dir() -> Optional[Path]:
+ """Get the default Edge user data directory for the current platform."""
+ system = platform.system()
+ home = Path.home()
+
+ if system == 'Darwin':
+ candidates = [
+ home / 'Library' / 'Application Support' / 'Microsoft Edge',
+ ]
+ elif system == 'Linux':
+ candidates = [
+ home / '.config' / 'microsoft-edge',
+ home / '.config' / 'microsoft-edge-beta',
+ home / '.config' / 'microsoft-edge-dev',
+ ]
+ elif system == 'Windows':
+ local_app_data = Path(os.environ.get('LOCALAPPDATA', home / 'AppData' / 'Local'))
+ candidates = [
+ local_app_data / 'Microsoft' / 'Edge' / 'User Data',
+ ]
+ else:
+ candidates = []
+
+ for candidate in candidates:
+ if candidate.exists() and (candidate / 'Default').exists():
+ return candidate
+
+ return None
+
+
BROWSER_PROFILE_FINDERS = {
'chrome': get_chrome_user_data_dir,
'chromium': get_chrome_user_data_dir, # Same locations
- 'firefox': get_firefox_profile_dir,
'brave': get_brave_user_data_dir,
+ 'edge': get_edge_user_data_dir,
}
+CHROMIUM_BROWSERS = {'chrome', 'chromium', 'brave', 'edge'}
+
# =============================================================================
# Cookie Extraction via CDP
# =============================================================================
+NETSCAPE_COOKIE_HEADER = [
+ '# Netscape HTTP Cookie File',
+ '# https://curl.se/docs/http-cookies.html',
+ '# This file was generated by ArchiveBox persona cookie extraction',
+ '#',
+ '# Format: domain\\tincludeSubdomains\\tpath\\tsecure\\texpiry\\tname\\tvalue',
+ '',
+]
+
+
+def _parse_netscape_cookies(path: Path) -> "OrderedDict[tuple[str, str, str], tuple[str, str, str, str, str, str, str]]":
+ cookies = OrderedDict()
+ if not path.exists():
+ return cookies
+
+ for line in path.read_text().splitlines():
+ if not line or line.startswith('#'):
+ continue
+ parts = line.split('\t')
+ if len(parts) < 7:
+ continue
+ domain, include_subdomains, cookie_path, secure, expiry, name, value = parts[:7]
+ key = (domain, cookie_path, name)
+ cookies[key] = (domain, include_subdomains, cookie_path, secure, expiry, name, value)
+ return cookies
+
+
+def _write_netscape_cookies(path: Path, cookies: "OrderedDict[tuple[str, str, str], tuple[str, str, str, str, str, str, str]]") -> None:
+ lines = list(NETSCAPE_COOKIE_HEADER)
+ for cookie in cookies.values():
+ lines.append('\t'.join(cookie))
+ path.write_text('\n'.join(lines) + '\n')
+
+
+def _merge_netscape_cookies(existing_file: Path, new_file: Path) -> None:
+ existing = _parse_netscape_cookies(existing_file)
+ new = _parse_netscape_cookies(new_file)
+ for key, cookie in new.items():
+ existing[key] = cookie
+ _write_netscape_cookies(existing_file, existing)
+
+
def extract_cookies_via_cdp(user_data_dir: Path, output_file: Path) -> bool:
"""
Launch Chrome with the given user data dir and extract cookies via CDP.
Returns True if successful, False otherwise.
"""
- from archivebox.config.constants import CONSTANTS
+ from archivebox.config.common import STORAGE_CONFIG
# Find the cookie extraction script
chrome_plugin_dir = Path(__file__).parent.parent / 'plugins' / 'chrome'
@@ -163,14 +211,21 @@ def extract_cookies_via_cdp(user_data_dir: Path, output_file: Path) -> bool:
return False
# Get node modules dir
- node_modules_dir = CONSTANTS.LIB_DIR / 'npm' / 'node_modules'
+ node_modules_dir = STORAGE_CONFIG.LIB_DIR / 'npm' / 'node_modules'
# Set up environment
env = os.environ.copy()
env['NODE_MODULES_DIR'] = str(node_modules_dir)
env['CHROME_USER_DATA_DIR'] = str(user_data_dir)
- env['COOKIES_OUTPUT_FILE'] = str(output_file)
env['CHROME_HEADLESS'] = 'true'
+ output_path = output_file
+ temp_output = None
+ temp_dir = None
+ if output_file.exists():
+ temp_dir = Path(tempfile.mkdtemp(prefix='ab_cookies_'))
+ temp_output = temp_dir / 'cookies.txt'
+ output_path = temp_output
+ env['COOKIES_OUTPUT_FILE'] = str(output_path)
try:
result = subprocess.run(
@@ -182,6 +237,8 @@ def extract_cookies_via_cdp(user_data_dir: Path, output_file: Path) -> bool:
)
if result.returncode == 0:
+ if temp_output and temp_output.exists():
+ _merge_netscape_cookies(output_file, temp_output)
return True
else:
rprint(f'[yellow]Cookie extraction failed: {result.stderr}[/yellow]', file=sys.stderr)
@@ -196,6 +253,9 @@ def extract_cookies_via_cdp(user_data_dir: Path, output_file: Path) -> bool:
except Exception as e:
rprint(f'[yellow]Cookie extraction error: {e}[/yellow]', file=sys.stderr)
return False
+ finally:
+ if temp_dir and temp_dir.exists():
+ shutil.rmtree(temp_dir, ignore_errors=True)
# =============================================================================
@@ -323,6 +383,9 @@ def create_personas(
# Import browser profile if requested
if import_from and source_profile_dir:
+ cookies_file = Path(persona.path) / 'cookies.txt'
+
+ if import_from in CHROMIUM_BROWSERS:
persona_chrome_dir = Path(persona.CHROME_USER_DATA_DIR)
# Copy the browser profile
@@ -349,7 +412,6 @@ def create_personas(
rprint(f'[green]Copied browser profile to persona[/green]', file=sys.stderr)
# Extract cookies via CDP
- cookies_file = Path(persona.path) / 'cookies.txt'
rprint(f'[dim]Extracting cookies via CDP...[/dim]', file=sys.stderr)
if extract_cookies_via_cdp(persona_chrome_dir, cookies_file):
@@ -589,7 +651,7 @@ def main():
@main.command('create')
@click.argument('names', nargs=-1)
-@click.option('--import', 'import_from', help='Import profile from browser (chrome, firefox, brave)')
+@click.option('--import', 'import_from', help='Import profile from browser (chrome, chromium, brave, edge)')
def create_cmd(names: tuple, import_from: Optional[str]):
"""Create Personas, optionally importing from a browser profile."""
sys.exit(create_personas(names, import_from=import_from))
diff --git a/archivebox/cli/archivebox_server.py b/archivebox/cli/archivebox_server.py
index b9273e31..afc4542a 100644
--- a/archivebox/cli/archivebox_server.py
+++ b/archivebox/cli/archivebox_server.py
@@ -3,6 +3,9 @@
__package__ = 'archivebox.cli'
from typing import Iterable
+import os
+import sys
+import subprocess
import rich_click as click
from rich import print
@@ -60,6 +63,26 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
pass
if run_in_debug:
+ os.environ['ARCHIVEBOX_RUNSERVER'] = '1'
+ if reload:
+ os.environ['ARCHIVEBOX_AUTORELOAD'] = '1'
+ os.environ['ARCHIVEBOX_ORCHESTRATOR_MANAGED_BY_WATCHER'] = '1'
+ from archivebox.config.common import STORAGE_CONFIG
+ pidfile = str(STORAGE_CONFIG.TMP_DIR / 'runserver.pid')
+ os.environ['ARCHIVEBOX_RUNSERVER_PIDFILE'] = pidfile
+
+ from django.utils.autoreload import DJANGO_AUTORELOAD_ENV
+ is_reloader_child = os.environ.get(DJANGO_AUTORELOAD_ENV) == 'true'
+ if not is_reloader_child:
+ env = os.environ.copy()
+ env['ARCHIVEBOX_ORCHESTRATOR_WATCHER'] = '1'
+ subprocess.Popen(
+ [sys.executable, '-m', 'archivebox', 'manage', 'orchestrator_watch', f'--pidfile={pidfile}'],
+ env=env,
+ stdout=subprocess.DEVNULL,
+ stderr=subprocess.DEVNULL,
+ )
+
from django.core.management import call_command
print('[green][+] Starting ArchiveBox webserver in DEBUG mode...[/green]')
print(f' [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
@@ -79,7 +102,6 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
is_port_in_use,
)
from archivebox.workers.orchestrator import Orchestrator
- import sys
# Check if port is already in use
if is_port_in_use(host, int(port)):
diff --git a/archivebox/config/common.py b/archivebox/config/common.py
index edf7b602..c6359279 100644
--- a/archivebox/config/common.py
+++ b/archivebox/config/common.py
@@ -99,8 +99,11 @@ class ServerConfig(BaseConfigSet):
SECRET_KEY: str = Field(default_factory=lambda: get_random_string(50, "abcdefghijklmnopqrstuvwxyz0123456789_"))
BIND_ADDR: str = Field(default="127.0.0.1:8000")
+ LISTEN_HOST: str = Field(default="archivebox.localhost:8000")
+ ADMIN_BASE_URL: str = Field(default="")
+ ARCHIVE_BASE_URL: str = Field(default="")
ALLOWED_HOSTS: str = Field(default="*")
- CSRF_TRUSTED_ORIGINS: str = Field(default="http://localhost:8000,http://127.0.0.1:8000,http://0.0.0.0:8000")
+ CSRF_TRUSTED_ORIGINS: str = Field(default="http://admin.archivebox.localhost:8000")
SNAPSHOTS_PER_PAGE: int = Field(default=40)
PREVIEW_ORIGINALS: bool = Field(default=True)
diff --git a/archivebox/config/constants.py b/archivebox/config/constants.py
index 9e78d722..c1f6ae44 100644
--- a/archivebox/config/constants.py
+++ b/archivebox/config/constants.py
@@ -118,6 +118,10 @@ class ConstantsDict(Mapping):
DEFAULT_CLI_COLORS: Dict[str, str] = DEFAULT_CLI_COLORS
DISABLED_CLI_COLORS: Dict[str, str] = benedict({k: '' for k in DEFAULT_CLI_COLORS})
+ # Hard safety limits (seconds)
+ MAX_HOOK_RUNTIME_SECONDS: int = 60 * 60 * 12 # 12 hours
+ MAX_SNAPSHOT_RUNTIME_SECONDS: int = 60 * 60 * 12 # 12 hours
+
ALLOWDENYLIST_REGEX_FLAGS: int = re.IGNORECASE | re.UNICODE | re.MULTILINE
STATICFILE_EXTENSIONS: frozenset[str] = frozenset((
diff --git a/archivebox/core/admin_archiveresults.py b/archivebox/core/admin_archiveresults.py
index b4c420b7..70353578 100644
--- a/archivebox/core/admin_archiveresults.py
+++ b/archivebox/core/admin_archiveresults.py
@@ -14,6 +14,7 @@ from archivebox.config.common import SERVER_CONFIG
from archivebox.misc.paginators import AccelleratedPaginator
from archivebox.base_models.admin import BaseModelAdmin
from archivebox.hooks import get_plugin_icon
+from archivebox.core.host_utils import build_snapshot_url
from archivebox.core.models import ArchiveResult, Snapshot
@@ -57,7 +58,11 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
# Build output link - use embed_path() which checks output_files first
embed_path = result.embed_path() if hasattr(result, 'embed_path') else None
- output_link = f'/{result.snapshot.archive_path}/{embed_path}' if embed_path and result.status == 'succeeded' else f'/{result.snapshot.archive_path}/'
+ snapshot_id = str(getattr(result, 'snapshot_id', ''))
+ if embed_path and result.status == 'succeeded':
+ output_link = build_snapshot_url(snapshot_id, embed_path)
+ else:
+ output_link = build_snapshot_url(snapshot_id, '')
# Get version - try cmd_version field
version = result.cmd_version if result.cmd_version else '-'
@@ -252,7 +257,7 @@ class ArchiveResultInline(admin.TabularInline):
class ArchiveResultAdmin(BaseModelAdmin):
list_display = ('id', 'created_at', 'snapshot_info', 'tags_str', 'status', 'plugin_with_icon', 'cmd_str', 'output_str')
sort_fields = ('id', 'created_at', 'plugin', 'status')
- readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'plugin_with_icon')
+ readonly_fields = ('cmd', 'cmd_version', 'pwd', 'cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'plugin_with_icon')
search_fields = ('id', 'snapshot__url', 'plugin', 'output_str', 'cmd_version', 'cmd', 'snapshot__timestamp')
autocomplete_fields = ['snapshot']
@@ -300,10 +305,11 @@ class ArchiveResultAdmin(BaseModelAdmin):
description='Snapshot Info'
)
def snapshot_info(self, result):
+ snapshot_id = str(result.snapshot_id)
return format_html(
- '[{}] {} {}
',
- result.snapshot.archive_path,
- str(result.snapshot.id)[:8],
+ '[{}] {} {}
',
+ build_snapshot_url(snapshot_id, "index.html"),
+ snapshot_id[:8],
result.snapshot.bookmarked_at.strftime('%Y-%m-%d %H:%M'),
result.snapshot.url[:128],
)
@@ -335,10 +341,10 @@ class ArchiveResultAdmin(BaseModelAdmin):
# Determine output link path - use embed_path() which checks output_files
embed_path = result.embed_path() if hasattr(result, 'embed_path') else None
output_path = embed_path if (result.status == 'succeeded' and embed_path) else 'index.html'
+ snapshot_id = str(result.snapshot_id)
return format_html(
- '↗️
{}',
- result.snapshot.archive_path,
- output_path,
+ '↗️{}',
+ build_snapshot_url(snapshot_id, output_path),
result.output_str,
)
@@ -348,7 +354,11 @@ class ArchiveResultAdmin(BaseModelAdmin):
'{}', str(result.snapshot.archive_path))
+ snapshot_id = str(result.snapshot_id)
+ output_html += format_html(
+ 'See result files ...
',
+ build_snapshot_url(snapshot_id, "index.html"),
+ )
embed_path = result.embed_path() if hasattr(result, 'embed_path') else ''
path_from_embed = (snapshot_dir / (embed_path or ''))
output_html += format_html('{}/{}
', str(snapshot_dir), str(embed_path))
diff --git a/archivebox/core/admin_snapshots.py b/archivebox/core/admin_snapshots.py
index d75198ff..25c89e15 100644
--- a/archivebox/core/admin_snapshots.py
+++ b/archivebox/core/admin_snapshots.py
@@ -8,6 +8,8 @@ from django.contrib import admin, messages
from django.urls import path
from django.utils.html import format_html, mark_safe
from django.utils import timezone
+from django.db.models import Q, Sum, Count, Prefetch
+from django.db.models.functions import Coalesce
from django import forms
from django.template import Template, RequestContext
from django.contrib.admin.helpers import ActionForm
@@ -18,11 +20,12 @@ from archivebox.misc.util import htmldecode, urldecode
from archivebox.misc.paginators import AccelleratedPaginator
from archivebox.misc.logging_util import printable_filesize
from archivebox.search.admin import SearchResultsAdminMixin
+from archivebox.core.host_utils import build_snapshot_url, build_web_url
from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin
from archivebox.workers.tasks import bg_archive_snapshots, bg_add
-from archivebox.core.models import Tag, Snapshot
+from archivebox.core.models import Tag, Snapshot, ArchiveResult
from archivebox.core.admin_archiveresults import ArchiveResultInline, render_archiveresults_list
from archivebox.core.widgets import TagEditorWidget, InlineTagEditorWidget
@@ -36,7 +39,7 @@ class SnapshotActionForm(ActionForm):
super().__init__(*args, **kwargs)
# Define tags field in __init__ to avoid database access during app initialization
self.fields['tags'] = forms.CharField(
- label='Edit tags',
+ label='',
required=False,
widget=TagEditorWidget(),
)
@@ -67,6 +70,19 @@ class SnapshotActionForm(ActionForm):
# )
+class TagNameListFilter(admin.SimpleListFilter):
+ title = 'By tag name'
+ parameter_name = 'tag'
+
+ def lookups(self, request, model_admin):
+ return [(str(tag.pk), tag.name) for tag in Tag.objects.order_by('name')]
+
+ def queryset(self, request, queryset):
+ if self.value():
+ return queryset.filter(tags__id=self.value())
+ return queryset
+
+
class SnapshotAdminForm(forms.ModelForm):
"""Custom form for Snapshot admin with tag editor widget."""
tags_editor = forms.CharField(
@@ -117,11 +133,11 @@ class SnapshotAdminForm(forms.ModelForm):
class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
form = SnapshotAdminForm
- list_display = ('created_at', 'title_str', 'status_with_progress', 'files', 'size_with_stats', 'health_display', 'url_str')
- sort_fields = ('title_str', 'url_str', 'created_at', 'status', 'crawl')
+ list_display = ('created_at', 'preview_icon', 'title_str', 'tags_inline', 'status_with_progress', 'files', 'size_with_stats')
+ sort_fields = ('title_str', 'created_at', 'status', 'crawl')
readonly_fields = ('admin_actions', 'status_info', 'imported_timestamp', 'created_at', 'modified_at', 'downloaded_at', 'output_dir', 'archiveresults_list')
search_fields = ('id', 'url', 'timestamp', 'title', 'tags__name')
- list_filter = ('created_at', 'downloaded_at', 'archiveresult__status', 'crawl__created_by', 'tags__name')
+ list_filter = ('created_at', 'downloaded_at', 'archiveresult__status', 'crawl__created_by', TagNameListFilter)
fieldsets = (
('URL', {
@@ -163,7 +179,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
)
ordering = ['-created_at']
- actions = ['add_tags', 'remove_tags', 'update_titles', 'update_snapshots', 'resnapshot_snapshot', 'overwrite_snapshots', 'delete_snapshots']
+ actions = ['add_tags', 'remove_tags', 'update_snapshots', 'resnapshot_snapshot', 'overwrite_snapshots', 'delete_snapshots']
inlines = [] # Removed TagInline, using TagEditorWidget instead
list_per_page = min(max(5, SERVER_CONFIG.SNAPSHOTS_PER_PAGE), 5000)
@@ -182,6 +198,13 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
self.message_user(request, f'Error occurred while loading the page: {str(e)} {request.GET} {request.POST}')
return super().changelist_view(request, GLOBAL_CONTEXT)
+ def get_actions(self, request):
+ actions = super().get_actions(request)
+ if 'delete_selected' in actions:
+ func, name, _desc = actions['delete_selected']
+ actions['delete_selected'] = (func, name, 'Delete')
+ return actions
+
def get_urls(self):
urls = super().get_urls()
@@ -196,6 +219,52 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
# self.request = request
# return super().get_queryset(request).prefetch_related('archiveresult_set').distinct() # .annotate(archiveresult_count=Count('archiveresult'))
+ def get_queryset(self, request):
+ self.request = request
+ ordering_fields = self._get_ordering_fields(request)
+ needs_size_sort = 'size_with_stats' in ordering_fields
+ needs_files_sort = 'files' in ordering_fields
+ needs_tags_sort = 'tags_inline' in ordering_fields
+
+ prefetch_qs = ArchiveResult.objects.filter(
+ Q(status='succeeded')
+ ).only(
+ 'id',
+ 'snapshot_id',
+ 'plugin',
+ 'status',
+ 'output_size',
+ 'output_files',
+ 'output_str',
+ )
+
+ qs = (
+ super()
+ .get_queryset(request)
+ .defer('config', 'notes')
+ .prefetch_related('tags')
+ .prefetch_related(Prefetch('archiveresult_set', queryset=prefetch_qs))
+ )
+
+ if needs_size_sort:
+ qs = qs.annotate(
+ output_size_sum=Coalesce(Sum(
+ 'archiveresult__output_size',
+ filter=Q(archiveresult__status='succeeded'),
+ ), 0),
+ )
+
+ if needs_files_sort:
+ qs = qs.annotate(
+ ar_succeeded_count=Count(
+ 'archiveresult',
+ filter=Q(archiveresult__status='succeeded'),
+ ),
+ )
+ if needs_tags_sort:
+ qs = qs.annotate(tag_count=Count('tags', distinct=True))
+
+ return qs
@admin.display(description="Imported Timestamp")
def imported_timestamp(self, obj):
@@ -233,17 +302,19 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
# )
def admin_actions(self, obj):
+ summary_url = build_web_url(f'/{obj.archive_path}')
+ results_url = build_web_url(f'/{obj.archive_path}/index.html#all')
return format_html(
'''
📄 Summary Page
📁 Result Files
@@ -263,7 +334,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
title="Get missing extractors"
onmouseover="this.style.background='#d1fae5';"
onmouseout="this.style.background='#ecfdf5';">
- ⬇️ Get Missing
+ ⬇️ Finish
Tip: Action buttons link to the list view with this snapshot pre-selected. Select it and use the action dropdown to execute.
''',
- obj.archive_path,
- obj.archive_path,
+ summary_url,
+ results_url,
obj.url,
obj.pk,
obj.pk,
@@ -301,6 +372,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
)
def status_info(self, obj):
+ favicon_url = build_snapshot_url(str(obj.id), 'favicon.ico')
return format_html(
'''
Archived: {} ({} files {})
@@ -310,7 +382,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
'✅' if obj.is_archived else '❌',
obj.num_outputs,
self.size(obj) or '0kb',
- f'/{obj.archive_path}/favicon.ico',
+ favicon_url,
obj.extension or '-',
)
@@ -323,7 +395,37 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
ordering='title',
)
def title_str(self, obj):
- # Render inline tag editor widget
+ title_raw = (obj.title or '').strip()
+ url_raw = (obj.url or '').strip()
+ title_normalized = title_raw.lower()
+ url_normalized = url_raw.lower()
+ show_title = bool(title_raw) and title_normalized != 'pending...' and title_normalized != url_normalized
+ css_class = 'fetched' if show_title else 'pending'
+
+ detail_url = build_web_url(f'/{obj.archive_path}/index.html')
+ title_html = ''
+ if show_title:
+ title_html = format_html(
+ ''
+ '{}'
+ '',
+ detail_url,
+ css_class,
+ urldecode(htmldecode(title_raw))[:128],
+ )
+
+ return format_html(
+ '{}'
+ ''
+ '{}'
+ '',
+ title_html,
+ url_raw or obj.url,
+ (url_raw or obj.url)[:128],
+ )
+
+ @admin.display(description='Tags', ordering='tag_count')
+ def tags_inline(self, obj):
widget = InlineTagEditorWidget(snapshot_id=str(obj.pk))
tags_html = widget.render(
name=f'tags_{obj.pk}',
@@ -331,28 +433,58 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
attrs={'id': f'tags_{obj.pk}'},
snapshot_id=str(obj.pk),
)
+ return mark_safe(f'')
- # Show title if available, otherwise show URL
- display_text = obj.title or obj.url
- css_class = 'fetched' if obj.title else 'pending'
+ @admin.display(description='Preview', empty_value='')
+ def preview_icon(self, obj):
+ results = self._get_prefetched_results(obj)
+ has_screenshot = False
+ has_favicon = False
+ if results is not None:
+ has_screenshot = any(r.plugin == 'screenshot' for r in results)
+ has_favicon = any(r.plugin == 'favicon' for r in results)
+
+ if not has_screenshot and not has_favicon:
+ return None
+
+ if has_screenshot:
+ img_url = build_snapshot_url(str(obj.id), 'screenshot/screenshot.png')
+ fallbacks = [
+ build_snapshot_url(str(obj.id), 'screenshot.png'),
+ build_snapshot_url(str(obj.id), 'favicon/favicon.ico'),
+ build_snapshot_url(str(obj.id), 'favicon.ico'),
+ ]
+ img_alt = 'Screenshot'
+ preview_class = 'screenshot'
+ else:
+ img_url = build_snapshot_url(str(obj.id), 'favicon/favicon.ico')
+ fallbacks = [
+ build_snapshot_url(str(obj.id), 'favicon.ico'),
+ ]
+ img_alt = 'Favicon'
+ preview_class = 'favicon'
+
+ fallback_list = ','.join(fallbacks)
+ onerror_js = (
+ "this.dataset.fallbacks && this.dataset.fallbacks.length ? "
+ "(this.src=this.dataset.fallbacks.split(',').shift(), "
+ "this.dataset.fallbacks=this.dataset.fallbacks.split(',').slice(1).join(',')) : "
+ "this.remove()"
+ )
return format_html(
- ''
- '
'
- ''
- ''
- '{}'
- '',
- obj.archive_path,
- obj.archive_path,
- obj.archive_path,
- css_class,
- urldecode(htmldecode(display_text))[:128]
- ) + mark_safe(f' ')
+ '
',
+ img_url,
+ img_alt,
+ preview_class,
+ onerror_js,
+ fallback_list,
+ )
@admin.display(
description='Files Saved',
- # ordering='archiveresult_count',
+ ordering='ar_succeeded_count',
)
def files(self, obj):
# return '-'
@@ -371,8 +503,8 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
else:
size_txt = mark_safe('...')
return format_html(
- '{}',
- obj.archive_path,
+ '{}',
+ build_web_url(f'/{obj.archive_path}'),
size_txt,
)
@@ -382,7 +514,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
)
def status_with_progress(self, obj):
"""Show status with progress bar for in-progress snapshots."""
- stats = obj.get_progress_stats()
+ stats = self._get_progress_stats(obj)
# Status badge colors
status_colors = {
@@ -440,16 +572,13 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
@admin.display(
description='Size',
+ ordering='output_size_sum',
)
def size_with_stats(self, obj):
"""Show archive size with output size from archive results."""
- stats = obj.get_progress_stats()
-
- # Use output_size from archive results if available, fallback to disk size
+ stats = self._get_progress_stats(obj)
output_size = stats['output_size']
- archive_size = os.access(Path(obj.output_dir) / 'index.html', os.F_OK) and obj.archive_size
-
- size_bytes = output_size or archive_size or 0
+ size_bytes = output_size or 0
if size_bytes:
size_txt = printable_filesize(size_bytes)
@@ -461,22 +590,76 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
# Show hook statistics
if stats['total'] > 0:
return format_html(
- ''
+ ''
'{}'
''
'{}/{} hooks',
- obj.archive_path,
+ build_web_url(f'/{obj.archive_path}'),
size_txt,
stats['succeeded'],
stats['total'],
)
return format_html(
- '{}',
- obj.archive_path,
+ '{}',
+ build_web_url(f'/{obj.archive_path}'),
size_txt,
)
+ def _get_progress_stats(self, obj):
+ results = self._get_prefetched_results(obj)
+ if results is None:
+ return obj.get_progress_stats()
+
+ total = len(results)
+ succeeded = sum(1 for r in results if r.status == 'succeeded')
+ failed = sum(1 for r in results if r.status == 'failed')
+ running = sum(1 for r in results if r.status == 'started')
+ skipped = sum(1 for r in results if r.status == 'skipped')
+ pending = max(total - succeeded - failed - running - skipped, 0)
+ completed = succeeded + failed + skipped
+ percent = int((completed / total * 100) if total > 0 else 0)
+ is_sealed = obj.status not in (obj.StatusChoices.QUEUED, obj.StatusChoices.STARTED)
+ output_size = None
+
+ if hasattr(obj, 'output_size_sum'):
+ output_size = obj.output_size_sum or 0
+ else:
+ output_size = sum(r.output_size or 0 for r in results if r.status == 'succeeded')
+
+ return {
+ 'total': total,
+ 'succeeded': succeeded,
+ 'failed': failed,
+ 'running': running,
+ 'pending': pending,
+ 'skipped': skipped,
+ 'percent': percent,
+ 'output_size': output_size or 0,
+ 'is_sealed': is_sealed,
+ }
+
+ def _get_prefetched_results(self, obj):
+ if hasattr(obj, '_prefetched_objects_cache') and 'archiveresult_set' in obj._prefetched_objects_cache:
+ return obj.archiveresult_set.all()
+ return None
+
+ def _get_ordering_fields(self, request):
+ ordering = request.GET.get('o')
+ if not ordering:
+ return set()
+ fields = set()
+ for part in ordering.split('.'):
+ if not part:
+ continue
+ try:
+ idx = abs(int(part)) - 1
+ except ValueError:
+ continue
+ if 0 <= idx < len(self.list_display):
+ fields.add(self.list_display[idx])
+ return fields
+
@admin.display(
description='Original URL',
ordering='url',
@@ -524,20 +707,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
# return super().changelist_view(request, extra_context=None)
@admin.action(
- description="ℹ️ Get Title"
- )
- def update_titles(self, request, queryset):
- count = queryset.count()
-
- # Queue snapshots for archiving via the state machine system
- queued = bg_archive_snapshots(queryset, kwargs={"overwrite": True, "methods": ["title", "favicon"], "out_dir": DATA_DIR})
- messages.success(
- request,
- f"Queued {queued} snapshots for title/favicon update. The orchestrator will process them in the background.",
- )
-
- @admin.action(
- description="⬇️ Get Missing"
+ description="⏯️ Finish"
)
def update_snapshots(self, request, queryset):
count = queryset.count()
@@ -551,7 +721,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
@admin.action(
- description="🆕 Archive Again"
+ description="⬇️ Fresh"
)
def resnapshot_snapshot(self, request, queryset):
for snapshot in queryset:
@@ -579,7 +749,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
)
@admin.action(
- description="☠️ Delete"
+ description="🗑️ Delete"
)
def delete_snapshots(self, request, queryset):
"""Delete snapshots in a single transaction to avoid SQLite concurrency issues."""
diff --git a/archivebox/core/apps.py b/archivebox/core/apps.py
index 4c0e438a..713d34d9 100644
--- a/archivebox/core/apps.py
+++ b/archivebox/core/apps.py
@@ -1,6 +1,9 @@
__package__ = 'archivebox.core'
from django.apps import AppConfig
+import os
+
+_ORCHESTRATOR_BOOTSTRAPPED = False
class CoreConfig(AppConfig):
@@ -10,6 +13,7 @@ class CoreConfig(AppConfig):
def ready(self):
"""Register the archivebox.core.admin_site as the main django admin site"""
import sys
+ from django.utils.autoreload import DJANGO_AUTORELOAD_ENV
from archivebox.core.admin_site import register_admin_site
register_admin_site()
@@ -18,3 +22,45 @@ class CoreConfig(AppConfig):
# Skip during makemigrations to avoid premature state machine access
if 'makemigrations' not in sys.argv:
from archivebox.core import models # noqa: F401
+
+ pidfile = os.environ.get('ARCHIVEBOX_RUNSERVER_PIDFILE')
+ if pidfile:
+ should_write_pid = True
+ if os.environ.get('ARCHIVEBOX_AUTORELOAD') == '1':
+ should_write_pid = os.environ.get(DJANGO_AUTORELOAD_ENV) == 'true'
+ if should_write_pid:
+ try:
+ with open(pidfile, 'w') as handle:
+ handle.write(str(os.getpid()))
+ except Exception:
+ pass
+
+ def _should_manage_orchestrator() -> bool:
+ if os.environ.get('ARCHIVEBOX_ORCHESTRATOR_MANAGED_BY_WATCHER') == '1':
+ return False
+ if os.environ.get('ARCHIVEBOX_ORCHESTRATOR_PROCESS') == '1':
+ return False
+ if os.environ.get('ARCHIVEBOX_RUNSERVER') == '1':
+ if os.environ.get('ARCHIVEBOX_AUTORELOAD') == '1':
+ return os.environ.get(DJANGO_AUTORELOAD_ENV) == 'true'
+ return True
+
+ argv = ' '.join(sys.argv).lower()
+ if 'orchestrator' in argv:
+ return False
+ return 'daphne' in argv and '--reload' in sys.argv
+
+ if _should_manage_orchestrator():
+ global _ORCHESTRATOR_BOOTSTRAPPED
+ if _ORCHESTRATOR_BOOTSTRAPPED:
+ return
+ _ORCHESTRATOR_BOOTSTRAPPED = True
+
+ from archivebox.machine.models import Process, Machine
+ from archivebox.workers.orchestrator import Orchestrator
+
+ Process.cleanup_stale_running()
+ machine = Machine.current()
+
+ if not Orchestrator.is_running():
+ Orchestrator(exit_on_idle=False).start()
diff --git a/archivebox/core/host_utils.py b/archivebox/core/host_utils.py
new file mode 100644
index 00000000..2e723d05
--- /dev/null
+++ b/archivebox/core/host_utils.py
@@ -0,0 +1,187 @@
+from __future__ import annotations
+
+import re
+from urllib.parse import urlparse
+
+from archivebox.config.common import SERVER_CONFIG
+
+
+_SNAPSHOT_ID_RE = re.compile(r"^[0-9a-fA-F-]{8,36}$")
+
+
+def split_host_port(host: str) -> tuple[str, str | None]:
+ parsed = urlparse(f"//{host}")
+ hostname = (parsed.hostname or host or "").lower()
+ port = str(parsed.port) if parsed.port else None
+ return hostname, port
+
+
+def _normalize_base_url(value: str | None) -> str:
+ if not value:
+ return ""
+ base = value.strip()
+ if not base:
+ return ""
+ if "://" not in base:
+ base = f"http://{base}"
+ parsed = urlparse(base)
+ if not parsed.netloc:
+ return ""
+ return f"{parsed.scheme}://{parsed.netloc}"
+
+
+def normalize_base_url(value: str | None) -> str:
+ return _normalize_base_url(value)
+
+
+def get_listen_host() -> str:
+ return (SERVER_CONFIG.LISTEN_HOST or "").strip()
+
+
+def get_listen_parts() -> tuple[str, str | None]:
+ return split_host_port(get_listen_host())
+
+
+def _build_listen_host(subdomain: str | None) -> str:
+ host, port = get_listen_parts()
+ if not host:
+ return ""
+ full_host = f"{subdomain}.{host}" if subdomain else host
+ if port:
+ return f"{full_host}:{port}"
+ return full_host
+
+
+def get_admin_host() -> str:
+ override = _normalize_base_url(SERVER_CONFIG.ADMIN_BASE_URL)
+ if override:
+ return urlparse(override).netloc.lower()
+ return _build_listen_host("admin")
+
+
+def get_web_host() -> str:
+ override = _normalize_base_url(SERVER_CONFIG.ARCHIVE_BASE_URL)
+ if override:
+ return urlparse(override).netloc.lower()
+ return _build_listen_host("web")
+
+def get_api_host() -> str:
+ return _build_listen_host("api")
+
+def get_public_host() -> str:
+ return _build_listen_host("public")
+
+
+def get_snapshot_host(snapshot_id: str) -> str:
+ return _build_listen_host(snapshot_id)
+
+
+def get_original_host(domain: str) -> str:
+ return _build_listen_host(domain)
+
+
+def is_snapshot_subdomain(subdomain: str) -> bool:
+ return bool(_SNAPSHOT_ID_RE.match(subdomain or ""))
+
+
+def get_listen_subdomain(request_host: str) -> str:
+ req_host, req_port = split_host_port(request_host)
+ listen_host, listen_port = get_listen_parts()
+ if not listen_host:
+ return ""
+ if listen_port and req_port and listen_port != req_port:
+ return ""
+ if req_host == listen_host:
+ return ""
+ suffix = f".{listen_host}"
+ if req_host.endswith(suffix):
+ return req_host[: -len(suffix)]
+ return ""
+
+
+def host_matches(request_host: str, target_host: str) -> bool:
+ if not request_host or not target_host:
+ return False
+ req_host, req_port = split_host_port(request_host)
+ target_host_only, target_port = split_host_port(target_host)
+ if req_host != target_host_only:
+ return False
+ if target_port and req_port and target_port != req_port:
+ return False
+ return True
+
+
+def _scheme_from_request(request=None) -> str:
+ if request:
+ return request.scheme
+ return "http"
+
+
+def _build_base_url_for_host(host: str, request=None) -> str:
+ if not host:
+ return ""
+ scheme = _scheme_from_request(request)
+ return f"{scheme}://{host}"
+
+
+def get_admin_base_url(request=None) -> str:
+ override = _normalize_base_url(SERVER_CONFIG.ADMIN_BASE_URL)
+ if override:
+ return override
+ return _build_base_url_for_host(get_admin_host(), request=request)
+
+
+def get_web_base_url(request=None) -> str:
+ override = _normalize_base_url(SERVER_CONFIG.ARCHIVE_BASE_URL)
+ if override:
+ return override
+ return _build_base_url_for_host(get_web_host(), request=request)
+
+def get_api_base_url(request=None) -> str:
+ return _build_base_url_for_host(get_api_host(), request=request)
+
+
+# Backwards-compat aliases (archive == web)
+def get_archive_base_url(request=None) -> str:
+ return get_web_base_url(request=request)
+
+
+def get_snapshot_base_url(snapshot_id: str, request=None) -> str:
+ return _build_base_url_for_host(get_snapshot_host(snapshot_id), request=request)
+
+
+def get_original_base_url(domain: str, request=None) -> str:
+ return _build_base_url_for_host(get_original_host(domain), request=request)
+
+
+def build_admin_url(path: str = "", request=None) -> str:
+ return _build_url(get_admin_base_url(request), path)
+
+
+def build_web_url(path: str = "", request=None) -> str:
+ return _build_url(get_web_base_url(request), path)
+
+def build_api_url(path: str = "", request=None) -> str:
+ return _build_url(get_api_base_url(request), path)
+
+
+def build_archive_url(path: str = "", request=None) -> str:
+ return _build_url(get_archive_base_url(request), path)
+
+
+def build_snapshot_url(snapshot_id: str, path: str = "", request=None) -> str:
+ return _build_url(get_snapshot_base_url(snapshot_id, request=request), path)
+
+
+def build_original_url(domain: str, path: str = "", request=None) -> str:
+ return _build_url(get_original_base_url(domain, request=request), path)
+
+
+def _build_url(base_url: str, path: str) -> str:
+ if not base_url:
+ if not path:
+ return ""
+ return path if path.startswith("/") else f"/{path}"
+ if not path:
+ return base_url
+ return f"{base_url}{path if path.startswith('/') else f'/{path}'}"
diff --git a/archivebox/core/middleware.py b/archivebox/core/middleware.py
index a5343196..2003b478 100644
--- a/archivebox/core/middleware.py
+++ b/archivebox/core/middleware.py
@@ -2,11 +2,33 @@ __package__ = 'archivebox.core'
import ipaddress
import re
+from pathlib import Path
from django.utils import timezone
from django.contrib.auth.middleware import RemoteUserMiddleware
+from django.contrib.auth.models import AnonymousUser
from django.core.exceptions import ImproperlyConfigured
+from django.shortcuts import redirect
+from django.contrib.staticfiles import finders
+from django.utils.http import http_date
+from django.http import HttpResponseNotModified
from archivebox.config.common import SERVER_CONFIG
+from archivebox.config import VERSION
+from archivebox.config.version import get_COMMIT_HASH
+from archivebox.core.host_utils import (
+ build_admin_url,
+ build_api_url,
+ build_web_url,
+ get_api_host,
+ get_admin_host,
+ get_listen_host,
+ get_listen_subdomain,
+ get_public_host,
+ get_web_host,
+ host_matches,
+ is_snapshot_subdomain,
+)
+from archivebox.core.views import SnapshotHostView, OriginalDomainHostView
def detect_timezone(request, activate: bool=True):
@@ -30,17 +52,112 @@ def TimezoneMiddleware(get_response):
def CacheControlMiddleware(get_response):
snapshot_path_re = re.compile(r"^/[^/]+/\\d{8}/[^/]+/[0-9a-fA-F-]{8,36}/")
+ static_cache_key = (get_COMMIT_HASH() or VERSION or "dev").strip()
def middleware(request):
response = get_response(request)
+ if request.path.startswith('/static/'):
+ rel_path = request.path[len('/static/'):]
+ static_path = finders.find(rel_path)
+ if static_path:
+ try:
+ mtime = Path(static_path).stat().st_mtime
+ except OSError:
+ mtime = None
+ etag = f'"{static_cache_key}:{int(mtime) if mtime else 0}"'
+ inm = request.META.get("HTTP_IF_NONE_MATCH")
+ if inm:
+ inm_list = [item.strip() for item in inm.split(",")]
+ if etag in inm_list or etag.strip('"') in [i.strip('"') for i in inm_list]:
+ not_modified = HttpResponseNotModified()
+ not_modified.headers["ETag"] = etag
+ not_modified.headers["Cache-Control"] = "public, max-age=31536000, immutable"
+ if mtime:
+ not_modified.headers["Last-Modified"] = http_date(mtime)
+ return not_modified
+ response.headers["ETag"] = etag
+ response.headers["Cache-Control"] = "public, max-age=31536000, immutable"
+ if mtime and not response.headers.get("Last-Modified"):
+ response.headers["Last-Modified"] = http_date(mtime)
+ return response
+
if '/archive/' in request.path or '/static/' in request.path or snapshot_path_re.match(request.path):
- policy = 'public' if SERVER_CONFIG.PUBLIC_SNAPSHOTS else 'private'
- response['Cache-Control'] = f'{policy}, max-age=60, stale-while-revalidate=300'
- # print('Set Cache-Control header to', response['Cache-Control'])
+ if not response.get('Cache-Control'):
+ policy = 'public' if SERVER_CONFIG.PUBLIC_SNAPSHOTS else 'private'
+ response['Cache-Control'] = f'{policy}, max-age=60, stale-while-revalidate=300'
+ # print('Set Cache-Control header to', response['Cache-Control'])
return response
return middleware
+
+def HostRoutingMiddleware(get_response):
+ def middleware(request):
+ request_host = (request.get_host() or "").lower()
+ admin_host = get_admin_host()
+ web_host = get_web_host()
+ api_host = get_api_host()
+ public_host = get_public_host()
+ listen_host = get_listen_host()
+ subdomain = get_listen_subdomain(request_host)
+
+ if host_matches(request_host, admin_host):
+ return get_response(request)
+
+ if host_matches(request_host, api_host):
+ request.user = AnonymousUser()
+ request._cached_user = request.user
+ if request.path.startswith("/admin"):
+ target = build_admin_url(request.path, request=request)
+ if request.META.get("QUERY_STRING"):
+ target = f"{target}?{request.META['QUERY_STRING']}"
+ return redirect(target)
+ if not request.path.startswith("/api/"):
+ target_path = f"/api{request.path if request.path.startswith('/') else f'/{request.path}'}"
+ if request.META.get("QUERY_STRING"):
+ target_path = f"{target_path}?{request.META['QUERY_STRING']}"
+ return redirect(target_path)
+ return get_response(request)
+
+ if host_matches(request_host, web_host):
+ request.user = AnonymousUser()
+ request._cached_user = request.user
+ if request.path.startswith("/admin"):
+ target = build_admin_url(request.path, request=request)
+ if request.META.get("QUERY_STRING"):
+ target = f"{target}?{request.META['QUERY_STRING']}"
+ return redirect(target)
+ return get_response(request)
+
+ if host_matches(request_host, public_host):
+ request.user = AnonymousUser()
+ request._cached_user = request.user
+ return get_response(request)
+
+ if subdomain:
+ if is_snapshot_subdomain(subdomain):
+ view = SnapshotHostView.as_view()
+ return view(request, snapshot_id=subdomain, path=request.path.lstrip("/"))
+ view = OriginalDomainHostView.as_view()
+ return view(request, domain=subdomain, path=request.path.lstrip("/"))
+
+ if host_matches(request_host, listen_host):
+ target = build_web_url(request.path, request=request)
+ if request.META.get("QUERY_STRING"):
+ target = f"{target}?{request.META['QUERY_STRING']}"
+ return redirect(target)
+
+ if admin_host or web_host:
+ target = build_web_url(request.path, request=request)
+ if target:
+ if request.META.get("QUERY_STRING"):
+ target = f"{target}?{request.META['QUERY_STRING']}"
+ return redirect(target)
+
+ return get_response(request)
+
+ return middleware
+
class ReverseProxyAuthMiddleware(RemoteUserMiddleware):
header = 'HTTP_{normalized}'.format(normalized=SERVER_CONFIG.REVERSE_PROXY_USER_HEADER.replace('-', '_').upper())
diff --git a/archivebox/core/migrations/0031_add_archiveresult_snapshot_status_index.py b/archivebox/core/migrations/0031_add_archiveresult_snapshot_status_index.py
new file mode 100644
index 00000000..cea2b04d
--- /dev/null
+++ b/archivebox/core/migrations/0031_add_archiveresult_snapshot_status_index.py
@@ -0,0 +1,17 @@
+# Generated by Codex on 2026-01-21
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+ dependencies = [
+ ('core', '0030_alter_archiveresult_id'),
+ ]
+
+ operations = [
+ migrations.AddIndex(
+ model_name='archiveresult',
+ index=models.Index(fields=['snapshot', 'status'], name='archiveresult_snap_status_idx'),
+ ),
+ ]
diff --git a/archivebox/core/models.py b/archivebox/core/models.py
index e306fd64..b2c4d719 100755
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -1297,7 +1297,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
path = self.archive_path
output = ""
- output_template = '{} '
+ output_template = '{}'
# Get all plugins from hooks system (sorted by numeric prefix)
all_plugins = [get_plugin_name(e) for e in get_plugins()]
@@ -1322,7 +1322,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
icon
)
- return format_html('', mark_safe(output))
+ return format_html('', mark_safe(output))
cache_result = cache.get(cache_key)
if cache_result:
@@ -1789,7 +1789,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
)['total_size'] or 0
# Check if sealed
- is_sealed = self.status in (self.StatusChoices.SEALED, self.StatusChoices.FAILED, self.StatusChoices.BACKOFF)
+ is_sealed = self.status not in (self.StatusChoices.QUEUED, self.StatusChoices.STARTED)
return {
'total': total,
@@ -1992,6 +1992,14 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
size = sum(p.stat().st_size for p in abs_path.rglob('*') if p.is_file())
else:
size = abs_path.stat().st_size
+ plugin_lower = (result.plugin or '').lower()
+ if plugin_lower in ('ytdlp', 'yt-dlp', 'youtube-dl'):
+ plugin_dir = snap_dir / result.plugin
+ if plugin_dir.exists():
+ try:
+ size = sum(p.stat().st_size for p in plugin_dir.rglob('*') if p.is_file())
+ except OSError:
+ pass
outputs.append({
'name': result.plugin,
'path': embed_path,
@@ -2057,6 +2065,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
def to_dict(self, extended: bool = False) -> Dict[str, Any]:
"""Convert Snapshot to a dictionary (replacement for Link._asdict())"""
from archivebox.misc.util import ts_to_date_str
+ from archivebox.core.host_utils import build_snapshot_url
result = {
'TYPE': 'core.models.Snapshot',
@@ -2078,6 +2087,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
'is_static': self.is_static,
'is_archived': self.is_archived,
'archive_path': self.archive_path,
+ 'archive_url': build_snapshot_url(str(self.id), 'index.html'),
'output_dir': self.output_dir,
'link_dir': self.output_dir, # backwards compatibility alias
'archive_size': self.archive_size,
@@ -2129,14 +2139,17 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
outputs_by_plugin = {out['name']: out for out in outputs}
best_preview_path = 'about:blank'
+ best_result = {'path': 'about:blank', 'result': None}
for plugin in preview_priority:
out = outputs_by_plugin.get(plugin)
if out and out.get('path'):
best_preview_path = out['path']
+ best_result = out
break
if best_preview_path == 'about:blank' and outputs:
best_preview_path = outputs[0].get('path') or 'about:blank'
+ best_result = outputs[0]
context = {
**self.to_dict(extended=True),
'title': htmlencode(self.title or (self.base_url if self.is_archived else TITLE_LOADING_MSG)),
@@ -2151,6 +2164,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
'SAVE_ARCHIVE_DOT_ORG': SAVE_ARCHIVE_DOT_ORG,
'PREVIEW_ORIGINALS': SERVER_CONFIG.PREVIEW_ORIGINALS,
'best_preview_path': best_preview_path,
+ 'best_result': best_result,
'archiveresults': outputs,
}
rendered_html = render_to_string('snapshot.html', context)
@@ -2326,6 +2340,9 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
app_label = 'core'
verbose_name = 'Archive Result'
verbose_name_plural = 'Archive Results Log'
+ indexes = [
+ models.Index(fields=['snapshot', 'status'], name='archiveresult_snap_status_idx'),
+ ]
def __str__(self):
return f'[{self.id}] {self.snapshot.url[:64]} -> {self.plugin}'
@@ -2487,6 +2504,20 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
plugin_lower = (plugin_name or '').lower()
prefer_media = plugin_lower in ('ytdlp', 'yt-dlp', 'youtube-dl')
+ preferred_text = []
+ if plugin_lower:
+ preferred_text.extend([
+ f'{plugin_lower}.jsonl',
+ f'{plugin_lower}.json',
+ f'{plugin_lower}.txt',
+ f'{plugin_lower}.log',
+ ])
+ preferred_text.extend(['index.jsonl', 'index.json'])
+ for name in preferred_text:
+ candidate = dir_path / name
+ if candidate.exists() and candidate.is_file():
+ return candidate
+
if not prefer_media:
for name in ('index.html', 'index.htm'):
candidate = dir_path / name
@@ -2504,6 +2535,8 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
if file_path.is_dir() or file_path.name.startswith('.'):
continue
ext = file_path.suffix.lstrip('.').lower()
+ if ext in ('pid', 'log', 'sh'):
+ continue
if ext not in embeddable_exts:
continue
try:
@@ -2547,20 +2580,44 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
# Fallback: treat output_str as a file path only if it exists on disk
if self.output_str:
try:
- output_path = Path(self.output_str)
+ raw_output = str(self.output_str).strip()
+ if raw_output in ('.', './', ''):
+ best_file = self._find_best_output_file(plugin_dir, self.plugin)
+ if best_file:
+ return str(best_file.relative_to(snapshot_dir))
+ output_path = None
+ else:
+ output_path = Path(raw_output)
- if output_path.is_absolute():
+ if output_path and output_path.is_absolute():
# If absolute and within snapshot dir, normalize to relative
if snapshot_dir in output_path.parents and output_path.exists():
- return str(output_path.relative_to(snapshot_dir))
- else:
+ if output_path.is_file():
+ return str(output_path.relative_to(snapshot_dir))
+ if output_path.is_dir():
+ best_file = self._find_best_output_file(output_path, self.plugin)
+ if best_file:
+ return str(best_file.relative_to(snapshot_dir))
+ elif output_path:
# If relative, prefer plugin-prefixed path, then direct path
- if (plugin_dir / output_path).exists():
- return f'{self.plugin}/{output_path}'
+ plugin_candidate = plugin_dir / output_path
+ if plugin_candidate.exists():
+ if plugin_candidate.is_file():
+ return f'{self.plugin}/{output_path}'
+ if plugin_candidate.is_dir():
+ best_file = self._find_best_output_file(plugin_candidate, self.plugin)
+ if best_file:
+ return str(best_file.relative_to(snapshot_dir))
if output_path.name in ('index.html', 'index.json') and output_path.parent == Path('.'):
return None
- if (snapshot_dir / output_path).exists():
- return str(output_path)
+ snapshot_candidate = snapshot_dir / output_path
+ if snapshot_candidate.exists():
+ if snapshot_candidate.is_file():
+ return str(output_path)
+ if snapshot_candidate.is_dir():
+ best_file = self._find_best_output_file(snapshot_candidate, self.plugin)
+ if best_file:
+ return str(best_file.relative_to(snapshot_dir))
except Exception:
pass
@@ -2569,7 +2626,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
ignored = {'stdout.log', 'stderr.log', 'hook.pid', 'listener.pid', 'cmd.sh'}
output_candidates = [
f for f in self.output_files.keys()
- if Path(f).name not in ignored
+ if Path(f).name not in ignored and Path(f).suffix not in ('.pid', '.log', '.sh')
]
first_file = output_candidates[0] if output_candidates else None
if first_file and (plugin_dir / first_file).exists():
diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py
index 16b6df0c..2dec9a03 100644
--- a/archivebox/core/settings.py
+++ b/archivebox/core/settings.py
@@ -12,6 +12,7 @@ import archivebox
from archivebox.config import DATA_DIR, PACKAGE_DIR, ARCHIVE_DIR, CONSTANTS # noqa
from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG, STORAGE_CONFIG # noqa
+from archivebox.core.host_utils import normalize_base_url, get_admin_base_url, get_api_base_url
IS_MIGRATING = "makemigrations" in sys.argv[:3] or "migrate" in sys.argv[:3]
@@ -77,9 +78,11 @@ MIDDLEWARE = [
"django.middleware.security.SecurityMiddleware",
"django.contrib.sessions.middleware.SessionMiddleware",
"django.middleware.common.CommonMiddleware",
+ "archivebox.api.middleware.ApiCorsMiddleware",
"django.middleware.csrf.CsrfViewMiddleware",
"django.contrib.auth.middleware.AuthenticationMiddleware",
"archivebox.core.middleware.ReverseProxyAuthMiddleware",
+ "archivebox.core.middleware.HostRoutingMiddleware",
"django.contrib.messages.middleware.MessageMiddleware",
"archivebox.core.middleware.CacheControlMiddleware",
# Additional middlewares from plugins (if any)
@@ -347,6 +350,14 @@ SECRET_KEY = SERVER_CONFIG.SECRET_KEY or get_random_string(50, "abcdefghijklmnop
ALLOWED_HOSTS = SERVER_CONFIG.ALLOWED_HOSTS.split(",")
CSRF_TRUSTED_ORIGINS = list(set(SERVER_CONFIG.CSRF_TRUSTED_ORIGINS.split(",")))
+admin_base_url = normalize_base_url(get_admin_base_url())
+if admin_base_url and admin_base_url not in CSRF_TRUSTED_ORIGINS:
+ CSRF_TRUSTED_ORIGINS.append(admin_base_url)
+
+api_base_url = normalize_base_url(get_api_base_url())
+if api_base_url and api_base_url not in CSRF_TRUSTED_ORIGINS:
+ CSRF_TRUSTED_ORIGINS.append(api_base_url)
+
# automatically fix case when user sets ALLOWED_HOSTS (e.g. to archivebox.example.com)
# but forgets to add https://archivebox.example.com to CSRF_TRUSTED_ORIGINS
for hostname in ALLOWED_HOSTS:
@@ -363,6 +374,7 @@ CSRF_COOKIE_SECURE = False
SESSION_COOKIE_SECURE = False
SESSION_COOKIE_HTTPONLY = True
SESSION_COOKIE_DOMAIN = None
+CSRF_COOKIE_DOMAIN = None
SESSION_COOKIE_AGE = 1209600 # 2 weeks
SESSION_EXPIRE_AT_BROWSER_CLOSE = False
SESSION_SAVE_EVERY_REQUEST = False
diff --git a/archivebox/core/templatetags/config_tags.py b/archivebox/core/templatetags/config_tags.py
index 9921b1fb..94992075 100644
--- a/archivebox/core/templatetags/config_tags.py
+++ b/archivebox/core/templatetags/config_tags.py
@@ -15,6 +15,6 @@ def get_config(key: str) -> any:
Usage: {% get_config "ARCHIVEDOTORG_ENABLED" as enabled %}
"""
try:
- return _get_config(key)
+ return _get_config().get(key)
except (KeyError, AttributeError):
return None
diff --git a/archivebox/core/templatetags/core_tags.py b/archivebox/core/templatetags/core_tags.py
index bcf7f10d..e9a38023 100644
--- a/archivebox/core/templatetags/core_tags.py
+++ b/archivebox/core/templatetags/core_tags.py
@@ -9,10 +9,114 @@ from pathlib import Path
from archivebox.hooks import (
get_plugin_icon, get_plugin_template, get_plugin_name,
)
+from archivebox.core.host_utils import (
+ get_admin_base_url,
+ get_web_base_url,
+ get_snapshot_base_url,
+ build_snapshot_url,
+)
register = template.Library()
+_MEDIA_FILE_EXTS = {
+ '.mp4', '.webm', '.mkv', '.avi', '.mov', '.flv', '.wmv', '.m4v', '.mpg', '.mpeg', '.ts', '.m2ts', '.mts',
+ '.3gp', '.3g2', '.ogv',
+ '.mp3', '.m4a', '.aac', '.ogg', '.oga', '.opus', '.wav', '.flac', '.alac', '.aiff', '.wma', '.mka', '.ac3', '.eac3', '.dts',
+}
+
+
+def _count_media_files(result) -> int:
+ try:
+ output_files = getattr(result, 'output_files', None) or {}
+ except Exception:
+ output_files = {}
+
+ count_from_output = 0
+ if output_files:
+ count_from_output = sum(
+ 1
+ for path in output_files.keys()
+ if Path(path).suffix.lower() in _MEDIA_FILE_EXTS
+ )
+ if count_from_output >= 2:
+ return count_from_output
+
+ try:
+ plugin_dir = Path(result.snapshot_dir) / result.plugin
+ except Exception:
+ return 0
+
+ if not plugin_dir.exists():
+ return 0
+
+ count = 0
+ scanned = 0
+ max_scan = 500
+ for file_path in plugin_dir.rglob('*'):
+ if scanned >= max_scan:
+ break
+ scanned += 1
+ if not file_path.is_file():
+ continue
+ if file_path.suffix.lower() in _MEDIA_FILE_EXTS:
+ count += 1
+ return max(count_from_output, count)
+
+
+def _list_media_files(result) -> list[dict]:
+ media_files: list[dict] = []
+ try:
+ plugin_dir = Path(result.snapshot_dir) / result.plugin
+ snapshot_dir = Path(result.snapshot_dir)
+ except Exception:
+ return media_files
+
+ output_files = getattr(result, 'output_files', None) or {}
+ candidates: list[Path] = []
+ if output_files:
+ for path in output_files.keys():
+ rel_path = Path(path)
+ if rel_path.suffix.lower() in _MEDIA_FILE_EXTS:
+ candidates.append(rel_path)
+
+ if not candidates and plugin_dir.exists():
+ scanned = 0
+ max_scan = 2000
+ for file_path in plugin_dir.rglob('*'):
+ if scanned >= max_scan:
+ break
+ scanned += 1
+ if not file_path.is_file():
+ continue
+ if file_path.suffix.lower() in _MEDIA_FILE_EXTS:
+ try:
+ rel_path = file_path.relative_to(plugin_dir)
+ except ValueError:
+ continue
+ candidates.append(rel_path)
+
+ for rel_path in candidates:
+ file_path = plugin_dir / rel_path
+ if not file_path.exists() or not file_path.is_file():
+ continue
+ try:
+ size = file_path.stat().st_size
+ except OSError:
+ size = None
+ try:
+ href = str(file_path.relative_to(snapshot_dir))
+ except ValueError:
+ href = str(Path(result.plugin) / rel_path)
+ media_files.append({
+ 'name': file_path.name,
+ 'path': href,
+ 'size': size,
+ })
+
+ media_files.sort(key=lambda item: item['name'].lower())
+ return media_files
+
@register.filter(name='split')
def split(value, separator: str=','):
return (value or '').split(separator)
@@ -52,6 +156,28 @@ def url_replace(context, **kwargs):
return dict_.urlencode()
+@register.simple_tag(takes_context=True)
+def admin_base_url(context) -> str:
+ return get_admin_base_url(request=context.get('request'))
+
+
+@register.simple_tag(takes_context=True)
+def web_base_url(context) -> str:
+ return get_web_base_url(request=context.get('request'))
+
+
+@register.simple_tag(takes_context=True)
+def snapshot_base_url(context, snapshot) -> str:
+ snapshot_id = getattr(snapshot, 'id', snapshot)
+ return get_snapshot_base_url(str(snapshot_id), request=context.get('request'))
+
+
+@register.simple_tag(takes_context=True)
+def snapshot_url(context, snapshot, path: str = "") -> str:
+ snapshot_id = getattr(snapshot, 'id', snapshot)
+ return build_snapshot_url(str(snapshot_id), path, request=context.get('request'))
+
+
@register.simple_tag
def plugin_icon(plugin: str) -> str:
"""
@@ -82,24 +208,41 @@ def plugin_card(context, result) -> str:
template_str = get_plugin_template(plugin, 'card')
# Use embed_path() for the display path
- output_path = result.embed_path() if hasattr(result, 'embed_path') else ''
+ raw_output_path = result.embed_path() if hasattr(result, 'embed_path') else ''
+ output_url = build_snapshot_url(
+ str(getattr(result, 'snapshot_id', '')),
+ raw_output_path or '',
+ request=context.get('request'),
+ )
icon_html = get_plugin_icon(plugin)
+ plugin_lower = (plugin or '').lower()
+ media_file_count = _count_media_files(result) if plugin_lower in ('ytdlp', 'yt-dlp', 'youtube-dl') else 0
+ media_files = _list_media_files(result) if plugin_lower in ('ytdlp', 'yt-dlp', 'youtube-dl') else []
+ if media_files:
+ snapshot_id = str(getattr(result, 'snapshot_id', ''))
+ request = context.get('request')
+ for item in media_files:
+ path = item.get('path') or ''
+ item['url'] = build_snapshot_url(snapshot_id, path, request=request) if path else ''
- output_lower = (output_path or '').lower()
+ output_lower = (raw_output_path or '').lower()
text_preview_exts = ('.json', '.jsonl', '.txt', '.csv', '.tsv', '.xml', '.yml', '.yaml', '.md', '.log')
force_text_preview = output_lower.endswith(text_preview_exts)
# Create a mini template and render it with context
try:
- if template_str and output_path and str(output_path).strip() not in ('.', '/', './') and not force_text_preview:
+ if template_str and raw_output_path and str(raw_output_path).strip() not in ('.', '/', './') and not force_text_preview:
tpl = template.Template(template_str)
ctx = template.Context({
'result': result,
'snapshot': result.snapshot,
- 'output_path': output_path,
+ 'output_path': output_url,
+ 'output_path_raw': raw_output_path,
'plugin': plugin,
'plugin_icon': icon_html,
+ 'media_file_count': media_file_count,
+ 'media_files': media_files,
})
rendered = tpl.render(ctx)
# Only return non-empty content (strip whitespace to check)
@@ -108,10 +251,10 @@ def plugin_card(context, result) -> str:
except Exception:
pass
- if force_text_preview and output_path and str(output_path).strip() not in ('.', '/', './'):
- output_file = Path(output_path)
+ if force_text_preview and raw_output_path and str(raw_output_path).strip() not in ('.', '/', './'):
+ output_file = Path(raw_output_path)
if not output_file.is_absolute():
- output_file = Path(result.snapshot_dir) / output_path
+ output_file = Path(result.snapshot_dir) / raw_output_path
try:
output_file = output_file.resolve()
snap_dir = Path(result.snapshot_dir).resolve()
@@ -169,14 +312,20 @@ def plugin_full(context, result) -> str:
if not template_str:
return ''
- output_path = result.embed_path() if hasattr(result, 'embed_path') else ''
+ raw_output_path = result.embed_path() if hasattr(result, 'embed_path') else ''
+ output_url = build_snapshot_url(
+ str(getattr(result, 'snapshot_id', '')),
+ raw_output_path or '',
+ request=context.get('request'),
+ )
try:
tpl = template.Template(template_str)
ctx = template.Context({
'result': result,
'snapshot': result.snapshot,
- 'output_path': output_path,
+ 'output_path': output_url,
+ 'output_path_raw': raw_output_path,
'plugin': plugin,
})
rendered = tpl.render(ctx)
@@ -198,3 +347,30 @@ def plugin_name(value: str) -> str:
Usage: {{ result.plugin|plugin_name }}
"""
return get_plugin_name(value)
+
+
+@register.filter
+def plugin_display_name(value: str) -> str:
+    """
+    Human-friendly plugin name overrides for UI display.
+
+    Currently the only override maps 'merkletree' -> 'hashes'; every other
+    plugin keeps the canonical name returned by get_plugin_name().
+    """
+    name = get_plugin_name(value)
+    if name == 'merkletree':
+        return 'hashes'
+    return name
+
+
+@register.simple_tag(takes_context=True)
+def api_token(context) -> str:
+    """
+    Return an API token string for the logged-in user, creating one if needed.
+
+    Returns '' for anonymous users or when no token could be obtained.
+    """
+    # Imported lazily here, presumably to avoid an import cycle between the
+    # templatetags module and the API app — TODO confirm.
+    from archivebox.api.auth import get_or_create_api_token
+
+    request = context.get('request')
+    user = getattr(request, 'user', None)
+    if not user or not user.is_authenticated:
+        return ''
+
+    token = get_or_create_api_token(user)
+    return token.token if token else ''
diff --git a/archivebox/core/urls.py b/archivebox/core/urls.py
index 708705a6..92f106e1 100644
--- a/archivebox/core/urls.py
+++ b/archivebox/core/urls.py
@@ -8,7 +8,7 @@ from django.views.generic.base import RedirectView
from archivebox.misc.serve_static import serve_static
from archivebox.core.admin_site import archivebox_admin
-from archivebox.core.views import HomepageView, SnapshotView, SnapshotPathView, PublicIndexView, AddView, HealthCheckView, live_progress_view
+from archivebox.core.views import HomepageView, SnapshotView, SnapshotPathView, PublicIndexView, AddView, WebAddView, HealthCheckView, live_progress_view
from archivebox.workers.views import JobsDashboardView
@@ -29,11 +29,15 @@ urlpatterns = [
path('docs/', RedirectView.as_view(url='https://github.com/ArchiveBox/ArchiveBox/wiki'), name='Docs'),
path('public/', PublicIndexView.as_view(), name='public-index'),
+ path('public.html', RedirectView.as_view(url='/public/'), name='public-index-html'),
path('archive/', RedirectView.as_view(url='/')),
 path('archive/<path:path>', SnapshotView.as_view(), name='Snapshot'),
+    re_path(r'^web/(?P<url>(?!\d{4}(?:\d{2})?(?:\d{2})?(?:/|$)).+)$', WebAddView.as_view(), name='web-add'),
 re_path(r'^(?P<username>[^/]+)/(?P<date>\d{4}(?:\d{2})?(?:\d{2})?)/(?P<url>https?://.*)$', SnapshotPathView.as_view(), name='snapshot-path-url'),
 re_path(r'^(?P<username>[^/]+)/(?P<date>\d{4}(?:\d{2})?(?:\d{2})?)/(?P<domain>[^/]+)(?:/(?P<snapshot_id>[0-9a-fA-F-]{8,36})(?:/(?P<path>.*))?)?$', SnapshotPathView.as_view(), name='snapshot-path'),
+    re_path(r'^(?P<username>[^/]+)/(?P<url>https?://.*)$', SnapshotPathView.as_view(), name='snapshot-path-url-nodate'),
+    re_path(r'^(?P<username>[^/]+)/(?P<domain>[^/]+)(?:/(?P<snapshot_id>[0-9a-fA-F-]{8,36})(?:/(?P<path>.*))?)?$', SnapshotPathView.as_view(), name='snapshot-path-nodate'),
path('admin/core/snapshot/add/', RedirectView.as_view(url='/add/')),
path('add/', AddView.as_view(), name='add'),
diff --git a/archivebox/core/views.py b/archivebox/core/views.py
index 4dd7afea..42ec421c 100644
--- a/archivebox/core/views.py
+++ b/archivebox/core/views.py
@@ -1,13 +1,16 @@
__package__ = 'archivebox.core'
import os
+import posixpath
+from glob import glob, escape
from django.utils import timezone
import inspect
from typing import Callable, get_type_hints
from pathlib import Path
+from urllib.parse import urlparse
from django.shortcuts import render, redirect
-from django.http import HttpRequest, HttpResponse, Http404
+from django.http import HttpRequest, HttpResponse, Http404, HttpResponseForbidden
from django.utils.html import format_html, mark_safe
from django.views import View
from django.views.generic.list import ListView
@@ -31,6 +34,21 @@ from archivebox.misc.logging_util import printable_filesize
from archivebox.search import query_search_index
from archivebox.core.models import Snapshot
+from archivebox.core.host_utils import build_snapshot_url
+
+
+def _files_index_target(snapshot: Snapshot, archivefile: str | None) -> str:
+    """Resolve the directory to show for a ?files= directory-index request.
+
+    Returns '' (snapshot root) for an empty or 'index.html' target; when the
+    target points at a concrete file, returns the file's parent directory so a
+    directory listing can be rendered instead.
+    """
+    target = archivefile or ''
+    if target == 'index.html':
+        target = ''
+    fullpath = Path(snapshot.output_dir) / target
+    if fullpath.is_file():
+        # A file was requested: list its containing directory instead.
+        target = str(Path(target).parent)
+        if target == '.':
+            target = ''
+    return target
+
+
from archivebox.core.forms import AddLinkForm
from archivebox.crawls.models import Crawl
from archivebox.hooks import get_enabled_plugins, get_plugin_name
@@ -86,13 +104,95 @@ class SnapshotView(View):
def render_live_index(request, snapshot):
TITLE_LOADING_MSG = 'Not yet archived...'
- outputs = snapshot.discover_outputs()
+ hidden_card_plugins = {'archivedotorg', 'favicon', 'title'}
+ outputs = [
+ out for out in snapshot.discover_outputs()
+ if (out.get('size') or 0) > 0 and out.get('name') not in hidden_card_plugins
+ ]
archiveresults = {out['name']: out for out in outputs}
snap_dir = Path(snapshot.output_dir)
-
# Get available extractor plugins from hooks (sorted by numeric prefix for ordering)
# Convert to base names for display ordering
all_plugins = [get_plugin_name(e) for e in get_enabled_plugins()]
+ accounted_entries: set[str] = set()
+ for output in outputs:
+ output_name = output.get('name') or ''
+ if output_name:
+ accounted_entries.add(output_name)
+ output_path = output.get('path') or ''
+ if not output_path:
+ continue
+ parts = Path(output_path).parts
+ if parts:
+ accounted_entries.add(parts[0])
+
+ ignore_names = {
+ '.DS_Store',
+ 'index.html',
+ 'index.json',
+ 'index.jsonl',
+ 'favicon.ico',
+ }
+ ignored_suffixes = {'.log', '.pid', '.sh'}
+ max_loose_scan = 300
+
+ def has_meaningful_files(dir_path: Path) -> bool:
+ scanned = 0
+ for file_path in dir_path.rglob('*'):
+ scanned += 1
+ if scanned > max_loose_scan:
+ return True
+ if file_path.is_dir() or file_path.name.startswith('.'):
+ continue
+ if file_path.suffix.lower() in ignored_suffixes:
+ continue
+ try:
+ if file_path.stat().st_size == 0:
+ continue
+ except OSError:
+ continue
+ return True
+ return False
+
+ unaccounted_entries = []
+ if snap_dir.exists():
+ for entry in snap_dir.iterdir():
+ name = entry.name
+ if name.startswith('.') or name in ignore_names or name in accounted_entries:
+ continue
+ is_dir = entry.is_dir()
+ is_meaningful = False
+ size = None
+ if is_dir:
+ is_meaningful = has_meaningful_files(entry)
+ elif entry.is_file():
+ if entry.suffix.lower() not in ignored_suffixes:
+ try:
+ size = entry.stat().st_size
+ is_meaningful = size > 0
+ except OSError:
+ size = None
+ is_meaningful = False
+
+ unaccounted_entries.append({
+ 'name': name,
+ 'path': name,
+ 'is_dir': is_dir,
+ 'size': size,
+ 'is_meaningful': is_meaningful,
+ })
+
+ unaccounted_entries.sort(key=lambda item: item['name'].lower())
+ loose_items = [item for item in unaccounted_entries if item['is_meaningful']]
+ failed_exclude_suffixes = {'.json', '.jsonl', '.sh', '.log'}
+ failed_items = [
+ item for item in unaccounted_entries
+ if not item['is_meaningful']
+ and not (
+ not item['is_dir']
+ and Path(item['name']).suffix.lower() in failed_exclude_suffixes
+ )
+ ]
preview_priority = [
'singlefile',
'screenshot',
@@ -111,12 +211,48 @@ class SnapshotView(View):
break
snapshot_info = snapshot.to_dict(extended=True)
+ related_snapshots_qs = SnapshotView.find_snapshots_for_url(snapshot.url)
+ related_snapshots = list(
+ related_snapshots_qs.exclude(id=snapshot.id).order_by('-bookmarked_at', '-created_at', '-timestamp')[:25]
+ )
+ related_years_map: dict[int, list[Snapshot]] = {}
+ for snap in [snapshot, *related_snapshots]:
+ snap_dt = snap.bookmarked_at or snap.created_at or snap.downloaded_at
+ if not snap_dt:
+ continue
+ related_years_map.setdefault(snap_dt.year, []).append(snap)
+ related_years = []
+ for year, snaps in related_years_map.items():
+ snaps_sorted = sorted(
+ snaps,
+ key=lambda s: (s.bookmarked_at or s.created_at or s.downloaded_at or timezone.now()),
+ reverse=True,
+ )
+ related_years.append({
+ 'year': year,
+ 'latest': snaps_sorted[0],
+ 'snapshots': snaps_sorted,
+ })
+ related_years.sort(key=lambda item: item['year'], reverse=True)
try:
warc_path = 'warc/' + list(Path(snap_dir).glob('warc/*.warc.*'))[0].name
except IndexError:
warc_path = 'warc/'
+ ordered_outputs = sorted(
+ archiveresults.values(),
+ key=lambda r: all_types.index(r['name']) if r['name'] in all_types else -r['size'],
+ )
+ non_compact_outputs = [
+ out for out in ordered_outputs
+ if not out.get('is_compact') and not out.get('is_metadata')
+ ]
+ compact_outputs = [
+ out for out in ordered_outputs
+ if out.get('is_compact') or out.get('is_metadata')
+ ]
+
context = {
**snapshot_info,
'title': htmlencode(
@@ -131,9 +267,13 @@ class SnapshotView(View):
'oldest_archive_date': ts_to_date_str(snapshot.oldest_archive_date),
'warc_path': warc_path,
'PREVIEW_ORIGINALS': SERVER_CONFIG.PREVIEW_ORIGINALS,
- 'archiveresults': sorted(archiveresults.values(), key=lambda r: all_types.index(r['name']) if r['name'] in all_types else -r['size']),
+ 'archiveresults': [*non_compact_outputs, *compact_outputs],
'best_result': best_result,
'snapshot': snapshot, # Pass the snapshot object for template tags
+ 'related_snapshots': related_snapshots,
+ 'related_years': related_years,
+ 'loose_items': loose_items,
+ 'failed_items': failed_items,
}
return render(template_name='core/snapshot_live.html', request=request, context=context)
@@ -168,13 +308,20 @@ class SnapshotView(View):
target_path = f'{target_path}?{query}'
return redirect(target_path)
- if archivefile == 'index.html':
+ if request.GET.get('files'):
+ target_path = _files_index_target(snapshot, archivefile)
+ response = serve_static_with_byterange_support(
+ request, target_path, document_root=snapshot.output_dir, show_indexes=True,
+ )
+ elif archivefile == 'index.html':
# if they requested snapshot index, serve live rendered template instead of static html
response = self.render_live_index(request, snapshot)
else:
- response = serve_static_with_byterange_support(
- request, archivefile, document_root=snapshot.output_dir, show_indexes=True,
- )
+ target = build_snapshot_url(str(snapshot.id), archivefile, request=request)
+ query = request.META.get('QUERY_STRING')
+ if query:
+ target = f'{target}?{query}'
+ return redirect(target)
response["Link"] = f'<{snapshot.url}>; rel="canonical"'
return response
except Snapshot.DoesNotExist:
@@ -328,13 +475,16 @@ class SnapshotView(View):
class SnapshotPathView(View):
"""Serve snapshots by the new URL scheme: /////..."""
- def get(self, request, username: str, date: str, domain: str | None = None, snapshot_id: str | None = None, path: str = "", url: str | None = None):
+ def get(self, request, username: str, date: str | None = None, domain: str | None = None, snapshot_id: str | None = None, path: str = "", url: str | None = None):
if not request.user.is_authenticated and not SERVER_CONFIG.PUBLIC_SNAPSHOTS:
return redirect(f'/admin/login/?next={request.path}')
if username == 'system':
return redirect(request.path.replace('/system/', '/web/', 1))
+ if date and domain and domain == date:
+ raise Http404
+
requested_url = url
if not requested_url and domain and domain.startswith(('http://', 'https://')):
requested_url = domain
@@ -358,19 +508,20 @@ class SnapshotPathView(View):
else:
qs = Snapshot.objects.filter(crawl__created_by__username=username_lookup)
- try:
- if len(date) == 4:
- qs = qs.filter(created_at__year=int(date))
- elif len(date) == 6:
- qs = qs.filter(created_at__year=int(date[:4]), created_at__month=int(date[4:6]))
- elif len(date) == 8:
- qs = qs.filter(
- created_at__year=int(date[:4]),
- created_at__month=int(date[4:6]),
- created_at__day=int(date[6:8]),
- )
- except ValueError:
- pass
+ if date:
+ try:
+ if len(date) == 4:
+ qs = qs.filter(created_at__year=int(date))
+ elif len(date) == 6:
+ qs = qs.filter(created_at__year=int(date[:4]), created_at__month=int(date[4:6]))
+ elif len(date) == 8:
+ qs = qs.filter(
+ created_at__year=int(date[:4]),
+ created_at__month=int(date[4:6]),
+ created_at__day=int(date[6:8]),
+ )
+ except ValueError:
+ pass
if requested_url:
snapshot = qs.order_by('-created_at', '-bookmarked_at', '-timestamp').first()
@@ -401,7 +552,10 @@ class SnapshotPathView(View):
)
canonical_base = snapshot.url_path
- requested_base = f'{username}/{date}/{domain or url or ""}'
+ if date:
+ requested_base = f'{username}/{date}/{domain or url or ""}'
+ else:
+ requested_base = f'{username}/{domain or url or ""}'
if snapshot_id:
requested_base = f'{requested_base}/{snapshot_id}'
if canonical_base != requested_base:
@@ -412,6 +566,18 @@ class SnapshotPathView(View):
return redirect(target)
archivefile = path or "index.html"
+ if archivefile != "index.html" and not request.GET.get('files'):
+ target = build_snapshot_url(str(snapshot.id), archivefile, request=request)
+ query = request.META.get('QUERY_STRING')
+ if query:
+ target = f'{target}?{query}'
+ return redirect(target)
+
+ if request.GET.get('files'):
+ target_path = _files_index_target(snapshot, archivefile)
+ return serve_static_with_byterange_support(
+ request, target_path, document_root=snapshot.output_dir, show_indexes=True,
+ )
if archivefile == "index.html":
return SnapshotView.render_live_index(request, snapshot)
@@ -421,6 +587,202 @@ class SnapshotPathView(View):
)
+def _safe_archive_relpath(path: str) -> str | None:
+    """Normalize a user-supplied archive path, rejecting directory traversal.
+
+    Returns '' unchanged (meaning the archive root), the normalized relative
+    path on success, or None when the path would escape the document root.
+    """
+    if not path:
+        return ""
+    cleaned = posixpath.normpath(path)
+    cleaned = cleaned.lstrip("/")
+    # Reject any parent-directory traversal that survives normalization.
+    if cleaned.startswith("..") or "/../" in f"/{cleaned}/":
+        return None
+    return cleaned
+
+
+def _latest_response_match(domain: str, rel_path: str) -> tuple[Path, Path] | None:
+    """Locate the newest archived responses/ copy of rel_path for a domain.
+
+    Globs DATA_DIR/users/*/snapshots/*/<domain>/*/responses/<domain>/<rel_path>
+    and returns (responses_root, path_relative_to_root) for the best match,
+    preferring the lexicographically-latest date directory under snapshots/.
+    Returns None when no archived copy exists.
+    """
+    if not domain or not rel_path:
+        return None
+    # Strip any :port suffix and lowercase to match on-disk directory naming.
+    domain = domain.split(":", 1)[0].lower()
+    # TODO: optimize by querying output_files in DB instead of globbing filesystem
+    data_root = DATA_DIR / "users"
+    # glob.escape() keeps wildcard characters in domain/path literal.
+    escaped_domain = escape(domain)
+    escaped_path = escape(rel_path)
+    pattern = str(data_root / "*" / "snapshots" / "*" / escaped_domain / "*" / "responses" / escaped_domain / escaped_path)
+    matches = glob(pattern)
+    if not matches:
+        return None
+
+    def sort_key(match_path: str) -> tuple[str, str]:
+        # Sort primarily by the date-named snapshots/<date> component,
+        # tiebreaking on the full path for determinism.
+        parts = Path(match_path).parts
+        date_str = ""
+        try:
+            idx = parts.index("snapshots")
+            date_str = parts[idx + 1]
+        except Exception:
+            date_str = ""
+        return (date_str, match_path)
+
+    best = max(matches, key=sort_key)
+    best_path = Path(best)
+    parts = best_path.parts
+    try:
+        responses_idx = parts.index("responses")
+    except ValueError:
+        return None
+    # Split the match into the responses/ root and the path below it.
+    responses_root = Path(*parts[: responses_idx + 1])
+    rel_to_root = Path(*parts[responses_idx + 1 :])
+    return responses_root, rel_to_root
+
+
+def _latest_responses_root(domain: str) -> Path | None:
+    """Return the responses/<domain> directory of the newest snapshot for a domain.
+
+    Same glob strategy as _latest_response_match but without a file path:
+    prefers the lexicographically-latest date directory under snapshots/.
+    Returns None when the domain has no archived responses.
+    """
+    if not domain:
+        return None
+    # Strip any :port suffix and lowercase to match on-disk directory naming.
+    domain = domain.split(":", 1)[0].lower()
+    data_root = DATA_DIR / "users"
+    # glob.escape() keeps wildcard characters in the domain literal.
+    escaped_domain = escape(domain)
+    pattern = str(data_root / "*" / "snapshots" / "*" / escaped_domain / "*" / "responses" / escaped_domain)
+    matches = glob(pattern)
+    if not matches:
+        return None
+
+    def sort_key(match_path: str) -> tuple[str, str]:
+        # Prefer the date-named snapshots/<date> component; tiebreak on path.
+        parts = Path(match_path).parts
+        date_str = ""
+        try:
+            idx = parts.index("snapshots")
+            date_str = parts[idx + 1]
+        except Exception:
+            date_str = ""
+        return (date_str, match_path)
+
+    best = max(matches, key=sort_key)
+    return Path(best)
+
+
+def _serve_responses_path(request, responses_root: Path, rel_path: str, show_indexes: bool):
+    """Serve rel_path from a responses/ tree, trying index.html fallbacks.
+
+    Candidate order: a trailing '/' implies index.html; for extensionless leaf
+    names, '<rel_path>/index.html' is tried before rel_path itself. If an
+    index.html candidate 404s, fall back to listing its parent directory.
+    Returns an HttpResponse, or None when nothing under responses_root matched.
+    """
+    candidates: list[str] = []
+    rel_path = rel_path or ""
+    if rel_path.endswith("/"):
+        rel_path = f"{rel_path}index.html"
+    if "." not in Path(rel_path).name:
+        # Extensionless leaf: prefer a nested index.html first.
+        candidates.append(f"{rel_path.rstrip('/')}/index.html")
+    candidates.append(rel_path)
+
+    for candidate in candidates:
+        try:
+            return serve_static_with_byterange_support(
+                request,
+                candidate,
+                document_root=str(responses_root),
+                show_indexes=show_indexes,
+            )
+        except Http404:
+            pass
+
+    if rel_path.endswith("index.html"):
+        # Last resort: list the directory that should have held index.html.
+        rel_dir = rel_path[: -len("index.html")]
+        try:
+            return serve_static_with_byterange_support(
+                request,
+                rel_dir,
+                document_root=str(responses_root),
+                show_indexes=True,
+            )
+        except Http404:
+            return None
+    return None
+
+
+class SnapshotHostView(View):
+    """Serve snapshot directory contents on a per-snapshot host.
+
+    NOTE(review): the original docstring reads 'on ./.' — angle-bracketed
+    placeholders (presumably '<snapshot_id>.<base_domain>') appear to have
+    been stripped by extraction; confirm against the host-routing config.
+    """
+
+    def get(self, request, snapshot_id: str, path: str = ""):
+        # Anonymous access is only allowed when PUBLIC_SNAPSHOTS is enabled.
+        if not request.user.is_authenticated and not SERVER_CONFIG.PUBLIC_SNAPSHOTS:
+            return HttpResponseForbidden("Public snapshots are disabled.")
+        snapshot = None
+        if snapshot_id:
+            try:
+                snapshot = Snapshot.objects.get(pk=snapshot_id)
+            except Snapshot.DoesNotExist:
+                # Fall back to id-prefix matching so abbreviated ids resolve;
+                # on ambiguity, pick the first prefix match.
+                try:
+                    snapshot = Snapshot.objects.get(id__startswith=snapshot_id)
+                except Snapshot.DoesNotExist:
+                    snapshot = None
+                except Snapshot.MultipleObjectsReturned:
+                    snapshot = Snapshot.objects.filter(id__startswith=snapshot_id).first()
+
+        if not snapshot:
+            raise Http404
+
+        rel_path = path or ""
+        show_indexes = bool(request.GET.get("files"))
+        if not rel_path or rel_path.endswith("/"):
+            if show_indexes:
+                rel_path = rel_path.rstrip("/")
+            else:
+                # Directory requests default to index.html unless listing files.
+                rel_path = f"{rel_path}index.html"
+        rel_path = _safe_archive_relpath(rel_path)
+        if rel_path is None:
+            # Path attempted to escape the snapshot directory.
+            raise Http404
+
+        try:
+            return serve_static_with_byterange_support(
+                request,
+                rel_path,
+                document_root=snapshot.output_dir,
+                show_indexes=show_indexes,
+            )
+        except Http404:
+            pass
+
+        # Fallback to the responses/<host>/ tree captured during archiving.
+        host = urlparse(snapshot.url).hostname or snapshot.domain
+        responses_root = Path(snapshot.output_dir) / "responses" / host
+        if responses_root.exists():
+            response = _serve_responses_path(request, responses_root, rel_path, show_indexes)
+            if response is not None:
+                return response
+
+        raise Http404
+
+
+class OriginalDomainHostView(View):
+    """Serve responses from the most recent snapshot of a domain host.
+
+    NOTE(review): the original docstring reads 'using ./.' — placeholders
+    (presumably '<domain>.<base_domain>') appear to have been stripped by
+    extraction; confirm against the host-routing config.
+    """
+
+    def get(self, request, domain: str, path: str = ""):
+        # Anonymous access is only allowed when PUBLIC_SNAPSHOTS is enabled.
+        if not request.user.is_authenticated and not SERVER_CONFIG.PUBLIC_SNAPSHOTS:
+            return HttpResponseForbidden("Public snapshots are disabled.")
+        rel_path = path or ""
+        if not rel_path or rel_path.endswith("/"):
+            # Directory requests default to index.html.
+            rel_path = f"{rel_path}index.html"
+        rel_path = _safe_archive_relpath(rel_path)
+        if rel_path is None:
+            raise Http404
+
+        domain = domain.lower()
+        match = _latest_response_match(domain, rel_path)
+        if not match and "." not in Path(rel_path).name:
+            # Extensionless path: try it as a directory index first...
+            index_path = f"{rel_path.rstrip('/')}/index.html"
+            match = _latest_response_match(domain, index_path)
+        if not match and "." not in Path(rel_path).name:
+            # ...then as an .html document.
+            html_path = f"{rel_path}.html"
+            match = _latest_response_match(domain, html_path)
+
+        show_indexes = bool(request.GET.get("files"))
+        if match:
+            responses_root, rel_to_root = match
+            response = _serve_responses_path(request, responses_root, str(rel_to_root), show_indexes)
+            if response is not None:
+                return response
+
+        # If no direct match, try serving directory index from latest responses root
+        responses_root = _latest_responses_root(domain)
+        if responses_root:
+            response = _serve_responses_path(request, responses_root, rel_path, show_indexes)
+            if response is not None:
+                return response
+
+        raise Http404
+
+
class PublicIndexView(ListView):
template_name = 'public_index.html'
model = Snapshot
@@ -508,7 +870,7 @@ class AddView(UserPassesTestMixin, FormView):
'available_tags': list(Tag.objects.all().order_by('name').values_list('name', flat=True)),
}
- def form_valid(self, form):
+ def _create_crawl_from_form(self, form, *, created_by_id=None) -> Crawl:
urls = form.cleaned_data["url"]
print(f'[+] Adding URL: {urls}')
@@ -522,13 +884,21 @@ class AddView(UserPassesTestMixin, FormView):
update = form.cleaned_data.get("update", False)
index_only = form.cleaned_data.get("index_only", False)
notes = form.cleaned_data.get("notes", "")
- custom_config = form.cleaned_data.get("config", {})
+ custom_config = form.cleaned_data.get("config") or {}
from archivebox.config.permissions import HOSTNAME
+ if created_by_id is None:
+ if self.request.user.is_authenticated:
+ created_by_id = self.request.user.pk
+ else:
+ from archivebox.base_models.models import get_or_create_system_user_pk
+ created_by_id = get_or_create_system_user_pk()
+
+ created_by_name = self.request.user.username if self.request.user.is_authenticated else 'web'
# 1. save the provided urls to sources/2024-11-05__23-59-59__web_ui_add_by_user_.txt
- sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__web_ui_add_by_user_{self.request.user.pk}.txt'
+ sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__web_ui_add_by_user_{created_by_id}.txt'
sources_file.write_text(urls if isinstance(urls, str) else '\n'.join(urls))
# 2. create a new Crawl with the URLs from the file
@@ -552,8 +922,8 @@ class AddView(UserPassesTestMixin, FormView):
max_depth=depth,
tags_str=tag,
notes=notes,
- label=f'{self.request.user.username}@{HOSTNAME}{self.request.path} {timestamp}',
- created_by_id=self.request.user.pk,
+ label=f'{created_by_name}@{HOSTNAME}{self.request.path} {timestamp}',
+ created_by_id=created_by_id,
config=config
)
@@ -566,7 +936,7 @@ class AddView(UserPassesTestMixin, FormView):
is_enabled=True,
label=crawl.label,
notes=f"Auto-created from add page. {notes}".strip(),
- created_by_id=self.request.user.pk,
+ created_by_id=created_by_id,
)
crawl.schedule = crawl_schedule
crawl.save(update_fields=['schedule'])
@@ -576,7 +946,13 @@ class AddView(UserPassesTestMixin, FormView):
# from archivebox.crawls.actors import CrawlActor
# from archivebox.core.actors import SnapshotActor, ArchiveResultActor
+ return crawl
+ def form_valid(self, form):
+ crawl = self._create_crawl_from_form(form)
+
+ urls = form.cleaned_data["url"]
+ schedule = form.cleaned_data.get("schedule", "").strip()
rough_url_count = urls.count('://')
# Build success message with schedule link if created
@@ -593,6 +969,74 @@ class AddView(UserPassesTestMixin, FormView):
return redirect(crawl.admin_change_url)
+class WebAddView(AddView):
+    def _latest_snapshot_for_url(self, requested_url: str):
+        """Return the most recently created Snapshot matching requested_url, or None."""
+        return SnapshotView.find_snapshots_for_url(requested_url).order_by(
+            '-created_at', '-bookmarked_at', '-timestamp'
+        ).first()
+
+    def _normalize_add_url(self, requested_url: str) -> str:
+        """Ensure the URL carries a scheme; bare domains default to https://."""
+        if requested_url.startswith(('http://', 'https://')):
+            return requested_url
+        return f'https://{requested_url}'
+
+ def dispatch(self, request, *args, **kwargs):
+ requested_url = urldecode(kwargs.get('url', '') or '')
+ if requested_url:
+ snapshot = self._latest_snapshot_for_url(requested_url)
+ if snapshot:
+ return redirect(f'/{snapshot.url_path}')
+
+ if not self.test_func():
+ return HttpResponse(
+ format_html(
+ (
+ '
'
+ 'No Snapshots match the given url: {}
'
+ 'Return to the Main Index'
+ ' '
+ ),
+ requested_url or '',
+ ),
+ content_type="text/html",
+ status=404,
+ )
+
+ return super().dispatch(request, *args, **kwargs)
+
+    def get(self, request, url: str):
+        """Redirect to an existing snapshot of `url`, or queue a new crawl for it.
+
+        Builds a synthetic AddLinkForm submission from the form's defaults,
+        creates the crawl plus a pending Snapshot, then redirects to the
+        snapshot's URL path.
+        """
+        requested_url = urldecode(url)
+        if not requested_url:
+            raise Http404
+
+        snapshot = self._latest_snapshot_for_url(requested_url)
+        if snapshot:
+            # Already archived: go straight to the newest snapshot.
+            return redirect(f'/{snapshot.url_path}')
+
+        add_url = self._normalize_add_url(requested_url)
+        # Seed a synthetic form submission from the AddLinkForm field defaults.
+        defaults_form = self.form_class()
+        form_data = {
+            'url': add_url,
+            'depth': defaults_form.fields['depth'].initial or '0',
+            'persona': defaults_form.fields['persona'].initial or 'Default',
+            'config': {},
+        }
+        # Checkbox fields only appear in POST data when checked ('on').
+        if defaults_form.fields['update'].initial:
+            form_data['update'] = 'on'
+        if defaults_form.fields['overwrite'].initial:
+            form_data['overwrite'] = 'on'
+        if defaults_form.fields['index_only'].initial:
+            form_data['index_only'] = 'on'
+
+        form = self.form_class(data=form_data)
+        if not form.is_valid():
+            return self.form_invalid(form)
+
+        crawl = self._create_crawl_from_form(form)
+        # Create the pending Snapshot immediately so the redirect target exists.
+        snapshot = Snapshot.from_json({'url': add_url, 'tags': form.cleaned_data.get('tag', '')}, overrides={'crawl': crawl})
+        return redirect(f'/{snapshot.url_path}')
+
+
class HealthCheckView(View):
"""
A Django view that renders plain text "OK" for service discovery tools
@@ -617,11 +1061,19 @@ def live_progress_view(request):
from archivebox.workers.orchestrator import Orchestrator
from archivebox.crawls.models import Crawl
from archivebox.core.models import Snapshot, ArchiveResult
+ from archivebox.machine.models import Process, Machine
from django.db.models import Case, When, Value, IntegerField
# Get orchestrator status
orchestrator_running = Orchestrator.is_running()
total_workers = Orchestrator().get_total_worker_count() if orchestrator_running else 0
+ machine = Machine.current()
+ orchestrator_proc = Process.objects.filter(
+ machine=machine,
+ process_type=Process.TypeChoices.ORCHESTRATOR,
+ status=Process.StatusChoices.RUNNING,
+ ).order_by('-started_at').first()
+ orchestrator_pid = orchestrator_proc.pid if orchestrator_proc else None
# Get model counts by status
crawls_pending = Crawl.objects.filter(status=Crawl.StatusChoices.QUEUED).count()
@@ -653,24 +1105,47 @@ def live_progress_view(request):
ext = embed.lower().split('.')[-1] if '.' in embed else ''
is_embeddable = ext in ('png', 'jpg', 'jpeg', 'gif', 'webp', 'svg', 'ico', 'pdf', 'html')
if is_embeddable or ar.plugin in ('screenshot', 'favicon', 'dom'):
+ archive_path = embed or ''
recent_thumbnails.append({
'id': str(ar.id),
'plugin': ar.plugin,
'snapshot_id': str(ar.snapshot_id),
'snapshot_url': ar.snapshot.url[:60] if ar.snapshot else '',
'embed_path': embed,
- 'archive_path': f'/{ar.snapshot.archive_path}/{embed}' if ar.snapshot else '',
+ 'archive_path': archive_path,
+ 'archive_url': build_snapshot_url(str(ar.snapshot_id), archive_path, request=request) if archive_path else '',
'end_ts': ar.end_ts.isoformat() if ar.end_ts else None,
})
# Build hierarchical active crawls with nested snapshots and archive results
from django.db.models import Prefetch
+ running_workers = Process.objects.filter(
+ machine=machine,
+ process_type=Process.TypeChoices.WORKER,
+ status=Process.StatusChoices.RUNNING,
+ )
+ crawl_worker_pids: dict[str, int] = {}
+ snapshot_worker_pids: dict[str, int] = {}
+ for proc in running_workers:
+ env = proc.env or {}
+ if not isinstance(env, dict):
+ continue
+ if proc.worker_type == 'crawl':
+ crawl_id = env.get('CRAWL_ID')
+ if crawl_id:
+ crawl_worker_pids[str(crawl_id)] = proc.pid
+ elif proc.worker_type == 'snapshot':
+ snapshot_id = env.get('SNAPSHOT_ID')
+ if snapshot_id:
+ snapshot_worker_pids[str(snapshot_id)] = proc.pid
+
active_crawls_qs = Crawl.objects.filter(
status__in=[Crawl.StatusChoices.QUEUED, Crawl.StatusChoices.STARTED]
).prefetch_related(
'snapshot_set',
'snapshot_set__archiveresult_set',
+ 'snapshot_set__archiveresult_set__process',
).distinct().order_by('-modified_at')[:10]
active_crawls = []
@@ -710,8 +1185,9 @@ def live_progress_view(request):
failed_plugins = sum(1 for ar in snapshot_results if ar.status == ArchiveResult.StatusChoices.FAILED)
pending_plugins = sum(1 for ar in snapshot_results if ar.status == ArchiveResult.StatusChoices.QUEUED)
- # Calculate snapshot progress
- snapshot_progress = int(((completed_plugins + failed_plugins) / total_plugins) * 100) if total_plugins > 0 else 0
+ # Calculate snapshot progress using per-plugin progress
+ now = timezone.now()
+ plugin_progress_values: list[int] = []
# Get all extractor plugins for this snapshot (already prefetched, sort in Python)
# Order: started first, then queued, then completed
@@ -724,14 +1200,42 @@ def live_progress_view(request):
}
return (status_order.get(ar.status, 4), ar.plugin)
- all_plugins = [
- {
+ all_plugins = []
+ for ar in sorted(snapshot_results, key=plugin_sort_key):
+ status = ar.status
+ progress_value = 0
+ if status in (
+ ArchiveResult.StatusChoices.SUCCEEDED,
+ ArchiveResult.StatusChoices.FAILED,
+ ArchiveResult.StatusChoices.SKIPPED,
+ ):
+ progress_value = 100
+ elif status == ArchiveResult.StatusChoices.STARTED:
+ started_at = ar.start_ts or (ar.process.started_at if ar.process_id and ar.process else None)
+ timeout = ar.timeout or 120
+ if started_at and timeout:
+ elapsed = max(0.0, (now - started_at).total_seconds())
+ progress_value = int(min(99, max(1, (elapsed / float(timeout)) * 100)))
+ else:
+ progress_value = 1
+ else:
+ progress_value = 0
+
+ plugin_progress_values.append(progress_value)
+
+ plugin_payload = {
'id': str(ar.id),
'plugin': ar.plugin,
- 'status': ar.status,
+ 'status': status,
}
- for ar in sorted(snapshot_results, key=plugin_sort_key)
- ]
+ if ar.process_id and ar.process and ar.process.status == Process.StatusChoices.RUNNING:
+ plugin_payload['pid'] = ar.process.pid
+ if status == ArchiveResult.StatusChoices.STARTED:
+ plugin_payload['progress'] = progress_value
+ plugin_payload['timeout'] = ar.timeout or 120
+ all_plugins.append(plugin_payload)
+
+ snapshot_progress = int(sum(plugin_progress_values) / total_plugins) if total_plugins > 0 else 0
active_snapshots_for_crawl.append({
'id': str(snapshot.id),
@@ -744,6 +1248,7 @@ def live_progress_view(request):
'failed_plugins': failed_plugins,
'pending_plugins': pending_plugins,
'all_plugins': all_plugins,
+ 'worker_pid': snapshot_worker_pids.get(str(snapshot.id)),
})
# Check if crawl can start (for debugging stuck crawls)
@@ -772,10 +1277,12 @@ def live_progress_view(request):
'urls_preview': urls_preview,
'retry_at_future': retry_at_future,
'seconds_until_retry': seconds_until_retry,
+ 'worker_pid': crawl_worker_pids.get(str(crawl.id)),
})
return JsonResponse({
'orchestrator_running': orchestrator_running,
+ 'orchestrator_pid': orchestrator_pid,
'total_workers': total_workers,
'crawls_pending': crawls_pending,
'crawls_started': crawls_started,
diff --git a/archivebox/core/widgets.py b/archivebox/core/widgets.py
index 433f5c93..bbbceaa7 100644
--- a/archivebox/core/widgets.py
+++ b/archivebox/core/widgets.py
@@ -1,8 +1,11 @@
__package__ = 'archivebox.core'
import json
+import re
+import hashlib
from django import forms
from django.utils.html import escape
+from django.utils.safestring import mark_safe
class TagEditorWidget(forms.Widget):
@@ -27,6 +30,23 @@ class TagEditorWidget(forms.Widget):
"""Escape HTML entities in value."""
return escape(str(value)) if value else ''
def _normalize_id(self, value):
    """Normalize IDs for HTML + JS usage (letters, digits, underscore; JS-safe start)."""
    # Replace every character outside [A-Za-z0-9_] with an underscore.
    sanitized = re.sub(r'[^A-Za-z0-9_]', '_', str(value))
    # After sanitizing, the only unsafe leading characters are digits (or
    # nothing at all) — prefix those so the result is a valid JS identifier.
    if not sanitized or sanitized[0].isdigit():
        sanitized = f't_{sanitized}'
    return sanitized
+
def _tag_style(self, value):
    """Compute a stable pastel color style for a tag value."""
    # Hash the normalized tag name so the same tag always gets the same hue.
    normalized = (value or '').strip().lower()
    digest = hashlib.md5(normalized.encode('utf-8')).hexdigest()
    hue = int(digest[:4], 16) % 360
    # Emit CSS custom properties consumed by the pill styling.
    parts = (
        f'--tag-bg: hsl({hue}, 70%, 92%);',
        f'--tag-border: hsl({hue}, 60%, 82%);',
        f'--tag-fg: hsl({hue}, 35%, 28%);',
    )
    return ' '.join(parts)
+
def render(self, name, value, attrs=None, renderer=None):
"""
Render the tag editor widget.
@@ -67,13 +87,14 @@ class TagEditorWidget(forms.Widget):
elif isinstance(value, str):
tags = sorted([t.strip() for t in value.split(',') if t.strip()])
- widget_id = attrs.get('id', name) if attrs else name
+ widget_id_raw = attrs.get('id', name) if attrs else name
+ widget_id = self._normalize_id(widget_id_raw)
# Build pills HTML
pills_html = ''
for tag in tags:
pills_html += f'''
-
+
{self._escape(tag)}
@@ -92,6 +113,7 @@ class TagEditorWidget(forms.Widget):
placeholder="Add tag..."
autocomplete="off"
onkeydown="handleTagKeydown_{widget_id}(event)"
+ onkeypress="if(event.key==='Enter' || event.keyCode===13){{event.preventDefault(); event.stopPropagation();}}"
oninput="fetchTagAutocomplete_{widget_id}(this.value)"
>
@@ -112,6 +134,47 @@ class TagEditorWidget(forms.Widget):
document.getElementById('{widget_id}').value = currentTags_{widget_id}.join(',');
}};
+ function computeTagStyle_{widget_id}(tagName) {{
+ var hash = 0;
+ var name = String(tagName || '').toLowerCase();
+ for (var i = 0; i < name.length; i++) {{
+ hash = (hash * 31 + name.charCodeAt(i)) % 360;
+ }}
+ var bg = 'hsl(' + hash + ', 70%, 92%)';
+ var border = 'hsl(' + hash + ', 60%, 82%)';
+ var fg = 'hsl(' + hash + ', 35%, 28%)';
+ return {{ bg: bg, border: border, fg: fg }};
+ }}
+
+ function applyTagStyle_{widget_id}(el, tagName) {{
+ var colors = computeTagStyle_{widget_id}(tagName);
+ el.style.setProperty('--tag-bg', colors.bg);
+ el.style.setProperty('--tag-border', colors.border);
+ el.style.setProperty('--tag-fg', colors.fg);
+ }}
+
+ function getApiKey() {{
+ return (window.ARCHIVEBOX_API_KEY || '').trim();
+ }}
+
+ function buildApiUrl(path) {{
+ var apiKey = getApiKey();
+ if (!apiKey) return path;
+ var sep = path.indexOf('?') !== -1 ? '&' : '?';
+ return path + sep + 'api_key=' + encodeURIComponent(apiKey);
+ }}
+
+ function buildApiHeaders() {{
+ var headers = {{
+ 'Content-Type': 'application/json',
+ }};
+ var apiKey = getApiKey();
+ if (apiKey) headers['X-ArchiveBox-API-Key'] = apiKey;
+ var csrfToken = getCSRFToken();
+ if (csrfToken) headers['X-CSRFToken'] = csrfToken;
+ return headers;
+ }}
+
window.addTag_{widget_id} = function(tagName) {{
tagName = tagName.trim();
if (!tagName) return;
@@ -139,12 +202,9 @@ class TagEditorWidget(forms.Widget):
document.getElementById('{widget_id}_input').value = '';
// Create tag via API if it doesn't exist (fire and forget)
- fetch('/api/v1/core/tags/create/', {{
+ fetch(buildApiUrl('/api/v1/core/tags/create/'), {{
method: 'POST',
- headers: {{
- 'Content-Type': 'application/json',
- 'X-CSRFToken': getCSRFToken()
- }},
+ headers: buildApiHeaders(),
body: JSON.stringify({{ name: tagName }})
}}).catch(function(err) {{
console.log('Tag creation note:', err);
@@ -166,6 +226,7 @@ class TagEditorWidget(forms.Widget):
var pill = document.createElement('span');
pill.className = 'tag-pill';
pill.setAttribute('data-tag', tag);
+ applyTagStyle_{widget_id}(pill, tag);
var tagText = document.createTextNode(tag);
pill.appendChild(tagText);
@@ -195,14 +256,16 @@ class TagEditorWidget(forms.Widget):
var input = event.target;
var value = input.value.trim();
- if (event.key === 'Enter' || event.key === ' ' || event.key === ',') {{
+ if (event.key === 'Enter' || event.keyCode === 13 || event.key === ' ' || event.key === ',') {{
event.preventDefault();
+ event.stopPropagation();
if (value) {{
// Handle comma-separated values
value.split(',').forEach(function(tag) {{
addTag_{widget_id}(tag.trim());
}});
}}
+ return false;
}} else if (event.key === 'Backspace' && !value && currentTags_{widget_id}.length > 0) {{
// Remove last tag on backspace when input is empty
var lastTag = currentTags_{widget_id}.pop();
@@ -222,7 +285,7 @@ class TagEditorWidget(forms.Widget):
return;
}}
- fetch('/api/v1/core/tags/autocomplete/?q=' + encodeURIComponent(query))
+ fetch(buildApiUrl('/api/v1/core/tags/autocomplete/?q=' + encodeURIComponent(query)))
.then(function(response) {{ return response.json(); }})
.then(function(data) {{
var datalist = document.getElementById('{widget_id}_datalist');
@@ -261,7 +324,7 @@ class TagEditorWidget(forms.Widget):
'''
- return html
+ return mark_safe(html)
class InlineTagEditorWidget(TagEditorWidget):
@@ -295,20 +358,23 @@ class InlineTagEditorWidget(TagEditorWidget):
tag_data.sort(key=lambda x: x['name'].lower())
tags = [t['name'] for t in tag_data]
- widget_id = f"inline_tags_{snapshot_id}" if snapshot_id else (attrs.get('id', name) if attrs else name)
+ widget_id_raw = f"inline_tags_{snapshot_id}" if snapshot_id else (attrs.get('id', name) if attrs else name)
+ widget_id = self._normalize_id(widget_id_raw)
# Build pills HTML with filter links
pills_html = ''
for td in tag_data:
pills_html += f'''
-
+
{self._escape(td['name'])}
'''
+ tags_json = escape(json.dumps(tag_data))
+
html = f'''
-
+
{pills_html}
@@ -318,195 +384,10 @@ class InlineTagEditorWidget(TagEditorWidget):
list="{widget_id}_datalist"
placeholder="+"
autocomplete="off"
- onkeydown="handleInlineTagKeydown_{widget_id}(event)"
- oninput="fetchInlineTagAutocomplete_{widget_id}(this.value)"
- onfocus="this.placeholder='add tag...'"
- onblur="this.placeholder='+'"
+ data-inline-tag-input="1"
>
-
-
'''
- return html
+ return mark_safe(html)
diff --git a/archivebox/hooks.py b/archivebox/hooks.py
index 08cedf0f..b8429c11 100644
--- a/archivebox/hooks.py
+++ b/archivebox/hooks.py
@@ -62,6 +62,7 @@ import json
import signal
import time
import subprocess
+from functools import lru_cache
from pathlib import Path
from typing import List, Dict, Any, Optional, TypedDict
@@ -255,6 +256,7 @@ def run_hook(
records = process.get_records() # Get parsed JSONL output
"""
from archivebox.machine.models import Process, Machine
+ from archivebox.config.constants import CONSTANTS
import time
import sys
start_time = time.time()
@@ -264,6 +266,8 @@ def run_hook(
plugin_name = script.parent.name
plugin_config = get_plugin_special_config(plugin_name, config)
timeout = plugin_config['timeout']
+ if timeout:
+ timeout = min(int(timeout), int(CONSTANTS.MAX_HOOK_RUNTIME_SECONDS))
# Get current machine
machine = Machine.current()
@@ -568,6 +572,7 @@ def run_hooks(
return results
+@lru_cache(maxsize=1)
def get_plugins() -> List[str]:
"""
Get list of available plugins by discovering Snapshot hooks.
@@ -988,6 +993,8 @@ def get_plugin_template(plugin: str, template_name: str, fallback: bool = True)
Template content as string, or None if not found and fallback=False.
"""
base_name = get_plugin_name(plugin)
+ if base_name in ('yt-dlp', 'youtube-dl'):
+ base_name = 'ytdlp'
for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
if not base_dir.exists():
@@ -1011,6 +1018,7 @@ def get_plugin_template(plugin: str, template_name: str, fallback: bool = True)
return None
+@lru_cache(maxsize=None)
def get_plugin_icon(plugin: str) -> str:
"""
Get the icon for a plugin from its icon.html template.
diff --git a/archivebox/machine/models.py b/archivebox/machine/models.py
index a47f32ea..6f57cd0b 100755
--- a/archivebox/machine/models.py
+++ b/archivebox/machine/models.py
@@ -1685,8 +1685,11 @@ class Process(models.Model):
TimeoutError if process doesn't exit in time
"""
import time
+ from archivebox.config.constants import CONSTANTS
timeout = timeout or self.timeout
+ if self.process_type == self.TypeChoices.HOOK:
+ timeout = min(int(timeout), int(CONSTANTS.MAX_HOOK_RUNTIME_SECONDS))
start = time.time()
while True:
diff --git a/archivebox/misc/serve_static.py b/archivebox/misc/serve_static.py
index 8df249e1..76bc74e8 100644
--- a/archivebox/misc/serve_static.py
+++ b/archivebox/misc/serve_static.py
@@ -1,3 +1,6 @@
+import html
+import json
+import re
import os
import stat
import posixpath
@@ -10,6 +13,267 @@ from django.http import StreamingHttpResponse, Http404, HttpResponse, HttpRespon
from django.utils._os import safe_join
from django.utils.http import http_date
from django.utils.translation import gettext as _
+from archivebox.config.common import SERVER_CONFIG
+
+
+_HASHES_CACHE: dict[Path, tuple[float, dict[str, str]]] = {}
+
+
+def _load_hash_map(snapshot_dir: Path) -> dict[str, str] | None:
+ hashes_path = snapshot_dir / 'hashes' / 'hashes.json'
+ if not hashes_path.exists():
+ return None
+ try:
+ mtime = hashes_path.stat().st_mtime
+ except OSError:
+ return None
+
+ cached = _HASHES_CACHE.get(hashes_path)
+ if cached and cached[0] == mtime:
+ return cached[1]
+
+ try:
+ data = json.loads(hashes_path.read_text(encoding='utf-8'))
+ except Exception:
+ return None
+
+ file_map = {str(entry.get('path')): entry.get('hash') for entry in data.get('files', []) if entry.get('path')}
+ _HASHES_CACHE[hashes_path] = (mtime, file_map)
+ return file_map
+
+
def _hash_for_path(document_root: Path, rel_path: str) -> str | None:
    """Return the recorded content hash for rel_path under document_root, if any."""
    # An empty/missing hash map and a missing entry both yield None.
    return (_load_hash_map(document_root) or {}).get(rel_path)
+
+
def _cache_policy() -> str:
    """Cache-Control visibility: 'public' only when snapshots are world-readable."""
    if SERVER_CONFIG.PUBLIC_SNAPSHOTS:
        return 'public'
    return 'private'
+
+
# Ensure common web types are mapped consistently across platforms.
_EXTRA_MIME_TYPES = {
    ".html": "text/html",
    ".htm": "text/html",
    ".css": "text/css",
    ".js": "application/javascript",
    ".json": "application/json",
    ".jsonl": "application/x-ndjson",
    ".md": "text/markdown",
    ".yml": "text/yaml",
    ".yaml": "text/yaml",
    ".csv": "text/csv",
    ".tsv": "text/tab-separated-values",
    ".xml": "application/xml",
    ".svg": "image/svg+xml",
}
for _extension, _mime_type in _EXTRA_MIME_TYPES.items():
    mimetypes.add_type(_mime_type, _extension)
+
# NOTE(review): this hunk was mangled in transit — an HTML-to-text pass
# stripped literal tag strings out of the diff (regex literals, the
# html_lines.append("<tag>") calls, the document wrapper). The tag strings
# below are reconstructed from the surviving fragments and should be
# compared against the original commit before merging.
try:
    import markdown as _markdown  # optional; fallback renderer used when absent
except Exception:
    _markdown = None

# Inline markdown syntax patterns (links allow one level of nested parens).
MARKDOWN_INLINE_LINK_RE = re.compile(r'\[([^\]]+)\]\(([^)\s]+(?:\([^)]*\)[^)\s]*)*)\)')
MARKDOWN_INLINE_IMAGE_RE = re.compile(r'!\[([^\]]*)\]\(([^)]+)\)')
MARKDOWN_BOLD_RE = re.compile(r'\*\*([^*]+)\*\*')
MARKDOWN_ITALIC_RE = re.compile(r'(?<!\*)\*([^*]+)\*(?!\*)')
HTML_TAG_RE = re.compile(r'<[^>]*>')
HTML_BODY_RE = re.compile(r'<body[^>]*>(.*)</body>', flags=re.IGNORECASE | re.DOTALL)


def _extract_markdown_candidate(text: str) -> str:
    """Strip any surrounding HTML shell and return the inner text to inspect."""
    candidate = text
    body_match = HTML_BODY_RE.search(candidate)
    if body_match:
        candidate = body_match.group(1)
    # Handle documents with an unmatched <body> tag (no closing </body>).
    candidate = re.sub(r'^\s*<body[^>]*>', '', candidate, flags=re.IGNORECASE)
    candidate = re.sub(r'</body>\s*$', '', candidate, flags=re.IGNORECASE)
    return candidate.strip()


def _looks_like_markdown(text: str) -> bool:
    """Heuristically decide whether text is markdown source (>= 6 markers)."""
    lower = text.lower()
    # A full HTML document is never treated as markdown.
    # (reconstructed literal — TODO confirm against original commit)
    if "<html>" in lower:
        return False
    md_markers = 0
    md_markers += len(re.findall(r'^\s{0,3}#{1,6}\s+\S', text, flags=re.MULTILINE))
    md_markers += len(re.findall(r'^\s*[-*+]\s+\S', text, flags=re.MULTILINE))
    md_markers += len(re.findall(r'^\s*\d+\.\s+\S', text, flags=re.MULTILINE))
    md_markers += text.count('[TOC]')
    md_markers += len(MARKDOWN_INLINE_LINK_RE.findall(text))
    md_markers += text.count('\n---') + text.count('\n***')
    return md_markers >= 6


def _render_markdown_fallback(text: str) -> str:
    """Render markdown to HTML: python-markdown when safe, else a mini renderer.

    The hand-rolled path handles headings, fenced code, blockquotes, lists,
    horizontal rules, [TOC], and inline images/links/bold/italic. Raw HTML
    lines are passed through untouched.
    """
    # Prefer the python-markdown package when available, but only for input
    # that contains no raw HTML (the package would mangle mixed content).
    if _markdown is not None and not HTML_TAG_RE.search(text):
        try:
            return _markdown.markdown(
                text,
                extensions=["extra", "toc", "sane_lists"],
                output_format="html5",
            )
        except Exception:
            pass

    lines = text.splitlines()
    headings = []

    def slugify(value: str) -> str:
        slug = re.sub(r'[^A-Za-z0-9]+', '-', value).strip('-')
        return slug or "section"

    # First pass: collect headings so [TOC] can be expanded later.
    for raw_line in lines:
        heading_match = re.match(r'^\s{0,3}(#{1,6})\s+(.*)$', raw_line)
        if heading_match:
            level = len(heading_match.group(1))
            content = heading_match.group(2).strip()
            headings.append((level, content, slugify(content)))

    html_lines = []
    in_code = False
    in_ul = False
    in_ol = False
    in_blockquote = False

    def render_inline(markup: str) -> str:
        # Images before links (image syntax is a superset of link syntax).
        content = MARKDOWN_INLINE_IMAGE_RE.sub(r'<img src="\2" alt="\1">', markup)
        content = MARKDOWN_INLINE_LINK_RE.sub(r'<a href="\2">\1</a>', content)
        content = MARKDOWN_BOLD_RE.sub(r'<strong>\1</strong>', content)
        content = MARKDOWN_ITALIC_RE.sub(r'<em>\1</em>', content)
        return content

    def close_lists():
        nonlocal in_ul, in_ol
        if in_ul:
            html_lines.append("</ul>")
            in_ul = False
        if in_ol:
            html_lines.append("</ol>")
            in_ol = False

    for raw_line in lines:
        line = raw_line.rstrip("\n")
        stripped = line.strip()

        if stripped.startswith("```"):
            if in_code:
                html_lines.append("</code></pre>")
                in_code = False
            else:
                close_lists()
                if in_blockquote:
                    html_lines.append("</blockquote>")
                    in_blockquote = False
                html_lines.append("<pre><code>")
                in_code = True
            continue

        if in_code:
            # Inside a fence: escape verbatim, no inline formatting.
            html_lines.append(html.escape(line))
            continue

        if not stripped:
            close_lists()
            if in_blockquote:
                html_lines.append("</blockquote>")
                in_blockquote = False
            html_lines.append("<br>")
            continue

        # Headings may be preceded by raw HTML tags; keep those tags.
        heading_match = re.match(r'^\s*((?:<[^>]+>\s*)*)(#{1,6})\s+(.*)$', line)
        if heading_match:
            close_lists()
            if in_blockquote:
                html_lines.append("</blockquote>")
                in_blockquote = False
            leading_tags = heading_match.group(1).strip()
            level = len(heading_match.group(2))
            content = heading_match.group(3).strip()
            if leading_tags:
                html_lines.append(leading_tags)
            html_lines.append(
                f'<h{level} id="{slugify(content)}">{render_inline(content)}</h{level}>'
            )
            continue

        if stripped in ("---", "***"):
            close_lists()
            html_lines.append("<hr>")
            continue

        if stripped.startswith("> "):
            if not in_blockquote:
                close_lists()
                html_lines.append("<blockquote>")
                in_blockquote = True
            content = stripped[2:]
            html_lines.append(render_inline(content))
            continue
        else:
            if in_blockquote:
                html_lines.append("</blockquote>")
                in_blockquote = False

        ul_match = re.match(r'^\s*[-*+]\s+(.*)$', line)
        if ul_match:
            if in_ol:
                html_lines.append("</ol>")
                in_ol = False
            if not in_ul:
                html_lines.append("<ul>")
                in_ul = True
            html_lines.append(f"<li>{render_inline(ul_match.group(1))}</li>")
            continue

        ol_match = re.match(r'^\s*\d+\.\s+(.*)$', line)
        if ol_match:
            if in_ul:
                html_lines.append("</ul>")
                in_ul = False
            if not in_ol:
                html_lines.append("<ol>")
                in_ol = True
            html_lines.append(f"<li>{render_inline(ol_match.group(1))}</li>")
            continue

        close_lists()

        # Inline conversions (leave raw HTML intact)
        if stripped == "[TOC]":
            toc_items = []
            for level, title, slug in headings:
                toc_items.append(
                    f'<li class="toc-h{level}"><a href="#{slug}">{title}</a></li>'
                )
            html_lines.append(
                '<div class="toc"><ul>' + ''.join(toc_items) + '</ul></div>'
            )
            continue

        html_lines.append(f"<p>{render_inline(line)}</p>")

    # Close any structures still open at EOF.
    close_lists()
    if in_blockquote:
        html_lines.append("</blockquote>")
    if in_code:
        html_lines.append("</code></pre>")

    return "\n".join(html_lines)


def _render_markdown_document(markdown_text: str) -> str:
    """Render markdown into a complete standalone HTML document."""
    body = _render_markdown_fallback(markdown_text)
    # Wrapper strings reconstructed — TODO confirm styling/meta against original.
    wrapped = (
        "<!doctype html>"
        "<html>"
        '<head><meta charset="utf-8"></head>'
        "<body>"
        f"{body}"
        "</body></html>"
    )
    return wrapped
def serve_static_with_byterange_support(request, path, document_root=None, show_indexes=False):
@@ -28,18 +292,101 @@ def serve_static_with_byterange_support(request, path, document_root=None, show_
if not os.access(fullpath, os.R_OK):
raise Http404(_("“%(path)s” does not exist") % {"path": fullpath})
- # Respect the If-Modified-Since header.
statobj = fullpath.stat()
- if not static.was_modified_since(request.META.get("HTTP_IF_MODIFIED_SINCE"), statobj.st_mtime):
- return HttpResponseNotModified()
+ document_root = Path(document_root) if document_root else None
+ rel_path = path
+ etag = None
+ if document_root:
+ file_hash = _hash_for_path(document_root, rel_path)
+ if file_hash:
+ etag = f'"{file_hash}"'
+
+ if etag:
+ inm = request.META.get("HTTP_IF_NONE_MATCH")
+ if inm:
+ inm_list = [item.strip() for item in inm.split(",")]
+ if etag in inm_list or etag.strip('"') in [i.strip('"') for i in inm_list]:
+ not_modified = HttpResponseNotModified()
+ not_modified.headers["ETag"] = etag
+ not_modified.headers["Cache-Control"] = f"{_cache_policy()}, max-age=31536000, immutable"
+ not_modified.headers["Last-Modified"] = http_date(statobj.st_mtime)
+ return not_modified
content_type, encoding = mimetypes.guess_type(str(fullpath))
content_type = content_type or "application/octet-stream"
-
+ # Add charset for text-like types (best guess), but don't override the type.
+ is_text_like = (
+ content_type.startswith("text/")
+ or content_type in {
+ "application/json",
+ "application/javascript",
+ "application/xml",
+ "application/x-ndjson",
+ "image/svg+xml",
+ }
+ )
+ if is_text_like and "charset=" not in content_type:
+ content_type = f"{content_type}; charset=utf-8"
+
+ # Respect the If-Modified-Since header for non-markdown responses.
+ if not (content_type.startswith("text/plain") or content_type.startswith("text/html")):
+ if not static.was_modified_since(request.META.get("HTTP_IF_MODIFIED_SINCE"), statobj.st_mtime):
+ return HttpResponseNotModified()
+
+ # Heuristic fix: some archived HTML outputs (e.g. mercury content.html)
+ # are stored with HTML-escaped markup or markdown sources. If so, render sensibly.
+ if content_type.startswith("text/plain") or content_type.startswith("text/html"):
+ try:
+ max_unescape_size = 10 * 1024 * 1024 # 10MB cap to avoid heavy memory use
+ if statobj.st_size <= max_unescape_size:
+ raw = fullpath.read_bytes()
+ decoded = raw.decode("utf-8", errors="replace")
+ escaped_count = decoded.count("<") + decoded.count(">")
+ tag_count = decoded.count("<")
+ if escaped_count and escaped_count > tag_count * 2:
+ decoded = html.unescape(decoded)
+ markdown_candidate = _extract_markdown_candidate(decoded)
+ if _looks_like_markdown(markdown_candidate):
+ wrapped = _render_markdown_document(markdown_candidate)
+ response = HttpResponse(wrapped, content_type="text/html; charset=utf-8")
+ response.headers["Last-Modified"] = http_date(statobj.st_mtime)
+ if etag:
+ response.headers["ETag"] = etag
+ response.headers["Cache-Control"] = f"{_cache_policy()}, max-age=31536000, immutable"
+ else:
+ response.headers["Cache-Control"] = f"{_cache_policy()}, max-age=60, stale-while-revalidate=300"
+ response.headers["Content-Disposition"] = f'inline; filename="{fullpath.name}"'
+ if encoding:
+ response.headers["Content-Encoding"] = encoding
+ return response
+ if escaped_count and escaped_count > tag_count * 2:
+ response = HttpResponse(decoded, content_type=content_type)
+ response.headers["Last-Modified"] = http_date(statobj.st_mtime)
+ if etag:
+ response.headers["ETag"] = etag
+ response.headers["Cache-Control"] = f"{_cache_policy()}, max-age=31536000, immutable"
+ else:
+ response.headers["Cache-Control"] = f"{_cache_policy()}, max-age=60, stale-while-revalidate=300"
+ response.headers["Content-Disposition"] = f'inline; filename="{fullpath.name}"'
+ if encoding:
+ response.headers["Content-Encoding"] = encoding
+ return response
+ except Exception:
+ pass
+
    # setup response object
ranged_file = RangedFileReader(open(fullpath, "rb"))
response = StreamingHttpResponse(ranged_file, content_type=content_type)
response.headers["Last-Modified"] = http_date(statobj.st_mtime)
+ if etag:
+ response.headers["ETag"] = etag
+ response.headers["Cache-Control"] = f"{_cache_policy()}, max-age=31536000, immutable"
+ else:
+ response.headers["Cache-Control"] = f"{_cache_policy()}, max-age=60, stale-while-revalidate=300"
+ if is_text_like:
+ response.headers["Content-Disposition"] = f'inline; filename="{fullpath.name}"'
+ if content_type.startswith("image/"):
+ response.headers["Cache-Control"] = "public, max-age=604800, immutable"
# handle byte-range requests by serving chunk of file
if stat.S_ISREG(statobj.st_mode):
diff --git a/archivebox/plugins/accessibility/on_Snapshot__39_accessibility.js b/archivebox/plugins/accessibility/on_Snapshot__39_accessibility.js
index 4a99028a..7b73a422 100755
--- a/archivebox/plugins/accessibility/on_Snapshot__39_accessibility.js
+++ b/archivebox/plugins/accessibility/on_Snapshot__39_accessibility.js
@@ -26,6 +26,7 @@ const PLUGIN_NAME = 'accessibility';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'accessibility.json';
const CHROME_SESSION_DIR = '../chrome';
+const CHROME_SESSION_REQUIRED_ERROR = 'No Chrome session found (chrome plugin must run first)';
// Parse command line arguments
function parseArgs() {
@@ -76,6 +77,27 @@ function getCdpUrl() {
return null;
}
function assertChromeSession() {
    // Session marker files written by the chrome plugin when it launches.
    const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
    const targetIdFile = path.join(CHROME_SESSION_DIR, 'target_id.txt');
    const pidFile = path.join(CHROME_SESSION_DIR, 'chrome.pid');
    const allPresent = [cdpFile, targetIdFile, pidFile].every((file) => fs.existsSync(file));
    if (!allPresent) {
        throw new Error(CHROME_SESSION_REQUIRED_ERROR);
    }

    // Confirm the recorded Chrome process is still alive (signal 0 = probe only).
    try {
        const pid = parseInt(fs.readFileSync(pidFile, 'utf8').trim(), 10);
        if (!pid || Number.isNaN(pid)) throw new Error('Invalid pid');
        process.kill(pid, 0);
    } catch (e) {
        throw new Error(CHROME_SESSION_REQUIRED_ERROR);
    }

    const cdpUrl = getCdpUrl();
    if (!cdpUrl) {
        throw new Error(CHROME_SESSION_REQUIRED_ERROR);
    }
    return cdpUrl;
}
+
// Extract accessibility info
async function extractAccessibility(url) {
// Output directory is current directory (hook already runs in output dir)
@@ -85,10 +107,7 @@ async function extractAccessibility(url) {
try {
// Connect to existing Chrome session
- const cdpUrl = getCdpUrl();
- if (!cdpUrl) {
- return { success: false, error: 'No Chrome session found (chrome plugin must run first)' };
- }
+ const cdpUrl = assertChromeSession();
browser = await puppeteer.connect({
browserWSEndpoint: cdpUrl,
@@ -226,13 +245,10 @@ async function main() {
}
// Check if Chrome session exists, then wait for page load
- const cdpUrl = getCdpUrl();
- if (cdpUrl) {
- // Wait for page to be fully loaded
- const pageLoaded = await waitForChromeTabLoaded(60000);
- if (!pageLoaded) {
- throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
- }
+ assertChromeSession();
+ const pageLoaded = await waitForChromeTabLoaded(60000);
+ if (!pageLoaded) {
+ throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
}
const result = await extractAccessibility(url);
diff --git a/archivebox/plugins/accessibility/tests/test_accessibility.py b/archivebox/plugins/accessibility/tests/test_accessibility.py
index addd51df..cccfa215 100644
--- a/archivebox/plugins/accessibility/tests/test_accessibility.py
+++ b/archivebox/plugins/accessibility/tests/test_accessibility.py
@@ -47,7 +47,6 @@ class TestAccessibilityPlugin(TestCase):
self.assertTrue(ACCESSIBILITY_HOOK.exists(), f"Hook not found: {ACCESSIBILITY_HOOK}")
-@pytest.mark.skipif(not chrome_available(), reason="Chrome not installed")
class TestAccessibilityWithChrome(TestCase):
"""Integration tests for accessibility plugin with Chrome."""
@@ -109,9 +108,7 @@ class TestAccessibilityWithChrome(TestCase):
self.assertIn('headings', accessibility_data, f"Missing headings: {accessibility_data}")
self.assertIn('url', accessibility_data, f"Missing url: {accessibility_data}")
- except RuntimeError as e:
- if 'Chrome' in str(e) or 'CDP' in str(e):
- self.skipTest(f"Chrome session setup failed: {e}")
+ except RuntimeError:
raise
def test_accessibility_disabled_skips(self):
diff --git a/archivebox/plugins/apt/tests/test_apt_provider.py b/archivebox/plugins/apt/tests/test_apt_provider.py
index 430fde24..c8b7934e 100644
--- a/archivebox/plugins/apt/tests/test_apt_provider.py
+++ b/archivebox/plugins/apt/tests/test_apt_provider.py
@@ -70,9 +70,9 @@ class TestAptProviderHook(TestCase):
self.assertEqual(result.returncode, 0)
@pytest.mark.skipif(not is_linux(), reason="apt only available on Linux")
- @pytest.mark.skipif(not apt_available(), reason="apt not installed")
def test_hook_detects_apt(self):
"""Hook should detect apt binary when available."""
+ assert apt_available(), "apt not installed"
result = subprocess.run(
[
sys.executable, str(INSTALL_HOOK),
@@ -112,12 +112,12 @@ class TestAptProviderHook(TestCase):
@pytest.mark.skipif(not is_linux(), reason="apt only available on Linux")
-@pytest.mark.skipif(not apt_available(), reason="apt not installed")
class TestAptProviderSystemBinaries(TestCase):
"""Test apt provider with system binaries."""
def test_detect_existing_binary(self):
"""apt provider should detect already-installed system binaries."""
+ assert apt_available(), "apt not installed"
# Check for a binary that's almost certainly installed (like 'ls' or 'bash')
result = subprocess.run(
[
diff --git a/archivebox/plugins/archivedotorg/on_Snapshot__13_archivedotorg.bg.py b/archivebox/plugins/archivedotorg/on_Snapshot__08_archivedotorg.bg.py
similarity index 100%
rename from archivebox/plugins/archivedotorg/on_Snapshot__13_archivedotorg.bg.py
rename to archivebox/plugins/archivedotorg/on_Snapshot__08_archivedotorg.bg.py
diff --git a/archivebox/plugins/chrome/chrome_utils.js b/archivebox/plugins/chrome/chrome_utils.js
index df43115f..e0e42a7e 100755
--- a/archivebox/plugins/chrome/chrome_utils.js
+++ b/archivebox/plugins/chrome/chrome_utils.js
@@ -18,6 +18,8 @@ const { finished } = require('stream/promises');
const execAsync = promisify(exec);
+const CHROME_SESSION_REQUIRED_ERROR = 'No Chrome session found (chrome plugin must run first)';
+
// ============================================================================
// Environment helpers
// ============================================================================
@@ -373,6 +375,7 @@ async function launchChromium(options = {}) {
outputDir = 'chrome',
userDataDir = getEnv('CHROME_USER_DATA_DIR'),
resolution = getEnv('CHROME_RESOLUTION') || getEnv('RESOLUTION', '1440,2000'),
+ userAgent = getEnv('CHROME_USER_AGENT') || getEnv('USER_AGENT', ''),
headless = getEnvBool('CHROME_HEADLESS', true),
sandbox = getEnvBool('CHROME_SANDBOX', true),
checkSsl = getEnvBool('CHROME_CHECK_SSL_VALIDITY', getEnvBool('CHECK_SSL_VALIDITY', true)),
@@ -450,17 +453,17 @@ async function launchChromium(options = {}) {
const extraArgs = getEnvArray('CHROME_ARGS_EXTRA', []);
// Build dynamic Chrome arguments (these must be computed at runtime)
+ const inDocker = getEnvBool('IN_DOCKER', false);
const dynamicArgs = [
// Remote debugging setup
`--remote-debugging-port=${debugPort}`,
'--remote-debugging-address=127.0.0.1',
// Sandbox settings (disable in Docker)
- ...(sandbox ? [] : ['--no-sandbox', '--disable-setuid-sandbox']),
+ ...(sandbox ? [] : (inDocker ? ['--no-sandbox', '--disable-setuid-sandbox'] : [])),
// Docker-specific workarounds
'--disable-dev-shm-usage',
- '--disable-gpu',
// Window size
`--window-size=${width},${height}`,
@@ -468,6 +471,9 @@ async function launchChromium(options = {}) {
// User data directory (for persistent sessions with persona)
...(userDataDir ? [`--user-data-dir=${userDataDir}`] : []),
+ // User agent
+ ...(userAgent ? [`--user-agent=${userAgent}`] : []),
+
// Headless mode
...(headless ? ['--headless=new'] : []),
@@ -1387,6 +1393,18 @@ function findChromium() {
return null;
}
/**
 * Find Chromium binary path only (never Chrome/Brave/Edge).
 * Prefers CHROME_BINARY if set, then Chromium.
 *
 * @returns {string|null} - Absolute path or command name to browser binary
 */
function findAnyChromiumBinary() {
    // Delegate to findChromium() and coerce any falsy result to null.
    return findChromium() || null;
}
+
// ============================================================================
// Shared Extension Installer Utilities
// ============================================================================
@@ -1658,13 +1676,13 @@ async function connectToPage(options = {}) {
// Wait for chrome session to be ready
const sessionReady = await waitForChromeSession(chromeSessionDir, timeoutMs);
if (!sessionReady) {
- throw new Error(`Chrome session not ready after ${timeoutMs/1000}s (chrome plugin must run first)`);
+ throw new Error(CHROME_SESSION_REQUIRED_ERROR);
}
// Read session files
const cdpUrl = readCdpUrl(chromeSessionDir);
if (!cdpUrl) {
- throw new Error('No Chrome session found (cdp_url.txt missing)');
+ throw new Error(CHROME_SESSION_REQUIRED_ERROR);
}
const targetId = readTargetId(chromeSessionDir);
@@ -1749,6 +1767,7 @@ module.exports = {
installPuppeteerCore,
// Chromium binary finding
findChromium,
+ findAnyChromiumBinary,
// Extension utilities
getExtensionId,
loadExtensionManifest,
diff --git a/archivebox/plugins/chrome/extract_cookies.js b/archivebox/plugins/chrome/extract_cookies.js
index 2a330152..c23515dc 100644
--- a/archivebox/plugins/chrome/extract_cookies.js
+++ b/archivebox/plugins/chrome/extract_cookies.js
@@ -23,7 +23,7 @@ if (process.env.NODE_MODULES_DIR) {
const fs = require('fs');
const path = require('path');
const {
- findChromium,
+ findAnyChromiumBinary,
launchChromium,
killChrome,
getEnv,
@@ -109,9 +109,9 @@ async function main() {
process.exit(1);
}
- const binary = findChromium();
+ const binary = findAnyChromiumBinary();
if (!binary) {
- console.error('ERROR: Chromium binary not found');
+ console.error('ERROR: Chromium-based browser binary not found');
process.exit(1);
}
diff --git a/archivebox/plugins/chrome/on_Crawl__90_chrome_launch.bg.js b/archivebox/plugins/chrome/on_Crawl__90_chrome_launch.bg.js
index 17185786..b5cb9822 100644
--- a/archivebox/plugins/chrome/on_Crawl__90_chrome_launch.bg.js
+++ b/archivebox/plugins/chrome/on_Crawl__90_chrome_launch.bg.js
@@ -31,12 +31,15 @@ if (process.env.NODE_MODULES_DIR) {
const fs = require('fs');
const path = require('path');
+const http = require('http');
const puppeteer = require('puppeteer');
const {
findChromium,
launchChromium,
killChrome,
getEnv,
+ getEnvBool,
+ getExtensionId,
writePidWithMtime,
getExtensionsDir,
} = require('./chrome_utils.js');
@@ -154,6 +157,84 @@ async function importCookiesFromFile(browser, cookiesFile, userDataDir) {
console.error(`[+] Imported ${imported}/${cookies.length} cookies`);
}
+function getPortFromCdpUrl(cdpUrl) {
+ if (!cdpUrl) return null;
+ const match = cdpUrl.match(/:(\d+)\/devtools\//);
+ return match ? match[1] : null;
+}
+
+async function fetchDevtoolsTargets(cdpUrl) {
+ const port = getPortFromCdpUrl(cdpUrl);
+ if (!port) return [];
+
+ const urlPath = '/json/list';
+ return new Promise((resolve, reject) => {
+ const req = http.get(
+ { hostname: '127.0.0.1', port, path: urlPath },
+ (res) => {
+ let data = '';
+ res.on('data', (chunk) => (data += chunk));
+ res.on('end', () => {
+ try {
+ const targets = JSON.parse(data);
+ resolve(Array.isArray(targets) ? targets : []);
+ } catch (e) {
+ reject(e);
+ }
+ });
+ }
+ );
+ req.on('error', reject);
+ });
+}
+
+async function discoverExtensionTargets(cdpUrl, installedExtensions) {
+ const builtinIds = [
+ 'nkeimhogjdpnpccoofpliimaahmaaome',
+ 'fignfifoniblkonapihmkfakmlgkbkcf',
+ 'ahfgeienlihckogmohjhadlkjgocpleb',
+ 'mhjfbmdgcfjbbpaeojofohoefgiehjai',
+ ];
+
+ let targets = [];
+ for (let i = 0; i < 10; i += 1) {
+ try {
+ targets = await fetchDevtoolsTargets(cdpUrl);
+ if (targets.length > 0) break;
+ } catch (e) {
+ // Ignore and retry
+ }
+ await new Promise(r => setTimeout(r, 500));
+ }
+
+ const customExtTargets = targets.filter(t => {
+ const url = t.url || '';
+ if (!url.startsWith('chrome-extension://')) return false;
+ const extId = url.split('://')[1].split('/')[0];
+ return !builtinIds.includes(extId);
+ });
+
+ console.error(`[+] Found ${customExtTargets.length} custom extension target(s) via /json/list`);
+
+ for (const target of customExtTargets) {
+ const url = target.url || '';
+ const extId = url.split('://')[1].split('/')[0];
+ console.error(`[+] Extension target: ${extId} (${target.type || 'unknown'})`);
+ }
+
+ const runtimeIds = new Set(customExtTargets.map(t => (t.url || '').split('://')[1].split('/')[0]));
+ for (const ext of installedExtensions) {
+ if (ext.id) {
+ ext.loaded = runtimeIds.has(ext.id);
+ }
+ }
+
+ if (customExtTargets.length === 0 && installedExtensions.length > 0) {
+ console.error(`[!] Warning: No custom extensions detected. Extension loading may have failed.`);
+ console.error(`[!] Make sure you are using Chromium, not Chrome (Chrome 137+ removed --load-extension support)`);
+ }
+}
+
// Parse command line arguments
function parseArgs() {
const args = {};
@@ -257,6 +338,17 @@ async function main() {
console.error(`[+] Found ${installedExtensions.length} extension(s) to load`);
}
+ // Ensure extension IDs are available without chrome://extensions
+ for (const ext of installedExtensions) {
+ if (!ext.id && ext.unpacked_path) {
+ try {
+ ext.id = getExtensionId(ext.unpacked_path);
+ } catch (e) {
+ console.error(`[!] Failed to compute extension id for ${ext.name}: ${e.message}`);
+ }
+ }
+ }
+
// Note: PID file is written by run_hook() with hook-specific name
// Snapshot.cleanup() kills all *.pid processes when done
if (!fs.existsSync(OUTPUT_DIR)) {
@@ -280,131 +372,31 @@ async function main() {
chromePid = result.pid;
const cdpUrl = result.cdpUrl;
- // Connect puppeteer for extension verification
- console.error(`[*] Connecting puppeteer to CDP...`);
- const browser = await puppeteer.connect({
- browserWSEndpoint: cdpUrl,
- defaultViewport: null,
- });
- browserInstance = browser;
-
- // Import cookies into Chrome profile at crawl start
- await importCookiesFromFile(browser, cookiesFile, userDataDir);
-
- // Get actual extension IDs from chrome://extensions page
+ // Discover extension targets at launch (no chrome://extensions)
if (extensionPaths.length > 0) {
await new Promise(r => setTimeout(r, 2000));
+ console.error('[*] Discovering extension targets via devtools /json/list...');
+ await discoverExtensionTargets(cdpUrl, installedExtensions);
+ }
+
+ // Only connect to CDP when cookies import is needed to reduce crash risk.
+ if (cookiesFile) {
+ console.error(`[*] Connecting puppeteer to CDP for cookie import...`);
+ const browser = await puppeteer.connect({
+ browserWSEndpoint: cdpUrl,
+ defaultViewport: null,
+ });
+ browserInstance = browser;
+
+ // Import cookies into Chrome profile at crawl start
+ await importCookiesFromFile(browser, cookiesFile, userDataDir);
try {
- const extPage = await browser.newPage();
- await extPage.goto('chrome://extensions', { waitUntil: 'domcontentloaded', timeout: 10000 });
- await new Promise(r => setTimeout(r, 2000));
-
- // Parse extension info from the page
- const extensionsFromPage = await extPage.evaluate(() => {
- const extensions = [];
- // Extensions manager uses shadow DOM
- const manager = document.querySelector('extensions-manager');
- if (!manager || !manager.shadowRoot) return extensions;
-
- const itemList = manager.shadowRoot.querySelector('extensions-item-list');
- if (!itemList || !itemList.shadowRoot) return extensions;
-
- const items = itemList.shadowRoot.querySelectorAll('extensions-item');
- for (const item of items) {
- const id = item.getAttribute('id');
- const nameEl = item.shadowRoot?.querySelector('#name');
- const name = nameEl?.textContent?.trim() || '';
- if (id && name) {
- extensions.push({ id, name });
- }
- }
- return extensions;
- });
-
- console.error(`[*] Found ${extensionsFromPage.length} extension(s) on chrome://extensions`);
- for (const e of extensionsFromPage) {
- console.error(` - ${e.id}: "${e.name}"`);
- }
-
- // Match extensions by name (strict matching)
- for (const ext of installedExtensions) {
- // Read the extension's manifest to get its display name
- const manifestPath = path.join(ext.unpacked_path, 'manifest.json');
- if (fs.existsSync(manifestPath)) {
- const manifest = JSON.parse(fs.readFileSync(manifestPath, 'utf-8'));
- let manifestName = manifest.name || '';
-
- // Resolve message placeholder (e.g., __MSG_extName__)
- if (manifestName.startsWith('__MSG_') && manifestName.endsWith('__')) {
- const msgKey = manifestName.slice(6, -2); // Extract key from __MSG_key__
- const defaultLocale = manifest.default_locale || 'en';
- const messagesPath = path.join(ext.unpacked_path, '_locales', defaultLocale, 'messages.json');
- if (fs.existsSync(messagesPath)) {
- try {
- const messages = JSON.parse(fs.readFileSync(messagesPath, 'utf-8'));
- if (messages[msgKey] && messages[msgKey].message) {
- manifestName = messages[msgKey].message;
- }
- } catch (e) {
- console.error(`[!] Failed to read messages.json: ${e.message}`);
- }
- }
- }
-
- console.error(`[*] Looking for match: ext.name="${ext.name}" manifest.name="${manifestName}"`);
-
- // Find matching extension from page by exact name match first
- let match = extensionsFromPage.find(e => e.name === manifestName);
-
- // If no exact match, try case-insensitive exact match
- if (!match) {
- match = extensionsFromPage.find(e =>
- e.name.toLowerCase() === manifestName.toLowerCase()
- );
- }
-
- if (match) {
- ext.id = match.id;
- console.error(`[+] Matched extension: ${ext.name} (${manifestName}) -> ${match.id}`);
- } else {
- console.error(`[!] No match found for: ${ext.name} (${manifestName})`);
- }
- }
- }
-
- await extPage.close();
- } catch (e) {
- console.error(`[!] Failed to get extensions from chrome://extensions: ${e.message}`);
- }
-
- // Fallback: check browser targets
- const targets = browser.targets();
- const builtinIds = [
- 'nkeimhogjdpnpccoofpliimaahmaaome',
- 'fignfifoniblkonapihmkfakmlgkbkcf',
- 'ahfgeienlihckogmohjhadlkjgocpleb',
- 'mhjfbmdgcfjbbpaeojofohoefgiehjai',
- ];
- const customExtTargets = targets.filter(t => {
- const url = t.url();
- if (!url.startsWith('chrome-extension://')) return false;
- const extId = url.split('://')[1].split('/')[0];
- return !builtinIds.includes(extId);
- });
-
- console.error(`[+] Found ${customExtTargets.length} custom extension target(s)`);
-
- for (const target of customExtTargets) {
- const url = target.url();
- const extId = url.split('://')[1].split('/')[0];
- console.error(`[+] Extension target: ${extId} (${target.type()})`);
- }
-
- if (customExtTargets.length === 0 && extensionPaths.length > 0) {
- console.error(`[!] Warning: No custom extensions detected. Extension loading may have failed.`);
- console.error(`[!] Make sure you are using Chromium, not Chrome (Chrome 137+ removed --load-extension support)`);
- }
+ browser.disconnect();
+ } catch (e) {}
+ browserInstance = null;
+ } else {
+ console.error('[*] Skipping puppeteer CDP connection (no cookies to import)');
}
// Write extensions metadata with actual IDs
diff --git a/archivebox/plugins/chrome/on_Snapshot__10_chrome_tab.bg.js b/archivebox/plugins/chrome/on_Snapshot__10_chrome_tab.bg.js
index ca8e8232..4f3c6594 100755
--- a/archivebox/plugins/chrome/on_Snapshot__10_chrome_tab.bg.js
+++ b/archivebox/plugins/chrome/on_Snapshot__10_chrome_tab.bg.js
@@ -2,9 +2,8 @@
/**
* Create a Chrome tab for this snapshot in the shared crawl Chrome session.
*
- * If a crawl-level Chrome session exists (from on_Crawl__90_chrome_launch.bg.js),
- * this connects to it and creates a new tab. Otherwise, falls back to launching
- * its own Chrome instance.
+ * Connects to the crawl-level Chrome session (from on_Crawl__90_chrome_launch.bg.js)
+ * and creates a new tab. This hook does NOT launch its own Chrome instance.
*
* Usage: on_Snapshot__10_chrome_tab.bg.js --url= --snapshot-id= --crawl-id=
* Output: Creates chrome/ directory under snapshot output dir with:
@@ -15,11 +14,7 @@
*
* Environment variables:
* CRAWL_OUTPUT_DIR: Crawl output directory (to find crawl's Chrome session)
- * CHROME_BINARY: Path to Chromium binary (for fallback)
- * CHROME_RESOLUTION: Page resolution (default: 1440,2000)
- * CHROME_USER_AGENT: User agent string (optional)
- * CHROME_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: true)
- * CHROME_HEADLESS: Run in headless mode (default: true)
+ * CHROME_BINARY: Path to Chromium binary (optional, for version info)
*
* This is a background hook that stays alive until SIGTERM so the tab
* can be closed cleanly at the end of the snapshot run.
@@ -27,24 +22,18 @@
const fs = require('fs');
const path = require('path');
-const { spawn } = require('child_process');
+const { execSync } = require('child_process');
// Add NODE_MODULES_DIR to module resolution paths if set
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const puppeteer = require('puppeteer');
-const {
- findChromium,
- getEnv,
- getEnvBool,
- parseResolution,
- findFreePort,
- waitForDebugPort,
-} = require('./chrome_utils.js');
+const { getEnv, getEnvInt } = require('./chrome_utils.js');
// Extractor metadata
const PLUGIN_NAME = 'chrome_tab';
const OUTPUT_DIR = '.'; // Hook already runs in chrome/ output directory
const CHROME_SESSION_DIR = '.';
+const CHROME_SESSION_REQUIRED_ERROR = 'No Chrome session found (chrome plugin must run first)';
let finalStatus = 'failed';
let finalOutput = '';
@@ -118,61 +107,75 @@ process.on('SIGTERM', () => cleanup('SIGTERM'));
process.on('SIGINT', () => cleanup('SIGINT'));
// Try to find the crawl's Chrome session
-function findCrawlChromeSession() {
+function getCrawlChromeSession() {
// Use CRAWL_OUTPUT_DIR env var set by get_config() in configset.py
const crawlOutputDir = getEnv('CRAWL_OUTPUT_DIR', '');
- if (!crawlOutputDir) return null;
+ if (!crawlOutputDir) {
+ throw new Error(CHROME_SESSION_REQUIRED_ERROR);
+ }
const crawlChromeDir = path.join(crawlOutputDir, 'chrome');
const cdpFile = path.join(crawlChromeDir, 'cdp_url.txt');
const pidFile = path.join(crawlChromeDir, 'chrome.pid');
- if (fs.existsSync(cdpFile) && fs.existsSync(pidFile)) {
- try {
- const cdpUrl = fs.readFileSync(cdpFile, 'utf-8').trim();
- const pid = parseInt(fs.readFileSync(pidFile, 'utf-8').trim(), 10);
-
- // Verify the process is still running
- try {
- process.kill(pid, 0); // Signal 0 = check if process exists
- return { cdpUrl, pid };
- } catch (e) {
- // Process not running
- return null;
- }
- } catch (e) {
- return null;
- }
+ if (!fs.existsSync(cdpFile)) {
+ throw new Error(CHROME_SESSION_REQUIRED_ERROR);
+ }
+ if (!fs.existsSync(pidFile)) {
+ throw new Error(CHROME_SESSION_REQUIRED_ERROR);
}
- return null;
+ const cdpUrl = fs.readFileSync(cdpFile, 'utf-8').trim();
+ const pid = parseInt(fs.readFileSync(pidFile, 'utf-8').trim(), 10);
+ if (!cdpUrl) {
+ throw new Error(CHROME_SESSION_REQUIRED_ERROR);
+ }
+ if (!pid || Number.isNaN(pid)) {
+ throw new Error(CHROME_SESSION_REQUIRED_ERROR);
+ }
+
+ // Verify the process is still running
+ try {
+ process.kill(pid, 0); // Signal 0 = check if process exists
+ } catch (e) {
+ throw new Error(CHROME_SESSION_REQUIRED_ERROR);
+ }
+
+ return { cdpUrl, pid };
+}
+
+async function waitForCrawlChromeSession(timeoutMs, intervalMs = 250) {
+ const startTime = Date.now();
+ let lastError = null;
+
+ while (Date.now() - startTime < timeoutMs) {
+ try {
+ return getCrawlChromeSession();
+ } catch (e) {
+ lastError = e;
+ }
+ await new Promise(resolve => setTimeout(resolve, intervalMs));
+ }
+
+ if (lastError) {
+ throw lastError;
+ }
+ throw new Error(CHROME_SESSION_REQUIRED_ERROR);
}
// Create a new tab in an existing Chrome session
async function createTabInExistingChrome(cdpUrl, url, pid) {
- const resolution = getEnv('CHROME_RESOLUTION') || getEnv('RESOLUTION', '1440,2000');
- const userAgent = getEnv('CHROME_USER_AGENT') || getEnv('USER_AGENT', '');
- const { width, height } = parseResolution(resolution);
-
console.log(`[*] Connecting to existing Chrome session: ${cdpUrl}`);
// Connect Puppeteer to the running Chrome
const browser = await puppeteer.connect({
browserWSEndpoint: cdpUrl,
- defaultViewport: { width, height },
+ defaultViewport: null,
});
// Create a new tab for this snapshot
const page = await browser.newPage();
- // Set viewport
- await page.setViewport({ width, height });
-
- // Set user agent if specified
- if (userAgent) {
- await page.setUserAgent(userAgent);
- }
-
// Get the page target ID
const target = page.target();
const targetId = target._targetId;
@@ -189,112 +192,6 @@ async function createTabInExistingChrome(cdpUrl, url, pid) {
return { success: true, output: OUTPUT_DIR, cdpUrl, targetId, pid };
}
-// Fallback: Launch a new Chrome instance for this snapshot
-async function launchNewChrome(url, binary) {
- const resolution = getEnv('CHROME_RESOLUTION') || getEnv('RESOLUTION', '1440,2000');
- const userAgent = getEnv('CHROME_USER_AGENT') || getEnv('USER_AGENT', '');
- const checkSsl = getEnvBool('CHROME_CHECK_SSL_VALIDITY', getEnvBool('CHECK_SSL_VALIDITY', true));
- const headless = getEnvBool('CHROME_HEADLESS', true);
-
- const { width, height } = parseResolution(resolution);
-
- // Find a free port for Chrome DevTools
- const debugPort = await findFreePort();
- console.log(`[*] Launching new Chrome on port: ${debugPort}`);
-
- // Build Chrome arguments
- const chromeArgs = [
- `--remote-debugging-port=${debugPort}`,
- '--remote-debugging-address=127.0.0.1',
- '--no-sandbox',
- '--disable-setuid-sandbox',
- '--disable-dev-shm-usage',
- '--disable-gpu',
- '--disable-sync',
- '--no-first-run',
- '--no-default-browser-check',
- '--disable-default-apps',
- '--disable-infobars',
- '--disable-blink-features=AutomationControlled',
- '--disable-component-update',
- '--disable-domain-reliability',
- '--disable-breakpad',
- '--disable-background-networking',
- '--disable-background-timer-throttling',
- '--disable-backgrounding-occluded-windows',
- '--disable-renderer-backgrounding',
- '--disable-ipc-flooding-protection',
- '--password-store=basic',
- '--use-mock-keychain',
- '--font-render-hinting=none',
- '--force-color-profile=srgb',
- `--window-size=${width},${height}`,
- ...(headless ? ['--headless=new'] : []),
- ...(checkSsl ? [] : ['--ignore-certificate-errors']),
- 'about:blank',
- ];
-
- // Launch Chrome as a detached process (since no crawl-level Chrome exists)
- const chromeProcess = spawn(binary, chromeArgs, {
- detached: true,
- stdio: ['ignore', 'ignore', 'ignore'],
- });
- chromeProcess.unref();
-
- const chromePid = chromeProcess.pid;
- console.log(`[*] Launched Chrome (PID: ${chromePid}), waiting for debug port...`);
-
- // Write PID immediately for cleanup
- fs.writeFileSync(path.join(OUTPUT_DIR, 'chrome.pid'), String(chromePid));
-
- try {
- // Wait for Chrome to be ready
- const versionInfo = await waitForDebugPort(debugPort, 30000);
- console.log(`[+] Chrome ready: ${versionInfo.Browser}`);
-
- const wsUrl = versionInfo.webSocketDebuggerUrl;
- fs.writeFileSync(path.join(OUTPUT_DIR, 'cdp_url.txt'), wsUrl);
-
- // Connect Puppeteer to get page info
- const browser = await puppeteer.connect({
- browserWSEndpoint: wsUrl,
- defaultViewport: { width, height },
- });
-
- let pages = await browser.pages();
- let page = pages[0];
-
- if (!page) {
- page = await browser.newPage();
- }
-
- await page.setViewport({ width, height });
-
- if (userAgent) {
- await page.setUserAgent(userAgent);
- }
-
- const target = page.target();
- const targetId = target._targetId;
-
- fs.writeFileSync(path.join(OUTPUT_DIR, 'chrome.pid'), String(chromePid));
- fs.writeFileSync(path.join(OUTPUT_DIR, 'target_id.txt'), targetId);
- fs.writeFileSync(path.join(OUTPUT_DIR, 'url.txt'), url);
-
- browser.disconnect();
-
- return { success: true, output: OUTPUT_DIR, cdpUrl: wsUrl, targetId, pid: chromePid };
-
- } catch (e) {
- try {
- process.kill(chromePid, 'SIGTERM');
- } catch (killErr) {
- // Ignore
- }
- return { success: false, error: `${e.name}: ${e.message}` };
- }
-}
-
async function main() {
const args = parseArgs();
const url = args.url;
@@ -312,33 +209,21 @@ async function main() {
let version = '';
try {
- const binary = findChromium();
- if (!binary) {
- console.error('ERROR: Chromium binary not found');
- console.error('DEPENDENCY_NEEDED=chromium');
- console.error('BIN_PROVIDERS=puppeteer,env,playwright,apt,brew');
- console.error('INSTALL_HINT=npx @puppeteer/browsers install chromium@latest');
- process.exit(1);
- }
-
// Get Chrome version
try {
- const { execSync } = require('child_process');
- version = execSync(`"${binary}" --version`, { encoding: 'utf8', timeout: 5000 }).trim().slice(0, 64);
+ const binary = getEnv('CHROME_BINARY', '').trim();
+ if (binary) {
+ version = execSync(`"${binary}" --version`, { encoding: 'utf8', timeout: 5000 }).trim().slice(0, 64);
+ }
} catch (e) {
version = '';
}
- // Try to use existing crawl Chrome session
- const crawlSession = findCrawlChromeSession();
- let result;
-
- if (crawlSession) {
- console.log(`[*] Found existing Chrome session from crawl ${crawlId}`);
- result = await createTabInExistingChrome(crawlSession.cdpUrl, url, crawlSession.pid);
- } else {
- result = { success: false, error: 'No crawl Chrome session found (CRAWL_OUTPUT_DIR missing or chrome not running)' };
- }
+ // Try to use existing crawl Chrome session (wait for readiness)
+ const timeoutSeconds = getEnvInt('CHROME_TAB_TIMEOUT', getEnvInt('CHROME_TIMEOUT', getEnvInt('TIMEOUT', 60)));
+ const crawlSession = await waitForCrawlChromeSession(timeoutSeconds * 1000);
+ console.log(`[*] Found existing Chrome session from crawl ${crawlId}`);
+ const result = await createTabInExistingChrome(crawlSession.cdpUrl, url, crawlSession.pid);
if (result.success) {
status = 'succeeded';
diff --git a/archivebox/plugins/chrome/on_Snapshot__11_chrome_wait.js b/archivebox/plugins/chrome/on_Snapshot__11_chrome_wait.js
index 219b58b9..dae2a3db 100644
--- a/archivebox/plugins/chrome/on_Snapshot__11_chrome_wait.js
+++ b/archivebox/plugins/chrome/on_Snapshot__11_chrome_wait.js
@@ -21,6 +21,7 @@ const {
} = require('./chrome_utils.js');
const CHROME_SESSION_DIR = '.';
+const CHROME_SESSION_REQUIRED_ERROR = 'No Chrome session found (chrome plugin must run first)';
function parseArgs() {
const args = {};
@@ -50,7 +51,7 @@ async function main() {
const ready = await waitForChromeSession(CHROME_SESSION_DIR, timeoutMs);
if (!ready) {
- const error = `Chrome session not ready after ${timeoutSeconds}s (cdp_url.txt/target_id.txt missing)`;
+ const error = CHROME_SESSION_REQUIRED_ERROR;
console.error(`[chrome_wait] ERROR: ${error}`);
console.log(JSON.stringify({ type: 'ArchiveResult', status: 'failed', output_str: error }));
process.exit(1);
@@ -59,7 +60,7 @@ async function main() {
const cdpUrl = readCdpUrl(CHROME_SESSION_DIR);
const targetId = readTargetId(CHROME_SESSION_DIR);
if (!cdpUrl || !targetId) {
- const error = 'Chrome session files incomplete (cdp_url.txt/target_id.txt missing)';
+ const error = CHROME_SESSION_REQUIRED_ERROR;
console.error(`[chrome_wait] ERROR: ${error}`);
console.log(JSON.stringify({ type: 'ArchiveResult', status: 'failed', output_str: error }));
process.exit(1);
diff --git a/archivebox/plugins/chrome/on_Snapshot__30_chrome_navigate.js b/archivebox/plugins/chrome/on_Snapshot__30_chrome_navigate.js
index 242c9853..33c515ec 100644
--- a/archivebox/plugins/chrome/on_Snapshot__30_chrome_navigate.js
+++ b/archivebox/plugins/chrome/on_Snapshot__30_chrome_navigate.js
@@ -24,6 +24,7 @@ const puppeteer = require('puppeteer');
const PLUGIN_NAME = 'chrome_navigate';
const CHROME_SESSION_DIR = '.';
const OUTPUT_DIR = '.';
+const CHROME_SESSION_REQUIRED_ERROR = 'No Chrome session found (chrome plugin must run first)';
function parseArgs() {
const args = {};
@@ -175,13 +176,13 @@ async function main() {
// Wait for chrome tab to be open (up to 60s)
const tabOpen = await waitForChromeTabOpen(60000);
if (!tabOpen) {
- console.error('ERROR: Chrome tab not open after 60s (chrome_tab must run first)');
+ console.error(`ERROR: ${CHROME_SESSION_REQUIRED_ERROR}`);
process.exit(1);
}
const cdpUrl = getCdpUrl();
if (!cdpUrl) {
- console.error('ERROR: Chrome CDP URL not found (chrome tab not initialized)');
+ console.error(`ERROR: ${CHROME_SESSION_REQUIRED_ERROR}`);
process.exit(1);
}
diff --git a/archivebox/plugins/chrome/tests/chrome_test_helpers.py b/archivebox/plugins/chrome/tests/chrome_test_helpers.py
index 8be2bb3c..3e37ce26 100644
--- a/archivebox/plugins/chrome/tests/chrome_test_helpers.py
+++ b/archivebox/plugins/chrome/tests/chrome_test_helpers.py
@@ -229,6 +229,33 @@ def get_extensions_dir() -> str:
return str(Path(data_dir) / 'personas' / persona / 'chrome_extensions')
+def link_puppeteer_cache(lib_dir: Path) -> None:
+ """Best-effort symlink from system Puppeteer cache into test lib_dir.
+
+ Avoids repeated Chromium downloads across tests by reusing the
+ default Puppeteer cache directory.
+ """
+ cache_dir = lib_dir / 'puppeteer'
+ cache_dir.mkdir(parents=True, exist_ok=True)
+
+ candidates = [
+ Path.home() / 'Library' / 'Caches' / 'puppeteer',
+ Path.home() / '.cache' / 'puppeteer',
+ ]
+ for src_root in candidates:
+ if not src_root.exists():
+ continue
+ for item in src_root.iterdir():
+ dst = cache_dir / item.name
+ if dst.exists():
+ continue
+ try:
+ os.symlink(item, dst, target_is_directory=item.is_dir())
+ except Exception:
+ # Best-effort only; if symlink fails, leave as-is.
+ pass
+
+
def find_chromium(data_dir: Optional[str] = None) -> Optional[str]:
"""Find the Chromium binary path.
@@ -632,9 +659,8 @@ def setup_test_env(tmpdir: Path) -> dict:
tmpdir: Base temporary directory for the test
Returns:
- Environment dict with all paths set, or pytest.skip() if Chrome install fails
+ Environment dict with all paths set.
"""
- import pytest
# Determine machine type (matches archivebox.config.paths.get_machine_type())
machine = platform.machine().lower()
@@ -688,7 +714,7 @@ def setup_test_env(tmpdir: Path) -> dict:
try:
install_chromium_with_hooks(env)
except RuntimeError as e:
- pytest.skip(str(e))
+        raise
return env
@@ -873,6 +899,7 @@ def chrome_session(
lib_dir = data_dir / 'lib' / machine_type
npm_dir = lib_dir / 'npm'
node_modules_dir = npm_dir / 'node_modules'
+ puppeteer_cache_dir = lib_dir / 'puppeteer'
# Create lib structure for puppeteer installation
node_modules_dir.mkdir(parents=True, exist_ok=True)
@@ -893,8 +920,12 @@ def chrome_session(
'NODE_PATH': str(node_modules_dir),
'NPM_BIN_DIR': str(npm_dir / '.bin'),
'CHROME_HEADLESS': 'true',
+ 'PUPPETEER_CACHE_DIR': str(puppeteer_cache_dir),
})
+ # Reuse system Puppeteer cache to avoid redundant Chromium downloads
+ link_puppeteer_cache(lib_dir)
+
# Install Chromium via npm + puppeteer hooks using normal Binary flow
install_chromium_with_hooks(env)
diff --git a/archivebox/plugins/chrome/tests/test_chrome.py b/archivebox/plugins/chrome/tests/test_chrome.py
index 554a2539..33d328c9 100644
--- a/archivebox/plugins/chrome/tests/test_chrome.py
+++ b/archivebox/plugins/chrome/tests/test_chrome.py
@@ -125,10 +125,10 @@ def ensure_chromium_and_puppeteer_installed(tmp_path_factory):
try:
chromium_binary = install_chromium_with_hooks(env)
except RuntimeError as e:
- pytest.skip(str(e))
+        raise
if not chromium_binary:
- pytest.skip("Chromium not found after install")
+ raise RuntimeError("Chromium not found after install")
os.environ['CHROME_BINARY'] = chromium_binary
for key in ('NODE_MODULES_DIR', 'NODE_PATH', 'PATH'):
diff --git a/archivebox/plugins/consolelog/tests/test_consolelog.py b/archivebox/plugins/consolelog/tests/test_consolelog.py
index 7d590aaa..ab851d15 100644
--- a/archivebox/plugins/consolelog/tests/test_consolelog.py
+++ b/archivebox/plugins/consolelog/tests/test_consolelog.py
@@ -13,27 +13,18 @@ import tempfile
import time
from pathlib import Path
-import pytest
from django.test import TestCase
# Import chrome test helpers
sys.path.insert(0, str(Path(__file__).parent.parent.parent / 'chrome' / 'tests'))
from chrome_test_helpers import (
chrome_session,
- get_test_env,
+ CHROME_NAVIGATE_HOOK,
get_plugin_dir,
get_hook_script,
)
-def chrome_available() -> bool:
- """Check if Chrome/Chromium is available."""
- for name in ['chromium', 'chromium-browser', 'google-chrome', 'chrome']:
- if shutil.which(name):
- return True
- return False
-
-
# Get the path to the consolelog hook
PLUGIN_DIR = get_plugin_dir(__file__)
CONSOLELOG_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_consolelog.*')
@@ -48,7 +39,6 @@ class TestConsolelogPlugin(TestCase):
self.assertTrue(CONSOLELOG_HOOK.exists(), f"Hook not found: {CONSOLELOG_HOOK}")
-@pytest.mark.skipif(not chrome_available(), reason="Chrome not installed")
class TestConsolelogWithChrome(TestCase):
"""Integration tests for consolelog plugin with Chrome."""
@@ -62,68 +52,75 @@ class TestConsolelogWithChrome(TestCase):
def test_consolelog_captures_output(self):
"""Consolelog hook should capture console output from page."""
- test_url = 'https://example.com'
+        test_url = 'data:text/html,<script>console.log("consolelog test output")</script>'
snapshot_id = 'test-consolelog-snapshot'
- try:
- with chrome_session(
- self.temp_dir,
- crawl_id='test-consolelog-crawl',
- snapshot_id=snapshot_id,
- test_url=test_url,
- navigate=True,
- timeout=30,
- ) as (chrome_process, chrome_pid, snapshot_chrome_dir, env):
- # Use the environment from chrome_session (already has CHROME_HEADLESS=true)
+ with chrome_session(
+ self.temp_dir,
+ crawl_id='test-consolelog-crawl',
+ snapshot_id=snapshot_id,
+ test_url=test_url,
+ navigate=False,
+ timeout=30,
+ ) as (chrome_process, chrome_pid, snapshot_chrome_dir, env):
+ console_dir = snapshot_chrome_dir.parent / 'consolelog'
+ console_dir.mkdir(exist_ok=True)
+ # Run consolelog hook with the active Chrome session (background hook)
+ result = subprocess.Popen(
+ ['node', str(CONSOLELOG_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
+ cwd=str(console_dir),
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ text=True,
+ env=env
+ )
- # Run consolelog hook with the active Chrome session (background hook)
- result = subprocess.Popen(
- ['node', str(CONSOLELOG_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
- cwd=str(snapshot_chrome_dir),
- stdout=subprocess.PIPE,
- stderr=subprocess.PIPE,
- text=True,
- env=env
- )
+ nav_result = subprocess.run(
+ ['node', str(CHROME_NAVIGATE_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
+ cwd=str(snapshot_chrome_dir),
+ capture_output=True,
+ text=True,
+ timeout=120,
+ env=env
+ )
+ self.assertEqual(nav_result.returncode, 0, f"Navigation failed: {nav_result.stderr}")
- # Check for output file
- console_output = snapshot_chrome_dir / 'console.jsonl'
+ # Check for output file
+ console_output = console_dir / 'console.jsonl'
- # Allow it to run briefly, then terminate (background hook)
- time.sleep(3)
- if result.poll() is None:
- result.terminate()
- try:
- stdout, stderr = result.communicate(timeout=5)
- except subprocess.TimeoutExpired:
- result.kill()
- stdout, stderr = result.communicate()
- else:
+ # Allow it to run briefly, then terminate (background hook)
+ for _ in range(10):
+ if console_output.exists() and console_output.stat().st_size > 0:
+ break
+ time.sleep(1)
+ if result.poll() is None:
+ result.terminate()
+ try:
+ stdout, stderr = result.communicate(timeout=5)
+ except subprocess.TimeoutExpired:
+ result.kill()
stdout, stderr = result.communicate()
+ else:
+ stdout, stderr = result.communicate()
- # At minimum, verify no crash
- self.assertNotIn('Traceback', stderr)
+ # At minimum, verify no crash
+ self.assertNotIn('Traceback', stderr)
- # If output file exists, verify it's valid JSONL
- if console_output.exists():
- with open(console_output) as f:
- content = f.read().strip()
- if content:
- for line in content.split('\n'):
- if line.strip():
- try:
- record = json.loads(line)
- # Verify structure
- self.assertIn('timestamp', record)
- self.assertIn('type', record)
- except json.JSONDecodeError:
- pass # Some lines may be incomplete
-
- except RuntimeError as e:
- if 'Chrome' in str(e) or 'CDP' in str(e):
- self.skipTest(f"Chrome session setup failed: {e}")
- raise
+ # If output file exists, verify it's valid JSONL and has output
+ if console_output.exists():
+ with open(console_output) as f:
+ content = f.read().strip()
+ self.assertTrue(content, "Console output should not be empty")
+ for line in content.split('\n'):
+ if line.strip():
+ try:
+ record = json.loads(line)
+ # Verify structure
+ self.assertIn('timestamp', record)
+ self.assertIn('type', record)
+ except json.JSONDecodeError:
+ pass # Some lines may be incomplete
if __name__ == '__main__':
diff --git a/archivebox/plugins/dns/tests/test_dns.py b/archivebox/plugins/dns/tests/test_dns.py
new file mode 100644
index 00000000..ac10a478
--- /dev/null
+++ b/archivebox/plugins/dns/tests/test_dns.py
@@ -0,0 +1,126 @@
+"""
+Tests for the DNS plugin.
+
+Tests the real DNS hook with an actual URL to verify
+DNS resolution capture.
+"""
+
+import json
+import shutil
+import subprocess
+import sys
+import tempfile
+import time
+from pathlib import Path
+
+from django.test import TestCase
+
+# Import chrome test helpers
+sys.path.insert(0, str(Path(__file__).parent.parent.parent / 'chrome' / 'tests'))
+from chrome_test_helpers import (
+ chrome_session,
+ CHROME_NAVIGATE_HOOK,
+ get_plugin_dir,
+ get_hook_script,
+)
+
+
+# Get the path to the DNS hook
+PLUGIN_DIR = get_plugin_dir(__file__)
+DNS_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_dns.*')
+
+
+class TestDNSPlugin(TestCase):
+ """Test the DNS plugin."""
+
+ def test_dns_hook_exists(self):
+ """DNS hook script should exist."""
+ self.assertIsNotNone(DNS_HOOK, "DNS hook not found in plugin directory")
+ self.assertTrue(DNS_HOOK.exists(), f"Hook not found: {DNS_HOOK}")
+
+
+class TestDNSWithChrome(TestCase):
+ """Integration tests for DNS plugin with Chrome."""
+
+ def setUp(self):
+ """Set up test environment."""
+ self.temp_dir = Path(tempfile.mkdtemp())
+
+ def tearDown(self):
+ """Clean up."""
+ shutil.rmtree(self.temp_dir, ignore_errors=True)
+
+ def test_dns_records_captured(self):
+ """DNS hook should capture DNS records from a real URL."""
+ test_url = 'https://example.com'
+ snapshot_id = 'test-dns-snapshot'
+
+ with chrome_session(
+ self.temp_dir,
+ crawl_id='test-dns-crawl',
+ snapshot_id=snapshot_id,
+ test_url=test_url,
+ navigate=False,
+ timeout=30,
+ ) as (_process, _pid, snapshot_chrome_dir, env):
+ dns_dir = snapshot_chrome_dir.parent / 'dns'
+ dns_dir.mkdir(exist_ok=True)
+
+ result = subprocess.Popen(
+ ['node', str(DNS_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
+ cwd=str(dns_dir),
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ text=True,
+ env=env
+ )
+
+ nav_result = subprocess.run(
+ ['node', str(CHROME_NAVIGATE_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
+ cwd=str(snapshot_chrome_dir),
+ capture_output=True,
+ text=True,
+ timeout=120,
+ env=env
+ )
+ self.assertEqual(nav_result.returncode, 0, f"Navigation failed: {nav_result.stderr}")
+
+ dns_output = dns_dir / 'dns.jsonl'
+ for _ in range(30):
+ if dns_output.exists() and dns_output.stat().st_size > 0:
+ break
+ time.sleep(1)
+
+ if result.poll() is None:
+ result.terminate()
+ try:
+ stdout, stderr = result.communicate(timeout=5)
+ except subprocess.TimeoutExpired:
+ result.kill()
+ stdout, stderr = result.communicate()
+ else:
+ stdout, stderr = result.communicate()
+
+ self.assertNotIn('Traceback', stderr)
+
+ self.assertTrue(dns_output.exists(), "dns.jsonl not created")
+ content = dns_output.read_text().strip()
+ self.assertTrue(content, "DNS output should not be empty")
+
+ records = []
+ for line in content.split('\n'):
+ line = line.strip()
+ if not line:
+ continue
+ try:
+ records.append(json.loads(line))
+ except json.JSONDecodeError:
+ pass
+
+ self.assertTrue(records, "No DNS records parsed")
+ has_ip_record = any(r.get('hostname') and r.get('ip') for r in records)
+ self.assertTrue(has_ip_record, f"No DNS record with hostname + ip: {records}")
+
+
+if __name__ == '__main__':
+    import pytest; pytest.main([__file__, '-v'])
diff --git a/archivebox/plugins/dom/on_Snapshot__53_dom.js b/archivebox/plugins/dom/on_Snapshot__53_dom.js
index f62662f8..db8a2420 100644
--- a/archivebox/plugins/dom/on_Snapshot__53_dom.js
+++ b/archivebox/plugins/dom/on_Snapshot__53_dom.js
@@ -2,19 +2,12 @@
/**
* Dump the DOM of a URL using Chrome/Puppeteer.
*
- * If a Chrome session exists (from chrome plugin), connects to it via CDP.
- * Otherwise launches a new Chrome instance.
+ * Requires a Chrome session (from chrome plugin) and connects to it via CDP.
*
 * Usage: on_Snapshot__53_dom.js --url=<url> --snapshot-id=<snapshot-id>
* Output: Writes dom/output.html
*
* Environment variables:
- * CHROME_BINARY: Path to Chrome/Chromium binary
- * CHROME_TIMEOUT: Timeout in seconds (default: 60)
- * CHROME_RESOLUTION: Page resolution (default: 1440,2000)
- * CHROME_USER_AGENT: User agent string (optional)
- * CHROME_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: true)
- * CHROME_HEADLESS: Run in headless mode (default: true)
* DOM_ENABLED: Enable DOM extraction (default: true)
*/
@@ -24,11 +17,7 @@ const path = require('path');
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const {
- findChromium,
- getEnv,
getEnvBool,
- getEnvInt,
- parseResolution,
parseArgs,
readCdpUrl,
} = require('../chrome/chrome_utils.js');
@@ -86,81 +75,30 @@ async function waitForChromeTabLoaded(timeoutMs = 60000) {
}
async function dumpDom(url) {
- const timeout = (getEnvInt('CHROME_TIMEOUT') || getEnvInt('TIMEOUT', 60)) * 1000;
- const resolution = getEnv('CHROME_RESOLUTION') || getEnv('RESOLUTION', '1440,2000');
- const userAgent = getEnv('CHROME_USER_AGENT') || getEnv('USER_AGENT', '');
- const checkSsl = getEnvBool('CHROME_CHECK_SSL_VALIDITY', getEnvBool('CHECK_SSL_VALIDITY', true));
- const headless = getEnvBool('CHROME_HEADLESS', true);
-
- const { width, height } = parseResolution(resolution);
-
// Output directory is current directory (hook already runs in output dir)
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
let browser = null;
let page = null;
- let connectedToSession = false;
try {
- // Try to connect to existing Chrome session
+ // Connect to existing Chrome session (required)
const cdpUrl = readCdpUrl(CHROME_SESSION_DIR);
- if (cdpUrl) {
- try {
- browser = await puppeteer.connect({
- browserWSEndpoint: cdpUrl,
- defaultViewport: { width, height },
- });
- connectedToSession = true;
-
- // Get existing pages or create new one
- const pages = await browser.pages();
- page = pages.find(p => p.url().startsWith('http')) || pages[0];
-
- if (!page) {
- page = await browser.newPage();
- }
-
- // Set viewport on the page
- await page.setViewport({ width, height });
-
- } catch (e) {
- console.error(`Failed to connect to CDP session: ${e.message}`);
- browser = null;
- }
+ if (!cdpUrl) {
+ return { success: false, error: 'No Chrome session found (chrome plugin must run first)' };
}
- // Fall back to launching new browser
- if (!browser) {
- const executablePath = findChromium();
- if (!executablePath) {
- return { success: false, error: 'Chrome binary not found' };
- }
+ browser = await puppeteer.connect({
+ browserWSEndpoint: cdpUrl,
+ defaultViewport: null,
+ });
- browser = await puppeteer.launch({
- executablePath,
- headless: headless ? 'new' : false,
- args: [
- '--no-sandbox',
- '--disable-setuid-sandbox',
- '--disable-dev-shm-usage',
- '--disable-gpu',
- `--window-size=${width},${height}`,
- ...(checkSsl ? [] : ['--ignore-certificate-errors']),
- ],
- defaultViewport: { width, height },
- });
+ // Get existing pages or create new one
+ const pages = await browser.pages();
+ page = pages.find(p => p.url().startsWith('http')) || pages[0];
+ if (!page) {
page = await browser.newPage();
-
- // Navigate to URL (only if we launched fresh browser)
- if (userAgent) {
- await page.setUserAgent(userAgent);
- }
-
- await page.goto(url, {
- waitUntil: 'networkidle2',
- timeout,
- });
}
// Get the full DOM content
@@ -176,9 +114,8 @@ async function dumpDom(url) {
} catch (e) {
return { success: false, error: `${e.name}: ${e.message}` };
} finally {
- // Only close browser if we launched it (not if we connected to session)
- if (browser && !connectedToSession) {
- await browser.close();
+ if (browser) {
+ browser.disconnect();
}
}
}
@@ -206,14 +143,15 @@ async function main() {
process.exit(0);
}
- // Only wait for page load if using shared Chrome session
const cdpUrl = readCdpUrl(CHROME_SESSION_DIR);
- if (cdpUrl) {
- // Wait for page to be fully loaded
- const pageLoaded = await waitForChromeTabLoaded(60000);
- if (!pageLoaded) {
- throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
- }
+ if (!cdpUrl) {
+ throw new Error('No Chrome session found (chrome plugin must run first)');
+ }
+
+ // Wait for page to be fully loaded
+ const pageLoaded = await waitForChromeTabLoaded(60000);
+ if (!pageLoaded) {
+ throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
}
const result = await dumpDom(url);
diff --git a/archivebox/plugins/dom/tests/test_dom.py b/archivebox/plugins/dom/tests/test_dom.py
index 7312a72f..2d98d873 100644
--- a/archivebox/plugins/dom/tests/test_dom.py
+++ b/archivebox/plugins/dom/tests/test_dom.py
@@ -28,6 +28,7 @@ from archivebox.plugins.chrome.tests.chrome_test_helpers import (
LIB_DIR,
NODE_MODULES_DIR,
PLUGINS_ROOT,
+ chrome_session,
)
@@ -61,15 +62,19 @@ def test_extracts_dom_from_example_com():
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
- # Run DOM extraction hook
- result = subprocess.run(
- ['node', str(DOM_HOOK), f'--url={TEST_URL}', '--snapshot-id=test789'],
- cwd=tmpdir,
- capture_output=True,
- text=True,
- timeout=120
- ,
- env=get_test_env())
+ with chrome_session(tmpdir, test_url=TEST_URL) as (_process, _pid, snapshot_chrome_dir, env):
+ dom_dir = snapshot_chrome_dir.parent / 'dom'
+ dom_dir.mkdir(exist_ok=True)
+
+ # Run DOM extraction hook
+ result = subprocess.run(
+ ['node', str(DOM_HOOK), f'--url={TEST_URL}', '--snapshot-id=test789'],
+ cwd=dom_dir,
+ capture_output=True,
+ text=True,
+ timeout=120,
+ env=env
+ )
assert result.returncode == 0, f"Extraction failed: {result.stderr}"
@@ -90,7 +95,7 @@ def test_extracts_dom_from_example_com():
assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
# Verify filesystem output (hook writes directly to working dir)
- dom_file = tmpdir / 'output.html'
+ dom_file = dom_dir / 'output.html'
assert dom_file.exists(), f"output.html not created. Files: {list(tmpdir.iterdir())}"
# Verify HTML content contains REAL example.com text
diff --git a/archivebox/plugins/favicon/on_Snapshot__11_favicon.bg.py b/archivebox/plugins/favicon/on_Snapshot__11_favicon.bg.py
index cb62dfe3..fc4604f4 100644
--- a/archivebox/plugins/favicon/on_Snapshot__11_favicon.bg.py
+++ b/archivebox/plugins/favicon/on_Snapshot__11_favicon.bg.py
@@ -128,8 +128,6 @@ def main(url: str, snapshot_id: str):
success, output, error = get_favicon(url)
if success:
status = 'succeeded'
- elif error == 'No favicon found':
- status = 'skipped'
else:
status = 'failed'
@@ -148,7 +146,7 @@ def main(url: str, snapshot_id: str):
}
print(json.dumps(result))
- sys.exit(0 if status in ('succeeded', 'skipped') else 1)
+ sys.exit(0 if status == 'succeeded' else 1)
if __name__ == '__main__':
diff --git a/archivebox/plugins/favicon/templates/card.html b/archivebox/plugins/favicon/templates/card.html
index 8555e174..c5df1617 100644
--- a/archivebox/plugins/favicon/templates/card.html
+++ b/archivebox/plugins/favicon/templates/card.html
@@ -3,7 +3,7 @@
{% if output_path %}
{% endif %}
diff --git a/archivebox/plugins/forumdl/on_Crawl__25_forumdl_install.py b/archivebox/plugins/forumdl/on_Crawl__25_forumdl_install.py
index 73a72a24..b30ca715 100755
--- a/archivebox/plugins/forumdl/on_Crawl__25_forumdl_install.py
+++ b/archivebox/plugins/forumdl/on_Crawl__25_forumdl_install.py
@@ -48,7 +48,9 @@ def main():
'pip': {
'packages': [
'--no-deps',
+ '--prefer-binary',
'forum-dl',
+ 'chardet==5.2.0',
'pydantic',
'pydantic-core',
'typing-extensions',
diff --git a/archivebox/plugins/forumdl/tests/test_forumdl.py b/archivebox/plugins/forumdl/tests/test_forumdl.py
index f965d898..18a692c9 100644
--- a/archivebox/plugins/forumdl/tests/test_forumdl.py
+++ b/archivebox/plugins/forumdl/tests/test_forumdl.py
@@ -13,6 +13,7 @@ Tests verify:
"""
import json
+import os
import subprocess
import sys
import tempfile
@@ -28,6 +29,7 @@ TEST_URL = 'https://example.com'
# Module-level cache for binary path
_forumdl_binary_path = None
+_forumdl_lib_root = None
def get_forumdl_binary_path():
"""Get the installed forum-dl binary path from cache or by running installation."""
@@ -50,11 +52,48 @@ def get_forumdl_binary_path():
except Exception:
pass
- # If not found, try to install via pip
- pip_hook = PLUGINS_ROOT / 'pip' / 'on_Binary__install_using_pip_provider.py'
+ # If not found, try to install via pip using the crawl hook overrides
+ pip_hook = PLUGINS_ROOT / 'pip' / 'on_Binary__11_pip_install.py'
+ crawl_hook = PLUGIN_DIR / 'on_Crawl__25_forumdl_install.py'
if pip_hook.exists():
binary_id = str(uuid.uuid4())
machine_id = str(uuid.uuid4())
+ overrides = None
+
+ if crawl_hook.exists():
+ crawl_result = subprocess.run(
+ [sys.executable, str(crawl_hook)],
+ capture_output=True,
+ text=True,
+ timeout=30,
+ )
+ for crawl_line in crawl_result.stdout.strip().split('\n'):
+ if crawl_line.strip().startswith('{'):
+ try:
+ crawl_record = json.loads(crawl_line)
+ if crawl_record.get('type') == 'Binary' and crawl_record.get('name') == 'forum-dl':
+ overrides = crawl_record.get('overrides')
+ break
+ except json.JSONDecodeError:
+ continue
+
+ # Create a persistent temp LIB_DIR for the pip provider
+ import platform
+ global _forumdl_lib_root
+ if not _forumdl_lib_root:
+ _forumdl_lib_root = tempfile.mkdtemp(prefix='forumdl-lib-')
+ machine = platform.machine().lower()
+ system = platform.system().lower()
+ if machine in ('arm64', 'aarch64'):
+ machine = 'arm64'
+ elif machine in ('x86_64', 'amd64'):
+ machine = 'x86_64'
+ machine_type = f"{machine}-{system}"
+ lib_dir = Path(_forumdl_lib_root) / 'lib' / machine_type
+ lib_dir.mkdir(parents=True, exist_ok=True)
+ env = os.environ.copy()
+ env['LIB_DIR'] = str(lib_dir)
+ env['DATA_DIR'] = str(Path(_forumdl_lib_root) / 'data')
cmd = [
sys.executable, str(pip_hook),
@@ -62,12 +101,15 @@ def get_forumdl_binary_path():
'--machine-id', machine_id,
'--name', 'forum-dl'
]
+ if overrides:
+ cmd.append(f'--overrides={json.dumps(overrides)}')
install_result = subprocess.run(
cmd,
capture_output=True,
text=True,
- timeout=300
+ timeout=300,
+ env=env,
)
# Parse Binary from pip installation
@@ -212,8 +254,7 @@ def test_real_forum_url():
import os
binary_path = get_forumdl_binary_path()
- if not binary_path:
- pytest.skip("forum-dl binary not available")
+ assert binary_path, "forum-dl binary not available"
assert Path(binary_path).is_file(), f"Binary must be a valid file: {binary_path}"
with tempfile.TemporaryDirectory() as tmpdir:
diff --git a/archivebox/plugins/git/tests/test_git.py b/archivebox/plugins/git/tests/test_git.py
index 7701039a..c7449495 100644
--- a/archivebox/plugins/git/tests/test_git.py
+++ b/archivebox/plugins/git/tests/test_git.py
@@ -19,7 +19,7 @@ import pytest
PLUGIN_DIR = Path(__file__).parent.parent
GIT_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_git.*'), None)
-TEST_URL = 'https://github.com/example/repo.git'
+TEST_URL = 'https://github.com/ArchiveBox/abx-pkg.git'
def test_hook_script_exists():
assert GIT_HOOK.exists()
@@ -31,10 +31,7 @@ def test_verify_deps_with_abx_pkg():
git_binary = Binary(name='git', binproviders=[AptProvider(), BrewProvider(), EnvProvider()])
git_loaded = git_binary.load()
- if git_loaded and git_loaded.abspath:
- assert True, "git is available"
- else:
- pass
+ assert git_loaded and git_loaded.abspath, "git is required for git plugin tests"
def test_reports_missing_git():
with tempfile.TemporaryDirectory() as tmpdir:
@@ -48,9 +45,7 @@ def test_reports_missing_git():
assert 'DEPENDENCY_NEEDED' in combined or 'git' in combined.lower() or 'ERROR=' in combined
def test_handles_non_git_url():
- pass
- if not shutil.which('git'):
- pass
+ assert shutil.which('git'), "git binary not available"
with tempfile.TemporaryDirectory() as tmpdir:
result = subprocess.run(
@@ -83,8 +78,7 @@ def test_real_git_repo():
"""Test that git can clone a real GitHub repository."""
import os
- if not shutil.which('git'):
- pytest.skip("git binary not available")
+ assert shutil.which('git'), "git binary not available"
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
diff --git a/archivebox/plugins/merkletree/config.json b/archivebox/plugins/hashes/config.json
similarity index 78%
rename from archivebox/plugins/merkletree/config.json
rename to archivebox/plugins/hashes/config.json
index 6070a026..b57db14a 100644
--- a/archivebox/plugins/merkletree/config.json
+++ b/archivebox/plugins/hashes/config.json
@@ -3,13 +3,13 @@
"type": "object",
"additionalProperties": false,
"properties": {
- "MERKLETREE_ENABLED": {
+ "HASHES_ENABLED": {
"type": "boolean",
"default": true,
- "x-aliases": ["SAVE_MERKLETREE", "USE_MERKLETREE"],
+ "x-aliases": ["SAVE_HASHES", "USE_HASHES"],
"description": "Enable merkle tree hash generation"
},
- "MERKLETREE_TIMEOUT": {
+ "HASHES_TIMEOUT": {
"type": "integer",
"default": 30,
"minimum": 5,
diff --git a/archivebox/plugins/merkletree/on_Snapshot__93_merkletree.py b/archivebox/plugins/hashes/on_Snapshot__93_hashes.py
similarity index 84%
rename from archivebox/plugins/merkletree/on_Snapshot__93_merkletree.py
rename to archivebox/plugins/hashes/on_Snapshot__93_hashes.py
index 164a0f6a..2738d85f 100755
--- a/archivebox/plugins/merkletree/on_Snapshot__93_merkletree.py
+++ b/archivebox/plugins/hashes/on_Snapshot__93_hashes.py
@@ -1,16 +1,16 @@
#!/usr/bin/env python3
"""
-Create a Merkle tree of all archived outputs.
+Create a hashed Merkle tree of all archived outputs.
This plugin runs after all extractors complete (priority 93) and generates
-a cryptographic Merkle tree of all files in the snapshot directory.
+a cryptographic Merkle hash tree of all files in the snapshot directory.
-Output: merkletree.json containing root_hash, tree structure, file list, metadata
+Output: hashes.json containing root_hash, tree structure, file list, metadata
-Usage: on_Snapshot__93_merkletree.py --url=<url> --snapshot-id=<snapshot-id>
+Usage: on_Snapshot__93_hashes.py --url=<url> --snapshot-id=<snapshot-id>
Environment variables:
- SAVE_MERKLETREE: Enable merkle tree generation (default: true)
+ SAVE_HASHES: Enable hash merkle tree generation (default: true)
DATA_DIR: ArchiveBox data directory
ARCHIVE_DIR: Archive output directory
"""
@@ -45,7 +45,7 @@ def sha256_data(data: bytes) -> str:
def collect_files(snapshot_dir: Path, exclude_dirs: Optional[List[str]] = None) -> List[Tuple[Path, str, int]]:
"""Recursively collect all files in snapshot directory."""
- exclude_dirs = exclude_dirs or ['merkletree', '.git', '__pycache__']
+ exclude_dirs = exclude_dirs or ['hashes', '.git', '__pycache__']
files = []
for root, dirs, filenames in os.walk(snapshot_dir):
@@ -94,8 +94,8 @@ def build_merkle_tree(file_hashes: List[str]) -> Tuple[str, List[List[str]]]:
return root_hash, tree_levels
-def create_merkle_tree(snapshot_dir: Path) -> Dict[str, Any]:
- """Create a complete Merkle tree of all files in snapshot directory."""
+def create_hashes(snapshot_dir: Path) -> Dict[str, Any]:
+ """Create a complete Merkle hash tree of all files in snapshot directory."""
files = collect_files(snapshot_dir)
file_hashes = [file_hash for _, file_hash, _ in files]
root_hash, tree_levels = build_merkle_tree(file_hashes)
@@ -132,14 +132,14 @@ def main(url: str, snapshot_id: str):
try:
# Check if enabled
- save_merkletree = os.getenv('MERKLETREE_ENABLED', 'true').lower() in ('true', '1', 'yes', 'on')
+ save_hashes = os.getenv('HASHES_ENABLED', 'true').lower() in ('true', '1', 'yes', 'on')
- if not save_merkletree:
+ if not save_hashes:
status = 'skipped'
- click.echo(json.dumps({'status': status, 'output': 'MERKLETREE_ENABLED=false'}))
+ click.echo(json.dumps({'status': status, 'output': 'HASHES_ENABLED=false'}))
sys.exit(0)
- # Working directory is the extractor output dir (e.g., /merkletree/)
+ # Working directory is the extractor output dir (e.g., /hashes/)
# Parent is the snapshot directory
output_dir = Path.cwd()
snapshot_dir = output_dir.parent
@@ -149,17 +149,17 @@ def main(url: str, snapshot_id: str):
# Ensure output directory exists
output_dir.mkdir(exist_ok=True)
- output_path = output_dir / 'merkletree.json'
+ output_path = output_dir / 'hashes.json'
# Generate Merkle tree
- merkle_data = create_merkle_tree(snapshot_dir)
+ merkle_data = create_hashes(snapshot_dir)
# Write output
with open(output_path, 'w', encoding='utf-8') as f:
json.dump(merkle_data, f, indent=2)
status = 'succeeded'
- output = 'merkletree.json'
+ output = 'hashes.json'
root_hash = merkle_data['root_hash']
file_count = merkle_data['metadata']['file_count']
diff --git a/archivebox/plugins/hashes/templates/icon.html b/archivebox/plugins/hashes/templates/icon.html
new file mode 100644
index 00000000..211930f0
--- /dev/null
+++ b/archivebox/plugins/hashes/templates/icon.html
@@ -0,0 +1 @@
+
diff --git a/archivebox/plugins/merkletree/tests/test_merkletree.py b/archivebox/plugins/hashes/tests/test_hashes.py
similarity index 71%
rename from archivebox/plugins/merkletree/tests/test_merkletree.py
rename to archivebox/plugins/hashes/tests/test_hashes.py
index ebdd5808..0eb7d7f1 100644
--- a/archivebox/plugins/merkletree/tests/test_merkletree.py
+++ b/archivebox/plugins/hashes/tests/test_hashes.py
@@ -1,5 +1,5 @@
"""
-Tests for the merkletree plugin.
+Tests for the hashes plugin.
Tests the real merkle tree generation with actual files.
"""
@@ -15,27 +15,27 @@ import pytest
from django.test import TestCase
-# Get the path to the merkletree hook
+# Get the path to the hashes hook
PLUGIN_DIR = Path(__file__).parent.parent
-MERKLETREE_HOOK = PLUGIN_DIR / 'on_Snapshot__93_merkletree.py'
+HASHES_HOOK = PLUGIN_DIR / 'on_Snapshot__93_hashes.py'
-class TestMerkletreePlugin(TestCase):
- """Test the merkletree plugin."""
+class TestHashesPlugin(TestCase):
+ """Test the hashes plugin."""
- def test_merkletree_hook_exists(self):
- """Merkletree hook script should exist."""
- self.assertTrue(MERKLETREE_HOOK.exists(), f"Hook not found: {MERKLETREE_HOOK}")
+ def test_hashes_hook_exists(self):
+ """Hashes hook script should exist."""
+ self.assertTrue(HASHES_HOOK.exists(), f"Hook not found: {HASHES_HOOK}")
- def test_merkletree_generates_tree_for_files(self):
- """Merkletree hook should generate merkle tree for files in snapshot directory."""
+ def test_hashes_generates_tree_for_files(self):
+ """Hashes hook should generate merkle tree for files in snapshot directory."""
with tempfile.TemporaryDirectory() as temp_dir:
# Create a mock snapshot directory structure
snapshot_dir = Path(temp_dir) / 'snapshot'
snapshot_dir.mkdir()
- # Create output directory for merkletree
- output_dir = snapshot_dir / 'merkletree'
+ # Create output directory for hashes
+ output_dir = snapshot_dir / 'hashes'
output_dir.mkdir()
# Create some test files
@@ -48,11 +48,11 @@ class TestMerkletreePlugin(TestCase):
# Run the hook from the output directory
env = os.environ.copy()
- env['MERKLETREE_ENABLED'] = 'true'
+ env['HASHES_ENABLED'] = 'true'
result = subprocess.run(
[
- sys.executable, str(MERKLETREE_HOOK),
+ sys.executable, str(HASHES_HOOK),
'--url=https://example.com',
'--snapshot-id=test-snapshot',
],
@@ -67,8 +67,8 @@ class TestMerkletreePlugin(TestCase):
self.assertEqual(result.returncode, 0, f"Hook failed: {result.stderr}")
# Check output file exists
- output_file = output_dir / 'merkletree.json'
- self.assertTrue(output_file.exists(), "merkletree.json not created")
+ output_file = output_dir / 'hashes.json'
+ self.assertTrue(output_file.exists(), "hashes.json not created")
# Parse and verify output
with open(output_file) as f:
@@ -87,20 +87,20 @@ class TestMerkletreePlugin(TestCase):
self.assertGreater(data['metadata']['file_count'], 0)
self.assertGreater(data['metadata']['total_size'], 0)
- def test_merkletree_skips_when_disabled(self):
- """Merkletree hook should skip when MERKLETREE_ENABLED=false."""
+ def test_hashes_skips_when_disabled(self):
+ """Hashes hook should skip when HASHES_ENABLED=false."""
with tempfile.TemporaryDirectory() as temp_dir:
snapshot_dir = Path(temp_dir) / 'snapshot'
snapshot_dir.mkdir()
- output_dir = snapshot_dir / 'merkletree'
+ output_dir = snapshot_dir / 'hashes'
output_dir.mkdir()
env = os.environ.copy()
- env['MERKLETREE_ENABLED'] = 'false'
+ env['HASHES_ENABLED'] = 'false'
result = subprocess.run(
[
- sys.executable, str(MERKLETREE_HOOK),
+ sys.executable, str(HASHES_HOOK),
'--url=https://example.com',
'--snapshot-id=test-snapshot',
],
@@ -115,20 +115,20 @@ class TestMerkletreePlugin(TestCase):
self.assertEqual(result.returncode, 0)
self.assertIn('skipped', result.stdout)
- def test_merkletree_handles_empty_directory(self):
- """Merkletree hook should handle empty snapshot directory."""
+ def test_hashes_handles_empty_directory(self):
+ """Hashes hook should handle empty snapshot directory."""
with tempfile.TemporaryDirectory() as temp_dir:
snapshot_dir = Path(temp_dir) / 'snapshot'
snapshot_dir.mkdir()
- output_dir = snapshot_dir / 'merkletree'
+ output_dir = snapshot_dir / 'hashes'
output_dir.mkdir()
env = os.environ.copy()
- env['MERKLETREE_ENABLED'] = 'true'
+ env['HASHES_ENABLED'] = 'true'
result = subprocess.run(
[
- sys.executable, str(MERKLETREE_HOOK),
+ sys.executable, str(HASHES_HOOK),
'--url=https://example.com',
'--snapshot-id=test-snapshot',
],
@@ -143,7 +143,7 @@ class TestMerkletreePlugin(TestCase):
self.assertEqual(result.returncode, 0, f"Hook failed: {result.stderr}")
# Check output file exists
- output_file = output_dir / 'merkletree.json'
+ output_file = output_dir / 'hashes.json'
self.assertTrue(output_file.exists())
with open(output_file) as f:
diff --git a/archivebox/plugins/headers/on_Snapshot__27_headers.bg.js b/archivebox/plugins/headers/on_Snapshot__27_headers.bg.js
new file mode 100644
index 00000000..7ca72994
--- /dev/null
+++ b/archivebox/plugins/headers/on_Snapshot__27_headers.bg.js
@@ -0,0 +1,247 @@
+#!/usr/bin/env node
+/**
+ * Capture original request + response headers for the main navigation.
+ *
+ * This hook sets up CDP listeners BEFORE chrome_navigate loads the page,
+ * then waits for navigation to complete. It records the first top-level
+ * request headers and the corresponding response headers (with :status).
+ *
+ * Usage: on_Snapshot__27_headers.bg.js --url=<url> --snapshot-id=<snapshot-id>
+ * Output: Writes headers.json
+ */
+
+const fs = require('fs');
+const path = require('path');
+
+// Add NODE_MODULES_DIR to module resolution paths if set
+if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
+
+const puppeteer = require('puppeteer-core');
+
+// Import shared utilities from chrome_utils.js
+const {
+ getEnvBool,
+ getEnvInt,
+ parseArgs,
+ connectToPage,
+ waitForPageLoaded,
+} = require('../chrome/chrome_utils.js');
+
+const PLUGIN_NAME = 'headers';
+const OUTPUT_DIR = '.';
+const OUTPUT_FILE = 'headers.json';
+const CHROME_SESSION_DIR = '../chrome';
+const CHROME_SESSION_REQUIRED_ERROR = 'No Chrome session found (chrome plugin must run first)';
+
+let browser = null;
+let page = null;
+let client = null;
+let shuttingDown = false;
+let headersWritten = false;
+
+let requestId = null;
+let requestUrl = null;
+let requestHeaders = null;
+let responseHeaders = null;
+let responseStatus = null;
+let responseStatusText = null;
+let responseUrl = null;
+let originalUrl = null;
+
+function getFinalUrl() {
+ const finalUrlFile = path.join(CHROME_SESSION_DIR, 'final_url.txt');
+ if (fs.existsSync(finalUrlFile)) {
+ return fs.readFileSync(finalUrlFile, 'utf8').trim();
+ }
+ return page ? page.url() : null;
+}
+
+function writeHeadersFile() {
+ if (headersWritten) return;
+ if (!responseHeaders) return;
+
+ const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
+ const responseHeadersWithStatus = {
+ ...(responseHeaders || {}),
+ };
+
+ if (responseStatus !== null && responseStatus !== undefined &&
+ responseHeadersWithStatus[':status'] === undefined) {
+ responseHeadersWithStatus[':status'] = String(responseStatus);
+ }
+
+ const record = {
+ url: requestUrl || originalUrl,
+ final_url: getFinalUrl(),
+ status: responseStatus !== undefined ? responseStatus : null,
+ request_headers: requestHeaders || {},
+ response_headers: responseHeadersWithStatus,
+ headers: responseHeadersWithStatus, // backwards compatibility
+ };
+
+ if (responseStatusText) {
+ record.statusText = responseStatusText;
+ }
+ if (responseUrl) {
+ record.response_url = responseUrl;
+ }
+
+ fs.writeFileSync(outputPath, JSON.stringify(record, null, 2));
+ headersWritten = true;
+}
+
+async function setupListener(url) {
+ const timeout = getEnvInt('HEADERS_TIMEOUT', getEnvInt('TIMEOUT', 30)) * 1000;
+ const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
+ const targetIdFile = path.join(CHROME_SESSION_DIR, 'target_id.txt');
+ const pidFile = path.join(CHROME_SESSION_DIR, 'chrome.pid');
+
+ if (!fs.existsSync(cdpFile) || !fs.existsSync(targetIdFile) || !fs.existsSync(pidFile)) {
+ throw new Error(CHROME_SESSION_REQUIRED_ERROR);
+ }
+ try {
+ const pid = parseInt(fs.readFileSync(pidFile, 'utf8').trim(), 10);
+ if (!pid || Number.isNaN(pid)) throw new Error('Invalid pid');
+ process.kill(pid, 0);
+ } catch (e) {
+ throw new Error(CHROME_SESSION_REQUIRED_ERROR);
+ }
+
+ const { browser, page } = await connectToPage({
+ chromeSessionDir: CHROME_SESSION_DIR,
+ timeoutMs: timeout,
+ puppeteer,
+ });
+
+ client = await page.target().createCDPSession();
+ await client.send('Network.enable');
+
+ client.on('Network.requestWillBeSent', (params) => {
+ try {
+ if (requestId && !responseHeaders && params.redirectResponse && params.requestId === requestId) {
+ responseHeaders = params.redirectResponse.headers || {};
+ responseStatus = params.redirectResponse.status || null;
+ responseStatusText = params.redirectResponse.statusText || null;
+ responseUrl = params.redirectResponse.url || null;
+ writeHeadersFile();
+ }
+
+ if (requestId) return;
+ if (params.type && params.type !== 'Document') return;
+ if (!params.request || !params.request.url) return;
+ if (!params.request.url.startsWith('http')) return;
+
+ requestId = params.requestId;
+ requestUrl = params.request.url;
+ requestHeaders = params.request.headers || {};
+ } catch (e) {
+ // Ignore errors
+ }
+ });
+
+ client.on('Network.responseReceived', (params) => {
+ try {
+ if (!requestId || params.requestId !== requestId || responseHeaders) return;
+ const response = params.response || {};
+ responseHeaders = response.headers || {};
+ responseStatus = response.status || null;
+ responseStatusText = response.statusText || null;
+ responseUrl = response.url || null;
+ writeHeadersFile();
+ } catch (e) {
+ // Ignore errors
+ }
+ });
+
+ return { browser, page };
+}
+
+function emitResult(status = 'succeeded', outputStr = OUTPUT_FILE) {
+ if (shuttingDown) return;
+ shuttingDown = true;
+
+ console.log(JSON.stringify({
+ type: 'ArchiveResult',
+ status,
+ output_str: outputStr,
+ }));
+}
+
+async function handleShutdown(signal) {
+ console.error(`\nReceived ${signal}, emitting final results...`);
+ if (!headersWritten) {
+ writeHeadersFile();
+ }
+ if (headersWritten) {
+ emitResult('succeeded', OUTPUT_FILE);
+ } else {
+ emitResult('failed', 'No headers captured');
+ }
+
+ if (browser) {
+ try {
+ browser.disconnect();
+ } catch (e) {}
+ }
+ process.exit(headersWritten ? 0 : 1);
+}
+
+async function main() {
+ const args = parseArgs();
+ const url = args.url;
+ const snapshotId = args.snapshot_id;
+
+ if (!url || !snapshotId) {
+ console.error('Usage: on_Snapshot__27_headers.bg.js --url=<url> --snapshot-id=<snapshot-id>');
+ process.exit(1);
+ }
+
+ originalUrl = url;
+
+ if (!getEnvBool('HEADERS_ENABLED', true)) {
+ console.error('Skipping (HEADERS_ENABLED=False)');
+ console.log(JSON.stringify({type: 'ArchiveResult', status: 'skipped', output_str: 'HEADERS_ENABLED=False'}));
+ process.exit(0);
+ }
+
+ try {
+ // Set up listeners BEFORE navigation
+ const connection = await setupListener(url);
+ browser = connection.browser;
+ page = connection.page;
+
+ // Register signal handlers for graceful shutdown
+ process.on('SIGTERM', () => handleShutdown('SIGTERM'));
+ process.on('SIGINT', () => handleShutdown('SIGINT'));
+
+ // Wait for chrome_navigate to complete (non-fatal)
+ try {
+ const timeout = getEnvInt('HEADERS_TIMEOUT', getEnvInt('TIMEOUT', 30)) * 1000;
+ await waitForPageLoaded(CHROME_SESSION_DIR, timeout * 4, 200);
+ } catch (e) {
+ console.error(`WARN: ${e.message}`);
+ }
+
+ // Keep alive until SIGTERM
+ await new Promise(() => {});
+ return;
+
+ } catch (e) {
+ const errorMessage = (e && e.message)
+ ? `${e.name || 'Error'}: ${e.message}`
+ : String(e || 'Unknown error');
+ console.error(`ERROR: ${errorMessage}`);
+
+ console.log(JSON.stringify({
+ type: 'ArchiveResult',
+ status: 'failed',
+ output_str: errorMessage,
+ }));
+ process.exit(1);
+ }
+}
+
+main().catch(e => {
+ console.error(`Fatal error: ${e.message}`);
+ process.exit(1);
+});
diff --git a/archivebox/plugins/headers/on_Snapshot__55_headers.js b/archivebox/plugins/headers/on_Snapshot__55_headers.js
deleted file mode 100644
index 098b95e7..00000000
--- a/archivebox/plugins/headers/on_Snapshot__55_headers.js
+++ /dev/null
@@ -1,161 +0,0 @@
-#!/usr/bin/env node
-/**
- * Extract HTTP response headers for a URL.
- *
- * If a Chrome session exists (from chrome plugin), reads the captured
- * response headers from chrome plugin/response_headers.json.
- * Otherwise falls back to making an HTTP HEAD request.
- *
- * Usage: on_Snapshot__55_headers.js --url=<url> --snapshot-id=<snapshot-id>
- * Output: Writes headers/headers.json
- *
- * Environment variables:
- * TIMEOUT: Timeout in seconds (default: 30)
- * USER_AGENT: User agent string (optional)
- * CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: true)
- */
-
-const fs = require('fs');
-const path = require('path');
-const https = require('https');
-const http = require('http');
-// Add NODE_MODULES_DIR to module resolution paths if set
-if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
-
-const {
- getEnv,
- getEnvBool,
- getEnvInt,
- parseArgs,
-} = require('../chrome/chrome_utils.js');
-
-// Extractor metadata
-const PLUGIN_NAME = 'headers';
-const OUTPUT_DIR = '.';
-const OUTPUT_FILE = 'headers.json';
-const CHROME_SESSION_DIR = '../chrome';
-const CHROME_HEADERS_FILE = 'response_headers.json';
-
-// Get headers from chrome plugin if available
-function getHeadersFromChromeSession() {
- const headersFile = path.join(CHROME_SESSION_DIR, CHROME_HEADERS_FILE);
- if (fs.existsSync(headersFile)) {
- try {
- const data = JSON.parse(fs.readFileSync(headersFile, 'utf8'));
- return data;
- } catch (e) {
- return null;
- }
- }
- return null;
-}
-
-// Fetch headers via HTTP HEAD request (fallback)
-function fetchHeaders(url) {
- return new Promise((resolve, reject) => {
- const timeout = getEnvInt('TIMEOUT', 30) * 1000;
- const userAgent = getEnv('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)');
- const checkSsl = getEnvBool('CHECK_SSL_VALIDITY', getEnvBool('CHECK_SSL_VALIDITY', true));
-
- const parsedUrl = new URL(url);
- const client = parsedUrl.protocol === 'https:' ? https : http;
-
- const options = {
- method: 'HEAD',
- hostname: parsedUrl.hostname,
- port: parsedUrl.port || (parsedUrl.protocol === 'https:' ? 443 : 80),
- path: parsedUrl.pathname + parsedUrl.search,
- headers: { 'User-Agent': userAgent },
- timeout,
- rejectUnauthorized: checkSsl,
- };
-
- const req = client.request(options, (res) => {
- resolve({
- url: url,
- status: res.statusCode,
- statusText: res.statusMessage,
- headers: res.headers,
- });
- });
-
- req.on('error', reject);
- req.on('timeout', () => {
- req.destroy();
- reject(new Error('Request timeout'));
- });
-
- req.end();
- });
-}
-
-async function extractHeaders(url) {
- // Output directory is current directory (hook already runs in output dir)
- const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
-
- // Try Chrome session first
- const chromeHeaders = getHeadersFromChromeSession();
- if (chromeHeaders && chromeHeaders.headers) {
- fs.writeFileSync(outputPath, JSON.stringify(chromeHeaders, null, 2), 'utf8');
- return { success: true, output: outputPath, method: 'chrome', status: chromeHeaders.status };
- }
-
- // Fallback to HTTP HEAD request
- try {
- const headers = await fetchHeaders(url);
- fs.writeFileSync(outputPath, JSON.stringify(headers, null, 2), 'utf8');
- return { success: true, output: outputPath, method: 'http', status: headers.status };
- } catch (e) {
- return { success: false, error: e.message };
- }
-}
-
-async function main() {
- const args = parseArgs();
- const url = args.url;
- const snapshotId = args.snapshot_id;
-
- if (!url || !snapshotId) {
- console.error('Usage: on_Snapshot__55_headers.js --url=<url> --snapshot-id=<snapshot-id>');
- process.exit(1);
- }
-
- const startTs = new Date();
- let status = 'failed';
- let output = null;
- let error = '';
-
- try {
- const result = await extractHeaders(url);
-
- if (result.success) {
- status = 'succeeded';
- output = result.output;
- console.log(`Headers extracted (${result.method}): HTTP ${result.status}`);
- } else {
- status = 'failed';
- error = result.error;
- }
- } catch (e) {
- error = `${e.name}: ${e.message}`;
- status = 'failed';
- }
-
- const endTs = new Date();
-
- if (error) console.error(`ERROR: ${error}`);
-
- // Output clean JSONL (no RESULT_JSON= prefix)
- console.log(JSON.stringify({
- type: 'ArchiveResult',
- status,
- output_str: output || error || '',
- }));
-
- process.exit(status === 'succeeded' ? 0 : 1);
-}
-
-main().catch(e => {
- console.error(`Fatal error: ${e.message}`);
- process.exit(1);
-});
diff --git a/archivebox/plugins/headers/tests/test_headers.py b/archivebox/plugins/headers/tests/test_headers.py
index 0930737c..09ec86fb 100644
--- a/archivebox/plugins/headers/tests/test_headers.py
+++ b/archivebox/plugins/headers/tests/test_headers.py
@@ -7,23 +7,68 @@ Tests verify:
2. Node.js is available
3. Headers extraction works for real example.com
4. Output JSON contains actual HTTP headers
-5. HTTP fallback works correctly
-6. Config options work (TIMEOUT, USER_AGENT)
+5. Config options work (TIMEOUT, USER_AGENT)
"""
import json
import shutil
import subprocess
import tempfile
+import time
from pathlib import Path
import pytest
+from archivebox.plugins.chrome.tests.chrome_test_helpers import (
+ CHROME_NAVIGATE_HOOK,
+ get_test_env,
+ chrome_session,
+)
PLUGIN_DIR = Path(__file__).parent.parent
HEADERS_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_headers.*'), None)
TEST_URL = 'https://example.com'
+def normalize_root_url(url: str) -> str:
+ return url.rstrip('/')
+
+def run_headers_capture(headers_dir, snapshot_chrome_dir, env, url, snapshot_id):
+ hook_proc = subprocess.Popen(
+ ['node', str(HEADERS_HOOK), f'--url={url}', f'--snapshot-id={snapshot_id}'],
+ cwd=headers_dir,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ text=True,
+ env=env,
+ )
+
+ nav_result = subprocess.run(
+ ['node', str(CHROME_NAVIGATE_HOOK), f'--url={url}', f'--snapshot-id={snapshot_id}'],
+ cwd=snapshot_chrome_dir,
+ capture_output=True,
+ text=True,
+ timeout=120,
+ env=env,
+ )
+
+ headers_file = headers_dir / 'headers.json'
+ for _ in range(60):
+ if headers_file.exists() and headers_file.stat().st_size > 0:
+ break
+ time.sleep(1)
+
+ if hook_proc.poll() is None:
+ hook_proc.terminate()
+ try:
+ stdout, stderr = hook_proc.communicate(timeout=5)
+ except subprocess.TimeoutExpired:
+ hook_proc.kill()
+ stdout, stderr = hook_proc.communicate()
+ else:
+ stdout, stderr = hook_proc.communicate()
+
+ return hook_proc.returncode, stdout, stderr, nav_result, headers_file
+
def test_hook_script_exists():
"""Verify hook script exists."""
@@ -66,21 +111,25 @@ def test_extracts_headers_from_example_com():
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
- # Run headers extraction
- result = subprocess.run(
- ['node', str(HEADERS_HOOK), f'--url={TEST_URL}', '--snapshot-id=test789'],
- cwd=tmpdir,
- capture_output=True,
- text=True,
- timeout=60
- ,
- env=get_test_env())
+ with chrome_session(tmpdir, test_url=TEST_URL, navigate=False) as (_process, _pid, snapshot_chrome_dir, env):
+ headers_dir = snapshot_chrome_dir.parent / 'headers'
+ headers_dir.mkdir(exist_ok=True)
- assert result.returncode == 0, f"Extraction failed: {result.stderr}"
+ result = run_headers_capture(
+ headers_dir,
+ snapshot_chrome_dir,
+ env,
+ TEST_URL,
+ 'test789',
+ )
+
+ hook_code, stdout, stderr, nav_result, headers_file = result
+ assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}"
+ assert hook_code == 0, f"Extraction failed: {stderr}"
# Parse clean JSONL output
result_json = None
- for line in result.stdout.strip().split('\n'):
+ for line in stdout.strip().split('\n'):
line = line.strip()
if line.startswith('{'):
pass
@@ -96,28 +145,36 @@ def test_extracts_headers_from_example_com():
assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
# Verify output file exists (hook writes to current directory)
- headers_file = tmpdir / 'headers.json'
assert headers_file.exists(), "headers.json not created"
# Verify headers JSON contains REAL example.com response
headers_data = json.loads(headers_file.read_text())
assert 'url' in headers_data, "Should have url field"
- assert headers_data['url'] == TEST_URL, f"URL should be {TEST_URL}"
+ assert normalize_root_url(headers_data['url']) == normalize_root_url(TEST_URL), f"URL should be {TEST_URL}"
assert 'status' in headers_data, "Should have status field"
assert headers_data['status'] in [200, 301, 302], \
f"Should have valid HTTP status, got {headers_data['status']}"
+ assert 'request_headers' in headers_data, "Should have request_headers field"
+ assert isinstance(headers_data['request_headers'], dict), "Request headers should be a dict"
+
+ assert 'response_headers' in headers_data, "Should have response_headers field"
+ assert isinstance(headers_data['response_headers'], dict), "Response headers should be a dict"
+ assert len(headers_data['response_headers']) > 0, "Response headers dict should not be empty"
+
assert 'headers' in headers_data, "Should have headers field"
assert isinstance(headers_data['headers'], dict), "Headers should be a dict"
- assert len(headers_data['headers']) > 0, "Headers dict should not be empty"
# Verify common HTTP headers are present
- headers_lower = {k.lower(): v for k, v in headers_data['headers'].items()}
+ headers_lower = {k.lower(): v for k, v in headers_data['response_headers'].items()}
assert 'content-type' in headers_lower or 'content-length' in headers_lower, \
"Should have at least one common HTTP header"
+ assert headers_data['response_headers'].get(':status') == str(headers_data['status']), \
+ "Response headers should include :status pseudo header"
+
def test_headers_output_structure():
"""Test that headers plugin produces correctly structured output."""
@@ -128,21 +185,25 @@ def test_headers_output_structure():
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
- # Run headers extraction against real example.com
- result = subprocess.run(
- ['node', str(HEADERS_HOOK), f'--url={TEST_URL}', '--snapshot-id=testformat'],
- cwd=tmpdir,
- capture_output=True,
- text=True,
- timeout=60
- ,
- env=get_test_env())
+ with chrome_session(tmpdir, test_url=TEST_URL, navigate=False) as (_process, _pid, snapshot_chrome_dir, env):
+ headers_dir = snapshot_chrome_dir.parent / 'headers'
+ headers_dir.mkdir(exist_ok=True)
- assert result.returncode == 0, f"Extraction failed: {result.stderr}"
+ result = run_headers_capture(
+ headers_dir,
+ snapshot_chrome_dir,
+ env,
+ TEST_URL,
+ 'testformat',
+ )
+
+ hook_code, stdout, stderr, nav_result, headers_file = result
+ assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}"
+ assert hook_code == 0, f"Extraction failed: {stderr}"
# Parse clean JSONL output
result_json = None
- for line in result.stdout.strip().split('\n'):
+ for line in stdout.strip().split('\n'):
line = line.strip()
if line.startswith('{'):
pass
@@ -158,27 +219,30 @@ def test_headers_output_structure():
assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
# Verify output structure
- output_headers_file = tmpdir / 'headers.json'
- assert output_headers_file.exists(), "Output headers.json not created"
+ assert headers_file.exists(), "Output headers.json not created"
- output_data = json.loads(output_headers_file.read_text())
+ output_data = json.loads(headers_file.read_text())
# Verify all required fields are present
assert 'url' in output_data, "Output should have url field"
assert 'status' in output_data, "Output should have status field"
+ assert 'request_headers' in output_data, "Output should have request_headers field"
+ assert 'response_headers' in output_data, "Output should have response_headers field"
assert 'headers' in output_data, "Output should have headers field"
# Verify data types
assert isinstance(output_data['status'], int), "Status should be integer"
+ assert isinstance(output_data['request_headers'], dict), "Request headers should be dict"
+ assert isinstance(output_data['response_headers'], dict), "Response headers should be dict"
assert isinstance(output_data['headers'], dict), "Headers should be dict"
# Verify example.com returns expected headers
- assert output_data['url'] == TEST_URL
+ assert normalize_root_url(output_data['url']) == normalize_root_url(TEST_URL)
assert output_data['status'] in [200, 301, 302]
-def test_falls_back_to_http_when_chrome_unavailable():
- """Test that headers plugin falls back to HTTP HEAD when chrome unavailable."""
+def test_fails_without_chrome_session():
+ """Test that headers plugin fails when chrome session is missing."""
if not shutil.which('node'):
pass
@@ -186,8 +250,6 @@ def test_falls_back_to_http_when_chrome_unavailable():
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
- # Don't create chrome directory - force HTTP fallback
-
# Run headers extraction
result = subprocess.run(
['node', str(HEADERS_HOOK), f'--url={TEST_URL}', '--snapshot-id=testhttp'],
@@ -198,34 +260,8 @@ def test_falls_back_to_http_when_chrome_unavailable():
,
env=get_test_env())
- assert result.returncode == 0, f"Extraction failed: {result.stderr}"
-
- # Parse clean JSONL output
- result_json = None
- for line in result.stdout.strip().split('\n'):
- line = line.strip()
- if line.startswith('{'):
- pass
- try:
- record = json.loads(line)
- if record.get('type') == 'ArchiveResult':
- result_json = record
- break
- except json.JSONDecodeError:
- pass
-
- assert result_json, "Should have ArchiveResult JSONL output"
- assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
-
- # Verify output exists and has real HTTP headers
- output_headers_file = tmpdir / 'headers.json'
- assert output_headers_file.exists(), "Output headers.json not created"
-
- output_data = json.loads(output_headers_file.read_text())
- assert output_data['url'] == TEST_URL
- assert output_data['status'] in [200, 301, 302]
- assert isinstance(output_data['headers'], dict)
- assert len(output_data['headers']) > 0
+ assert result.returncode != 0, "Should fail without chrome session"
+ assert 'No Chrome session found (chrome plugin must run first)' in (result.stdout + result.stderr)
def test_config_timeout_honored():
@@ -239,20 +275,26 @@ def test_config_timeout_honored():
# Set very short timeout (but example.com should still succeed)
import os
- env = os.environ.copy()
- env['TIMEOUT'] = '5'
+ env_override = os.environ.copy()
+ env_override['TIMEOUT'] = '5'
- result = subprocess.run(
- ['node', str(HEADERS_HOOK), f'--url={TEST_URL}', '--snapshot-id=testtimeout'],
- cwd=tmpdir,
- capture_output=True,
- text=True,
- env=env,
- timeout=30
- )
+ with chrome_session(tmpdir, test_url=TEST_URL, navigate=False) as (_process, _pid, snapshot_chrome_dir, env):
+ headers_dir = snapshot_chrome_dir.parent / 'headers'
+ headers_dir.mkdir(exist_ok=True)
+ env.update(env_override)
+
+ result = run_headers_capture(
+ headers_dir,
+ snapshot_chrome_dir,
+ env,
+ TEST_URL,
+ 'testtimeout',
+ )
# Should complete (success or fail, but not hang)
- assert result.returncode in (0, 1), "Should complete without hanging"
+ hook_code, _stdout, _stderr, nav_result, _headers_file = result
+ assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}"
+ assert hook_code in (0, 1), "Should complete without hanging"
def test_config_user_agent():
@@ -266,23 +308,29 @@ def test_config_user_agent():
# Set custom user agent
import os
- env = os.environ.copy()
- env['USER_AGENT'] = 'TestBot/1.0'
+ env_override = os.environ.copy()
+ env_override['USER_AGENT'] = 'TestBot/1.0'
- result = subprocess.run(
- ['node', str(HEADERS_HOOK), f'--url={TEST_URL}', '--snapshot-id=testua'],
- cwd=tmpdir,
- capture_output=True,
- text=True,
- env=env,
- timeout=60
- )
+ with chrome_session(tmpdir, test_url=TEST_URL, navigate=False) as (_process, _pid, snapshot_chrome_dir, env):
+ headers_dir = snapshot_chrome_dir.parent / 'headers'
+ headers_dir.mkdir(exist_ok=True)
+ env.update(env_override)
+
+ result = run_headers_capture(
+ headers_dir,
+ snapshot_chrome_dir,
+ env,
+ TEST_URL,
+ 'testua',
+ )
# Should succeed (example.com doesn't block)
- if result.returncode == 0:
+ hook_code, stdout, _stderr, nav_result, _headers_file = result
+ assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}"
+ if hook_code == 0:
# Parse clean JSONL output
result_json = None
- for line in result.stdout.strip().split('\n'):
+ for line in stdout.strip().split('\n'):
line = line.strip()
if line.startswith('{'):
pass
@@ -307,20 +355,23 @@ def test_handles_https_urls():
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
- result = subprocess.run(
- ['node', str(HEADERS_HOOK), '--url=https://example.org', '--snapshot-id=testhttps'],
- cwd=tmpdir,
- capture_output=True,
- text=True,
- timeout=60
- ,
- env=get_test_env())
+ with chrome_session(tmpdir, test_url='https://example.org', navigate=False) as (_process, _pid, snapshot_chrome_dir, env):
+ headers_dir = snapshot_chrome_dir.parent / 'headers'
+ headers_dir.mkdir(exist_ok=True)
+ result = run_headers_capture(
+ headers_dir,
+ snapshot_chrome_dir,
+ env,
+ 'https://example.org',
+ 'testhttps',
+ )
- if result.returncode == 0:
- output_headers_file = tmpdir / 'headers.json'
- if output_headers_file.exists():
- output_data = json.loads(output_headers_file.read_text())
- assert output_data['url'] == 'https://example.org'
+ hook_code, _stdout, _stderr, nav_result, headers_file = result
+ assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}"
+ if hook_code == 0:
+ if headers_file.exists():
+ output_data = json.loads(headers_file.read_text())
+ assert normalize_root_url(output_data['url']) == normalize_root_url('https://example.org')
assert output_data['status'] in [200, 301, 302]
@@ -333,21 +384,24 @@ def test_handles_404_gracefully():
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
- result = subprocess.run(
- ['node', str(HEADERS_HOOK), '--url=https://example.com/nonexistent-page-404', '--snapshot-id=test404'],
- cwd=tmpdir,
- capture_output=True,
- text=True,
- timeout=60
- ,
- env=get_test_env())
+ with chrome_session(tmpdir, test_url='https://example.com/nonexistent-page-404', navigate=False) as (_process, _pid, snapshot_chrome_dir, env):
+ headers_dir = snapshot_chrome_dir.parent / 'headers'
+ headers_dir.mkdir(exist_ok=True)
+ result = run_headers_capture(
+ headers_dir,
+ snapshot_chrome_dir,
+ env,
+ 'https://example.com/nonexistent-page-404',
+ 'test404',
+ )
# May succeed or fail depending on server behavior
# If it succeeds, verify 404 status is captured
- if result.returncode == 0:
- output_headers_file = tmpdir / 'headers.json'
- if output_headers_file.exists():
- output_data = json.loads(output_headers_file.read_text())
+ hook_code, _stdout, _stderr, nav_result, headers_file = result
+ assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}"
+ if hook_code == 0:
+ if headers_file.exists():
+ output_data = json.loads(headers_file.read_text())
assert output_data['status'] == 404, "Should capture 404 status"
diff --git a/archivebox/plugins/infiniscroll/on_Snapshot__45_infiniscroll.js b/archivebox/plugins/infiniscroll/on_Snapshot__45_infiniscroll.js
index 3003d370..8275d61c 100755
--- a/archivebox/plugins/infiniscroll/on_Snapshot__45_infiniscroll.js
+++ b/archivebox/plugins/infiniscroll/on_Snapshot__45_infiniscroll.js
@@ -42,6 +42,7 @@ const puppeteer = require('puppeteer-core');
const PLUGIN_NAME = 'infiniscroll';
const CHROME_SESSION_DIR = '../chrome';
+const CHROME_SESSION_REQUIRED_ERROR = 'No Chrome session found (chrome plugin must run first)';
function parseArgs() {
const args = {};
@@ -330,7 +331,7 @@ async function main() {
const cdpUrl = getCdpUrl();
if (!cdpUrl) {
- console.error('ERROR: Chrome CDP URL not found (chrome plugin must run first)');
+ console.error(CHROME_SESSION_REQUIRED_ERROR);
process.exit(1);
}
@@ -363,10 +364,6 @@ async function main() {
page = pages[pages.length - 1];
}
- // Set viewport to ensure proper page rendering
- const resolution = getEnv('CHROME_RESOLUTION', '1440,2000').split(',').map(x => parseInt(x.trim(), 10));
- await page.setViewport({ width: resolution[0] || 1440, height: resolution[1] || 2000 });
-
console.error(`Starting infinite scroll on ${url}`);
// Expand and comments before scrolling (if enabled)
diff --git a/archivebox/plugins/infiniscroll/tests/test_infiniscroll.py b/archivebox/plugins/infiniscroll/tests/test_infiniscroll.py
index 1248518a..a2c1cb58 100644
--- a/archivebox/plugins/infiniscroll/tests/test_infiniscroll.py
+++ b/archivebox/plugins/infiniscroll/tests/test_infiniscroll.py
@@ -79,10 +79,12 @@ def test_fails_gracefully_without_chrome_session():
"""Test that hook fails gracefully when no chrome session exists."""
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
+ infiniscroll_dir = tmpdir / 'snapshot' / 'infiniscroll'
+ infiniscroll_dir.mkdir(parents=True, exist_ok=True)
result = subprocess.run(
['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=test-no-chrome'],
- cwd=tmpdir,
+ cwd=infiniscroll_dir,
capture_output=True,
text=True,
env=get_test_env(),
diff --git a/archivebox/plugins/istilldontcareaboutcookies/tests/test_istilldontcareaboutcookies.py b/archivebox/plugins/istilldontcareaboutcookies/tests/test_istilldontcareaboutcookies.py
index 7fdc1c4a..1371b5c7 100644
--- a/archivebox/plugins/istilldontcareaboutcookies/tests/test_istilldontcareaboutcookies.py
+++ b/archivebox/plugins/istilldontcareaboutcookies/tests/test_istilldontcareaboutcookies.py
@@ -16,6 +16,7 @@ import pytest
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
setup_test_env,
+ get_test_env,
launch_chromium_session,
kill_chromium_session,
CHROME_LAUNCH_HOOK,
@@ -291,8 +292,7 @@ const puppeteer = require('puppeteer-core');
result = subprocess.run(
['node', str(script_path)],
- cwd=str(tmpdir,
- env=get_test_env()),
+ cwd=str(tmpdir),
capture_output=True,
text=True,
env=env,
@@ -444,8 +444,7 @@ const puppeteer = require('puppeteer-core');
result = subprocess.run(
['node', str(script_path)],
- cwd=str(script_dir,
- env=get_test_env()),
+ cwd=str(script_dir),
capture_output=True,
text=True,
env=env,
@@ -539,7 +538,7 @@ def test_hides_cookie_consent_on_filmin():
print(f"HTML has cookie keywords: {baseline_result.get('has_cookie_keyword_in_html')}")
print(f"HTML snippet: {baseline_result.get('html_snippet', '')[:200]}")
- pytest.skip(
+ pytest.fail(
f"Cannot test extension: no cookie consent visible in baseline on {TEST_URL}. "
f"Elements found: {len(baseline_result['elements_found'])}. "
f"The site may have changed or cookie consent may be region-specific."
@@ -559,8 +558,7 @@ def test_hides_cookie_consent_on_filmin():
result = subprocess.run(
['node', str(INSTALL_SCRIPT)],
- cwd=str(tmpdir,
- env=get_test_env()),
+ cwd=str(tmpdir),
capture_output=True,
text=True,
env=env_with_ext,
diff --git a/archivebox/plugins/mercury/on_Snapshot__57_mercury.py b/archivebox/plugins/mercury/on_Snapshot__57_mercury.py
index b131c14c..1af0bdb6 100644
--- a/archivebox/plugins/mercury/on_Snapshot__57_mercury.py
+++ b/archivebox/plugins/mercury/on_Snapshot__57_mercury.py
@@ -15,11 +15,13 @@ Environment variables:
Note: Requires postlight-parser: npm install -g @postlight/parser
"""
+import html
import json
import os
import subprocess
import sys
from pathlib import Path
+from urllib.parse import urlparse
import rich_click as click
@@ -115,13 +117,39 @@ def extract_mercury(url: str, binary: str) -> tuple[bool, str | None, str]:
# Save HTML content and metadata
html_content = html_json.pop('content', '')
+ # Some sources return HTML-escaped markup inside the content blob.
+ # If it looks heavily escaped, unescape once so it renders properly.
+ if html_content:
+ escaped_count = html_content.count('&lt;') + html_content.count('&gt;')
+ tag_count = html_content.count('<')
+ if escaped_count and escaped_count > tag_count * 2:
+ html_content = html.unescape(html_content)
(output_dir / 'content.html').write_text(html_content, encoding='utf-8')
# Save article metadata
metadata = {k: v for k, v in text_json.items() if k != 'content'}
(output_dir / 'article.json').write_text(json.dumps(metadata, indent=2), encoding='utf-8')
- return True, OUTPUT_DIR, ''
+ # Link images/ to responses capture (if available)
+ try:
+ hostname = urlparse(url).hostname or ''
+ if hostname:
+ responses_images = (output_dir / '..' / 'responses' / 'image' / hostname / 'images').resolve()
+ link_path = output_dir / 'images'
+ if responses_images.exists() and responses_images.is_dir():
+ if link_path.exists() or link_path.is_symlink():
+ if link_path.is_symlink() or link_path.is_file():
+ link_path.unlink()
+ else:
+ # Don't remove real directories
+ responses_images = None
+ if responses_images:
+ rel_target = os.path.relpath(str(responses_images), str(output_dir))
+ link_path.symlink_to(rel_target)
+ except Exception:
+ pass
+
+ return True, 'content.html', ''
except subprocess.TimeoutExpired:
return False, None, f'Timed out after {timeout} seconds'
diff --git a/archivebox/plugins/merkletree/templates/icon.html b/archivebox/plugins/merkletree/templates/icon.html
deleted file mode 100644
index b8d3579c..00000000
--- a/archivebox/plugins/merkletree/templates/icon.html
+++ /dev/null
@@ -1 +0,0 @@
-
diff --git a/archivebox/plugins/modalcloser/on_Snapshot__15_modalcloser.bg.js b/archivebox/plugins/modalcloser/on_Snapshot__15_modalcloser.bg.js
index 38b2a604..7f9e664b 100644
--- a/archivebox/plugins/modalcloser/on_Snapshot__15_modalcloser.bg.js
+++ b/archivebox/plugins/modalcloser/on_Snapshot__15_modalcloser.bg.js
@@ -237,7 +237,7 @@ async function main() {
const cdpUrl = readCdpUrl(CHROME_SESSION_DIR);
if (!cdpUrl) {
- console.error('ERROR: Chrome CDP URL not found (chrome plugin must run first)');
+ console.error('No Chrome session found (chrome plugin must run first)');
process.exit(1);
}
diff --git a/archivebox/plugins/modalcloser/tests/test_modalcloser.py b/archivebox/plugins/modalcloser/tests/test_modalcloser.py
index b66d20d2..53c62479 100644
--- a/archivebox/plugins/modalcloser/tests/test_modalcloser.py
+++ b/archivebox/plugins/modalcloser/tests/test_modalcloser.py
@@ -81,10 +81,12 @@ def test_fails_gracefully_without_chrome_session():
"""Test that hook fails gracefully when no chrome session exists."""
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
+ modalcloser_dir = tmpdir / 'snapshot' / 'modalcloser'
+ modalcloser_dir.mkdir(parents=True, exist_ok=True)
result = subprocess.run(
['node', str(MODALCLOSER_HOOK), f'--url={TEST_URL}', '--snapshot-id=test-no-chrome'],
- cwd=tmpdir,
+ cwd=modalcloser_dir,
capture_output=True,
text=True,
env=get_test_env(),
diff --git a/archivebox/plugins/npm/tests/test_npm_provider.py b/archivebox/plugins/npm/tests/test_npm_provider.py
index 5492738a..9f00d9d7 100644
--- a/archivebox/plugins/npm/tests/test_npm_provider.py
+++ b/archivebox/plugins/npm/tests/test_npm_provider.py
@@ -91,9 +91,9 @@ class TestNpmProviderHook(TestCase):
self.assertIn('npm provider not allowed', result.stderr)
self.assertEqual(result.returncode, 0)
- @pytest.mark.skipif(not npm_available(), reason="npm not installed")
def test_hook_creates_npm_prefix(self):
"""Hook should create npm prefix directory."""
+ assert npm_available(), "npm not installed"
env = os.environ.copy()
env['LIB_DIR'] = str(self.lib_dir)
diff --git a/archivebox/plugins/parse_dom_outlinks/on_Snapshot__75_parse_dom_outlinks.js b/archivebox/plugins/parse_dom_outlinks/on_Snapshot__75_parse_dom_outlinks.js
index e900d9b5..3076fe61 100755
--- a/archivebox/plugins/parse_dom_outlinks/on_Snapshot__75_parse_dom_outlinks.js
+++ b/archivebox/plugins/parse_dom_outlinks/on_Snapshot__75_parse_dom_outlinks.js
@@ -81,7 +81,7 @@ function getCdpUrl() {
}
// Extract outlinks
-async function extractOutlinks(url) {
+async function extractOutlinks(url, snapshotId, crawlId, depth) {
// Output directory is current directory (hook already runs in output dir)
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
@@ -253,7 +253,7 @@ async function main() {
}
}
- const result = await extractOutlinks(url);
+ const result = await extractOutlinks(url, snapshotId, crawlId, depth);
if (result.success) {
status = 'succeeded';
diff --git a/archivebox/plugins/parse_dom_outlinks/tests/test_parse_dom_outlinks.py b/archivebox/plugins/parse_dom_outlinks/tests/test_parse_dom_outlinks.py
index cf6df8ed..6f45eb4b 100644
--- a/archivebox/plugins/parse_dom_outlinks/tests/test_parse_dom_outlinks.py
+++ b/archivebox/plugins/parse_dom_outlinks/tests/test_parse_dom_outlinks.py
@@ -47,7 +47,6 @@ class TestParseDomOutlinksPlugin(TestCase):
self.assertTrue(OUTLINKS_HOOK.exists(), f"Hook not found: {OUTLINKS_HOOK}")
-@pytest.mark.skipif(not chrome_available(), reason="Chrome not installed")
class TestParseDomOutlinksWithChrome(TestCase):
"""Integration tests for parse_dom_outlinks plugin with Chrome."""
@@ -112,9 +111,7 @@ class TestParseDomOutlinksWithChrome(TestCase):
# example.com has at least one link (to iana.org)
self.assertIsInstance(outlinks_data['hrefs'], list)
- except RuntimeError as e:
- if 'Chrome' in str(e) or 'CDP' in str(e):
- self.skipTest(f"Chrome session setup failed: {e}")
+ except RuntimeError:
raise
diff --git a/archivebox/plugins/pdf/on_Snapshot__52_pdf.js b/archivebox/plugins/pdf/on_Snapshot__52_pdf.js
index 05648a81..d46a3779 100644
--- a/archivebox/plugins/pdf/on_Snapshot__52_pdf.js
+++ b/archivebox/plugins/pdf/on_Snapshot__52_pdf.js
@@ -2,19 +2,12 @@
/**
* Print a URL to PDF using Chrome/Puppeteer.
*
- * If a Chrome session exists (from chrome plugin), connects to it via CDP.
- * Otherwise launches a new Chrome instance.
+ * Requires a Chrome session (from chrome plugin) and connects to it via CDP.
*
 * Usage: on_Snapshot__52_pdf.js --url=<url> --snapshot-id=<snapshot-id>
* Output: Writes pdf/output.pdf
*
* Environment variables:
- * CHROME_BINARY: Path to Chrome/Chromium binary
- * CHROME_TIMEOUT: Timeout in seconds (default: 60)
- * CHROME_RESOLUTION: Page resolution (default: 1440,2000)
- * CHROME_USER_AGENT: User agent string (optional)
- * CHROME_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: true)
- * CHROME_HEADLESS: Run in headless mode (default: true)
* PDF_ENABLED: Enable PDF generation (default: true)
*/
@@ -24,11 +17,7 @@ const path = require('path');
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const {
- findChromium,
- getEnv,
getEnvBool,
- getEnvInt,
- parseResolution,
parseArgs,
readCdpUrl,
} = require('../chrome/chrome_utils.js');
@@ -86,81 +75,30 @@ async function waitForChromeTabLoaded(timeoutMs = 60000) {
}
async function printToPdf(url) {
- const timeout = (getEnvInt('CHROME_TIMEOUT') || getEnvInt('TIMEOUT', 60)) * 1000;
- const resolution = getEnv('CHROME_RESOLUTION') || getEnv('RESOLUTION', '1440,2000');
- const userAgent = getEnv('CHROME_USER_AGENT') || getEnv('USER_AGENT', '');
- const checkSsl = getEnvBool('CHROME_CHECK_SSL_VALIDITY', getEnvBool('CHECK_SSL_VALIDITY', true));
- const headless = getEnvBool('CHROME_HEADLESS', true);
-
- const { width, height } = parseResolution(resolution);
-
// Output directory is current directory (hook already runs in output dir)
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
let browser = null;
let page = null;
- let connectedToSession = false;
try {
- // Try to connect to existing Chrome session
+ // Connect to existing Chrome session (required)
const cdpUrl = readCdpUrl(CHROME_SESSION_DIR);
- if (cdpUrl) {
- try {
- browser = await puppeteer.connect({
- browserWSEndpoint: cdpUrl,
- defaultViewport: { width, height },
- });
- connectedToSession = true;
-
- // Get existing pages or create new one
- const pages = await browser.pages();
- page = pages.find(p => p.url().startsWith('http')) || pages[0];
-
- if (!page) {
- page = await browser.newPage();
- }
-
- // Set viewport on the page
- await page.setViewport({ width, height });
-
- } catch (e) {
- console.error(`Failed to connect to CDP session: ${e.message}`);
- browser = null;
- }
+ if (!cdpUrl) {
+ return { success: false, error: 'No Chrome session found (chrome plugin must run first)' };
}
- // Fall back to launching new browser
- if (!browser) {
- const executablePath = findChromium();
- if (!executablePath) {
- return { success: false, error: 'Chrome binary not found' };
- }
+ browser = await puppeteer.connect({
+ browserWSEndpoint: cdpUrl,
+ defaultViewport: null,
+ });
- browser = await puppeteer.launch({
- executablePath,
- headless: headless ? 'new' : false,
- args: [
- '--no-sandbox',
- '--disable-setuid-sandbox',
- '--disable-dev-shm-usage',
- '--disable-gpu',
- `--window-size=${width},${height}`,
- ...(checkSsl ? [] : ['--ignore-certificate-errors']),
- ],
- defaultViewport: { width, height },
- });
+ // Get existing pages or create new one
+ const pages = await browser.pages();
+ page = pages.find(p => p.url().startsWith('http')) || pages[0];
+ if (!page) {
page = await browser.newPage();
-
- // Navigate to URL (only if we launched fresh browser)
- if (userAgent) {
- await page.setUserAgent(userAgent);
- }
-
- await page.goto(url, {
- waitUntil: 'networkidle2',
- timeout,
- });
}
// Print to PDF
@@ -185,9 +123,8 @@ async function printToPdf(url) {
} catch (e) {
return { success: false, error: `${e.name}: ${e.message}` };
} finally {
- // Only close browser if we launched it (not if we connected to session)
- if (browser && !connectedToSession) {
- await browser.close();
+ if (browser) {
+ browser.disconnect();
}
}
}
@@ -215,14 +152,15 @@ async function main() {
process.exit(0);
}
- // Only wait for page load if using shared Chrome session
const cdpUrl = readCdpUrl(CHROME_SESSION_DIR);
- if (cdpUrl) {
- // Wait for page to be fully loaded
- const pageLoaded = await waitForChromeTabLoaded(60000);
- if (!pageLoaded) {
- throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
- }
+ if (!cdpUrl) {
+ throw new Error('No Chrome session found (chrome plugin must run first)');
+ }
+
+ // Wait for page to be fully loaded
+ const pageLoaded = await waitForChromeTabLoaded(60000);
+ if (!pageLoaded) {
+ throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
}
const result = await printToPdf(url);
diff --git a/archivebox/plugins/pdf/tests/test_pdf.py b/archivebox/plugins/pdf/tests/test_pdf.py
index 8751faef..f9388129 100644
--- a/archivebox/plugins/pdf/tests/test_pdf.py
+++ b/archivebox/plugins/pdf/tests/test_pdf.py
@@ -29,6 +29,7 @@ from archivebox.plugins.chrome.tests.chrome_test_helpers import (
LIB_DIR,
NODE_MODULES_DIR,
PLUGINS_ROOT,
+ chrome_session,
)
@@ -62,15 +63,19 @@ def test_extracts_pdf_from_example_com():
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
- # Run PDF extraction hook
- result = subprocess.run(
- ['node', str(PDF_HOOK), f'--url={TEST_URL}', '--snapshot-id=test789'],
- cwd=tmpdir,
- capture_output=True,
- text=True,
- timeout=120
- ,
- env=get_test_env())
+ with chrome_session(tmpdir, test_url=TEST_URL) as (_process, _pid, snapshot_chrome_dir, env):
+ pdf_dir = snapshot_chrome_dir.parent / 'pdf'
+ pdf_dir.mkdir(exist_ok=True)
+
+ # Run PDF extraction hook
+ result = subprocess.run(
+ ['node', str(PDF_HOOK), f'--url={TEST_URL}', '--snapshot-id=test789'],
+ cwd=pdf_dir,
+ capture_output=True,
+ text=True,
+ timeout=120,
+ env=env
+ )
# Parse clean JSONL output (hook might fail due to network issues)
result_json = None
@@ -98,7 +103,7 @@ def test_extracts_pdf_from_example_com():
assert result.returncode == 0, f"Should exit 0 on success: {result.stderr}"
# Verify filesystem output (hook writes to current directory)
- pdf_file = tmpdir / 'output.pdf'
+ pdf_file = pdf_dir / 'output.pdf'
assert pdf_file.exists(), "output.pdf not created"
# Verify file is valid PDF
@@ -117,7 +122,7 @@ def test_config_save_pdf_false_skips():
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
- env = os.environ.copy()
+ env = get_test_env()
env['PDF_ENABLED'] = 'False'
result = subprocess.run(
@@ -140,50 +145,46 @@ def test_config_save_pdf_false_skips():
def test_reports_missing_chrome():
- """Test that script reports error when Chrome is not found."""
+ """Test that script reports error when Chrome session is missing."""
import os
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
-
- # Set CHROME_BINARY to nonexistent path
- env = os.environ.copy()
- env['CHROME_BINARY'] = '/nonexistent/chrome'
+ env = get_test_env()
+ pdf_dir = tmpdir / 'snapshot' / 'pdf'
+ pdf_dir.mkdir(parents=True, exist_ok=True)
result = subprocess.run(
['node', str(PDF_HOOK), f'--url={TEST_URL}', '--snapshot-id=test123'],
- cwd=tmpdir,
+ cwd=pdf_dir,
capture_output=True,
text=True,
env=env,
timeout=30
)
- # Should fail and report missing Chrome
- if result.returncode != 0:
- combined = result.stdout + result.stderr
- assert 'chrome' in combined.lower() or 'browser' in combined.lower() or 'ERROR=' in combined
+ assert result.returncode != 0, "Should fail without shared Chrome session"
+ combined = result.stdout + result.stderr
+ assert 'chrome session' in combined.lower() or 'chrome plugin' in combined.lower()
-def test_config_timeout_honored():
- """Test that CHROME_TIMEOUT config is respected."""
- import os
-
+def test_runs_with_shared_chrome_session():
+ """Test that PDF hook completes when shared Chrome session is available."""
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
- # Set very short timeout
- env = os.environ.copy()
- env['CHROME_TIMEOUT'] = '5'
+ with chrome_session(tmpdir, test_url=TEST_URL) as (_process, _pid, snapshot_chrome_dir, env):
+ pdf_dir = snapshot_chrome_dir.parent / 'pdf'
+ pdf_dir.mkdir(exist_ok=True)
- result = subprocess.run(
- ['node', str(PDF_HOOK), f'--url={TEST_URL}', '--snapshot-id=testtimeout'],
- cwd=tmpdir,
- capture_output=True,
- text=True,
- env=env,
- timeout=30
- )
+ result = subprocess.run(
+ ['node', str(PDF_HOOK), f'--url={TEST_URL}', '--snapshot-id=testtimeout'],
+ cwd=pdf_dir,
+ capture_output=True,
+ text=True,
+ env=env,
+ timeout=30
+ )
# Should complete (success or fail, but not hang)
assert result.returncode in (0, 1), "Should complete without hanging"
diff --git a/archivebox/plugins/pip/tests/test_pip_provider.py b/archivebox/plugins/pip/tests/test_pip_provider.py
index 4a4fe610..d24c7e64 100644
--- a/archivebox/plugins/pip/tests/test_pip_provider.py
+++ b/archivebox/plugins/pip/tests/test_pip_provider.py
@@ -142,13 +142,14 @@ class TestPipProviderIntegration(TestCase):
import shutil
shutil.rmtree(self.temp_dir, ignore_errors=True)
- @pytest.mark.skipif(
- subprocess.run([sys.executable, '-m', 'pip', '--version'],
- capture_output=True).returncode != 0,
- reason="pip not available"
- )
def test_hook_finds_pip_installed_binary(self):
"""Hook should find binaries installed via pip."""
+ pip_check = subprocess.run(
+ [sys.executable, '-m', 'pip', '--version'],
+ capture_output=True,
+ text=True,
+ )
+ assert pip_check.returncode == 0, "pip not available"
env = os.environ.copy()
env['DATA_DIR'] = self.temp_dir
diff --git a/archivebox/plugins/puppeteer/tests/test_puppeteer.py b/archivebox/plugins/puppeteer/tests/test_puppeteer.py
index 5d230a7d..a35db7a1 100644
--- a/archivebox/plugins/puppeteer/tests/test_puppeteer.py
+++ b/archivebox/plugins/puppeteer/tests/test_puppeteer.py
@@ -46,8 +46,8 @@ def test_crawl_hook_emits_puppeteer_binary():
assert 'npm' in binaries[0].get('binproviders', ''), "puppeteer should be installable via npm provider"
-@pytest.mark.skipif(shutil.which('npm') is None, reason='npm is required for puppeteer installation')
def test_puppeteer_installs_chromium():
+ assert shutil.which('npm'), "npm is required for puppeteer installation"
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
lib_dir = tmpdir / 'lib' / 'arm64-darwin'
diff --git a/archivebox/plugins/readability/on_Snapshot__56_readability.py b/archivebox/plugins/readability/on_Snapshot__56_readability.py
index e02e24e6..2c083fb6 100644
--- a/archivebox/plugins/readability/on_Snapshot__56_readability.py
+++ b/archivebox/plugins/readability/on_Snapshot__56_readability.py
@@ -22,6 +22,7 @@ import subprocess
import sys
import tempfile
from pathlib import Path
+from urllib.parse import urlparse
import rich_click as click
@@ -135,6 +136,24 @@ def extract_readability(url: str, binary: str) -> tuple[bool, str | None, str]:
(output_dir / 'content.txt').write_text(text_content, encoding='utf-8')
(output_dir / 'article.json').write_text(json.dumps(result_json, indent=2), encoding='utf-8')
+ # Link images/ to responses capture (if available)
+ try:
+ hostname = urlparse(url).hostname or ''
+ if hostname:
+ responses_images = (output_dir / '..' / 'responses' / 'image' / hostname / 'images').resolve()
+ link_path = output_dir / 'images'
+ if responses_images.exists() and responses_images.is_dir():
+ if link_path.exists() or link_path.is_symlink():
+ if link_path.is_symlink() or link_path.is_file():
+ link_path.unlink()
+ else:
+ responses_images = None
+ if responses_images:
+ rel_target = os.path.relpath(str(responses_images), str(output_dir))
+ link_path.symlink_to(rel_target)
+ except Exception:
+ pass
+
return True, OUTPUT_FILE, ''
except subprocess.TimeoutExpired:
diff --git a/archivebox/plugins/redirects/on_Snapshot__25_redirects.bg.js b/archivebox/plugins/redirects/on_Snapshot__25_redirects.bg.js
index 66aac407..96defe1b 100755
--- a/archivebox/plugins/redirects/on_Snapshot__25_redirects.bg.js
+++ b/archivebox/plugins/redirects/on_Snapshot__25_redirects.bg.js
@@ -38,6 +38,7 @@ let originalUrl = '';
let finalUrl = '';
let page = null;
let browser = null;
+let initialRecorded = false;
async function setupRedirectListener() {
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
@@ -62,6 +63,20 @@ async function setupRedirectListener() {
client.on('Network.requestWillBeSent', (params) => {
const { requestId, request, redirectResponse } = params;
+ if (!initialRecorded && request.url && request.url.startsWith('http')) {
+ const initialEntry = {
+ timestamp: new Date().toISOString(),
+ from_url: null,
+ to_url: request.url,
+ status: null,
+ type: 'initial',
+ request_id: requestId,
+ };
+ redirectChain.push(initialEntry);
+ fs.appendFileSync(outputPath, JSON.stringify(initialEntry) + '\n');
+ initialRecorded = true;
+ }
+
if (redirectResponse) {
// This is a redirect
const redirectEntry = {
diff --git a/archivebox/plugins/redirects/tests/test_redirects.py b/archivebox/plugins/redirects/tests/test_redirects.py
index 452c5dd6..c26ac273 100644
--- a/archivebox/plugins/redirects/tests/test_redirects.py
+++ b/archivebox/plugins/redirects/tests/test_redirects.py
@@ -48,7 +48,6 @@ class TestRedirectsPlugin(TestCase):
self.assertTrue(REDIRECTS_HOOK.exists(), f"Hook not found: {REDIRECTS_HOOK}")
-@pytest.mark.skipif(not chrome_available(), reason="Chrome not installed")
class TestRedirectsWithChrome(TestCase):
"""Integration tests for redirects plugin with Chrome."""
@@ -142,9 +141,7 @@ class TestRedirectsWithChrome(TestCase):
self.assertNotIn('Traceback', stderr)
self.assertNotIn('Error:', stderr)
- except RuntimeError as e:
- if 'Chrome' in str(e) or 'CDP' in str(e):
- self.skipTest(f"Chrome session setup failed: {e}")
+ except RuntimeError:
raise
diff --git a/archivebox/plugins/responses/on_Snapshot__24_responses.bg.js b/archivebox/plugins/responses/on_Snapshot__24_responses.bg.js
index c7dd6491..7f4587c1 100755
--- a/archivebox/plugins/responses/on_Snapshot__24_responses.bg.js
+++ b/archivebox/plugins/responses/on_Snapshot__24_responses.bg.js
@@ -39,7 +39,7 @@ let responseCount = 0;
let shuttingDown = false;
// Resource types to capture (by default, capture everything)
-const DEFAULT_TYPES = ['script', 'stylesheet', 'font', 'image', 'media', 'xhr', 'websocket'];
+const DEFAULT_TYPES = ['document', 'script', 'stylesheet', 'font', 'image', 'media', 'xhr', 'websocket'];
function getExtensionFromMimeType(mimeType) {
const mimeMap = {
@@ -176,11 +176,17 @@ async function setupListener() {
const hostname = urlObj.hostname;
const pathname = urlObj.pathname || '/';
const filename = path.basename(pathname) || 'index' + (extension ? '.' + extension : '');
- const dirPath = path.dirname(pathname);
+ const dirPathRaw = path.dirname(pathname);
+ const dirPath = dirPathRaw === '.' ? '' : dirPathRaw.replace(/^\/+/, '');
const symlinkDir = path.join(OUTPUT_DIR, resourceType, hostname, dirPath);
const symlinkPath = path.join(symlinkDir, filename);
await createSymlink(uniquePath, symlinkPath);
+
+ // Also create a site-style symlink without resource type for easy browsing
+ const siteDir = path.join(OUTPUT_DIR, hostname, dirPath);
+ const sitePath = path.join(siteDir, filename);
+ await createSymlink(uniquePath, sitePath);
} catch (e) {
// URL parsing or symlink creation failed, skip
}
diff --git a/archivebox/plugins/responses/tests/test_responses.py b/archivebox/plugins/responses/tests/test_responses.py
index 82a5fa77..b6404dcd 100644
--- a/archivebox/plugins/responses/tests/test_responses.py
+++ b/archivebox/plugins/responses/tests/test_responses.py
@@ -13,27 +13,18 @@ import tempfile
import time
from pathlib import Path
-import pytest
from django.test import TestCase
# Import chrome test helpers
sys.path.insert(0, str(Path(__file__).parent.parent.parent / 'chrome' / 'tests'))
from chrome_test_helpers import (
chrome_session,
- get_test_env,
+ CHROME_NAVIGATE_HOOK,
get_plugin_dir,
get_hook_script,
)
-def chrome_available() -> bool:
- """Check if Chrome/Chromium is available."""
- for name in ['chromium', 'chromium-browser', 'google-chrome', 'chrome']:
- if shutil.which(name):
- return True
- return False
-
-
# Get the path to the responses hook
PLUGIN_DIR = get_plugin_dir(__file__)
RESPONSES_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_responses.*')
@@ -48,7 +39,6 @@ class TestResponsesPlugin(TestCase):
self.assertTrue(RESPONSES_HOOK.exists(), f"Hook not found: {RESPONSES_HOOK}")
-@pytest.mark.skipif(not chrome_available(), reason="Chrome not installed")
class TestResponsesWithChrome(TestCase):
"""Integration tests for responses plugin with Chrome."""
@@ -65,68 +55,72 @@ class TestResponsesWithChrome(TestCase):
test_url = 'https://example.com'
snapshot_id = 'test-responses-snapshot'
- try:
- with chrome_session(
- self.temp_dir,
- crawl_id='test-responses-crawl',
- snapshot_id=snapshot_id,
- test_url=test_url,
- navigate=True,
- timeout=30,
- ) as (chrome_process, chrome_pid, snapshot_chrome_dir, env):
- # Use the environment from chrome_session (already has CHROME_HEADLESS=true)
+ with chrome_session(
+ self.temp_dir,
+ crawl_id='test-responses-crawl',
+ snapshot_id=snapshot_id,
+ test_url=test_url,
+ navigate=False,
+ timeout=30,
+ ) as (chrome_process, chrome_pid, snapshot_chrome_dir, env):
+ responses_dir = snapshot_chrome_dir.parent / 'responses'
+ responses_dir.mkdir(exist_ok=True)
+ # Run responses hook with the active Chrome session (background hook)
+ result = subprocess.Popen(
+ ['node', str(RESPONSES_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
+ cwd=str(responses_dir),
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ text=True,
+ env=env
+ )
- # Run responses hook with the active Chrome session (background hook)
- result = subprocess.Popen(
- ['node', str(RESPONSES_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
- cwd=str(snapshot_chrome_dir),
- stdout=subprocess.PIPE,
- stderr=subprocess.PIPE,
- text=True,
- env=env
- )
+ nav_result = subprocess.run(
+ ['node', str(CHROME_NAVIGATE_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
+ cwd=str(snapshot_chrome_dir),
+ capture_output=True,
+ text=True,
+ timeout=120,
+ env=env
+ )
+ self.assertEqual(nav_result.returncode, 0, f"Navigation failed: {nav_result.stderr}")
- # Check for output directory and index file
- index_output = snapshot_chrome_dir / 'index.jsonl'
+ # Check for output directory and index file
+ index_output = responses_dir / 'index.jsonl'
- # Wait briefly for background hook to write output
- for _ in range(10):
- if index_output.exists() and index_output.stat().st_size > 0:
- break
- time.sleep(1)
+ # Wait briefly for background hook to write output
+ for _ in range(30):
+ if index_output.exists() and index_output.stat().st_size > 0:
+ break
+ time.sleep(1)
- # Verify hook ran (may keep running waiting for cleanup signal)
- if result.poll() is None:
- result.terminate()
- try:
- stdout, stderr = result.communicate(timeout=5)
- except subprocess.TimeoutExpired:
- result.kill()
- stdout, stderr = result.communicate()
- else:
+ # Verify hook ran (may keep running waiting for cleanup signal)
+ if result.poll() is None:
+ result.terminate()
+ try:
+ stdout, stderr = result.communicate(timeout=5)
+ except subprocess.TimeoutExpired:
+ result.kill()
stdout, stderr = result.communicate()
- self.assertNotIn('Traceback', stderr)
+ else:
+ stdout, stderr = result.communicate()
+ self.assertNotIn('Traceback', stderr)
- # If index file exists, verify it's valid JSONL
- if index_output.exists():
- with open(index_output) as f:
- content = f.read().strip()
- if content:
- for line in content.split('\n'):
- if line.strip():
- try:
- record = json.loads(line)
- # Verify structure
- self.assertIn('url', record)
- self.assertIn('resourceType', record)
- except json.JSONDecodeError:
- pass # Some lines may be incomplete
-
- except RuntimeError as e:
- if 'Chrome' in str(e) or 'CDP' in str(e):
- self.skipTest(f"Chrome session setup failed: {e}")
- raise
+ # If index file exists, verify it's valid JSONL
+ if index_output.exists():
+ with open(index_output) as f:
+ content = f.read().strip()
+ self.assertTrue(content, "Responses output should not be empty")
+ for line in content.split('\n'):
+ if line.strip():
+ try:
+ record = json.loads(line)
+ # Verify structure
+ self.assertIn('url', record)
+ self.assertIn('resourceType', record)
+ except json.JSONDecodeError:
+ pass # Some lines may be incomplete
if __name__ == '__main__':
diff --git a/archivebox/plugins/screenshot/on_Snapshot__51_screenshot.js b/archivebox/plugins/screenshot/on_Snapshot__51_screenshot.js
index 76390846..34cd7a44 100644
--- a/archivebox/plugins/screenshot/on_Snapshot__51_screenshot.js
+++ b/archivebox/plugins/screenshot/on_Snapshot__51_screenshot.js
@@ -9,7 +9,6 @@
* Output: Writes screenshot/screenshot.png
*
* Environment variables:
- * CHROME_RESOLUTION: Screenshot resolution (default: 1440,2000)
* SCREENSHOT_ENABLED: Enable screenshot capture (default: true)
*/
@@ -34,9 +33,10 @@ function flushCoverageAndExit(exitCode) {
const {
getEnv,
getEnvBool,
- parseResolution,
parseArgs,
- readCdpUrl,
+ connectToPage,
+ waitForPageLoaded,
+ readTargetId,
} = require('../chrome/chrome_utils.js');
// Check if screenshot is enabled BEFORE requiring puppeteer
@@ -75,77 +75,58 @@ function hasStaticFileOutput() {
return false;
}
-// Wait for chrome tab to be fully loaded
-async function waitForChromeTabLoaded(timeoutMs = 10000) {
- const navigationFile = path.join(CHROME_SESSION_DIR, 'navigation.json');
- const startTime = Date.now();
-
- while (Date.now() - startTime < timeoutMs) {
- if (fs.existsSync(navigationFile)) {
- return true;
- }
- // Wait 100ms before checking again
- await new Promise(resolve => setTimeout(resolve, 100));
- }
-
- return false;
-}
-
async function takeScreenshot(url) {
- const resolution = getEnv('CHROME_RESOLUTION', '1440,2000');
- const { width, height } = parseResolution(resolution);
-
// Output directory is current directory (hook already runs in output dir)
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
// Wait for chrome_navigate to complete (writes navigation.json)
const timeoutSeconds = parseInt(getEnv('SCREENSHOT_TIMEOUT', '10'), 10);
const timeoutMs = timeoutSeconds * 1000;
- const pageLoaded = await waitForChromeTabLoaded(timeoutMs);
- if (!pageLoaded) {
- throw new Error(`Page not loaded after ${timeoutSeconds}s (chrome_navigate must complete first)`);
+ const navigationFile = path.join(CHROME_SESSION_DIR, 'navigation.json');
+ if (!fs.existsSync(navigationFile)) {
+ await waitForPageLoaded(CHROME_SESSION_DIR, timeoutMs);
}
- // Connect to existing Chrome session (required - no fallback)
- const cdpUrl = readCdpUrl(CHROME_SESSION_DIR);
- if (!cdpUrl) {
+ const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
+ const targetFile = path.join(CHROME_SESSION_DIR, 'target_id.txt');
+ if (!fs.existsSync(cdpFile)) {
throw new Error('No Chrome session found (chrome plugin must run first)');
}
-
- // Read target_id.txt to get the specific tab for this snapshot
- const targetIdFile = path.join(CHROME_SESSION_DIR, 'target_id.txt');
- if (!fs.existsSync(targetIdFile)) {
+ if (!fs.existsSync(targetFile)) {
throw new Error('No target_id.txt found (chrome_tab must run first)');
}
- const targetId = fs.readFileSync(targetIdFile, 'utf8').trim();
+ const cdpUrl = fs.readFileSync(cdpFile, 'utf8').trim();
+ if (!cdpUrl.startsWith('ws://') && !cdpUrl.startsWith('wss://')) {
+ throw new Error('Invalid CDP URL in cdp_url.txt');
+ }
- const browser = await puppeteer.connect({
- browserWSEndpoint: cdpUrl,
- defaultViewport: { width, height },
+ const { browser, page } = await connectToPage({
+ chromeSessionDir: CHROME_SESSION_DIR,
+ timeoutMs,
+ puppeteer,
});
try {
- // Get the specific page for this snapshot by target ID
- const targets = await browser.targets();
- const target = targets.find(t => t._targetId === targetId);
- if (!target) {
- throw new Error(`Target ${targetId} not found in Chrome session`);
+ const expectedTargetId = readTargetId(CHROME_SESSION_DIR);
+ if (!expectedTargetId) {
+ throw new Error('No target_id.txt found (chrome_tab must run first)');
+ }
+ const actualTargetId = page.target()._targetId;
+ if (actualTargetId !== expectedTargetId) {
+ throw new Error(`Target ${expectedTargetId} not found in Chrome session`);
}
- const page = await target.page();
- if (!page) {
- throw new Error(`Could not get page for target ${targetId}`);
- }
-
- // Set viewport on the page
- await page.setViewport({ width, height });
-
- // Take screenshot (Puppeteer throws on failure)
- await page.screenshot({
- path: outputPath,
- fullPage: true,
+ const captureTimeoutMs = Math.max(timeoutMs, 10000);
+ const timeoutPromise = new Promise((_, reject) => {
+ setTimeout(() => reject(new Error('Screenshot capture timed out')), captureTimeoutMs);
});
+ await page.bringToFront();
+ await Promise.race([
+ page.screenshot({ path: outputPath, fullPage: true }),
+ timeoutPromise,
+ ]);
+
return outputPath;
} finally {
@@ -188,6 +169,7 @@ async function main() {
status: 'succeeded',
output_str: outputPath,
}));
+ flushCoverageAndExit(0);
}
main().catch(e => {
diff --git a/archivebox/plugins/screenshot/templates/card.html b/archivebox/plugins/screenshot/templates/card.html
index 5d49374d..83cc2adc 100644
--- a/archivebox/plugins/screenshot/templates/card.html
+++ b/archivebox/plugins/screenshot/templates/card.html
@@ -2,7 +2,7 @@
diff --git a/archivebox/plugins/screenshot/templates/full.html b/archivebox/plugins/screenshot/templates/full.html
index b5f8901a..62226828 100644
--- a/archivebox/plugins/screenshot/templates/full.html
+++ b/archivebox/plugins/screenshot/templates/full.html
@@ -1,8 +1,7 @@
-
-
+
+
+ style="width: auto; max-width: 100%; height: auto; display: block;">
diff --git a/archivebox/plugins/screenshot/tests/test_screenshot.py b/archivebox/plugins/screenshot/tests/test_screenshot.py
index 9627ec02..ddc466d3 100644
--- a/archivebox/plugins/screenshot/tests/test_screenshot.py
+++ b/archivebox/plugins/screenshot/tests/test_screenshot.py
@@ -112,27 +112,7 @@ def test_screenshot_with_chrome_session():
assert screenshot_file.exists() and screenshot_file.stat().st_size > 1000
assert screenshot_file.read_bytes()[:8] == b'\x89PNG\r\n\x1a\n'
- # Scenario 2: Custom resolution
- screenshot_dir2 = snapshot_chrome_dir.parent / 'screenshot2'
- screenshot_dir2.mkdir()
- env['CHROME_RESOLUTION'] = '800,600'
-
- result = subprocess.run(
- ['node', str(SCREENSHOT_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
- cwd=str(screenshot_dir2),
- capture_output=True,
- text=True,
- timeout=30,
- env=env
- )
-
- assert result.returncode == 0
- screenshot_file2 = screenshot_dir2 / 'screenshot.png'
- assert screenshot_file2.exists()
- file_size = screenshot_file2.stat().st_size
- assert 500 < file_size < 100000, f"800x600 screenshot size unexpected: {file_size}"
-
- # Scenario 3: Wrong target ID (error case)
+ # Scenario 2: Wrong target ID (error case)
screenshot_dir3 = snapshot_chrome_dir.parent / 'screenshot3'
screenshot_dir3.mkdir()
(snapshot_chrome_dir / 'target_id.txt').write_text('nonexistent-target-id')
@@ -149,9 +129,7 @@ def test_screenshot_with_chrome_session():
assert result.returncode != 0
assert 'target' in result.stderr.lower() and 'not found' in result.stderr.lower()
- except RuntimeError as e:
- if 'Chrome' in str(e) or 'CDP' in str(e):
- pytest.skip(f"Chrome session setup failed: {e}")
+ except RuntimeError:
raise
@@ -362,30 +340,6 @@ def test_missing_snapshot_id_argument():
assert 'Usage:' in result.stderr or 'snapshot' in result.stderr.lower()
-def test_invalid_resolution_format():
- """Test that invalid CHROME_RESOLUTION format is handled gracefully."""
- with tempfile.TemporaryDirectory() as tmpdir:
- data_dir = Path(tmpdir)
- snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-badres'
- screenshot_dir = snapshot_dir / 'screenshot'
- screenshot_dir.mkdir(parents=True)
-
- env = get_test_env()
- # Invalid resolution formats to test parseResolution error handling
- for bad_resolution in ['invalid', '1440', '1440x2000', 'abc,def']:
- env['CHROME_RESOLUTION'] = bad_resolution
- result = subprocess.run(
- ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-badres'],
- cwd=str(screenshot_dir),
- capture_output=True,
- text=True,
- timeout=120,
- env=env
- )
- # Should either fail gracefully or fall back to default
- # (depending on implementation - script should not crash with uncaught error)
- assert result.returncode in (0, 1), f"Script should handle bad resolution: {bad_resolution}"
-
def test_no_cdp_url_fails():
"""Test error when chrome dir exists but no cdp_url.txt."""
with tempfile.TemporaryDirectory() as tmpdir:
diff --git a/archivebox/plugins/search_backend_ripgrep/search.py b/archivebox/plugins/search_backend_ripgrep/search.py
index 171b60bb..dd94f153 100644
--- a/archivebox/plugins/search_backend_ripgrep/search.py
+++ b/archivebox/plugins/search_backend_ripgrep/search.py
@@ -18,6 +18,8 @@ import shutil
from pathlib import Path
from typing import List, Iterable
+from django.conf import settings
+
def get_env(name: str, default: str = '') -> str:
return os.environ.get(name, default).strip()
@@ -51,6 +53,12 @@ def _get_archive_dir() -> Path:
data_dir = os.environ.get('DATA_DIR', '').strip()
if data_dir:
return Path(data_dir) / 'archive'
+ settings_archive_dir = getattr(settings, 'ARCHIVE_DIR', None)
+ if settings_archive_dir:
+ return Path(settings_archive_dir)
+ settings_data_dir = getattr(settings, 'DATA_DIR', None)
+ if settings_data_dir:
+ return Path(settings_data_dir) / 'archive'
return Path.cwd() / 'archive'
diff --git a/archivebox/plugins/search_backend_ripgrep/tests/test_ripgrep_detection.py b/archivebox/plugins/search_backend_ripgrep/tests/test_ripgrep_detection.py
index 60eb6e3a..26b3f118 100644
--- a/archivebox/plugins/search_backend_ripgrep/tests/test_ripgrep_detection.py
+++ b/archivebox/plugins/search_backend_ripgrep/tests/test_ripgrep_detection.py
@@ -25,9 +25,7 @@ def test_ripgrep_hook_detects_binary_from_path():
"""Test that ripgrep hook finds binary using abx-pkg when env var is just a name."""
hook_path = Path(__file__).parent.parent / 'on_Crawl__50_ripgrep_install.py'
- # Skip if rg is not installed
- if not shutil.which('rg'):
- pass
+ assert shutil.which('rg'), "ripgrep not installed"
# Set SEARCH_BACKEND_ENGINE to enable the hook
env = os.environ.copy()
@@ -78,8 +76,7 @@ def test_ripgrep_hook_handles_absolute_path():
hook_path = Path(__file__).parent.parent / 'on_Crawl__50_ripgrep_install.py'
rg_path = shutil.which('rg')
- if not rg_path:
- pytest.skip("ripgrep not installed")
+ assert rg_path, "ripgrep not installed"
env = os.environ.copy()
env['SEARCH_BACKEND_ENGINE'] = 'ripgrep'
@@ -214,8 +211,7 @@ def test_ripgrep_only_detected_when_backend_enabled():
import sys
from pathlib import Path
- if not shutil.which('rg'):
- pytest.skip("ripgrep not installed")
+ assert shutil.which('rg'), "ripgrep not installed"
hook_path = Path(__file__).parent.parent / 'on_Crawl__50_ripgrep_install.py'
diff --git a/archivebox/plugins/search_backend_ripgrep/tests/test_ripgrep_search.py b/archivebox/plugins/search_backend_ripgrep/tests/test_ripgrep_search.py
index 1f0ce7fa..8c1f957a 100644
--- a/archivebox/plugins/search_backend_ripgrep/tests/test_ripgrep_search.py
+++ b/archivebox/plugins/search_backend_ripgrep/tests/test_ripgrep_search.py
@@ -151,7 +151,6 @@ class TestRipgrepSearch(TestCase):
results = search('test')
self.assertEqual(results, [])
- @pytest.mark.skipif(not shutil.which('rg'), reason="ripgrep not installed")
def test_search_single_match(self):
"""search should find matching snapshot."""
results = search('Python programming')
@@ -160,7 +159,6 @@ class TestRipgrepSearch(TestCase):
self.assertNotIn('snap-002', results)
self.assertNotIn('snap-003', results)
- @pytest.mark.skipif(not shutil.which('rg'), reason="ripgrep not installed")
def test_search_multiple_matches(self):
"""search should find all matching snapshots."""
# 'guide' appears in snap-002 (JavaScript guide) and snap-003 (Archiving Guide)
@@ -170,7 +168,6 @@ class TestRipgrepSearch(TestCase):
self.assertIn('snap-003', results)
self.assertNotIn('snap-001', results)
- @pytest.mark.skipif(not shutil.which('rg'), reason="ripgrep not installed")
def test_search_case_insensitive_by_default(self):
"""search should be case-sensitive (ripgrep default)."""
# By default rg is case-sensitive
@@ -181,13 +178,11 @@ class TestRipgrepSearch(TestCase):
self.assertIsInstance(results_upper, list)
self.assertIsInstance(results_lower, list)
- @pytest.mark.skipif(not shutil.which('rg'), reason="ripgrep not installed")
def test_search_no_results(self):
"""search should return empty list for no matches."""
results = search('xyznonexistent123')
self.assertEqual(results, [])
- @pytest.mark.skipif(not shutil.which('rg'), reason="ripgrep not installed")
def test_search_regex(self):
"""search should support regex patterns."""
results = search('(Python|JavaScript)')
@@ -195,7 +190,6 @@ class TestRipgrepSearch(TestCase):
self.assertIn('snap-001', results)
self.assertIn('snap-002', results)
- @pytest.mark.skipif(not shutil.which('rg'), reason="ripgrep not installed")
def test_search_distinct_snapshots(self):
"""search should return distinct snapshot IDs."""
# Query matches both files in snap-001
@@ -212,7 +206,6 @@ class TestRipgrepSearch(TestCase):
search('test')
self.assertIn('ripgrep binary not found', str(context.exception))
- @pytest.mark.skipif(not shutil.which('rg'), reason="ripgrep not installed")
def test_search_with_custom_args(self):
"""search should use custom RIPGREP_ARGS."""
with patch.dict(os.environ, {'RIPGREP_ARGS': '["-i"]'}): # Case insensitive
@@ -220,7 +213,6 @@ class TestRipgrepSearch(TestCase):
# With -i flag, should find regardless of case
self.assertIn('snap-001', results)
- @pytest.mark.skipif(not shutil.which('rg'), reason="ripgrep not installed")
def test_search_timeout(self):
"""search should handle timeout gracefully."""
with patch.dict(os.environ, {'RIPGREP_TIMEOUT': '1'}):
@@ -285,19 +277,16 @@ class TestRipgrepSearchIntegration(TestCase):
else:
file_path.write_text(content)
- @pytest.mark.skipif(not shutil.which('rg'), reason="ripgrep not installed")
def test_search_archivebox(self):
"""Search for archivebox should find documentation snapshot."""
results = search('archivebox')
self.assertIn('1704067200.123456', results)
- @pytest.mark.skipif(not shutil.which('rg'), reason="ripgrep not installed")
def test_search_python(self):
"""Search for python should find Python news snapshot."""
results = search('Python')
self.assertIn('1704153600.654321', results)
- @pytest.mark.skipif(not shutil.which('rg'), reason="ripgrep not installed")
def test_search_pip_install(self):
"""Search for installation command."""
results = search('pip install')
diff --git a/archivebox/plugins/seo/on_Snapshot__38_seo.js b/archivebox/plugins/seo/on_Snapshot__38_seo.js
index e7e905f0..cc107d64 100755
--- a/archivebox/plugins/seo/on_Snapshot__38_seo.js
+++ b/archivebox/plugins/seo/on_Snapshot__38_seo.js
@@ -21,86 +21,37 @@ const path = require('path');
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const puppeteer = require('puppeteer-core');
+// Import shared utilities from chrome_utils.js
+const {
+ getEnvBool,
+ getEnvInt,
+ parseArgs,
+ connectToPage,
+ waitForPageLoaded,
+} = require('../chrome/chrome_utils.js');
+
// Extractor metadata
const PLUGIN_NAME = 'seo';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'seo.json';
const CHROME_SESSION_DIR = '../chrome';
-// Parse command line arguments
-function parseArgs() {
- const args = {};
- process.argv.slice(2).forEach(arg => {
- if (arg.startsWith('--')) {
- const [key, ...valueParts] = arg.slice(2).split('=');
- args[key.replace(/-/g, '_')] = valueParts.join('=') || true;
- }
- });
- return args;
-}
-
-// Get environment variable with default
-function getEnv(name, defaultValue = '') {
- return (process.env[name] || defaultValue).trim();
-}
-
-function getEnvBool(name, defaultValue = false) {
- const val = getEnv(name, '').toLowerCase();
- if (['true', '1', 'yes', 'on'].includes(val)) return true;
- if (['false', '0', 'no', 'off'].includes(val)) return false;
- return defaultValue;
-}
-
-// Wait for chrome tab to be fully loaded
-async function waitForChromeTabLoaded(timeoutMs = 60000) {
- const navigationFile = path.join(CHROME_SESSION_DIR, 'navigation.json');
- const startTime = Date.now();
-
- while (Date.now() - startTime < timeoutMs) {
- if (fs.existsSync(navigationFile)) {
- return true;
- }
- // Wait 100ms before checking again
- await new Promise(resolve => setTimeout(resolve, 100));
- }
-
- return false;
-}
-
-// Get CDP URL from chrome plugin
-function getCdpUrl() {
- const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
- if (fs.existsSync(cdpFile)) {
- return fs.readFileSync(cdpFile, 'utf8').trim();
- }
- return null;
-}
-
// Extract SEO metadata
async function extractSeo(url) {
// Output directory is current directory (hook already runs in output dir)
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
-
+ const timeout = getEnvInt('SEO_TIMEOUT', getEnvInt('TIMEOUT', 30)) * 1000;
let browser = null;
try {
- // Connect to existing Chrome session
- const cdpUrl = getCdpUrl();
- if (!cdpUrl) {
- return { success: false, error: 'No Chrome session found (chrome plugin must run first)' };
- }
-
- browser = await puppeteer.connect({
- browserWSEndpoint: cdpUrl,
+ // Connect to existing Chrome session and get target page
+ const connection = await connectToPage({
+ chromeSessionDir: CHROME_SESSION_DIR,
+ timeoutMs: timeout,
+ puppeteer,
});
-
- // Get the page
- const pages = await browser.pages();
- const page = pages.find(p => p.url().startsWith('http')) || pages[0];
-
- if (!page) {
- return { success: false, error: 'No page found in Chrome session' };
- }
+ browser = connection.browser;
+ const page = connection.page;
// Extract all meta tags
const seoData = await page.evaluate(() => {
@@ -179,15 +130,8 @@ async function main() {
process.exit(0);
}
- // Check if Chrome session exists, then wait for page load
- const cdpUrl = getCdpUrl();
- if (cdpUrl) {
- // Wait for page to be fully loaded
- const pageLoaded = await waitForChromeTabLoaded(60000);
- if (!pageLoaded) {
- throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
- }
- }
+ const timeout = getEnvInt('SEO_TIMEOUT', getEnvInt('TIMEOUT', 30)) * 1000;
+ await waitForPageLoaded(CHROME_SESSION_DIR, timeout * 4, 200);
const result = await extractSeo(url);
diff --git a/archivebox/plugins/seo/tests/test_seo.py b/archivebox/plugins/seo/tests/test_seo.py
index 63233b16..d0e2f09f 100644
--- a/archivebox/plugins/seo/tests/test_seo.py
+++ b/archivebox/plugins/seo/tests/test_seo.py
@@ -6,33 +6,24 @@ meta tag extraction.
"""
import json
-import shutil
import subprocess
import sys
import tempfile
+import shutil
from pathlib import Path
-import pytest
from django.test import TestCase
# Import chrome test helpers
sys.path.insert(0, str(Path(__file__).parent.parent.parent / 'chrome' / 'tests'))
from chrome_test_helpers import (
chrome_session,
- get_test_env,
+ CHROME_NAVIGATE_HOOK,
get_plugin_dir,
get_hook_script,
)
-def chrome_available() -> bool:
- """Check if Chrome/Chromium is available."""
- for name in ['chromium', 'chromium-browser', 'google-chrome', 'chrome']:
- if shutil.which(name):
- return True
- return False
-
-
# Get the path to the SEO hook
PLUGIN_DIR = get_plugin_dir(__file__)
SEO_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_seo.*')
@@ -47,7 +38,6 @@ class TestSEOPlugin(TestCase):
self.assertTrue(SEO_HOOK.exists(), f"Hook not found: {SEO_HOOK}")
-@pytest.mark.skipif(not chrome_available(), reason="Chrome not installed")
class TestSEOWithChrome(TestCase):
"""Integration tests for SEO plugin with Chrome."""
@@ -64,71 +54,75 @@ class TestSEOWithChrome(TestCase):
test_url = 'https://example.com'
snapshot_id = 'test-seo-snapshot'
- try:
- with chrome_session(
- self.temp_dir,
- crawl_id='test-seo-crawl',
- snapshot_id=snapshot_id,
- test_url=test_url,
- navigate=True,
- timeout=30,
- ) as (chrome_process, chrome_pid, snapshot_chrome_dir, env):
- # Use the environment from chrome_session (already has CHROME_HEADLESS=true)
+ with chrome_session(
+ self.temp_dir,
+ crawl_id='test-seo-crawl',
+ snapshot_id=snapshot_id,
+ test_url=test_url,
+ navigate=False,
+ timeout=30,
+ ) as (chrome_process, chrome_pid, snapshot_chrome_dir, env):
+ seo_dir = snapshot_chrome_dir.parent / 'seo'
+ seo_dir.mkdir(exist_ok=True)
+ nav_result = subprocess.run(
+ ['node', str(CHROME_NAVIGATE_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
+ cwd=str(snapshot_chrome_dir),
+ capture_output=True,
+ text=True,
+ timeout=120,
+ env=env
+ )
+ self.assertEqual(nav_result.returncode, 0, f"Navigation failed: {nav_result.stderr}")
- # Run SEO hook with the active Chrome session
- result = subprocess.run(
- ['node', str(SEO_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
- cwd=str(snapshot_chrome_dir),
- capture_output=True,
- text=True,
- timeout=60,
- env=env
- )
+ # Run SEO hook with the active Chrome session
+ result = subprocess.run(
+ ['node', str(SEO_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
+ cwd=str(seo_dir),
+ capture_output=True,
+ text=True,
+ timeout=60,
+ env=env
+ )
- # Check for output file
- seo_output = snapshot_chrome_dir / 'seo.json'
+ # Check for output file
+ seo_output = seo_dir / 'seo.json'
- seo_data = None
+ seo_data = None
- # Try parsing from file first
- if seo_output.exists():
- with open(seo_output) as f:
+ # Try parsing from file first
+ if seo_output.exists():
+ with open(seo_output) as f:
+ try:
+ seo_data = json.load(f)
+ except json.JSONDecodeError:
+ pass
+
+ # Try parsing from stdout if not in file
+ if not seo_data:
+ for line in result.stdout.split('\n'):
+ line = line.strip()
+ if line.startswith('{'):
try:
- seo_data = json.load(f)
+ record = json.loads(line)
+ # SEO data typically has title, description, or og: tags
+ if any(key in record for key in ['title', 'description', 'og:title', 'canonical']):
+ seo_data = record
+ break
except json.JSONDecodeError:
- pass
+ continue
- # Try parsing from stdout if not in file
- if not seo_data:
- for line in result.stdout.split('\n'):
- line = line.strip()
- if line.startswith('{'):
- try:
- record = json.loads(line)
- # SEO data typically has title, description, or og: tags
- if any(key in record for key in ['title', 'description', 'og:title', 'canonical']):
- seo_data = record
- break
- except json.JSONDecodeError:
- continue
+ # Verify hook ran successfully
+ self.assertEqual(result.returncode, 0, f"Hook failed: {result.stderr}")
+ self.assertNotIn('Traceback', result.stderr)
+ self.assertNotIn('Error:', result.stderr)
- # Verify hook ran successfully
- self.assertEqual(result.returncode, 0, f"Hook failed: {result.stderr}")
- self.assertNotIn('Traceback', result.stderr)
- self.assertNotIn('Error:', result.stderr)
+ # example.com has a title, so we MUST get SEO data
+ self.assertIsNotNone(seo_data, "No SEO data extracted from file or stdout")
- # example.com has a title, so we MUST get SEO data
- self.assertIsNotNone(seo_data, "No SEO data extracted from file or stdout")
-
- # Verify we got some SEO data
- has_seo_data = any(key in seo_data for key in ['title', 'description', 'og:title', 'canonical', 'meta'])
- self.assertTrue(has_seo_data, f"No SEO data extracted: {seo_data}")
-
- except RuntimeError as e:
- if 'Chrome' in str(e) or 'CDP' in str(e):
- self.skipTest(f"Chrome session setup failed: {e}")
- raise
+ # Verify we got some SEO data
+ has_seo_data = any(key in seo_data for key in ['title', 'description', 'og:title', 'canonical', 'meta'])
+ self.assertTrue(has_seo_data, f"No SEO data extracted: {seo_data}")
if __name__ == '__main__':
diff --git a/archivebox/plugins/singlefile/on_Snapshot__50_singlefile.py b/archivebox/plugins/singlefile/on_Snapshot__50_singlefile.py
index 3590c793..4d91e0e7 100644
--- a/archivebox/plugins/singlefile/on_Snapshot__50_singlefile.py
+++ b/archivebox/plugins/singlefile/on_Snapshot__50_singlefile.py
@@ -9,12 +9,12 @@ Environment variables:
SINGLEFILE_ENABLED: Enable SingleFile archiving (default: True)
SINGLEFILE_BINARY: Path to SingleFile binary (default: single-file)
SINGLEFILE_NODE_BINARY: Path to Node.js binary (x-fallback: NODE_BINARY)
- SINGLEFILE_CHROME_BINARY: Path to Chrome binary (x-fallback: CHROME_BINARY)
+ SINGLEFILE_CHROME_BINARY: Path to Chrome binary (x-fallback: CHROME_BINARY) [unused; shared Chrome session required]
SINGLEFILE_TIMEOUT: Timeout in seconds (x-fallback: TIMEOUT)
SINGLEFILE_USER_AGENT: User agent string (x-fallback: USER_AGENT)
SINGLEFILE_COOKIES_FILE: Path to cookies file (x-fallback: COOKIES_FILE)
SINGLEFILE_CHECK_SSL_VALIDITY: Whether to verify SSL certs (x-fallback: CHECK_SSL_VALIDITY)
- SINGLEFILE_CHROME_ARGS: Chrome command-line arguments (x-fallback: CHROME_ARGS)
+ SINGLEFILE_CHROME_ARGS: Chrome command-line arguments (x-fallback: CHROME_ARGS) [unused; shared Chrome session required]
SINGLEFILE_ARGS: Default SingleFile arguments (JSON array)
SINGLEFILE_ARGS_EXTRA: Extra arguments to append (JSON array)
"""
@@ -138,8 +138,7 @@ def save_singlefile(url: str, binary: str) -> tuple[bool, str | None, str]:
"""
Archive URL using SingleFile.
- If a Chrome session exists (from chrome plugin), connects to it via CDP.
- Otherwise launches a new Chrome instance.
+ Requires a Chrome session (from chrome plugin) and connects to it via CDP.
Returns: (success, output_path, error_message)
"""
@@ -151,8 +150,7 @@ def save_singlefile(url: str, binary: str) -> tuple[bool, str | None, str]:
cookies_file = get_env('SINGLEFILE_COOKIES_FILE') or get_env('COOKIES_FILE', '')
singlefile_args = get_env_array('SINGLEFILE_ARGS', [])
singlefile_args_extra = get_env_array('SINGLEFILE_ARGS_EXTRA', [])
- chrome_args = get_env_array('SINGLEFILE_CHROME_ARGS') or get_env_array('CHROME_ARGS', [])
- chrome = get_env('SINGLEFILE_CHROME_BINARY') or get_env('CHROME_BINARY', '')
+ # Chrome args/binary are intentionally ignored because we require a shared Chrome session
cmd = [binary, *singlefile_args]
@@ -176,14 +174,8 @@ def save_singlefile(url: str, binary: str) -> tuple[bool, str | None, str]:
if cdp_remote_url:
print(f'[singlefile] Using existing Chrome session: {cdp_remote_url}', file=sys.stderr)
cmd.extend(['--browser-server', cdp_remote_url])
- elif chrome:
- print(f'[singlefile] Launching Chrome binary: {chrome}', file=sys.stderr)
- cmd.extend(['--browser-executable-path', chrome])
-
- # Pass Chrome arguments (only when launching a new browser)
- if chrome_args and not cdp_remote_url:
- # SingleFile expects --browser-args as a JSON array string
- cmd.extend(['--browser-args', json.dumps(chrome_args)])
+ else:
+ return False, None, 'No Chrome session found (chrome plugin must run first)'
# SSL handling
if not check_ssl:
@@ -267,8 +259,8 @@ def save_singlefile_with_extension(url: str, timeout: int) -> tuple[bool, str |
# Only attempt if chrome session exists
cdp_url = get_cdp_url(wait_seconds=min(5, max(1, timeout // 10)))
if not cdp_url:
- print('[singlefile] No chrome session (cdp_url.txt missing)', file=sys.stderr)
- return False, None, 'No Chrome session available'
+ print('[singlefile] No Chrome session found (chrome plugin must run first)', file=sys.stderr)
+ return False, None, 'No Chrome session found (chrome plugin must run first)'
if not EXTENSION_SAVE_SCRIPT.exists():
print(f'[singlefile] Missing helper script: {EXTENSION_SAVE_SCRIPT}', file=sys.stderr)
diff --git a/archivebox/plugins/singlefile/tests/test_singlefile.py b/archivebox/plugins/singlefile/tests/test_singlefile.py
index dd533e3c..8de0a163 100644
--- a/archivebox/plugins/singlefile/tests/test_singlefile.py
+++ b/archivebox/plugins/singlefile/tests/test_singlefile.py
@@ -59,27 +59,71 @@ def test_verify_deps_with_abx_pkg():
def test_singlefile_cli_archives_example_com():
- """Test that singlefile CLI archives example.com and produces valid HTML."""
+ """Test that singlefile archives example.com and produces valid HTML."""
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
- env = get_test_env()
- env['SINGLEFILE_ENABLED'] = 'true'
+ data_dir = tmpdir / 'data'
+ extensions_dir = data_dir / 'personas' / 'Default' / 'chrome_extensions'
+ downloads_dir = data_dir / 'personas' / 'Default' / 'chrome_downloads'
+ user_data_dir = data_dir / 'personas' / 'Default' / 'chrome_user_data'
+ extensions_dir.mkdir(parents=True, exist_ok=True)
+ downloads_dir.mkdir(parents=True, exist_ok=True)
+ user_data_dir.mkdir(parents=True, exist_ok=True)
+
+ env_install = os.environ.copy()
+ env_install.update({
+ 'DATA_DIR': str(data_dir),
+ 'CHROME_EXTENSIONS_DIR': str(extensions_dir),
+ 'CHROME_DOWNLOADS_DIR': str(downloads_dir),
+ })
- # Run singlefile snapshot hook
result = subprocess.run(
- [sys.executable, str(SNAPSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=test789'],
- cwd=tmpdir,
+ ['node', str(INSTALL_SCRIPT)],
capture_output=True,
text=True,
- env=env,
- timeout=120
+ env=env_install,
+ timeout=120,
)
+ assert result.returncode == 0, f"Extension install failed: {result.stderr}"
+
+ old_env = os.environ.copy()
+ os.environ['CHROME_USER_DATA_DIR'] = str(user_data_dir)
+ os.environ['CHROME_DOWNLOADS_DIR'] = str(downloads_dir)
+ os.environ['CHROME_EXTENSIONS_DIR'] = str(extensions_dir)
+ try:
+ with chrome_session(
+ tmpdir=tmpdir,
+ crawl_id='singlefile-cli-crawl',
+ snapshot_id='singlefile-cli-snap',
+ test_url=TEST_URL,
+ navigate=True,
+ timeout=30,
+ ) as (_chrome_proc, _chrome_pid, snapshot_chrome_dir, env):
+ env['SINGLEFILE_ENABLED'] = 'true'
+ env['CHROME_EXTENSIONS_DIR'] = str(extensions_dir)
+ env['CHROME_DOWNLOADS_DIR'] = str(downloads_dir)
+
+ singlefile_output_dir = snapshot_chrome_dir.parent / 'singlefile'
+ singlefile_output_dir.mkdir(parents=True, exist_ok=True)
+
+ # Run singlefile snapshot hook
+ result = subprocess.run(
+ [sys.executable, str(SNAPSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=test789'],
+ cwd=singlefile_output_dir,
+ capture_output=True,
+ text=True,
+ env=env,
+ timeout=120,
+ )
+ finally:
+ os.environ.clear()
+ os.environ.update(old_env)
assert result.returncode == 0, f"Hook execution failed: {result.stderr}"
# Verify output file exists
- output_file = tmpdir / 'singlefile.html'
+ output_file = singlefile_output_dir / 'singlefile.html'
assert output_file.exists(), f"singlefile.html not created. stdout: {result.stdout}, stderr: {result.stderr}"
# Verify it contains real HTML
diff --git a/archivebox/plugins/ssl/on_Snapshot__23_ssl.bg.js b/archivebox/plugins/ssl/on_Snapshot__23_ssl.bg.js
index 59740e5c..6559d9fd 100755
--- a/archivebox/plugins/ssl/on_Snapshot__23_ssl.bg.js
+++ b/archivebox/plugins/ssl/on_Snapshot__23_ssl.bg.js
@@ -34,18 +34,26 @@ const CHROME_SESSION_DIR = '../chrome';
let browser = null;
let page = null;
+let client = null;
let sslCaptured = false;
let shuttingDown = false;
async function setupListener(url) {
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
const timeout = getEnvInt('SSL_TIMEOUT', 30) * 1000;
+ let targetHost = null;
// Only extract SSL for HTTPS URLs
if (!url.startsWith('https://')) {
throw new Error('URL is not HTTPS');
}
+ try {
+ targetHost = new URL(url).host;
+ } catch (e) {
+ targetHost = null;
+ }
+
// Connect to Chrome page using shared utility
const { browser, page } = await connectToPage({
chromeSessionDir: CHROME_SESSION_DIR,
@@ -53,54 +61,54 @@ async function setupListener(url) {
puppeteer,
});
- // Set up listener to capture SSL details during navigation
- page.on('response', async (response) => {
+ client = await page.target().createCDPSession();
+ await client.send('Network.enable');
+
+ client.on('Network.responseReceived', (params) => {
try {
- const request = response.request();
+ if (sslCaptured) return;
+ if (params.type && params.type !== 'Document') return;
+ const response = params.response || {};
+ const responseUrl = response.url || '';
+ if (!responseUrl.startsWith('http')) return;
- // Only capture the main navigation request
- if (!request.isNavigationRequest() || request.frame() !== page.mainFrame()) {
- return;
+ if (targetHost) {
+ try {
+ const responseHost = new URL(responseUrl).host;
+ if (responseHost !== targetHost) return;
+ } catch (e) {
+ // Ignore URL parse errors, fall through
+ }
}
- // Only capture if it's for our target URL
- if (!response.url().startsWith(url.split('?')[0])) {
- return;
- }
-
- // Get security details from the response
- const securityDetails = response.securityDetails();
- let sslInfo = {};
+ const securityDetails = response.securityDetails || null;
+ let sslInfo = { url: responseUrl };
if (securityDetails) {
- sslInfo.protocol = securityDetails.protocol();
- sslInfo.subjectName = securityDetails.subjectName();
- sslInfo.issuer = securityDetails.issuer();
- sslInfo.validFrom = securityDetails.validFrom();
- sslInfo.validTo = securityDetails.validTo();
- sslInfo.certificateId = securityDetails.subjectName();
- sslInfo.securityState = 'secure';
+ sslInfo.protocol = securityDetails.protocol;
+ sslInfo.subjectName = securityDetails.subjectName;
+ sslInfo.issuer = securityDetails.issuer;
+ sslInfo.validFrom = securityDetails.validFrom;
+ sslInfo.validTo = securityDetails.validTo;
+ sslInfo.certificateId = securityDetails.subjectName;
+ sslInfo.securityState = response.securityState || 'secure';
sslInfo.schemeIsCryptographic = true;
- const sanList = securityDetails.sanList();
+ const sanList = securityDetails.sanList;
if (sanList && sanList.length > 0) {
sslInfo.subjectAlternativeNames = sanList;
}
- } else if (response.url().startsWith('https://')) {
- // HTTPS URL but no security details means something went wrong
- sslInfo.securityState = 'unknown';
+ } else if (responseUrl.startsWith('https://')) {
+ sslInfo.securityState = response.securityState || 'unknown';
sslInfo.schemeIsCryptographic = true;
sslInfo.error = 'No security details available';
} else {
- // Non-HTTPS URL
sslInfo.securityState = 'insecure';
sslInfo.schemeIsCryptographic = false;
}
- // Write output directly to file
fs.writeFileSync(outputPath, JSON.stringify(sslInfo, null, 2));
sslCaptured = true;
-
} catch (e) {
// Ignore errors
}
diff --git a/archivebox/plugins/ssl/tests/test_ssl.py b/archivebox/plugins/ssl/tests/test_ssl.py
index 5dfa17df..6f8375c1 100644
--- a/archivebox/plugins/ssl/tests/test_ssl.py
+++ b/archivebox/plugins/ssl/tests/test_ssl.py
@@ -13,26 +13,18 @@ import tempfile
import time
from pathlib import Path
-import pytest
from django.test import TestCase
# Import chrome test helpers
sys.path.insert(0, str(Path(__file__).parent.parent.parent / 'chrome' / 'tests'))
from chrome_test_helpers import (
chrome_session,
+ CHROME_NAVIGATE_HOOK,
get_plugin_dir,
get_hook_script,
)
-def chrome_available() -> bool:
- """Check if Chrome/Chromium is available."""
- for name in ['chromium', 'chromium-browser', 'google-chrome', 'chrome']:
- if shutil.which(name):
- return True
- return False
-
-
# Get the path to the SSL hook
PLUGIN_DIR = get_plugin_dir(__file__)
SSL_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_ssl.*')
@@ -47,7 +39,6 @@ class TestSSLPlugin(TestCase):
self.assertTrue(SSL_HOOK.exists(), f"Hook not found: {SSL_HOOK}")
-@pytest.mark.skipif(not chrome_available(), reason="Chrome not installed")
class TestSSLWithChrome(TestCase):
"""Integration tests for SSL plugin with Chrome."""
@@ -64,88 +55,92 @@ class TestSSLWithChrome(TestCase):
test_url = 'https://example.com'
snapshot_id = 'test-ssl-snapshot'
- try:
- with chrome_session(
- self.temp_dir,
- crawl_id='test-ssl-crawl',
- snapshot_id=snapshot_id,
- test_url=test_url,
- navigate=True,
- timeout=30,
- ) as (chrome_process, chrome_pid, snapshot_chrome_dir, env):
- # Use the environment from chrome_session (already has CHROME_HEADLESS=true)
+ with chrome_session(
+ self.temp_dir,
+ crawl_id='test-ssl-crawl',
+ snapshot_id=snapshot_id,
+ test_url=test_url,
+ navigate=False,
+ timeout=30,
+ ) as (chrome_process, chrome_pid, snapshot_chrome_dir, env):
+ ssl_dir = snapshot_chrome_dir.parent / 'ssl'
+ ssl_dir.mkdir(exist_ok=True)
+ # Run SSL hook with the active Chrome session (background hook)
+ result = subprocess.Popen(
+ ['node', str(SSL_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
+ cwd=str(ssl_dir),
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ text=True,
+ env=env
+ )
- # Run SSL hook with the active Chrome session (background hook)
- result = subprocess.Popen(
- ['node', str(SSL_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
- cwd=str(snapshot_chrome_dir),
- stdout=subprocess.PIPE,
- stderr=subprocess.PIPE,
- text=True,
- env=env
- )
+ nav_result = subprocess.run(
+ ['node', str(CHROME_NAVIGATE_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
+ cwd=str(snapshot_chrome_dir),
+ capture_output=True,
+ text=True,
+ timeout=120,
+ env=env
+ )
+ self.assertEqual(nav_result.returncode, 0, f"Navigation failed: {nav_result.stderr}")
- # Allow it to run briefly, then terminate (background hook)
- time.sleep(3)
- if result.poll() is None:
- result.terminate()
- try:
- stdout, stderr = result.communicate(timeout=5)
- except subprocess.TimeoutExpired:
- result.kill()
- stdout, stderr = result.communicate()
- else:
+ # Check for output file
+ ssl_output = ssl_dir / 'ssl.jsonl'
+ for _ in range(30):
+ if ssl_output.exists() and ssl_output.stat().st_size > 0:
+ break
+ time.sleep(1)
+
+ if result.poll() is None:
+ result.terminate()
+ try:
+ stdout, stderr = result.communicate(timeout=5)
+ except subprocess.TimeoutExpired:
+ result.kill()
stdout, stderr = result.communicate()
+ else:
+ stdout, stderr = result.communicate()
- # Check for output file
- ssl_output = snapshot_chrome_dir / 'ssl.jsonl'
+ ssl_data = None
- ssl_data = None
+ # Try parsing from file first
+ if ssl_output.exists():
+ with open(ssl_output) as f:
+ content = f.read().strip()
+ if content.startswith('{'):
+ try:
+ ssl_data = json.loads(content)
+ except json.JSONDecodeError:
+ pass
- # Try parsing from file first
- if ssl_output.exists():
- with open(ssl_output) as f:
- for line in f:
- line = line.strip()
- if line.startswith('{'):
- try:
- ssl_data = json.loads(line)
- break
- except json.JSONDecodeError:
- continue
+ # Try parsing from stdout if not in file
+ if not ssl_data:
+ for line in stdout.split('\n'):
+ line = line.strip()
+ if line.startswith('{'):
+ try:
+ record = json.loads(line)
+ if 'protocol' in record or 'issuer' in record or record.get('type') == 'SSL':
+ ssl_data = record
+ break
+ except json.JSONDecodeError:
+ continue
- # Try parsing from stdout if not in file
- if not ssl_data:
- for line in stdout.split('\n'):
- line = line.strip()
- if line.startswith('{'):
- try:
- record = json.loads(line)
- if 'protocol' in record or 'issuer' in record or record.get('type') == 'SSL':
- ssl_data = record
- break
- except json.JSONDecodeError:
- continue
+ # Verify hook ran successfully
+ self.assertNotIn('Traceback', stderr)
+ self.assertNotIn('Error:', stderr)
- # Verify hook ran successfully
- self.assertNotIn('Traceback', stderr)
- self.assertNotIn('Error:', stderr)
+ # example.com uses HTTPS, so we MUST get SSL certificate data
+ self.assertIsNotNone(ssl_data, "No SSL data extracted from HTTPS URL")
- # example.com uses HTTPS, so we MUST get SSL certificate data
- self.assertIsNotNone(ssl_data, "No SSL data extracted from HTTPS URL")
-
- # Verify we got certificate info
- self.assertIn('protocol', ssl_data, f"SSL data missing protocol: {ssl_data}")
- self.assertTrue(
- ssl_data['protocol'].startswith('TLS') or ssl_data['protocol'].startswith('SSL'),
- f"Unexpected protocol: {ssl_data['protocol']}"
- )
-
- except RuntimeError as e:
- if 'Chrome' in str(e) or 'CDP' in str(e):
- self.skipTest(f"Chrome session setup failed: {e}")
- raise
+ # Verify we got certificate info
+ self.assertIn('protocol', ssl_data, f"SSL data missing protocol: {ssl_data}")
+ self.assertTrue(
+ ssl_data['protocol'].startswith('TLS') or ssl_data['protocol'].startswith('SSL'),
+ f"Unexpected protocol: {ssl_data['protocol']}"
+ )
if __name__ == '__main__':
diff --git a/archivebox/plugins/staticfile/on_Snapshot__26_staticfile.bg.js b/archivebox/plugins/staticfile/on_Snapshot__26_staticfile.bg.js
index 33531d93..984e15c7 100644
--- a/archivebox/plugins/staticfile/on_Snapshot__26_staticfile.bg.js
+++ b/archivebox/plugins/staticfile/on_Snapshot__26_staticfile.bg.js
@@ -149,6 +149,17 @@ function getFilenameFromUrl(url) {
}
}
+function normalizeUrl(url) {
+ try {
+ const parsed = new URL(url);
+ let path = parsed.pathname || '';
+ if (path === '/') path = '';
+ return `${parsed.origin}${path}`;
+ } catch (e) {
+ return url;
+ }
+}
+
async function setupStaticFileListener() {
const timeout = getEnvInt('STATICFILE_TIMEOUT', 30) * 1000;
@@ -174,7 +185,7 @@ async function setupStaticFileListener() {
const status = response.status();
// Only process the main document response
- if (url !== originalUrl) return;
+ if (normalizeUrl(url) !== normalizeUrl(originalUrl)) return;
if (status < 200 || status >= 300) return;
firstResponseHandled = true;
@@ -313,6 +324,19 @@ async function main() {
// Wait for chrome_navigate to complete (non-fatal)
try {
await waitForPageLoaded(CHROME_SESSION_DIR, timeout * 4, 500);
+ if (!detectedContentType && page) {
+ try {
+ const inferred = await page.evaluate(() => document.contentType || '');
+ if (inferred) {
+ detectedContentType = inferred.split(';')[0].trim();
+ if (isStaticContentType(detectedContentType)) {
+ isStaticFile = true;
+ }
+ }
+ } catch (e) {
+ // Best-effort only
+ }
+ }
} catch (e) {
console.error(`WARN: ${e.message}`);
}
diff --git a/archivebox/plugins/staticfile/tests/test_staticfile.py b/archivebox/plugins/staticfile/tests/test_staticfile.py
index b99be87c..f40b0677 100644
--- a/archivebox/plugins/staticfile/tests/test_staticfile.py
+++ b/archivebox/plugins/staticfile/tests/test_staticfile.py
@@ -48,7 +48,6 @@ class TestStaticfilePlugin(TestCase):
self.assertTrue(STATICFILE_HOOK.exists(), f"Hook not found: {STATICFILE_HOOK}")
-@pytest.mark.skipif(not chrome_available(), reason="Chrome not installed")
class TestStaticfileWithChrome(TestCase):
"""Integration tests for staticfile plugin with Chrome."""
@@ -116,9 +115,7 @@ class TestStaticfileWithChrome(TestCase):
except json.JSONDecodeError:
continue
- except RuntimeError as e:
- if 'Chrome' in str(e) or 'CDP' in str(e):
- self.skipTest(f"Chrome session setup failed: {e}")
+ except RuntimeError:
raise
diff --git a/archivebox/plugins/title/on_Snapshot__54_title.js b/archivebox/plugins/title/on_Snapshot__54_title.js
index cfad4add..af89e779 100644
--- a/archivebox/plugins/title/on_Snapshot__54_title.js
+++ b/archivebox/plugins/title/on_Snapshot__54_title.js
@@ -2,22 +2,27 @@
/**
* Extract the title of a URL.
*
- * If a Chrome session exists (from chrome plugin), connects to it via CDP
+ * Requires a Chrome session (from chrome plugin) and connects to it via CDP
* to get the page title (which includes JS-rendered content).
- * Otherwise falls back to fetching the URL and parsing HTML.
*
* Usage: on_Snapshot__10_title.js --url= --snapshot-id=
* Output: Writes title/title.txt
*
* Environment variables:
- * TIMEOUT: Timeout in seconds (default: 30)
- * USER_AGENT: User agent string (optional)
+ * TITLE_TIMEOUT: Timeout in seconds (x-fallback: TIMEOUT, default: 30)
*/
const fs = require('fs');
const path = require('path');
-const https = require('https');
-const http = require('http');
+const puppeteer = require('puppeteer-core');
+
+// Import shared utilities from chrome_utils.js
+const {
+ getEnvInt,
+ parseArgs,
+ connectToPage,
+ waitForPageLoaded,
+} = require('../chrome/chrome_utils.js');
// Extractor metadata
const PLUGIN_NAME = 'title';
@@ -25,189 +30,47 @@ const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'title.txt';
const CHROME_SESSION_DIR = '../chrome';
-// Parse command line arguments
-function parseArgs() {
- const args = {};
- process.argv.slice(2).forEach(arg => {
- if (arg.startsWith('--')) {
- const [key, ...valueParts] = arg.slice(2).split('=');
- args[key.replace(/-/g, '_')] = valueParts.join('=') || true;
- }
- });
- return args;
-}
-
-// Get environment variable with default
-function getEnv(name, defaultValue = '') {
- return (process.env[name] || defaultValue).trim();
-}
-
-function getEnvInt(name, defaultValue = 0) {
- const val = parseInt(getEnv(name, String(defaultValue)), 10);
- return isNaN(val) ? defaultValue : val;
-}
-
-// Wait for chrome tab to be fully loaded
-async function waitForChromeTabLoaded(timeoutMs = 60000) {
- const navigationFile = path.join(CHROME_SESSION_DIR, 'navigation.json');
- const startTime = Date.now();
-
- while (Date.now() - startTime < timeoutMs) {
- if (fs.existsSync(navigationFile)) {
- return true;
- }
- // Wait 100ms before checking again
- await new Promise(resolve => setTimeout(resolve, 100));
- }
-
- return false;
-}
-
-// Get CDP URL from chrome plugin if available
-function getCdpUrl() {
- const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
- if (fs.existsSync(cdpFile)) {
- return fs.readFileSync(cdpFile, 'utf8').trim();
- }
- return null;
-}
-
-// Extract title from HTML
-function extractTitleFromHtml(html) {
- // Try tag
- const titleMatch = html.match(/]*>([^<]+)<\/title>/i);
- if (titleMatch) {
- return titleMatch[1].trim();
- }
-
- // Try og:title
- const ogMatch = html.match(/]+property=["']og:title["'][^>]+content=["']([^"']+)["']/i);
- if (ogMatch) {
- return ogMatch[1].trim();
- }
-
- // Try twitter:title
- const twitterMatch = html.match(/]+name=["']twitter:title["'][^>]+content=["']([^"']+)["']/i);
- if (twitterMatch) {
- return twitterMatch[1].trim();
- }
-
- return null;
-}
-
-// Fetch URL and extract title (fallback method)
-function fetchTitle(url) {
- return new Promise((resolve, reject) => {
- const timeout = getEnvInt('TIMEOUT', 30) * 1000;
- const userAgent = getEnv('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)');
-
- const client = url.startsWith('https') ? https : http;
-
- const req = client.get(url, {
- headers: { 'User-Agent': userAgent },
- timeout,
- }, (res) => {
- // Handle redirects
- if (res.statusCode >= 300 && res.statusCode < 400 && res.headers.location) {
- fetchTitle(res.headers.location).then(resolve).catch(reject);
- return;
- }
-
- let data = '';
- res.on('data', chunk => {
- data += chunk;
- // Only need first 64KB to find title
- if (data.length > 65536) {
- req.destroy();
- }
- });
- res.on('end', () => {
- const title = extractTitleFromHtml(data);
- if (title) {
- resolve(title);
- } else {
- reject(new Error('No title found in HTML'));
- }
- });
- });
-
- req.on('error', reject);
- req.on('timeout', () => {
- req.destroy();
- reject(new Error('Request timeout'));
- });
- });
-}
-
-// Get title using Puppeteer CDP connection
-async function getTitleFromCdp(cdpUrl) {
- // Wait for page to be fully loaded
- const pageLoaded = await waitForChromeTabLoaded(60000);
- if (!pageLoaded) {
- throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
- }
-
- const puppeteer = require('puppeteer-core');
-
- const browser = await puppeteer.connect({
- browserWSEndpoint: cdpUrl,
- });
+async function extractTitle(url) {
+ // Output directory is current directory (hook already runs in output dir)
+ const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
+ const timeoutMs = getEnvInt('TITLE_TIMEOUT', getEnvInt('TIMEOUT', 30)) * 1000;
+ let browser = null;
try {
- // Get existing pages
- const pages = await browser.pages();
- const page = pages.find(p => p.url().startsWith('http')) || pages[0];
+ const connection = await connectToPage({
+ chromeSessionDir: CHROME_SESSION_DIR,
+ timeoutMs,
+ puppeteer,
+ });
+ browser = connection.browser;
+ const page = connection.page;
- if (!page) {
- throw new Error('No page found in Chrome session');
- }
+ await waitForPageLoaded(CHROME_SESSION_DIR, timeoutMs * 4, 200);
// Get title from page
- const title = await page.title();
+ let title = await page.title();
if (!title) {
// Try getting from DOM directly
- const domTitle = await page.evaluate(() => {
+ title = await page.evaluate(() => {
return document.title ||
document.querySelector('meta[property="og:title"]')?.content ||
document.querySelector('meta[name="twitter:title"]')?.content ||
document.querySelector('h1')?.textContent?.trim();
});
- return domTitle;
}
- return title;
- } finally {
- // Disconnect without closing browser
- browser.disconnect();
- }
-}
-
-async function extractTitle(url) {
- // Output directory is current directory (hook already runs in output dir)
- const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
-
- // Try Chrome session first
- const cdpUrl = getCdpUrl();
- if (cdpUrl) {
- try {
- const title = await getTitleFromCdp(cdpUrl);
- if (title) {
- fs.writeFileSync(outputPath, title, 'utf8');
- return { success: true, output: outputPath, title, method: 'cdp' };
- }
- } catch (e) {
- console.error(`CDP title extraction failed: ${e.message}, falling back to HTTP`);
+ if (title) {
+ fs.writeFileSync(outputPath, title, 'utf8');
+ return { success: true, output: outputPath, title, method: 'cdp' };
}
- }
-
- // Fallback to HTTP fetch
- try {
- const title = await fetchTitle(url);
- fs.writeFileSync(outputPath, title, 'utf8');
- return { success: true, output: outputPath, title, method: 'http' };
+ return { success: false, error: 'No title found in Chrome session' };
} catch (e) {
return { success: false, error: e.message };
+ } finally {
+ if (browser) {
+ browser.disconnect();
+ }
}
}
diff --git a/archivebox/plugins/title/tests/test_title.py b/archivebox/plugins/title/tests/test_title.py
index 91b548d6..78b2ffbd 100644
--- a/archivebox/plugins/title/tests/test_title.py
+++ b/archivebox/plugins/title/tests/test_title.py
@@ -7,8 +7,7 @@ Tests verify:
3. Title extraction works for real example.com
4. Output file contains actual page title
5. Handles various title sources (, og:title, twitter:title)
-6. Config options work (TIMEOUT, USER_AGENT)
-7. Fallback to HTTP when chrome not available
+6. Config options work (TITLE_TIMEOUT)
"""
import json
@@ -23,6 +22,9 @@ from archivebox.plugins.chrome.tests.chrome_test_helpers import (
get_plugin_dir,
get_hook_script,
parse_jsonl_output,
+ get_test_env,
+ chrome_session,
+ CHROME_NAVIGATE_HOOK,
)
@@ -30,6 +32,25 @@ PLUGIN_DIR = get_plugin_dir(__file__)
TITLE_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_title.*')
TEST_URL = 'https://example.com'
+def run_title_capture(title_dir, snapshot_chrome_dir, env, url, snapshot_id):
+ nav_result = subprocess.run(
+ ['node', str(CHROME_NAVIGATE_HOOK), f'--url={url}', f'--snapshot-id={snapshot_id}'],
+ cwd=str(snapshot_chrome_dir),
+ capture_output=True,
+ text=True,
+ timeout=120,
+ env=env,
+ )
+ result = subprocess.run(
+ ['node', str(TITLE_HOOK), f'--url={url}', f'--snapshot-id={snapshot_id}'],
+ cwd=title_dir,
+ capture_output=True,
+ text=True,
+ timeout=60,
+ env=env,
+ )
+ return nav_result, result
+
def test_hook_script_exists():
"""Verify hook script exists."""
@@ -46,15 +67,18 @@ def test_extracts_title_from_example_com():
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
- # Run title extraction
- result = subprocess.run(
- ['node', str(TITLE_HOOK), f'--url={TEST_URL}', '--snapshot-id=test789'],
- cwd=tmpdir,
- capture_output=True,
- text=True,
- timeout=60
- ,
- env=get_test_env())
+ with chrome_session(tmpdir, test_url=TEST_URL, navigate=False) as (_process, _pid, snapshot_chrome_dir, env):
+ title_dir = snapshot_chrome_dir.parent / 'title'
+ title_dir.mkdir(exist_ok=True)
+
+ nav_result, result = run_title_capture(
+ title_dir,
+ snapshot_chrome_dir,
+ env,
+ TEST_URL,
+ 'test789',
+ )
+ assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}"
assert result.returncode == 0, f"Extraction failed: {result.stderr}"
@@ -76,7 +100,7 @@ def test_extracts_title_from_example_com():
assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
# Verify output file exists (hook writes to current directory)
- title_file = tmpdir / 'title.txt'
+ title_file = title_dir / 'title.txt'
assert title_file.exists(), "title.txt not created"
# Verify title contains REAL example.com title
@@ -88,56 +112,33 @@ def test_extracts_title_from_example_com():
assert 'example domain' in title_text.lower(), f"Expected 'Example Domain', got: {title_text}"
-def test_falls_back_to_http_when_chrome_unavailable():
- """Test that title plugin falls back to HTTP when chrome unavailable."""
+def test_fails_without_chrome_session():
+ """Test that title plugin fails when chrome session is missing."""
if not shutil.which('node'):
pass
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
-
- # Don't create chrome directory - force HTTP fallback
+ title_dir = tmpdir / 'snapshot' / 'title'
+ title_dir.mkdir(parents=True, exist_ok=True)
# Run title extraction
result = subprocess.run(
['node', str(TITLE_HOOK), f'--url={TEST_URL}', '--snapshot-id=testhttp'],
- cwd=tmpdir,
+ cwd=title_dir,
capture_output=True,
text=True,
- timeout=60
- ,
- env=get_test_env())
+ timeout=60,
+ env=get_test_env(),
+ )
- assert result.returncode == 0, f"Extraction failed: {result.stderr}"
-
- # Parse clean JSONL output
- result_json = None
- for line in result.stdout.strip().split('\n'):
- line = line.strip()
- if line.startswith('{'):
- pass
- try:
- record = json.loads(line)
- if record.get('type') == 'ArchiveResult':
- result_json = record
- break
- except json.JSONDecodeError:
- pass
-
- assert result_json, "Should have ArchiveResult JSONL output"
- assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
-
- # Verify output exists and has real title (hook writes to current directory)
- output_title_file = tmpdir / 'title.txt'
- assert output_title_file.exists(), "Output title.txt not created"
-
- title_text = output_title_file.read_text().strip()
- assert 'example' in title_text.lower()
+ assert result.returncode != 0, f"Should fail without chrome session: {result.stderr}"
+ assert 'No Chrome session found (chrome plugin must run first)' in (result.stdout + result.stderr)
def test_config_timeout_honored():
- """Test that TIMEOUT config is respected."""
+ """Test that TITLE_TIMEOUT config is respected."""
if not shutil.which('node'):
pass
@@ -147,65 +148,27 @@ def test_config_timeout_honored():
# Set very short timeout (but example.com should still succeed)
import os
- env = os.environ.copy()
- env['TIMEOUT'] = '5'
+ env_override = os.environ.copy()
+ env_override['TITLE_TIMEOUT'] = '5'
- result = subprocess.run(
- ['node', str(TITLE_HOOK), f'--url={TEST_URL}', '--snapshot-id=testtimeout'],
- cwd=tmpdir,
- capture_output=True,
- text=True,
- env=env,
- timeout=30
- )
+ with chrome_session(tmpdir, test_url=TEST_URL, navigate=False) as (_process, _pid, snapshot_chrome_dir, env):
+ title_dir = snapshot_chrome_dir.parent / 'title'
+ title_dir.mkdir(exist_ok=True)
+ env.update(env_override)
+
+ nav_result, result = run_title_capture(
+ title_dir,
+ snapshot_chrome_dir,
+ env,
+ TEST_URL,
+ 'testtimeout',
+ )
+ assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}"
# Should complete (success or fail, but not hang)
assert result.returncode in (0, 1), "Should complete without hanging"
-def test_config_user_agent():
- """Test that USER_AGENT config is used."""
-
- if not shutil.which('node'):
- pass
-
- with tempfile.TemporaryDirectory() as tmpdir:
- tmpdir = Path(tmpdir)
-
- # Set custom user agent
- import os
- env = os.environ.copy()
- env['USER_AGENT'] = 'TestBot/1.0'
-
- result = subprocess.run(
- ['node', str(TITLE_HOOK), f'--url={TEST_URL}', '--snapshot-id=testua'],
- cwd=tmpdir,
- capture_output=True,
- text=True,
- env=env,
- timeout=60
- )
-
- # Should succeed (example.com doesn't block)
- if result.returncode == 0:
- # Parse clean JSONL output
- result_json = None
- for line in result.stdout.strip().split('\n'):
- line = line.strip()
- if line.startswith('{'):
- pass
- try:
- record = json.loads(line)
- if record.get('type') == 'ArchiveResult':
- result_json = record
- break
- except json.JSONDecodeError:
- pass
-
- assert result_json, "Should have ArchiveResult JSONL output"
- assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
-
-
def test_handles_https_urls():
"""Test that HTTPS URLs work correctly."""
@@ -215,18 +178,22 @@ def test_handles_https_urls():
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
- result = subprocess.run(
- ['node', str(TITLE_HOOK), '--url=https://example.org', '--snapshot-id=testhttps'],
- cwd=tmpdir,
- capture_output=True,
- text=True,
- timeout=60
- ,
- env=get_test_env())
+ with chrome_session(tmpdir, test_url='https://example.org', navigate=False) as (_process, _pid, snapshot_chrome_dir, env):
+ title_dir = snapshot_chrome_dir.parent / 'title'
+ title_dir.mkdir(exist_ok=True)
+
+ nav_result, result = run_title_capture(
+ title_dir,
+ snapshot_chrome_dir,
+ env,
+ 'https://example.org',
+ 'testhttps',
+ )
+ assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}"
if result.returncode == 0:
# Hook writes to current directory
- output_title_file = tmpdir / 'title.txt'
+ output_title_file = title_dir / 'title.txt'
if output_title_file.exists():
title_text = output_title_file.read_text().strip()
assert len(title_text) > 0, "Title should not be empty"
@@ -246,14 +213,23 @@ def test_handles_404_gracefully():
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
- result = subprocess.run(
- ['node', str(TITLE_HOOK), '--url=https://example.com/nonexistent-page-404', '--snapshot-id=test404'],
- cwd=tmpdir,
- capture_output=True,
- text=True,
- timeout=60
- ,
- env=get_test_env())
+ with chrome_session(tmpdir, test_url='https://example.com/nonexistent-page-404', navigate=False) as (
+ _process,
+ _pid,
+ snapshot_chrome_dir,
+ env,
+ ):
+ title_dir = snapshot_chrome_dir.parent / 'title'
+ title_dir.mkdir(exist_ok=True)
+
+ nav_result, result = run_title_capture(
+ title_dir,
+ snapshot_chrome_dir,
+ env,
+ 'https://example.com/nonexistent-page-404',
+ 'test404',
+ )
+ assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}"
# May succeed or fail depending on server behavior
# example.com returns "Example Domain" even for 404s
@@ -269,20 +245,29 @@ def test_handles_redirects():
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
- # http://example.com redirects to https://example.com
- result = subprocess.run(
- ['node', str(TITLE_HOOK), '--url=http://example.com', '--snapshot-id=testredirect'],
- cwd=tmpdir,
- capture_output=True,
- text=True,
- timeout=60
- ,
- env=get_test_env())
+ with chrome_session(tmpdir, test_url='http://example.com', navigate=False) as (
+ _process,
+ _pid,
+ snapshot_chrome_dir,
+ env,
+ ):
+ title_dir = snapshot_chrome_dir.parent / 'title'
+ title_dir.mkdir(exist_ok=True)
+
+ # http://example.com redirects to https://example.com
+ nav_result, result = run_title_capture(
+ title_dir,
+ snapshot_chrome_dir,
+ env,
+ 'http://example.com',
+ 'testredirect',
+ )
+ assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}"
# Should succeed and follow redirect
if result.returncode == 0:
# Hook writes to current directory
- output_title_file = tmpdir / 'title.txt'
+ output_title_file = title_dir / 'title.txt'
if output_title_file.exists():
title_text = output_title_file.read_text().strip()
assert 'example' in title_text.lower()
diff --git a/archivebox/plugins/twocaptcha/on_Crawl__95_twocaptcha_config.js b/archivebox/plugins/twocaptcha/on_Crawl__95_twocaptcha_config.js
index 3fe8a10a..2dd2002f 100755
--- a/archivebox/plugins/twocaptcha/on_Crawl__95_twocaptcha_config.js
+++ b/archivebox/plugins/twocaptcha/on_Crawl__95_twocaptcha_config.js
@@ -174,7 +174,7 @@ async function configure2Captcha() {
// Connect to the existing Chrome session via CDP
const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
if (!fs.existsSync(cdpFile)) {
- return { success: false, error: 'CDP URL not found - chrome plugin must run first' };
+ return { success: false, error: 'No Chrome session found (chrome plugin must run first)' };
}
const cdpUrl = fs.readFileSync(cdpFile, 'utf-8').trim();
diff --git a/archivebox/plugins/twocaptcha/tests/test_twocaptcha.py b/archivebox/plugins/twocaptcha/tests/test_twocaptcha.py
index 5738cc05..4569cb49 100644
--- a/archivebox/plugins/twocaptcha/tests/test_twocaptcha.py
+++ b/archivebox/plugins/twocaptcha/tests/test_twocaptcha.py
@@ -44,7 +44,7 @@ class TestTwoCaptcha:
def setup(self):
self.api_key = os.environ.get('TWOCAPTCHA_API_KEY') or os.environ.get('API_KEY_2CAPTCHA')
if not self.api_key:
- pytest.skip("TWOCAPTCHA_API_KEY required")
+ pytest.fail("TWOCAPTCHA_API_KEY required")
def test_install_and_load(self):
"""Extension installs and loads in Chromium."""
diff --git a/archivebox/plugins/ublock/tests/test_ublock.py b/archivebox/plugins/ublock/tests/test_ublock.py
index debea7f3..a3ab08a8 100644
--- a/archivebox/plugins/ublock/tests/test_ublock.py
+++ b/archivebox/plugins/ublock/tests/test_ublock.py
@@ -14,6 +14,7 @@ import pytest
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
setup_test_env,
+ get_test_env,
launch_chromium_session,
kill_chromium_session,
CHROME_LAUNCH_HOOK,
@@ -283,8 +284,7 @@ const puppeteer = require('puppeteer-core');
result = subprocess.run(
['node', str(script_path)],
- cwd=str(script_dir,
- env=get_test_env()),
+ cwd=str(script_dir),
capture_output=True,
text=True,
env=env,
@@ -301,11 +301,10 @@ const puppeteer = require('puppeteer-core');
return json.loads(output_lines[-1])
-# Test URL: Yahoo has many ads that uBlock should block
+# Test URL: Yahoo has many ads that uBlock should block (no mocks)
TEST_URL = 'https://www.yahoo.com/'
-@pytest.mark.timeout(15)
def test_extension_loads_in_chromium():
"""Verify uBlock extension loads in Chromium by visiting its dashboard page.
@@ -519,15 +518,15 @@ const puppeteer = require('puppeteer-core');
pass
-def test_blocks_ads_on_test_page():
- """Live test: verify uBlock Origin blocks ads on a test page.
+def test_blocks_ads_on_yahoo_com():
+ """Live test: verify uBlock Origin blocks ads on yahoo.com (real network).
This test runs TWO browser sessions:
1. WITHOUT extension - verifies ads are NOT blocked (baseline)
2. WITH extension - verifies ads ARE blocked
This ensures we're actually testing the extension's effect, not just
- that a test page happens to show ads as blocked.
+ that a test page happens to show ads as blocked. No mocks are used.
"""
import time
@@ -581,20 +580,15 @@ def test_blocks_ads_on_test_page():
# Verify baseline shows ads ARE visible (not blocked)
if baseline_result['adElementsFound'] == 0:
- pytest.skip(
- f"Cannot test extension: no ad elements found on {TEST_URL}. "
- f"The page may have changed or loaded differently."
+ pytest.fail(
+ f"Baseline must find ad elements on {TEST_URL}, but found none. "
+ f"This test requires a real ad-heavy page."
)
if baseline_result['adElementsVisible'] == 0:
- print(f"\nWARNING: Baseline shows 0 visible ads despite finding {baseline_result['adElementsFound']} elements!")
- print("This suggests either:")
- print(" - There's another ad blocker interfering")
- print(" - Network-level ad blocking is in effect")
-
- pytest.skip(
- f"Cannot test extension: baseline shows no visible ads "
- f"despite finding {baseline_result['adElementsFound']} ad elements."
+ pytest.fail(
+ f"Baseline must have visible ads on {TEST_URL}, but none were visible. "
+ f"This likely means another ad blocker is active or network-level blocking is in effect."
)
print(f"\n✓ Baseline confirmed: {baseline_result['adElementsVisible']} visible ads without extension")
@@ -713,6 +707,10 @@ const puppeteer = require('{env_base['NODE_MODULES_DIR']}/puppeteer-core');
f"With extension: {ext_result['adElementsVisible']} visible ads\n" \
f"Expected fewer ads with extension."
+ # Ensure uBlock actually blocks at least some ad/track requests
+ assert ext_result['blockedRequests'] > 0, \
+ "uBlock should block at least one ad/track request on yahoo.com"
+
# Extension should block at least 20% of ads (was consistently blocking 5-13% without proper init time)
assert reduction_percent >= 20, \
f"uBlock should block at least 20% of ads.\n" \
diff --git a/archivebox/plugins/ytdlp/templates/card.html b/archivebox/plugins/ytdlp/templates/card.html
index 1694ceae..6fe32098 100644
--- a/archivebox/plugins/ytdlp/templates/card.html
+++ b/archivebox/plugins/ytdlp/templates/card.html
@@ -1,14 +1,17 @@
-
-