This commit is contained in:
Nick Sweeting
2026-01-21 03:19:56 -08:00
parent f3f55d3395
commit ec4b27056e
113 changed files with 6929 additions and 2396 deletions

View File

@@ -393,7 +393,7 @@ VOLUME "$DATA_DIR"
EXPOSE 8000
HEALTHCHECK --interval=30s --timeout=20s --retries=15 \
CMD curl --silent 'http://localhost:8000/health/' | grep -q 'OK'
CMD curl --silent 'http://admin.archivebox.localhost:8000/health/' | grep -q 'OK'
ENTRYPOINT ["dumb-init", "--", "/app/bin/docker_entrypoint.sh"]
CMD ["archivebox", "server", "--quick-init", "0.0.0.0:8000"]

View File

@@ -104,7 +104,8 @@ archivebox init --setup
curl -fsSL 'https://get.archivebox.io' | bash
</code></pre>
<br/>
<sub>Open <a href="http://localhost:8000"><code>http://localhost:8000</code></a> to see your server's Web UI ➡️</sub>
<sub>Open <a href="http://web.archivebox.localhost:8000"><code>http://web.archivebox.localhost:8000</code></a> for the public UI and <a href="http://admin.archivebox.localhost:8000"><code>http://admin.archivebox.localhost:8000</code></a> for the admin UI ➡️</sub><br/>
<sub>Set <code>LISTEN_HOST</code> to change the base domain; <code>web.</code> and <code>admin.</code> subdomains are used automatically.</sub>
</details>
<br/>
@@ -469,6 +470,7 @@ For more discussion on managed and paid hosting options see here: <a href="https
#### ➡️&nbsp; Next Steps
- Import URLs from some of the supported [Input Formats](#input-formats) or view the supported [Output Formats](#output-formats)...
- (Optional) Create a persona and import browser cookies to archive logged-in sites: `archivebox persona create --import=chrome personal`
- Tweak your UI or archiving behavior via [Configuration](#configuration), read about some of the [Caveats](#caveats), or [Troubleshoot](https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting)
- Read about the [Dependencies](#dependencies) used for archiving, the [Upgrading Process](https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives), or the [Archive Layout](#archive-layout) on disk...
- Or check out our full [Documentation](#documentation) or [Community Wiki](#internet-archiving-ecosystem)...
@@ -495,6 +497,11 @@ docker compose run archivebox help
# equivalent: docker run -it -v $PWD:/data archivebox/archivebox [subcommand] [--help]
docker run -it -v $PWD:/data archivebox/archivebox help
# optional: import your browser cookies into a persona for logged-in archiving
archivebox persona create --import=chrome personal
# supported: chrome/chromium/brave/edge (Chromium-based only)
# re-running import merges/dedupes cookies.txt (by domain/path/name) but replaces chrome_user_data
```
#### ArchiveBox Subcommands
@@ -587,7 +594,8 @@ docker run -v $PWD:/data -it archivebox/archivebox archivebox manage createsuper
docker run -v $PWD:/data -it -p 8000:8000 archivebox/archivebox
</code></pre>
<sup>Open <a href="http://localhost:8000"><code>http://localhost:8000</code></a> to see your server's Web UI ➡️</sup>
<sup>Open <a href="http://web.archivebox.localhost:8000"><code>http://web.archivebox.localhost:8000</code></a> for the public UI and <a href="http://admin.archivebox.localhost:8000"><code>http://admin.archivebox.localhost:8000</code></a> for the admin UI ➡️</sup><br/>
<sup>Set <code>LISTEN_HOST</code> to change the base domain; <code>web.</code> and <code>admin.</code> subdomains are used automatically.</sup>
<br/><br/>
<i>For more info, see our <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#ui-usage">Usage: Web UI</a> wiki. ➡️</i>
<br/><br/>

View File

@@ -127,6 +127,20 @@ class UsernameAndPasswordAuth(UserPassAuthCheck, HttpBasicAuth):
"""Allow authenticating by passing username & password via HTTP Basic Authentication (not recommended)"""
pass
class DjangoSessionAuth:
    """Allow authenticating with existing Django session cookies (same-origin only)."""

    def __call__(self, request: HttpRequest) -> Optional[AbstractBaseUser]:
        # django-ninja invokes auth objects as callables; delegate to authenticate().
        return self.authenticate(request)

    def authenticate(self, request: HttpRequest, **kwargs) -> Optional[AbstractBaseUser]:
        """Return the session's user if logged in AND a superuser, else None.

        Returns None (this auth method not applicable, so the framework can try
        the next configured one) when no authenticated session user is present.
        A logged-in non-superuser is rejected outright with HTTP 403 rather than
        falling through to other auth methods.
        """
        user = getattr(request, 'user', None)
        if user and user.is_authenticated:
            # Record which auth class succeeded, for downstream logging/introspection.
            request._api_auth_method = self.__class__.__name__
            if not user.is_superuser:
                raise HttpError(403, 'Valid session but User does not have permission (make sure user.is_superuser=True)')
            return cast(AbstractBaseUser, user)
        return None
### Enabled Auth Methods
API_AUTH_METHODS = [
@@ -134,5 +148,4 @@ API_AUTH_METHODS = [
BearerTokenAuth(),
QueryParamTokenAuth(),
# django_auth_superuser, # django admin cookie auth, not secure to use with csrf=False
UsernameAndPasswordAuth(),
]

View File

@@ -0,0 +1,34 @@
__package__ = 'archivebox.api'
from django.http import HttpResponse
class ApiCorsMiddleware:
    """Attach permissive CORS headers for API routes (token-based auth).

    Requests under ``/api/`` get ``Access-Control-Allow-Origin: *`` plus the
    method/header allowances below whenever the request carries an ``Origin``
    header.  CORS preflights (OPTIONS with ``Access-Control-Request-Method``)
    are short-circuited with an empty 204 so the view layer never sees them.
    Requests outside ``/api/`` pass through untouched.
    """

    def __init__(self, get_response):
        # Standard Django middleware contract: get_response is the next layer.
        self.get_response = get_response

    def __call__(self, request):
        if request.path.startswith('/api/'):
            # Answer CORS preflight directly without invoking any view.
            if request.method == 'OPTIONS' and request.META.get('HTTP_ACCESS_CONTROL_REQUEST_METHOD'):
                response = HttpResponse(status=204)
                return self._add_cors_headers(request, response)
            response = self.get_response(request)
            return self._add_cors_headers(request, response)
        return self.get_response(request)

    def _add_cors_headers(self, request, response):
        """Attach permissive CORS headers when the request carries an Origin."""
        # The CORS headers below are only added when an Origin header was sent,
        # so the response body+headers depend on Origin.  Declare `Vary: Origin`
        # unconditionally so caches/proxies never replay a headerless copy to a
        # cross-origin caller (or a CORS-enabled copy to a same-origin one).
        existing_vary = response.get('Vary')
        response['Vary'] = f'{existing_vary}, Origin' if existing_vary else 'Origin'
        origin = request.META.get('HTTP_ORIGIN')
        if not origin:
            return response
        response['Access-Control-Allow-Origin'] = '*'
        response['Access-Control-Allow-Methods'] = 'GET, POST, PUT, PATCH, DELETE, OPTIONS'
        response['Access-Control-Allow-Headers'] = (
            'Authorization, X-ArchiveBox-API-Key, Content-Type, X-CSRFToken'
        )
        response['Access-Control-Max-Age'] = '600'
        return response

View File

@@ -188,6 +188,11 @@ class SnapshotSchema(Schema):
return ArchiveResult.objects.none()
class SnapshotUpdateSchema(Schema):
    # Fields a PATCH /snapshot/{id} request may change.  Both are optional so
    # partial updates work with data.dict(exclude_unset=True) in the endpoint.
    status: str | None = None
    retry_at: datetime | None = None
class SnapshotFilterSchema(FilterSchema):
id: Optional[str] = Field(None, q=['id__icontains', 'timestamp__startswith'])
created_by_id: str = Field(None, q='crawl__created_by_id')
@@ -225,6 +230,31 @@ def get_snapshot(request, snapshot_id: str, with_archiveresults: bool = True):
return Snapshot.objects.get(Q(id__icontains=snapshot_id))
@router.patch("/snapshot/{snapshot_id}", response=SnapshotSchema, url_name="patch_snapshot")
def patch_snapshot(request, snapshot_id: str, data: SnapshotUpdateSchema):
    """Update a snapshot (e.g., set status=sealed to cancel queued work)."""
    # Lookup mirrors get_snapshot: prefer an id/timestamp prefix match, then
    # fall back to a substring match on id.
    # NOTE(review): the fallback .get() can still raise DoesNotExist (-> server
    # error, not 404) or MultipleObjectsReturned on ambiguous ids -- confirm
    # this matches get_snapshot's error behavior.
    try:
        snapshot = Snapshot.objects.get(Q(id__startswith=snapshot_id) | Q(timestamp__startswith=snapshot_id))
    except Snapshot.DoesNotExist:
        snapshot = Snapshot.objects.get(Q(id__icontains=snapshot_id))
    # Only apply fields the client actually sent (exclude_unset).
    payload = data.dict(exclude_unset=True)
    if 'status' in payload:
        if payload['status'] not in Snapshot.StatusChoices.values:
            raise HttpError(400, f'Invalid status: {payload["status"]}')
        snapshot.status = payload['status']
        # Sealing cancels pending work: clear any scheduled retry unless the
        # client set retry_at explicitly in the same request.
        if snapshot.status == Snapshot.StatusChoices.SEALED and 'retry_at' not in payload:
            snapshot.retry_at = None
    if 'retry_at' in payload:
        snapshot.retry_at = payload['retry_at']
    snapshot.save(update_fields=['status', 'retry_at', 'modified_at'])
    # Skip serializing nested archiveresults in the response payload.
    request.with_archiveresults = False
    return snapshot
### Tag #########################################################################
class TagSchema(Schema):

View File

@@ -3,11 +3,13 @@ __package__ = 'archivebox.api'
from uuid import UUID
from typing import List
from datetime import datetime
from django.utils import timezone
from django.db.models import Q
from django.contrib.auth import get_user_model
from ninja import Router, Schema
from ninja.errors import HttpError
from archivebox.core.models import Snapshot
from archivebox.crawls.models import Crawl
@@ -54,6 +56,11 @@ class CrawlSchema(Schema):
return Snapshot.objects.none()
class CrawlUpdateSchema(Schema):
    # Fields a PATCH /crawl/{id} request may change.  Both are optional so
    # partial updates work with data.dict(exclude_unset=True) in the endpoint.
    status: str | None = None
    retry_at: datetime | None = None
@router.get("/crawls", response=List[CrawlSchema], url_name="get_crawls")
def get_crawls(request):
return Crawl.objects.all().distinct()
@@ -79,3 +86,32 @@ def get_crawl(request, crawl_id: str, as_rss: bool=False, with_snapshots: bool=F
return crawl
@router.patch("/crawl/{crawl_id}", response=CrawlSchema, url_name="patch_crawl")
def patch_crawl(request, crawl_id: str, data: CrawlUpdateSchema):
    """Update a crawl (e.g., set status=sealed to cancel queued work)."""
    # NOTE(review): raises Crawl.DoesNotExist (-> server error, not 404) when
    # nothing matches, and MultipleObjectsReturned on ambiguous substrings;
    # also inconsistent with patch_snapshot's prefix-first lookup -- confirm.
    crawl = Crawl.objects.get(id__icontains=crawl_id)
    # Only apply fields the client actually sent (exclude_unset).
    payload = data.dict(exclude_unset=True)
    if 'status' in payload:
        if payload['status'] not in Crawl.StatusChoices.values:
            raise HttpError(400, f'Invalid status: {payload["status"]}')
        crawl.status = payload['status']
        # Sealing cancels pending work: clear any scheduled retry unless the
        # client set retry_at explicitly in the same request.
        if crawl.status == Crawl.StatusChoices.SEALED and 'retry_at' not in payload:
            crawl.retry_at = None
    if 'retry_at' in payload:
        crawl.retry_at = payload['retry_at']
    crawl.save(update_fields=['status', 'retry_at', 'modified_at'])
    # Sealing a crawl also seals its queued/in-flight snapshots so workers stop
    # picking them up.  Bulk .update() bypasses Snapshot.save() and any signals,
    # so modified_at is set manually here.
    if payload.get('status') == Crawl.StatusChoices.SEALED:
        Snapshot.objects.filter(
            crawl=crawl,
            status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED],
        ).update(
            status=Snapshot.StatusChoices.SEALED,
            retry_at=None,
            modified_at=timezone.now(),
        )
    return crawl

View File

@@ -15,6 +15,7 @@ Examples:
# Create a new persona
archivebox persona create work
archivebox persona create --import=chrome personal
archivebox persona create --import=edge work
# List all personas
archivebox persona list
@@ -34,6 +35,7 @@ import subprocess
import tempfile
from pathlib import Path
from typing import Optional, Iterable
from collections import OrderedDict
import rich_click as click
from rich import print as rprint
@@ -78,34 +80,6 @@ def get_chrome_user_data_dir() -> Optional[Path]:
return None
def get_firefox_profile_dir() -> Optional[Path]:
    """Locate the default Firefox profile directory on this machine.

    Prefers a profile whose directory name contains 'default' (e.g.
    ``abcd1234.default-release``); otherwise falls back to the first profile
    directory found.  Returns None on unsupported platforms or when no
    profiles directory exists.
    """
    os_name = platform.system()
    home = Path.home()
    if os_name == 'Darwin':
        root = home / 'Library' / 'Application Support' / 'Firefox' / 'Profiles'
    elif os_name == 'Linux':
        root = home / '.mozilla' / 'firefox'
    elif os_name == 'Windows':
        roaming = Path(os.environ.get('APPDATA', home / 'AppData' / 'Roaming'))
        root = roaming / 'Mozilla' / 'Firefox' / 'Profiles'
    else:
        return None
    if not root.exists():
        return None
    # Prefer the first subdirectory whose name mentions 'default'.
    default_profile = next(
        (entry for entry in root.iterdir()
         if entry.is_dir() and 'default' in entry.name.lower()),
        None,
    )
    if default_profile is not None:
        return default_profile
    # No default-style profile found: settle for any profile directory at all.
    subdirs = [entry for entry in root.iterdir() if entry.is_dir()]
    return subdirs[0] if subdirs else None
def get_brave_user_data_dir() -> Optional[Path]:
"""Get the default Brave user data directory for the current platform."""
system = platform.system()
@@ -134,25 +108,99 @@ def get_brave_user_data_dir() -> Optional[Path]:
return None
def get_edge_user_data_dir() -> Optional[Path]:
    """Locate the default Microsoft Edge user-data directory for this OS.

    A candidate directory only counts if it exists AND contains a 'Default'
    profile subdirectory.  Returns None on unsupported platforms or when no
    candidate matches.
    """
    home = Path.home()
    os_name = platform.system()
    if os_name == 'Darwin':
        search_paths = [home / 'Library' / 'Application Support' / 'Microsoft Edge']
    elif os_name == 'Linux':
        # Stable, beta, and dev channels use separate config dirs on Linux.
        search_paths = [
            home / '.config' / 'microsoft-edge',
            home / '.config' / 'microsoft-edge-beta',
            home / '.config' / 'microsoft-edge-dev',
        ]
    elif os_name == 'Windows':
        local = Path(os.environ.get('LOCALAPPDATA', home / 'AppData' / 'Local'))
        search_paths = [local / 'Microsoft' / 'Edge' / 'User Data']
    else:
        search_paths = []
    return next(
        (p for p in search_paths if p.exists() and (p / 'Default').exists()),
        None,
    )
# Maps --import=<browser> names to the function that locates that browser's
# profile/user-data directory (each finder returns Optional[Path]).
BROWSER_PROFILE_FINDERS = {
    'chrome': get_chrome_user_data_dir,
    'chromium': get_chrome_user_data_dir,  # Same locations
    'firefox': get_firefox_profile_dir,
    'brave': get_brave_user_data_dir,
    'edge': get_edge_user_data_dir,
}

# Browsers sharing Chromium's user-data layout; only these get their profile
# copied into the persona's CHROME_USER_DATA_DIR for CDP cookie extraction.
CHROMIUM_BROWSERS = {'chrome', 'chromium', 'brave', 'edge'}
# =============================================================================
# Cookie Extraction via CDP
# =============================================================================

# Comment banner written at the top of every generated cookies.txt; the
# trailing '' produces a blank separator line before the first cookie entry.
NETSCAPE_COOKIE_HEADER = [
    '# Netscape HTTP Cookie File',
    '# https://curl.se/docs/http-cookies.html',
    '# This file was generated by ArchiveBox persona cookie extraction',
    '#',
    '# Format: domain\\tincludeSubdomains\\tpath\\tsecure\\texpiry\\tname\\tvalue',
    '',
]
def _parse_netscape_cookies(path: Path) -> "OrderedDict[tuple[str, str, str], tuple[str, str, str, str, str, str, str]]":
    """Parse a Netscape-format cookies.txt into an ordered mapping.

    Keyed by (domain, path, name), so later entries with the same key replace
    earlier ones (this is what the merge/dedupe relies on).  Values are the
    raw 7 tab-separated fields, preserved verbatim for round-trip writing.

    curl writes HttpOnly cookies as lines prefixed with '#HttpOnly_'; these
    look like comments but are real cookies, so they are parsed instead of
    skipped.  The prefix stays in the stored domain field (round-trips intact
    on write) but is stripped from the dedupe key, so an HttpOnly cookie and
    its plain twin collapse into one entry.

    Returns an empty mapping when the file does not exist.  Malformed lines
    (fewer than 7 fields) are skipped silently.
    """
    HTTPONLY_PREFIX = '#HttpOnly_'
    cookies: OrderedDict = OrderedDict()
    if not path.exists():
        return cookies
    # utf-8 with replacement: cookie values are opaque bytes to us, and a bad
    # byte shouldn't abort the whole merge.
    for line in path.read_text(encoding='utf-8', errors='replace').splitlines():
        if not line:
            continue
        # Real comments start with '#'; '#HttpOnly_...' lines are cookies.
        if line.startswith('#') and not line.startswith(HTTPONLY_PREFIX):
            continue
        parts = line.split('\t')
        if len(parts) < 7:
            continue
        domain, include_subdomains, cookie_path, secure, expiry, name, value = parts[:7]
        key_domain = domain[len(HTTPONLY_PREFIX):] if domain.startswith(HTTPONLY_PREFIX) else domain
        key = (key_domain, cookie_path, name)
        cookies[key] = (domain, include_subdomains, cookie_path, secure, expiry, name, value)
    return cookies
def _write_netscape_cookies(path: Path, cookies: "OrderedDict[tuple[str, str, str], tuple[str, str, str, str, str, str, str]]") -> None:
    """Serialize cookies to Netscape cookies.txt format: header block first,
    then one tab-joined line per cookie, with a trailing newline."""
    cookie_lines = ('\t'.join(fields) for fields in cookies.values())
    content = '\n'.join([*NETSCAPE_COOKIE_HEADER, *cookie_lines])
    path.write_text(content + '\n')
def _merge_netscape_cookies(existing_file: Path, new_file: Path) -> None:
    """Merge newly-extracted cookies into an existing cookies.txt, in place.

    Cookies from new_file win on (domain, path, name) collisions; everything
    else in existing_file is preserved, and the combined set is rewritten to
    existing_file.
    """
    combined = _parse_netscape_cookies(existing_file)
    combined.update(_parse_netscape_cookies(new_file))
    _write_netscape_cookies(existing_file, combined)
def extract_cookies_via_cdp(user_data_dir: Path, output_file: Path) -> bool:
"""
Launch Chrome with the given user data dir and extract cookies via CDP.
Returns True if successful, False otherwise.
"""
from archivebox.config.constants import CONSTANTS
from archivebox.config.common import STORAGE_CONFIG
# Find the cookie extraction script
chrome_plugin_dir = Path(__file__).parent.parent / 'plugins' / 'chrome'
@@ -163,14 +211,21 @@ def extract_cookies_via_cdp(user_data_dir: Path, output_file: Path) -> bool:
return False
# Get node modules dir
node_modules_dir = CONSTANTS.LIB_DIR / 'npm' / 'node_modules'
node_modules_dir = STORAGE_CONFIG.LIB_DIR / 'npm' / 'node_modules'
# Set up environment
env = os.environ.copy()
env['NODE_MODULES_DIR'] = str(node_modules_dir)
env['CHROME_USER_DATA_DIR'] = str(user_data_dir)
env['COOKIES_OUTPUT_FILE'] = str(output_file)
env['CHROME_HEADLESS'] = 'true'
output_path = output_file
temp_output = None
temp_dir = None
if output_file.exists():
temp_dir = Path(tempfile.mkdtemp(prefix='ab_cookies_'))
temp_output = temp_dir / 'cookies.txt'
output_path = temp_output
env['COOKIES_OUTPUT_FILE'] = str(output_path)
try:
result = subprocess.run(
@@ -182,6 +237,8 @@ def extract_cookies_via_cdp(user_data_dir: Path, output_file: Path) -> bool:
)
if result.returncode == 0:
if temp_output and temp_output.exists():
_merge_netscape_cookies(output_file, temp_output)
return True
else:
rprint(f'[yellow]Cookie extraction failed: {result.stderr}[/yellow]', file=sys.stderr)
@@ -196,6 +253,9 @@ def extract_cookies_via_cdp(user_data_dir: Path, output_file: Path) -> bool:
except Exception as e:
rprint(f'[yellow]Cookie extraction error: {e}[/yellow]', file=sys.stderr)
return False
finally:
if temp_dir and temp_dir.exists():
shutil.rmtree(temp_dir, ignore_errors=True)
# =============================================================================
@@ -323,6 +383,9 @@ def create_personas(
# Import browser profile if requested
if import_from and source_profile_dir:
cookies_file = Path(persona.path) / 'cookies.txt'
if import_from in CHROMIUM_BROWSERS:
persona_chrome_dir = Path(persona.CHROME_USER_DATA_DIR)
# Copy the browser profile
@@ -349,7 +412,6 @@ def create_personas(
rprint(f'[green]Copied browser profile to persona[/green]', file=sys.stderr)
# Extract cookies via CDP
cookies_file = Path(persona.path) / 'cookies.txt'
rprint(f'[dim]Extracting cookies via CDP...[/dim]', file=sys.stderr)
if extract_cookies_via_cdp(persona_chrome_dir, cookies_file):
@@ -589,7 +651,7 @@ def main():
@main.command('create')
@click.argument('names', nargs=-1)
@click.option('--import', 'import_from', help='Import profile from browser (chrome, firefox, brave)')
@click.option('--import', 'import_from', help='Import profile from browser (chrome, chromium, brave, edge)')
def create_cmd(names: tuple, import_from: Optional[str]):
"""Create Personas, optionally importing from a browser profile."""
sys.exit(create_personas(names, import_from=import_from))

View File

@@ -3,6 +3,9 @@
__package__ = 'archivebox.cli'
from typing import Iterable
import os
import sys
import subprocess
import rich_click as click
from rich import print
@@ -60,6 +63,26 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
pass
if run_in_debug:
os.environ['ARCHIVEBOX_RUNSERVER'] = '1'
if reload:
os.environ['ARCHIVEBOX_AUTORELOAD'] = '1'
os.environ['ARCHIVEBOX_ORCHESTRATOR_MANAGED_BY_WATCHER'] = '1'
from archivebox.config.common import STORAGE_CONFIG
pidfile = str(STORAGE_CONFIG.TMP_DIR / 'runserver.pid')
os.environ['ARCHIVEBOX_RUNSERVER_PIDFILE'] = pidfile
from django.utils.autoreload import DJANGO_AUTORELOAD_ENV
is_reloader_child = os.environ.get(DJANGO_AUTORELOAD_ENV) == 'true'
if not is_reloader_child:
env = os.environ.copy()
env['ARCHIVEBOX_ORCHESTRATOR_WATCHER'] = '1'
subprocess.Popen(
[sys.executable, '-m', 'archivebox', 'manage', 'orchestrator_watch', f'--pidfile={pidfile}'],
env=env,
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
)
from django.core.management import call_command
print('[green][+] Starting ArchiveBox webserver in DEBUG mode...[/green]')
print(f' [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
@@ -79,7 +102,6 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
is_port_in_use,
)
from archivebox.workers.orchestrator import Orchestrator
import sys
# Check if port is already in use
if is_port_in_use(host, int(port)):

View File

@@ -99,8 +99,11 @@ class ServerConfig(BaseConfigSet):
SECRET_KEY: str = Field(default_factory=lambda: get_random_string(50, "abcdefghijklmnopqrstuvwxyz0123456789_"))
BIND_ADDR: str = Field(default="127.0.0.1:8000")
LISTEN_HOST: str = Field(default="archivebox.localhost:8000")
ADMIN_BASE_URL: str = Field(default="")
ARCHIVE_BASE_URL: str = Field(default="")
ALLOWED_HOSTS: str = Field(default="*")
CSRF_TRUSTED_ORIGINS: str = Field(default="http://localhost:8000,http://127.0.0.1:8000,http://0.0.0.0:8000")
CSRF_TRUSTED_ORIGINS: str = Field(default="http://admin.archivebox.localhost:8000")
SNAPSHOTS_PER_PAGE: int = Field(default=40)
PREVIEW_ORIGINALS: bool = Field(default=True)

View File

@@ -118,6 +118,10 @@ class ConstantsDict(Mapping):
DEFAULT_CLI_COLORS: Dict[str, str] = DEFAULT_CLI_COLORS
DISABLED_CLI_COLORS: Dict[str, str] = benedict({k: '' for k in DEFAULT_CLI_COLORS})
# Hard safety limits (seconds)
MAX_HOOK_RUNTIME_SECONDS: int = 60 * 60 * 12 # 12 hours
MAX_SNAPSHOT_RUNTIME_SECONDS: int = 60 * 60 * 12 # 12 hours
ALLOWDENYLIST_REGEX_FLAGS: int = re.IGNORECASE | re.UNICODE | re.MULTILINE
STATICFILE_EXTENSIONS: frozenset[str] = frozenset((

View File

@@ -14,6 +14,7 @@ from archivebox.config.common import SERVER_CONFIG
from archivebox.misc.paginators import AccelleratedPaginator
from archivebox.base_models.admin import BaseModelAdmin
from archivebox.hooks import get_plugin_icon
from archivebox.core.host_utils import build_snapshot_url
from archivebox.core.models import ArchiveResult, Snapshot
@@ -57,7 +58,11 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
# Build output link - use embed_path() which checks output_files first
embed_path = result.embed_path() if hasattr(result, 'embed_path') else None
output_link = f'/{result.snapshot.archive_path}/{embed_path}' if embed_path and result.status == 'succeeded' else f'/{result.snapshot.archive_path}/'
snapshot_id = str(getattr(result, 'snapshot_id', ''))
if embed_path and result.status == 'succeeded':
output_link = build_snapshot_url(snapshot_id, embed_path)
else:
output_link = build_snapshot_url(snapshot_id, '')
# Get version - try cmd_version field
version = result.cmd_version if result.cmd_version else '-'
@@ -252,7 +257,7 @@ class ArchiveResultInline(admin.TabularInline):
class ArchiveResultAdmin(BaseModelAdmin):
list_display = ('id', 'created_at', 'snapshot_info', 'tags_str', 'status', 'plugin_with_icon', 'cmd_str', 'output_str')
sort_fields = ('id', 'created_at', 'plugin', 'status')
readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'plugin_with_icon')
readonly_fields = ('cmd', 'cmd_version', 'pwd', 'cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'plugin_with_icon')
search_fields = ('id', 'snapshot__url', 'plugin', 'output_str', 'cmd_version', 'cmd', 'snapshot__timestamp')
autocomplete_fields = ['snapshot']
@@ -300,10 +305,11 @@ class ArchiveResultAdmin(BaseModelAdmin):
description='Snapshot Info'
)
def snapshot_info(self, result):
snapshot_id = str(result.snapshot_id)
return format_html(
'<a href="/{}/index.html"><b><code>[{}]</code></b> &nbsp; {} &nbsp; {}</a><br/>',
result.snapshot.archive_path,
str(result.snapshot.id)[:8],
'<a href="{}"><b><code>[{}]</code></b> &nbsp; {} &nbsp; {}</a><br/>',
build_snapshot_url(snapshot_id, "index.html"),
snapshot_id[:8],
result.snapshot.bookmarked_at.strftime('%Y-%m-%d %H:%M'),
result.snapshot.url[:128],
)
@@ -335,10 +341,10 @@ class ArchiveResultAdmin(BaseModelAdmin):
# Determine output link path - use embed_path() which checks output_files
embed_path = result.embed_path() if hasattr(result, 'embed_path') else None
output_path = embed_path if (result.status == 'succeeded' and embed_path) else 'index.html'
snapshot_id = str(result.snapshot_id)
return format_html(
'<a href="/{}/{}" class="output-link">↗️</a><pre>{}</pre>',
result.snapshot.archive_path,
output_path,
'<a href="{}" class="output-link">↗️</a><pre>{}</pre>',
build_snapshot_url(snapshot_id, output_path),
result.output_str,
)
@@ -348,7 +354,11 @@ class ArchiveResultAdmin(BaseModelAdmin):
'<pre style="display: inline-block">{}</pre><br/>',
result.output_str,
)
output_html += format_html('<a href="/{}/index.html#all">See result files ...</a><br/><pre><code>', str(result.snapshot.archive_path))
snapshot_id = str(result.snapshot_id)
output_html += format_html(
'<a href="{}#all">See result files ...</a><br/><pre><code>',
build_snapshot_url(snapshot_id, "index.html"),
)
embed_path = result.embed_path() if hasattr(result, 'embed_path') else ''
path_from_embed = (snapshot_dir / (embed_path or ''))
output_html += format_html('<i style="padding: 1px">{}</i><b style="padding-right: 20px">/</b><i>{}</i><br/><hr/>', str(snapshot_dir), str(embed_path))

View File

@@ -8,6 +8,8 @@ from django.contrib import admin, messages
from django.urls import path
from django.utils.html import format_html, mark_safe
from django.utils import timezone
from django.db.models import Q, Sum, Count, Prefetch
from django.db.models.functions import Coalesce
from django import forms
from django.template import Template, RequestContext
from django.contrib.admin.helpers import ActionForm
@@ -18,11 +20,12 @@ from archivebox.misc.util import htmldecode, urldecode
from archivebox.misc.paginators import AccelleratedPaginator
from archivebox.misc.logging_util import printable_filesize
from archivebox.search.admin import SearchResultsAdminMixin
from archivebox.core.host_utils import build_snapshot_url, build_web_url
from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin
from archivebox.workers.tasks import bg_archive_snapshots, bg_add
from archivebox.core.models import Tag, Snapshot
from archivebox.core.models import Tag, Snapshot, ArchiveResult
from archivebox.core.admin_archiveresults import ArchiveResultInline, render_archiveresults_list
from archivebox.core.widgets import TagEditorWidget, InlineTagEditorWidget
@@ -36,7 +39,7 @@ class SnapshotActionForm(ActionForm):
super().__init__(*args, **kwargs)
# Define tags field in __init__ to avoid database access during app initialization
self.fields['tags'] = forms.CharField(
label='Edit tags',
label='',
required=False,
widget=TagEditorWidget(),
)
@@ -67,6 +70,19 @@ class SnapshotActionForm(ActionForm):
# )
class TagNameListFilter(admin.SimpleListFilter):
    """Admin sidebar filter: narrow the changelist to snapshots with one Tag."""
    title = 'By tag name'       # heading shown in the admin sidebar
    parameter_name = 'tag'      # querystring key, e.g. ?tag=<tag_pk>

    def lookups(self, request, model_admin):
        # One (value, label) choice per tag, alphabetized by name.
        return [(str(tag.pk), tag.name) for tag in Tag.objects.order_by('name')]

    def queryset(self, request, queryset):
        # self.value() is the selected tag pk (string) or None when inactive.
        if self.value():
            return queryset.filter(tags__id=self.value())
        return queryset
class SnapshotAdminForm(forms.ModelForm):
"""Custom form for Snapshot admin with tag editor widget."""
tags_editor = forms.CharField(
@@ -117,11 +133,11 @@ class SnapshotAdminForm(forms.ModelForm):
class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
form = SnapshotAdminForm
list_display = ('created_at', 'title_str', 'status_with_progress', 'files', 'size_with_stats', 'health_display', 'url_str')
sort_fields = ('title_str', 'url_str', 'created_at', 'status', 'crawl')
list_display = ('created_at', 'preview_icon', 'title_str', 'tags_inline', 'status_with_progress', 'files', 'size_with_stats')
sort_fields = ('title_str', 'created_at', 'status', 'crawl')
readonly_fields = ('admin_actions', 'status_info', 'imported_timestamp', 'created_at', 'modified_at', 'downloaded_at', 'output_dir', 'archiveresults_list')
search_fields = ('id', 'url', 'timestamp', 'title', 'tags__name')
list_filter = ('created_at', 'downloaded_at', 'archiveresult__status', 'crawl__created_by', 'tags__name')
list_filter = ('created_at', 'downloaded_at', 'archiveresult__status', 'crawl__created_by', TagNameListFilter)
fieldsets = (
('URL', {
@@ -163,7 +179,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
)
ordering = ['-created_at']
actions = ['add_tags', 'remove_tags', 'update_titles', 'update_snapshots', 'resnapshot_snapshot', 'overwrite_snapshots', 'delete_snapshots']
actions = ['add_tags', 'remove_tags', 'update_snapshots', 'resnapshot_snapshot', 'overwrite_snapshots', 'delete_snapshots']
inlines = [] # Removed TagInline, using TagEditorWidget instead
list_per_page = min(max(5, SERVER_CONFIG.SNAPSHOTS_PER_PAGE), 5000)
@@ -182,6 +198,13 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
self.message_user(request, f'Error occurred while loading the page: {str(e)} {request.GET} {request.POST}')
return super().changelist_view(request, GLOBAL_CONTEXT)
    def get_actions(self, request):
        """Shorten Django's built-in bulk-delete action label to just 'Delete'.

        Actions are stored as (callable, name, description) tuples; only the
        human-readable description is replaced.
        """
        actions = super().get_actions(request)
        if 'delete_selected' in actions:
            func, name, _desc = actions['delete_selected']
            actions['delete_selected'] = (func, name, 'Delete')
        return actions
def get_urls(self):
urls = super().get_urls()
@@ -196,6 +219,52 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
# self.request = request
# return super().get_queryset(request).prefetch_related('archiveresult_set').distinct() # .annotate(archiveresult_count=Count('archiveresult'))
    def get_queryset(self, request):
        """Build the changelist queryset with prefetches and sort-only annotations.

        Heavy columns (config, notes) are deferred; tags and succeeded
        ArchiveResults are prefetched for the list-display columns.  The
        aggregate annotations (output-size sum, succeeded-result count, tag
        count) are comparatively expensive, so each is added only when the
        user is actually sorting by the corresponding computed column.
        """
        self.request = request
        # NOTE(review): _get_ordering_fields() is not visible here -- assumed
        # to return the active sort column names for this request; confirm.
        ordering_fields = self._get_ordering_fields(request)
        needs_size_sort = 'size_with_stats' in ordering_fields
        needs_files_sort = 'files' in ordering_fields
        needs_tags_sort = 'tags_inline' in ordering_fields
        # Only succeeded results feed the list columns; .only() keeps the
        # prefetched rows narrow.
        prefetch_qs = ArchiveResult.objects.filter(
            Q(status='succeeded')
        ).only(
            'id',
            'snapshot_id',
            'plugin',
            'status',
            'output_size',
            'output_files',
            'output_str',
        )
        qs = (
            super()
            .get_queryset(request)
            .defer('config', 'notes')
            .prefetch_related('tags')
            .prefetch_related(Prefetch('archiveresult_set', queryset=prefetch_qs))
        )
        if needs_size_sort:
            # Coalesce(..., 0) makes snapshots with no succeeded results sort
            # as 0 rather than NULL.
            qs = qs.annotate(
                output_size_sum=Coalesce(Sum(
                    'archiveresult__output_size',
                    filter=Q(archiveresult__status='succeeded'),
                ), 0),
            )
        if needs_files_sort:
            qs = qs.annotate(
                ar_succeeded_count=Count(
                    'archiveresult',
                    filter=Q(archiveresult__status='succeeded'),
                ),
            )
        if needs_tags_sort:
            qs = qs.annotate(tag_count=Count('tags', distinct=True))
        return qs
@admin.display(description="Imported Timestamp")
def imported_timestamp(self, obj):
@@ -233,17 +302,19 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
# )
def admin_actions(self, obj):
summary_url = build_web_url(f'/{obj.archive_path}')
results_url = build_web_url(f'/{obj.archive_path}/index.html#all')
return format_html(
'''
<div style="display: flex; flex-wrap: wrap; gap: 12px; align-items: center;">
<a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #f8fafc; border: 1px solid #e2e8f0; border-radius: 8px; color: #334155; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
href="/{}"
href="{}"
onmouseover="this.style.background='#f1f5f9'; this.style.borderColor='#cbd5e1';"
onmouseout="this.style.background='#f8fafc'; this.style.borderColor='#e2e8f0';">
📄 Summary Page
</a>
<a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #f8fafc; border: 1px solid #e2e8f0; border-radius: 8px; color: #334155; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
href="/{}/index.html#all"
href="{}"
onmouseover="this.style.background='#f1f5f9'; this.style.borderColor='#cbd5e1';"
onmouseout="this.style.background='#f8fafc'; this.style.borderColor='#e2e8f0';">
📁 Result Files
@@ -263,7 +334,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
title="Get missing extractors"
onmouseover="this.style.background='#d1fae5';"
onmouseout="this.style.background='#ecfdf5';">
⬇️ Get Missing
⬇️ Finish
</a>
<a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #eff6ff; border: 1px solid #bfdbfe; border-radius: 8px; color: #1e40af; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
href="/admin/core/snapshot/?id__exact={}"
@@ -291,8 +362,8 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
<b>Tip:</b> Action buttons link to the list view with this snapshot pre-selected. Select it and use the action dropdown to execute.
</p>
''',
obj.archive_path,
obj.archive_path,
summary_url,
results_url,
obj.url,
obj.pk,
obj.pk,
@@ -301,6 +372,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
)
def status_info(self, obj):
favicon_url = build_snapshot_url(str(obj.id), 'favicon.ico')
return format_html(
'''
Archived: {} ({} files {}) &nbsp; &nbsp;
@@ -310,7 +382,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
'' if obj.is_archived else '',
obj.num_outputs,
self.size(obj) or '0kb',
f'/{obj.archive_path}/favicon.ico',
favicon_url,
obj.extension or '-',
)
@@ -323,7 +395,37 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
ordering='title',
)
    def title_str(self, obj):
        """Changelist 'Title' cell: linked title (when meaningful) above the URL.

        The title line is suppressed when the title is empty, the
        'Pending...' placeholder, or merely a copy of the URL -- in those
        cases only the URL line is rendered.
        """
        title_raw = (obj.title or '').strip()
        url_raw = (obj.url or '').strip()
        title_normalized = title_raw.lower()
        url_normalized = url_raw.lower()
        show_title = bool(title_raw) and title_normalized != 'pending...' and title_normalized != url_normalized
        css_class = 'fetched' if show_title else 'pending'
        detail_url = build_web_url(f'/{obj.archive_path}/index.html')
        title_html = ''
        if show_title:
            # Title text is html-unescaped then url-decoded, and capped at 128 chars.
            title_html = format_html(
                '<a href="{}">'
                '<b class="status-{}">{}</b>'
                '</a>',
                detail_url,
                css_class,
                urldecode(htmldecode(title_raw))[:128],
            )
        # Second line: the raw URL itself as a selectable, clickable code span.
        return format_html(
            '{}'
            '<div style="font-size: 11px; color: #64748b; margin-top: 2px;">'
            '<a href="{}"><code style="user-select: all;">{}</code></a>'
            '</div>',
            title_html,
            url_raw or obj.url,
            (url_raw or obj.url)[:128],
        )
@admin.display(description='Tags', ordering='tag_count')
def tags_inline(self, obj):
widget = InlineTagEditorWidget(snapshot_id=str(obj.pk))
tags_html = widget.render(
name=f'tags_{obj.pk}',
@@ -331,28 +433,58 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
attrs={'id': f'tags_{obj.pk}'},
snapshot_id=str(obj.pk),
)
return mark_safe(f'<span class="tags-inline-editor">{tags_html}</span>')
# Show title if available, otherwise show URL
display_text = obj.title or obj.url
css_class = 'fetched' if obj.title else 'pending'
@admin.display(description='Preview', empty_value='')
def preview_icon(self, obj):
results = self._get_prefetched_results(obj)
has_screenshot = False
has_favicon = False
if results is not None:
has_screenshot = any(r.plugin == 'screenshot' for r in results)
has_favicon = any(r.plugin == 'favicon' for r in results)
if not has_screenshot and not has_favicon:
return None
if has_screenshot:
img_url = build_snapshot_url(str(obj.id), 'screenshot/screenshot.png')
fallbacks = [
build_snapshot_url(str(obj.id), 'screenshot.png'),
build_snapshot_url(str(obj.id), 'favicon/favicon.ico'),
build_snapshot_url(str(obj.id), 'favicon.ico'),
]
img_alt = 'Screenshot'
preview_class = 'screenshot'
else:
img_url = build_snapshot_url(str(obj.id), 'favicon/favicon.ico')
fallbacks = [
build_snapshot_url(str(obj.id), 'favicon.ico'),
]
img_alt = 'Favicon'
preview_class = 'favicon'
fallback_list = ','.join(fallbacks)
onerror_js = (
"this.dataset.fallbacks && this.dataset.fallbacks.length ? "
"(this.src=this.dataset.fallbacks.split(',').shift(), "
"this.dataset.fallbacks=this.dataset.fallbacks.split(',').slice(1).join(',')) : "
"this.remove()"
)
return format_html(
'<a href="/{}">'
'<img src="/{}/favicon.ico" class="favicon" onerror="this.remove()">'
'</a>'
'<a href="/{}/index.html">'
'<b class="status-{}">{}</b>'
'</a>',
obj.archive_path,
obj.archive_path,
obj.archive_path,
css_class,
urldecode(htmldecode(display_text))[:128]
) + mark_safe(f' <span class="tags-inline-editor">{tags_html}</span>')
'<img src="{}" alt="{}" class="snapshot-preview {}" decoding="async" loading="lazy" '
'onerror="{}" data-fallbacks="{}">',
img_url,
img_alt,
preview_class,
onerror_js,
fallback_list,
)
@admin.display(
description='Files Saved',
# ordering='archiveresult_count',
ordering='ar_succeeded_count',
)
def files(self, obj):
# return '-'
@@ -371,8 +503,8 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
else:
size_txt = mark_safe('<span style="opacity: 0.3">...</span>')
return format_html(
'<a href="/{}" title="View all files">{}</a>',
obj.archive_path,
'<a href="{}" title="View all files">{}</a>',
build_web_url(f'/{obj.archive_path}'),
size_txt,
)
@@ -382,7 +514,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
)
def status_with_progress(self, obj):
"""Show status with progress bar for in-progress snapshots."""
stats = obj.get_progress_stats()
stats = self._get_progress_stats(obj)
# Status badge colors
status_colors = {
@@ -440,16 +572,13 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
@admin.display(
description='Size',
ordering='output_size_sum',
)
def size_with_stats(self, obj):
"""Show archive size with output size from archive results."""
stats = obj.get_progress_stats()
# Use output_size from archive results if available, fallback to disk size
stats = self._get_progress_stats(obj)
output_size = stats['output_size']
archive_size = os.access(Path(obj.output_dir) / 'index.html', os.F_OK) and obj.archive_size
size_bytes = output_size or archive_size or 0
size_bytes = output_size or 0
if size_bytes:
size_txt = printable_filesize(size_bytes)
@@ -461,22 +590,76 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
# Show hook statistics
if stats['total'] > 0:
return format_html(
'<a href="/{}" title="View all files" style="white-space: nowrap;">'
'<a href="{}" title="View all files" style="white-space: nowrap;">'
'{}</a>'
'<div style="font-size: 10px; color: #94a3b8; margin-top: 2px;">'
'{}/{} hooks</div>',
obj.archive_path,
build_web_url(f'/{obj.archive_path}'),
size_txt,
stats['succeeded'],
stats['total'],
)
return format_html(
'<a href="/{}" title="View all files">{}</a>',
obj.archive_path,
'<a href="{}" title="View all files">{}</a>',
build_web_url(f'/{obj.archive_path}'),
size_txt,
)
def _get_progress_stats(self, obj):
results = self._get_prefetched_results(obj)
if results is None:
return obj.get_progress_stats()
total = len(results)
succeeded = sum(1 for r in results if r.status == 'succeeded')
failed = sum(1 for r in results if r.status == 'failed')
running = sum(1 for r in results if r.status == 'started')
skipped = sum(1 for r in results if r.status == 'skipped')
pending = max(total - succeeded - failed - running - skipped, 0)
completed = succeeded + failed + skipped
percent = int((completed / total * 100) if total > 0 else 0)
is_sealed = obj.status not in (obj.StatusChoices.QUEUED, obj.StatusChoices.STARTED)
output_size = None
if hasattr(obj, 'output_size_sum'):
output_size = obj.output_size_sum or 0
else:
output_size = sum(r.output_size or 0 for r in results if r.status == 'succeeded')
return {
'total': total,
'succeeded': succeeded,
'failed': failed,
'running': running,
'pending': pending,
'skipped': skipped,
'percent': percent,
'output_size': output_size or 0,
'is_sealed': is_sealed,
}
def _get_prefetched_results(self, obj):
if hasattr(obj, '_prefetched_objects_cache') and 'archiveresult_set' in obj._prefetched_objects_cache:
return obj.archiveresult_set.all()
return None
def _get_ordering_fields(self, request):
ordering = request.GET.get('o')
if not ordering:
return set()
fields = set()
for part in ordering.split('.'):
if not part:
continue
try:
idx = abs(int(part)) - 1
except ValueError:
continue
if 0 <= idx < len(self.list_display):
fields.add(self.list_display[idx])
return fields
@admin.display(
description='Original URL',
ordering='url',
@@ -524,20 +707,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
# return super().changelist_view(request, extra_context=None)
@admin.action(
description=" Get Title"
)
def update_titles(self, request, queryset):
count = queryset.count()
# Queue snapshots for archiving via the state machine system
queued = bg_archive_snapshots(queryset, kwargs={"overwrite": True, "methods": ["title", "favicon"], "out_dir": DATA_DIR})
messages.success(
request,
f"Queued {queued} snapshots for title/favicon update. The orchestrator will process them in the background.",
)
@admin.action(
description="⬇️ Get Missing"
description=" Finish"
)
def update_snapshots(self, request, queryset):
count = queryset.count()
@@ -551,7 +721,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
@admin.action(
description="🆕 Archive Again"
description="⬇️ Fresh"
)
def resnapshot_snapshot(self, request, queryset):
for snapshot in queryset:
@@ -579,7 +749,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
)
@admin.action(
description=" Delete"
description="🗑 Delete"
)
def delete_snapshots(self, request, queryset):
"""Delete snapshots in a single transaction to avoid SQLite concurrency issues."""

View File

@@ -1,6 +1,9 @@
__package__ = 'archivebox.core'
from django.apps import AppConfig
import os
_ORCHESTRATOR_BOOTSTRAPPED = False
class CoreConfig(AppConfig):
@@ -10,6 +13,7 @@ class CoreConfig(AppConfig):
def ready(self):
"""Register the archivebox.core.admin_site as the main django admin site"""
import sys
from django.utils.autoreload import DJANGO_AUTORELOAD_ENV
from archivebox.core.admin_site import register_admin_site
register_admin_site()
@@ -18,3 +22,45 @@ class CoreConfig(AppConfig):
# Skip during makemigrations to avoid premature state machine access
if 'makemigrations' not in sys.argv:
from archivebox.core import models # noqa: F401
pidfile = os.environ.get('ARCHIVEBOX_RUNSERVER_PIDFILE')
if pidfile:
should_write_pid = True
if os.environ.get('ARCHIVEBOX_AUTORELOAD') == '1':
should_write_pid = os.environ.get(DJANGO_AUTORELOAD_ENV) == 'true'
if should_write_pid:
try:
with open(pidfile, 'w') as handle:
handle.write(str(os.getpid()))
except Exception:
pass
def _should_manage_orchestrator() -> bool:
if os.environ.get('ARCHIVEBOX_ORCHESTRATOR_MANAGED_BY_WATCHER') == '1':
return False
if os.environ.get('ARCHIVEBOX_ORCHESTRATOR_PROCESS') == '1':
return False
if os.environ.get('ARCHIVEBOX_RUNSERVER') == '1':
if os.environ.get('ARCHIVEBOX_AUTORELOAD') == '1':
return os.environ.get(DJANGO_AUTORELOAD_ENV) == 'true'
return True
argv = ' '.join(sys.argv).lower()
if 'orchestrator' in argv:
return False
return 'daphne' in argv and '--reload' in sys.argv
if _should_manage_orchestrator():
global _ORCHESTRATOR_BOOTSTRAPPED
if _ORCHESTRATOR_BOOTSTRAPPED:
return
_ORCHESTRATOR_BOOTSTRAPPED = True
from archivebox.machine.models import Process, Machine
from archivebox.workers.orchestrator import Orchestrator
Process.cleanup_stale_running()
machine = Machine.current()
if not Orchestrator.is_running():
Orchestrator(exit_on_idle=False).start()

View File

@@ -0,0 +1,189 @@
from __future__ import annotations
from __future__ import annotations
import re
from urllib.parse import urlparse
from archivebox.config.common import SERVER_CONFIG
_SNAPSHOT_ID_RE = re.compile(r"^[0-9a-fA-F-]{8,36}$")
def split_host_port(host: str) -> tuple[str, str | None]:
parsed = urlparse(f"//{host}")
hostname = (parsed.hostname or host or "").lower()
port = str(parsed.port) if parsed.port else None
return hostname, port
def _normalize_base_url(value: str | None) -> str:
if not value:
return ""
base = value.strip()
if not base:
return ""
if "://" not in base:
base = f"http://{base}"
parsed = urlparse(base)
if not parsed.netloc:
return ""
return f"{parsed.scheme}://{parsed.netloc}"
def normalize_base_url(value: str | None) -> str:
    """Public wrapper around _normalize_base_url: "scheme://netloc" or ""."""
    return _normalize_base_url(value)


def get_listen_host() -> str:
    """The configured LISTEN_HOST value ("host[:port]"), stripped; "" when unset."""
    return (SERVER_CONFIG.LISTEN_HOST or "").strip()


def get_listen_parts() -> tuple[str, str | None]:
    """LISTEN_HOST split into (hostname, port-or-None)."""
    return split_host_port(get_listen_host())
def _build_listen_host(subdomain: str | None) -> str:
    """Build "[subdomain.]listen-host[:port]" from LISTEN_HOST.

    Returns "" when LISTEN_HOST is not configured.
    """
    host, port = get_listen_parts()
    if not host:
        return ""
    base = f"{subdomain}.{host}" if subdomain else host
    return f"{base}:{port}" if port else base
def get_admin_host() -> str:
    """Host serving the admin UI: the ADMIN_BASE_URL override's netloc if set,
    otherwise the "admin." subdomain of LISTEN_HOST."""
    override = _normalize_base_url(SERVER_CONFIG.ADMIN_BASE_URL)
    if override:
        return urlparse(override).netloc.lower()
    return _build_listen_host("admin")


def get_web_host() -> str:
    """Host serving the public web UI: the ARCHIVE_BASE_URL override's netloc
    if set, otherwise the "web." subdomain of LISTEN_HOST."""
    override = _normalize_base_url(SERVER_CONFIG.ARCHIVE_BASE_URL)
    if override:
        return urlparse(override).netloc.lower()
    return _build_listen_host("web")
def get_api_host() -> str:
    # "api." subdomain of LISTEN_HOST ("" when LISTEN_HOST is unset).
    return _build_listen_host("api")


def get_public_host() -> str:
    # "public." subdomain of LISTEN_HOST.
    return _build_listen_host("public")


def get_snapshot_host(snapshot_id: str) -> str:
    # Per-snapshot subdomain, e.g. "<snapshot-id>.archivebox.localhost:8000".
    return _build_listen_host(snapshot_id)


def get_original_host(domain: str) -> str:
    # Subdomain used to serve an archived original-site domain.
    return _build_listen_host(domain)


def is_snapshot_subdomain(subdomain: str) -> bool:
    # Heuristic: snapshot ids are 8-36 chars of hex digits and dashes (UUID-like).
    return bool(_SNAPSHOT_ID_RE.match(subdomain or ""))
def get_listen_subdomain(request_host: str) -> str:
    """Return the subdomain label(s) of *request_host* relative to LISTEN_HOST.

    Returns "" when LISTEN_HOST is unset, when the explicit ports disagree,
    when the host is LISTEN_HOST itself, or when it is not a subdomain of it.
    """
    req_host, req_port = split_host_port(request_host)
    listen_host, listen_port = get_listen_parts()
    if not listen_host:
        return ""
    ports_conflict = bool(listen_port and req_port and listen_port != req_port)
    if ports_conflict or req_host == listen_host:
        return ""
    suffix = f".{listen_host}"
    return req_host[: -len(suffix)] if req_host.endswith(suffix) else ""
def host_matches(request_host: str, target_host: str) -> bool:
    """True when the hostnames match and any ports explicit on BOTH sides agree.

    A port present on only one side is ignored, so "example.com" matches
    "example.com:8000".
    """
    if not (request_host and target_host):
        return False
    req_host, req_port = split_host_port(request_host)
    tgt_host, tgt_port = split_host_port(target_host)
    if req_host != tgt_host:
        return False
    ports_disagree = bool(tgt_port and req_port and tgt_port != req_port)
    return not ports_disagree
def _scheme_from_request(request=None) -> str:
if request:
return request.scheme
return "http"
def _build_base_url_for_host(host: str, request=None) -> str:
    """Return "scheme://host" using the request's scheme (default "http"),
    or "" when *host* is empty."""
    if not host:
        return ""
    scheme = _scheme_from_request(request)
    return f"{scheme}://{host}"
def get_admin_base_url(request=None) -> str:
    """Absolute base URL of the admin host; an ADMIN_BASE_URL override wins."""
    override = _normalize_base_url(SERVER_CONFIG.ADMIN_BASE_URL)
    if override:
        return override
    return _build_base_url_for_host(get_admin_host(), request=request)


def get_web_base_url(request=None) -> str:
    """Absolute base URL of the public web host; an ARCHIVE_BASE_URL override wins."""
    override = _normalize_base_url(SERVER_CONFIG.ARCHIVE_BASE_URL)
    if override:
        return override
    return _build_base_url_for_host(get_web_host(), request=request)


def get_api_base_url(request=None) -> str:
    """Absolute base URL of the API host."""
    return _build_base_url_for_host(get_api_host(), request=request)
# Backwards-compat aliases (archive == web)
def get_archive_base_url(request=None) -> str:
    """Backwards-compat alias for get_web_base_url()."""
    return get_web_base_url(request=request)


def get_snapshot_base_url(snapshot_id: str, request=None) -> str:
    """Absolute base URL for a snapshot's own subdomain."""
    return _build_base_url_for_host(get_snapshot_host(snapshot_id), request=request)


def get_original_base_url(domain: str, request=None) -> str:
    """Absolute base URL for an original-site domain subdomain."""
    return _build_base_url_for_host(get_original_host(domain), request=request)


def build_admin_url(path: str = "", request=None) -> str:
    """Join *path* onto the admin base URL (root-relative if no base is known)."""
    return _build_url(get_admin_base_url(request), path)


def build_web_url(path: str = "", request=None) -> str:
    """Join *path* onto the public web base URL."""
    return _build_url(get_web_base_url(request), path)


def build_api_url(path: str = "", request=None) -> str:
    """Join *path* onto the API base URL."""
    return _build_url(get_api_base_url(request), path)


def build_archive_url(path: str = "", request=None) -> str:
    """Backwards-compat alias for build_web_url()."""
    return _build_url(get_archive_base_url(request), path)


def build_snapshot_url(snapshot_id: str, path: str = "", request=None) -> str:
    """Join *path* onto a snapshot's base URL."""
    return _build_url(get_snapshot_base_url(snapshot_id, request=request), path)


def build_original_url(domain: str, path: str = "", request=None) -> str:
    """Join *path* onto an original-site domain's base URL."""
    return _build_url(get_original_base_url(domain, request=request), path)
def _build_url(base_url: str, path: str) -> str:
if not base_url:
if not path:
return ""
return path if path.startswith("/") else f"/{path}"
if not path:
return base_url
return f"{base_url}{path if path.startswith('/') else f'/{path}'}"

View File

@@ -2,11 +2,33 @@ __package__ = 'archivebox.core'
import ipaddress
import re
from pathlib import Path
from django.utils import timezone
from django.contrib.auth.middleware import RemoteUserMiddleware
from django.contrib.auth.models import AnonymousUser
from django.core.exceptions import ImproperlyConfigured
from django.shortcuts import redirect
from django.contrib.staticfiles import finders
from django.utils.http import http_date
from django.http import HttpResponseNotModified
from archivebox.config.common import SERVER_CONFIG
from archivebox.config import VERSION
from archivebox.config.version import get_COMMIT_HASH
from archivebox.core.host_utils import (
build_admin_url,
build_api_url,
build_web_url,
get_api_host,
get_admin_host,
get_listen_host,
get_listen_subdomain,
get_public_host,
get_web_host,
host_matches,
is_snapshot_subdomain,
)
from archivebox.core.views import SnapshotHostView, OriginalDomainHostView
def detect_timezone(request, activate: bool=True):
@@ -30,17 +52,112 @@ def TimezoneMiddleware(get_response):
def CacheControlMiddleware(get_response):
snapshot_path_re = re.compile(r"^/[^/]+/\\d{8}/[^/]+/[0-9a-fA-F-]{8,36}/")
static_cache_key = (get_COMMIT_HASH() or VERSION or "dev").strip()
def middleware(request):
response = get_response(request)
if request.path.startswith('/static/'):
rel_path = request.path[len('/static/'):]
static_path = finders.find(rel_path)
if static_path:
try:
mtime = Path(static_path).stat().st_mtime
except OSError:
mtime = None
etag = f'"{static_cache_key}:{int(mtime) if mtime else 0}"'
inm = request.META.get("HTTP_IF_NONE_MATCH")
if inm:
inm_list = [item.strip() for item in inm.split(",")]
if etag in inm_list or etag.strip('"') in [i.strip('"') for i in inm_list]:
not_modified = HttpResponseNotModified()
not_modified.headers["ETag"] = etag
not_modified.headers["Cache-Control"] = "public, max-age=31536000, immutable"
if mtime:
not_modified.headers["Last-Modified"] = http_date(mtime)
return not_modified
response.headers["ETag"] = etag
response.headers["Cache-Control"] = "public, max-age=31536000, immutable"
if mtime and not response.headers.get("Last-Modified"):
response.headers["Last-Modified"] = http_date(mtime)
return response
if '/archive/' in request.path or '/static/' in request.path or snapshot_path_re.match(request.path):
policy = 'public' if SERVER_CONFIG.PUBLIC_SNAPSHOTS else 'private'
response['Cache-Control'] = f'{policy}, max-age=60, stale-while-revalidate=300'
# print('Set Cache-Control header to', response['Cache-Control'])
if not response.get('Cache-Control'):
policy = 'public' if SERVER_CONFIG.PUBLIC_SNAPSHOTS else 'private'
response['Cache-Control'] = f'{policy}, max-age=60, stale-while-revalidate=300'
# print('Set Cache-Control header to', response['Cache-Control'])
return response
return middleware
def HostRoutingMiddleware(get_response):
    """Route each request by its Host header to the matching role:
    admin UI, API, public web UI, public host, per-snapshot subdomain,
    original-domain subdomain, or a redirect onto the web host."""
    def middleware(request):
        request_host = (request.get_host() or "").lower()
        admin_host = get_admin_host()
        web_host = get_web_host()
        api_host = get_api_host()
        public_host = get_public_host()
        listen_host = get_listen_host()
        subdomain = get_listen_subdomain(request_host)

        # Admin host: pass through untouched (normal auth applies).
        if host_matches(request_host, admin_host):
            return get_response(request)

        # API host: force anonymous, bounce /admin to the admin host, and
        # prefix non-/api/ paths with /api.
        if host_matches(request_host, api_host):
            request.user = AnonymousUser()
            request._cached_user = request.user
            if request.path.startswith("/admin"):
                target = build_admin_url(request.path, request=request)
                if request.META.get("QUERY_STRING"):
                    target = f"{target}?{request.META['QUERY_STRING']}"
                return redirect(target)
            if not request.path.startswith("/api/"):
                # NOTE(review): a request to exactly "/api" (no trailing slash)
                # takes this branch and redirects to "/api/api" — confirm intended.
                target_path = f"/api{request.path if request.path.startswith('/') else f'/{request.path}'}"
                if request.META.get("QUERY_STRING"):
                    target_path = f"{target_path}?{request.META['QUERY_STRING']}"
                return redirect(target_path)
            return get_response(request)

        # Web host: force anonymous, bounce /admin to the admin host.
        if host_matches(request_host, web_host):
            request.user = AnonymousUser()
            request._cached_user = request.user
            if request.path.startswith("/admin"):
                target = build_admin_url(request.path, request=request)
                if request.META.get("QUERY_STRING"):
                    target = f"{target}?{request.META['QUERY_STRING']}"
                return redirect(target)
            return get_response(request)

        # Public host: force anonymous, serve normally.
        if host_matches(request_host, public_host):
            request.user = AnonymousUser()
            request._cached_user = request.user
            return get_response(request)

        # Any other subdomain of LISTEN_HOST: UUID-like labels are served as
        # snapshot views, everything else as an original-domain view.
        if subdomain:
            if is_snapshot_subdomain(subdomain):
                view = SnapshotHostView.as_view()
                return view(request, snapshot_id=subdomain, path=request.path.lstrip("/"))
            view = OriginalDomainHostView.as_view()
            return view(request, domain=subdomain, path=request.path.lstrip("/"))

        # Bare LISTEN_HOST: redirect onto the web host.
        if host_matches(request_host, listen_host):
            target = build_web_url(request.path, request=request)
            if request.META.get("QUERY_STRING"):
                target = f"{target}?{request.META['QUERY_STRING']}"
            return redirect(target)

        # Unknown host: redirect to the web host when one is configured,
        # otherwise fall through and serve the request as-is.
        if admin_host or web_host:
            target = build_web_url(request.path, request=request)
            if target:
                if request.META.get("QUERY_STRING"):
                    target = f"{target}?{request.META['QUERY_STRING']}"
                return redirect(target)

        return get_response(request)
    return middleware
class ReverseProxyAuthMiddleware(RemoteUserMiddleware):
header = 'HTTP_{normalized}'.format(normalized=SERVER_CONFIG.REVERSE_PROXY_USER_HEADER.replace('-', '_').upper())

View File

@@ -0,0 +1,17 @@
# Generated by Codex on 2026-01-21
from django.db import migrations, models
class Migration(migrations.Migration):
    """Add a composite (snapshot, status) index on ArchiveResult."""

    dependencies = [
        ('core', '0030_alter_archiveresult_id'),
    ]

    operations = [
        migrations.AddIndex(
            model_name='archiveresult',
            # Name must stay in sync with ArchiveResult.Meta.indexes.
            index=models.Index(fields=['snapshot', 'status'], name='archiveresult_snap_status_idx'),
        ),
    ]

View File

@@ -1297,7 +1297,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
path = self.archive_path
output = ""
output_template = '<a href="/{}/{}" class="exists-{}" title="{}">{}</a> &nbsp;'
output_template = '<a href="/{}/{}" class="exists-{}" title="{}">{}</a>'
# Get all plugins from hooks system (sorted by numeric prefix)
all_plugins = [get_plugin_name(e) for e in get_plugins()]
@@ -1322,7 +1322,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
icon
)
return format_html('<span class="files-icons" style="font-size: 1.1em; opacity: 0.8; min-width: 240px; display: inline-block">{}</span>', mark_safe(output))
return format_html('<span class="files-icons" style="font-size: 1em; opacity: 0.8; display: inline-grid; grid-auto-flow: column; grid-auto-columns: auto; grid-template-rows: repeat(4, auto); gap: 0 0; justify-content: start; align-content: start;">{}</span>', mark_safe(output))
cache_result = cache.get(cache_key)
if cache_result:
@@ -1789,7 +1789,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
)['total_size'] or 0
# Check if sealed
is_sealed = self.status in (self.StatusChoices.SEALED, self.StatusChoices.FAILED, self.StatusChoices.BACKOFF)
is_sealed = self.status not in (self.StatusChoices.QUEUED, self.StatusChoices.STARTED)
return {
'total': total,
@@ -1992,6 +1992,14 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
size = sum(p.stat().st_size for p in abs_path.rglob('*') if p.is_file())
else:
size = abs_path.stat().st_size
plugin_lower = (result.plugin or '').lower()
if plugin_lower in ('ytdlp', 'yt-dlp', 'youtube-dl'):
plugin_dir = snap_dir / result.plugin
if plugin_dir.exists():
try:
size = sum(p.stat().st_size for p in plugin_dir.rglob('*') if p.is_file())
except OSError:
pass
outputs.append({
'name': result.plugin,
'path': embed_path,
@@ -2057,6 +2065,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
def to_dict(self, extended: bool = False) -> Dict[str, Any]:
"""Convert Snapshot to a dictionary (replacement for Link._asdict())"""
from archivebox.misc.util import ts_to_date_str
from archivebox.core.host_utils import build_snapshot_url
result = {
'TYPE': 'core.models.Snapshot',
@@ -2078,6 +2087,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
'is_static': self.is_static,
'is_archived': self.is_archived,
'archive_path': self.archive_path,
'archive_url': build_snapshot_url(str(self.id), 'index.html'),
'output_dir': self.output_dir,
'link_dir': self.output_dir, # backwards compatibility alias
'archive_size': self.archive_size,
@@ -2129,14 +2139,17 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
outputs_by_plugin = {out['name']: out for out in outputs}
best_preview_path = 'about:blank'
best_result = {'path': 'about:blank', 'result': None}
for plugin in preview_priority:
out = outputs_by_plugin.get(plugin)
if out and out.get('path'):
best_preview_path = out['path']
best_result = out
break
if best_preview_path == 'about:blank' and outputs:
best_preview_path = outputs[0].get('path') or 'about:blank'
best_result = outputs[0]
context = {
**self.to_dict(extended=True),
'title': htmlencode(self.title or (self.base_url if self.is_archived else TITLE_LOADING_MSG)),
@@ -2151,6 +2164,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
'SAVE_ARCHIVE_DOT_ORG': SAVE_ARCHIVE_DOT_ORG,
'PREVIEW_ORIGINALS': SERVER_CONFIG.PREVIEW_ORIGINALS,
'best_preview_path': best_preview_path,
'best_result': best_result,
'archiveresults': outputs,
}
rendered_html = render_to_string('snapshot.html', context)
@@ -2326,6 +2340,9 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
app_label = 'core'
verbose_name = 'Archive Result'
verbose_name_plural = 'Archive Results Log'
indexes = [
models.Index(fields=['snapshot', 'status'], name='archiveresult_snap_status_idx'),
]
def __str__(self):
return f'[{self.id}] {self.snapshot.url[:64]} -> {self.plugin}'
@@ -2487,6 +2504,20 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
plugin_lower = (plugin_name or '').lower()
prefer_media = plugin_lower in ('ytdlp', 'yt-dlp', 'youtube-dl')
preferred_text = []
if plugin_lower:
preferred_text.extend([
f'{plugin_lower}.jsonl',
f'{plugin_lower}.json',
f'{plugin_lower}.txt',
f'{plugin_lower}.log',
])
preferred_text.extend(['index.jsonl', 'index.json'])
for name in preferred_text:
candidate = dir_path / name
if candidate.exists() and candidate.is_file():
return candidate
if not prefer_media:
for name in ('index.html', 'index.htm'):
candidate = dir_path / name
@@ -2504,6 +2535,8 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
if file_path.is_dir() or file_path.name.startswith('.'):
continue
ext = file_path.suffix.lstrip('.').lower()
if ext in ('pid', 'log', 'sh'):
continue
if ext not in embeddable_exts:
continue
try:
@@ -2547,20 +2580,44 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
# Fallback: treat output_str as a file path only if it exists on disk
if self.output_str:
try:
output_path = Path(self.output_str)
raw_output = str(self.output_str).strip()
if raw_output in ('.', './', ''):
best_file = self._find_best_output_file(plugin_dir, self.plugin)
if best_file:
return str(best_file.relative_to(snapshot_dir))
output_path = None
else:
output_path = Path(raw_output)
if output_path.is_absolute():
if output_path and output_path.is_absolute():
# If absolute and within snapshot dir, normalize to relative
if snapshot_dir in output_path.parents and output_path.exists():
return str(output_path.relative_to(snapshot_dir))
else:
if output_path.is_file():
return str(output_path.relative_to(snapshot_dir))
if output_path.is_dir():
best_file = self._find_best_output_file(output_path, self.plugin)
if best_file:
return str(best_file.relative_to(snapshot_dir))
elif output_path:
# If relative, prefer plugin-prefixed path, then direct path
if (plugin_dir / output_path).exists():
return f'{self.plugin}/{output_path}'
plugin_candidate = plugin_dir / output_path
if plugin_candidate.exists():
if plugin_candidate.is_file():
return f'{self.plugin}/{output_path}'
if plugin_candidate.is_dir():
best_file = self._find_best_output_file(plugin_candidate, self.plugin)
if best_file:
return str(best_file.relative_to(snapshot_dir))
if output_path.name in ('index.html', 'index.json') and output_path.parent == Path('.'):
return None
if (snapshot_dir / output_path).exists():
return str(output_path)
snapshot_candidate = snapshot_dir / output_path
if snapshot_candidate.exists():
if snapshot_candidate.is_file():
return str(output_path)
if snapshot_candidate.is_dir():
best_file = self._find_best_output_file(snapshot_candidate, self.plugin)
if best_file:
return str(best_file.relative_to(snapshot_dir))
except Exception:
pass
@@ -2569,7 +2626,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
ignored = {'stdout.log', 'stderr.log', 'hook.pid', 'listener.pid', 'cmd.sh'}
output_candidates = [
f for f in self.output_files.keys()
if Path(f).name not in ignored
if Path(f).name not in ignored and Path(f).suffix not in ('.pid', '.log', '.sh')
]
first_file = output_candidates[0] if output_candidates else None
if first_file and (plugin_dir / first_file).exists():

View File

@@ -12,6 +12,7 @@ import archivebox
from archivebox.config import DATA_DIR, PACKAGE_DIR, ARCHIVE_DIR, CONSTANTS # noqa
from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG, STORAGE_CONFIG # noqa
from archivebox.core.host_utils import normalize_base_url, get_admin_base_url, get_api_base_url
IS_MIGRATING = "makemigrations" in sys.argv[:3] or "migrate" in sys.argv[:3]
@@ -77,9 +78,11 @@ MIDDLEWARE = [
"django.middleware.security.SecurityMiddleware",
"django.contrib.sessions.middleware.SessionMiddleware",
"django.middleware.common.CommonMiddleware",
"archivebox.api.middleware.ApiCorsMiddleware",
"django.middleware.csrf.CsrfViewMiddleware",
"django.contrib.auth.middleware.AuthenticationMiddleware",
"archivebox.core.middleware.ReverseProxyAuthMiddleware",
"archivebox.core.middleware.HostRoutingMiddleware",
"django.contrib.messages.middleware.MessageMiddleware",
"archivebox.core.middleware.CacheControlMiddleware",
# Additional middlewares from plugins (if any)
@@ -347,6 +350,14 @@ SECRET_KEY = SERVER_CONFIG.SECRET_KEY or get_random_string(50, "abcdefghijklmnop
ALLOWED_HOSTS = SERVER_CONFIG.ALLOWED_HOSTS.split(",")
CSRF_TRUSTED_ORIGINS = list(set(SERVER_CONFIG.CSRF_TRUSTED_ORIGINS.split(",")))
admin_base_url = normalize_base_url(get_admin_base_url())
if admin_base_url and admin_base_url not in CSRF_TRUSTED_ORIGINS:
CSRF_TRUSTED_ORIGINS.append(admin_base_url)
api_base_url = normalize_base_url(get_api_base_url())
if api_base_url and api_base_url not in CSRF_TRUSTED_ORIGINS:
CSRF_TRUSTED_ORIGINS.append(api_base_url)
# automatically fix case when user sets ALLOWED_HOSTS (e.g. to archivebox.example.com)
# but forgets to add https://archivebox.example.com to CSRF_TRUSTED_ORIGINS
for hostname in ALLOWED_HOSTS:
@@ -363,6 +374,7 @@ CSRF_COOKIE_SECURE = False
SESSION_COOKIE_SECURE = False
SESSION_COOKIE_HTTPONLY = True
SESSION_COOKIE_DOMAIN = None
CSRF_COOKIE_DOMAIN = None
SESSION_COOKIE_AGE = 1209600 # 2 weeks
SESSION_EXPIRE_AT_BROWSER_CLOSE = False
SESSION_SAVE_EVERY_REQUEST = False

View File

@@ -15,6 +15,6 @@ def get_config(key: str) -> any:
Usage: {% get_config "ARCHIVEDOTORG_ENABLED" as enabled %}
"""
try:
return _get_config(key)
return _get_config().get(key)
except (KeyError, AttributeError):
return None

View File

@@ -9,10 +9,114 @@ from pathlib import Path
from archivebox.hooks import (
get_plugin_icon, get_plugin_template, get_plugin_name,
)
from archivebox.core.host_utils import (
get_admin_base_url,
get_web_base_url,
get_snapshot_base_url,
build_snapshot_url,
)
register = template.Library()
_MEDIA_FILE_EXTS = {
'.mp4', '.webm', '.mkv', '.avi', '.mov', '.flv', '.wmv', '.m4v', '.mpg', '.mpeg', '.ts', '.m2ts', '.mts',
'.3gp', '.3g2', '.ogv',
'.mp3', '.m4a', '.aac', '.ogg', '.oga', '.opus', '.wav', '.flac', '.alac', '.aiff', '.wma', '.mka', '.ac3', '.eac3', '.dts',
}
def _count_media_files(result) -> int:
try:
output_files = getattr(result, 'output_files', None) or {}
except Exception:
output_files = {}
count_from_output = 0
if output_files:
count_from_output = sum(
1
for path in output_files.keys()
if Path(path).suffix.lower() in _MEDIA_FILE_EXTS
)
if count_from_output >= 2:
return count_from_output
try:
plugin_dir = Path(result.snapshot_dir) / result.plugin
except Exception:
return 0
if not plugin_dir.exists():
return 0
count = 0
scanned = 0
max_scan = 500
for file_path in plugin_dir.rglob('*'):
if scanned >= max_scan:
break
scanned += 1
if not file_path.is_file():
continue
if file_path.suffix.lower() in _MEDIA_FILE_EXTS:
count += 1
return max(count_from_output, count)
def _list_media_files(result) -> list[dict]:
    """List media files for *result* as {'name', 'path', 'size'} dicts,
    sorted case-insensitively by filename.

    Candidates come from the recorded output_files; when none are recorded,
    the plugin's output directory is scanned on disk (capped at 2000 entries).
    'path' is relative to the snapshot directory.
    """
    listing: list[dict] = []
    try:
        snapshot_dir = Path(result.snapshot_dir)
        plugin_dir = snapshot_dir / result.plugin
    except Exception:
        return listing

    recorded = getattr(result, 'output_files', None) or {}
    candidates: list[Path] = [
        Path(name)
        for name in recorded
        if Path(name).suffix.lower() in _MEDIA_FILE_EXTS
    ]

    if not candidates and plugin_dir.exists():
        for seen, entry in enumerate(plugin_dir.rglob('*')):
            if seen >= 2000:  # cap the walk on huge output dirs
                break
            if not entry.is_file() or entry.suffix.lower() not in _MEDIA_FILE_EXTS:
                continue
            try:
                candidates.append(entry.relative_to(plugin_dir))
            except ValueError:
                continue

    for rel_path in candidates:
        full_path = plugin_dir / rel_path
        if not (full_path.exists() and full_path.is_file()):
            continue
        try:
            size = full_path.stat().st_size
        except OSError:
            size = None
        try:
            href = str(full_path.relative_to(snapshot_dir))
        except ValueError:
            href = str(Path(result.plugin) / rel_path)
        listing.append({
            'name': full_path.name,
            'path': href,
            'size': size,
        })

    listing.sort(key=lambda item: item['name'].lower())
    return listing
@register.filter(name='split')
def split(value, separator: str=','):
    """Template filter: split *value* on *separator* (falsy values split as "")."""
    text = value or ''
    return text.split(separator)
@@ -52,6 +156,28 @@ def url_replace(context, **kwargs):
return dict_.urlencode()
@register.simple_tag(takes_context=True)
def admin_base_url(context) -> str:
    """Template tag: absolute base URL of the admin host for this request."""
    return get_admin_base_url(request=context.get('request'))


@register.simple_tag(takes_context=True)
def web_base_url(context) -> str:
    """Template tag: absolute base URL of the public web host for this request."""
    return get_web_base_url(request=context.get('request'))


@register.simple_tag(takes_context=True)
def snapshot_base_url(context, snapshot) -> str:
    """Template tag: base URL of a snapshot's subdomain.

    Accepts either a Snapshot instance or a raw snapshot id.
    """
    snapshot_id = getattr(snapshot, 'id', snapshot)
    return get_snapshot_base_url(str(snapshot_id), request=context.get('request'))


@register.simple_tag(takes_context=True)
def snapshot_url(context, snapshot, path: str = "") -> str:
    """Template tag: full URL to *path* under a snapshot's subdomain."""
    snapshot_id = getattr(snapshot, 'id', snapshot)
    return build_snapshot_url(str(snapshot_id), path, request=context.get('request'))
@register.simple_tag
def plugin_icon(plugin: str) -> str:
"""
@@ -82,24 +208,41 @@ def plugin_card(context, result) -> str:
template_str = get_plugin_template(plugin, 'card')
# Use embed_path() for the display path
output_path = result.embed_path() if hasattr(result, 'embed_path') else ''
raw_output_path = result.embed_path() if hasattr(result, 'embed_path') else ''
output_url = build_snapshot_url(
str(getattr(result, 'snapshot_id', '')),
raw_output_path or '',
request=context.get('request'),
)
icon_html = get_plugin_icon(plugin)
plugin_lower = (plugin or '').lower()
media_file_count = _count_media_files(result) if plugin_lower in ('ytdlp', 'yt-dlp', 'youtube-dl') else 0
media_files = _list_media_files(result) if plugin_lower in ('ytdlp', 'yt-dlp', 'youtube-dl') else []
if media_files:
snapshot_id = str(getattr(result, 'snapshot_id', ''))
request = context.get('request')
for item in media_files:
path = item.get('path') or ''
item['url'] = build_snapshot_url(snapshot_id, path, request=request) if path else ''
output_lower = (output_path or '').lower()
output_lower = (raw_output_path or '').lower()
text_preview_exts = ('.json', '.jsonl', '.txt', '.csv', '.tsv', '.xml', '.yml', '.yaml', '.md', '.log')
force_text_preview = output_lower.endswith(text_preview_exts)
# Create a mini template and render it with context
try:
if template_str and output_path and str(output_path).strip() not in ('.', '/', './') and not force_text_preview:
if template_str and raw_output_path and str(raw_output_path).strip() not in ('.', '/', './') and not force_text_preview:
tpl = template.Template(template_str)
ctx = template.Context({
'result': result,
'snapshot': result.snapshot,
'output_path': output_path,
'output_path': output_url,
'output_path_raw': raw_output_path,
'plugin': plugin,
'plugin_icon': icon_html,
'media_file_count': media_file_count,
'media_files': media_files,
})
rendered = tpl.render(ctx)
# Only return non-empty content (strip whitespace to check)
@@ -108,10 +251,10 @@ def plugin_card(context, result) -> str:
except Exception:
pass
if force_text_preview and output_path and str(output_path).strip() not in ('.', '/', './'):
output_file = Path(output_path)
if force_text_preview and raw_output_path and str(raw_output_path).strip() not in ('.', '/', './'):
output_file = Path(raw_output_path)
if not output_file.is_absolute():
output_file = Path(result.snapshot_dir) / output_path
output_file = Path(result.snapshot_dir) / raw_output_path
try:
output_file = output_file.resolve()
snap_dir = Path(result.snapshot_dir).resolve()
@@ -169,14 +312,20 @@ def plugin_full(context, result) -> str:
if not template_str:
return ''
output_path = result.embed_path() if hasattr(result, 'embed_path') else ''
raw_output_path = result.embed_path() if hasattr(result, 'embed_path') else ''
output_url = build_snapshot_url(
str(getattr(result, 'snapshot_id', '')),
raw_output_path or '',
request=context.get('request'),
)
try:
tpl = template.Template(template_str)
ctx = template.Context({
'result': result,
'snapshot': result.snapshot,
'output_path': output_path,
'output_path': output_url,
'output_path_raw': raw_output_path,
'plugin': plugin,
})
rendered = tpl.render(ctx)
@@ -198,3 +347,30 @@ def plugin_name(value: str) -> str:
Usage: {{ result.plugin|plugin_name }}
"""
return get_plugin_name(value)
@register.filter
def plugin_display_name(value: str) -> str:
    """
    Human-friendly plugin name overrides for UI display.
    """
    resolved = get_plugin_name(value)
    # 'merkletree' is shown to users as 'hashes'
    return 'hashes' if resolved == 'merkletree' else resolved
@register.simple_tag(takes_context=True)
def api_token(context) -> str:
    """
    Return an API token string for the logged-in user, creating one if needed.

    Returns '' for anonymous requests or when no token could be created.
    """
    from archivebox.api.auth import get_or_create_api_token
    user = getattr(context.get('request'), 'user', None)
    if not (user and user.is_authenticated):
        return ''
    token = get_or_create_api_token(user)
    return token.token if token else ''

View File

@@ -8,7 +8,7 @@ from django.views.generic.base import RedirectView
from archivebox.misc.serve_static import serve_static
from archivebox.core.admin_site import archivebox_admin
from archivebox.core.views import HomepageView, SnapshotView, SnapshotPathView, PublicIndexView, AddView, HealthCheckView, live_progress_view
from archivebox.core.views import HomepageView, SnapshotView, SnapshotPathView, PublicIndexView, AddView, WebAddView, HealthCheckView, live_progress_view
from archivebox.workers.views import JobsDashboardView
@@ -29,11 +29,15 @@ urlpatterns = [
path('docs/', RedirectView.as_view(url='https://github.com/ArchiveBox/ArchiveBox/wiki'), name='Docs'),
path('public/', PublicIndexView.as_view(), name='public-index'),
path('public.html', RedirectView.as_view(url='/public/'), name='public-index-html'),
path('archive/', RedirectView.as_view(url='/')),
path('archive/<path:path>', SnapshotView.as_view(), name='Snapshot'),
re_path(r'^web/(?P<url>(?!\d{4}(?:\d{2})?(?:\d{2})?(?:/|$)).+)$', WebAddView.as_view(), name='web-add'),
re_path(r'^(?P<username>[^/]+)/(?P<date>\d{4}(?:\d{2})?(?:\d{2})?)/(?P<url>https?://.*)$', SnapshotPathView.as_view(), name='snapshot-path-url'),
re_path(r'^(?P<username>[^/]+)/(?P<date>\d{4}(?:\d{2})?(?:\d{2})?)/(?P<domain>[^/]+)(?:/(?P<snapshot_id>[0-9a-fA-F-]{8,36})(?:/(?P<path>.*))?)?$', SnapshotPathView.as_view(), name='snapshot-path'),
re_path(r'^(?P<username>[^/]+)/(?P<url>https?://.*)$', SnapshotPathView.as_view(), name='snapshot-path-url-nodate'),
re_path(r'^(?P<username>[^/]+)/(?P<domain>[^/]+)(?:/(?P<snapshot_id>[0-9a-fA-F-]{8,36})(?:/(?P<path>.*))?)?$', SnapshotPathView.as_view(), name='snapshot-path-nodate'),
path('admin/core/snapshot/add/', RedirectView.as_view(url='/add/')),
path('add/', AddView.as_view(), name='add'),

View File

@@ -1,13 +1,16 @@
__package__ = 'archivebox.core'
import os
import posixpath
from glob import glob, escape
from django.utils import timezone
import inspect
from typing import Callable, get_type_hints
from pathlib import Path
from urllib.parse import urlparse
from django.shortcuts import render, redirect
from django.http import HttpRequest, HttpResponse, Http404
from django.http import HttpRequest, HttpResponse, Http404, HttpResponseForbidden
from django.utils.html import format_html, mark_safe
from django.views import View
from django.views.generic.list import ListView
@@ -31,6 +34,21 @@ from archivebox.misc.logging_util import printable_filesize
from archivebox.search import query_search_index
from archivebox.core.models import Snapshot
from archivebox.core.host_utils import build_snapshot_url
def _files_index_target(snapshot: Snapshot, archivefile: str | None) -> str:
    """Resolve which directory the ?files= browser should list.

    'index.html' and paths pointing at regular files collapse to their
    parent directory; '' means the snapshot root.
    """
    target = archivefile or ''
    if target == 'index.html':
        target = ''
    if (Path(snapshot.output_dir) / target).is_file():
        target = str(Path(target).parent)
    return '' if target == '.' else target
from archivebox.core.forms import AddLinkForm
from archivebox.crawls.models import Crawl
from archivebox.hooks import get_enabled_plugins, get_plugin_name
@@ -86,13 +104,95 @@ class SnapshotView(View):
def render_live_index(request, snapshot):
TITLE_LOADING_MSG = 'Not yet archived...'
outputs = snapshot.discover_outputs()
hidden_card_plugins = {'archivedotorg', 'favicon', 'title'}
outputs = [
out for out in snapshot.discover_outputs()
if (out.get('size') or 0) > 0 and out.get('name') not in hidden_card_plugins
]
archiveresults = {out['name']: out for out in outputs}
snap_dir = Path(snapshot.output_dir)
# Get available extractor plugins from hooks (sorted by numeric prefix for ordering)
# Convert to base names for display ordering
all_plugins = [get_plugin_name(e) for e in get_enabled_plugins()]
accounted_entries: set[str] = set()
for output in outputs:
output_name = output.get('name') or ''
if output_name:
accounted_entries.add(output_name)
output_path = output.get('path') or ''
if not output_path:
continue
parts = Path(output_path).parts
if parts:
accounted_entries.add(parts[0])
ignore_names = {
'.DS_Store',
'index.html',
'index.json',
'index.jsonl',
'favicon.ico',
}
ignored_suffixes = {'.log', '.pid', '.sh'}
max_loose_scan = 300
def has_meaningful_files(dir_path: Path) -> bool:
    # True if dir_path contains at least one non-hidden file with size > 0
    # whose suffix is not in the closure's ignored_suffixes set.
    # After max_loose_scan entries it gives up and assumes the dir is
    # meaningful, to bound filesystem work on huge directories.
    scanned = 0
    for file_path in dir_path.rglob('*'):
        scanned += 1
        if scanned > max_loose_scan:
            # too many entries to scan exhaustively -- assume meaningful
            return True
        if file_path.is_dir() or file_path.name.startswith('.'):
            continue
        if file_path.suffix.lower() in ignored_suffixes:
            continue
        try:
            if file_path.stat().st_size == 0:
                continue
        except OSError:
            # unreadable or vanished file: ignore it
            continue
        return True
    return False
unaccounted_entries = []
if snap_dir.exists():
for entry in snap_dir.iterdir():
name = entry.name
if name.startswith('.') or name in ignore_names or name in accounted_entries:
continue
is_dir = entry.is_dir()
is_meaningful = False
size = None
if is_dir:
is_meaningful = has_meaningful_files(entry)
elif entry.is_file():
if entry.suffix.lower() not in ignored_suffixes:
try:
size = entry.stat().st_size
is_meaningful = size > 0
except OSError:
size = None
is_meaningful = False
unaccounted_entries.append({
'name': name,
'path': name,
'is_dir': is_dir,
'size': size,
'is_meaningful': is_meaningful,
})
unaccounted_entries.sort(key=lambda item: item['name'].lower())
loose_items = [item for item in unaccounted_entries if item['is_meaningful']]
failed_exclude_suffixes = {'.json', '.jsonl', '.sh', '.log'}
failed_items = [
item for item in unaccounted_entries
if not item['is_meaningful']
and not (
not item['is_dir']
and Path(item['name']).suffix.lower() in failed_exclude_suffixes
)
]
preview_priority = [
'singlefile',
'screenshot',
@@ -111,12 +211,48 @@ class SnapshotView(View):
break
snapshot_info = snapshot.to_dict(extended=True)
related_snapshots_qs = SnapshotView.find_snapshots_for_url(snapshot.url)
related_snapshots = list(
related_snapshots_qs.exclude(id=snapshot.id).order_by('-bookmarked_at', '-created_at', '-timestamp')[:25]
)
related_years_map: dict[int, list[Snapshot]] = {}
for snap in [snapshot, *related_snapshots]:
snap_dt = snap.bookmarked_at or snap.created_at or snap.downloaded_at
if not snap_dt:
continue
related_years_map.setdefault(snap_dt.year, []).append(snap)
related_years = []
for year, snaps in related_years_map.items():
snaps_sorted = sorted(
snaps,
key=lambda s: (s.bookmarked_at or s.created_at or s.downloaded_at or timezone.now()),
reverse=True,
)
related_years.append({
'year': year,
'latest': snaps_sorted[0],
'snapshots': snaps_sorted,
})
related_years.sort(key=lambda item: item['year'], reverse=True)
try:
warc_path = 'warc/' + list(Path(snap_dir).glob('warc/*.warc.*'))[0].name
except IndexError:
warc_path = 'warc/'
ordered_outputs = sorted(
archiveresults.values(),
key=lambda r: all_types.index(r['name']) if r['name'] in all_types else -r['size'],
)
non_compact_outputs = [
out for out in ordered_outputs
if not out.get('is_compact') and not out.get('is_metadata')
]
compact_outputs = [
out for out in ordered_outputs
if out.get('is_compact') or out.get('is_metadata')
]
context = {
**snapshot_info,
'title': htmlencode(
@@ -131,9 +267,13 @@ class SnapshotView(View):
'oldest_archive_date': ts_to_date_str(snapshot.oldest_archive_date),
'warc_path': warc_path,
'PREVIEW_ORIGINALS': SERVER_CONFIG.PREVIEW_ORIGINALS,
'archiveresults': sorted(archiveresults.values(), key=lambda r: all_types.index(r['name']) if r['name'] in all_types else -r['size']),
'archiveresults': [*non_compact_outputs, *compact_outputs],
'best_result': best_result,
'snapshot': snapshot, # Pass the snapshot object for template tags
'related_snapshots': related_snapshots,
'related_years': related_years,
'loose_items': loose_items,
'failed_items': failed_items,
}
return render(template_name='core/snapshot_live.html', request=request, context=context)
@@ -168,13 +308,20 @@ class SnapshotView(View):
target_path = f'{target_path}?{query}'
return redirect(target_path)
if archivefile == 'index.html':
if request.GET.get('files'):
target_path = _files_index_target(snapshot, archivefile)
response = serve_static_with_byterange_support(
request, target_path, document_root=snapshot.output_dir, show_indexes=True,
)
elif archivefile == 'index.html':
# if they requested snapshot index, serve live rendered template instead of static html
response = self.render_live_index(request, snapshot)
else:
response = serve_static_with_byterange_support(
request, archivefile, document_root=snapshot.output_dir, show_indexes=True,
)
target = build_snapshot_url(str(snapshot.id), archivefile, request=request)
query = request.META.get('QUERY_STRING')
if query:
target = f'{target}?{query}'
return redirect(target)
response["Link"] = f'<{snapshot.url}>; rel="canonical"'
return response
except Snapshot.DoesNotExist:
@@ -328,13 +475,16 @@ class SnapshotView(View):
class SnapshotPathView(View):
"""Serve snapshots by the new URL scheme: /<username>/<YYYYMMDD>/<domain>/<uuid>/..."""
def get(self, request, username: str, date: str, domain: str | None = None, snapshot_id: str | None = None, path: str = "", url: str | None = None):
def get(self, request, username: str, date: str | None = None, domain: str | None = None, snapshot_id: str | None = None, path: str = "", url: str | None = None):
if not request.user.is_authenticated and not SERVER_CONFIG.PUBLIC_SNAPSHOTS:
return redirect(f'/admin/login/?next={request.path}')
if username == 'system':
return redirect(request.path.replace('/system/', '/web/', 1))
if date and domain and domain == date:
raise Http404
requested_url = url
if not requested_url and domain and domain.startswith(('http://', 'https://')):
requested_url = domain
@@ -358,19 +508,20 @@ class SnapshotPathView(View):
else:
qs = Snapshot.objects.filter(crawl__created_by__username=username_lookup)
try:
if len(date) == 4:
qs = qs.filter(created_at__year=int(date))
elif len(date) == 6:
qs = qs.filter(created_at__year=int(date[:4]), created_at__month=int(date[4:6]))
elif len(date) == 8:
qs = qs.filter(
created_at__year=int(date[:4]),
created_at__month=int(date[4:6]),
created_at__day=int(date[6:8]),
)
except ValueError:
pass
if date:
try:
if len(date) == 4:
qs = qs.filter(created_at__year=int(date))
elif len(date) == 6:
qs = qs.filter(created_at__year=int(date[:4]), created_at__month=int(date[4:6]))
elif len(date) == 8:
qs = qs.filter(
created_at__year=int(date[:4]),
created_at__month=int(date[4:6]),
created_at__day=int(date[6:8]),
)
except ValueError:
pass
if requested_url:
snapshot = qs.order_by('-created_at', '-bookmarked_at', '-timestamp').first()
@@ -401,7 +552,10 @@ class SnapshotPathView(View):
)
canonical_base = snapshot.url_path
requested_base = f'{username}/{date}/{domain or url or ""}'
if date:
requested_base = f'{username}/{date}/{domain or url or ""}'
else:
requested_base = f'{username}/{domain or url or ""}'
if snapshot_id:
requested_base = f'{requested_base}/{snapshot_id}'
if canonical_base != requested_base:
@@ -412,6 +566,18 @@ class SnapshotPathView(View):
return redirect(target)
archivefile = path or "index.html"
if archivefile != "index.html" and not request.GET.get('files'):
target = build_snapshot_url(str(snapshot.id), archivefile, request=request)
query = request.META.get('QUERY_STRING')
if query:
target = f'{target}?{query}'
return redirect(target)
if request.GET.get('files'):
target_path = _files_index_target(snapshot, archivefile)
return serve_static_with_byterange_support(
request, target_path, document_root=snapshot.output_dir, show_indexes=True,
)
if archivefile == "index.html":
return SnapshotView.render_live_index(request, snapshot)
@@ -421,6 +587,202 @@ class SnapshotPathView(View):
)
def _safe_archive_relpath(path: str) -> str | None:
if not path:
return ""
cleaned = posixpath.normpath(path)
cleaned = cleaned.lstrip("/")
if cleaned.startswith("..") or "/../" in f"/{cleaned}/":
return None
return cleaned
def _latest_response_match(domain: str, rel_path: str) -> tuple[Path, Path] | None:
    """Locate the newest archived response file for domain/rel_path.

    Globs users/*/snapshots/*/<domain>/*/responses/<domain>/<rel_path>
    under DATA_DIR and returns (responses_root, path_relative_to_root)
    for the match with the newest snapshot date dir, or None.
    """
    if not domain or not rel_path:
        return None
    hostname = domain.split(":", 1)[0].lower()
    # TODO: optimize by querying output_files in DB instead of globbing filesystem
    pattern = str(
        DATA_DIR / "users" / "*" / "snapshots" / "*"
        / escape(hostname) / "*" / "responses" / escape(hostname) / escape(rel_path)
    )
    matches = glob(pattern)
    if not matches:
        return None

    def newest_key(candidate: str) -> tuple[str, str]:
        # Rank by the snapshot date dir (users/<u>/snapshots/<date>/...),
        # breaking ties by lexicographic path order.
        segments = Path(candidate).parts
        try:
            snapshot_date = segments[segments.index("snapshots") + 1]
        except Exception:
            snapshot_date = ""
        return (snapshot_date, candidate)

    best_path = Path(max(matches, key=newest_key))
    segments = best_path.parts
    try:
        responses_idx = segments.index("responses")
    except ValueError:
        return None
    return Path(*segments[: responses_idx + 1]), Path(*segments[responses_idx + 1:])
def _latest_responses_root(domain: str) -> Path | None:
    """Return the responses/<domain> dir of the newest snapshot for *domain*, or None."""
    if not domain:
        return None
    hostname = domain.split(":", 1)[0].lower()
    pattern = str(
        DATA_DIR / "users" / "*" / "snapshots" / "*"
        / escape(hostname) / "*" / "responses" / escape(hostname)
    )
    matches = glob(pattern)
    if not matches:
        return None

    def newest_key(candidate: str) -> tuple[str, str]:
        # Rank by the snapshot date dir, breaking ties by path order.
        segments = Path(candidate).parts
        try:
            snapshot_date = segments[segments.index("snapshots") + 1]
        except Exception:
            snapshot_date = ""
        return (snapshot_date, candidate)

    return Path(max(matches, key=newest_key))
def _serve_responses_path(request, responses_root: Path, rel_path: str, show_indexes: bool):
    """Try to serve rel_path from a responses/ tree, with index.html fallbacks.

    Returns the static-file response on success, or None when nothing matched.
    """
    rel_path = rel_path or ""
    if rel_path.endswith("/"):
        rel_path = f"{rel_path}index.html"

    attempts: list[str] = []
    if "." not in Path(rel_path).name:
        # extensionless request: try it as a directory index first
        attempts.append(f"{rel_path.rstrip('/')}/index.html")
    attempts.append(rel_path)

    for attempt in attempts:
        try:
            return serve_static_with_byterange_support(
                request,
                attempt,
                document_root=str(responses_root),
                show_indexes=show_indexes,
            )
        except Http404:
            continue

    if rel_path.endswith("index.html"):
        # no index file on disk: fall back to a directory listing
        directory = rel_path[: -len("index.html")]
        try:
            return serve_static_with_byterange_support(
                request,
                directory,
                document_root=str(responses_root),
                show_indexes=True,
            )
        except Http404:
            return None
    return None
class SnapshotHostView(View):
    """Serve snapshot directory contents on <snapshot_id>.<listen_host>/<path>."""

    def get(self, request, snapshot_id: str, path: str = ""):
        """Serve *path* from the snapshot's output dir.

        Falls back to the snapshot's saved responses/<domain>/ tree when
        the file isn't present at the top level. Raises Http404 when the
        snapshot or the file cannot be found.
        """
        # Anonymous access requires PUBLIC_SNAPSHOTS to be enabled.
        if not request.user.is_authenticated and not SERVER_CONFIG.PUBLIC_SNAPSHOTS:
            return HttpResponseForbidden("Public snapshots are disabled.")
        # Resolve the snapshot: exact pk first, then by unique id prefix,
        # then (if the prefix is ambiguous) the first prefix match.
        snapshot = None
        if snapshot_id:
            try:
                snapshot = Snapshot.objects.get(pk=snapshot_id)
            except Snapshot.DoesNotExist:
                try:
                    snapshot = Snapshot.objects.get(id__startswith=snapshot_id)
                except Snapshot.DoesNotExist:
                    snapshot = None
                except Snapshot.MultipleObjectsReturned:
                    snapshot = Snapshot.objects.filter(id__startswith=snapshot_id).first()
        if not snapshot:
            raise Http404
        rel_path = path or ""
        show_indexes = bool(request.GET.get("files"))
        # Directory-style requests get a listing (?files=...) or index.html.
        if not rel_path or rel_path.endswith("/"):
            if show_indexes:
                rel_path = rel_path.rstrip("/")
            else:
                rel_path = f"{rel_path}index.html"
        rel_path = _safe_archive_relpath(rel_path)
        if rel_path is None:
            # rejected: path tried to escape the archive dir via '..'
            raise Http404
        try:
            return serve_static_with_byterange_support(
                request,
                rel_path,
                document_root=snapshot.output_dir,
                show_indexes=show_indexes,
            )
        except Http404:
            pass
        # Fallback to responses/<domain>/<path>
        host = urlparse(snapshot.url).hostname or snapshot.domain
        responses_root = Path(snapshot.output_dir) / "responses" / host
        if responses_root.exists():
            response = _serve_responses_path(request, responses_root, rel_path, show_indexes)
            if response is not None:
                return response
        raise Http404
class OriginalDomainHostView(View):
    """Serve responses from the most recent snapshot when using <domain>.<listen_host>/<path>."""

    def get(self, request, domain: str, path: str = ""):
        """Serve *path* out of the newest archived responses for *domain*."""
        if not request.user.is_authenticated and not SERVER_CONFIG.PUBLIC_SNAPSHOTS:
            return HttpResponseForbidden("Public snapshots are disabled.")

        requested = path or ""
        if not requested or requested.endswith("/"):
            requested = f"{requested}index.html"
        requested = _safe_archive_relpath(requested)
        if requested is None:
            # traversal attempt rejected
            raise Http404

        hostname = domain.lower()
        match = _latest_response_match(hostname, requested)
        # Extensionless paths: retry as a directory index, then as <path>.html
        if not match and "." not in Path(requested).name:
            match = _latest_response_match(hostname, f"{requested.rstrip('/')}/index.html")
        if not match and "." not in Path(requested).name:
            match = _latest_response_match(hostname, f"{requested}.html")

        show_indexes = bool(request.GET.get("files"))
        if match:
            root, relative = match
            response = _serve_responses_path(request, root, str(relative), show_indexes)
            if response is not None:
                return response

        # If no direct match, try serving directory index from latest responses root
        fallback_root = _latest_responses_root(hostname)
        if fallback_root:
            response = _serve_responses_path(request, fallback_root, requested, show_indexes)
            if response is not None:
                return response
        raise Http404
class PublicIndexView(ListView):
template_name = 'public_index.html'
model = Snapshot
@@ -508,7 +870,7 @@ class AddView(UserPassesTestMixin, FormView):
'available_tags': list(Tag.objects.all().order_by('name').values_list('name', flat=True)),
}
def form_valid(self, form):
def _create_crawl_from_form(self, form, *, created_by_id=None) -> Crawl:
urls = form.cleaned_data["url"]
print(f'[+] Adding URL: {urls}')
@@ -522,13 +884,21 @@ class AddView(UserPassesTestMixin, FormView):
update = form.cleaned_data.get("update", False)
index_only = form.cleaned_data.get("index_only", False)
notes = form.cleaned_data.get("notes", "")
custom_config = form.cleaned_data.get("config", {})
custom_config = form.cleaned_data.get("config") or {}
from archivebox.config.permissions import HOSTNAME
if created_by_id is None:
if self.request.user.is_authenticated:
created_by_id = self.request.user.pk
else:
from archivebox.base_models.models import get_or_create_system_user_pk
created_by_id = get_or_create_system_user_pk()
created_by_name = self.request.user.username if self.request.user.is_authenticated else 'web'
# 1. save the provided urls to sources/2024-11-05__23-59-59__web_ui_add_by_user_<user_pk>.txt
sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__web_ui_add_by_user_{self.request.user.pk}.txt'
sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__web_ui_add_by_user_{created_by_id}.txt'
sources_file.write_text(urls if isinstance(urls, str) else '\n'.join(urls))
# 2. create a new Crawl with the URLs from the file
@@ -552,8 +922,8 @@ class AddView(UserPassesTestMixin, FormView):
max_depth=depth,
tags_str=tag,
notes=notes,
label=f'{self.request.user.username}@{HOSTNAME}{self.request.path} {timestamp}',
created_by_id=self.request.user.pk,
label=f'{created_by_name}@{HOSTNAME}{self.request.path} {timestamp}',
created_by_id=created_by_id,
config=config
)
@@ -566,7 +936,7 @@ class AddView(UserPassesTestMixin, FormView):
is_enabled=True,
label=crawl.label,
notes=f"Auto-created from add page. {notes}".strip(),
created_by_id=self.request.user.pk,
created_by_id=created_by_id,
)
crawl.schedule = crawl_schedule
crawl.save(update_fields=['schedule'])
@@ -576,7 +946,13 @@ class AddView(UserPassesTestMixin, FormView):
# from archivebox.crawls.actors import CrawlActor
# from archivebox.core.actors import SnapshotActor, ArchiveResultActor
return crawl
def form_valid(self, form):
crawl = self._create_crawl_from_form(form)
urls = form.cleaned_data["url"]
schedule = form.cleaned_data.get("schedule", "").strip()
rough_url_count = urls.count('://')
# Build success message with schedule link if created
@@ -593,6 +969,74 @@ class AddView(UserPassesTestMixin, FormView):
return redirect(crawl.admin_change_url)
class WebAddView(AddView):
    """/web/<url> endpoint: redirect to the latest snapshot of <url>, or
    (for users passing AddView's permission check) archive it on the fly
    using the add form's defaults and redirect to the new snapshot.
    """

    def _latest_snapshot_for_url(self, requested_url: str):
        """Return the most recent Snapshot matching *requested_url*, or None."""
        return SnapshotView.find_snapshots_for_url(requested_url).order_by(
            '-created_at', '-bookmarked_at', '-timestamp'
        ).first()

    def _normalize_add_url(self, requested_url: str) -> str:
        """Ensure the URL has a scheme, defaulting to https://."""
        if requested_url.startswith(('http://', 'https://')):
            return requested_url
        return f'https://{requested_url}'

    def dispatch(self, request, *args, **kwargs):
        """Short-circuit to an existing snapshot before the usual AddView flow.

        Visitors failing test_func() get a 404 notice instead of the add
        form when no snapshot exists for the requested URL.
        """
        requested_url = urldecode(kwargs.get('url', '') or '')
        if requested_url:
            snapshot = self._latest_snapshot_for_url(requested_url)
            if snapshot:
                return redirect(f'/{snapshot.url_path}')
            if not self.test_func():
                # not allowed to trigger archiving: show a 404-style notice
                return HttpResponse(
                    format_html(
                        (
                            '<center><br/><br/><br/>'
                            'No Snapshots match the given url: <code>{}</code><br/><br/><br/>'
                            'Return to the <a href="/" target="_top">Main Index</a>'
                            '</center>'
                        ),
                        requested_url or '',
                    ),
                    content_type="text/html",
                    status=404,
                )
        return super().dispatch(request, *args, **kwargs)

    def get(self, request, url: str):
        """Archive *url* immediately with the add form's default values,
        then redirect to the new snapshot's page.
        """
        requested_url = urldecode(url)
        if not requested_url:
            raise Http404
        snapshot = self._latest_snapshot_for_url(requested_url)
        if snapshot:
            return redirect(f'/{snapshot.url_path}')
        add_url = self._normalize_add_url(requested_url)
        # Build a form submission mirroring the add form's default values.
        defaults_form = self.form_class()
        form_data = {
            'url': add_url,
            'depth': defaults_form.fields['depth'].initial or '0',
            'persona': defaults_form.fields['persona'].initial or 'Default',
            'config': {},
        }
        if defaults_form.fields['update'].initial:
            form_data['update'] = 'on'
        if defaults_form.fields['overwrite'].initial:
            form_data['overwrite'] = 'on'
        if defaults_form.fields['index_only'].initial:
            form_data['index_only'] = 'on'
        form = self.form_class(data=form_data)
        if not form.is_valid():
            return self.form_invalid(form)
        crawl = self._create_crawl_from_form(form)
        snapshot = Snapshot.from_json({'url': add_url, 'tags': form.cleaned_data.get('tag', '')}, overrides={'crawl': crawl})
        return redirect(f'/{snapshot.url_path}')
class HealthCheckView(View):
"""
A Django view that renders plain text "OK" for service discovery tools
@@ -617,11 +1061,19 @@ def live_progress_view(request):
from archivebox.workers.orchestrator import Orchestrator
from archivebox.crawls.models import Crawl
from archivebox.core.models import Snapshot, ArchiveResult
from archivebox.machine.models import Process, Machine
from django.db.models import Case, When, Value, IntegerField
# Get orchestrator status
orchestrator_running = Orchestrator.is_running()
total_workers = Orchestrator().get_total_worker_count() if orchestrator_running else 0
machine = Machine.current()
orchestrator_proc = Process.objects.filter(
machine=machine,
process_type=Process.TypeChoices.ORCHESTRATOR,
status=Process.StatusChoices.RUNNING,
).order_by('-started_at').first()
orchestrator_pid = orchestrator_proc.pid if orchestrator_proc else None
# Get model counts by status
crawls_pending = Crawl.objects.filter(status=Crawl.StatusChoices.QUEUED).count()
@@ -653,24 +1105,47 @@ def live_progress_view(request):
ext = embed.lower().split('.')[-1] if '.' in embed else ''
is_embeddable = ext in ('png', 'jpg', 'jpeg', 'gif', 'webp', 'svg', 'ico', 'pdf', 'html')
if is_embeddable or ar.plugin in ('screenshot', 'favicon', 'dom'):
archive_path = embed or ''
recent_thumbnails.append({
'id': str(ar.id),
'plugin': ar.plugin,
'snapshot_id': str(ar.snapshot_id),
'snapshot_url': ar.snapshot.url[:60] if ar.snapshot else '',
'embed_path': embed,
'archive_path': f'/{ar.snapshot.archive_path}/{embed}' if ar.snapshot else '',
'archive_path': archive_path,
'archive_url': build_snapshot_url(str(ar.snapshot_id), archive_path, request=request) if archive_path else '',
'end_ts': ar.end_ts.isoformat() if ar.end_ts else None,
})
# Build hierarchical active crawls with nested snapshots and archive results
from django.db.models import Prefetch
running_workers = Process.objects.filter(
machine=machine,
process_type=Process.TypeChoices.WORKER,
status=Process.StatusChoices.RUNNING,
)
crawl_worker_pids: dict[str, int] = {}
snapshot_worker_pids: dict[str, int] = {}
for proc in running_workers:
env = proc.env or {}
if not isinstance(env, dict):
continue
if proc.worker_type == 'crawl':
crawl_id = env.get('CRAWL_ID')
if crawl_id:
crawl_worker_pids[str(crawl_id)] = proc.pid
elif proc.worker_type == 'snapshot':
snapshot_id = env.get('SNAPSHOT_ID')
if snapshot_id:
snapshot_worker_pids[str(snapshot_id)] = proc.pid
active_crawls_qs = Crawl.objects.filter(
status__in=[Crawl.StatusChoices.QUEUED, Crawl.StatusChoices.STARTED]
).prefetch_related(
'snapshot_set',
'snapshot_set__archiveresult_set',
'snapshot_set__archiveresult_set__process',
).distinct().order_by('-modified_at')[:10]
active_crawls = []
@@ -710,8 +1185,9 @@ def live_progress_view(request):
failed_plugins = sum(1 for ar in snapshot_results if ar.status == ArchiveResult.StatusChoices.FAILED)
pending_plugins = sum(1 for ar in snapshot_results if ar.status == ArchiveResult.StatusChoices.QUEUED)
# Calculate snapshot progress
snapshot_progress = int(((completed_plugins + failed_plugins) / total_plugins) * 100) if total_plugins > 0 else 0
# Calculate snapshot progress using per-plugin progress
now = timezone.now()
plugin_progress_values: list[int] = []
# Get all extractor plugins for this snapshot (already prefetched, sort in Python)
# Order: started first, then queued, then completed
@@ -724,14 +1200,42 @@ def live_progress_view(request):
}
return (status_order.get(ar.status, 4), ar.plugin)
all_plugins = [
{
all_plugins = []
for ar in sorted(snapshot_results, key=plugin_sort_key):
status = ar.status
progress_value = 0
if status in (
ArchiveResult.StatusChoices.SUCCEEDED,
ArchiveResult.StatusChoices.FAILED,
ArchiveResult.StatusChoices.SKIPPED,
):
progress_value = 100
elif status == ArchiveResult.StatusChoices.STARTED:
started_at = ar.start_ts or (ar.process.started_at if ar.process_id and ar.process else None)
timeout = ar.timeout or 120
if started_at and timeout:
elapsed = max(0.0, (now - started_at).total_seconds())
progress_value = int(min(99, max(1, (elapsed / float(timeout)) * 100)))
else:
progress_value = 1
else:
progress_value = 0
plugin_progress_values.append(progress_value)
plugin_payload = {
'id': str(ar.id),
'plugin': ar.plugin,
'status': ar.status,
'status': status,
}
for ar in sorted(snapshot_results, key=plugin_sort_key)
]
if ar.process_id and ar.process and ar.process.status == Process.StatusChoices.RUNNING:
plugin_payload['pid'] = ar.process.pid
if status == ArchiveResult.StatusChoices.STARTED:
plugin_payload['progress'] = progress_value
plugin_payload['timeout'] = ar.timeout or 120
all_plugins.append(plugin_payload)
snapshot_progress = int(sum(plugin_progress_values) / total_plugins) if total_plugins > 0 else 0
active_snapshots_for_crawl.append({
'id': str(snapshot.id),
@@ -744,6 +1248,7 @@ def live_progress_view(request):
'failed_plugins': failed_plugins,
'pending_plugins': pending_plugins,
'all_plugins': all_plugins,
'worker_pid': snapshot_worker_pids.get(str(snapshot.id)),
})
# Check if crawl can start (for debugging stuck crawls)
@@ -772,10 +1277,12 @@ def live_progress_view(request):
'urls_preview': urls_preview,
'retry_at_future': retry_at_future,
'seconds_until_retry': seconds_until_retry,
'worker_pid': crawl_worker_pids.get(str(crawl.id)),
})
return JsonResponse({
'orchestrator_running': orchestrator_running,
'orchestrator_pid': orchestrator_pid,
'total_workers': total_workers,
'crawls_pending': crawls_pending,
'crawls_started': crawls_started,

View File

@@ -1,8 +1,11 @@
__package__ = 'archivebox.core'
import json
import re
import hashlib
from django import forms
from django.utils.html import escape
from django.utils.safestring import mark_safe
class TagEditorWidget(forms.Widget):
@@ -27,6 +30,23 @@ class TagEditorWidget(forms.Widget):
"""Escape HTML entities in value."""
return escape(str(value)) if value else ''
def _normalize_id(self, value):
    """Normalize IDs for HTML + JS usage (letters, digits, underscore; JS-safe start)."""
    candidate = re.sub(r'[^A-Za-z0-9_]', '_', str(value))
    if candidate and re.match(r'[A-Za-z_]', candidate):
        return candidate
    # empty or starts with a digit: prefix so it's a valid JS identifier
    return f't_{candidate}'
def _tag_style(self, value):
"""Compute a stable pastel color style for a tag value."""
tag = (value or '').strip().lower()
digest = hashlib.md5(tag.encode('utf-8')).hexdigest()
hue = int(digest[:4], 16) % 360
bg = f'hsl({hue}, 70%, 92%)'
border = f'hsl({hue}, 60%, 82%)'
fg = f'hsl({hue}, 35%, 28%)'
return f'--tag-bg: {bg}; --tag-border: {border}; --tag-fg: {fg};'
def render(self, name, value, attrs=None, renderer=None):
"""
Render the tag editor widget.
@@ -67,13 +87,14 @@ class TagEditorWidget(forms.Widget):
elif isinstance(value, str):
tags = sorted([t.strip() for t in value.split(',') if t.strip()])
widget_id = attrs.get('id', name) if attrs else name
widget_id_raw = attrs.get('id', name) if attrs else name
widget_id = self._normalize_id(widget_id_raw)
# Build pills HTML
pills_html = ''
for tag in tags:
pills_html += f'''
<span class="tag-pill" data-tag="{self._escape(tag)}">
<span class="tag-pill" data-tag="{self._escape(tag)}" style="{self._tag_style(tag)}">
{self._escape(tag)}
<button type="button" class="tag-remove-btn" data-tag-name="{self._escape(tag)}">&times;</button>
</span>
@@ -92,6 +113,7 @@ class TagEditorWidget(forms.Widget):
placeholder="Add tag..."
autocomplete="off"
onkeydown="handleTagKeydown_{widget_id}(event)"
onkeypress="if(event.key==='Enter' || event.keyCode===13){{event.preventDefault(); event.stopPropagation();}}"
oninput="fetchTagAutocomplete_{widget_id}(this.value)"
>
<datalist id="{widget_id}_datalist"></datalist>
@@ -112,6 +134,47 @@ class TagEditorWidget(forms.Widget):
document.getElementById('{widget_id}').value = currentTags_{widget_id}.join(',');
}};
function computeTagStyle_{widget_id}(tagName) {{
var hash = 0;
var name = String(tagName || '').toLowerCase();
for (var i = 0; i < name.length; i++) {{
hash = (hash * 31 + name.charCodeAt(i)) % 360;
}}
var bg = 'hsl(' + hash + ', 70%, 92%)';
var border = 'hsl(' + hash + ', 60%, 82%)';
var fg = 'hsl(' + hash + ', 35%, 28%)';
return {{ bg: bg, border: border, fg: fg }};
}}
function applyTagStyle_{widget_id}(el, tagName) {{
var colors = computeTagStyle_{widget_id}(tagName);
el.style.setProperty('--tag-bg', colors.bg);
el.style.setProperty('--tag-border', colors.border);
el.style.setProperty('--tag-fg', colors.fg);
}}
function getApiKey() {{
return (window.ARCHIVEBOX_API_KEY || '').trim();
}}
function buildApiUrl(path) {{
var apiKey = getApiKey();
if (!apiKey) return path;
var sep = path.indexOf('?') !== -1 ? '&' : '?';
return path + sep + 'api_key=' + encodeURIComponent(apiKey);
}}
function buildApiHeaders() {{
var headers = {{
'Content-Type': 'application/json',
}};
var apiKey = getApiKey();
if (apiKey) headers['X-ArchiveBox-API-Key'] = apiKey;
var csrfToken = getCSRFToken();
if (csrfToken) headers['X-CSRFToken'] = csrfToken;
return headers;
}}
window.addTag_{widget_id} = function(tagName) {{
tagName = tagName.trim();
if (!tagName) return;
@@ -139,12 +202,9 @@ class TagEditorWidget(forms.Widget):
document.getElementById('{widget_id}_input').value = '';
// Create tag via API if it doesn't exist (fire and forget)
fetch('/api/v1/core/tags/create/', {{
fetch(buildApiUrl('/api/v1/core/tags/create/'), {{
method: 'POST',
headers: {{
'Content-Type': 'application/json',
'X-CSRFToken': getCSRFToken()
}},
headers: buildApiHeaders(),
body: JSON.stringify({{ name: tagName }})
}}).catch(function(err) {{
console.log('Tag creation note:', err);
@@ -166,6 +226,7 @@ class TagEditorWidget(forms.Widget):
var pill = document.createElement('span');
pill.className = 'tag-pill';
pill.setAttribute('data-tag', tag);
applyTagStyle_{widget_id}(pill, tag);
var tagText = document.createTextNode(tag);
pill.appendChild(tagText);
@@ -195,14 +256,16 @@ class TagEditorWidget(forms.Widget):
var input = event.target;
var value = input.value.trim();
if (event.key === 'Enter' || event.key === ' ' || event.key === ',') {{
if (event.key === 'Enter' || event.keyCode === 13 || event.key === ' ' || event.key === ',') {{
event.preventDefault();
event.stopPropagation();
if (value) {{
// Handle comma-separated values
value.split(',').forEach(function(tag) {{
addTag_{widget_id}(tag.trim());
}});
}}
return false;
}} else if (event.key === 'Backspace' && !value && currentTags_{widget_id}.length > 0) {{
// Remove last tag on backspace when input is empty
var lastTag = currentTags_{widget_id}.pop();
@@ -222,7 +285,7 @@ class TagEditorWidget(forms.Widget):
return;
}}
fetch('/api/v1/core/tags/autocomplete/?q=' + encodeURIComponent(query))
fetch(buildApiUrl('/api/v1/core/tags/autocomplete/?q=' + encodeURIComponent(query)))
.then(function(response) {{ return response.json(); }})
.then(function(data) {{
var datalist = document.getElementById('{widget_id}_datalist');
@@ -261,7 +324,7 @@ class TagEditorWidget(forms.Widget):
</script>
'''
return html
return mark_safe(html)
class InlineTagEditorWidget(TagEditorWidget):
@@ -295,20 +358,23 @@ class InlineTagEditorWidget(TagEditorWidget):
tag_data.sort(key=lambda x: x['name'].lower())
tags = [t['name'] for t in tag_data]
widget_id = f"inline_tags_{snapshot_id}" if snapshot_id else (attrs.get('id', name) if attrs else name)
widget_id_raw = f"inline_tags_{snapshot_id}" if snapshot_id else (attrs.get('id', name) if attrs else name)
widget_id = self._normalize_id(widget_id_raw)
# Build pills HTML with filter links
pills_html = ''
for td in tag_data:
pills_html += f'''
<span class="tag-pill" data-tag="{self._escape(td['name'])}" data-tag-id="{td['id']}">
<span class="tag-pill" data-tag="{self._escape(td['name'])}" data-tag-id="{td['id']}" style="{self._tag_style(td['name'])}">
<a href="/admin/core/snapshot/?tags__id__exact={td['id']}" class="tag-link">{self._escape(td['name'])}</a>
<button type="button" class="tag-remove-btn" data-tag-id="{td['id']}" data-tag-name="{self._escape(td['name'])}">&times;</button>
</span>
'''
tags_json = escape(json.dumps(tag_data))
html = f'''
<span id="{widget_id}_container" class="tag-editor-inline" onclick="focusInlineTagInput_{widget_id}(event)">
<span id="{widget_id}_container" class="tag-editor-inline" data-snapshot-id="{snapshot_id}" data-tags="{tags_json}">
<span id="{widget_id}_pills" class="tag-pills-inline">
{pills_html}
</span>
@@ -318,195 +384,10 @@ class InlineTagEditorWidget(TagEditorWidget):
list="{widget_id}_datalist"
placeholder="+"
autocomplete="off"
onkeydown="handleInlineTagKeydown_{widget_id}(event)"
oninput="fetchInlineTagAutocomplete_{widget_id}(this.value)"
onfocus="this.placeholder='add tag...'"
onblur="this.placeholder='+'"
data-inline-tag-input="1"
>
<datalist id="{widget_id}_datalist"></datalist>
</span>
<script>
(function() {{
var snapshotId_{widget_id} = '{snapshot_id}';
var currentTagData_{widget_id} = {json.dumps(tag_data)};
var autocompleteTimeout_{widget_id} = null;
window.focusInlineTagInput_{widget_id} = function(event) {{
event.stopPropagation();
if (event.target.classList.contains('tag-remove-btn') || event.target.classList.contains('tag-link')) return;
document.getElementById('{widget_id}_input').focus();
}};
window.addInlineTag_{widget_id} = function(tagName) {{
tagName = tagName.trim();
if (!tagName) return;
// Check if tag already exists
var exists = currentTagData_{widget_id}.some(function(t) {{
return t.name.toLowerCase() === tagName.toLowerCase();
}});
if (exists) {{
document.getElementById('{widget_id}_input').value = '';
return;
}}
// Add via API
fetch('/api/v1/core/tags/add-to-snapshot/', {{
method: 'POST',
headers: {{
'Content-Type': 'application/json',
'X-CSRFToken': getCSRFToken()
}},
body: JSON.stringify({{
snapshot_id: snapshotId_{widget_id},
tag_name: tagName
}})
}})
.then(function(response) {{ return response.json(); }})
.then(function(data) {{
if (data.success) {{
currentTagData_{widget_id}.push({{ id: data.tag_id, name: data.tag_name }});
currentTagData_{widget_id}.sort(function(a, b) {{
return a.name.toLowerCase().localeCompare(b.name.toLowerCase());
}});
rebuildInlinePills_{widget_id}();
}}
}})
.catch(function(err) {{
console.error('Error adding tag:', err);
}});
document.getElementById('{widget_id}_input').value = '';
}};
window.removeInlineTag_{widget_id} = function(tagId) {{
fetch('/api/v1/core/tags/remove-from-snapshot/', {{
method: 'POST',
headers: {{
'Content-Type': 'application/json',
'X-CSRFToken': getCSRFToken()
}},
body: JSON.stringify({{
snapshot_id: snapshotId_{widget_id},
tag_id: tagId
}})
}})
.then(function(response) {{ return response.json(); }})
.then(function(data) {{
if (data.success) {{
currentTagData_{widget_id} = currentTagData_{widget_id}.filter(function(t) {{
return t.id !== tagId;
}});
rebuildInlinePills_{widget_id}();
}}
}})
.catch(function(err) {{
console.error('Error removing tag:', err);
}});
}};
window.rebuildInlinePills_{widget_id} = function() {{
var container = document.getElementById('{widget_id}_pills');
container.innerHTML = '';
currentTagData_{widget_id}.forEach(function(td) {{
var pill = document.createElement('span');
pill.className = 'tag-pill';
pill.setAttribute('data-tag', td.name);
pill.setAttribute('data-tag-id', td.id);
var link = document.createElement('a');
link.href = '/admin/core/snapshot/?tags__id__exact=' + td.id;
link.className = 'tag-link';
link.textContent = td.name;
pill.appendChild(link);
var removeBtn = document.createElement('button');
removeBtn.type = 'button';
removeBtn.className = 'tag-remove-btn';
removeBtn.setAttribute('data-tag-id', td.id);
removeBtn.setAttribute('data-tag-name', td.name);
removeBtn.innerHTML = '&times;';
pill.appendChild(removeBtn);
container.appendChild(pill);
}});
}};
// Add event delegation for remove buttons
document.getElementById('{widget_id}_pills').addEventListener('click', function(event) {{
if (event.target.classList.contains('tag-remove-btn')) {{
event.stopPropagation();
event.preventDefault();
var tagId = parseInt(event.target.getAttribute('data-tag-id'), 10);
if (tagId) {{
removeInlineTag_{widget_id}(tagId);
}}
}}
}});
window.handleInlineTagKeydown_{widget_id} = function(event) {{
event.stopPropagation();
var input = event.target;
var value = input.value.trim();
if (event.key === 'Enter' || event.key === ',') {{
event.preventDefault();
if (value) {{
value.split(',').forEach(function(tag) {{
addInlineTag_{widget_id}(tag.trim());
}});
}}
}}
}};
window.fetchInlineTagAutocomplete_{widget_id} = function(query) {{
if (autocompleteTimeout_{widget_id}) {{
clearTimeout(autocompleteTimeout_{widget_id});
}}
autocompleteTimeout_{widget_id} = setTimeout(function() {{
if (!query || query.length < 1) {{
document.getElementById('{widget_id}_datalist').innerHTML = '';
return;
}}
fetch('/api/v1/core/tags/autocomplete/?q=' + encodeURIComponent(query))
.then(function(response) {{ return response.json(); }})
.then(function(data) {{
var datalist = document.getElementById('{widget_id}_datalist');
datalist.innerHTML = '';
(data.tags || []).forEach(function(tag) {{
var option = document.createElement('option');
option.value = tag.name;
datalist.appendChild(option);
}});
}})
.catch(function(err) {{
console.log('Autocomplete error:', err);
}});
}}, 150);
}};
function escapeHtml(text) {{
var div = document.createElement('div');
div.textContent = text;
return div.innerHTML;
}}
function getCSRFToken() {{
var cookies = document.cookie.split(';');
for (var i = 0; i < cookies.length; i++) {{
var cookie = cookies[i].trim();
if (cookie.startsWith('csrftoken=')) {{
return cookie.substring('csrftoken='.length);
}}
}}
var input = document.querySelector('input[name="csrfmiddlewaretoken"]');
return input ? input.value : '';
}}
}})();
</script>
'''
return html
return mark_safe(html)

View File

@@ -62,6 +62,7 @@ import json
import signal
import time
import subprocess
from functools import lru_cache
from pathlib import Path
from typing import List, Dict, Any, Optional, TypedDict
@@ -255,6 +256,7 @@ def run_hook(
records = process.get_records() # Get parsed JSONL output
"""
from archivebox.machine.models import Process, Machine
from archivebox.config.constants import CONSTANTS
import time
import sys
start_time = time.time()
@@ -264,6 +266,8 @@ def run_hook(
plugin_name = script.parent.name
plugin_config = get_plugin_special_config(plugin_name, config)
timeout = plugin_config['timeout']
if timeout:
timeout = min(int(timeout), int(CONSTANTS.MAX_HOOK_RUNTIME_SECONDS))
# Get current machine
machine = Machine.current()
@@ -568,6 +572,7 @@ def run_hooks(
return results
@lru_cache(maxsize=1)
def get_plugins() -> List[str]:
"""
Get list of available plugins by discovering Snapshot hooks.
@@ -988,6 +993,8 @@ def get_plugin_template(plugin: str, template_name: str, fallback: bool = True)
Template content as string, or None if not found and fallback=False.
"""
base_name = get_plugin_name(plugin)
if base_name in ('yt-dlp', 'youtube-dl'):
base_name = 'ytdlp'
for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
if not base_dir.exists():
@@ -1011,6 +1018,7 @@ def get_plugin_template(plugin: str, template_name: str, fallback: bool = True)
return None
@lru_cache(maxsize=None)
def get_plugin_icon(plugin: str) -> str:
"""
Get the icon for a plugin from its icon.html template.

View File

@@ -1685,8 +1685,11 @@ class Process(models.Model):
TimeoutError if process doesn't exit in time
"""
import time
from archivebox.config.constants import CONSTANTS
timeout = timeout or self.timeout
if self.process_type == self.TypeChoices.HOOK:
timeout = min(int(timeout), int(CONSTANTS.MAX_HOOK_RUNTIME_SECONDS))
start = time.time()
while True:

View File

@@ -1,3 +1,6 @@
import html
import json
import re
import os
import stat
import posixpath
@@ -10,6 +13,267 @@ from django.http import StreamingHttpResponse, Http404, HttpResponse, HttpRespon
from django.utils._os import safe_join
from django.utils.http import http_date
from django.utils.translation import gettext as _
from archivebox.config.common import SERVER_CONFIG
_HASHES_CACHE: dict[Path, tuple[float, dict[str, str]]] = {}
def _load_hash_map(snapshot_dir: Path) -> dict[str, str] | None:
hashes_path = snapshot_dir / 'hashes' / 'hashes.json'
if not hashes_path.exists():
return None
try:
mtime = hashes_path.stat().st_mtime
except OSError:
return None
cached = _HASHES_CACHE.get(hashes_path)
if cached and cached[0] == mtime:
return cached[1]
try:
data = json.loads(hashes_path.read_text(encoding='utf-8'))
except Exception:
return None
file_map = {str(entry.get('path')): entry.get('hash') for entry in data.get('files', []) if entry.get('path')}
_HASHES_CACHE[hashes_path] = (mtime, file_map)
return file_map
def _hash_for_path(document_root: Path, rel_path: str) -> str | None:
    """Return the recorded content hash for rel_path under document_root, if any."""
    hash_map = _load_hash_map(document_root)
    return hash_map.get(rel_path) if hash_map else None
def _cache_policy() -> str:
    """Cache-Control visibility: snapshots are shared-cacheable only when public."""
    if SERVER_CONFIG.PUBLIC_SNAPSHOTS:
        return 'public'
    return 'private'
# Ensure common web types are mapped consistently across platforms.
# (The system mime database varies between OSes; registering explicitly
# keeps archived files served with predictable Content-Types.)
_EXTRA_MIMETYPES = (
    ("text/html", ".html"),
    ("text/html", ".htm"),
    ("text/css", ".css"),
    ("application/javascript", ".js"),
    ("application/json", ".json"),
    ("application/x-ndjson", ".jsonl"),
    ("text/markdown", ".md"),
    ("text/yaml", ".yml"),
    ("text/yaml", ".yaml"),
    ("text/csv", ".csv"),
    ("text/tab-separated-values", ".tsv"),
    ("application/xml", ".xml"),
    ("image/svg+xml", ".svg"),
)
for _mime_type, _extension in _EXTRA_MIMETYPES:
    mimetypes.add_type(_mime_type, _extension)
try:
import markdown as _markdown
except Exception:
_markdown = None
MARKDOWN_INLINE_LINK_RE = re.compile(r'\[([^\]]+)\]\(([^)\s]+(?:\([^)]*\)[^)\s]*)*)\)')
MARKDOWN_INLINE_IMAGE_RE = re.compile(r'!\[([^\]]*)\]\(([^)]+)\)')
MARKDOWN_BOLD_RE = re.compile(r'\*\*([^*]+)\*\*')
MARKDOWN_ITALIC_RE = re.compile(r'(?<!\*)\*([^*]+)\*(?!\*)')
HTML_TAG_RE = re.compile(r'<[A-Za-z][^>]*>')
HTML_BODY_RE = re.compile(r'<body[^>]*>(.*)</body>', flags=re.IGNORECASE | re.DOTALL)
def _extract_markdown_candidate(text: str) -> str:
candidate = text
body_match = HTML_BODY_RE.search(candidate)
if body_match:
candidate = body_match.group(1)
candidate = re.sub(r'^\s*<p[^>]*>', '', candidate, flags=re.IGNORECASE)
candidate = re.sub(r'</p>\s*$', '', candidate, flags=re.IGNORECASE)
return candidate.strip()
def _looks_like_markdown(text: str) -> bool:
    """Heuristic: does the text contain enough markdown markers to render as markdown?"""
    lowered = text.lower()
    # A full HTML document should be served as-is, never re-rendered.
    if "<html" in lowered and "<head" in lowered and "</body>" in lowered:
        return False
    marker_count = (
        len(re.findall(r'^\s{0,3}#{1,6}\s+\S', text, flags=re.MULTILINE))  # headings
        + len(re.findall(r'^\s*[-*+]\s+\S', text, flags=re.MULTILINE))     # bullet lists
        + len(re.findall(r'^\s*\d+\.\s+\S', text, flags=re.MULTILINE))     # numbered lists
        + text.count('[TOC]')
        + len(MARKDOWN_INLINE_LINK_RE.findall(text))
        + text.count('\n---')
        + text.count('\n***')
    )
    # Threshold of 6 markers keeps ordinary prose from being misrendered.
    return marker_count >= 6
def _render_markdown_fallback(text: str) -> str:
    """Render markdown text to an HTML fragment.

    Uses the optional `markdown` package when it is available AND the text
    contains no raw HTML tags; otherwise falls back to a minimal line-based
    renderer supporting headings, lists, blockquotes, fenced code, hr,
    [TOC], and inline links/images/bold/italic.
    """
    # Prefer the real markdown library, but only for pure-markdown input —
    # raw HTML tags in the source are left to the fallback path.
    if _markdown is not None and not HTML_TAG_RE.search(text):
        try:
            return _markdown.markdown(
                text,
                extensions=["extra", "toc", "sane_lists"],
                output_format="html5",
            )
        except Exception:
            # Best-effort: any library failure falls through to the fallback.
            pass
    lines = text.splitlines()
    headings = []

    def slugify(value: str) -> str:
        # Anchor ids for headings/TOC; never empty so links stay valid.
        slug = re.sub(r'[^A-Za-z0-9]+', '-', value).strip('-')
        return slug or "section"

    # First pass: collect headings so a later [TOC] marker can be expanded.
    for raw_line in lines:
        heading_match = re.match(r'^\s{0,3}(#{1,6})\s+(.*)$', raw_line)
        if heading_match:
            level = len(heading_match.group(1))
            content = heading_match.group(2).strip()
            headings.append((level, content, slugify(content)))

    html_lines = []
    # Block-level state for the second pass.
    in_code = False        # inside a ``` fenced code block
    in_ul = False          # inside an open <ul>
    in_ol = False          # inside an open <ol>
    in_blockquote = False  # inside an open <blockquote>

    def render_inline(markup: str) -> str:
        # Inline conversions; images before links so ![..](..) isn't eaten.
        content = MARKDOWN_INLINE_IMAGE_RE.sub(r'<img alt="\1" src="\2">', markup)
        content = MARKDOWN_INLINE_LINK_RE.sub(r'<a href="\2">\1</a>', content)
        content = MARKDOWN_BOLD_RE.sub(r'<strong>\1</strong>', content)
        content = MARKDOWN_ITALIC_RE.sub(r'<em>\1</em>', content)
        return content

    def close_lists():
        # Close any open list before emitting a non-list block element.
        nonlocal in_ul, in_ol
        if in_ul:
            html_lines.append("</ul>")
            in_ul = False
        if in_ol:
            html_lines.append("</ol>")
            in_ol = False

    # Second pass: translate each source line into HTML, tracking block state.
    for raw_line in lines:
        line = raw_line.rstrip("\n")
        stripped = line.strip()
        if stripped.startswith("```"):
            # Fence toggles code mode on/off.
            if in_code:
                html_lines.append("</code></pre>")
                in_code = False
            else:
                close_lists()
                if in_blockquote:
                    html_lines.append("</blockquote>")
                    in_blockquote = False
                html_lines.append("<pre><code>")
                in_code = True
            continue
        if in_code:
            # Code lines are escaped verbatim, no inline formatting.
            html_lines.append(html.escape(line))
            continue
        if not stripped:
            # Blank line ends lists/blockquotes and becomes a soft break.
            close_lists()
            if in_blockquote:
                html_lines.append("</blockquote>")
                in_blockquote = False
            html_lines.append("<br/>")
            continue
        # Headings may be preceded by leftover HTML tags (kept as-is).
        heading_match = re.match(r'^\s*((?:<[^>]+>\s*)*)(#{1,6})\s+(.*)$', line)
        if heading_match:
            close_lists()
            if in_blockquote:
                html_lines.append("</blockquote>")
                in_blockquote = False
            leading_tags = heading_match.group(1).strip()
            level = len(heading_match.group(2))
            content = heading_match.group(3).strip()
            if leading_tags:
                html_lines.append(leading_tags)
            html_lines.append(f"<h{level} id=\"{slugify(content)}\">{render_inline(content)}</h{level}>")
            continue
        if stripped in ("---", "***"):
            close_lists()
            html_lines.append("<hr/>")
            continue
        if stripped.startswith("> "):
            # Consecutive "> " lines share one <blockquote>.
            if not in_blockquote:
                close_lists()
                html_lines.append("<blockquote>")
                in_blockquote = True
            content = stripped[2:]
            html_lines.append(render_inline(content))
            continue
        else:
            if in_blockquote:
                html_lines.append("</blockquote>")
                in_blockquote = False
        ul_match = re.match(r'^\s*[-*+]\s+(.*)$', line)
        if ul_match:
            # Switching list kind closes the other list first.
            if in_ol:
                html_lines.append("</ol>")
                in_ol = False
            if not in_ul:
                html_lines.append("<ul>")
                in_ul = True
            html_lines.append(f"<li>{render_inline(ul_match.group(1))}</li>")
            continue
        ol_match = re.match(r'^\s*\d+\.\s+(.*)$', line)
        if ol_match:
            if in_ul:
                html_lines.append("</ul>")
                in_ul = False
            if not in_ol:
                html_lines.append("<ol>")
                in_ol = True
            html_lines.append(f"<li>{render_inline(ol_match.group(1))}</li>")
            continue
        close_lists()
        # Inline conversions (leave raw HTML intact)
        if stripped == "[TOC]":
            # Expand the table of contents from the headings collected above.
            toc_items = []
            for level, title, slug in headings:
                toc_items.append(
                    f'<li class="toc-level-{level}"><a href="#{slug}">{title}</a></li>'
                )
            html_lines.append(
                '<nav class="toc"><ul>' + "".join(toc_items) + '</ul></nav>'
            )
            continue
        html_lines.append(f"<p>{render_inline(line)}</p>")
    # Close any blocks still open at end-of-input.
    close_lists()
    if in_blockquote:
        html_lines.append("</blockquote>")
    if in_code:
        html_lines.append("</code></pre>")
    return "\n".join(html_lines)
def _render_markdown_document(markdown_text: str) -> str:
    """Render markdown source into a small self-contained HTML page."""
    body_html = _render_markdown_fallback(markdown_text)
    # Minimal inline stylesheet so rendered docs are readable with no assets.
    head = (
        "<!doctype html><html><head><meta charset=\"utf-8\">"
        "<meta name=\"viewport\" content=\"width=device-width,initial-scale=1\">"
        "<style>body{max-width:900px;margin:24px auto;padding:0 16px;"
        "font-family:system-ui,-apple-system,Segoe UI,Roboto,Helvetica,Arial,sans-serif;"
        "line-height:1.55;} img{max-width:100%;} pre{background:#f6f6f6;padding:12px;overflow:auto;}"
        ".toc ul{list-style:none;padding-left:0;} .toc li{margin:4px 0;}</style>"
        "</head><body>"
    )
    return head + body_html + "</body></html>"
def serve_static_with_byterange_support(request, path, document_root=None, show_indexes=False):
@@ -28,18 +292,101 @@ def serve_static_with_byterange_support(request, path, document_root=None, show_
if not os.access(fullpath, os.R_OK):
raise Http404(_("%(path)s” does not exist") % {"path": fullpath})
# Respect the If-Modified-Since header.
statobj = fullpath.stat()
if not static.was_modified_since(request.META.get("HTTP_IF_MODIFIED_SINCE"), statobj.st_mtime):
return HttpResponseNotModified()
document_root = Path(document_root) if document_root else None
rel_path = path
etag = None
if document_root:
file_hash = _hash_for_path(document_root, rel_path)
if file_hash:
etag = f'"{file_hash}"'
if etag:
inm = request.META.get("HTTP_IF_NONE_MATCH")
if inm:
inm_list = [item.strip() for item in inm.split(",")]
if etag in inm_list or etag.strip('"') in [i.strip('"') for i in inm_list]:
not_modified = HttpResponseNotModified()
not_modified.headers["ETag"] = etag
not_modified.headers["Cache-Control"] = f"{_cache_policy()}, max-age=31536000, immutable"
not_modified.headers["Last-Modified"] = http_date(statobj.st_mtime)
return not_modified
content_type, encoding = mimetypes.guess_type(str(fullpath))
content_type = content_type or "application/octet-stream"
# Add charset for text-like types (best guess), but don't override the type.
is_text_like = (
content_type.startswith("text/")
or content_type in {
"application/json",
"application/javascript",
"application/xml",
"application/x-ndjson",
"image/svg+xml",
}
)
if is_text_like and "charset=" not in content_type:
content_type = f"{content_type}; charset=utf-8"
# Respect the If-Modified-Since header for non-markdown responses.
if not (content_type.startswith("text/plain") or content_type.startswith("text/html")):
if not static.was_modified_since(request.META.get("HTTP_IF_MODIFIED_SINCE"), statobj.st_mtime):
return HttpResponseNotModified()
# Heuristic fix: some archived HTML outputs (e.g. mercury content.html)
# are stored with HTML-escaped markup or markdown sources. If so, render sensibly.
if content_type.startswith("text/plain") or content_type.startswith("text/html"):
try:
max_unescape_size = 10 * 1024 * 1024 # 10MB cap to avoid heavy memory use
if statobj.st_size <= max_unescape_size:
raw = fullpath.read_bytes()
decoded = raw.decode("utf-8", errors="replace")
escaped_count = decoded.count("&lt;") + decoded.count("&gt;")
tag_count = decoded.count("<")
if escaped_count and escaped_count > tag_count * 2:
decoded = html.unescape(decoded)
markdown_candidate = _extract_markdown_candidate(decoded)
if _looks_like_markdown(markdown_candidate):
wrapped = _render_markdown_document(markdown_candidate)
response = HttpResponse(wrapped, content_type="text/html; charset=utf-8")
response.headers["Last-Modified"] = http_date(statobj.st_mtime)
if etag:
response.headers["ETag"] = etag
response.headers["Cache-Control"] = f"{_cache_policy()}, max-age=31536000, immutable"
else:
response.headers["Cache-Control"] = f"{_cache_policy()}, max-age=60, stale-while-revalidate=300"
response.headers["Content-Disposition"] = f'inline; filename="{fullpath.name}"'
if encoding:
response.headers["Content-Encoding"] = encoding
return response
if escaped_count and escaped_count > tag_count * 2:
response = HttpResponse(decoded, content_type=content_type)
response.headers["Last-Modified"] = http_date(statobj.st_mtime)
if etag:
response.headers["ETag"] = etag
response.headers["Cache-Control"] = f"{_cache_policy()}, max-age=31536000, immutable"
else:
response.headers["Cache-Control"] = f"{_cache_policy()}, max-age=60, stale-while-revalidate=300"
response.headers["Content-Disposition"] = f'inline; filename="{fullpath.name}"'
if encoding:
response.headers["Content-Encoding"] = encoding
return response
except Exception:
pass
# set up response object
ranged_file = RangedFileReader(open(fullpath, "rb"))
response = StreamingHttpResponse(ranged_file, content_type=content_type)
response.headers["Last-Modified"] = http_date(statobj.st_mtime)
if etag:
response.headers["ETag"] = etag
response.headers["Cache-Control"] = f"{_cache_policy()}, max-age=31536000, immutable"
else:
response.headers["Cache-Control"] = f"{_cache_policy()}, max-age=60, stale-while-revalidate=300"
if is_text_like:
response.headers["Content-Disposition"] = f'inline; filename="{fullpath.name}"'
if content_type.startswith("image/"):
response.headers["Cache-Control"] = "public, max-age=604800, immutable"
# handle byte-range requests by serving chunk of file
if stat.S_ISREG(statobj.st_mode):

View File

@@ -26,6 +26,7 @@ const PLUGIN_NAME = 'accessibility';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'accessibility.json';
const CHROME_SESSION_DIR = '../chrome';
const CHROME_SESSION_REQUIRED_ERROR = 'No Chrome session found (chrome plugin must run first)';
// Parse command line arguments
function parseArgs() {
@@ -76,6 +77,27 @@ function getCdpUrl() {
return null;
}
// Verify a live Chrome session exists (all session files present, the
// recorded pid still running, and a CDP URL readable); returns the CDP URL
// or throws CHROME_SESSION_REQUIRED_ERROR.
function assertChromeSession() {
    // All three artifacts are written by the chrome plugin when a session starts.
    const requiredFiles = ['cdp_url.txt', 'target_id.txt', 'chrome.pid']
        .map((name) => path.join(CHROME_SESSION_DIR, name));
    if (!requiredFiles.every((file) => fs.existsSync(file))) {
        throw new Error(CHROME_SESSION_REQUIRED_ERROR);
    }
    // Signal 0 probes liveness without actually signalling the process.
    try {
        const pidText = fs.readFileSync(path.join(CHROME_SESSION_DIR, 'chrome.pid'), 'utf8');
        const pid = parseInt(pidText.trim(), 10);
        if (!pid || Number.isNaN(pid)) throw new Error('Invalid pid');
        process.kill(pid, 0);
    } catch (e) {
        throw new Error(CHROME_SESSION_REQUIRED_ERROR);
    }
    const cdpUrl = getCdpUrl();
    if (!cdpUrl) {
        throw new Error(CHROME_SESSION_REQUIRED_ERROR);
    }
    return cdpUrl;
}
// Extract accessibility info
async function extractAccessibility(url) {
// Output directory is current directory (hook already runs in output dir)
@@ -85,10 +107,7 @@ async function extractAccessibility(url) {
try {
// Connect to existing Chrome session
const cdpUrl = getCdpUrl();
if (!cdpUrl) {
return { success: false, error: 'No Chrome session found (chrome plugin must run first)' };
}
const cdpUrl = assertChromeSession();
browser = await puppeteer.connect({
browserWSEndpoint: cdpUrl,
@@ -226,13 +245,10 @@ async function main() {
}
// Check if Chrome session exists, then wait for page load
const cdpUrl = getCdpUrl();
if (cdpUrl) {
// Wait for page to be fully loaded
const pageLoaded = await waitForChromeTabLoaded(60000);
if (!pageLoaded) {
throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
}
assertChromeSession();
const pageLoaded = await waitForChromeTabLoaded(60000);
if (!pageLoaded) {
throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
}
const result = await extractAccessibility(url);

View File

@@ -47,7 +47,6 @@ class TestAccessibilityPlugin(TestCase):
self.assertTrue(ACCESSIBILITY_HOOK.exists(), f"Hook not found: {ACCESSIBILITY_HOOK}")
@pytest.mark.skipif(not chrome_available(), reason="Chrome not installed")
class TestAccessibilityWithChrome(TestCase):
"""Integration tests for accessibility plugin with Chrome."""
@@ -109,9 +108,7 @@ class TestAccessibilityWithChrome(TestCase):
self.assertIn('headings', accessibility_data, f"Missing headings: {accessibility_data}")
self.assertIn('url', accessibility_data, f"Missing url: {accessibility_data}")
except RuntimeError as e:
if 'Chrome' in str(e) or 'CDP' in str(e):
self.skipTest(f"Chrome session setup failed: {e}")
except RuntimeError:
raise
def test_accessibility_disabled_skips(self):

View File

@@ -70,9 +70,9 @@ class TestAptProviderHook(TestCase):
self.assertEqual(result.returncode, 0)
@pytest.mark.skipif(not is_linux(), reason="apt only available on Linux")
@pytest.mark.skipif(not apt_available(), reason="apt not installed")
def test_hook_detects_apt(self):
"""Hook should detect apt binary when available."""
assert apt_available(), "apt not installed"
result = subprocess.run(
[
sys.executable, str(INSTALL_HOOK),
@@ -112,12 +112,12 @@ class TestAptProviderHook(TestCase):
@pytest.mark.skipif(not is_linux(), reason="apt only available on Linux")
@pytest.mark.skipif(not apt_available(), reason="apt not installed")
class TestAptProviderSystemBinaries(TestCase):
"""Test apt provider with system binaries."""
def test_detect_existing_binary(self):
"""apt provider should detect already-installed system binaries."""
assert apt_available(), "apt not installed"
# Check for a binary that's almost certainly installed (like 'ls' or 'bash')
result = subprocess.run(
[

View File

@@ -18,6 +18,8 @@ const { finished } = require('stream/promises');
const execAsync = promisify(exec);
const CHROME_SESSION_REQUIRED_ERROR = 'No Chrome session found (chrome plugin must run first)';
// ============================================================================
// Environment helpers
// ============================================================================
@@ -373,6 +375,7 @@ async function launchChromium(options = {}) {
outputDir = 'chrome',
userDataDir = getEnv('CHROME_USER_DATA_DIR'),
resolution = getEnv('CHROME_RESOLUTION') || getEnv('RESOLUTION', '1440,2000'),
userAgent = getEnv('CHROME_USER_AGENT') || getEnv('USER_AGENT', ''),
headless = getEnvBool('CHROME_HEADLESS', true),
sandbox = getEnvBool('CHROME_SANDBOX', true),
checkSsl = getEnvBool('CHROME_CHECK_SSL_VALIDITY', getEnvBool('CHECK_SSL_VALIDITY', true)),
@@ -450,17 +453,17 @@ async function launchChromium(options = {}) {
const extraArgs = getEnvArray('CHROME_ARGS_EXTRA', []);
// Build dynamic Chrome arguments (these must be computed at runtime)
const inDocker = getEnvBool('IN_DOCKER', false);
const dynamicArgs = [
// Remote debugging setup
`--remote-debugging-port=${debugPort}`,
'--remote-debugging-address=127.0.0.1',
// Sandbox settings (disable in Docker)
...(sandbox ? [] : ['--no-sandbox', '--disable-setuid-sandbox']),
...(sandbox ? [] : (inDocker ? ['--no-sandbox', '--disable-setuid-sandbox'] : [])),
// Docker-specific workarounds
'--disable-dev-shm-usage',
'--disable-gpu',
// Window size
`--window-size=${width},${height}`,
@@ -468,6 +471,9 @@ async function launchChromium(options = {}) {
// User data directory (for persistent sessions with persona)
...(userDataDir ? [`--user-data-dir=${userDataDir}`] : []),
// User agent
...(userAgent ? [`--user-agent=${userAgent}`] : []),
// Headless mode
...(headless ? ['--headless=new'] : []),
@@ -1387,6 +1393,18 @@ function findChromium() {
return null;
}
/**
 * Find Chromium binary path only (never Chrome/Brave/Edge).
 * Prefers CHROME_BINARY if set, then Chromium.
 *
 * @returns {string|null} - Absolute path or command name to browser binary
 */
function findAnyChromiumBinary() {
    const binary = findChromium();
    return binary ? binary : null;
}
// ============================================================================
// Shared Extension Installer Utilities
// ============================================================================
@@ -1658,13 +1676,13 @@ async function connectToPage(options = {}) {
// Wait for chrome session to be ready
const sessionReady = await waitForChromeSession(chromeSessionDir, timeoutMs);
if (!sessionReady) {
throw new Error(`Chrome session not ready after ${timeoutMs/1000}s (chrome plugin must run first)`);
throw new Error(CHROME_SESSION_REQUIRED_ERROR);
}
// Read session files
const cdpUrl = readCdpUrl(chromeSessionDir);
if (!cdpUrl) {
throw new Error('No Chrome session found (cdp_url.txt missing)');
throw new Error(CHROME_SESSION_REQUIRED_ERROR);
}
const targetId = readTargetId(chromeSessionDir);
@@ -1749,6 +1767,7 @@ module.exports = {
installPuppeteerCore,
// Chromium binary finding
findChromium,
findAnyChromiumBinary,
// Extension utilities
getExtensionId,
loadExtensionManifest,

View File

@@ -23,7 +23,7 @@ if (process.env.NODE_MODULES_DIR) {
const fs = require('fs');
const path = require('path');
const {
findChromium,
findAnyChromiumBinary,
launchChromium,
killChrome,
getEnv,
@@ -109,9 +109,9 @@ async function main() {
process.exit(1);
}
const binary = findChromium();
const binary = findAnyChromiumBinary();
if (!binary) {
console.error('ERROR: Chromium binary not found');
console.error('ERROR: Chromium-based browser binary not found');
process.exit(1);
}

View File

@@ -31,12 +31,15 @@ if (process.env.NODE_MODULES_DIR) {
const fs = require('fs');
const path = require('path');
const http = require('http');
const puppeteer = require('puppeteer');
const {
findChromium,
launchChromium,
killChrome,
getEnv,
getEnvBool,
getExtensionId,
writePidWithMtime,
getExtensionsDir,
} = require('./chrome_utils.js');
@@ -154,6 +157,84 @@ async function importCookiesFromFile(browser, cookiesFile, userDataDir) {
console.error(`[+] Imported ${imported}/${cookies.length} cookies`);
}
// Extract the numeric debug port from a DevTools CDP URL.
// Returns the port as a string (regex capture group), or null when the URL
// is missing or lacks a ":<port>/devtools/" segment.
function getPortFromCdpUrl(cdpUrl) {
    if (!cdpUrl) return null;
    const portMatch = /:(\d+)\/devtools\//.exec(cdpUrl);
    if (!portMatch) return null;
    return portMatch[1];
}
// Fetch the list of open DevTools targets (pages, extensions, workers, ...)
// from Chrome's HTTP debugging endpoint at http://127.0.0.1:<port>/json/list.
// The port is derived from the CDP websocket URL via getPortFromCdpUrl().
// Resolves to [] when the port cannot be determined or the response is not
// an array; rejects on network errors or malformed JSON.
async function fetchDevtoolsTargets(cdpUrl) {
    const port = getPortFromCdpUrl(cdpUrl);
    if (!port) return [];
    const urlPath = '/json/list';
    return new Promise((resolve, reject) => {
        const req = http.get(
            { hostname: '127.0.0.1', port, path: urlPath },
            (res) => {
                // Accumulate the response body before parsing.
                let data = '';
                res.on('data', (chunk) => (data += chunk));
                res.on('end', () => {
                    try {
                        const targets = JSON.parse(data);
                        // Defensive: /json/list should return an array; fall back to [].
                        resolve(Array.isArray(targets) ? targets : []);
                    } catch (e) {
                        reject(e);
                    }
                });
            }
        );
        req.on('error', reject);
    });
}
// Discover which custom (non-builtin) extensions actually loaded in the
// running Chrome session by querying the DevTools /json/list endpoint, then
// mark each entry of installedExtensions with a boolean `loaded` flag.
// Mutates installedExtensions in place; logs findings to stderr.
async function discoverExtensionTargets(cdpUrl, installedExtensions) {
    // Extension IDs excluded from "custom" detection — presumably Chrome's
    // bundled/component extensions (TODO confirm the exact set).
    const builtinIds = [
        'nkeimhogjdpnpccoofpliimaahmaaome',
        'fignfifoniblkonapihmkfakmlgkbkcf',
        'ahfgeienlihckogmohjhadlkjgocpleb',
        'mhjfbmdgcfjbbpaeojofohoefgiehjai',
    ];
    let targets = [];
    // Retry up to 10 times, 500ms apart: extension targets may not be
    // registered immediately after Chrome launches.
    for (let i = 0; i < 10; i += 1) {
        try {
            targets = await fetchDevtoolsTargets(cdpUrl);
            if (targets.length > 0) break;
        } catch (e) {
            // Ignore and retry
        }
        await new Promise(r => setTimeout(r, 500));
    }
    // Keep only chrome-extension:// targets whose ID is not a built-in.
    const customExtTargets = targets.filter(t => {
        const url = t.url || '';
        if (!url.startsWith('chrome-extension://')) return false;
        const extId = url.split('://')[1].split('/')[0];
        return !builtinIds.includes(extId);
    });
    console.error(`[+] Found ${customExtTargets.length} custom extension target(s) via /json/list`);
    for (const target of customExtTargets) {
        const url = target.url || '';
        const extId = url.split('://')[1].split('/')[0];
        console.error(`[+] Extension target: ${extId} (${target.type || 'unknown'})`);
    }
    // Flag each installed extension as loaded iff its ID appeared as a runtime target.
    const runtimeIds = new Set(customExtTargets.map(t => (t.url || '').split('://')[1].split('/')[0]));
    for (const ext of installedExtensions) {
        if (ext.id) {
            ext.loaded = runtimeIds.has(ext.id);
        }
    }
    // Nothing detected but extensions were expected: warn loudly.
    if (customExtTargets.length === 0 && installedExtensions.length > 0) {
        console.error(`[!] Warning: No custom extensions detected. Extension loading may have failed.`);
        console.error(`[!] Make sure you are using Chromium, not Chrome (Chrome 137+ removed --load-extension support)`);
    }
}
// Parse command line arguments
function parseArgs() {
const args = {};
@@ -257,6 +338,17 @@ async function main() {
console.error(`[+] Found ${installedExtensions.length} extension(s) to load`);
}
// Ensure extension IDs are available without chrome://extensions
for (const ext of installedExtensions) {
if (!ext.id && ext.unpacked_path) {
try {
ext.id = getExtensionId(ext.unpacked_path);
} catch (e) {
console.error(`[!] Failed to compute extension id for ${ext.name}: ${e.message}`);
}
}
}
// Note: PID file is written by run_hook() with hook-specific name
// Snapshot.cleanup() kills all *.pid processes when done
if (!fs.existsSync(OUTPUT_DIR)) {
@@ -280,131 +372,31 @@ async function main() {
chromePid = result.pid;
const cdpUrl = result.cdpUrl;
// Connect puppeteer for extension verification
console.error(`[*] Connecting puppeteer to CDP...`);
const browser = await puppeteer.connect({
browserWSEndpoint: cdpUrl,
defaultViewport: null,
});
browserInstance = browser;
// Import cookies into Chrome profile at crawl start
await importCookiesFromFile(browser, cookiesFile, userDataDir);
// Get actual extension IDs from chrome://extensions page
// Discover extension targets at launch (no chrome://extensions)
if (extensionPaths.length > 0) {
await new Promise(r => setTimeout(r, 2000));
console.error('[*] Discovering extension targets via devtools /json/list...');
await discoverExtensionTargets(cdpUrl, installedExtensions);
}
// Only connect to CDP when cookies import is needed to reduce crash risk.
if (cookiesFile) {
console.error(`[*] Connecting puppeteer to CDP for cookie import...`);
const browser = await puppeteer.connect({
browserWSEndpoint: cdpUrl,
defaultViewport: null,
});
browserInstance = browser;
// Import cookies into Chrome profile at crawl start
await importCookiesFromFile(browser, cookiesFile, userDataDir);
try {
const extPage = await browser.newPage();
await extPage.goto('chrome://extensions', { waitUntil: 'domcontentloaded', timeout: 10000 });
await new Promise(r => setTimeout(r, 2000));
// Parse extension info from the page
const extensionsFromPage = await extPage.evaluate(() => {
const extensions = [];
// Extensions manager uses shadow DOM
const manager = document.querySelector('extensions-manager');
if (!manager || !manager.shadowRoot) return extensions;
const itemList = manager.shadowRoot.querySelector('extensions-item-list');
if (!itemList || !itemList.shadowRoot) return extensions;
const items = itemList.shadowRoot.querySelectorAll('extensions-item');
for (const item of items) {
const id = item.getAttribute('id');
const nameEl = item.shadowRoot?.querySelector('#name');
const name = nameEl?.textContent?.trim() || '';
if (id && name) {
extensions.push({ id, name });
}
}
return extensions;
});
console.error(`[*] Found ${extensionsFromPage.length} extension(s) on chrome://extensions`);
for (const e of extensionsFromPage) {
console.error(` - ${e.id}: "${e.name}"`);
}
// Match extensions by name (strict matching)
for (const ext of installedExtensions) {
// Read the extension's manifest to get its display name
const manifestPath = path.join(ext.unpacked_path, 'manifest.json');
if (fs.existsSync(manifestPath)) {
const manifest = JSON.parse(fs.readFileSync(manifestPath, 'utf-8'));
let manifestName = manifest.name || '';
// Resolve message placeholder (e.g., __MSG_extName__)
if (manifestName.startsWith('__MSG_') && manifestName.endsWith('__')) {
const msgKey = manifestName.slice(6, -2); // Extract key from __MSG_key__
const defaultLocale = manifest.default_locale || 'en';
const messagesPath = path.join(ext.unpacked_path, '_locales', defaultLocale, 'messages.json');
if (fs.existsSync(messagesPath)) {
try {
const messages = JSON.parse(fs.readFileSync(messagesPath, 'utf-8'));
if (messages[msgKey] && messages[msgKey].message) {
manifestName = messages[msgKey].message;
}
} catch (e) {
console.error(`[!] Failed to read messages.json: ${e.message}`);
}
}
}
console.error(`[*] Looking for match: ext.name="${ext.name}" manifest.name="${manifestName}"`);
// Find matching extension from page by exact name match first
let match = extensionsFromPage.find(e => e.name === manifestName);
// If no exact match, try case-insensitive exact match
if (!match) {
match = extensionsFromPage.find(e =>
e.name.toLowerCase() === manifestName.toLowerCase()
);
}
if (match) {
ext.id = match.id;
console.error(`[+] Matched extension: ${ext.name} (${manifestName}) -> ${match.id}`);
} else {
console.error(`[!] No match found for: ${ext.name} (${manifestName})`);
}
}
}
await extPage.close();
} catch (e) {
console.error(`[!] Failed to get extensions from chrome://extensions: ${e.message}`);
}
// Fallback: check browser targets
const targets = browser.targets();
const builtinIds = [
'nkeimhogjdpnpccoofpliimaahmaaome',
'fignfifoniblkonapihmkfakmlgkbkcf',
'ahfgeienlihckogmohjhadlkjgocpleb',
'mhjfbmdgcfjbbpaeojofohoefgiehjai',
];
const customExtTargets = targets.filter(t => {
const url = t.url();
if (!url.startsWith('chrome-extension://')) return false;
const extId = url.split('://')[1].split('/')[0];
return !builtinIds.includes(extId);
});
console.error(`[+] Found ${customExtTargets.length} custom extension target(s)`);
for (const target of customExtTargets) {
const url = target.url();
const extId = url.split('://')[1].split('/')[0];
console.error(`[+] Extension target: ${extId} (${target.type()})`);
}
if (customExtTargets.length === 0 && extensionPaths.length > 0) {
console.error(`[!] Warning: No custom extensions detected. Extension loading may have failed.`);
console.error(`[!] Make sure you are using Chromium, not Chrome (Chrome 137+ removed --load-extension support)`);
}
browser.disconnect();
} catch (e) {}
browserInstance = null;
} else {
console.error('[*] Skipping puppeteer CDP connection (no cookies to import)');
}
// Write extensions metadata with actual IDs

View File

@@ -2,9 +2,8 @@
/**
* Create a Chrome tab for this snapshot in the shared crawl Chrome session.
*
* If a crawl-level Chrome session exists (from on_Crawl__90_chrome_launch.bg.js),
* this connects to it and creates a new tab. Otherwise, falls back to launching
* its own Chrome instance.
* Connects to the crawl-level Chrome session (from on_Crawl__90_chrome_launch.bg.js)
* and creates a new tab. This hook does NOT launch its own Chrome instance.
*
* Usage: on_Snapshot__10_chrome_tab.bg.js --url=<url> --snapshot-id=<uuid> --crawl-id=<uuid>
* Output: Creates chrome/ directory under snapshot output dir with:
@@ -15,11 +14,7 @@
*
* Environment variables:
* CRAWL_OUTPUT_DIR: Crawl output directory (to find crawl's Chrome session)
* CHROME_BINARY: Path to Chromium binary (for fallback)
* CHROME_RESOLUTION: Page resolution (default: 1440,2000)
* CHROME_USER_AGENT: User agent string (optional)
* CHROME_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: true)
* CHROME_HEADLESS: Run in headless mode (default: true)
* CHROME_BINARY: Path to Chromium binary (optional, for version info)
*
* This is a background hook that stays alive until SIGTERM so the tab
* can be closed cleanly at the end of the snapshot run.
@@ -27,24 +22,18 @@
const fs = require('fs');
const path = require('path');
const { spawn } = require('child_process');
const { execSync } = require('child_process');
// Add NODE_MODULES_DIR to module resolution paths if set
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const puppeteer = require('puppeteer');
const {
findChromium,
getEnv,
getEnvBool,
parseResolution,
findFreePort,
waitForDebugPort,
} = require('./chrome_utils.js');
const { getEnv, getEnvInt } = require('./chrome_utils.js');
// Extractor metadata
const PLUGIN_NAME = 'chrome_tab';
const OUTPUT_DIR = '.'; // Hook already runs in chrome/ output directory
const CHROME_SESSION_DIR = '.';
const CHROME_SESSION_REQUIRED_ERROR = 'No Chrome session found (chrome plugin must run first)';
let finalStatus = 'failed';
let finalOutput = '';
@@ -118,61 +107,75 @@ process.on('SIGTERM', () => cleanup('SIGTERM'));
process.on('SIGINT', () => cleanup('SIGINT'));
// Try to find the crawl's Chrome session
function findCrawlChromeSession() {
function getCrawlChromeSession() {
// Use CRAWL_OUTPUT_DIR env var set by get_config() in configset.py
const crawlOutputDir = getEnv('CRAWL_OUTPUT_DIR', '');
if (!crawlOutputDir) return null;
if (!crawlOutputDir) {
throw new Error(CHROME_SESSION_REQUIRED_ERROR);
}
const crawlChromeDir = path.join(crawlOutputDir, 'chrome');
const cdpFile = path.join(crawlChromeDir, 'cdp_url.txt');
const pidFile = path.join(crawlChromeDir, 'chrome.pid');
if (fs.existsSync(cdpFile) && fs.existsSync(pidFile)) {
try {
const cdpUrl = fs.readFileSync(cdpFile, 'utf-8').trim();
const pid = parseInt(fs.readFileSync(pidFile, 'utf-8').trim(), 10);
// Verify the process is still running
try {
process.kill(pid, 0); // Signal 0 = check if process exists
return { cdpUrl, pid };
} catch (e) {
// Process not running
return null;
}
} catch (e) {
return null;
}
if (!fs.existsSync(cdpFile)) {
throw new Error(CHROME_SESSION_REQUIRED_ERROR);
}
if (!fs.existsSync(pidFile)) {
throw new Error(CHROME_SESSION_REQUIRED_ERROR);
}
return null;
const cdpUrl = fs.readFileSync(cdpFile, 'utf-8').trim();
const pid = parseInt(fs.readFileSync(pidFile, 'utf-8').trim(), 10);
if (!cdpUrl) {
throw new Error(CHROME_SESSION_REQUIRED_ERROR);
}
if (!pid || Number.isNaN(pid)) {
throw new Error(CHROME_SESSION_REQUIRED_ERROR);
}
// Verify the process is still running
try {
process.kill(pid, 0); // Signal 0 = check if process exists
} catch (e) {
throw new Error(CHROME_SESSION_REQUIRED_ERROR);
}
return { cdpUrl, pid };
}
// Poll for the crawl-level Chrome session until it becomes available or the
// timeout elapses. On timeout, re-raises the most recent error thrown by
// getCrawlChromeSession(), or a generic "session required" error if no
// attempt was ever made (e.g. timeoutMs <= 0).
async function waitForCrawlChromeSession(timeoutMs, intervalMs = 250) {
    const deadline = Date.now() + timeoutMs;
    let mostRecentError = null;
    while (Date.now() < deadline) {
        try {
            return getCrawlChromeSession();
        } catch (err) {
            mostRecentError = err;
        }
        await new Promise((resolve) => setTimeout(resolve, intervalMs));
    }
    throw mostRecentError !== null
        ? mostRecentError
        : new Error(CHROME_SESSION_REQUIRED_ERROR);
}
// Create a new tab in an existing Chrome session
async function createTabInExistingChrome(cdpUrl, url, pid) {
const resolution = getEnv('CHROME_RESOLUTION') || getEnv('RESOLUTION', '1440,2000');
const userAgent = getEnv('CHROME_USER_AGENT') || getEnv('USER_AGENT', '');
const { width, height } = parseResolution(resolution);
console.log(`[*] Connecting to existing Chrome session: ${cdpUrl}`);
// Connect Puppeteer to the running Chrome
const browser = await puppeteer.connect({
browserWSEndpoint: cdpUrl,
defaultViewport: { width, height },
defaultViewport: null,
});
// Create a new tab for this snapshot
const page = await browser.newPage();
// Set viewport
await page.setViewport({ width, height });
// Set user agent if specified
if (userAgent) {
await page.setUserAgent(userAgent);
}
// Get the page target ID
const target = page.target();
const targetId = target._targetId;
@@ -189,112 +192,6 @@ async function createTabInExistingChrome(cdpUrl, url, pid) {
return { success: true, output: OUTPUT_DIR, cdpUrl, targetId, pid };
}
// Fallback: Launch a new Chrome instance for this snapshot
async function launchNewChrome(url, binary) {
const resolution = getEnv('CHROME_RESOLUTION') || getEnv('RESOLUTION', '1440,2000');
const userAgent = getEnv('CHROME_USER_AGENT') || getEnv('USER_AGENT', '');
const checkSsl = getEnvBool('CHROME_CHECK_SSL_VALIDITY', getEnvBool('CHECK_SSL_VALIDITY', true));
const headless = getEnvBool('CHROME_HEADLESS', true);
const { width, height } = parseResolution(resolution);
// Find a free port for Chrome DevTools
const debugPort = await findFreePort();
console.log(`[*] Launching new Chrome on port: ${debugPort}`);
// Build Chrome arguments
const chromeArgs = [
`--remote-debugging-port=${debugPort}`,
'--remote-debugging-address=127.0.0.1',
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-dev-shm-usage',
'--disable-gpu',
'--disable-sync',
'--no-first-run',
'--no-default-browser-check',
'--disable-default-apps',
'--disable-infobars',
'--disable-blink-features=AutomationControlled',
'--disable-component-update',
'--disable-domain-reliability',
'--disable-breakpad',
'--disable-background-networking',
'--disable-background-timer-throttling',
'--disable-backgrounding-occluded-windows',
'--disable-renderer-backgrounding',
'--disable-ipc-flooding-protection',
'--password-store=basic',
'--use-mock-keychain',
'--font-render-hinting=none',
'--force-color-profile=srgb',
`--window-size=${width},${height}`,
...(headless ? ['--headless=new'] : []),
...(checkSsl ? [] : ['--ignore-certificate-errors']),
'about:blank',
];
// Launch Chrome as a detached process (since no crawl-level Chrome exists)
const chromeProcess = spawn(binary, chromeArgs, {
detached: true,
stdio: ['ignore', 'ignore', 'ignore'],
});
chromeProcess.unref();
const chromePid = chromeProcess.pid;
console.log(`[*] Launched Chrome (PID: ${chromePid}), waiting for debug port...`);
// Write PID immediately for cleanup
fs.writeFileSync(path.join(OUTPUT_DIR, 'chrome.pid'), String(chromePid));
try {
// Wait for Chrome to be ready
const versionInfo = await waitForDebugPort(debugPort, 30000);
console.log(`[+] Chrome ready: ${versionInfo.Browser}`);
const wsUrl = versionInfo.webSocketDebuggerUrl;
fs.writeFileSync(path.join(OUTPUT_DIR, 'cdp_url.txt'), wsUrl);
// Connect Puppeteer to get page info
const browser = await puppeteer.connect({
browserWSEndpoint: wsUrl,
defaultViewport: { width, height },
});
let pages = await browser.pages();
let page = pages[0];
if (!page) {
page = await browser.newPage();
}
await page.setViewport({ width, height });
if (userAgent) {
await page.setUserAgent(userAgent);
}
const target = page.target();
const targetId = target._targetId;
fs.writeFileSync(path.join(OUTPUT_DIR, 'chrome.pid'), String(chromePid));
fs.writeFileSync(path.join(OUTPUT_DIR, 'target_id.txt'), targetId);
fs.writeFileSync(path.join(OUTPUT_DIR, 'url.txt'), url);
browser.disconnect();
return { success: true, output: OUTPUT_DIR, cdpUrl: wsUrl, targetId, pid: chromePid };
} catch (e) {
try {
process.kill(chromePid, 'SIGTERM');
} catch (killErr) {
// Ignore
}
return { success: false, error: `${e.name}: ${e.message}` };
}
}
async function main() {
const args = parseArgs();
const url = args.url;
@@ -312,33 +209,21 @@ async function main() {
let version = '';
try {
const binary = findChromium();
if (!binary) {
console.error('ERROR: Chromium binary not found');
console.error('DEPENDENCY_NEEDED=chromium');
console.error('BIN_PROVIDERS=puppeteer,env,playwright,apt,brew');
console.error('INSTALL_HINT=npx @puppeteer/browsers install chromium@latest');
process.exit(1);
}
// Get Chrome version
try {
const { execSync } = require('child_process');
version = execSync(`"${binary}" --version`, { encoding: 'utf8', timeout: 5000 }).trim().slice(0, 64);
const binary = getEnv('CHROME_BINARY', '').trim();
if (binary) {
version = execSync(`"${binary}" --version`, { encoding: 'utf8', timeout: 5000 }).trim().slice(0, 64);
}
} catch (e) {
version = '';
}
// Try to use existing crawl Chrome session
const crawlSession = findCrawlChromeSession();
let result;
if (crawlSession) {
console.log(`[*] Found existing Chrome session from crawl ${crawlId}`);
result = await createTabInExistingChrome(crawlSession.cdpUrl, url, crawlSession.pid);
} else {
result = { success: false, error: 'No crawl Chrome session found (CRAWL_OUTPUT_DIR missing or chrome not running)' };
}
// Try to use existing crawl Chrome session (wait for readiness)
const timeoutSeconds = getEnvInt('CHROME_TAB_TIMEOUT', getEnvInt('CHROME_TIMEOUT', getEnvInt('TIMEOUT', 60)));
const crawlSession = await waitForCrawlChromeSession(timeoutSeconds * 1000);
console.log(`[*] Found existing Chrome session from crawl ${crawlId}`);
const result = await createTabInExistingChrome(crawlSession.cdpUrl, url, crawlSession.pid);
if (result.success) {
status = 'succeeded';

View File

@@ -21,6 +21,7 @@ const {
} = require('./chrome_utils.js');
const CHROME_SESSION_DIR = '.';
const CHROME_SESSION_REQUIRED_ERROR = 'No Chrome session found (chrome plugin must run first)';
function parseArgs() {
const args = {};
@@ -50,7 +51,7 @@ async function main() {
const ready = await waitForChromeSession(CHROME_SESSION_DIR, timeoutMs);
if (!ready) {
const error = `Chrome session not ready after ${timeoutSeconds}s (cdp_url.txt/target_id.txt missing)`;
const error = CHROME_SESSION_REQUIRED_ERROR;
console.error(`[chrome_wait] ERROR: ${error}`);
console.log(JSON.stringify({ type: 'ArchiveResult', status: 'failed', output_str: error }));
process.exit(1);
@@ -59,7 +60,7 @@ async function main() {
const cdpUrl = readCdpUrl(CHROME_SESSION_DIR);
const targetId = readTargetId(CHROME_SESSION_DIR);
if (!cdpUrl || !targetId) {
const error = 'Chrome session files incomplete (cdp_url.txt/target_id.txt missing)';
const error = CHROME_SESSION_REQUIRED_ERROR;
console.error(`[chrome_wait] ERROR: ${error}`);
console.log(JSON.stringify({ type: 'ArchiveResult', status: 'failed', output_str: error }));
process.exit(1);

View File

@@ -24,6 +24,7 @@ const puppeteer = require('puppeteer');
const PLUGIN_NAME = 'chrome_navigate';
const CHROME_SESSION_DIR = '.';
const OUTPUT_DIR = '.';
const CHROME_SESSION_REQUIRED_ERROR = 'No Chrome session found (chrome plugin must run first)';
function parseArgs() {
const args = {};
@@ -175,13 +176,13 @@ async function main() {
// Wait for chrome tab to be open (up to 60s)
const tabOpen = await waitForChromeTabOpen(60000);
if (!tabOpen) {
console.error('ERROR: Chrome tab not open after 60s (chrome_tab must run first)');
console.error(`ERROR: ${CHROME_SESSION_REQUIRED_ERROR}`);
process.exit(1);
}
const cdpUrl = getCdpUrl();
if (!cdpUrl) {
console.error('ERROR: Chrome CDP URL not found (chrome tab not initialized)');
console.error(`ERROR: ${CHROME_SESSION_REQUIRED_ERROR}`);
process.exit(1);
}

View File

@@ -229,6 +229,33 @@ def get_extensions_dir() -> str:
return str(Path(data_dir) / 'personas' / persona / 'chrome_extensions')
def link_puppeteer_cache(lib_dir: Path) -> None:
    """Best-effort symlink from system Puppeteer cache into test lib_dir.

    Avoids repeated Chromium downloads across tests by reusing the
    default Puppeteer cache directory.
    """
    target_cache = lib_dir / 'puppeteer'
    target_cache.mkdir(parents=True, exist_ok=True)

    # Default Puppeteer cache locations: macOS first, then Linux.
    source_roots = (
        Path.home() / 'Library' / 'Caches' / 'puppeteer',
        Path.home() / '.cache' / 'puppeteer',
    )
    for root in source_roots:
        if not root.exists():
            continue
        for entry in root.iterdir():
            link_path = target_cache / entry.name
            if link_path.exists():
                # Never clobber an existing download or link.
                continue
            try:
                os.symlink(entry, link_path, target_is_directory=entry.is_dir())
            except Exception:
                # Best-effort only; if symlink fails, leave as-is.
                pass
def find_chromium(data_dir: Optional[str] = None) -> Optional[str]:
"""Find the Chromium binary path.
@@ -632,9 +659,8 @@ def setup_test_env(tmpdir: Path) -> dict:
tmpdir: Base temporary directory for the test
Returns:
Environment dict with all paths set, or pytest.skip() if Chrome install fails
Environment dict with all paths set.
"""
import pytest
# Determine machine type (matches archivebox.config.paths.get_machine_type())
machine = platform.machine().lower()
@@ -688,7 +714,7 @@ def setup_test_env(tmpdir: Path) -> dict:
try:
install_chromium_with_hooks(env)
except RuntimeError as e:
pytest.skip(str(e))
raise RuntimeError(str(e))
return env
@@ -873,6 +899,7 @@ def chrome_session(
lib_dir = data_dir / 'lib' / machine_type
npm_dir = lib_dir / 'npm'
node_modules_dir = npm_dir / 'node_modules'
puppeteer_cache_dir = lib_dir / 'puppeteer'
# Create lib structure for puppeteer installation
node_modules_dir.mkdir(parents=True, exist_ok=True)
@@ -893,8 +920,12 @@ def chrome_session(
'NODE_PATH': str(node_modules_dir),
'NPM_BIN_DIR': str(npm_dir / '.bin'),
'CHROME_HEADLESS': 'true',
'PUPPETEER_CACHE_DIR': str(puppeteer_cache_dir),
})
# Reuse system Puppeteer cache to avoid redundant Chromium downloads
link_puppeteer_cache(lib_dir)
# Install Chromium via npm + puppeteer hooks using normal Binary flow
install_chromium_with_hooks(env)

View File

@@ -125,10 +125,10 @@ def ensure_chromium_and_puppeteer_installed(tmp_path_factory):
try:
chromium_binary = install_chromium_with_hooks(env)
except RuntimeError as e:
pytest.skip(str(e))
raise RuntimeError(str(e))
if not chromium_binary:
pytest.skip("Chromium not found after install")
raise RuntimeError("Chromium not found after install")
os.environ['CHROME_BINARY'] = chromium_binary
for key in ('NODE_MODULES_DIR', 'NODE_PATH', 'PATH'):

View File

@@ -13,27 +13,18 @@ import tempfile
import time
from pathlib import Path
import pytest
from django.test import TestCase
# Import chrome test helpers
sys.path.insert(0, str(Path(__file__).parent.parent.parent / 'chrome' / 'tests'))
from chrome_test_helpers import (
chrome_session,
get_test_env,
CHROME_NAVIGATE_HOOK,
get_plugin_dir,
get_hook_script,
)
def chrome_available() -> bool:
"""Check if Chrome/Chromium is available."""
for name in ['chromium', 'chromium-browser', 'google-chrome', 'chrome']:
if shutil.which(name):
return True
return False
# Get the path to the consolelog hook
PLUGIN_DIR = get_plugin_dir(__file__)
CONSOLELOG_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_consolelog.*')
@@ -48,7 +39,6 @@ class TestConsolelogPlugin(TestCase):
self.assertTrue(CONSOLELOG_HOOK.exists(), f"Hook not found: {CONSOLELOG_HOOK}")
@pytest.mark.skipif(not chrome_available(), reason="Chrome not installed")
class TestConsolelogWithChrome(TestCase):
"""Integration tests for consolelog plugin with Chrome."""
@@ -62,68 +52,75 @@ class TestConsolelogWithChrome(TestCase):
def test_consolelog_captures_output(self):
"""Consolelog hook should capture console output from page."""
test_url = 'https://example.com'
test_url = 'data:text/html,<script>console.log("archivebox-console-test")</script>'
snapshot_id = 'test-consolelog-snapshot'
try:
with chrome_session(
self.temp_dir,
crawl_id='test-consolelog-crawl',
snapshot_id=snapshot_id,
test_url=test_url,
navigate=True,
timeout=30,
) as (chrome_process, chrome_pid, snapshot_chrome_dir, env):
# Use the environment from chrome_session (already has CHROME_HEADLESS=true)
with chrome_session(
self.temp_dir,
crawl_id='test-consolelog-crawl',
snapshot_id=snapshot_id,
test_url=test_url,
navigate=False,
timeout=30,
) as (chrome_process, chrome_pid, snapshot_chrome_dir, env):
console_dir = snapshot_chrome_dir.parent / 'consolelog'
console_dir.mkdir(exist_ok=True)
# Run consolelog hook with the active Chrome session (background hook)
result = subprocess.Popen(
['node', str(CONSOLELOG_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
cwd=str(console_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
env=env
)
# Run consolelog hook with the active Chrome session (background hook)
result = subprocess.Popen(
['node', str(CONSOLELOG_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
cwd=str(snapshot_chrome_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
env=env
)
nav_result = subprocess.run(
['node', str(CHROME_NAVIGATE_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
cwd=str(snapshot_chrome_dir),
capture_output=True,
text=True,
timeout=120,
env=env
)
self.assertEqual(nav_result.returncode, 0, f"Navigation failed: {nav_result.stderr}")
# Check for output file
console_output = snapshot_chrome_dir / 'console.jsonl'
# Check for output file
console_output = console_dir / 'console.jsonl'
# Allow it to run briefly, then terminate (background hook)
time.sleep(3)
if result.poll() is None:
result.terminate()
try:
stdout, stderr = result.communicate(timeout=5)
except subprocess.TimeoutExpired:
result.kill()
stdout, stderr = result.communicate()
else:
# Allow it to run briefly, then terminate (background hook)
for _ in range(10):
if console_output.exists() and console_output.stat().st_size > 0:
break
time.sleep(1)
if result.poll() is None:
result.terminate()
try:
stdout, stderr = result.communicate(timeout=5)
except subprocess.TimeoutExpired:
result.kill()
stdout, stderr = result.communicate()
else:
stdout, stderr = result.communicate()
# At minimum, verify no crash
self.assertNotIn('Traceback', stderr)
# At minimum, verify no crash
self.assertNotIn('Traceback', stderr)
# If output file exists, verify it's valid JSONL
if console_output.exists():
with open(console_output) as f:
content = f.read().strip()
if content:
for line in content.split('\n'):
if line.strip():
try:
record = json.loads(line)
# Verify structure
self.assertIn('timestamp', record)
self.assertIn('type', record)
except json.JSONDecodeError:
pass # Some lines may be incomplete
except RuntimeError as e:
if 'Chrome' in str(e) or 'CDP' in str(e):
self.skipTest(f"Chrome session setup failed: {e}")
raise
# If output file exists, verify it's valid JSONL and has output
if console_output.exists():
with open(console_output) as f:
content = f.read().strip()
self.assertTrue(content, "Console output should not be empty")
for line in content.split('\n'):
if line.strip():
try:
record = json.loads(line)
# Verify structure
self.assertIn('timestamp', record)
self.assertIn('type', record)
except json.JSONDecodeError:
pass # Some lines may be incomplete
if __name__ == '__main__':

View File

@@ -0,0 +1,126 @@
"""
Tests for the DNS plugin.
Tests the real DNS hook with an actual URL to verify
DNS resolution capture.
"""
import json
import shutil
import subprocess
import sys
import tempfile
import time
from pathlib import Path
from django.test import TestCase
# Import chrome test helpers
sys.path.insert(0, str(Path(__file__).parent.parent.parent / 'chrome' / 'tests'))
from chrome_test_helpers import (
chrome_session,
CHROME_NAVIGATE_HOOK,
get_plugin_dir,
get_hook_script,
)
# Get the path to the DNS hook
PLUGIN_DIR = get_plugin_dir(__file__)
DNS_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_dns.*')
class TestDNSPlugin(TestCase):
    """Test the DNS plugin."""

    def test_dns_hook_exists(self):
        """DNS hook script should exist."""
        # get_hook_script() returns None when its glob matched nothing,
        # so check for None before touching the filesystem.
        self.assertIsNotNone(DNS_HOOK, "DNS hook not found in plugin directory")
        self.assertTrue(DNS_HOOK.exists(), f"Hook not found: {DNS_HOOK}")
class TestDNSWithChrome(TestCase):
    """Integration tests for DNS plugin with Chrome."""

    def setUp(self):
        """Set up test environment."""
        # Isolated scratch directory; removed in tearDown.
        self.temp_dir = Path(tempfile.mkdtemp())

    def tearDown(self):
        """Clean up."""
        shutil.rmtree(self.temp_dir, ignore_errors=True)

    def test_dns_records_captured(self):
        """DNS hook should capture DNS records from a real URL.

        Starts the DNS hook as a background process inside a shared
        Chrome session, triggers navigation via the chrome_navigate
        hook, then polls for dns.jsonl and validates its records.
        NOTE(review): requires network access to resolve example.com.
        """
        test_url = 'https://example.com'
        snapshot_id = 'test-dns-snapshot'

        with chrome_session(
            self.temp_dir,
            crawl_id='test-dns-crawl',
            snapshot_id=snapshot_id,
            test_url=test_url,
            navigate=False,  # navigation is triggered manually below
            timeout=30,
        ) as (_process, _pid, snapshot_chrome_dir, env):
            # The DNS hook writes into a sibling 'dns' output directory.
            dns_dir = snapshot_chrome_dir.parent / 'dns'
            dns_dir.mkdir(exist_ok=True)

            # Launch the background DNS hook first so it is recording
            # before any navigation-triggered lookups happen.
            result = subprocess.Popen(
                ['node', str(DNS_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
                cwd=str(dns_dir),
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                env=env
            )

            # Navigate the existing Chrome tab so DNS lookups actually occur.
            nav_result = subprocess.run(
                ['node', str(CHROME_NAVIGATE_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
                cwd=str(snapshot_chrome_dir),
                capture_output=True,
                text=True,
                timeout=120,
                env=env
            )
            self.assertEqual(nav_result.returncode, 0, f"Navigation failed: {nav_result.stderr}")

            # Poll up to 30s for the hook to emit at least one record.
            dns_output = dns_dir / 'dns.jsonl'
            for _ in range(30):
                if dns_output.exists() and dns_output.stat().st_size > 0:
                    break
                time.sleep(1)

            # Background hook: terminate it, then collect its output streams.
            if result.poll() is None:
                result.terminate()
                try:
                    stdout, stderr = result.communicate(timeout=5)
                except subprocess.TimeoutExpired:
                    result.kill()
                    stdout, stderr = result.communicate()
            else:
                stdout, stderr = result.communicate()

            self.assertNotIn('Traceback', stderr)
            self.assertTrue(dns_output.exists(), "dns.jsonl not created")

            content = dns_output.read_text().strip()
            self.assertTrue(content, "DNS output should not be empty")
            records = []
            for line in content.split('\n'):
                line = line.strip()
                if not line:
                    continue
                try:
                    records.append(json.loads(line))
                except json.JSONDecodeError:
                    # Tolerate partially-written trailing lines.
                    pass
            self.assertTrue(records, "No DNS records parsed")
            # At least one record must pair a hostname with a resolved IP.
            has_ip_record = any(r.get('hostname') and r.get('ip') for r in records)
            self.assertTrue(has_ip_record, f"No DNS record with hostname + ip: {records}")
if __name__ == '__main__':
    # Bug fix: this file never imports pytest at module level, so calling
    # pytest.main() here raised NameError when the file was run directly.
    # Import it lazily — it is only needed for direct invocation.
    import pytest
    pytest.main([__file__, '-v'])

View File

@@ -2,19 +2,12 @@
/**
* Dump the DOM of a URL using Chrome/Puppeteer.
*
* If a Chrome session exists (from chrome plugin), connects to it via CDP.
* Otherwise launches a new Chrome instance.
* Requires a Chrome session (from chrome plugin) and connects to it via CDP.
*
* Usage: on_Snapshot__53_dom.js --url=<url> --snapshot-id=<uuid>
* Output: Writes dom/output.html
*
* Environment variables:
* CHROME_BINARY: Path to Chrome/Chromium binary
* CHROME_TIMEOUT: Timeout in seconds (default: 60)
* CHROME_RESOLUTION: Page resolution (default: 1440,2000)
* CHROME_USER_AGENT: User agent string (optional)
* CHROME_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: true)
* CHROME_HEADLESS: Run in headless mode (default: true)
* DOM_ENABLED: Enable DOM extraction (default: true)
*/
@@ -24,11 +17,7 @@ const path = require('path');
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const {
findChromium,
getEnv,
getEnvBool,
getEnvInt,
parseResolution,
parseArgs,
readCdpUrl,
} = require('../chrome/chrome_utils.js');
@@ -86,81 +75,30 @@ async function waitForChromeTabLoaded(timeoutMs = 60000) {
}
async function dumpDom(url) {
const timeout = (getEnvInt('CHROME_TIMEOUT') || getEnvInt('TIMEOUT', 60)) * 1000;
const resolution = getEnv('CHROME_RESOLUTION') || getEnv('RESOLUTION', '1440,2000');
const userAgent = getEnv('CHROME_USER_AGENT') || getEnv('USER_AGENT', '');
const checkSsl = getEnvBool('CHROME_CHECK_SSL_VALIDITY', getEnvBool('CHECK_SSL_VALIDITY', true));
const headless = getEnvBool('CHROME_HEADLESS', true);
const { width, height } = parseResolution(resolution);
// Output directory is current directory (hook already runs in output dir)
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
let browser = null;
let page = null;
let connectedToSession = false;
try {
// Try to connect to existing Chrome session
// Connect to existing Chrome session (required)
const cdpUrl = readCdpUrl(CHROME_SESSION_DIR);
if (cdpUrl) {
try {
browser = await puppeteer.connect({
browserWSEndpoint: cdpUrl,
defaultViewport: { width, height },
});
connectedToSession = true;
// Get existing pages or create new one
const pages = await browser.pages();
page = pages.find(p => p.url().startsWith('http')) || pages[0];
if (!page) {
page = await browser.newPage();
}
// Set viewport on the page
await page.setViewport({ width, height });
} catch (e) {
console.error(`Failed to connect to CDP session: ${e.message}`);
browser = null;
}
if (!cdpUrl) {
return { success: false, error: 'No Chrome session found (chrome plugin must run first)' };
}
// Fall back to launching new browser
if (!browser) {
const executablePath = findChromium();
if (!executablePath) {
return { success: false, error: 'Chrome binary not found' };
}
browser = await puppeteer.connect({
browserWSEndpoint: cdpUrl,
defaultViewport: null,
});
browser = await puppeteer.launch({
executablePath,
headless: headless ? 'new' : false,
args: [
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-dev-shm-usage',
'--disable-gpu',
`--window-size=${width},${height}`,
...(checkSsl ? [] : ['--ignore-certificate-errors']),
],
defaultViewport: { width, height },
});
// Get existing pages or create new one
const pages = await browser.pages();
page = pages.find(p => p.url().startsWith('http')) || pages[0];
if (!page) {
page = await browser.newPage();
// Navigate to URL (only if we launched fresh browser)
if (userAgent) {
await page.setUserAgent(userAgent);
}
await page.goto(url, {
waitUntil: 'networkidle2',
timeout,
});
}
// Get the full DOM content
@@ -176,9 +114,8 @@ async function dumpDom(url) {
} catch (e) {
return { success: false, error: `${e.name}: ${e.message}` };
} finally {
// Only close browser if we launched it (not if we connected to session)
if (browser && !connectedToSession) {
await browser.close();
if (browser) {
browser.disconnect();
}
}
}
@@ -206,14 +143,15 @@ async function main() {
process.exit(0);
}
// Only wait for page load if using shared Chrome session
const cdpUrl = readCdpUrl(CHROME_SESSION_DIR);
if (cdpUrl) {
// Wait for page to be fully loaded
const pageLoaded = await waitForChromeTabLoaded(60000);
if (!pageLoaded) {
throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
}
if (!cdpUrl) {
throw new Error('No Chrome session found (chrome plugin must run first)');
}
// Wait for page to be fully loaded
const pageLoaded = await waitForChromeTabLoaded(60000);
if (!pageLoaded) {
throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
}
const result = await dumpDom(url);

View File

@@ -28,6 +28,7 @@ from archivebox.plugins.chrome.tests.chrome_test_helpers import (
LIB_DIR,
NODE_MODULES_DIR,
PLUGINS_ROOT,
chrome_session,
)
@@ -61,15 +62,19 @@ def test_extracts_dom_from_example_com():
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
# Run DOM extraction hook
result = subprocess.run(
['node', str(DOM_HOOK), f'--url={TEST_URL}', '--snapshot-id=test789'],
cwd=tmpdir,
capture_output=True,
text=True,
timeout=120
,
env=get_test_env())
with chrome_session(tmpdir, test_url=TEST_URL) as (_process, _pid, snapshot_chrome_dir, env):
dom_dir = snapshot_chrome_dir.parent / 'dom'
dom_dir.mkdir(exist_ok=True)
# Run DOM extraction hook
result = subprocess.run(
['node', str(DOM_HOOK), f'--url={TEST_URL}', '--snapshot-id=test789'],
cwd=dom_dir,
capture_output=True,
text=True,
timeout=120,
env=env
)
assert result.returncode == 0, f"Extraction failed: {result.stderr}"
@@ -90,7 +95,7 @@ def test_extracts_dom_from_example_com():
assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
# Verify filesystem output (hook writes directly to working dir)
dom_file = tmpdir / 'output.html'
dom_file = dom_dir / 'output.html'
assert dom_file.exists(), f"output.html not created. Files: {list(tmpdir.iterdir())}"
# Verify HTML content contains REAL example.com text

View File

@@ -128,8 +128,6 @@ def main(url: str, snapshot_id: str):
success, output, error = get_favicon(url)
if success:
status = 'succeeded'
elif error == 'No favicon found':
status = 'skipped'
else:
status = 'failed'
@@ -148,7 +146,7 @@ def main(url: str, snapshot_id: str):
}
print(json.dumps(result))
sys.exit(0 if status in ('succeeded', 'skipped') else 1)
sys.exit(0 if status == 'succeeded' else 1)
if __name__ == '__main__':

View File

@@ -3,7 +3,7 @@
{% if output_path %}
<img src="{{ output_path }}"
alt="Favicon"
style="max-width: 80%; max-height: 80%; object-fit: contain;"
style="width: 30px; height: 30px; max-width: 30px; max-height: 30px; object-fit: contain;"
loading="lazy">
{% endif %}
</div>

View File

@@ -48,7 +48,9 @@ def main():
'pip': {
'packages': [
'--no-deps',
'--prefer-binary',
'forum-dl',
'chardet==5.2.0',
'pydantic',
'pydantic-core',
'typing-extensions',

View File

@@ -13,6 +13,7 @@ Tests verify:
"""
import json
import os
import subprocess
import sys
import tempfile
@@ -28,6 +29,7 @@ TEST_URL = 'https://example.com'
# Module-level cache for binary path
_forumdl_binary_path = None
_forumdl_lib_root = None
def get_forumdl_binary_path():
"""Get the installed forum-dl binary path from cache or by running installation."""
@@ -50,11 +52,48 @@ def get_forumdl_binary_path():
except Exception:
pass
# If not found, try to install via pip
pip_hook = PLUGINS_ROOT / 'pip' / 'on_Binary__install_using_pip_provider.py'
# If not found, try to install via pip using the crawl hook overrides
pip_hook = PLUGINS_ROOT / 'pip' / 'on_Binary__11_pip_install.py'
crawl_hook = PLUGIN_DIR / 'on_Crawl__25_forumdl_install.py'
if pip_hook.exists():
binary_id = str(uuid.uuid4())
machine_id = str(uuid.uuid4())
overrides = None
if crawl_hook.exists():
crawl_result = subprocess.run(
[sys.executable, str(crawl_hook)],
capture_output=True,
text=True,
timeout=30,
)
for crawl_line in crawl_result.stdout.strip().split('\n'):
if crawl_line.strip().startswith('{'):
try:
crawl_record = json.loads(crawl_line)
if crawl_record.get('type') == 'Binary' and crawl_record.get('name') == 'forum-dl':
overrides = crawl_record.get('overrides')
break
except json.JSONDecodeError:
continue
# Create a persistent temp LIB_DIR for the pip provider
import platform
global _forumdl_lib_root
if not _forumdl_lib_root:
_forumdl_lib_root = tempfile.mkdtemp(prefix='forumdl-lib-')
machine = platform.machine().lower()
system = platform.system().lower()
if machine in ('arm64', 'aarch64'):
machine = 'arm64'
elif machine in ('x86_64', 'amd64'):
machine = 'x86_64'
machine_type = f"{machine}-{system}"
lib_dir = Path(_forumdl_lib_root) / 'lib' / machine_type
lib_dir.mkdir(parents=True, exist_ok=True)
env = os.environ.copy()
env['LIB_DIR'] = str(lib_dir)
env['DATA_DIR'] = str(Path(_forumdl_lib_root) / 'data')
cmd = [
sys.executable, str(pip_hook),
@@ -62,12 +101,15 @@ def get_forumdl_binary_path():
'--machine-id', machine_id,
'--name', 'forum-dl'
]
if overrides:
cmd.append(f'--overrides={json.dumps(overrides)}')
install_result = subprocess.run(
cmd,
capture_output=True,
text=True,
timeout=300
timeout=300,
env=env,
)
# Parse Binary from pip installation
@@ -212,8 +254,7 @@ def test_real_forum_url():
import os
binary_path = get_forumdl_binary_path()
if not binary_path:
pytest.skip("forum-dl binary not available")
assert binary_path, "forum-dl binary not available"
assert Path(binary_path).is_file(), f"Binary must be a valid file: {binary_path}"
with tempfile.TemporaryDirectory() as tmpdir:

View File

@@ -19,7 +19,7 @@ import pytest
PLUGIN_DIR = Path(__file__).parent.parent
GIT_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_git.*'), None)
TEST_URL = 'https://github.com/example/repo.git'
TEST_URL = 'https://github.com/ArchiveBox/abx-pkg.git'
def test_hook_script_exists():
assert GIT_HOOK.exists()
@@ -31,10 +31,7 @@ def test_verify_deps_with_abx_pkg():
git_binary = Binary(name='git', binproviders=[AptProvider(), BrewProvider(), EnvProvider()])
git_loaded = git_binary.load()
if git_loaded and git_loaded.abspath:
assert True, "git is available"
else:
pass
assert git_loaded and git_loaded.abspath, "git is required for git plugin tests"
def test_reports_missing_git():
with tempfile.TemporaryDirectory() as tmpdir:
@@ -48,9 +45,7 @@ def test_reports_missing_git():
assert 'DEPENDENCY_NEEDED' in combined or 'git' in combined.lower() or 'ERROR=' in combined
def test_handles_non_git_url():
pass
if not shutil.which('git'):
pass
assert shutil.which('git'), "git binary not available"
with tempfile.TemporaryDirectory() as tmpdir:
result = subprocess.run(
@@ -83,8 +78,7 @@ def test_real_git_repo():
"""Test that git can clone a real GitHub repository."""
import os
if not shutil.which('git'):
pytest.skip("git binary not available")
assert shutil.which('git'), "git binary not available"
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)

View File

@@ -3,13 +3,13 @@
"type": "object",
"additionalProperties": false,
"properties": {
"MERKLETREE_ENABLED": {
"HASHES_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["SAVE_MERKLETREE", "USE_MERKLETREE"],
"x-aliases": ["SAVE_HASHES", "USE_HASHES"],
"description": "Enable merkle tree hash generation"
},
"MERKLETREE_TIMEOUT": {
"HASHES_TIMEOUT": {
"type": "integer",
"default": 30,
"minimum": 5,

View File

@@ -1,16 +1,16 @@
#!/usr/bin/env python3
"""
Create a Merkle tree of all archived outputs.
Create a hashed Merkle tree of all archived outputs.
This plugin runs after all extractors complete (priority 93) and generates
a cryptographic Merkle tree of all files in the snapshot directory.
a cryptographic Merkle hash tree of all files in the snapshot directory.
Output: merkletree.json containing root_hash, tree structure, file list, metadata
Output: hashes.json containing root_hash, tree structure, file list, metadata
Usage: on_Snapshot__93_merkletree.py --url=<url> --snapshot-id=<uuid>
Usage: on_Snapshot__93_hashes.py --url=<url> --snapshot-id=<uuid>
Environment variables:
SAVE_MERKLETREE: Enable merkle tree generation (default: true)
SAVE_HASHES: Enable hash merkle tree generation (default: true)
DATA_DIR: ArchiveBox data directory
ARCHIVE_DIR: Archive output directory
"""
@@ -45,7 +45,7 @@ def sha256_data(data: bytes) -> str:
def collect_files(snapshot_dir: Path, exclude_dirs: Optional[List[str]] = None) -> List[Tuple[Path, str, int]]:
"""Recursively collect all files in snapshot directory."""
exclude_dirs = exclude_dirs or ['merkletree', '.git', '__pycache__']
exclude_dirs = exclude_dirs or ['hashes', '.git', '__pycache__']
files = []
for root, dirs, filenames in os.walk(snapshot_dir):
@@ -94,8 +94,8 @@ def build_merkle_tree(file_hashes: List[str]) -> Tuple[str, List[List[str]]]:
return root_hash, tree_levels
def create_merkle_tree(snapshot_dir: Path) -> Dict[str, Any]:
"""Create a complete Merkle tree of all files in snapshot directory."""
def create_hashes(snapshot_dir: Path) -> Dict[str, Any]:
"""Create a complete Merkle hash tree of all files in snapshot directory."""
files = collect_files(snapshot_dir)
file_hashes = [file_hash for _, file_hash, _ in files]
root_hash, tree_levels = build_merkle_tree(file_hashes)
@@ -132,14 +132,14 @@ def main(url: str, snapshot_id: str):
try:
# Check if enabled
save_merkletree = os.getenv('MERKLETREE_ENABLED', 'true').lower() in ('true', '1', 'yes', 'on')
save_hashes = os.getenv('HASHES_ENABLED', 'true').lower() in ('true', '1', 'yes', 'on')
if not save_merkletree:
if not save_hashes:
status = 'skipped'
click.echo(json.dumps({'status': status, 'output': 'MERKLETREE_ENABLED=false'}))
click.echo(json.dumps({'status': status, 'output': 'HASHES_ENABLED=false'}))
sys.exit(0)
# Working directory is the extractor output dir (e.g., <snapshot>/merkletree/)
# Working directory is the extractor output dir (e.g., <snapshot>/hashes/)
# Parent is the snapshot directory
output_dir = Path.cwd()
snapshot_dir = output_dir.parent
@@ -149,17 +149,17 @@ def main(url: str, snapshot_id: str):
# Ensure output directory exists
output_dir.mkdir(exist_ok=True)
output_path = output_dir / 'merkletree.json'
output_path = output_dir / 'hashes.json'
# Generate Merkle tree
merkle_data = create_merkle_tree(snapshot_dir)
merkle_data = create_hashes(snapshot_dir)
# Write output
with open(output_path, 'w', encoding='utf-8') as f:
json.dump(merkle_data, f, indent=2)
status = 'succeeded'
output = 'merkletree.json'
output = 'hashes.json'
root_hash = merkle_data['root_hash']
file_count = merkle_data['metadata']['file_count']

View File

@@ -0,0 +1 @@
<span class="abx-output-icon abx-output-icon--hashes" title="Authenticity Hashes"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="5" r="2"/><circle cx="6" cy="18" r="2"/><circle cx="18" cy="18" r="2"/><path d="M12 7v6"/><path d="M12 13l-4 3"/><path d="M12 13l4 3"/></svg></span>

View File

@@ -1,5 +1,5 @@
"""
Tests for the merkletree plugin.
Tests for the hashes plugin.
Tests the real merkle tree generation with actual files.
"""
@@ -15,27 +15,27 @@ import pytest
from django.test import TestCase
# Get the path to the merkletree hook
# Get the path to the hashes hook
PLUGIN_DIR = Path(__file__).parent.parent
MERKLETREE_HOOK = PLUGIN_DIR / 'on_Snapshot__93_merkletree.py'
HASHES_HOOK = PLUGIN_DIR / 'on_Snapshot__93_hashes.py'
class TestMerkletreePlugin(TestCase):
"""Test the merkletree plugin."""
class TestHashesPlugin(TestCase):
"""Test the hashes plugin."""
def test_merkletree_hook_exists(self):
"""Merkletree hook script should exist."""
self.assertTrue(MERKLETREE_HOOK.exists(), f"Hook not found: {MERKLETREE_HOOK}")
def test_hashes_hook_exists(self):
"""Hashes hook script should exist."""
self.assertTrue(HASHES_HOOK.exists(), f"Hook not found: {HASHES_HOOK}")
def test_merkletree_generates_tree_for_files(self):
"""Merkletree hook should generate merkle tree for files in snapshot directory."""
def test_hashes_generates_tree_for_files(self):
"""Hashes hook should generate merkle tree for files in snapshot directory."""
with tempfile.TemporaryDirectory() as temp_dir:
# Create a mock snapshot directory structure
snapshot_dir = Path(temp_dir) / 'snapshot'
snapshot_dir.mkdir()
# Create output directory for merkletree
output_dir = snapshot_dir / 'merkletree'
# Create output directory for hashes
output_dir = snapshot_dir / 'hashes'
output_dir.mkdir()
# Create some test files
@@ -48,11 +48,11 @@ class TestMerkletreePlugin(TestCase):
# Run the hook from the output directory
env = os.environ.copy()
env['MERKLETREE_ENABLED'] = 'true'
env['HASHES_ENABLED'] = 'true'
result = subprocess.run(
[
sys.executable, str(MERKLETREE_HOOK),
sys.executable, str(HASHES_HOOK),
'--url=https://example.com',
'--snapshot-id=test-snapshot',
],
@@ -67,8 +67,8 @@ class TestMerkletreePlugin(TestCase):
self.assertEqual(result.returncode, 0, f"Hook failed: {result.stderr}")
# Check output file exists
output_file = output_dir / 'merkletree.json'
self.assertTrue(output_file.exists(), "merkletree.json not created")
output_file = output_dir / 'hashes.json'
self.assertTrue(output_file.exists(), "hashes.json not created")
# Parse and verify output
with open(output_file) as f:
@@ -87,20 +87,20 @@ class TestMerkletreePlugin(TestCase):
self.assertGreater(data['metadata']['file_count'], 0)
self.assertGreater(data['metadata']['total_size'], 0)
def test_merkletree_skips_when_disabled(self):
"""Merkletree hook should skip when MERKLETREE_ENABLED=false."""
def test_hashes_skips_when_disabled(self):
"""Hashes hook should skip when HASHES_ENABLED=false."""
with tempfile.TemporaryDirectory() as temp_dir:
snapshot_dir = Path(temp_dir) / 'snapshot'
snapshot_dir.mkdir()
output_dir = snapshot_dir / 'merkletree'
output_dir = snapshot_dir / 'hashes'
output_dir.mkdir()
env = os.environ.copy()
env['MERKLETREE_ENABLED'] = 'false'
env['HASHES_ENABLED'] = 'false'
result = subprocess.run(
[
sys.executable, str(MERKLETREE_HOOK),
sys.executable, str(HASHES_HOOK),
'--url=https://example.com',
'--snapshot-id=test-snapshot',
],
@@ -115,20 +115,20 @@ class TestMerkletreePlugin(TestCase):
self.assertEqual(result.returncode, 0)
self.assertIn('skipped', result.stdout)
def test_merkletree_handles_empty_directory(self):
"""Merkletree hook should handle empty snapshot directory."""
def test_hashes_handles_empty_directory(self):
"""Hashes hook should handle empty snapshot directory."""
with tempfile.TemporaryDirectory() as temp_dir:
snapshot_dir = Path(temp_dir) / 'snapshot'
snapshot_dir.mkdir()
output_dir = snapshot_dir / 'merkletree'
output_dir = snapshot_dir / 'hashes'
output_dir.mkdir()
env = os.environ.copy()
env['MERKLETREE_ENABLED'] = 'true'
env['HASHES_ENABLED'] = 'true'
result = subprocess.run(
[
sys.executable, str(MERKLETREE_HOOK),
sys.executable, str(HASHES_HOOK),
'--url=https://example.com',
'--snapshot-id=test-snapshot',
],
@@ -143,7 +143,7 @@ class TestMerkletreePlugin(TestCase):
self.assertEqual(result.returncode, 0, f"Hook failed: {result.stderr}")
# Check output file exists
output_file = output_dir / 'merkletree.json'
output_file = output_dir / 'hashes.json'
self.assertTrue(output_file.exists())
with open(output_file) as f:

View File

@@ -0,0 +1,247 @@
#!/usr/bin/env node
/**
* Capture original request + response headers for the main navigation.
*
* This hook sets up CDP listeners BEFORE chrome_navigate loads the page,
* then waits for navigation to complete. It records the first top-level
* request headers and the corresponding response headers (with :status).
*
* Usage: on_Snapshot__27_headers.bg.js --url=<url> --snapshot-id=<uuid>
* Output: Writes headers.json
*/
const fs = require('fs');
const path = require('path');
// Add NODE_MODULES_DIR to module resolution paths if set
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const puppeteer = require('puppeteer-core');
// Import shared utilities from chrome_utils.js
const {
getEnvBool,
getEnvInt,
parseArgs,
connectToPage,
waitForPageLoaded,
} = require('../chrome/chrome_utils.js');
const PLUGIN_NAME = 'headers';
const OUTPUT_DIR = '.';  // hook is already run inside its own output dir
const OUTPUT_FILE = 'headers.json';
const CHROME_SESSION_DIR = '../chrome';  // sibling dir managed by the chrome plugin
const CHROME_SESSION_REQUIRED_ERROR = 'No Chrome session found (chrome plugin must run first)';
// Mutable module state shared between the CDP listeners and the shutdown path.
let browser = null;        // puppeteer connection to the shared Chrome session
let page = null;           // page that chrome_navigate drives
let client = null;         // CDP session used for Network.* events
let shuttingDown = false;  // guards against emitting ArchiveResult twice
let headersWritten = false; // guards against writing headers.json twice
// First top-level request/response pair being tracked:
let requestId = null;
let requestUrl = null;
let requestHeaders = null;
let responseHeaders = null;
let responseStatus = null;
let responseStatusText = null;
let responseUrl = null;
let originalUrl = null;    // --url argument as given on the command line
function getFinalUrl() {
    // Prefer the post-redirect URL that chrome_navigate recorded on disk;
    // otherwise fall back to the live page URL (null when not connected).
    const recordedPath = path.join(CHROME_SESSION_DIR, 'final_url.txt');
    if (!fs.existsSync(recordedPath)) {
        return page ? page.url() : null;
    }
    return fs.readFileSync(recordedPath, 'utf8').trim();
}
function writeHeadersFile() {
    // Write headers.json exactly once, and only after response headers exist.
    if (headersWritten) return;
    if (!responseHeaders) return;
    const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
    // Clone the response headers and inject an HTTP/2-style ':status'
    // pseudo-header when the capture did not already include one.
    const responseHeadersWithStatus = {
        ...(responseHeaders || {}),
    };
    if (responseStatus !== null && responseStatus !== undefined &&
        responseHeadersWithStatus[':status'] === undefined) {
        responseHeadersWithStatus[':status'] = String(responseStatus);
    }
    const record = {
        url: requestUrl || originalUrl,
        final_url: getFinalUrl(),
        status: responseStatus !== undefined ? responseStatus : null,
        request_headers: requestHeaders || {},
        response_headers: responseHeadersWithStatus,
        headers: responseHeadersWithStatus, // backwards compatibility
    };
    // Optional fields: only present when the CDP events supplied them.
    if (responseStatusText) {
        record.statusText = responseStatusText;
    }
    if (responseUrl) {
        record.response_url = responseUrl;
    }
    fs.writeFileSync(outputPath, JSON.stringify(record, null, 2));
    headersWritten = true;
}
async function setupListener(url) {
    // Connect to the shared Chrome session and register CDP network listeners
    // BEFORE chrome_navigate loads the page, so the very first top-level
    // request/response pair is captured. Throws when no live session exists.
    // Returns { browser, page } for the connected session.
    // (The url parameter is currently unused; tracking keys off request type.)
    const timeout = getEnvInt('HEADERS_TIMEOUT', getEnvInt('TIMEOUT', 30)) * 1000;
    const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
    const targetIdFile = path.join(CHROME_SESSION_DIR, 'target_id.txt');
    const pidFile = path.join(CHROME_SESSION_DIR, 'chrome.pid');
    if (!fs.existsSync(cdpFile) || !fs.existsSync(targetIdFile) || !fs.existsSync(pidFile)) {
        throw new Error(CHROME_SESSION_REQUIRED_ERROR);
    }
    try {
        // Signal 0 performs a liveness check only: verifies the recorded
        // Chrome process still exists without actually signalling it.
        const pid = parseInt(fs.readFileSync(pidFile, 'utf8').trim(), 10);
        if (!pid || Number.isNaN(pid)) throw new Error('Invalid pid');
        process.kill(pid, 0);
    } catch (e) {
        throw new Error(CHROME_SESSION_REQUIRED_ERROR);
    }
    // NOTE: these locals intentionally shadow the module-level browser/page;
    // the caller copies them back into module state after this returns.
    const { browser, page } = await connectToPage({
        chromeSessionDir: CHROME_SESSION_DIR,
        timeoutMs: timeout,
        puppeteer,
    });
    client = await page.target().createCDPSession();
    await client.send('Network.enable');
    client.on('Network.requestWillBeSent', (params) => {
        try {
            // A redirect of the tracked request carries the previous hop's
            // headers in redirectResponse -- record those as the response.
            if (requestId && !responseHeaders && params.redirectResponse && params.requestId === requestId) {
                responseHeaders = params.redirectResponse.headers || {};
                responseStatus = params.redirectResponse.status || null;
                responseStatusText = params.redirectResponse.statusText || null;
                responseUrl = params.redirectResponse.url || null;
                writeHeadersFile();
            }
            // Track only the first top-level (Document) http(s) request.
            if (requestId) return;
            if (params.type && params.type !== 'Document') return;
            if (!params.request || !params.request.url) return;
            if (!params.request.url.startsWith('http')) return;
            requestId = params.requestId;
            requestUrl = params.request.url;
            requestHeaders = params.request.headers || {};
        } catch (e) {
            // Ignore errors
        }
    });
    client.on('Network.responseReceived', (params) => {
        try {
            // First response for the tracked request wins; later ones ignored.
            if (!requestId || params.requestId !== requestId || responseHeaders) return;
            const response = params.response || {};
            responseHeaders = response.headers || {};
            responseStatus = response.status || null;
            responseStatusText = response.statusText || null;
            responseUrl = response.url || null;
            writeHeadersFile();
        } catch (e) {
            // Ignore errors
        }
    });
    return { browser, page };
}
function emitResult(status = 'succeeded', outputStr = OUTPUT_FILE) {
    // Emit exactly one ArchiveResult JSONL record per process lifetime.
    if (shuttingDown) return;
    shuttingDown = true;
    const record = {
        type: 'ArchiveResult',
        status,
        output_str: outputStr,
    };
    console.log(JSON.stringify(record));
}
async function handleShutdown(signal) {
    // SIGTERM/SIGINT handler: flush any captured headers, emit the final
    // ArchiveResult, disconnect (never close) the shared browser, and exit
    // 0 only if headers.json was actually written.
    console.error(`\nReceived ${signal}, emitting final results...`);
    if (!headersWritten) {
        writeHeadersFile();
    }
    if (headersWritten) {
        emitResult('succeeded', OUTPUT_FILE);
    } else {
        emitResult('failed', 'No headers captured');
    }
    if (browser) {
        try {
            // disconnect, not close: the Chrome session is shared with other hooks.
            browser.disconnect();
        } catch (e) {}
    }
    process.exit(headersWritten ? 0 : 1);
}
async function main() {
    // Entrypoint: parse CLI args, attach CDP listeners before navigation,
    // then stay alive until the orchestrator sends SIGTERM.
    const args = parseArgs();
    const url = args.url;
    const snapshotId = args.snapshot_id;
    if (!url || !snapshotId) {
        console.error('Usage: on_Snapshot__27_headers.bg.js --url=<url> --snapshot-id=<uuid>');
        process.exit(1);
    }
    originalUrl = url;
    if (!getEnvBool('HEADERS_ENABLED', true)) {
        console.error('Skipping (HEADERS_ENABLED=False)');
        console.log(JSON.stringify({type: 'ArchiveResult', status: 'skipped', output_str: 'HEADERS_ENABLED=False'}));
        process.exit(0);
    }
    try {
        // Set up listeners BEFORE navigation
        const connection = await setupListener(url);
        browser = connection.browser;
        page = connection.page;
        // Register signal handlers for graceful shutdown
        process.on('SIGTERM', () => handleShutdown('SIGTERM'));
        process.on('SIGINT', () => handleShutdown('SIGINT'));
        // Wait for chrome_navigate to complete (non-fatal: headers may still
        // have been captured even if the load signal never arrives)
        try {
            const timeout = getEnvInt('HEADERS_TIMEOUT', getEnvInt('TIMEOUT', 30)) * 1000;
            await waitForPageLoaded(CHROME_SESSION_DIR, timeout * 4, 200);
        } catch (e) {
            console.error(`WARN: ${e.message}`);
        }
        // Keep alive until SIGTERM (handleShutdown performs the exit)
        await new Promise(() => {});
        return;
    } catch (e) {
        const errorMessage = (e && e.message)
            ? `${e.name || 'Error'}: ${e.message}`
            : String(e || 'Unknown error');
        console.error(`ERROR: ${errorMessage}`);
        console.log(JSON.stringify({
            type: 'ArchiveResult',
            status: 'failed',
            output_str: errorMessage,
        }));
        process.exit(1);
    }
}
// Top-level safety net: any unhandled rejection exits non-zero.
main().catch(e => {
    console.error(`Fatal error: ${e.message}`);
    process.exit(1);
});

View File

@@ -1,161 +0,0 @@
#!/usr/bin/env node
/**
* Extract HTTP response headers for a URL.
*
* If a Chrome session exists (from chrome plugin), reads the captured
* response headers from chrome plugin/response_headers.json.
* Otherwise falls back to making an HTTP HEAD request.
*
* Usage: on_Snapshot__55_headers.js --url=<url> --snapshot-id=<uuid>
* Output: Writes headers/headers.json
*
* Environment variables:
* TIMEOUT: Timeout in seconds (default: 30)
* USER_AGENT: User agent string (optional)
* CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: true)
*/
const fs = require('fs');
const path = require('path');
const https = require('https');
const http = require('http');
// Add NODE_MODULES_DIR to module resolution paths if set
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const {
getEnv,
getEnvBool,
getEnvInt,
parseArgs,
} = require('../chrome/chrome_utils.js');
// Extractor metadata
const PLUGIN_NAME = 'headers';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'headers.json';
const CHROME_SESSION_DIR = '../chrome';
const CHROME_HEADERS_FILE = 'response_headers.json';
// Get headers from chrome plugin if available
function getHeadersFromChromeSession() {
    // Return the parsed response_headers.json left behind by the chrome
    // plugin, or null when the file is absent or not valid JSON.
    const headersFile = path.join(CHROME_SESSION_DIR, CHROME_HEADERS_FILE);
    if (!fs.existsSync(headersFile)) {
        return null;
    }
    try {
        return JSON.parse(fs.readFileSync(headersFile, 'utf8'));
    } catch (e) {
        return null;
    }
}
// Fetch headers via HTTP HEAD request (fallback)
function fetchHeaders(url) {
    // Fallback path: issue a plain HTTP HEAD request and resolve with the
    // response status + headers. Rejects on network error or timeout.
    return new Promise((resolve, reject) => {
        const timeout = getEnvInt('TIMEOUT', 30) * 1000;
        const userAgent = getEnv('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)');
        // Fixed: the original read CHECK_SSL_VALIDITY twice via a nested
        // getEnvBool call with itself as the default; a single lookup with
        // default=true is equivalent.
        const checkSsl = getEnvBool('CHECK_SSL_VALIDITY', true);
        const parsedUrl = new URL(url);
        const client = parsedUrl.protocol === 'https:' ? https : http;
        const options = {
            method: 'HEAD',
            hostname: parsedUrl.hostname,
            port: parsedUrl.port || (parsedUrl.protocol === 'https:' ? 443 : 80),
            path: parsedUrl.pathname + parsedUrl.search,
            headers: { 'User-Agent': userAgent },
            timeout,
            rejectUnauthorized: checkSsl,
        };
        const req = client.request(options, (res) => {
            resolve({
                url: url,
                status: res.statusCode,
                statusText: res.statusMessage,
                headers: res.headers,
            });
        });
        req.on('error', reject);
        req.on('timeout', () => {
            // Destroy the socket so the process doesn't hang on a dead server.
            req.destroy();
            reject(new Error('Request timeout'));
        });
        req.end();
    });
}
async function extractHeaders(url) {
    // Prefer headers already captured by the chrome plugin; fall back to a
    // direct HTTP HEAD request when no Chrome capture is available.
    // Output directory is current directory (hook already runs in output dir)
    const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
    // Try Chrome session first
    const chromeHeaders = getHeadersFromChromeSession();
    if (chromeHeaders && chromeHeaders.headers) {
        fs.writeFileSync(outputPath, JSON.stringify(chromeHeaders, null, 2), 'utf8');
        return { success: true, output: outputPath, method: 'chrome', status: chromeHeaders.status };
    }
    // Fallback to HTTP HEAD request
    try {
        const headers = await fetchHeaders(url);
        fs.writeFileSync(outputPath, JSON.stringify(headers, null, 2), 'utf8');
        return { success: true, output: outputPath, method: 'http', status: headers.status };
    } catch (e) {
        return { success: false, error: e.message };
    }
}
async function main() {
    // CLI entrypoint: parse args, extract headers, print one ArchiveResult
    // JSONL record to stdout, then exit 0 on success / 1 on failure.
    const args = parseArgs();
    const url = args.url;
    const snapshotId = args.snapshot_id;
    if (!url || !snapshotId) {
        console.error('Usage: on_Snapshot__55_headers.js --url=<url> --snapshot-id=<uuid>');
        process.exit(1);
    }
    // NOTE(review): startTs/endTs are computed but never used -- presumably
    // kept for timing parity with other hooks; confirm before removing.
    const startTs = new Date();
    let status = 'failed';
    let output = null;
    let error = '';
    try {
        const result = await extractHeaders(url);
        if (result.success) {
            status = 'succeeded';
            output = result.output;
            console.log(`Headers extracted (${result.method}): HTTP ${result.status}`);
        } else {
            status = 'failed';
            error = result.error;
        }
    } catch (e) {
        error = `${e.name}: ${e.message}`;
        status = 'failed';
    }
    const endTs = new Date();
    if (error) console.error(`ERROR: ${error}`);
    // Output clean JSONL (no RESULT_JSON= prefix)
    console.log(JSON.stringify({
        type: 'ArchiveResult',
        status,
        output_str: output || error || '',
    }));
    process.exit(status === 'succeeded' ? 0 : 1);
}
// Top-level safety net: any unhandled rejection exits non-zero.
main().catch(e => {
    console.error(`Fatal error: ${e.message}`);
    process.exit(1);
});

View File

@@ -7,23 +7,68 @@ Tests verify:
2. Node.js is available
3. Headers extraction works for real example.com
4. Output JSON contains actual HTTP headers
5. HTTP fallback works correctly
6. Config options work (TIMEOUT, USER_AGENT)
5. Config options work (TIMEOUT, USER_AGENT)
"""
import json
import shutil
import subprocess
import tempfile
import time
from pathlib import Path
import pytest
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
CHROME_NAVIGATE_HOOK,
get_test_env,
chrome_session,
)
PLUGIN_DIR = Path(__file__).parent.parent
HEADERS_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_headers.*'), None)
TEST_URL = 'https://example.com'
def normalize_root_url(url: str) -> str:
    """Strip every trailing slash so root URLs compare equal."""
    while url.endswith('/'):
        url = url[:-1]
    return url
def run_headers_capture(headers_dir, snapshot_chrome_dir, env, url, snapshot_id):
    """Run the background headers hook alongside chrome_navigate.

    Starts the headers hook first (so its CDP listeners are registered
    before navigation), runs the navigate hook to completion, waits for
    headers.json to appear, then terminates the background hook.

    Returns (hook_returncode, stdout, stderr, nav_result, headers_file).
    """
    hook_proc = subprocess.Popen(
        ['node', str(HEADERS_HOOK), f'--url={url}', f'--snapshot-id={snapshot_id}'],
        cwd=headers_dir,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        env=env,
    )
    nav_result = subprocess.run(
        ['node', str(CHROME_NAVIGATE_HOOK), f'--url={url}', f'--snapshot-id={snapshot_id}'],
        cwd=snapshot_chrome_dir,
        capture_output=True,
        text=True,
        timeout=120,
        env=env,
    )
    headers_file = headers_dir / 'headers.json'
    # Poll up to ~60s for the background hook to flush headers.json.
    for _ in range(60):
        if headers_file.exists() and headers_file.stat().st_size > 0:
            break
        time.sleep(1)
    # Stop the background hook and collect its stdio.
    if hook_proc.poll() is None:
        hook_proc.terminate()
        try:
            stdout, stderr = hook_proc.communicate(timeout=5)
        except subprocess.TimeoutExpired:
            hook_proc.kill()
            stdout, stderr = hook_proc.communicate()
    else:
        stdout, stderr = hook_proc.communicate()
    return hook_proc.returncode, stdout, stderr, nav_result, headers_file
def test_hook_script_exists():
"""Verify hook script exists."""
@@ -66,21 +111,25 @@ def test_extracts_headers_from_example_com():
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
# Run headers extraction
result = subprocess.run(
['node', str(HEADERS_HOOK), f'--url={TEST_URL}', '--snapshot-id=test789'],
cwd=tmpdir,
capture_output=True,
text=True,
timeout=60
,
env=get_test_env())
with chrome_session(tmpdir, test_url=TEST_URL, navigate=False) as (_process, _pid, snapshot_chrome_dir, env):
headers_dir = snapshot_chrome_dir.parent / 'headers'
headers_dir.mkdir(exist_ok=True)
assert result.returncode == 0, f"Extraction failed: {result.stderr}"
result = run_headers_capture(
headers_dir,
snapshot_chrome_dir,
env,
TEST_URL,
'test789',
)
hook_code, stdout, stderr, nav_result, headers_file = result
assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}"
assert hook_code == 0, f"Extraction failed: {stderr}"
# Parse clean JSONL output
result_json = None
for line in result.stdout.strip().split('\n'):
for line in stdout.strip().split('\n'):
line = line.strip()
if line.startswith('{'):
pass
@@ -96,28 +145,36 @@ def test_extracts_headers_from_example_com():
assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
# Verify output file exists (hook writes to current directory)
headers_file = tmpdir / 'headers.json'
assert headers_file.exists(), "headers.json not created"
# Verify headers JSON contains REAL example.com response
headers_data = json.loads(headers_file.read_text())
assert 'url' in headers_data, "Should have url field"
assert headers_data['url'] == TEST_URL, f"URL should be {TEST_URL}"
assert normalize_root_url(headers_data['url']) == normalize_root_url(TEST_URL), f"URL should be {TEST_URL}"
assert 'status' in headers_data, "Should have status field"
assert headers_data['status'] in [200, 301, 302], \
f"Should have valid HTTP status, got {headers_data['status']}"
assert 'request_headers' in headers_data, "Should have request_headers field"
assert isinstance(headers_data['request_headers'], dict), "Request headers should be a dict"
assert 'response_headers' in headers_data, "Should have response_headers field"
assert isinstance(headers_data['response_headers'], dict), "Response headers should be a dict"
assert len(headers_data['response_headers']) > 0, "Response headers dict should not be empty"
assert 'headers' in headers_data, "Should have headers field"
assert isinstance(headers_data['headers'], dict), "Headers should be a dict"
assert len(headers_data['headers']) > 0, "Headers dict should not be empty"
# Verify common HTTP headers are present
headers_lower = {k.lower(): v for k, v in headers_data['headers'].items()}
headers_lower = {k.lower(): v for k, v in headers_data['response_headers'].items()}
assert 'content-type' in headers_lower or 'content-length' in headers_lower, \
"Should have at least one common HTTP header"
assert headers_data['response_headers'].get(':status') == str(headers_data['status']), \
"Response headers should include :status pseudo header"
def test_headers_output_structure():
"""Test that headers plugin produces correctly structured output."""
@@ -128,21 +185,25 @@ def test_headers_output_structure():
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
# Run headers extraction against real example.com
result = subprocess.run(
['node', str(HEADERS_HOOK), f'--url={TEST_URL}', '--snapshot-id=testformat'],
cwd=tmpdir,
capture_output=True,
text=True,
timeout=60
,
env=get_test_env())
with chrome_session(tmpdir, test_url=TEST_URL, navigate=False) as (_process, _pid, snapshot_chrome_dir, env):
headers_dir = snapshot_chrome_dir.parent / 'headers'
headers_dir.mkdir(exist_ok=True)
assert result.returncode == 0, f"Extraction failed: {result.stderr}"
result = run_headers_capture(
headers_dir,
snapshot_chrome_dir,
env,
TEST_URL,
'testformat',
)
hook_code, stdout, stderr, nav_result, headers_file = result
assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}"
assert hook_code == 0, f"Extraction failed: {stderr}"
# Parse clean JSONL output
result_json = None
for line in result.stdout.strip().split('\n'):
for line in stdout.strip().split('\n'):
line = line.strip()
if line.startswith('{'):
pass
@@ -158,27 +219,30 @@ def test_headers_output_structure():
assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
# Verify output structure
output_headers_file = tmpdir / 'headers.json'
assert output_headers_file.exists(), "Output headers.json not created"
assert headers_file.exists(), "Output headers.json not created"
output_data = json.loads(output_headers_file.read_text())
output_data = json.loads(headers_file.read_text())
# Verify all required fields are present
assert 'url' in output_data, "Output should have url field"
assert 'status' in output_data, "Output should have status field"
assert 'request_headers' in output_data, "Output should have request_headers field"
assert 'response_headers' in output_data, "Output should have response_headers field"
assert 'headers' in output_data, "Output should have headers field"
# Verify data types
assert isinstance(output_data['status'], int), "Status should be integer"
assert isinstance(output_data['request_headers'], dict), "Request headers should be dict"
assert isinstance(output_data['response_headers'], dict), "Response headers should be dict"
assert isinstance(output_data['headers'], dict), "Headers should be dict"
# Verify example.com returns expected headers
assert output_data['url'] == TEST_URL
assert normalize_root_url(output_data['url']) == normalize_root_url(TEST_URL)
assert output_data['status'] in [200, 301, 302]
def test_falls_back_to_http_when_chrome_unavailable():
"""Test that headers plugin falls back to HTTP HEAD when chrome unavailable."""
def test_fails_without_chrome_session():
"""Test that headers plugin fails when chrome session is missing."""
if not shutil.which('node'):
pass
@@ -186,8 +250,6 @@ def test_falls_back_to_http_when_chrome_unavailable():
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
# Don't create chrome directory - force HTTP fallback
# Run headers extraction
result = subprocess.run(
['node', str(HEADERS_HOOK), f'--url={TEST_URL}', '--snapshot-id=testhttp'],
@@ -198,34 +260,8 @@ def test_falls_back_to_http_when_chrome_unavailable():
,
env=get_test_env())
assert result.returncode == 0, f"Extraction failed: {result.stderr}"
# Parse clean JSONL output
result_json = None
for line in result.stdout.strip().split('\n'):
line = line.strip()
if line.startswith('{'):
pass
try:
record = json.loads(line)
if record.get('type') == 'ArchiveResult':
result_json = record
break
except json.JSONDecodeError:
pass
assert result_json, "Should have ArchiveResult JSONL output"
assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
# Verify output exists and has real HTTP headers
output_headers_file = tmpdir / 'headers.json'
assert output_headers_file.exists(), "Output headers.json not created"
output_data = json.loads(output_headers_file.read_text())
assert output_data['url'] == TEST_URL
assert output_data['status'] in [200, 301, 302]
assert isinstance(output_data['headers'], dict)
assert len(output_data['headers']) > 0
assert result.returncode != 0, "Should fail without chrome session"
assert 'No Chrome session found (chrome plugin must run first)' in (result.stdout + result.stderr)
def test_config_timeout_honored():
@@ -239,20 +275,26 @@ def test_config_timeout_honored():
# Set very short timeout (but example.com should still succeed)
import os
env = os.environ.copy()
env['TIMEOUT'] = '5'
env_override = os.environ.copy()
env_override['TIMEOUT'] = '5'
result = subprocess.run(
['node', str(HEADERS_HOOK), f'--url={TEST_URL}', '--snapshot-id=testtimeout'],
cwd=tmpdir,
capture_output=True,
text=True,
env=env,
timeout=30
)
with chrome_session(tmpdir, test_url=TEST_URL, navigate=False) as (_process, _pid, snapshot_chrome_dir, env):
headers_dir = snapshot_chrome_dir.parent / 'headers'
headers_dir.mkdir(exist_ok=True)
env.update(env_override)
result = run_headers_capture(
headers_dir,
snapshot_chrome_dir,
env,
TEST_URL,
'testtimeout',
)
# Should complete (success or fail, but not hang)
assert result.returncode in (0, 1), "Should complete without hanging"
hook_code, _stdout, _stderr, nav_result, _headers_file = result
assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}"
assert hook_code in (0, 1), "Should complete without hanging"
def test_config_user_agent():
@@ -266,23 +308,29 @@ def test_config_user_agent():
# Set custom user agent
import os
env = os.environ.copy()
env['USER_AGENT'] = 'TestBot/1.0'
env_override = os.environ.copy()
env_override['USER_AGENT'] = 'TestBot/1.0'
result = subprocess.run(
['node', str(HEADERS_HOOK), f'--url={TEST_URL}', '--snapshot-id=testua'],
cwd=tmpdir,
capture_output=True,
text=True,
env=env,
timeout=60
)
with chrome_session(tmpdir, test_url=TEST_URL, navigate=False) as (_process, _pid, snapshot_chrome_dir, env):
headers_dir = snapshot_chrome_dir.parent / 'headers'
headers_dir.mkdir(exist_ok=True)
env.update(env_override)
result = run_headers_capture(
headers_dir,
snapshot_chrome_dir,
env,
TEST_URL,
'testua',
)
# Should succeed (example.com doesn't block)
if result.returncode == 0:
hook_code, stdout, _stderr, nav_result, _headers_file = result
assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}"
if hook_code == 0:
# Parse clean JSONL output
result_json = None
for line in result.stdout.strip().split('\n'):
for line in stdout.strip().split('\n'):
line = line.strip()
if line.startswith('{'):
pass
@@ -307,20 +355,23 @@ def test_handles_https_urls():
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
result = subprocess.run(
['node', str(HEADERS_HOOK), '--url=https://example.org', '--snapshot-id=testhttps'],
cwd=tmpdir,
capture_output=True,
text=True,
timeout=60
,
env=get_test_env())
with chrome_session(tmpdir, test_url='https://example.org', navigate=False) as (_process, _pid, snapshot_chrome_dir, env):
headers_dir = snapshot_chrome_dir.parent / 'headers'
headers_dir.mkdir(exist_ok=True)
result = run_headers_capture(
headers_dir,
snapshot_chrome_dir,
env,
'https://example.org',
'testhttps',
)
if result.returncode == 0:
output_headers_file = tmpdir / 'headers.json'
if output_headers_file.exists():
output_data = json.loads(output_headers_file.read_text())
assert output_data['url'] == 'https://example.org'
hook_code, _stdout, _stderr, nav_result, headers_file = result
assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}"
if hook_code == 0:
if headers_file.exists():
output_data = json.loads(headers_file.read_text())
assert normalize_root_url(output_data['url']) == normalize_root_url('https://example.org')
assert output_data['status'] in [200, 301, 302]
@@ -333,21 +384,24 @@ def test_handles_404_gracefully():
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
result = subprocess.run(
['node', str(HEADERS_HOOK), '--url=https://example.com/nonexistent-page-404', '--snapshot-id=test404'],
cwd=tmpdir,
capture_output=True,
text=True,
timeout=60
,
env=get_test_env())
with chrome_session(tmpdir, test_url='https://example.com/nonexistent-page-404', navigate=False) as (_process, _pid, snapshot_chrome_dir, env):
headers_dir = snapshot_chrome_dir.parent / 'headers'
headers_dir.mkdir(exist_ok=True)
result = run_headers_capture(
headers_dir,
snapshot_chrome_dir,
env,
'https://example.com/nonexistent-page-404',
'test404',
)
# May succeed or fail depending on server behavior
# If it succeeds, verify 404 status is captured
if result.returncode == 0:
output_headers_file = tmpdir / 'headers.json'
if output_headers_file.exists():
output_data = json.loads(output_headers_file.read_text())
hook_code, _stdout, _stderr, nav_result, headers_file = result
assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}"
if hook_code == 0:
if headers_file.exists():
output_data = json.loads(headers_file.read_text())
assert output_data['status'] == 404, "Should capture 404 status"

View File

@@ -42,6 +42,7 @@ const puppeteer = require('puppeteer-core');
const PLUGIN_NAME = 'infiniscroll';
const CHROME_SESSION_DIR = '../chrome';
const CHROME_SESSION_REQUIRED_ERROR = 'No Chrome session found (chrome plugin must run first)';
function parseArgs() {
const args = {};
@@ -330,7 +331,7 @@ async function main() {
const cdpUrl = getCdpUrl();
if (!cdpUrl) {
console.error('ERROR: Chrome CDP URL not found (chrome plugin must run first)');
console.error(CHROME_SESSION_REQUIRED_ERROR);
process.exit(1);
}
@@ -363,10 +364,6 @@ async function main() {
page = pages[pages.length - 1];
}
// Set viewport to ensure proper page rendering
const resolution = getEnv('CHROME_RESOLUTION', '1440,2000').split(',').map(x => parseInt(x.trim(), 10));
await page.setViewport({ width: resolution[0] || 1440, height: resolution[1] || 2000 });
console.error(`Starting infinite scroll on ${url}`);
// Expand <details> and comments before scrolling (if enabled)

View File

@@ -79,10 +79,12 @@ def test_fails_gracefully_without_chrome_session():
"""Test that hook fails gracefully when no chrome session exists."""
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
infiniscroll_dir = tmpdir / 'snapshot' / 'infiniscroll'
infiniscroll_dir.mkdir(parents=True, exist_ok=True)
result = subprocess.run(
['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=test-no-chrome'],
cwd=tmpdir,
cwd=infiniscroll_dir,
capture_output=True,
text=True,
env=get_test_env(),

View File

@@ -16,6 +16,7 @@ import pytest
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
setup_test_env,
get_test_env,
launch_chromium_session,
kill_chromium_session,
CHROME_LAUNCH_HOOK,
@@ -291,8 +292,7 @@ const puppeteer = require('puppeteer-core');
result = subprocess.run(
['node', str(script_path)],
cwd=str(tmpdir,
env=get_test_env()),
cwd=str(tmpdir),
capture_output=True,
text=True,
env=env,
@@ -444,8 +444,7 @@ const puppeteer = require('puppeteer-core');
result = subprocess.run(
['node', str(script_path)],
cwd=str(script_dir,
env=get_test_env()),
cwd=str(script_dir),
capture_output=True,
text=True,
env=env,
@@ -539,7 +538,7 @@ def test_hides_cookie_consent_on_filmin():
print(f"HTML has cookie keywords: {baseline_result.get('has_cookie_keyword_in_html')}")
print(f"HTML snippet: {baseline_result.get('html_snippet', '')[:200]}")
pytest.skip(
pytest.fail(
f"Cannot test extension: no cookie consent visible in baseline on {TEST_URL}. "
f"Elements found: {len(baseline_result['elements_found'])}. "
f"The site may have changed or cookie consent may be region-specific."
@@ -559,8 +558,7 @@ def test_hides_cookie_consent_on_filmin():
result = subprocess.run(
['node', str(INSTALL_SCRIPT)],
cwd=str(tmpdir,
env=get_test_env()),
cwd=str(tmpdir),
capture_output=True,
text=True,
env=env_with_ext,

View File

@@ -15,11 +15,13 @@ Environment variables:
Note: Requires postlight-parser: npm install -g @postlight/parser
"""
import html
import json
import os
import subprocess
import sys
from pathlib import Path
from urllib.parse import urlparse
import rich_click as click
@@ -115,13 +117,39 @@ def extract_mercury(url: str, binary: str) -> tuple[bool, str | None, str]:
# Save HTML content and metadata
html_content = html_json.pop('content', '')
# Some sources return HTML-escaped markup inside the content blob.
# If it looks heavily escaped, unescape once so it renders properly.
if html_content:
escaped_count = html_content.count('&lt;') + html_content.count('&gt;')
tag_count = html_content.count('<')
if escaped_count and escaped_count > tag_count * 2:
html_content = html.unescape(html_content)
(output_dir / 'content.html').write_text(html_content, encoding='utf-8')
# Save article metadata
metadata = {k: v for k, v in text_json.items() if k != 'content'}
(output_dir / 'article.json').write_text(json.dumps(metadata, indent=2), encoding='utf-8')
return True, OUTPUT_DIR, ''
# Link images/ to responses capture (if available)
try:
hostname = urlparse(url).hostname or ''
if hostname:
responses_images = (output_dir / '..' / 'responses' / 'image' / hostname / 'images').resolve()
link_path = output_dir / 'images'
if responses_images.exists() and responses_images.is_dir():
if link_path.exists() or link_path.is_symlink():
if link_path.is_symlink() or link_path.is_file():
link_path.unlink()
else:
# Don't remove real directories
responses_images = None
if responses_images:
rel_target = os.path.relpath(str(responses_images), str(output_dir))
link_path.symlink_to(rel_target)
except Exception:
pass
return True, 'content.html', ''
except subprocess.TimeoutExpired:
return False, None, f'Timed out after {timeout} seconds'

View File

@@ -1 +0,0 @@
<span class="abx-output-icon abx-output-icon--merkletree" title="Merkle Tree"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="5" r="2"/><circle cx="6" cy="18" r="2"/><circle cx="18" cy="18" r="2"/><path d="M12 7v6"/><path d="M12 13l-4 3"/><path d="M12 13l4 3"/></svg></span>

View File

@@ -237,7 +237,7 @@ async function main() {
const cdpUrl = readCdpUrl(CHROME_SESSION_DIR);
if (!cdpUrl) {
console.error('ERROR: Chrome CDP URL not found (chrome plugin must run first)');
console.error('No Chrome session found (chrome plugin must run first)');
process.exit(1);
}

View File

@@ -81,10 +81,12 @@ def test_fails_gracefully_without_chrome_session():
"""Test that hook fails gracefully when no chrome session exists."""
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
modalcloser_dir = tmpdir / 'snapshot' / 'modalcloser'
modalcloser_dir.mkdir(parents=True, exist_ok=True)
result = subprocess.run(
['node', str(MODALCLOSER_HOOK), f'--url={TEST_URL}', '--snapshot-id=test-no-chrome'],
cwd=tmpdir,
cwd=modalcloser_dir,
capture_output=True,
text=True,
env=get_test_env(),

View File

@@ -91,9 +91,9 @@ class TestNpmProviderHook(TestCase):
self.assertIn('npm provider not allowed', result.stderr)
self.assertEqual(result.returncode, 0)
@pytest.mark.skipif(not npm_available(), reason="npm not installed")
def test_hook_creates_npm_prefix(self):
"""Hook should create npm prefix directory."""
assert npm_available(), "npm not installed"
env = os.environ.copy()
env['LIB_DIR'] = str(self.lib_dir)

View File

@@ -81,7 +81,7 @@ function getCdpUrl() {
}
// Extract outlinks
async function extractOutlinks(url) {
async function extractOutlinks(url, snapshotId, crawlId, depth) {
// Output directory is current directory (hook already runs in output dir)
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
@@ -253,7 +253,7 @@ async function main() {
}
}
const result = await extractOutlinks(url);
const result = await extractOutlinks(url, snapshotId, crawlId, depth);
if (result.success) {
status = 'succeeded';

View File

@@ -47,7 +47,6 @@ class TestParseDomOutlinksPlugin(TestCase):
self.assertTrue(OUTLINKS_HOOK.exists(), f"Hook not found: {OUTLINKS_HOOK}")
@pytest.mark.skipif(not chrome_available(), reason="Chrome not installed")
class TestParseDomOutlinksWithChrome(TestCase):
"""Integration tests for parse_dom_outlinks plugin with Chrome."""
@@ -112,9 +111,7 @@ class TestParseDomOutlinksWithChrome(TestCase):
# example.com has at least one link (to iana.org)
self.assertIsInstance(outlinks_data['hrefs'], list)
except RuntimeError as e:
if 'Chrome' in str(e) or 'CDP' in str(e):
self.skipTest(f"Chrome session setup failed: {e}")
except RuntimeError:
raise

View File

@@ -2,19 +2,12 @@
/**
* Print a URL to PDF using Chrome/Puppeteer.
*
* If a Chrome session exists (from chrome plugin), connects to it via CDP.
* Otherwise launches a new Chrome instance.
* Requires a Chrome session (from chrome plugin) and connects to it via CDP.
*
* Usage: on_Snapshot__52_pdf.js --url=<url> --snapshot-id=<uuid>
* Output: Writes pdf/output.pdf
*
* Environment variables:
* CHROME_BINARY: Path to Chrome/Chromium binary
* CHROME_TIMEOUT: Timeout in seconds (default: 60)
* CHROME_RESOLUTION: Page resolution (default: 1440,2000)
* CHROME_USER_AGENT: User agent string (optional)
* CHROME_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: true)
* CHROME_HEADLESS: Run in headless mode (default: true)
* PDF_ENABLED: Enable PDF generation (default: true)
*/
@@ -24,11 +17,7 @@ const path = require('path');
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const {
findChromium,
getEnv,
getEnvBool,
getEnvInt,
parseResolution,
parseArgs,
readCdpUrl,
} = require('../chrome/chrome_utils.js');
@@ -86,81 +75,30 @@ async function waitForChromeTabLoaded(timeoutMs = 60000) {
}
async function printToPdf(url) {
const timeout = (getEnvInt('CHROME_TIMEOUT') || getEnvInt('TIMEOUT', 60)) * 1000;
const resolution = getEnv('CHROME_RESOLUTION') || getEnv('RESOLUTION', '1440,2000');
const userAgent = getEnv('CHROME_USER_AGENT') || getEnv('USER_AGENT', '');
const checkSsl = getEnvBool('CHROME_CHECK_SSL_VALIDITY', getEnvBool('CHECK_SSL_VALIDITY', true));
const headless = getEnvBool('CHROME_HEADLESS', true);
const { width, height } = parseResolution(resolution);
// Output directory is current directory (hook already runs in output dir)
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
let browser = null;
let page = null;
let connectedToSession = false;
try {
// Try to connect to existing Chrome session
// Connect to existing Chrome session (required)
const cdpUrl = readCdpUrl(CHROME_SESSION_DIR);
if (cdpUrl) {
try {
browser = await puppeteer.connect({
browserWSEndpoint: cdpUrl,
defaultViewport: { width, height },
});
connectedToSession = true;
// Get existing pages or create new one
const pages = await browser.pages();
page = pages.find(p => p.url().startsWith('http')) || pages[0];
if (!page) {
page = await browser.newPage();
}
// Set viewport on the page
await page.setViewport({ width, height });
} catch (e) {
console.error(`Failed to connect to CDP session: ${e.message}`);
browser = null;
}
if (!cdpUrl) {
return { success: false, error: 'No Chrome session found (chrome plugin must run first)' };
}
// Fall back to launching new browser
if (!browser) {
const executablePath = findChromium();
if (!executablePath) {
return { success: false, error: 'Chrome binary not found' };
}
browser = await puppeteer.connect({
browserWSEndpoint: cdpUrl,
defaultViewport: null,
});
browser = await puppeteer.launch({
executablePath,
headless: headless ? 'new' : false,
args: [
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-dev-shm-usage',
'--disable-gpu',
`--window-size=${width},${height}`,
...(checkSsl ? [] : ['--ignore-certificate-errors']),
],
defaultViewport: { width, height },
});
// Get existing pages or create new one
const pages = await browser.pages();
page = pages.find(p => p.url().startsWith('http')) || pages[0];
if (!page) {
page = await browser.newPage();
// Navigate to URL (only if we launched fresh browser)
if (userAgent) {
await page.setUserAgent(userAgent);
}
await page.goto(url, {
waitUntil: 'networkidle2',
timeout,
});
}
// Print to PDF
@@ -185,9 +123,8 @@ async function printToPdf(url) {
} catch (e) {
return { success: false, error: `${e.name}: ${e.message}` };
} finally {
// Only close browser if we launched it (not if we connected to session)
if (browser && !connectedToSession) {
await browser.close();
if (browser) {
browser.disconnect();
}
}
}
@@ -215,14 +152,15 @@ async function main() {
process.exit(0);
}
// Only wait for page load if using shared Chrome session
const cdpUrl = readCdpUrl(CHROME_SESSION_DIR);
if (cdpUrl) {
// Wait for page to be fully loaded
const pageLoaded = await waitForChromeTabLoaded(60000);
if (!pageLoaded) {
throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
}
if (!cdpUrl) {
throw new Error('No Chrome session found (chrome plugin must run first)');
}
// Wait for page to be fully loaded
const pageLoaded = await waitForChromeTabLoaded(60000);
if (!pageLoaded) {
throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
}
const result = await printToPdf(url);

View File

@@ -29,6 +29,7 @@ from archivebox.plugins.chrome.tests.chrome_test_helpers import (
LIB_DIR,
NODE_MODULES_DIR,
PLUGINS_ROOT,
chrome_session,
)
@@ -62,15 +63,19 @@ def test_extracts_pdf_from_example_com():
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
# Run PDF extraction hook
result = subprocess.run(
['node', str(PDF_HOOK), f'--url={TEST_URL}', '--snapshot-id=test789'],
cwd=tmpdir,
capture_output=True,
text=True,
timeout=120
,
env=get_test_env())
with chrome_session(tmpdir, test_url=TEST_URL) as (_process, _pid, snapshot_chrome_dir, env):
pdf_dir = snapshot_chrome_dir.parent / 'pdf'
pdf_dir.mkdir(exist_ok=True)
# Run PDF extraction hook
result = subprocess.run(
['node', str(PDF_HOOK), f'--url={TEST_URL}', '--snapshot-id=test789'],
cwd=pdf_dir,
capture_output=True,
text=True,
timeout=120,
env=env
)
# Parse clean JSONL output (hook might fail due to network issues)
result_json = None
@@ -98,7 +103,7 @@ def test_extracts_pdf_from_example_com():
assert result.returncode == 0, f"Should exit 0 on success: {result.stderr}"
# Verify filesystem output (hook writes to current directory)
pdf_file = tmpdir / 'output.pdf'
pdf_file = pdf_dir / 'output.pdf'
assert pdf_file.exists(), "output.pdf not created"
# Verify file is valid PDF
@@ -117,7 +122,7 @@ def test_config_save_pdf_false_skips():
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
env = os.environ.copy()
env = get_test_env()
env['PDF_ENABLED'] = 'False'
result = subprocess.run(
@@ -140,50 +145,46 @@ def test_config_save_pdf_false_skips():
def test_reports_missing_chrome():
"""Test that script reports error when Chrome is not found."""
"""Test that script reports error when Chrome session is missing."""
import os
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
# Set CHROME_BINARY to nonexistent path
env = os.environ.copy()
env['CHROME_BINARY'] = '/nonexistent/chrome'
env = get_test_env()
pdf_dir = tmpdir / 'snapshot' / 'pdf'
pdf_dir.mkdir(parents=True, exist_ok=True)
result = subprocess.run(
['node', str(PDF_HOOK), f'--url={TEST_URL}', '--snapshot-id=test123'],
cwd=tmpdir,
cwd=pdf_dir,
capture_output=True,
text=True,
env=env,
timeout=30
)
# Should fail and report missing Chrome
if result.returncode != 0:
combined = result.stdout + result.stderr
assert 'chrome' in combined.lower() or 'browser' in combined.lower() or 'ERROR=' in combined
assert result.returncode != 0, "Should fail without shared Chrome session"
combined = result.stdout + result.stderr
assert 'chrome session' in combined.lower() or 'chrome plugin' in combined.lower()
def test_config_timeout_honored():
"""Test that CHROME_TIMEOUT config is respected."""
import os
def test_runs_with_shared_chrome_session():
"""Test that PDF hook completes when shared Chrome session is available."""
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
# Set very short timeout
env = os.environ.copy()
env['CHROME_TIMEOUT'] = '5'
with chrome_session(tmpdir, test_url=TEST_URL) as (_process, _pid, snapshot_chrome_dir, env):
pdf_dir = snapshot_chrome_dir.parent / 'pdf'
pdf_dir.mkdir(exist_ok=True)
result = subprocess.run(
['node', str(PDF_HOOK), f'--url={TEST_URL}', '--snapshot-id=testtimeout'],
cwd=tmpdir,
capture_output=True,
text=True,
env=env,
timeout=30
)
result = subprocess.run(
['node', str(PDF_HOOK), f'--url={TEST_URL}', '--snapshot-id=testtimeout'],
cwd=pdf_dir,
capture_output=True,
text=True,
env=env,
timeout=30
)
# Should complete (success or fail, but not hang)
assert result.returncode in (0, 1), "Should complete without hanging"

View File

@@ -142,13 +142,14 @@ class TestPipProviderIntegration(TestCase):
import shutil
shutil.rmtree(self.temp_dir, ignore_errors=True)
@pytest.mark.skipif(
subprocess.run([sys.executable, '-m', 'pip', '--version'],
capture_output=True).returncode != 0,
reason="pip not available"
)
def test_hook_finds_pip_installed_binary(self):
"""Hook should find binaries installed via pip."""
pip_check = subprocess.run(
[sys.executable, '-m', 'pip', '--version'],
capture_output=True,
text=True,
)
assert pip_check.returncode == 0, "pip not available"
env = os.environ.copy()
env['DATA_DIR'] = self.temp_dir

View File

@@ -46,8 +46,8 @@ def test_crawl_hook_emits_puppeteer_binary():
assert 'npm' in binaries[0].get('binproviders', ''), "puppeteer should be installable via npm provider"
@pytest.mark.skipif(shutil.which('npm') is None, reason='npm is required for puppeteer installation')
def test_puppeteer_installs_chromium():
assert shutil.which('npm'), "npm is required for puppeteer installation"
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
lib_dir = tmpdir / 'lib' / 'arm64-darwin'

View File

@@ -22,6 +22,7 @@ import subprocess
import sys
import tempfile
from pathlib import Path
from urllib.parse import urlparse
import rich_click as click
@@ -135,6 +136,24 @@ def extract_readability(url: str, binary: str) -> tuple[bool, str | None, str]:
(output_dir / 'content.txt').write_text(text_content, encoding='utf-8')
(output_dir / 'article.json').write_text(json.dumps(result_json, indent=2), encoding='utf-8')
# Link images/ to responses capture (if available)
try:
hostname = urlparse(url).hostname or ''
if hostname:
responses_images = (output_dir / '..' / 'responses' / 'image' / hostname / 'images').resolve()
link_path = output_dir / 'images'
if responses_images.exists() and responses_images.is_dir():
if link_path.exists() or link_path.is_symlink():
if link_path.is_symlink() or link_path.is_file():
link_path.unlink()
else:
responses_images = None
if responses_images:
rel_target = os.path.relpath(str(responses_images), str(output_dir))
link_path.symlink_to(rel_target)
except Exception:
pass
return True, OUTPUT_FILE, ''
except subprocess.TimeoutExpired:

View File

@@ -38,6 +38,7 @@ let originalUrl = '';
let finalUrl = '';
let page = null;
let browser = null;
let initialRecorded = false;
async function setupRedirectListener() {
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
@@ -62,6 +63,20 @@ async function setupRedirectListener() {
client.on('Network.requestWillBeSent', (params) => {
const { requestId, request, redirectResponse } = params;
if (!initialRecorded && request.url && request.url.startsWith('http')) {
const initialEntry = {
timestamp: new Date().toISOString(),
from_url: null,
to_url: request.url,
status: null,
type: 'initial',
request_id: requestId,
};
redirectChain.push(initialEntry);
fs.appendFileSync(outputPath, JSON.stringify(initialEntry) + '\n');
initialRecorded = true;
}
if (redirectResponse) {
// This is a redirect
const redirectEntry = {

View File

@@ -48,7 +48,6 @@ class TestRedirectsPlugin(TestCase):
self.assertTrue(REDIRECTS_HOOK.exists(), f"Hook not found: {REDIRECTS_HOOK}")
@pytest.mark.skipif(not chrome_available(), reason="Chrome not installed")
class TestRedirectsWithChrome(TestCase):
"""Integration tests for redirects plugin with Chrome."""
@@ -142,9 +141,7 @@ class TestRedirectsWithChrome(TestCase):
self.assertNotIn('Traceback', stderr)
self.assertNotIn('Error:', stderr)
except RuntimeError as e:
if 'Chrome' in str(e) or 'CDP' in str(e):
self.skipTest(f"Chrome session setup failed: {e}")
except RuntimeError:
raise

View File

@@ -39,7 +39,7 @@ let responseCount = 0;
let shuttingDown = false;
// Resource types to capture (by default, capture everything)
const DEFAULT_TYPES = ['script', 'stylesheet', 'font', 'image', 'media', 'xhr', 'websocket'];
const DEFAULT_TYPES = ['document', 'script', 'stylesheet', 'font', 'image', 'media', 'xhr', 'websocket'];
function getExtensionFromMimeType(mimeType) {
const mimeMap = {
@@ -176,11 +176,17 @@ async function setupListener() {
const hostname = urlObj.hostname;
const pathname = urlObj.pathname || '/';
const filename = path.basename(pathname) || 'index' + (extension ? '.' + extension : '');
const dirPath = path.dirname(pathname);
const dirPathRaw = path.dirname(pathname);
const dirPath = dirPathRaw === '.' ? '' : dirPathRaw.replace(/^\/+/, '');
const symlinkDir = path.join(OUTPUT_DIR, resourceType, hostname, dirPath);
const symlinkPath = path.join(symlinkDir, filename);
await createSymlink(uniquePath, symlinkPath);
// Also create a site-style symlink without resource type for easy browsing
const siteDir = path.join(OUTPUT_DIR, hostname, dirPath);
const sitePath = path.join(siteDir, filename);
await createSymlink(uniquePath, sitePath);
} catch (e) {
// URL parsing or symlink creation failed, skip
}

View File

@@ -13,27 +13,18 @@ import tempfile
import time
from pathlib import Path
import pytest
from django.test import TestCase
# Import chrome test helpers
sys.path.insert(0, str(Path(__file__).parent.parent.parent / 'chrome' / 'tests'))
from chrome_test_helpers import (
chrome_session,
get_test_env,
CHROME_NAVIGATE_HOOK,
get_plugin_dir,
get_hook_script,
)
def chrome_available() -> bool:
"""Check if Chrome/Chromium is available."""
for name in ['chromium', 'chromium-browser', 'google-chrome', 'chrome']:
if shutil.which(name):
return True
return False
# Get the path to the responses hook
PLUGIN_DIR = get_plugin_dir(__file__)
RESPONSES_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_responses.*')
@@ -48,7 +39,6 @@ class TestResponsesPlugin(TestCase):
self.assertTrue(RESPONSES_HOOK.exists(), f"Hook not found: {RESPONSES_HOOK}")
@pytest.mark.skipif(not chrome_available(), reason="Chrome not installed")
class TestResponsesWithChrome(TestCase):
"""Integration tests for responses plugin with Chrome."""
@@ -65,68 +55,72 @@ class TestResponsesWithChrome(TestCase):
test_url = 'https://example.com'
snapshot_id = 'test-responses-snapshot'
try:
with chrome_session(
self.temp_dir,
crawl_id='test-responses-crawl',
snapshot_id=snapshot_id,
test_url=test_url,
navigate=True,
timeout=30,
) as (chrome_process, chrome_pid, snapshot_chrome_dir, env):
# Use the environment from chrome_session (already has CHROME_HEADLESS=true)
with chrome_session(
self.temp_dir,
crawl_id='test-responses-crawl',
snapshot_id=snapshot_id,
test_url=test_url,
navigate=False,
timeout=30,
) as (chrome_process, chrome_pid, snapshot_chrome_dir, env):
responses_dir = snapshot_chrome_dir.parent / 'responses'
responses_dir.mkdir(exist_ok=True)
# Run responses hook with the active Chrome session (background hook)
result = subprocess.Popen(
['node', str(RESPONSES_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
cwd=str(responses_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
env=env
)
# Run responses hook with the active Chrome session (background hook)
result = subprocess.Popen(
['node', str(RESPONSES_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
cwd=str(snapshot_chrome_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
env=env
)
nav_result = subprocess.run(
['node', str(CHROME_NAVIGATE_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
cwd=str(snapshot_chrome_dir),
capture_output=True,
text=True,
timeout=120,
env=env
)
self.assertEqual(nav_result.returncode, 0, f"Navigation failed: {nav_result.stderr}")
# Check for output directory and index file
index_output = snapshot_chrome_dir / 'index.jsonl'
# Check for output directory and index file
index_output = responses_dir / 'index.jsonl'
# Wait briefly for background hook to write output
for _ in range(10):
if index_output.exists() and index_output.stat().st_size > 0:
break
time.sleep(1)
# Wait briefly for background hook to write output
for _ in range(30):
if index_output.exists() and index_output.stat().st_size > 0:
break
time.sleep(1)
# Verify hook ran (may keep running waiting for cleanup signal)
if result.poll() is None:
result.terminate()
try:
stdout, stderr = result.communicate(timeout=5)
except subprocess.TimeoutExpired:
result.kill()
stdout, stderr = result.communicate()
else:
# Verify hook ran (may keep running waiting for cleanup signal)
if result.poll() is None:
result.terminate()
try:
stdout, stderr = result.communicate(timeout=5)
except subprocess.TimeoutExpired:
result.kill()
stdout, stderr = result.communicate()
self.assertNotIn('Traceback', stderr)
else:
stdout, stderr = result.communicate()
self.assertNotIn('Traceback', stderr)
# If index file exists, verify it's valid JSONL
if index_output.exists():
with open(index_output) as f:
content = f.read().strip()
if content:
for line in content.split('\n'):
if line.strip():
try:
record = json.loads(line)
# Verify structure
self.assertIn('url', record)
self.assertIn('resourceType', record)
except json.JSONDecodeError:
pass # Some lines may be incomplete
except RuntimeError as e:
if 'Chrome' in str(e) or 'CDP' in str(e):
self.skipTest(f"Chrome session setup failed: {e}")
raise
# If index file exists, verify it's valid JSONL
if index_output.exists():
with open(index_output) as f:
content = f.read().strip()
self.assertTrue(content, "Responses output should not be empty")
for line in content.split('\n'):
if line.strip():
try:
record = json.loads(line)
# Verify structure
self.assertIn('url', record)
self.assertIn('resourceType', record)
except json.JSONDecodeError:
pass # Some lines may be incomplete
if __name__ == '__main__':

View File

@@ -9,7 +9,6 @@
* Output: Writes screenshot/screenshot.png
*
* Environment variables:
* CHROME_RESOLUTION: Screenshot resolution (default: 1440,2000)
* SCREENSHOT_ENABLED: Enable screenshot capture (default: true)
*/
@@ -34,9 +33,10 @@ function flushCoverageAndExit(exitCode) {
const {
getEnv,
getEnvBool,
parseResolution,
parseArgs,
readCdpUrl,
connectToPage,
waitForPageLoaded,
readTargetId,
} = require('../chrome/chrome_utils.js');
// Check if screenshot is enabled BEFORE requiring puppeteer
@@ -75,77 +75,58 @@ function hasStaticFileOutput() {
return false;
}
// Wait for chrome tab to be fully loaded
async function waitForChromeTabLoaded(timeoutMs = 10000) {
const navigationFile = path.join(CHROME_SESSION_DIR, 'navigation.json');
const startTime = Date.now();
while (Date.now() - startTime < timeoutMs) {
if (fs.existsSync(navigationFile)) {
return true;
}
// Wait 100ms before checking again
await new Promise(resolve => setTimeout(resolve, 100));
}
return false;
}
async function takeScreenshot(url) {
const resolution = getEnv('CHROME_RESOLUTION', '1440,2000');
const { width, height } = parseResolution(resolution);
// Output directory is current directory (hook already runs in output dir)
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
// Wait for chrome_navigate to complete (writes navigation.json)
const timeoutSeconds = parseInt(getEnv('SCREENSHOT_TIMEOUT', '10'), 10);
const timeoutMs = timeoutSeconds * 1000;
const pageLoaded = await waitForChromeTabLoaded(timeoutMs);
if (!pageLoaded) {
throw new Error(`Page not loaded after ${timeoutSeconds}s (chrome_navigate must complete first)`);
const navigationFile = path.join(CHROME_SESSION_DIR, 'navigation.json');
if (!fs.existsSync(navigationFile)) {
await waitForPageLoaded(CHROME_SESSION_DIR, timeoutMs);
}
// Connect to existing Chrome session (required - no fallback)
const cdpUrl = readCdpUrl(CHROME_SESSION_DIR);
if (!cdpUrl) {
const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
const targetFile = path.join(CHROME_SESSION_DIR, 'target_id.txt');
if (!fs.existsSync(cdpFile)) {
throw new Error('No Chrome session found (chrome plugin must run first)');
}
// Read target_id.txt to get the specific tab for this snapshot
const targetIdFile = path.join(CHROME_SESSION_DIR, 'target_id.txt');
if (!fs.existsSync(targetIdFile)) {
if (!fs.existsSync(targetFile)) {
throw new Error('No target_id.txt found (chrome_tab must run first)');
}
const targetId = fs.readFileSync(targetIdFile, 'utf8').trim();
const cdpUrl = fs.readFileSync(cdpFile, 'utf8').trim();
if (!cdpUrl.startsWith('ws://') && !cdpUrl.startsWith('wss://')) {
throw new Error('Invalid CDP URL in cdp_url.txt');
}
const browser = await puppeteer.connect({
browserWSEndpoint: cdpUrl,
defaultViewport: { width, height },
const { browser, page } = await connectToPage({
chromeSessionDir: CHROME_SESSION_DIR,
timeoutMs,
puppeteer,
});
try {
// Get the specific page for this snapshot by target ID
const targets = await browser.targets();
const target = targets.find(t => t._targetId === targetId);
if (!target) {
throw new Error(`Target ${targetId} not found in Chrome session`);
const expectedTargetId = readTargetId(CHROME_SESSION_DIR);
if (!expectedTargetId) {
throw new Error('No target_id.txt found (chrome_tab must run first)');
}
const actualTargetId = page.target()._targetId;
if (actualTargetId !== expectedTargetId) {
throw new Error(`Target ${expectedTargetId} not found in Chrome session`);
}
const page = await target.page();
if (!page) {
throw new Error(`Could not get page for target ${targetId}`);
}
// Set viewport on the page
await page.setViewport({ width, height });
// Take screenshot (Puppeteer throws on failure)
await page.screenshot({
path: outputPath,
fullPage: true,
const captureTimeoutMs = Math.max(timeoutMs, 10000);
const timeoutPromise = new Promise((_, reject) => {
setTimeout(() => reject(new Error('Screenshot capture timed out')), captureTimeoutMs);
});
await page.bringToFront();
await Promise.race([
page.screenshot({ path: outputPath, fullPage: true }),
timeoutPromise,
]);
return outputPath;
} finally {
@@ -188,6 +169,7 @@ async function main() {
status: 'succeeded',
output_str: outputPath,
}));
flushCoverageAndExit(0);
}
main().catch(e => {

View File

@@ -2,7 +2,7 @@
<img src="{{ output_path }}"
alt="Screenshot of page"
class="extractor-thumbnail screenshot-thumbnail"
style="width: 100%; height: 100px; object-fit: cover; object-position: top center; background: #333;"
style="width: 100%; height: 100px; object-fit: cover; object-position: top center; background: #333; transform: scale(1.05); transform-origin: top center;"
loading="lazy"
onerror="this.style.display='none'; this.nextElementSibling.style.display='block';">
<div style="display: none; text-align: center; padding: 20px; color: #999;">📷 Screenshot</div>

View File

@@ -1,8 +1,7 @@
<!-- Screenshot fullscreen - zoomable image -->
<div style="width: 100%; height: 100vh; overflow: auto; background: #222; display: flex; align-items: start; justify-content: center;">
<!-- Screenshot fullscreen - full-width image with vertical scroll -->
<div style="width: 100%; min-height: 100vh; overflow: auto; background: #222; padding: 0; box-sizing: border-box; display: flex; justify-content: center; align-items: flex-start;">
<img src="{{ output_path }}"
alt="Screenshot of page"
class="extractor-fullscreen screenshot-fullscreen"
style="max-width: 100%; cursor: zoom-in;"
onclick="this.style.maxWidth = this.style.maxWidth === 'none' ? '100%' : 'none'; this.style.cursor = this.style.maxWidth === 'none' ? 'zoom-out' : 'zoom-in';">
style="width: auto; max-width: 100%; height: auto; display: block;">
</div>

View File

@@ -112,27 +112,7 @@ def test_screenshot_with_chrome_session():
assert screenshot_file.exists() and screenshot_file.stat().st_size > 1000
assert screenshot_file.read_bytes()[:8] == b'\x89PNG\r\n\x1a\n'
# Scenario 2: Custom resolution
screenshot_dir2 = snapshot_chrome_dir.parent / 'screenshot2'
screenshot_dir2.mkdir()
env['CHROME_RESOLUTION'] = '800,600'
result = subprocess.run(
['node', str(SCREENSHOT_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
cwd=str(screenshot_dir2),
capture_output=True,
text=True,
timeout=30,
env=env
)
assert result.returncode == 0
screenshot_file2 = screenshot_dir2 / 'screenshot.png'
assert screenshot_file2.exists()
file_size = screenshot_file2.stat().st_size
assert 500 < file_size < 100000, f"800x600 screenshot size unexpected: {file_size}"
# Scenario 3: Wrong target ID (error case)
# Scenario 2: Wrong target ID (error case)
screenshot_dir3 = snapshot_chrome_dir.parent / 'screenshot3'
screenshot_dir3.mkdir()
(snapshot_chrome_dir / 'target_id.txt').write_text('nonexistent-target-id')
@@ -149,9 +129,7 @@ def test_screenshot_with_chrome_session():
assert result.returncode != 0
assert 'target' in result.stderr.lower() and 'not found' in result.stderr.lower()
except RuntimeError as e:
if 'Chrome' in str(e) or 'CDP' in str(e):
pytest.skip(f"Chrome session setup failed: {e}")
except RuntimeError:
raise
@@ -362,30 +340,6 @@ def test_missing_snapshot_id_argument():
assert 'Usage:' in result.stderr or 'snapshot' in result.stderr.lower()
def test_invalid_resolution_format():
"""Test that invalid CHROME_RESOLUTION format is handled gracefully."""
with tempfile.TemporaryDirectory() as tmpdir:
data_dir = Path(tmpdir)
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-badres'
screenshot_dir = snapshot_dir / 'screenshot'
screenshot_dir.mkdir(parents=True)
env = get_test_env()
# Invalid resolution formats to test parseResolution error handling
for bad_resolution in ['invalid', '1440', '1440x2000', 'abc,def']:
env['CHROME_RESOLUTION'] = bad_resolution
result = subprocess.run(
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-badres'],
cwd=str(screenshot_dir),
capture_output=True,
text=True,
timeout=120,
env=env
)
# Should either fail gracefully or fall back to default
# (depending on implementation - script should not crash with uncaught error)
assert result.returncode in (0, 1), f"Script should handle bad resolution: {bad_resolution}"
def test_no_cdp_url_fails():
"""Test error when chrome dir exists but no cdp_url.txt."""
with tempfile.TemporaryDirectory() as tmpdir:

View File

@@ -18,6 +18,8 @@ import shutil
from pathlib import Path
from typing import List, Iterable
from django.conf import settings
def get_env(name: str, default: str = '') -> str:
return os.environ.get(name, default).strip()
@@ -51,6 +53,12 @@ def _get_archive_dir() -> Path:
data_dir = os.environ.get('DATA_DIR', '').strip()
if data_dir:
return Path(data_dir) / 'archive'
settings_archive_dir = getattr(settings, 'ARCHIVE_DIR', None)
if settings_archive_dir:
return Path(settings_archive_dir)
settings_data_dir = getattr(settings, 'DATA_DIR', None)
if settings_data_dir:
return Path(settings_data_dir) / 'archive'
return Path.cwd() / 'archive'

View File

@@ -25,9 +25,7 @@ def test_ripgrep_hook_detects_binary_from_path():
"""Test that ripgrep hook finds binary using abx-pkg when env var is just a name."""
hook_path = Path(__file__).parent.parent / 'on_Crawl__50_ripgrep_install.py'
# Skip if rg is not installed
if not shutil.which('rg'):
pass
assert shutil.which('rg'), "ripgrep not installed"
# Set SEARCH_BACKEND_ENGINE to enable the hook
env = os.environ.copy()
@@ -78,8 +76,7 @@ def test_ripgrep_hook_handles_absolute_path():
hook_path = Path(__file__).parent.parent / 'on_Crawl__50_ripgrep_install.py'
rg_path = shutil.which('rg')
if not rg_path:
pytest.skip("ripgrep not installed")
assert rg_path, "ripgrep not installed"
env = os.environ.copy()
env['SEARCH_BACKEND_ENGINE'] = 'ripgrep'
@@ -214,8 +211,7 @@ def test_ripgrep_only_detected_when_backend_enabled():
import sys
from pathlib import Path
if not shutil.which('rg'):
pytest.skip("ripgrep not installed")
assert shutil.which('rg'), "ripgrep not installed"
hook_path = Path(__file__).parent.parent / 'on_Crawl__50_ripgrep_install.py'

View File

@@ -151,7 +151,6 @@ class TestRipgrepSearch(TestCase):
results = search('test')
self.assertEqual(results, [])
@pytest.mark.skipif(not shutil.which('rg'), reason="ripgrep not installed")
def test_search_single_match(self):
"""search should find matching snapshot."""
results = search('Python programming')
@@ -160,7 +159,6 @@ class TestRipgrepSearch(TestCase):
self.assertNotIn('snap-002', results)
self.assertNotIn('snap-003', results)
@pytest.mark.skipif(not shutil.which('rg'), reason="ripgrep not installed")
def test_search_multiple_matches(self):
"""search should find all matching snapshots."""
# 'guide' appears in snap-002 (JavaScript guide) and snap-003 (Archiving Guide)
@@ -170,7 +168,6 @@ class TestRipgrepSearch(TestCase):
self.assertIn('snap-003', results)
self.assertNotIn('snap-001', results)
@pytest.mark.skipif(not shutil.which('rg'), reason="ripgrep not installed")
def test_search_case_insensitive_by_default(self):
"""search should be case-sensitive (ripgrep default)."""
# By default rg is case-sensitive
@@ -181,13 +178,11 @@ class TestRipgrepSearch(TestCase):
self.assertIsInstance(results_upper, list)
self.assertIsInstance(results_lower, list)
@pytest.mark.skipif(not shutil.which('rg'), reason="ripgrep not installed")
def test_search_no_results(self):
"""search should return empty list for no matches."""
results = search('xyznonexistent123')
self.assertEqual(results, [])
@pytest.mark.skipif(not shutil.which('rg'), reason="ripgrep not installed")
def test_search_regex(self):
"""search should support regex patterns."""
results = search('(Python|JavaScript)')
@@ -195,7 +190,6 @@ class TestRipgrepSearch(TestCase):
self.assertIn('snap-001', results)
self.assertIn('snap-002', results)
@pytest.mark.skipif(not shutil.which('rg'), reason="ripgrep not installed")
def test_search_distinct_snapshots(self):
"""search should return distinct snapshot IDs."""
# Query matches both files in snap-001
@@ -212,7 +206,6 @@ class TestRipgrepSearch(TestCase):
search('test')
self.assertIn('ripgrep binary not found', str(context.exception))
@pytest.mark.skipif(not shutil.which('rg'), reason="ripgrep not installed")
def test_search_with_custom_args(self):
"""search should use custom RIPGREP_ARGS."""
with patch.dict(os.environ, {'RIPGREP_ARGS': '["-i"]'}): # Case insensitive
@@ -220,7 +213,6 @@ class TestRipgrepSearch(TestCase):
# With -i flag, should find regardless of case
self.assertIn('snap-001', results)
@pytest.mark.skipif(not shutil.which('rg'), reason="ripgrep not installed")
def test_search_timeout(self):
"""search should handle timeout gracefully."""
with patch.dict(os.environ, {'RIPGREP_TIMEOUT': '1'}):
@@ -285,19 +277,16 @@ class TestRipgrepSearchIntegration(TestCase):
else:
file_path.write_text(content)
@pytest.mark.skipif(not shutil.which('rg'), reason="ripgrep not installed")
def test_search_archivebox(self):
"""Search for archivebox should find documentation snapshot."""
results = search('archivebox')
self.assertIn('1704067200.123456', results)
@pytest.mark.skipif(not shutil.which('rg'), reason="ripgrep not installed")
def test_search_python(self):
"""Search for python should find Python news snapshot."""
results = search('Python')
self.assertIn('1704153600.654321', results)
@pytest.mark.skipif(not shutil.which('rg'), reason="ripgrep not installed")
def test_search_pip_install(self):
"""Search for installation command."""
results = search('pip install')

View File

@@ -21,86 +21,37 @@ const path = require('path');
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const puppeteer = require('puppeteer-core');
// Import shared utilities from chrome_utils.js
const {
getEnvBool,
getEnvInt,
parseArgs,
connectToPage,
waitForPageLoaded,
} = require('../chrome/chrome_utils.js');
// Extractor metadata
const PLUGIN_NAME = 'seo';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'seo.json';
const CHROME_SESSION_DIR = '../chrome';
// Parse command line arguments
function parseArgs() {
const args = {};
process.argv.slice(2).forEach(arg => {
if (arg.startsWith('--')) {
const [key, ...valueParts] = arg.slice(2).split('=');
args[key.replace(/-/g, '_')] = valueParts.join('=') || true;
}
});
return args;
}
// Get environment variable with default
function getEnv(name, defaultValue = '') {
return (process.env[name] || defaultValue).trim();
}
function getEnvBool(name, defaultValue = false) {
const val = getEnv(name, '').toLowerCase();
if (['true', '1', 'yes', 'on'].includes(val)) return true;
if (['false', '0', 'no', 'off'].includes(val)) return false;
return defaultValue;
}
// Wait for chrome tab to be fully loaded
async function waitForChromeTabLoaded(timeoutMs = 60000) {
const navigationFile = path.join(CHROME_SESSION_DIR, 'navigation.json');
const startTime = Date.now();
while (Date.now() - startTime < timeoutMs) {
if (fs.existsSync(navigationFile)) {
return true;
}
// Wait 100ms before checking again
await new Promise(resolve => setTimeout(resolve, 100));
}
return false;
}
// Get CDP URL from chrome plugin
function getCdpUrl() {
const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
if (fs.existsSync(cdpFile)) {
return fs.readFileSync(cdpFile, 'utf8').trim();
}
return null;
}
// Extract SEO metadata
async function extractSeo(url) {
// Output directory is current directory (hook already runs in output dir)
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
const timeout = getEnvInt('SEO_TIMEOUT', getEnvInt('TIMEOUT', 30)) * 1000;
let browser = null;
try {
// Connect to existing Chrome session
const cdpUrl = getCdpUrl();
if (!cdpUrl) {
return { success: false, error: 'No Chrome session found (chrome plugin must run first)' };
}
browser = await puppeteer.connect({
browserWSEndpoint: cdpUrl,
// Connect to existing Chrome session and get target page
const connection = await connectToPage({
chromeSessionDir: CHROME_SESSION_DIR,
timeoutMs: timeout,
puppeteer,
});
// Get the page
const pages = await browser.pages();
const page = pages.find(p => p.url().startsWith('http')) || pages[0];
if (!page) {
return { success: false, error: 'No page found in Chrome session' };
}
browser = connection.browser;
const page = connection.page;
// Extract all meta tags
const seoData = await page.evaluate(() => {
@@ -179,15 +130,8 @@ async function main() {
process.exit(0);
}
// Check if Chrome session exists, then wait for page load
const cdpUrl = getCdpUrl();
if (cdpUrl) {
// Wait for page to be fully loaded
const pageLoaded = await waitForChromeTabLoaded(60000);
if (!pageLoaded) {
throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
}
}
const timeout = getEnvInt('SEO_TIMEOUT', getEnvInt('TIMEOUT', 30)) * 1000;
await waitForPageLoaded(CHROME_SESSION_DIR, timeout * 4, 200);
const result = await extractSeo(url);

View File

@@ -6,33 +6,24 @@ meta tag extraction.
"""
import json
import shutil
import subprocess
import sys
import tempfile
import shutil
from pathlib import Path
import pytest
from django.test import TestCase
# Import chrome test helpers
sys.path.insert(0, str(Path(__file__).parent.parent.parent / 'chrome' / 'tests'))
from chrome_test_helpers import (
chrome_session,
get_test_env,
CHROME_NAVIGATE_HOOK,
get_plugin_dir,
get_hook_script,
)
def chrome_available() -> bool:
"""Check if Chrome/Chromium is available."""
for name in ['chromium', 'chromium-browser', 'google-chrome', 'chrome']:
if shutil.which(name):
return True
return False
# Get the path to the SEO hook
PLUGIN_DIR = get_plugin_dir(__file__)
SEO_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_seo.*')
@@ -47,7 +38,6 @@ class TestSEOPlugin(TestCase):
self.assertTrue(SEO_HOOK.exists(), f"Hook not found: {SEO_HOOK}")
@pytest.mark.skipif(not chrome_available(), reason="Chrome not installed")
class TestSEOWithChrome(TestCase):
"""Integration tests for SEO plugin with Chrome."""
@@ -64,71 +54,75 @@ class TestSEOWithChrome(TestCase):
test_url = 'https://example.com'
snapshot_id = 'test-seo-snapshot'
try:
with chrome_session(
self.temp_dir,
crawl_id='test-seo-crawl',
snapshot_id=snapshot_id,
test_url=test_url,
navigate=True,
timeout=30,
) as (chrome_process, chrome_pid, snapshot_chrome_dir, env):
# Use the environment from chrome_session (already has CHROME_HEADLESS=true)
with chrome_session(
self.temp_dir,
crawl_id='test-seo-crawl',
snapshot_id=snapshot_id,
test_url=test_url,
navigate=False,
timeout=30,
) as (chrome_process, chrome_pid, snapshot_chrome_dir, env):
seo_dir = snapshot_chrome_dir.parent / 'seo'
seo_dir.mkdir(exist_ok=True)
nav_result = subprocess.run(
['node', str(CHROME_NAVIGATE_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
cwd=str(snapshot_chrome_dir),
capture_output=True,
text=True,
timeout=120,
env=env
)
self.assertEqual(nav_result.returncode, 0, f"Navigation failed: {nav_result.stderr}")
# Run SEO hook with the active Chrome session
result = subprocess.run(
['node', str(SEO_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
cwd=str(snapshot_chrome_dir),
capture_output=True,
text=True,
timeout=60,
env=env
)
# Run SEO hook with the active Chrome session
result = subprocess.run(
['node', str(SEO_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
cwd=str(seo_dir),
capture_output=True,
text=True,
timeout=60,
env=env
)
# Check for output file
seo_output = snapshot_chrome_dir / 'seo.json'
# Check for output file
seo_output = seo_dir / 'seo.json'
seo_data = None
seo_data = None
# Try parsing from file first
if seo_output.exists():
with open(seo_output) as f:
# Try parsing from file first
if seo_output.exists():
with open(seo_output) as f:
try:
seo_data = json.load(f)
except json.JSONDecodeError:
pass
# Try parsing from stdout if not in file
if not seo_data:
for line in result.stdout.split('\n'):
line = line.strip()
if line.startswith('{'):
try:
seo_data = json.load(f)
record = json.loads(line)
# SEO data typically has title, description, or og: tags
if any(key in record for key in ['title', 'description', 'og:title', 'canonical']):
seo_data = record
break
except json.JSONDecodeError:
pass
continue
# Try parsing from stdout if not in file
if not seo_data:
for line in result.stdout.split('\n'):
line = line.strip()
if line.startswith('{'):
try:
record = json.loads(line)
# SEO data typically has title, description, or og: tags
if any(key in record for key in ['title', 'description', 'og:title', 'canonical']):
seo_data = record
break
except json.JSONDecodeError:
continue
# Verify hook ran successfully
self.assertEqual(result.returncode, 0, f"Hook failed: {result.stderr}")
self.assertNotIn('Traceback', result.stderr)
self.assertNotIn('Error:', result.stderr)
# Verify hook ran successfully
self.assertEqual(result.returncode, 0, f"Hook failed: {result.stderr}")
self.assertNotIn('Traceback', result.stderr)
self.assertNotIn('Error:', result.stderr)
# example.com has a title, so we MUST get SEO data
self.assertIsNotNone(seo_data, "No SEO data extracted from file or stdout")
# example.com has a title, so we MUST get SEO data
self.assertIsNotNone(seo_data, "No SEO data extracted from file or stdout")
# Verify we got some SEO data
has_seo_data = any(key in seo_data for key in ['title', 'description', 'og:title', 'canonical', 'meta'])
self.assertTrue(has_seo_data, f"No SEO data extracted: {seo_data}")
except RuntimeError as e:
if 'Chrome' in str(e) or 'CDP' in str(e):
self.skipTest(f"Chrome session setup failed: {e}")
raise
# Verify we got some SEO data
has_seo_data = any(key in seo_data for key in ['title', 'description', 'og:title', 'canonical', 'meta'])
self.assertTrue(has_seo_data, f"No SEO data extracted: {seo_data}")
if __name__ == '__main__':

View File

@@ -9,12 +9,12 @@ Environment variables:
SINGLEFILE_ENABLED: Enable SingleFile archiving (default: True)
SINGLEFILE_BINARY: Path to SingleFile binary (default: single-file)
SINGLEFILE_NODE_BINARY: Path to Node.js binary (x-fallback: NODE_BINARY)
SINGLEFILE_CHROME_BINARY: Path to Chrome binary (x-fallback: CHROME_BINARY)
SINGLEFILE_CHROME_BINARY: Path to Chrome binary (x-fallback: CHROME_BINARY) [unused; shared Chrome session required]
SINGLEFILE_TIMEOUT: Timeout in seconds (x-fallback: TIMEOUT)
SINGLEFILE_USER_AGENT: User agent string (x-fallback: USER_AGENT)
SINGLEFILE_COOKIES_FILE: Path to cookies file (x-fallback: COOKIES_FILE)
SINGLEFILE_CHECK_SSL_VALIDITY: Whether to verify SSL certs (x-fallback: CHECK_SSL_VALIDITY)
SINGLEFILE_CHROME_ARGS: Chrome command-line arguments (x-fallback: CHROME_ARGS)
SINGLEFILE_CHROME_ARGS: Chrome command-line arguments (x-fallback: CHROME_ARGS) [unused; shared Chrome session required]
SINGLEFILE_ARGS: Default SingleFile arguments (JSON array)
SINGLEFILE_ARGS_EXTRA: Extra arguments to append (JSON array)
"""
@@ -138,8 +138,7 @@ def save_singlefile(url: str, binary: str) -> tuple[bool, str | None, str]:
"""
Archive URL using SingleFile.
If a Chrome session exists (from chrome plugin), connects to it via CDP.
Otherwise launches a new Chrome instance.
Requires a Chrome session (from chrome plugin) and connects to it via CDP.
Returns: (success, output_path, error_message)
"""
@@ -151,8 +150,7 @@ def save_singlefile(url: str, binary: str) -> tuple[bool, str | None, str]:
cookies_file = get_env('SINGLEFILE_COOKIES_FILE') or get_env('COOKIES_FILE', '')
singlefile_args = get_env_array('SINGLEFILE_ARGS', [])
singlefile_args_extra = get_env_array('SINGLEFILE_ARGS_EXTRA', [])
chrome_args = get_env_array('SINGLEFILE_CHROME_ARGS') or get_env_array('CHROME_ARGS', [])
chrome = get_env('SINGLEFILE_CHROME_BINARY') or get_env('CHROME_BINARY', '')
# Chrome args/binary are intentionally ignored because we require a shared Chrome session
cmd = [binary, *singlefile_args]
@@ -176,14 +174,8 @@ def save_singlefile(url: str, binary: str) -> tuple[bool, str | None, str]:
if cdp_remote_url:
print(f'[singlefile] Using existing Chrome session: {cdp_remote_url}', file=sys.stderr)
cmd.extend(['--browser-server', cdp_remote_url])
elif chrome:
print(f'[singlefile] Launching Chrome binary: {chrome}', file=sys.stderr)
cmd.extend(['--browser-executable-path', chrome])
# Pass Chrome arguments (only when launching a new browser)
if chrome_args and not cdp_remote_url:
# SingleFile expects --browser-args as a JSON array string
cmd.extend(['--browser-args', json.dumps(chrome_args)])
else:
return False, None, 'No Chrome session found (chrome plugin must run first)'
# SSL handling
if not check_ssl:
@@ -267,8 +259,8 @@ def save_singlefile_with_extension(url: str, timeout: int) -> tuple[bool, str |
# Only attempt if chrome session exists
cdp_url = get_cdp_url(wait_seconds=min(5, max(1, timeout // 10)))
if not cdp_url:
print('[singlefile] No chrome session (cdp_url.txt missing)', file=sys.stderr)
return False, None, 'No Chrome session available'
print('[singlefile] No Chrome session found (chrome plugin must run first)', file=sys.stderr)
return False, None, 'No Chrome session found (chrome plugin must run first)'
if not EXTENSION_SAVE_SCRIPT.exists():
print(f'[singlefile] Missing helper script: {EXTENSION_SAVE_SCRIPT}', file=sys.stderr)

View File

@@ -59,27 +59,71 @@ def test_verify_deps_with_abx_pkg():
def test_singlefile_cli_archives_example_com():
"""Test that singlefile CLI archives example.com and produces valid HTML."""
"""Test that singlefile archives example.com and produces valid HTML."""
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
env = get_test_env()
env['SINGLEFILE_ENABLED'] = 'true'
data_dir = tmpdir / 'data'
extensions_dir = data_dir / 'personas' / 'Default' / 'chrome_extensions'
downloads_dir = data_dir / 'personas' / 'Default' / 'chrome_downloads'
user_data_dir = data_dir / 'personas' / 'Default' / 'chrome_user_data'
extensions_dir.mkdir(parents=True, exist_ok=True)
downloads_dir.mkdir(parents=True, exist_ok=True)
user_data_dir.mkdir(parents=True, exist_ok=True)
env_install = os.environ.copy()
env_install.update({
'DATA_DIR': str(data_dir),
'CHROME_EXTENSIONS_DIR': str(extensions_dir),
'CHROME_DOWNLOADS_DIR': str(downloads_dir),
})
# Run singlefile snapshot hook
result = subprocess.run(
[sys.executable, str(SNAPSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=test789'],
cwd=tmpdir,
['node', str(INSTALL_SCRIPT)],
capture_output=True,
text=True,
env=env,
timeout=120
env=env_install,
timeout=120,
)
assert result.returncode == 0, f"Extension install failed: {result.stderr}"
old_env = os.environ.copy()
os.environ['CHROME_USER_DATA_DIR'] = str(user_data_dir)
os.environ['CHROME_DOWNLOADS_DIR'] = str(downloads_dir)
os.environ['CHROME_EXTENSIONS_DIR'] = str(extensions_dir)
try:
with chrome_session(
tmpdir=tmpdir,
crawl_id='singlefile-cli-crawl',
snapshot_id='singlefile-cli-snap',
test_url=TEST_URL,
navigate=True,
timeout=30,
) as (_chrome_proc, _chrome_pid, snapshot_chrome_dir, env):
env['SINGLEFILE_ENABLED'] = 'true'
env['CHROME_EXTENSIONS_DIR'] = str(extensions_dir)
env['CHROME_DOWNLOADS_DIR'] = str(downloads_dir)
singlefile_output_dir = snapshot_chrome_dir.parent / 'singlefile'
singlefile_output_dir.mkdir(parents=True, exist_ok=True)
# Run singlefile snapshot hook
result = subprocess.run(
[sys.executable, str(SNAPSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=test789'],
cwd=singlefile_output_dir,
capture_output=True,
text=True,
env=env,
timeout=120,
)
finally:
os.environ.clear()
os.environ.update(old_env)
assert result.returncode == 0, f"Hook execution failed: {result.stderr}"
# Verify output file exists
output_file = tmpdir / 'singlefile.html'
output_file = singlefile_output_dir / 'singlefile.html'
assert output_file.exists(), f"singlefile.html not created. stdout: {result.stdout}, stderr: {result.stderr}"
# Verify it contains real HTML

View File

@@ -34,18 +34,26 @@ const CHROME_SESSION_DIR = '../chrome';
let browser = null;
let page = null;
let client = null;
let sslCaptured = false;
let shuttingDown = false;
async function setupListener(url) {
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
const timeout = getEnvInt('SSL_TIMEOUT', 30) * 1000;
let targetHost = null;
// Only extract SSL for HTTPS URLs
if (!url.startsWith('https://')) {
throw new Error('URL is not HTTPS');
}
try {
targetHost = new URL(url).host;
} catch (e) {
targetHost = null;
}
// Connect to Chrome page using shared utility
const { browser, page } = await connectToPage({
chromeSessionDir: CHROME_SESSION_DIR,
@@ -53,54 +61,54 @@ async function setupListener(url) {
puppeteer,
});
// Set up listener to capture SSL details during navigation
page.on('response', async (response) => {
client = await page.target().createCDPSession();
await client.send('Network.enable');
client.on('Network.responseReceived', (params) => {
try {
const request = response.request();
if (sslCaptured) return;
if (params.type && params.type !== 'Document') return;
const response = params.response || {};
const responseUrl = response.url || '';
if (!responseUrl.startsWith('http')) return;
// Only capture the main navigation request
if (!request.isNavigationRequest() || request.frame() !== page.mainFrame()) {
return;
if (targetHost) {
try {
const responseHost = new URL(responseUrl).host;
if (responseHost !== targetHost) return;
} catch (e) {
// Ignore URL parse errors, fall through
}
}
// Only capture if it's for our target URL
if (!response.url().startsWith(url.split('?')[0])) {
return;
}
// Get security details from the response
const securityDetails = response.securityDetails();
let sslInfo = {};
const securityDetails = response.securityDetails || null;
let sslInfo = { url: responseUrl };
if (securityDetails) {
sslInfo.protocol = securityDetails.protocol();
sslInfo.subjectName = securityDetails.subjectName();
sslInfo.issuer = securityDetails.issuer();
sslInfo.validFrom = securityDetails.validFrom();
sslInfo.validTo = securityDetails.validTo();
sslInfo.certificateId = securityDetails.subjectName();
sslInfo.securityState = 'secure';
sslInfo.protocol = securityDetails.protocol;
sslInfo.subjectName = securityDetails.subjectName;
sslInfo.issuer = securityDetails.issuer;
sslInfo.validFrom = securityDetails.validFrom;
sslInfo.validTo = securityDetails.validTo;
sslInfo.certificateId = securityDetails.subjectName;
sslInfo.securityState = response.securityState || 'secure';
sslInfo.schemeIsCryptographic = true;
const sanList = securityDetails.sanList();
const sanList = securityDetails.sanList;
if (sanList && sanList.length > 0) {
sslInfo.subjectAlternativeNames = sanList;
}
} else if (response.url().startsWith('https://')) {
// HTTPS URL but no security details means something went wrong
sslInfo.securityState = 'unknown';
} else if (responseUrl.startsWith('https://')) {
sslInfo.securityState = response.securityState || 'unknown';
sslInfo.schemeIsCryptographic = true;
sslInfo.error = 'No security details available';
} else {
// Non-HTTPS URL
sslInfo.securityState = 'insecure';
sslInfo.schemeIsCryptographic = false;
}
// Write output directly to file
fs.writeFileSync(outputPath, JSON.stringify(sslInfo, null, 2));
sslCaptured = true;
} catch (e) {
// Ignore errors
}

View File

@@ -13,26 +13,18 @@ import tempfile
import time
from pathlib import Path
import pytest
from django.test import TestCase
# Import chrome test helpers
sys.path.insert(0, str(Path(__file__).parent.parent.parent / 'chrome' / 'tests'))
from chrome_test_helpers import (
chrome_session,
CHROME_NAVIGATE_HOOK,
get_plugin_dir,
get_hook_script,
)
def chrome_available() -> bool:
"""Check if Chrome/Chromium is available."""
for name in ['chromium', 'chromium-browser', 'google-chrome', 'chrome']:
if shutil.which(name):
return True
return False
# Get the path to the SSL hook
PLUGIN_DIR = get_plugin_dir(__file__)
SSL_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_ssl.*')
@@ -47,7 +39,6 @@ class TestSSLPlugin(TestCase):
self.assertTrue(SSL_HOOK.exists(), f"Hook not found: {SSL_HOOK}")
@pytest.mark.skipif(not chrome_available(), reason="Chrome not installed")
class TestSSLWithChrome(TestCase):
"""Integration tests for SSL plugin with Chrome."""
@@ -64,88 +55,92 @@ class TestSSLWithChrome(TestCase):
test_url = 'https://example.com'
snapshot_id = 'test-ssl-snapshot'
try:
with chrome_session(
self.temp_dir,
crawl_id='test-ssl-crawl',
snapshot_id=snapshot_id,
test_url=test_url,
navigate=True,
timeout=30,
) as (chrome_process, chrome_pid, snapshot_chrome_dir, env):
# Use the environment from chrome_session (already has CHROME_HEADLESS=true)
with chrome_session(
self.temp_dir,
crawl_id='test-ssl-crawl',
snapshot_id=snapshot_id,
test_url=test_url,
navigate=False,
timeout=30,
) as (chrome_process, chrome_pid, snapshot_chrome_dir, env):
ssl_dir = snapshot_chrome_dir.parent / 'ssl'
ssl_dir.mkdir(exist_ok=True)
# Run SSL hook with the active Chrome session (background hook)
result = subprocess.Popen(
['node', str(SSL_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
cwd=str(ssl_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
env=env
)
# Run SSL hook with the active Chrome session (background hook)
result = subprocess.Popen(
['node', str(SSL_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
cwd=str(snapshot_chrome_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
env=env
)
nav_result = subprocess.run(
['node', str(CHROME_NAVIGATE_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
cwd=str(snapshot_chrome_dir),
capture_output=True,
text=True,
timeout=120,
env=env
)
self.assertEqual(nav_result.returncode, 0, f"Navigation failed: {nav_result.stderr}")
# Allow it to run briefly, then terminate (background hook)
time.sleep(3)
if result.poll() is None:
result.terminate()
try:
stdout, stderr = result.communicate(timeout=5)
except subprocess.TimeoutExpired:
result.kill()
stdout, stderr = result.communicate()
else:
# Check for output file
ssl_output = ssl_dir / 'ssl.jsonl'
for _ in range(30):
if ssl_output.exists() and ssl_output.stat().st_size > 0:
break
time.sleep(1)
if result.poll() is None:
result.terminate()
try:
stdout, stderr = result.communicate(timeout=5)
except subprocess.TimeoutExpired:
result.kill()
stdout, stderr = result.communicate()
else:
stdout, stderr = result.communicate()
# Check for output file
ssl_output = snapshot_chrome_dir / 'ssl.jsonl'
ssl_data = None
ssl_data = None
# Try parsing from file first
if ssl_output.exists():
with open(ssl_output) as f:
content = f.read().strip()
if content.startswith('{'):
try:
ssl_data = json.loads(content)
except json.JSONDecodeError:
pass
# Try parsing from file first
if ssl_output.exists():
with open(ssl_output) as f:
for line in f:
line = line.strip()
if line.startswith('{'):
try:
ssl_data = json.loads(line)
break
except json.JSONDecodeError:
continue
# Try parsing from stdout if not in file
if not ssl_data:
for line in stdout.split('\n'):
line = line.strip()
if line.startswith('{'):
try:
record = json.loads(line)
if 'protocol' in record or 'issuer' in record or record.get('type') == 'SSL':
ssl_data = record
break
except json.JSONDecodeError:
continue
# Try parsing from stdout if not in file
if not ssl_data:
for line in stdout.split('\n'):
line = line.strip()
if line.startswith('{'):
try:
record = json.loads(line)
if 'protocol' in record or 'issuer' in record or record.get('type') == 'SSL':
ssl_data = record
break
except json.JSONDecodeError:
continue
# Verify hook ran successfully
self.assertNotIn('Traceback', stderr)
self.assertNotIn('Error:', stderr)
# Verify hook ran successfully
self.assertNotIn('Traceback', stderr)
self.assertNotIn('Error:', stderr)
# example.com uses HTTPS, so we MUST get SSL certificate data
self.assertIsNotNone(ssl_data, "No SSL data extracted from HTTPS URL")
# example.com uses HTTPS, so we MUST get SSL certificate data
self.assertIsNotNone(ssl_data, "No SSL data extracted from HTTPS URL")
# Verify we got certificate info
self.assertIn('protocol', ssl_data, f"SSL data missing protocol: {ssl_data}")
self.assertTrue(
ssl_data['protocol'].startswith('TLS') or ssl_data['protocol'].startswith('SSL'),
f"Unexpected protocol: {ssl_data['protocol']}"
)
except RuntimeError as e:
if 'Chrome' in str(e) or 'CDP' in str(e):
self.skipTest(f"Chrome session setup failed: {e}")
raise
# Verify we got certificate info
self.assertIn('protocol', ssl_data, f"SSL data missing protocol: {ssl_data}")
self.assertTrue(
ssl_data['protocol'].startswith('TLS') or ssl_data['protocol'].startswith('SSL'),
f"Unexpected protocol: {ssl_data['protocol']}"
)
if __name__ == '__main__':

View File

@@ -149,6 +149,17 @@ function getFilenameFromUrl(url) {
}
}
function normalizeUrl(url) {
try {
const parsed = new URL(url);
let path = parsed.pathname || '';
if (path === '/') path = '';
return `${parsed.origin}${path}`;
} catch (e) {
return url;
}
}
async function setupStaticFileListener() {
const timeout = getEnvInt('STATICFILE_TIMEOUT', 30) * 1000;
@@ -174,7 +185,7 @@ async function setupStaticFileListener() {
const status = response.status();
// Only process the main document response
if (url !== originalUrl) return;
if (normalizeUrl(url) !== normalizeUrl(originalUrl)) return;
if (status < 200 || status >= 300) return;
firstResponseHandled = true;
@@ -313,6 +324,19 @@ async function main() {
// Wait for chrome_navigate to complete (non-fatal)
try {
await waitForPageLoaded(CHROME_SESSION_DIR, timeout * 4, 500);
if (!detectedContentType && page) {
try {
const inferred = await page.evaluate(() => document.contentType || '');
if (inferred) {
detectedContentType = inferred.split(';')[0].trim();
if (isStaticContentType(detectedContentType)) {
isStaticFile = true;
}
}
} catch (e) {
// Best-effort only
}
}
} catch (e) {
console.error(`WARN: ${e.message}`);
}

View File

@@ -48,7 +48,6 @@ class TestStaticfilePlugin(TestCase):
self.assertTrue(STATICFILE_HOOK.exists(), f"Hook not found: {STATICFILE_HOOK}")
@pytest.mark.skipif(not chrome_available(), reason="Chrome not installed")
class TestStaticfileWithChrome(TestCase):
"""Integration tests for staticfile plugin with Chrome."""
@@ -116,9 +115,7 @@ class TestStaticfileWithChrome(TestCase):
except json.JSONDecodeError:
continue
except RuntimeError as e:
if 'Chrome' in str(e) or 'CDP' in str(e):
self.skipTest(f"Chrome session setup failed: {e}")
except RuntimeError:
raise

View File

@@ -2,22 +2,27 @@
/**
* Extract the title of a URL.
*
* If a Chrome session exists (from chrome plugin), connects to it via CDP
* Requires a Chrome session (from chrome plugin) and connects to it via CDP
* to get the page title (which includes JS-rendered content).
* Otherwise falls back to fetching the URL and parsing HTML.
*
* Usage: on_Snapshot__10_title.js --url=<url> --snapshot-id=<uuid>
* Output: Writes title/title.txt
*
* Environment variables:
* TIMEOUT: Timeout in seconds (default: 30)
* USER_AGENT: User agent string (optional)
* TITLE_TIMEOUT: Timeout in seconds (default: 30)
*/
const fs = require('fs');
const path = require('path');
const https = require('https');
const http = require('http');
const puppeteer = require('puppeteer-core');
// Import shared utilities from chrome_utils.js
const {
getEnvInt,
parseArgs,
connectToPage,
waitForPageLoaded,
} = require('../chrome/chrome_utils.js');
// Extractor metadata
const PLUGIN_NAME = 'title';
@@ -25,189 +30,47 @@ const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'title.txt';
const CHROME_SESSION_DIR = '../chrome';
// Parse command line arguments
function parseArgs() {
const args = {};
process.argv.slice(2).forEach(arg => {
if (arg.startsWith('--')) {
const [key, ...valueParts] = arg.slice(2).split('=');
args[key.replace(/-/g, '_')] = valueParts.join('=') || true;
}
});
return args;
}
// Get environment variable with default
function getEnv(name, defaultValue = '') {
return (process.env[name] || defaultValue).trim();
}
function getEnvInt(name, defaultValue = 0) {
const val = parseInt(getEnv(name, String(defaultValue)), 10);
return isNaN(val) ? defaultValue : val;
}
// Wait for chrome tab to be fully loaded
async function waitForChromeTabLoaded(timeoutMs = 60000) {
const navigationFile = path.join(CHROME_SESSION_DIR, 'navigation.json');
const startTime = Date.now();
while (Date.now() - startTime < timeoutMs) {
if (fs.existsSync(navigationFile)) {
return true;
}
// Wait 100ms before checking again
await new Promise(resolve => setTimeout(resolve, 100));
}
return false;
}
// Get CDP URL from chrome plugin if available
function getCdpUrl() {
const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
if (fs.existsSync(cdpFile)) {
return fs.readFileSync(cdpFile, 'utf8').trim();
}
return null;
}
// Extract title from HTML
function extractTitleFromHtml(html) {
// Try <title> tag
const titleMatch = html.match(/<title[^>]*>([^<]+)<\/title>/i);
if (titleMatch) {
return titleMatch[1].trim();
}
// Try og:title
const ogMatch = html.match(/<meta[^>]+property=["']og:title["'][^>]+content=["']([^"']+)["']/i);
if (ogMatch) {
return ogMatch[1].trim();
}
// Try twitter:title
const twitterMatch = html.match(/<meta[^>]+name=["']twitter:title["'][^>]+content=["']([^"']+)["']/i);
if (twitterMatch) {
return twitterMatch[1].trim();
}
return null;
}
// Fetch URL and extract title (fallback method)
function fetchTitle(url) {
return new Promise((resolve, reject) => {
const timeout = getEnvInt('TIMEOUT', 30) * 1000;
const userAgent = getEnv('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)');
const client = url.startsWith('https') ? https : http;
const req = client.get(url, {
headers: { 'User-Agent': userAgent },
timeout,
}, (res) => {
// Handle redirects
if (res.statusCode >= 300 && res.statusCode < 400 && res.headers.location) {
fetchTitle(res.headers.location).then(resolve).catch(reject);
return;
}
let data = '';
res.on('data', chunk => {
data += chunk;
// Only need first 64KB to find title
if (data.length > 65536) {
req.destroy();
}
});
res.on('end', () => {
const title = extractTitleFromHtml(data);
if (title) {
resolve(title);
} else {
reject(new Error('No title found in HTML'));
}
});
});
req.on('error', reject);
req.on('timeout', () => {
req.destroy();
reject(new Error('Request timeout'));
});
});
}
// Get title using Puppeteer CDP connection
async function getTitleFromCdp(cdpUrl) {
// Wait for page to be fully loaded
const pageLoaded = await waitForChromeTabLoaded(60000);
if (!pageLoaded) {
throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
}
const puppeteer = require('puppeteer-core');
const browser = await puppeteer.connect({
browserWSEndpoint: cdpUrl,
});
async function extractTitle(url) {
// Output directory is current directory (hook already runs in output dir)
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
const timeoutMs = getEnvInt('TITLE_TIMEOUT', getEnvInt('TIMEOUT', 30)) * 1000;
let browser = null;
try {
// Get existing pages
const pages = await browser.pages();
const page = pages.find(p => p.url().startsWith('http')) || pages[0];
const connection = await connectToPage({
chromeSessionDir: CHROME_SESSION_DIR,
timeoutMs,
puppeteer,
});
browser = connection.browser;
const page = connection.page;
if (!page) {
throw new Error('No page found in Chrome session');
}
await waitForPageLoaded(CHROME_SESSION_DIR, timeoutMs * 4, 200);
// Get title from page
const title = await page.title();
let title = await page.title();
if (!title) {
// Try getting from DOM directly
const domTitle = await page.evaluate(() => {
title = await page.evaluate(() => {
return document.title ||
document.querySelector('meta[property="og:title"]')?.content ||
document.querySelector('meta[name="twitter:title"]')?.content ||
document.querySelector('h1')?.textContent?.trim();
});
return domTitle;
}
return title;
} finally {
// Disconnect without closing browser
browser.disconnect();
}
}
async function extractTitle(url) {
// Output directory is current directory (hook already runs in output dir)
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
// Try Chrome session first
const cdpUrl = getCdpUrl();
if (cdpUrl) {
try {
const title = await getTitleFromCdp(cdpUrl);
if (title) {
fs.writeFileSync(outputPath, title, 'utf8');
return { success: true, output: outputPath, title, method: 'cdp' };
}
} catch (e) {
console.error(`CDP title extraction failed: ${e.message}, falling back to HTTP`);
if (title) {
fs.writeFileSync(outputPath, title, 'utf8');
return { success: true, output: outputPath, title, method: 'cdp' };
}
}
// Fallback to HTTP fetch
try {
const title = await fetchTitle(url);
fs.writeFileSync(outputPath, title, 'utf8');
return { success: true, output: outputPath, title, method: 'http' };
return { success: false, error: 'No title found in Chrome session' };
} catch (e) {
return { success: false, error: e.message };
} finally {
if (browser) {
browser.disconnect();
}
}
}

View File

@@ -7,8 +7,7 @@ Tests verify:
3. Title extraction works for real example.com
4. Output file contains actual page title
5. Handles various title sources (<title>, og:title, twitter:title)
6. Config options work (TIMEOUT, USER_AGENT)
7. Fallback to HTTP when chrome not available
6. Config options work (TITLE_TIMEOUT)
"""
import json
@@ -23,6 +22,9 @@ from archivebox.plugins.chrome.tests.chrome_test_helpers import (
get_plugin_dir,
get_hook_script,
parse_jsonl_output,
get_test_env,
chrome_session,
CHROME_NAVIGATE_HOOK,
)
@@ -30,6 +32,25 @@ PLUGIN_DIR = get_plugin_dir(__file__)
TITLE_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_title.*')
TEST_URL = 'https://example.com'
def run_title_capture(title_dir, snapshot_chrome_dir, env, url, snapshot_id):
nav_result = subprocess.run(
['node', str(CHROME_NAVIGATE_HOOK), f'--url={url}', f'--snapshot-id={snapshot_id}'],
cwd=str(snapshot_chrome_dir),
capture_output=True,
text=True,
timeout=120,
env=env,
)
result = subprocess.run(
['node', str(TITLE_HOOK), f'--url={url}', f'--snapshot-id={snapshot_id}'],
cwd=title_dir,
capture_output=True,
text=True,
timeout=60,
env=env,
)
return nav_result, result
def test_hook_script_exists():
"""Verify hook script exists."""
@@ -46,15 +67,18 @@ def test_extracts_title_from_example_com():
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
# Run title extraction
result = subprocess.run(
['node', str(TITLE_HOOK), f'--url={TEST_URL}', '--snapshot-id=test789'],
cwd=tmpdir,
capture_output=True,
text=True,
timeout=60
,
env=get_test_env())
with chrome_session(tmpdir, test_url=TEST_URL, navigate=False) as (_process, _pid, snapshot_chrome_dir, env):
title_dir = snapshot_chrome_dir.parent / 'title'
title_dir.mkdir(exist_ok=True)
nav_result, result = run_title_capture(
title_dir,
snapshot_chrome_dir,
env,
TEST_URL,
'test789',
)
assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}"
assert result.returncode == 0, f"Extraction failed: {result.stderr}"
@@ -76,7 +100,7 @@ def test_extracts_title_from_example_com():
assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
# Verify output file exists (hook writes to current directory)
title_file = tmpdir / 'title.txt'
title_file = title_dir / 'title.txt'
assert title_file.exists(), "title.txt not created"
# Verify title contains REAL example.com title
@@ -88,56 +112,33 @@ def test_extracts_title_from_example_com():
assert 'example domain' in title_text.lower(), f"Expected 'Example Domain', got: {title_text}"
def test_falls_back_to_http_when_chrome_unavailable():
"""Test that title plugin falls back to HTTP when chrome unavailable."""
def test_fails_without_chrome_session():
"""Test that title plugin fails when chrome session is missing."""
if not shutil.which('node'):
pass
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
# Don't create chrome directory - force HTTP fallback
title_dir = tmpdir / 'snapshot' / 'title'
title_dir.mkdir(parents=True, exist_ok=True)
# Run title extraction
result = subprocess.run(
['node', str(TITLE_HOOK), f'--url={TEST_URL}', '--snapshot-id=testhttp'],
cwd=tmpdir,
cwd=title_dir,
capture_output=True,
text=True,
timeout=60
,
env=get_test_env())
timeout=60,
env=get_test_env(),
)
assert result.returncode == 0, f"Extraction failed: {result.stderr}"
# Parse clean JSONL output
result_json = None
for line in result.stdout.strip().split('\n'):
line = line.strip()
if line.startswith('{'):
pass
try:
record = json.loads(line)
if record.get('type') == 'ArchiveResult':
result_json = record
break
except json.JSONDecodeError:
pass
assert result_json, "Should have ArchiveResult JSONL output"
assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
# Verify output exists and has real title (hook writes to current directory)
output_title_file = tmpdir / 'title.txt'
assert output_title_file.exists(), "Output title.txt not created"
title_text = output_title_file.read_text().strip()
assert 'example' in title_text.lower()
assert result.returncode != 0, f"Should fail without chrome session: {result.stderr}"
assert 'No Chrome session found (chrome plugin must run first)' in (result.stdout + result.stderr)
def test_config_timeout_honored():
"""Test that TIMEOUT config is respected."""
"""Test that TITLE_TIMEOUT config is respected."""
if not shutil.which('node'):
pass
@@ -147,65 +148,27 @@ def test_config_timeout_honored():
# Set very short timeout (but example.com should still succeed)
import os
env = os.environ.copy()
env['TIMEOUT'] = '5'
env_override = os.environ.copy()
env_override['TITLE_TIMEOUT'] = '5'
result = subprocess.run(
['node', str(TITLE_HOOK), f'--url={TEST_URL}', '--snapshot-id=testtimeout'],
cwd=tmpdir,
capture_output=True,
text=True,
env=env,
timeout=30
)
with chrome_session(tmpdir, test_url=TEST_URL, navigate=False) as (_process, _pid, snapshot_chrome_dir, env):
title_dir = snapshot_chrome_dir.parent / 'title'
title_dir.mkdir(exist_ok=True)
env.update(env_override)
nav_result, result = run_title_capture(
title_dir,
snapshot_chrome_dir,
env,
TEST_URL,
'testtimeout',
)
assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}"
# Should complete (success or fail, but not hang)
assert result.returncode in (0, 1), "Should complete without hanging"
def test_config_user_agent():
"""Test that USER_AGENT config is used."""
if not shutil.which('node'):
pass
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
# Set custom user agent
import os
env = os.environ.copy()
env['USER_AGENT'] = 'TestBot/1.0'
result = subprocess.run(
['node', str(TITLE_HOOK), f'--url={TEST_URL}', '--snapshot-id=testua'],
cwd=tmpdir,
capture_output=True,
text=True,
env=env,
timeout=60
)
# Should succeed (example.com doesn't block)
if result.returncode == 0:
# Parse clean JSONL output
result_json = None
for line in result.stdout.strip().split('\n'):
line = line.strip()
if line.startswith('{'):
pass
try:
record = json.loads(line)
if record.get('type') == 'ArchiveResult':
result_json = record
break
except json.JSONDecodeError:
pass
assert result_json, "Should have ArchiveResult JSONL output"
assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
def test_handles_https_urls():
"""Test that HTTPS URLs work correctly."""
@@ -215,18 +178,22 @@ def test_handles_https_urls():
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
result = subprocess.run(
['node', str(TITLE_HOOK), '--url=https://example.org', '--snapshot-id=testhttps'],
cwd=tmpdir,
capture_output=True,
text=True,
timeout=60
,
env=get_test_env())
with chrome_session(tmpdir, test_url='https://example.org', navigate=False) as (_process, _pid, snapshot_chrome_dir, env):
title_dir = snapshot_chrome_dir.parent / 'title'
title_dir.mkdir(exist_ok=True)
nav_result, result = run_title_capture(
title_dir,
snapshot_chrome_dir,
env,
'https://example.org',
'testhttps',
)
assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}"
if result.returncode == 0:
# Hook writes to current directory
output_title_file = tmpdir / 'title.txt'
output_title_file = title_dir / 'title.txt'
if output_title_file.exists():
title_text = output_title_file.read_text().strip()
assert len(title_text) > 0, "Title should not be empty"
@@ -246,14 +213,23 @@ def test_handles_404_gracefully():
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
result = subprocess.run(
['node', str(TITLE_HOOK), '--url=https://example.com/nonexistent-page-404', '--snapshot-id=test404'],
cwd=tmpdir,
capture_output=True,
text=True,
timeout=60
,
env=get_test_env())
with chrome_session(tmpdir, test_url='https://example.com/nonexistent-page-404', navigate=False) as (
_process,
_pid,
snapshot_chrome_dir,
env,
):
title_dir = snapshot_chrome_dir.parent / 'title'
title_dir.mkdir(exist_ok=True)
nav_result, result = run_title_capture(
title_dir,
snapshot_chrome_dir,
env,
'https://example.com/nonexistent-page-404',
'test404',
)
assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}"
# May succeed or fail depending on server behavior
# example.com returns "Example Domain" even for 404s
@@ -269,20 +245,29 @@ def test_handles_redirects():
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
# http://example.com redirects to https://example.com
result = subprocess.run(
['node', str(TITLE_HOOK), '--url=http://example.com', '--snapshot-id=testredirect'],
cwd=tmpdir,
capture_output=True,
text=True,
timeout=60
,
env=get_test_env())
with chrome_session(tmpdir, test_url='http://example.com', navigate=False) as (
_process,
_pid,
snapshot_chrome_dir,
env,
):
title_dir = snapshot_chrome_dir.parent / 'title'
title_dir.mkdir(exist_ok=True)
# http://example.com redirects to https://example.com
nav_result, result = run_title_capture(
title_dir,
snapshot_chrome_dir,
env,
'http://example.com',
'testredirect',
)
assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}"
# Should succeed and follow redirect
if result.returncode == 0:
# Hook writes to current directory
output_title_file = tmpdir / 'title.txt'
output_title_file = title_dir / 'title.txt'
if output_title_file.exists():
title_text = output_title_file.read_text().strip()
assert 'example' in title_text.lower()

View File

@@ -174,7 +174,7 @@ async function configure2Captcha() {
// Connect to the existing Chrome session via CDP
const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
if (!fs.existsSync(cdpFile)) {
return { success: false, error: 'CDP URL not found - chrome plugin must run first' };
return { success: false, error: 'No Chrome session found (chrome plugin must run first)' };
}
const cdpUrl = fs.readFileSync(cdpFile, 'utf-8').trim();

View File

@@ -44,7 +44,7 @@ class TestTwoCaptcha:
def setup(self):
self.api_key = os.environ.get('TWOCAPTCHA_API_KEY') or os.environ.get('API_KEY_2CAPTCHA')
if not self.api_key:
pytest.skip("TWOCAPTCHA_API_KEY required")
pytest.fail("TWOCAPTCHA_API_KEY required")
def test_install_and_load(self):
"""Extension installs and loads in Chromium."""

View File

@@ -14,6 +14,7 @@ import pytest
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
setup_test_env,
get_test_env,
launch_chromium_session,
kill_chromium_session,
CHROME_LAUNCH_HOOK,
@@ -283,8 +284,7 @@ const puppeteer = require('puppeteer-core');
result = subprocess.run(
['node', str(script_path)],
cwd=str(script_dir,
env=get_test_env()),
cwd=str(script_dir),
capture_output=True,
text=True,
env=env,
@@ -301,11 +301,10 @@ const puppeteer = require('puppeteer-core');
return json.loads(output_lines[-1])
# Test URL: Yahoo has many ads that uBlock should block
# Test URL: Yahoo has many ads that uBlock should block (no mocks)
TEST_URL = 'https://www.yahoo.com/'
@pytest.mark.timeout(15)
def test_extension_loads_in_chromium():
"""Verify uBlock extension loads in Chromium by visiting its dashboard page.
@@ -519,15 +518,15 @@ const puppeteer = require('puppeteer-core');
pass
def test_blocks_ads_on_test_page():
"""Live test: verify uBlock Origin blocks ads on a test page.
def test_blocks_ads_on_yahoo_com():
"""Live test: verify uBlock Origin blocks ads on yahoo.com (real network).
This test runs TWO browser sessions:
1. WITHOUT extension - verifies ads are NOT blocked (baseline)
2. WITH extension - verifies ads ARE blocked
This ensures we're actually testing the extension's effect, not just
that a test page happens to show ads as blocked.
that a test page happens to show ads as blocked. No mocks are used.
"""
import time
@@ -581,20 +580,15 @@ def test_blocks_ads_on_test_page():
# Verify baseline shows ads ARE visible (not blocked)
if baseline_result['adElementsFound'] == 0:
pytest.skip(
f"Cannot test extension: no ad elements found on {TEST_URL}. "
f"The page may have changed or loaded differently."
pytest.fail(
f"Baseline must find ad elements on {TEST_URL}, but found none. "
f"This test requires a real ad-heavy page."
)
if baseline_result['adElementsVisible'] == 0:
print(f"\nWARNING: Baseline shows 0 visible ads despite finding {baseline_result['adElementsFound']} elements!")
print("This suggests either:")
print(" - There's another ad blocker interfering")
print(" - Network-level ad blocking is in effect")
pytest.skip(
f"Cannot test extension: baseline shows no visible ads "
f"despite finding {baseline_result['adElementsFound']} ad elements."
pytest.fail(
f"Baseline must have visible ads on {TEST_URL}, but none were visible. "
f"This likely means another ad blocker is active or network-level blocking is in effect."
)
print(f"\n✓ Baseline confirmed: {baseline_result['adElementsVisible']} visible ads without extension")
@@ -713,6 +707,10 @@ const puppeteer = require('{env_base['NODE_MODULES_DIR']}/puppeteer-core');
f"With extension: {ext_result['adElementsVisible']} visible ads\n" \
f"Expected fewer ads with extension."
# Ensure uBlock actually blocks at least some ad/track requests
assert ext_result['blockedRequests'] > 0, \
"uBlock should block at least one ad/track request on yahoo.com"
# Extension should block at least 20% of ads (was consistently blocking 5-13% without proper init time)
assert reduction_percent >= 20, \
f"uBlock should block at least 20% of ads.\n" \

View File

@@ -1,14 +1,17 @@
<!-- YT-DLP thumbnail - shows video/audio player or placeholder -->
<div class="extractor-thumbnail ytdlp-thumbnail" style="width: 100%; height: 100px; overflow: hidden; background: #1a1a1a; display: flex; align-items: center; justify-content: center;">
<video src="{{ output_path }}"
style="width: 100%; height: 100px; object-fit: contain;"
poster=""
preload="metadata"
muted
onerror="this.style.display='none'; this.nextElementSibling.style.display='flex';">
</video>
<div style="display: none; flex-direction: column; align-items: center; color: #888; font-size: 12px;">
<span style="font-size: 32px;">🎬</span>
<span>YT-DLP</span>
<!-- YT-DLP output list -->
{% if media_files %}
<div class="loose-items" style="pointer-events: auto;">
{% for file in media_files %}
<a href="{{ file.url|default:file.path|urlencode }}" target="preview"
title="{{ file.name }}">
📄 {{ file.name }}
</a>
{% endfor %}
</div>
</div>
{% else %}
<div class="thumbnail-compact" data-plugin="ytdlp" data-compact="1">
<span class="thumbnail-compact-icon">🎬</span>
<span class="thumbnail-compact-label">YT-DLP</span>
<span class="thumbnail-compact-meta">media</span>
</div>
{% endif %}

View File

@@ -0,0 +1,31 @@
{% load i18n %}
<div class="actions">
<div class="actions-left">
{% block actions %}
{% block actions-form %}
{% for field in action_form %}
{% if field.name == "tags" %}
<span class="actions-tags">{{ field }}</span>
{% else %}
{% if field.label %}<label>{{ field.label }} {{ field }}</label>{% else %}{{ field }}{% endif %}
{% endif %}
{% endfor %}
{% endblock %}
{% block actions-submit %}
<button type="submit" class="button" name="index" value="{{ action_index|default:0 }}">{% translate "Run" %}</button>
{% endblock %}
{% block actions-counter %}
{% if actions_selection_counter %}
<span class="action-counter" data-actions-icnt="{{ cl.result_list|length }}">{{ selection_note }}</span>
{% if cl.result_count != cl.result_list|length %}
<span class="all hidden">{{ selection_note_all }}</span>
<span class="question hidden">
<a role="button" href="#" title="{% translate "Click here to select the objects across all pages" %}">{% blocktranslate with cl.result_count as total_count %}Select all {{ total_count }} {{ module_name }}{% endblocktranslate %}</a>
</span>
<span class="clear hidden"><a role="button" href="#">{% translate "Clear selection" %}</a></span>
{% endif %}
{% endif %}
{% endblock %}
{% endblock %}
</div>
</div>

View File

@@ -1,4 +1,4 @@
{% load i18n static tz %}
{% load i18n static tz core_tags %}
{% get_current_language as LANGUAGE_CODE %}
{% get_current_language_bidi as LANGUAGE_BIDI %}
@@ -12,6 +12,10 @@
{% endblock %}
<link rel="stylesheet" type="text/css" href="{% block stylesheet %}{% static "admin/css/base.css" %}{% endblock %}">
{% api_token as api_token %}
<script>
window.ARCHIVEBOX_API_KEY = "{{ api_token|escapejs }}";
</script>
{% block extrastyle %}
<style>
#upgrade-banner {
@@ -55,8 +59,8 @@
}
/* Main form container - flexbox grid */
#content-main form > div,
#content form > div {
body:not(.change-list) #content-main form > div,
body:not(.change-list) #content form > div {
display: flex;
flex-wrap: wrap;
gap: 20px;
@@ -909,8 +913,8 @@
}
/* Toolbar / search bar */
#toolbar {
padding: 16px;
#changelist #toolbar {
padding: 12px 16px;
background: #fff;
border-bottom: 1px solid #e2e8f0;
display: flex;
@@ -926,6 +930,21 @@
flex: 0 1 auto;
max-width: 500px;
}
body.change-list #toolbar form > div {
display: flex !important;
align-items: center;
gap: 8px;
flex-wrap: nowrap !important;
white-space: nowrap;
}
body.change-list #toolbar label {
margin: 0;
display: inline-flex;
align-items: center;
}
body.change-list #toolbar input[type="submit"] {
margin: 0;
}
#searchbar {
flex: 1;
@@ -961,6 +980,36 @@
letter-spacing: 0.025em;
margin: 0;
border-bottom: 1px solid #e2e8f0;
display: flex;
align-items: center;
justify-content: space-between;
gap: 8px;
}
#changelist-filter .filter-toggle {
border: 1px solid #e2e8f0;
background: #ffffff;
color: #64748b;
font-size: 11px;
padding: 4px 8px;
border-radius: 999px;
cursor: pointer;
text-transform: none;
letter-spacing: normal;
}
#changelist-filter .filter-toggle:hover {
background: #f1f5f9;
color: #334155;
}
.filter-toggle-floating {
position: static;
box-shadow: none;
padding: 2px 6px;
font-size: 11px;
line-height: 1.2;
height: 20px;
}
#changelist-filter h3 {
@@ -1004,15 +1053,62 @@
font-weight: 500;
}
body.filters-collapsed #changelist-filter {
display: none !important;
}
body.filters-collapsed.change-list .results,
body.filters-collapsed.change-list .paginator,
body.filters-collapsed.change-list #toolbar,
body.filters-collapsed.change-list div.xfull,
body.filters-collapsed.change-list #changelist .changelist-form-container,
body.filters-collapsed.change-list #changelist-form,
body.filters-collapsed.change-list #result_list {
margin-right: 0 !important;
width: 100% !important;
}
body.filters-collapsed.change-list #changelist .changelist-form-container > div {
max-width: 100% !important;
}
/* Actions bar */
.actions {
body.change-list #changelist .actions {
padding: 12px 16px;
background: #f8fafc;
border-bottom: 1px solid #e2e8f0;
border-bottom: 0;
display: flex !important;
align-items: center;
gap: 8px;
flex-wrap: nowrap !important;
overflow-x: auto;
}
body.change-list #changelist {
border: 0 !important;
}
body.change-list #changelist .actions .button,
body.change-list #changelist .actions select,
body.change-list #changelist .actions label {
line-height: 1.5rem;
height: 1.5rem;
display: inline-flex;
align-items: center;
}
body.change-list #changelist .actions-left {
display: flex;
align-items: center;
gap: 12px;
flex-wrap: wrap;
gap: 8px;
flex-wrap: nowrap !important;
flex: 1 1 auto;
min-width: 0;
white-space: nowrap;
}
body.change-list #changelist .actions-right {
display: flex;
align-items: center;
gap: 8px;
margin-left: auto;
flex: 0 0 auto;
}
.actions label {
@@ -1098,22 +1194,23 @@
align-items: center;
gap: 4px;
padding: 4px 8px 4px 10px;
background: linear-gradient(135deg, #3b82f6 0%, #2563eb 100%);
color: #fff;
background: var(--tag-bg, #e2e8f0);
color: var(--tag-fg, #1e293b);
font-size: 13px;
font-weight: 500;
border-radius: 16px;
white-space: nowrap;
transition: all 0.15s ease;
-webkit-font-smoothing: antialiased;
border: 1px solid var(--tag-border, #cbd5e1);
}
.tag-pill:hover {
background: linear-gradient(135deg, #2563eb 0%, #1d4ed8 100%);
filter: brightness(0.98);
}
.tag-pill a.tag-link {
color: #fff;
color: inherit;
text-decoration: none;
}
@@ -1130,10 +1227,10 @@
height: 16px;
padding: 0;
margin: 0;
background: rgba(255, 255, 255, 0.2);
border: none;
background: rgba(15, 23, 42, 0.08);
border: 1px solid rgba(15, 23, 42, 0.12);
border-radius: 50%;
color: #fff;
color: inherit;
font-size: 14px;
font-weight: 600;
line-height: 1;
@@ -1143,7 +1240,7 @@
}
.tag-remove-btn:hover {
background: rgba(255, 255, 255, 0.4);
background: rgba(15, 23, 42, 0.18);
opacity: 1;
}
@@ -1196,29 +1293,94 @@
font-size: 12px;
}
.tag-inline-input-sm {
width: 24px;
min-width: 24px;
max-width: 100px;
padding: 2px 4px;
border: none;
#content .tag-editor-inline input.tag-inline-input-sm {
width: 22px;
min-width: 22px;
max-width: 140px;
height: 22px;
padding: 0 6px;
border: 1px solid #e2e8f0;
outline: none;
font-size: 11px;
font-size: 12px;
font-family: inherit;
background: transparent;
color: #64748b;
transition: width 0.15s ease;
background: #f1f5f9;
color: #94a3b8;
border-radius: 999px;
text-align: center;
cursor: text;
transition: width 0.15s ease, color 0.15s ease, border-color 0.15s ease, background 0.15s ease;
}
.tag-inline-input-sm:focus {
width: 80px;
#content .tag-editor-inline input.tag-inline-input-sm:focus {
width: 120px;
color: #1e293b;
border-color: #94a3b8;
background: #ffffff;
text-align: left;
}
.tag-inline-input-sm::placeholder {
#content .tag-editor-inline input.tag-inline-input-sm::placeholder {
color: #94a3b8;
}
/* Actions bar tag editor (compact to avoid crowding buttons) */
body.change-list #changelist .actions .tag-editor-container {
padding: 2px 6px;
min-height: 24px;
height: 24px;
width: 160px;
max-width: 160px;
flex: 0 0 160px;
flex-wrap: nowrap;
overflow-x: auto;
overflow-y: hidden;
gap: 4px;
}
body.change-list #changelist .actions-tags {
display: none;
align-items: center;
}
/* Ensure changelist filter sidebar is visible */
body.change-list #changelist .changelist-form-container {
display: flex;
align-items: flex-start;
width: 100%;
gap: 20px;
flex-wrap: nowrap;
}
body.change-list #changelist-filter {
flex: 0 0 260px;
max-width: 260px;
display: block;
margin: 0;
order: 2;
align-self: flex-start;
}
body.change-list #changelist .changelist-form-container > div {
flex: 1 1 auto;
min-width: 0;
order: 1;
max-width: calc(100% - 280px);
}
.actions .tag-pills {
gap: 4px;
flex-wrap: nowrap;
}
.actions .tag-pill {
padding: 1px 6px 1px 8px;
font-size: 10px;
}
.actions .tag-inline-input {
min-width: 40px;
padding: 0;
font-size: 11px;
}
/* Container in list view title column */
.tags-inline-editor {
display: inline;
@@ -1497,6 +1659,12 @@
console.log('Converted', buttons.children().length, 'admin actions from dropdown to buttons')
jQuery('select[multiple]').select2();
}
// Show the inline tag-editor widget in the actions bar only while at least one
// changelist row checkbox is selected; hide it otherwise.
function updateTagWidgetVisibility() {
const tagContainer = document.querySelector('.actions-tags');
if (!tagContainer) return;
// Count checked row-selection checkboxes in the changelist form.
const checked = document.querySelectorAll('#changelist-form input.action-select:checked').length;
tagContainer.style.display = checked > 0 ? 'inline-flex' : 'none';
}
function fixInlineAddRow() {
$('#id_snapshottag-MAX_NUM_FORMS').val('1000')
$('.add-row').show()
@@ -1536,11 +1704,87 @@
}
$(document).ready(function() {
fix_actions()
updateTagWidgetVisibility()
const form = document.querySelector('#changelist-form')
if (form) {
form.addEventListener('change', updateTagWidgetVisibility)
}
fixInlineAddRow()
setupSnapshotGridListToggle()
setTimeOffset()
selectSnapshotIfHotlinked()
})
</script>
<script>
// Changelist filter sidebar show/hide toggle.
// Injects a "Hide"/"Filters" toggle into the filter header (if the template did
// not render one), plus a floating "Filters" button in the actions bar, and
// persists the collapsed state in localStorage so it survives navigation.
(function() {
if (!document.body.classList.contains('change-list')) return;
var filter = document.getElementById('changelist-filter');
if (!filter) return;
var header = filter.querySelector('h2');
if (!header) return;
var toggle = document.getElementById('changelist-filter-toggle');
if (!toggle) {
// Template did not render the toggle button: create it dynamically.
toggle = document.createElement('button');
toggle.type = 'button';
toggle.id = 'changelist-filter-toggle';
toggle.className = 'filter-toggle';
toggle.setAttribute('aria-expanded', 'true');
toggle.dataset.showLabel = '{% translate "Filters" %}';
toggle.dataset.hideLabel = '{% translate "Hide" %}';
toggle.textContent = toggle.dataset.hideLabel;
header.appendChild(toggle);
}
var storageKey = 'admin-filters-collapsed';
var changelist = document.getElementById('changelist');
// Remember whether Django rendered the "filtered" layout class so it can be
// restored when the sidebar is re-expanded.
var hadFiltered = changelist && changelist.classList.contains('filtered');
var floating = document.getElementById('changelist-filter-float-toggle');
if (!floating) {
// Floating re-open button, shown only while the sidebar is collapsed.
floating = document.createElement('button');
floating.type = 'button';
floating.id = 'changelist-filter-float-toggle';
floating.className = 'filter-toggle filter-toggle-floating';
floating.textContent = toggle.dataset.showLabel;
}
// Prefer mounting the floating button in the actions-right slot; fall back
// to the actions bar itself.
var actionsRight = document.querySelector('#changelist .actions .actions-right');
var actionsBar = document.querySelector('#changelist .actions');
if (actionsRight) {
actionsRight.appendChild(floating);
} else if (actionsBar) {
actionsBar.appendChild(floating);
}
// Sync DOM (body class, sidebar visibility, labels, aria state) from the
// persisted collapsed flag.
function applyState() {
var collapsed = localStorage.getItem(storageKey) === 'true';
document.body.classList.toggle('filters-collapsed', collapsed);
filter.style.display = collapsed ? 'none' : '';
toggle.textContent = collapsed ? toggle.dataset.showLabel : toggle.dataset.hideLabel;
toggle.setAttribute('aria-expanded', collapsed ? 'false' : 'true');
floating.style.display = collapsed ? 'inline-flex' : 'none';
if (changelist) {
if (collapsed) {
changelist.classList.remove('filtered');
} else if (hadFiltered) {
changelist.classList.add('filtered');
}
}
}
// Flip the persisted flag, then re-sync the DOM.
function toggleFilters() {
var collapsed = !document.body.classList.contains('filters-collapsed');
localStorage.setItem(storageKey, collapsed ? 'true' : 'false');
applyState();
}
toggle.addEventListener('click', toggleFilters);
floating.addEventListener('click', toggleFilters);
applyState();
})();
</script>
<script src="{% static 'admin-inline-tags.js' %}"></script>
</body>
</html>

View File

@@ -78,7 +78,19 @@
{% block filters %}
{% if cl.has_filters %}
<div id="changelist-filter">
<h2>{% translate 'Filter' %}</h2>
<h2>
{% translate 'Filter' %}
<button
type="button"
id="changelist-filter-toggle"
class="filter-toggle"
aria-expanded="true"
data-show-label="{% translate 'Filters' %}"
data-hide-label="{% translate 'Hide' %}"
>
{% translate 'Hide' %}
</button>
</h2>
{% if cl.has_active_filters %}<h3 id="changelist-filter-clear">
<a href="{{ cl.clear_all_filters_qs }}">&#10006; {% translate "Clear all filters" %}</a>
</h3>{% endif %}
@@ -88,4 +100,28 @@
{% endblock %}
</div>
</div>
{% if cl.has_filters %}
<script>
// Wire the template-rendered filter "Hide"/"Filters" toggle button.
// Collapsed state is persisted in localStorage under the same key used by the
// base-template script, so the preference is shared across changelist pages.
(function() {
var storageKey = 'admin-filters-collapsed';
var toggle = document.getElementById('changelist-filter-toggle');
if (!toggle) return;
// Sync body class, button label, and aria-expanded from the stored flag.
// Actual hiding of the sidebar is done via the body.filters-collapsed CSS.
function applyState() {
var collapsed = localStorage.getItem(storageKey) === 'true';
document.body.classList.toggle('filters-collapsed', collapsed);
toggle.textContent = collapsed ? toggle.dataset.showLabel : toggle.dataset.hideLabel;
toggle.setAttribute('aria-expanded', collapsed ? 'false' : 'true');
}
toggle.addEventListener('click', function() {
// Invert the current state, persist it, then re-render.
var collapsed = !document.body.classList.contains('filters-collapsed');
localStorage.setItem(storageKey, collapsed ? 'true' : 'false');
applyState();
});
applyState();
})();
</script>
{% endif %}
{% endblock %}

View File

@@ -78,7 +78,19 @@
{% block filters %}
{% if cl.has_filters %}
<div id="changelist-filter">
<h2>{% translate 'Filter' %}</h2>
<h2>
{% translate 'Filter' %}
<button
type="button"
id="changelist-filter-toggle"
class="filter-toggle"
aria-expanded="true"
data-show-label="{% translate 'Filters' %}"
data-hide-label="{% translate 'Hide' %}"
>
{% translate 'Hide' %}
</button>
</h2>
{% if cl.has_active_filters %}<h3 id="changelist-filter-clear">
<a href="{{ cl.clear_all_filters_qs }}">&#10006; {% translate "Clear all filters" %}</a>
</h3>{% endif %}
@@ -88,4 +100,28 @@
{% endblock %}
</div>
</div>
{% if cl.has_filters %}
<script>
// Wire the template-rendered filter "Hide"/"Filters" toggle button.
// Collapsed state is persisted in localStorage so the preference is shared
// across changelist pages that use the same storage key.
(function() {
var storageKey = 'admin-filters-collapsed';
var toggle = document.getElementById('changelist-filter-toggle');
if (!toggle) return;
// Sync body class, button label, and aria-expanded from the stored flag.
// Actual hiding of the sidebar is done via the body.filters-collapsed CSS.
function applyState() {
var collapsed = localStorage.getItem(storageKey) === 'true';
document.body.classList.toggle('filters-collapsed', collapsed);
toggle.textContent = collapsed ? toggle.dataset.showLabel : toggle.dataset.hideLabel;
toggle.setAttribute('aria-expanded', collapsed ? 'false' : 'true');
}
toggle.addEventListener('click', function() {
// Invert the current state, persist it, then re-render.
var collapsed = !document.body.classList.contains('filters-collapsed');
localStorage.setItem(storageKey, collapsed ? 'true' : 'false');
applyState();
});
applyState();
})();
</script>
{% endif %}
{% endblock %}

View File

@@ -130,6 +130,29 @@
color: #c9d1d9;
border-color: #8b949e;
}
/* Per-item cancel (✕) button shown next to queued crawls/snapshots in the
   progress monitor. Red outline style matching the monitor's dark theme. */
#progress-monitor .cancel-item-btn {
background: transparent;
border: 1px solid #30363d;
color: #f85149;
cursor: pointer;
padding: 2px 6px;
border-radius: 6px;
font-size: 11px;
line-height: 1;
transition: all 0.2s;
flex-shrink: 0;
}
/* Hover: tint background and brighten the red. */
#progress-monitor .cancel-item-btn:hover {
background: rgba(248, 81, 73, 0.12);
border-color: #f85149;
color: #ff7b72;
}
/* While a cancel request is in flight (JS toggles .is-busy): dim + wait cursor. */
#progress-monitor .cancel-item-btn.is-busy {
opacity: 0.6;
cursor: wait;
border-color: #6e7681;
color: #6e7681;
}
/* Tree Container */
#progress-monitor .tree-container {
@@ -161,14 +184,21 @@
gap: 12px;
padding: 10px 14px;
background: rgba(0,0,0,0.2);
cursor: pointer;
text-decoration: none;
color: inherit;
}
#progress-monitor .crawl-header:hover {
background: rgba(88, 166, 255, 0.1);
}
#progress-monitor a.crawl-header:visited {
#progress-monitor .crawl-header-link {
display: flex;
align-items: center;
gap: 12px;
flex: 1;
min-width: 0;
cursor: pointer;
text-decoration: none;
color: inherit;
}
#progress-monitor a.crawl-header-link:visited {
color: inherit;
}
#progress-monitor .crawl-icon {
@@ -256,14 +286,21 @@
align-items: center;
gap: 10px;
padding: 8px 12px;
cursor: pointer;
text-decoration: none;
color: inherit;
}
#progress-monitor .snapshot-header:hover {
background: rgba(88, 166, 255, 0.05);
}
#progress-monitor a.snapshot-header:visited {
#progress-monitor .snapshot-header-link {
display: flex;
align-items: center;
gap: 10px;
flex: 1;
min-width: 0;
cursor: pointer;
text-decoration: none;
color: inherit;
}
#progress-monitor a.snapshot-header-link:visited {
color: inherit;
}
#progress-monitor .snapshot-icon {
@@ -342,7 +379,6 @@
}
#progress-monitor .extractor-badge.started .progress-fill {
background: rgba(210, 153, 34, 0.3);
width: 50%;
animation: progress-pulse 1.5s ease-in-out infinite;
}
@keyframes progress-pulse {
@@ -518,6 +554,25 @@
letter-spacing: 0.5px;
flex-shrink: 0;
}
/* Small monospace pill showing a worker/orchestrator process id
   (e.g. "pid 1234") next to crawl/snapshot/extractor entries. */
#progress-monitor .pid-label {
display: inline-flex;
align-items: center;
gap: 4px;
padding: 2px 6px;
border-radius: 999px;
font-size: 10px;
font-weight: 600;
color: #8b949e;
background: rgba(148, 163, 184, 0.12);
border: 1px solid rgba(148, 163, 184, 0.2);
font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace;
letter-spacing: 0.2px;
white-space: nowrap;
}
/* Denser variant used inside extractor badges and the header status row. */
#progress-monitor .pid-label.compact {
padding: 1px 5px;
font-size: 9px;
}
</style>
@@ -527,6 +582,7 @@
<div class="orchestrator-status">
<span class="status-dot stopped" id="orchestrator-dot"></span>
<span id="orchestrator-text">Stopped</span>
<span class="pid-label compact" id="orchestrator-pid" style="display:none;"></span>
</div>
<div class="stats">
<div class="stat">
@@ -572,12 +628,32 @@
const thumbnailStrip = document.getElementById('thumbnail-strip');
let pollInterval = null;
let pollDelayMs = 1000;
let idleTicks = 0;
let isCollapsed = localStorage.getItem('progress-monitor-collapsed') === 'true';
let knownThumbnailIds = new Set();
// Baselines for resettable counters
let succeededBaseline = parseInt(localStorage.getItem('progress-succeeded-baseline') || '0');
let failedBaseline = parseInt(localStorage.getItem('progress-failed-baseline') || '0');
// Return the session API key injected by the admin base template
// (window.ARCHIVEBOX_API_KEY), trimmed; empty string when unset.
function getApiKey() {
    const rawKey = window.ARCHIVEBOX_API_KEY || '';
    return rawKey.trim();
}
// Append the session API key to `path` as an `api_key` query parameter.
// Returns the path unchanged when no key is configured; respects any
// existing query string by choosing '?' vs '&' as the separator.
function buildApiUrl(path) {
    const apiKey = getApiKey();
    if (!apiKey) {
        return path;
    }
    const separator = path.indexOf('?') === -1 ? '?' : '&';
    return path + separator + 'api_key=' + encodeURIComponent(apiKey);
}
// Build the default JSON request headers, adding the ArchiveBox API key
// header only when a key is available for this session.
function buildApiHeaders() {
    const headers = { 'Content-Type': 'application/json' };
    const key = getApiKey();
    if (key) {
        headers['X-ArchiveBox-API-Key'] = key;
    }
    return headers;
}
let lastSucceeded = 0;
let lastFailed = 0;
@@ -620,6 +696,7 @@
return icons[plugin] || '&#128196;';
}
function renderThumbnail(thumb, isNew) {
const ext = (thumb.embed_path || '').toLowerCase().split('.').pop();
const isImage = ['png', 'jpg', 'jpeg', 'gif', 'webp', 'svg', 'ico'].includes(ext);
@@ -630,9 +707,10 @@
item.title = `${thumb.plugin}: ${thumb.snapshot_url}`;
item.dataset.id = thumb.id;
if (isImage && thumb.archive_path) {
const archiveUrl = thumb.archive_url || thumb.archive_path;
if (isImage && archiveUrl) {
item.innerHTML = `
<img src="${thumb.archive_path}" alt="${thumb.plugin}" loading="lazy" onerror="this.parentElement.innerHTML='<div class=\\'thumbnail-fallback\\'>${getPluginIcon(thumb.plugin)}</div><span class=\\'thumbnail-plugin\\'>${thumb.plugin}</span>'">
<img src="${archiveUrl}" alt="${thumb.plugin}" loading="lazy" onerror="this.parentElement.innerHTML='<div class=\\'thumbnail-fallback\\'>${getPluginIcon(thumb.plugin)}</div><span class=\\'thumbnail-plugin\\'>${thumb.plugin}</span>'">
<span class="thumbnail-plugin">${thumb.plugin}</span>
`;
} else {
@@ -685,13 +763,19 @@
extractor.status === 'failed' ? '&#10007;' :
extractor.status === 'backoff' ? '&#8987;' :
extractor.status === 'skipped' ? '&#8674;' : '&#9675;';
const progress = typeof extractor.progress === 'number'
? Math.max(0, Math.min(100, extractor.progress))
: null;
const progressStyle = progress !== null ? ` style="width: ${progress}%;"` : '';
const pidHtml = extractor.pid ? `<span class="pid-label compact">pid ${extractor.pid}</span>` : '';
return `
<span class="extractor-badge ${extractor.status || 'queued'}">
<span class="progress-fill"></span>
<span class="progress-fill"${progressStyle}></span>
<span class="badge-content">
<span class="badge-icon">${icon}</span>
<span>${extractor.plugin || 'unknown'}</span>
${pidHtml}
</span>
</span>
`;
@@ -700,6 +784,11 @@
function renderSnapshot(snapshot, crawlId) {
const statusIcon = snapshot.status === 'started' ? '&#8635;' : '&#128196;';
const adminUrl = `/admin/core/snapshot/${snapshot.id || 'unknown'}/change/`;
const canCancel = snapshot.status === 'queued';
const cancelBtn = canCancel
? `<button class="cancel-item-btn" data-cancel-type="snapshot" data-snapshot-id="${snapshot.id}" data-label="✕" title="Cancel snapshot">✕</button>`
: '';
const snapshotPidHtml = snapshot.worker_pid ? `<span class="pid-label compact">pid ${snapshot.worker_pid}</span>` : '';
let extractorHtml = '';
if (snapshot.all_plugins && snapshot.all_plugins.length > 0) {
@@ -716,18 +805,22 @@
return `
<div class="snapshot-item">
<a class="snapshot-header" href="${adminUrl}">
<span class="snapshot-icon">${statusIcon}</span>
<div class="snapshot-info">
<div class="snapshot-url">${formatUrl(snapshot.url)}</div>
<div class="snapshot-meta">
${(snapshot.total_plugins || 0) > 0
? `${snapshot.completed_plugins || 0}/${snapshot.total_plugins || 0} extractors${(snapshot.failed_plugins || 0) > 0 ? ` <span style="color:#f85149">(${snapshot.failed_plugins} failed)</span>` : ''}`
: 'Waiting for extractors...'}
<div class="snapshot-header">
<a class="snapshot-header-link" href="${adminUrl}">
<span class="snapshot-icon">${statusIcon}</span>
<div class="snapshot-info">
<div class="snapshot-url">${formatUrl(snapshot.url)}</div>
<div class="snapshot-meta">
${(snapshot.total_plugins || 0) > 0
? `${snapshot.completed_plugins || 0}/${snapshot.total_plugins || 0} extractors${(snapshot.failed_plugins || 0) > 0 ? ` <span style="color:#f85149">(${snapshot.failed_plugins} failed)</span>` : ''}`
: 'Waiting for extractors...'}
</div>
</div>
</div>
<span class="status-badge ${snapshot.status || 'unknown'}">${snapshot.status || 'unknown'}</span>
</a>
${snapshotPidHtml}
<span class="status-badge ${snapshot.status || 'unknown'}">${snapshot.status || 'unknown'}</span>
</a>
${cancelBtn}
</div>
<div class="snapshot-progress">
<div class="progress-bar-container">
<div class="progress-bar snapshot ${snapshot.status === 'started' && (snapshot.progress || 0) === 0 ? 'indeterminate' : ''}"
@@ -742,6 +835,11 @@
function renderCrawl(crawl) {
const statusIcon = crawl.status === 'started' ? '&#8635;' : '&#128269;';
const adminUrl = `/admin/crawls/crawl/${crawl.id || 'unknown'}/change/`;
const canCancel = crawl.status === 'queued' || crawl.status === 'started';
const cancelBtn = canCancel
? `<button class="cancel-item-btn" data-cancel-type="crawl" data-crawl-id="${crawl.id}" data-label="✕" title="Cancel crawl">✕</button>`
: '';
const crawlPidHtml = crawl.worker_pid ? `<span class="pid-label compact">pid ${crawl.worker_pid}</span>` : '';
let snapshotsHtml = '';
if (crawl.active_snapshots && crawl.active_snapshots.length > 0) {
@@ -760,7 +858,7 @@
// Queued but retry_at is in future (was claimed by worker, will retry)
warningHtml = `
<div style="padding: 8px 14px; background: rgba(88, 166, 255, 0.1); border-top: 1px solid #58a6ff; color: #58a6ff; font-size: 11px;">
🔄 Retrying in ${crawl.seconds_until_retry || 0}s...${crawl.urls_preview ? ` (${crawl.urls_preview})` : ''}
🔄 Trying in ${crawl.seconds_until_retry || 0}s...${crawl.urls_preview ? ` (${crawl.urls_preview})` : ''}
</div>
`;
} else if (crawl.status === 'queued' && crawl.total_snapshots === 0) {
@@ -784,19 +882,23 @@
return `
<div class="crawl-item" data-crawl-id="${crawl.id || 'unknown'}">
<a class="crawl-header" href="${adminUrl}">
<span class="crawl-icon">${statusIcon}</span>
<div class="crawl-info">
<div class="crawl-label">${crawl.label || '(no label)'}</div>
<div class="crawl-meta">${metaText}</div>
</div>
<div class="crawl-stats">
<span style="color:#3fb950">${crawl.completed_snapshots || 0} done</span>
<span style="color:#d29922">${crawl.started_snapshots || 0} active</span>
<span style="color:#8b949e">${crawl.pending_snapshots || 0} pending</span>
</div>
<span class="status-badge ${crawl.status || 'unknown'}">${crawl.status || 'unknown'}</span>
</a>
<div class="crawl-header">
<a class="crawl-header-link" href="${adminUrl}">
<span class="crawl-icon">${statusIcon}</span>
<div class="crawl-info">
<div class="crawl-label">${crawl.label || '(no label)'}</div>
<div class="crawl-meta">${metaText}</div>
</div>
<div class="crawl-stats">
<span style="color:#3fb950">${crawl.completed_snapshots || 0} done</span>
<span style="color:#d29922">${crawl.started_snapshots || 0} active</span>
<span style="color:#8b949e">${crawl.pending_snapshots || 0} pending</span>
</div>
${crawlPidHtml}
<span class="status-badge ${crawl.status || 'unknown'}">${crawl.status || 'unknown'}</span>
</a>
${cancelBtn}
</div>
<div class="crawl-progress">
<div class="progress-bar-container">
<div class="progress-bar crawl ${crawl.status === 'started' && (crawl.progress || 0) === 0 ? 'indeterminate' : ''}"
@@ -820,11 +922,26 @@
data.crawls_pending > 0 || data.crawls_started > 0 ||
data.snapshots_pending > 0 || data.snapshots_started > 0 ||
data.archiveresults_pending > 0 || data.archiveresults_started > 0;
if (!hasActivity && !isCollapsed) {
setCollapsedState(true);
}
if (hasActivity) {
idleTicks = 0;
if (pollDelayMs !== 1000) {
setPollingDelay(1000);
}
} else {
idleTicks += 1;
if (idleTicks > 5 && pollDelayMs !== 10000) {
setPollingDelay(10000);
}
}
// Update orchestrator status - show "Running" only when there's actual activity
// Don't distinguish between "Stopped" and "Idle" since orchestrator starts/stops frequently
const dot = document.getElementById('orchestrator-dot');
const text = document.getElementById('orchestrator-text');
const pidEl = document.getElementById('orchestrator-pid');
const hasWorkers = data.total_workers > 0;
if (hasWorkers || hasActivity) {
@@ -838,6 +955,14 @@
text.textContent = 'Idle';
}
if (data.orchestrator_pid) {
pidEl.textContent = `pid ${data.orchestrator_pid}`;
pidEl.style.display = 'inline-flex';
} else {
pidEl.textContent = '';
pidEl.style.display = 'none';
}
// Pulse the dot to show we got fresh data
dot.classList.add('flash');
setTimeout(() => dot.classList.remove('flash'), 300);
@@ -909,7 +1034,7 @@
function startPolling() {
if (pollInterval) return;
fetchProgress();
pollInterval = setInterval(fetchProgress, 1000); // Poll every 1 second
pollInterval = setInterval(fetchProgress, pollDelayMs);
}
function stopPolling() {
@@ -919,10 +1044,19 @@
}
}
// Collapse toggle
collapseBtn.addEventListener('click', function() {
isCollapsed = !isCollapsed;
localStorage.setItem('progress-monitor-collapsed', isCollapsed);
// Change the progress-poll interval to `ms` milliseconds. If polling is
// currently active, restart the interval timer so the new delay takes
// effect immediately (used to back off to 10s when idle, 1s when active).
function setPollingDelay(ms) {
pollDelayMs = ms;
if (pollInterval) {
clearInterval(pollInterval);
pollInterval = setInterval(fetchProgress, pollDelayMs);
}
}
function setCollapsedState(collapsed, persist = true) {
isCollapsed = collapsed;
if (persist) {
localStorage.setItem('progress-monitor-collapsed', isCollapsed);
}
if (isCollapsed) {
monitor.classList.add('collapsed');
collapseBtn.textContent = 'Expand';
@@ -930,12 +1064,92 @@
monitor.classList.remove('collapsed');
collapseBtn.textContent = 'Details';
}
}
// Toggle a cancel button between its idle label and a busy state:
// disables the button, applies the .is-busy class, and swaps the label
// for an ellipsis while a cancel request is in flight. No-op when btn
// is null/undefined.
function setCancelButtonState(btn, busy) {
    if (!btn) return;
    const idleLabel = btn.dataset.label || '✕';
    const isBusy = !!busy;
    btn.disabled = isBusy;
    btn.classList.toggle('is-busy', isBusy);
    btn.textContent = isBusy ? '…' : idleLabel;
}
// Cancel a queued/started crawl via the REST API by PATCHing it to
// status "sealed" with retry_at cleared. `btn` is the originating
// cancel button; it is put into a busy state during the request and
// restored on failure. Refreshes the progress view on completion.
function cancelCrawl(crawlId, btn) {
if (!crawlId) return;
if (!getApiKey()) {
// Without an API key the PATCH would be rejected; bail out early.
console.warn('API key unavailable for this session.');
setCancelButtonState(btn, false);
return;
}
setCancelButtonState(btn, true);
fetch(buildApiUrl(`/api/v1/crawls/crawl/${crawlId}`), {
method: 'PATCH',
headers: buildApiHeaders(),
body: JSON.stringify({ status: 'sealed', retry_at: null }),
})
.then(response => response.json())
.then(data => {
if (data.error) {
console.error('Cancel crawl error:', data.error);
}
// Re-fetch regardless of API error so the UI reflects current state;
// the button is re-rendered by the next progress update.
fetchProgress();
})
.catch(error => {
console.error('Cancel crawl failed:', error);
setCancelButtonState(btn, false);
});
}
// Cancel a queued snapshot via the REST API by PATCHing it to status
// "sealed" with retry_at cleared. Mirrors cancelCrawl(): `btn` is put
// into a busy state during the request and restored on failure, and
// the progress view is refreshed on completion.
function cancelSnapshot(snapshotId, btn) {
if (!snapshotId) return;
if (!getApiKey()) {
// Without an API key the PATCH would be rejected; bail out early.
console.warn('API key unavailable for this session.');
setCancelButtonState(btn, false);
return;
}
setCancelButtonState(btn, true);
fetch(buildApiUrl(`/api/v1/core/snapshot/${snapshotId}`), {
method: 'PATCH',
headers: buildApiHeaders(),
body: JSON.stringify({ status: 'sealed', retry_at: null }),
})
.then(response => response.json())
.then(data => {
if (data.error) {
console.error('Cancel snapshot error:', data.error);
}
// Re-fetch regardless of API error so the UI reflects current state.
fetchProgress();
})
.catch(error => {
console.error('Cancel snapshot failed:', error);
setCancelButtonState(btn, false);
});
}
// Collapse toggle
// Flip the monitor's collapsed state and persist it (setCollapsedState
// defaults to persisting in localStorage).
collapseBtn.addEventListener('click', function() {
setCollapsedState(!isCollapsed);
});
// Delegated click handler for per-item cancel buttons inside the crawl
// tree. Stops propagation so the click doesn't follow the surrounding
// crawl/snapshot header link, then dispatches on data-cancel-type.
crawlTree.addEventListener('click', function(event) {
const btn = event.target.closest('.cancel-item-btn');
if (!btn) return;
event.preventDefault();
event.stopPropagation();
const cancelType = btn.dataset.cancelType;
if (cancelType === 'crawl') {
cancelCrawl(btn.dataset.crawlId, btn);
} else if (cancelType === 'snapshot') {
cancelSnapshot(btn.dataset.snapshotId, btn);
}
});
// Apply initial state
if (isCollapsed) {
monitor.classList.add('collapsed');
collapseBtn.textContent = 'Expand';
setCollapsedState(true, false);
}
// Start polling when page loads

View File

@@ -180,7 +180,7 @@
<input type="checkbox" name="_selected_action" value="{{obj.pk}}"/>
</label>
</div>
<a href="/{{obj.archive_path}}/index.html" class="card-thumbnail {% if not obj.thumbnail_url %}missing{% endif %}">
<a href="{% snapshot_base_url obj %}/index.html" class="card-thumbnail {% if not obj.thumbnail_url %}missing{% endif %}">
<img src="{{obj.thumbnail_url|default:'/static/spinner.gif' }}" alt="{{obj.title|default:'Not yet archived...'}}" />
</a>
<div class="card-footer">
@@ -194,10 +194,10 @@
</div>
{% endif %}
<div class="card-title" title="{{obj.title}}">
<a href="/{{obj.archive_path}}/index.html">
<a href="{% snapshot_base_url obj %}/index.html">
<h4>
{% if obj.is_archived %}
<img src="/{{obj.archive_path}}/favicon.ico" onerror="this.style.display='none'" class="link-favicon" decoding="async"/>
<img src="{% snapshot_base_url obj %}/favicon.ico" onerror="this.style.display='none'" class="link-favicon" decoding="async"/>
{% else %}
<img src="{% static 'spinner.gif' %}" onerror="this.style.display='none'" class="link-favicon" decoding="async"/>
{% endif %}

Some files were not shown because too many files have changed in this diff Show More