WIP: checkpoint working tree before rebasing onto dev

This commit is contained in:
Nick Sweeting
2026-03-22 20:23:45 -07:00
parent a6548df8d0
commit f400a2cd67
87 changed files with 12607 additions and 1808 deletions

45
.github/workflows/release-runner.yml vendored Normal file
View File

@@ -0,0 +1,45 @@
name: Release State
on:
push:
branches:
- '**'
workflow_dispatch:
permissions:
contents: write
id-token: write
jobs:
release-state:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
submodules: true
ref: ${{ github.ref_name }}
- uses: actions/setup-python@v5
with:
python-version: "3.13"
- uses: astral-sh/setup-uv@v6
with:
enable-cache: true
- uses: actions/setup-node@v4
with:
node-version: 22
- name: Configure git identity
run: |
git config user.name "github-actions[bot]"
git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
- name: Run release script
env:
DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
GH_TOKEN: ${{ github.token }}
PYPI_PAT_SECRET: ${{ secrets.PYPI_PAT_SECRET }}
run: ./bin/release.sh

View File

@@ -9,7 +9,6 @@ name: Release
# This workflow ensures the correct ordering during a release.
on:
workflow_dispatch:
release:
types: [published]

View File

@@ -6,8 +6,9 @@ from django.views.generic.base import RedirectView
from .v1_api import urls as v1_api_urls
urlpatterns = [
path("", RedirectView.as_view(url='/api/v1')),
path("", RedirectView.as_view(url='/api/v1/docs')),
path("v1/", RedirectView.as_view(url='/api/v1/docs')),
path("v1/", v1_api_urls),
path("v1", RedirectView.as_view(url='/api/v1/docs')),

View File

@@ -6,7 +6,8 @@ from typing import List, Optional, Union, Any, Annotated
from datetime import datetime
from django.db.models import Model, Q
from django.http import HttpRequest
from django.conf import settings
from django.http import HttpRequest, HttpResponse
from django.core.exceptions import ValidationError
from django.contrib.auth import get_user_model
from django.contrib.auth.models import User
@@ -18,6 +19,22 @@ from ninja.pagination import paginate, PaginationBase
from ninja.errors import HttpError
from archivebox.core.models import Snapshot, ArchiveResult, Tag
from archivebox.api.auth import auth_using_token
from archivebox.config.common import SERVER_CONFIG
from archivebox.core.tag_utils import (
build_tag_cards,
delete_tag as delete_tag_record,
export_tag_snapshots_jsonl,
export_tag_urls,
get_matching_tags,
get_or_create_tag,
get_tag_by_ref,
normalize_created_by_filter,
normalize_created_year_filter,
normalize_has_snapshots_filter,
normalize_tag_sort,
rename_tag as rename_tag_record,
)
from archivebox.crawls.models import Crawl
from archivebox.api.v1_crawls import CrawlSchema
@@ -404,7 +421,7 @@ class TagSchema(Schema):
def get_tags(request: HttpRequest):
setattr(request, 'with_snapshots', False)
setattr(request, 'with_archiveresults', False)
return Tag.objects.all().distinct()
return get_matching_tags()
@router.get("/tag/{tag_id}", response=TagSchema, url_name="get_tag")
@@ -412,9 +429,9 @@ def get_tag(request: HttpRequest, tag_id: str, with_snapshots: bool = True):
setattr(request, 'with_snapshots', with_snapshots)
setattr(request, 'with_archiveresults', False)
try:
return Tag.objects.get(id__icontains=tag_id)
return get_tag_by_ref(tag_id)
except (Tag.DoesNotExist, ValidationError):
return Tag.objects.get(slug__icontains=tag_id)
raise HttpError(404, 'Tag not found')
@router.get("/any/{id}", response=Union[SnapshotSchema, ArchiveResultSchema, TagSchema, CrawlSchema], url_name="get_any", summary="Get any object by its ID")
@@ -459,6 +476,55 @@ class TagCreateResponseSchema(Schema):
created: bool
class TagSearchSnapshotSchema(Schema):
id: str
title: str
url: str
favicon_url: str
admin_url: str
archive_url: str
downloaded_at: Optional[str] = None
class TagSearchCardSchema(Schema):
id: int
name: str
slug: str
num_snapshots: int
filter_url: str
edit_url: str
export_urls_url: str
export_jsonl_url: str
rename_url: str
delete_url: str
snapshots: List[TagSearchSnapshotSchema]
class TagSearchResponseSchema(Schema):
tags: List[TagSearchCardSchema]
sort: str
created_by: str
year: str
has_snapshots: str
class TagUpdateSchema(Schema):
name: str
class TagUpdateResponseSchema(Schema):
success: bool
tag_id: int
tag_name: str
slug: str
class TagDeleteResponseSchema(Schema):
success: bool
tag_id: int
deleted_count: int
class TagSnapshotRequestSchema(Schema):
snapshot_id: str
tag_name: Optional[str] = None
@@ -471,41 +537,82 @@ class TagSnapshotResponseSchema(Schema):
tag_name: str
@router.get("/tags/autocomplete/", response=TagAutocompleteSchema, url_name="tags_autocomplete")
@router.get("/tags/search/", response=TagSearchResponseSchema, url_name="search_tags")
def search_tags(
request: HttpRequest,
q: str = "",
sort: str = 'created_desc',
created_by: str = '',
year: str = '',
has_snapshots: str = 'all',
):
"""Return detailed tag cards for admin/live-search UIs."""
normalized_sort = normalize_tag_sort(sort)
normalized_created_by = normalize_created_by_filter(created_by)
normalized_year = normalize_created_year_filter(year)
normalized_has_snapshots = normalize_has_snapshots_filter(has_snapshots)
return {
'tags': build_tag_cards(
query=q,
request=request,
sort=normalized_sort,
created_by=normalized_created_by,
year=normalized_year,
has_snapshots=normalized_has_snapshots,
),
'sort': normalized_sort,
'created_by': normalized_created_by,
'year': normalized_year,
'has_snapshots': normalized_has_snapshots,
}
def _public_tag_listing_enabled() -> bool:
    """Whether anonymous visitors may browse the tag listing.

    An explicitly set ``PUBLIC_SNAPSHOTS_LIST`` setting wins outright;
    otherwise fall back to ``PUBLIC_INDEX`` (Django setting first, then the
    SERVER_CONFIG default).
    """
    explicit = getattr(settings, 'PUBLIC_SNAPSHOTS_LIST', None)
    if explicit is None:
        return bool(getattr(settings, 'PUBLIC_INDEX', SERVER_CONFIG.PUBLIC_INDEX))
    return bool(explicit)
def _request_has_tag_autocomplete_access(request: HttpRequest) -> bool:
    """True when the caller may hit the tag autocomplete endpoint.

    Access is granted to authenticated session users, to callers presenting a
    valid API token (query param, custom header, or Bearer Authorization),
    and — failing both — whenever public tag listing is enabled.
    """
    if getattr(getattr(request, 'user', None), 'is_authenticated', False):
        return True
    # Token may arrive as ?api_key=, X-ArchiveBox-API-Key, or "Bearer <token>".
    token = request.GET.get('api_key') or request.headers.get('X-ArchiveBox-API-Key')
    if not token:
        auth_header = request.headers.get('Authorization', '')
        if auth_header.lower().startswith('bearer '):
            token = auth_header.split(None, 1)[1].strip()
    if token and auth_using_token(token=token, request=request):
        return True
    return _public_tag_listing_enabled()
@router.get("/tags/autocomplete/", response=TagAutocompleteSchema, url_name="tags_autocomplete", auth=None)
def tags_autocomplete(request: HttpRequest, q: str = ""):
"""Return tags matching the query for autocomplete."""
if not q:
# Return all tags if no query (limited to 50)
tags = Tag.objects.all().order_by('name')[:50]
else:
tags = Tag.objects.filter(name__icontains=q).order_by('name')[:20]
if not _request_has_tag_autocomplete_access(request):
raise HttpError(401, 'Authentication required')
tags = get_matching_tags(q)[:50 if not q else 20]
return {
'tags': [{'id': tag.pk, 'name': tag.name, 'slug': tag.slug} for tag in tags]
'tags': [{'id': tag.pk, 'name': tag.name, 'slug': tag.slug, 'num_snapshots': getattr(tag, 'num_snapshots', 0)} for tag in tags]
}
@router.post("/tags/create/", response=TagCreateResponseSchema, url_name="tags_create")
def tags_create(request: HttpRequest, data: TagCreateSchema):
"""Create a new tag or return existing one."""
name = data.name.strip()
if not name:
raise HttpError(400, 'Tag name is required')
tag, created = Tag.objects.get_or_create(
name__iexact=name,
defaults={
'name': name,
'created_by': request.user if request.user.is_authenticated else None,
}
)
# If found by case-insensitive match, use that tag
if not created:
existing_tag = Tag.objects.filter(name__iexact=name).first()
if existing_tag is None:
raise HttpError(500, 'Failed to load existing tag after get_or_create')
tag = existing_tag
try:
tag, created = get_or_create_tag(
data.name,
created_by=request.user if request.user.is_authenticated else None,
)
except ValueError as err:
raise HttpError(400, str(err)) from err
return {
'success': True,
@@ -515,6 +622,62 @@ def tags_create(request: HttpRequest, data: TagCreateSchema):
}
@router.post("/tag/{tag_id}/rename", response=TagUpdateResponseSchema, url_name="rename_tag")
def rename_tag(request: HttpRequest, tag_id: int, data: TagUpdateSchema):
try:
tag = rename_tag_record(get_tag_by_ref(tag_id), data.name)
except Tag.DoesNotExist as err:
raise HttpError(404, 'Tag not found') from err
except ValueError as err:
raise HttpError(400, str(err)) from err
return {
'success': True,
'tag_id': tag.pk,
'tag_name': tag.name,
'slug': tag.slug,
}
@router.delete("/tag/{tag_id}", response=TagDeleteResponseSchema, url_name="delete_tag")
def delete_tag(request: HttpRequest, tag_id: int):
try:
tag = get_tag_by_ref(tag_id)
except Tag.DoesNotExist as err:
raise HttpError(404, 'Tag not found') from err
deleted_count, _ = delete_tag_record(tag)
return {
'success': True,
'tag_id': int(tag_id),
'deleted_count': deleted_count,
}
@router.get("/tag/{tag_id}/urls.txt", url_name="tag_urls_export")
def tag_urls_export(request: HttpRequest, tag_id: int):
try:
tag = get_tag_by_ref(tag_id)
except Tag.DoesNotExist as err:
raise HttpError(404, 'Tag not found') from err
response = HttpResponse(export_tag_urls(tag), content_type='text/plain; charset=utf-8')
response['Content-Disposition'] = f'attachment; filename="tag-{tag.slug}-urls.txt"'
return response
@router.get("/tag/{tag_id}/snapshots.jsonl", url_name="tag_snapshots_export")
def tag_snapshots_export(request: HttpRequest, tag_id: int):
try:
tag = get_tag_by_ref(tag_id)
except Tag.DoesNotExist as err:
raise HttpError(404, 'Tag not found') from err
response = HttpResponse(export_tag_snapshots_jsonl(tag), content_type='application/x-ndjson; charset=utf-8')
response['Content-Disposition'] = f'attachment; filename="tag-{tag.slug}-snapshots.jsonl"'
return response
@router.post("/tags/add-to-snapshot/", response=TagSnapshotResponseSchema, url_name="tags_add_to_snapshot")
def tags_add_to_snapshot(request: HttpRequest, data: TagSnapshotRequestSchema):
"""Add a tag to a snapshot. Creates the tag if it doesn't exist."""
@@ -534,24 +697,16 @@ def tags_add_to_snapshot(request: HttpRequest, data: TagSnapshotRequestSchema):
# Get or create the tag
if data.tag_name:
name = data.tag_name.strip()
if not name:
raise HttpError(400, 'Tag name is required')
tag, _ = Tag.objects.get_or_create(
name__iexact=name,
defaults={
'name': name,
'created_by': request.user if request.user.is_authenticated else None,
}
)
# If found by case-insensitive match, use that tag
existing_tag = Tag.objects.filter(name__iexact=name).first()
if existing_tag is not None:
tag = existing_tag
try:
tag, _ = get_or_create_tag(
data.tag_name,
created_by=request.user if request.user.is_authenticated else None,
)
except ValueError as err:
raise HttpError(400, str(err)) from err
elif data.tag_id:
try:
tag = Tag.objects.get(pk=data.tag_id)
tag = get_tag_by_ref(data.tag_id)
except Tag.DoesNotExist:
raise HttpError(404, 'Tag not found')
else:

View File

@@ -4,7 +4,7 @@ __package__ = 'archivebox.base_models'
import json
from collections.abc import Mapping
from typing import TypedDict
from typing import NotRequired, TypedDict
from django import forms
from django.contrib import admin
@@ -17,9 +17,13 @@ from django_object_actions import DjangoObjectActions
class ConfigOption(TypedDict):
plugin: str
type: str
type: str | list[str]
default: object
description: str
enum: NotRequired[list[object]]
pattern: NotRequired[str]
minimum: NotRequired[int | float]
maximum: NotRequired[int | float]
class KeyValueWidget(forms.Widget):
@@ -44,12 +48,16 @@ class KeyValueWidget(forms.Widget):
options: dict[str, ConfigOption] = {}
for plugin_name, schema in plugin_configs.items():
for key, prop in schema.get('properties', {}).items():
options[key] = {
option: ConfigOption = {
'plugin': plugin_name,
'type': prop.get('type', 'string'),
'default': prop.get('default', ''),
'description': prop.get('description', ''),
}
for schema_key in ('enum', 'pattern', 'minimum', 'maximum'):
if schema_key in prop:
option[schema_key] = prop[schema_key]
options[key] = option
return options
except Exception:
return {}
@@ -98,14 +106,12 @@ class KeyValueWidget(forms.Widget):
'''
# Render existing key-value pairs
row_idx = 0
for key, val in data.items():
val_str = json.dumps(val) if not isinstance(val, str) else val
html += self._render_row(widget_id, row_idx, key, val_str)
row_idx += 1
html += self._render_row(widget_id, key, val_str)
# Always add one empty row for new entries
html += self._render_row(widget_id, row_idx, '', '')
html += self._render_row(widget_id, '', '')
html += f'''
</div>
@@ -114,22 +120,450 @@ class KeyValueWidget(forms.Widget):
style="padding: 4px 12px; cursor: pointer; background: #417690; color: white; border: none; border-radius: 4px;">
+ Add Row
</button>
<span id="{widget_id}_hint" style="font-size: 11px; color: #666; font-style: italic;"></span>
</div>
<input type="hidden" name="{name}" id="{widget_id}" value="">
<script>
(function() {{
var configMeta_{widget_id} = {config_meta_json};
var rowCounter_{widget_id} = 0;
function showKeyHint_{widget_id}(key) {{
var hint = document.getElementById('{widget_id}_hint');
var meta = configMeta_{widget_id}[key];
if (meta) {{
hint.innerHTML = '<b>' + key + '</b>: ' + (meta.description || meta.type) +
(meta.default !== '' ? ' <span style="color:#888">(default: ' + meta.default + ')</span>' : '');
}} else {{
hint.textContent = key ? 'Custom key: ' + key : '';
function stringifyValue_{widget_id}(value) {{
return typeof value === 'string' ? value : JSON.stringify(value);
}}
function getTypes_{widget_id}(meta) {{
if (!meta || meta.type === undefined || meta.type === null) {{
return [];
}}
return Array.isArray(meta.type) ? meta.type : [meta.type];
}}
function getMetaForKey_{widget_id}(key) {{
if (!key) {{
return null;
}}
var explicitMeta = configMeta_{widget_id}[key];
if (explicitMeta) {{
return Object.assign({{ key: key }}, explicitMeta);
}}
if (key.endsWith('_BINARY')) {{
return {{
key: key,
plugin: 'custom',
type: 'string',
default: '',
description: 'Path to binary executable',
}};
}}
if (isRegexConfigKey_{widget_id}(key)) {{
return {{
key: key,
plugin: 'custom',
type: 'string',
default: '',
description: 'Regex pattern list',
}};
}}
return null;
}}
function describeMeta_{widget_id}(meta) {{
if (!meta) {{
return '';
}}
var details = '';
if (Array.isArray(meta.enum) && meta.enum.length) {{
details = 'Allowed: ' + meta.enum.map(stringifyValue_{widget_id}).join(', ');
}} else {{
var types = getTypes_{widget_id}(meta);
if (types.length) {{
details = 'Expected: ' + types.join(' or ');
}}
}}
if (meta.minimum !== undefined || meta.maximum !== undefined) {{
var bounds = [];
if (meta.minimum !== undefined) bounds.push('min ' + meta.minimum);
if (meta.maximum !== undefined) bounds.push('max ' + meta.maximum);
details += (details ? ' ' : '') + '(' + bounds.join(', ') + ')';
}}
return [meta.description || '', details].filter(Boolean).join(' ');
}}
function getExampleInput_{widget_id}(key, meta) {{
var types = getTypes_{widget_id}(meta);
if (key.endsWith('_BINARY')) {{
return 'Example: wget or /usr/bin/wget';
}}
if (key.endsWith('_ARGS_EXTRA') || key.endsWith('_ARGS')) {{
return 'Example: ["--extra-arg"]';
}}
if (types.includes('array')) {{
return 'Example: ["value"]';
}}
if (types.includes('object')) {{
if (key === 'SAVE_ALLOWLIST' || key === 'SAVE_DENYLIST') {{
return 'Example: {{"^https://example\\\\.com": ["wget"]}}';
}}
return 'Example: {{"key": "value"}}';
}}
return '';
}}
function isRegexConfigKey_{widget_id}(key) {{
return key === 'URL_ALLOWLIST' ||
key === 'URL_DENYLIST' ||
key === 'SAVE_ALLOWLIST' ||
key === 'SAVE_DENYLIST' ||
key.endsWith('_PATTERN') ||
key.includes('REGEX');
}}
function isSimpleFilterPattern_{widget_id}(pattern) {{
return /^[\\w.*:-]+$/.test(pattern);
}}
function validateRegexPattern_{widget_id}(pattern) {{
if (!pattern || isSimpleFilterPattern_{widget_id}(pattern)) {{
return '';
}}
try {{
new RegExp(pattern);
}} catch (error) {{
return error && error.message ? error.message : 'Invalid regex';
}}
return '';
}}
function validateRegexConfig_{widget_id}(key, raw, typeName) {{
if (typeName === 'object') {{
var parsed;
try {{
parsed = JSON.parse(raw);
}} catch (error) {{
return {{ ok: false, value: raw, message: 'Must be valid JSON' }};
}}
if (!parsed || typeof parsed !== 'object' || Array.isArray(parsed)) {{
return {{ ok: false, value: parsed, message: 'Must be a JSON object' }};
}}
for (var regexKey in parsed) {{
var objectRegexError = validateRegexPattern_{widget_id}(regexKey);
if (objectRegexError) {{
return {{ ok: false, value: parsed, message: 'Invalid regex key "' + regexKey + '": ' + objectRegexError }};
}}
}}
return {{ ok: true, value: parsed, message: '' }};
}}
var patterns = raw.split(/[\\n,]+/).map(function(pattern) {{
return pattern.trim();
}}).filter(Boolean);
for (var i = 0; i < patterns.length; i++) {{
var regexError = validateRegexPattern_{widget_id}(patterns[i]);
if (regexError) {{
return {{ ok: false, value: raw, message: 'Invalid regex "' + patterns[i] + '": ' + regexError }};
}}
}}
return {{ ok: true, value: raw, message: '' }};
}}
function validateBinaryValue_{widget_id}(raw) {{
if (!raw) {{
return {{ ok: true, value: raw, message: '' }};
}}
if (/['"`]/.test(raw)) {{
return {{ ok: false, value: raw, message: 'Binary paths cannot contain quotes' }};
}}
if (/[;&|<>$(){{}}\\[\\]!]/.test(raw)) {{
return {{ ok: false, value: raw, message: 'Binary paths can only be a binary name or absolute path' }};
}}
if (raw.startsWith('/')) {{
if (/^[A-Za-z0-9_./+\\- ]+$/.test(raw)) {{
return {{ ok: true, value: raw, message: '' }};
}}
return {{ ok: false, value: raw, message: 'Absolute paths may only contain path-safe characters' }};
}}
if (/^[A-Za-z0-9_.+-]+$/.test(raw)) {{
return {{ ok: true, value: raw, message: '' }};
}}
return {{ ok: false, value: raw, message: 'Enter a binary name like wget or an absolute path like /usr/bin/wget' }};
}}
function parseValue_{widget_id}(raw) {{
try {{
if (raw === 'true') return true;
if (raw === 'false') return false;
if (raw === 'null') return null;
if (raw !== '' && !isNaN(raw)) return Number(raw);
if ((raw.startsWith('{{') && raw.endsWith('}}')) ||
(raw.startsWith('[') && raw.endsWith(']')) ||
(raw.startsWith('"') && raw.endsWith('"'))) {{
return JSON.parse(raw);
}}
}} catch (error) {{
return raw;
}}
return raw;
}}
function sameValue_{widget_id}(left, right) {{
return left === right || JSON.stringify(left) === JSON.stringify(right);
}}
function parseTypedValue_{widget_id}(raw, typeName, meta) {{
var numberValue;
var parsed;
if (typeName && meta && meta.key && isRegexConfigKey_{widget_id}(meta.key)) {{
return validateRegexConfig_{widget_id}(meta.key, raw, typeName);
}}
if (typeName === 'string' && meta && meta.key && meta.key.endsWith('_BINARY')) {{
return validateBinaryValue_{widget_id}(raw);
}}
if (typeName === 'string') {{
if (meta.pattern) {{
try {{
if (!(new RegExp(meta.pattern)).test(raw)) {{
return {{ ok: false, value: raw, message: 'Must match pattern ' + meta.pattern }};
}}
}} catch (error) {{}}
}}
return {{ ok: true, value: raw, message: '' }};
}}
if (typeName === 'integer') {{
if (!/^-?\\d+$/.test(raw)) {{
return {{ ok: false, value: raw, message: 'Must be an integer' }};
}}
numberValue = Number(raw);
if (meta.minimum !== undefined && numberValue < meta.minimum) {{
return {{ ok: false, value: numberValue, message: 'Must be at least ' + meta.minimum }};
}}
if (meta.maximum !== undefined && numberValue > meta.maximum) {{
return {{ ok: false, value: numberValue, message: 'Must be at most ' + meta.maximum }};
}}
return {{ ok: true, value: numberValue, message: '' }};
}}
if (typeName === 'number') {{
if (raw === '' || isNaN(raw)) {{
return {{ ok: false, value: raw, message: 'Must be a number' }};
}}
numberValue = Number(raw);
if (meta.minimum !== undefined && numberValue < meta.minimum) {{
return {{ ok: false, value: numberValue, message: 'Must be at least ' + meta.minimum }};
}}
if (meta.maximum !== undefined && numberValue > meta.maximum) {{
return {{ ok: false, value: numberValue, message: 'Must be at most ' + meta.maximum }};
}}
return {{ ok: true, value: numberValue, message: '' }};
}}
if (typeName === 'boolean') {{
var lowered = raw.toLowerCase();
if (lowered === 'true' || raw === '1') return {{ ok: true, value: true, message: '' }};
if (lowered === 'false' || raw === '0') return {{ ok: true, value: false, message: '' }};
return {{ ok: false, value: raw, message: 'Must be true or false' }};
}}
if (typeName === 'null') {{
return raw === 'null'
? {{ ok: true, value: null, message: '' }}
: {{ ok: false, value: raw, message: 'Must be null' }};
}}
if (typeName === 'array' || typeName === 'object') {{
try {{
parsed = JSON.parse(raw);
}} catch (error) {{
return {{ ok: false, value: raw, message: 'Must be valid JSON' }};
}}
if (typeName === 'array' && Array.isArray(parsed)) {{
return {{ ok: true, value: parsed, message: '' }};
}}
if (typeName === 'object' && parsed && typeof parsed === 'object' && !Array.isArray(parsed)) {{
return {{ ok: true, value: parsed, message: '' }};
}}
return {{
ok: false,
value: parsed,
message: typeName === 'array' ? 'Must be a JSON array' : 'Must be a JSON object',
}};
}}
return {{ ok: true, value: parseValue_{widget_id}(raw), message: '' }};
}}
function validateValueAgainstMeta_{widget_id}(raw, meta) {{
if (!meta || raw === '') {{
return {{ state: 'neutral', value: raw, message: '' }};
}}
var enumValues = Array.isArray(meta.enum) ? meta.enum : [];
var types = getTypes_{widget_id}(meta);
if (!types.length) {{
types = ['string'];
}}
var error = 'Invalid value';
for (var i = 0; i < types.length; i++) {{
var candidate = parseTypedValue_{widget_id}(raw, types[i], meta);
if (!candidate.ok) {{
error = candidate.message || error;
continue;
}}
if (enumValues.length && !enumValues.some(function(enumValue) {{
return sameValue_{widget_id}(enumValue, candidate.value) || stringifyValue_{widget_id}(enumValue) === raw;
}})) {{
error = 'Must be one of: ' + enumValues.map(stringifyValue_{widget_id}).join(', ');
continue;
}}
return {{ state: 'valid', value: candidate.value, message: '' }};
}}
return {{ state: 'invalid', value: raw, message: error }};
}}
function ensureRowId_{widget_id}(row) {{
if (!row.dataset.rowId) {{
row.dataset.rowId = String(rowCounter_{widget_id}++);
}}
return row.dataset.rowId;
}}
function setRowHelp_{widget_id}(row) {{
var keyInput = row.querySelector('.kv-key');
var help = row.querySelector('.kv-help');
if (!keyInput || !help) {{
return;
}}
var key = keyInput.value.trim();
if (!key) {{
help.textContent = '';
return;
}}
var meta = getMetaForKey_{widget_id}(key);
if (meta) {{
var extra = isRegexConfigKey_{widget_id}(key)
? ((meta.type === 'object' || (Array.isArray(meta.type) && meta.type.includes('object')))
? ' Expected: JSON object with regex keys.'
: ' Expected: valid regex.')
: '';
var example = getExampleInput_{widget_id}(key, meta);
help.textContent = [describeMeta_{widget_id}(meta) + extra, example].filter(Boolean).join(' ');
}} else {{
help.textContent = 'Custom key';
}}
}}
function configureValueInput_{widget_id}(row) {{
var keyInput = row.querySelector('.kv-key');
var valueInput = row.querySelector('.kv-value');
var datalist = row.querySelector('.kv-value-options');
if (!keyInput || !valueInput || !datalist) {{
return;
}}
var rowId = ensureRowId_{widget_id}(row);
datalist.id = '{widget_id}_value_options_' + rowId;
var meta = getMetaForKey_{widget_id}(keyInput.value.trim());
var enumValues = Array.isArray(meta && meta.enum) ? meta.enum : [];
var types = getTypes_{widget_id}(meta);
if (!enumValues.length && types.includes('boolean')) {{
enumValues = ['True', 'False'];
}}
if (enumValues.length) {{
datalist.innerHTML = enumValues.map(function(enumValue) {{
return '<option value="' + stringifyValue_{widget_id}(enumValue).replace(/"/g, '&quot;') + '"></option>';
}}).join('');
valueInput.setAttribute('list', datalist.id);
}} else {{
datalist.innerHTML = '';
valueInput.removeAttribute('list');
}}
}}
function setValueValidationState_{widget_id}(input, state, message) {{
if (!input) {{
return;
}}
if (state === 'valid') {{
input.style.borderColor = '#2da44e';
input.style.boxShadow = '0 0 0 1px rgba(45, 164, 78, 0.18)';
input.style.backgroundColor = '#f6ffed';
}} else if (state === 'invalid') {{
input.style.borderColor = '#cf222e';
input.style.boxShadow = '0 0 0 1px rgba(207, 34, 46, 0.18)';
input.style.backgroundColor = '#fff8f8';
}} else {{
input.style.borderColor = '#ccc';
input.style.boxShadow = 'none';
input.style.backgroundColor = '';
}}
input.title = message || '';
}}
function applyValueValidation_{widget_id}(row) {{
var keyInput = row.querySelector('.kv-key');
var valueInput = row.querySelector('.kv-value');
if (!keyInput || !valueInput) {{
return;
}}
var key = keyInput.value.trim();
if (!key) {{
setValueValidationState_{widget_id}(valueInput, 'neutral', '');
return;
}}
var meta = getMetaForKey_{widget_id}(key);
if (!meta) {{
setValueValidationState_{widget_id}(valueInput, 'neutral', '');
return;
}}
var validation = validateValueAgainstMeta_{widget_id}(valueInput.value.trim(), meta);
setValueValidationState_{widget_id}(valueInput, validation.state, validation.message);
}}
function coerceValueForStorage_{widget_id}(key, raw) {{
var meta = getMetaForKey_{widget_id}(key);
if (!meta) {{
return parseValue_{widget_id}(raw);
}}
var validation = validateValueAgainstMeta_{widget_id}(raw, meta);
return validation.state === 'valid' ? validation.value : raw;
}}
function initializeRows_{widget_id}() {{
var container = document.getElementById('{widget_id}_rows');
container.querySelectorAll('.key-value-row').forEach(function(row) {{
ensureRowId_{widget_id}(row);
configureValueInput_{widget_id}(row);
setRowHelp_{widget_id}(row);
applyValueValidation_{widget_id}(row);
}});
}}
function updateHiddenField_{widget_id}() {{
@@ -142,20 +576,7 @@ class KeyValueWidget(forms.Widget):
if (keyInput && valInput && keyInput.value.trim()) {{
var key = keyInput.value.trim();
var val = valInput.value.trim();
// Try to parse as JSON (for booleans, numbers, etc)
try {{
if (val === 'true') result[key] = true;
else if (val === 'false') result[key] = false;
else if (val === 'null') result[key] = null;
else if (!isNaN(val) && val !== '') result[key] = Number(val);
else if ((val.startsWith('{{') && val.endsWith('}}')) ||
(val.startsWith('[') && val.endsWith(']')) ||
(val.startsWith('"') && val.endsWith('"')))
result[key] = JSON.parse(val);
else result[key] = val;
}} catch(e) {{
result[key] = val;
}}
result[key] = coerceValueForStorage_{widget_id}(key, val);
}}
}});
document.getElementById('{widget_id}').value = JSON.stringify(result);
@@ -163,60 +584,85 @@ class KeyValueWidget(forms.Widget):
window.addKeyValueRow_{widget_id} = function() {{
var container = document.getElementById('{widget_id}_rows');
var rows = container.querySelectorAll('.key-value-row');
var newIdx = rows.length;
var newRow = document.createElement('div');
newRow.className = 'key-value-row';
newRow.style.cssText = 'display: flex; gap: 8px; margin-bottom: 6px; align-items: center;';
newRow.innerHTML = '<input type="text" class="kv-key" placeholder="KEY" list="{widget_id}_keys" ' +
'style="flex: 1; padding: 6px 8px; border: 1px solid #ccc; border-radius: 4px; font-family: monospace; font-size: 12px;" ' +
'onchange="updateHiddenField_{widget_id}()" oninput="updateHiddenField_{widget_id}(); showKeyHint_{widget_id}(this.value)" onfocus="showKeyHint_{widget_id}(this.value)">' +
newRow.style.cssText = 'margin-bottom: 6px;';
newRow.innerHTML = '<div style="display: flex; gap: 8px; align-items: center;">' +
'<input type="text" class="kv-key" placeholder="KEY" list="{widget_id}_keys" ' +
'style="flex: 1; padding: 6px 8px; border: 1px solid #ccc; border-radius: 4px; font-family: monospace; font-size: 12px;">' +
'<input type="text" class="kv-value" placeholder="value" ' +
'style="flex: 2; padding: 6px 8px; border: 1px solid #ccc; border-radius: 4px; font-family: monospace; font-size: 12px;" ' +
'onchange="updateHiddenField_{widget_id}()" oninput="updateHiddenField_{widget_id}()">' +
'style="flex: 2; padding: 6px 8px; border: 1px solid #ccc; border-radius: 4px; font-family: monospace; font-size: 12px;">' +
'<datalist class="kv-value-options"></datalist>' +
'<button type="button" onclick="removeKeyValueRow_{widget_id}(this)" ' +
'style="padding: 4px 10px; cursor: pointer; background: #ba2121; color: white; border: none; border-radius: 4px; font-weight: bold;"></button>';
'style="padding: 4px 10px; cursor: pointer; background: #ba2121; color: white; border: none; border-radius: 4px; font-weight: bold;"></button>' +
'</div>' +
'<div class="kv-help" style="margin-top: 4px; font-size: 11px; color: #666; font-style: italic;"></div>';
container.appendChild(newRow);
ensureRowId_{widget_id}(newRow);
configureValueInput_{widget_id}(newRow);
setRowHelp_{widget_id}(newRow);
applyValueValidation_{widget_id}(newRow);
updateHiddenField_{widget_id}();
newRow.querySelector('.kv-key').focus();
}};
window.removeKeyValueRow_{widget_id} = function(btn) {{
var row = btn.parentElement;
var row = btn.closest('.key-value-row');
row.remove();
updateHiddenField_{widget_id}();
}};
window.showKeyHint_{widget_id} = showKeyHint_{widget_id};
window.updateHiddenField_{widget_id} = updateHiddenField_{widget_id};
// Initialize on load
document.addEventListener('DOMContentLoaded', function() {{
initializeRows_{widget_id}();
updateHiddenField_{widget_id}();
}});
// Also run immediately in case DOM is already ready
if (document.readyState !== 'loading') {{
initializeRows_{widget_id}();
updateHiddenField_{widget_id}();
}}
// Update on any input change
document.getElementById('{widget_id}_rows').addEventListener('input', updateHiddenField_{widget_id});
var rowsEl_{widget_id} = document.getElementById('{widget_id}_rows');
rowsEl_{widget_id}.addEventListener('input', function(event) {{
var row = event.target.closest('.key-value-row');
if (!row) {{
return;
}}
if (event.target.classList.contains('kv-key')) {{
configureValueInput_{widget_id}(row);
setRowHelp_{widget_id}(row);
}}
if (event.target.classList.contains('kv-key') || event.target.classList.contains('kv-value')) {{
applyValueValidation_{widget_id}(row);
updateHiddenField_{widget_id}();
}}
}});
}})();
</script>
</div>
'''
return mark_safe(html)
def _render_row(self, widget_id: str, idx: int, key: str, value: str) -> str:
def _render_row(self, widget_id: str, key: str, value: str) -> str:
return f'''
<div class="key-value-row" style="display: flex; gap: 8px; margin-bottom: 6px; align-items: center;">
<input type="text" class="kv-key" value="{self._escape(key)}" placeholder="KEY" list="{widget_id}_keys"
style="flex: 1; padding: 6px 8px; border: 1px solid #ccc; border-radius: 4px; font-family: monospace; font-size: 12px;"
onchange="updateHiddenField_{widget_id}()" oninput="updateHiddenField_{widget_id}(); showKeyHint_{widget_id}(this.value)" onfocus="showKeyHint_{widget_id}(this.value)">
<input type="text" class="kv-value" value="{self._escape(value)}" placeholder="value"
style="flex: 2; padding: 6px 8px; border: 1px solid #ccc; border-radius: 4px; font-family: monospace; font-size: 12px;"
onchange="updateHiddenField_{widget_id}()" oninput="updateHiddenField_{widget_id}()">
<button type="button" onclick="removeKeyValueRow_{widget_id}(this)"
style="padding: 4px 10px; cursor: pointer; background: #ba2121; color: white; border: none; border-radius: 4px; font-weight: bold;"></button>
<div class="key-value-row" style="margin-bottom: 6px;">
<div style="display: flex; gap: 8px; align-items: center;">
<input type="text" class="kv-key" value="{self._escape(key)}" placeholder="KEY" list="{widget_id}_keys"
style="flex: 1; padding: 6px 8px; border: 1px solid #ccc; border-radius: 4px; font-family: monospace; font-size: 12px;">
<input type="text" class="kv-value" value="{self._escape(value)}" placeholder="value"
style="flex: 2; padding: 6px 8px; border: 1px solid #ccc; border-radius: 4px; font-family: monospace; font-size: 12px;">
<datalist class="kv-value-options"></datalist>
<button type="button" onclick="removeKeyValueRow_{widget_id}(this)"
style="padding: 4px 10px; cursor: pointer; background: #ba2121; color: white; border: none; border-radius: 4px; font-weight: bold;"></button>
</div>
<div class="kv-help" style="margin-top: 4px; font-size: 11px; color: #666; font-style: italic;"></div>
</div>
'''

View File

@@ -47,11 +47,13 @@ def _collect_input_urls(args: tuple[str, ...]) -> list[str]:
def add(urls: str | list[str],
depth: int | str=0,
tag: str='',
url_allowlist: str='',
url_denylist: str='',
parser: str="auto",
plugins: str="",
persona: str='Default',
overwrite: bool=False,
update: bool=not ARCHIVING_CONFIG.ONLY_NEW,
update: bool | None=None,
index_only: bool=False,
bg: bool=False,
created_by_id: int | None=None) -> tuple['Crawl', QuerySet['Snapshot']]:
@@ -85,6 +87,8 @@ def add(urls: str | list[str],
created_by_id = created_by_id or get_or_create_system_user_pk()
started_at = timezone.now()
if update is None:
update = not ARCHIVING_CONFIG.ONLY_NEW
# 1. Save the provided URLs to sources/2024-11-05__23-59-59__cli_add.txt
sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__cli_add.txt'
@@ -120,6 +124,8 @@ def add(urls: str | list[str],
'PLUGINS': plugins,
'DEFAULT_PERSONA': persona_name,
'PARSER': parser,
**({'URL_ALLOWLIST': url_allowlist} if url_allowlist else {}),
**({'URL_DENYLIST': url_denylist} if url_denylist else {}),
}
)
@@ -150,6 +156,9 @@ def add(urls: str | list[str],
snapshot.ensure_crawl_symlink()
return crawl, crawl.snapshot_set.all()
if bg:
crawl.create_snapshots_from_urls()
# 5. Start the crawl runner to process the queue
# The runner will:
# - Process Crawl -> create Snapshots from all URLs
@@ -192,8 +201,7 @@ def add(urls: str | list[str],
except Exception:
rel_output_str = str(crawl.output_dir)
# Build admin URL from SERVER_CONFIG
bind_addr = SERVER_CONFIG.BIND_ADDR
bind_addr = SERVER_CONFIG.BIND_ADDR or '127.0.0.1:8000'
if bind_addr.startswith('http://') or bind_addr.startswith('https://'):
base_url = bind_addr
else:
@@ -218,11 +226,13 @@ def add(urls: str | list[str],
@click.command()
@click.option('--depth', '-d', type=click.Choice([str(i) for i in range(5)]), default='0', help='Recursively archive linked pages up to N hops away')
@click.option('--tag', '-t', default='', help='Comma-separated list of tags to add to each snapshot e.g. tag1,tag2,tag3')
@click.option('--url-allowlist', '--domain-allowlist', default='', help='Comma-separated URL/domain allowlist for this crawl')
@click.option('--url-denylist', '--domain-denylist', default='', help='Comma-separated URL/domain denylist for this crawl')
@click.option('--parser', default='auto', help='Parser for reading input URLs (auto, txt, html, rss, json, jsonl, netscape, ...)')
@click.option('--plugins', '-p', default='', help='Comma-separated list of plugins to run e.g. title,favicon,screenshot,singlefile,...')
@click.option('--persona', default='Default', help='Authentication profile to use when archiving')
@click.option('--overwrite', '-F', is_flag=True, help='Overwrite existing data if URLs have been archived previously')
@click.option('--update', is_flag=True, default=ARCHIVING_CONFIG.ONLY_NEW, help='Retry any previously skipped/failed URLs when re-adding them')
@click.option('--update', is_flag=True, default=None, help='Retry any previously skipped/failed URLs when re-adding them')
@click.option('--index-only', is_flag=True, help='Just add the URLs to the index without archiving them now')
@click.option('--bg', is_flag=True, help='Run archiving in background (queue work and return immediately)')
@click.argument('urls', nargs=-1, type=click.Path())

View File

@@ -42,6 +42,16 @@ from rich import print as rprint
from archivebox.cli.cli_utils import apply_filters
def build_archiveresult_request(snapshot_id: str, plugin: str, hook_name: str = '', status: str = 'queued') -> dict:
return {
'type': 'ArchiveResult',
'snapshot_id': str(snapshot_id),
'plugin': plugin,
'hook_name': hook_name,
'status': status,
}
# =============================================================================
# CREATE
# =============================================================================
@@ -52,21 +62,21 @@ def create_archiveresults(
status: str = 'queued',
) -> int:
"""
Create ArchiveResults for Snapshots.
Create ArchiveResult request records for Snapshots.
Reads Snapshot records from stdin and creates ArchiveResult entries.
Reads Snapshot records from stdin and emits ArchiveResult request JSONL.
Pass-through: Non-Snapshot/ArchiveResult records are output unchanged.
If --plugin is specified, only creates results for that plugin.
Otherwise, creates results for all pending plugins.
If --plugin is specified, only emits requests for that plugin.
Otherwise, emits requests for all enabled snapshot hooks.
Exit codes:
0: Success
1: Failure
"""
from django.utils import timezone
from archivebox.config.configset import get_config
from archivebox.hooks import discover_hooks
from archivebox.misc.jsonl import read_stdin, write_record, TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
from archivebox.core.models import Snapshot, ArchiveResult
from archivebox.core.models import Snapshot
is_tty = sys.stdout.isatty()
@@ -135,33 +145,20 @@ def create_archiveresults(
created_count = 0
for snapshot in snapshots:
if plugin:
# Create for specific plugin only
result, created = ArchiveResult.objects.get_or_create(
snapshot=snapshot,
plugin=plugin,
defaults={
'status': status,
'retry_at': timezone.now(),
}
)
if not created and result.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED]:
# Reset for retry
result.status = status
result.retry_at = timezone.now()
result.save()
if not is_tty:
write_record(result.to_json())
write_record(build_archiveresult_request(snapshot.id, plugin, status=status))
created_count += 1
else:
# Create all pending plugins
snapshot.create_pending_archiveresults()
for result in snapshot.archiveresult_set.filter(status=ArchiveResult.StatusChoices.QUEUED):
config = get_config(crawl=snapshot.crawl, snapshot=snapshot)
hooks = discover_hooks('Snapshot', config=config)
for hook_path in hooks:
hook_name = hook_path.name
plugin_name = hook_path.parent.name
if not is_tty:
write_record(result.to_json())
write_record(build_archiveresult_request(snapshot.id, plugin_name, hook_name=hook_name, status=status))
created_count += 1
rprint(f'[green]Created/queued {created_count} archive results[/green]', file=sys.stderr)
rprint(f'[green]Created {created_count} archive result request records[/green]', file=sys.stderr)
return 0
@@ -205,6 +202,7 @@ def list_archiveresults(
'succeeded': 'green',
'failed': 'red',
'skipped': 'dim',
'noresults': 'dim',
'backoff': 'magenta',
}.get(result.status, 'dim')
rprint(f'[{status_color}]{result.status:10}[/{status_color}] {result.plugin:15} [dim]{result.id}[/dim] {result.snapshot.url[:40]}')
@@ -233,8 +231,6 @@ def update_archiveresults(
0: Success
1: No input or error
"""
from django.utils import timezone
from archivebox.misc.jsonl import read_stdin, write_record
from archivebox.core.models import ArchiveResult
@@ -257,7 +253,6 @@ def update_archiveresults(
# Apply updates from CLI flags
if status:
result.status = status
result.retry_at = timezone.now()
result.save()
updated_count += 1

View File

@@ -38,15 +38,16 @@ import rich_click as click
def process_archiveresult_by_id(archiveresult_id: str) -> int:
"""
Run extraction for a single ArchiveResult by ID (used by workers).
Re-run extraction for a single ArchiveResult by ID.
Triggers the ArchiveResult's state machine tick() to run the extractor
plugin, but only after claiming ownership via retry_at. This keeps direct
CLI execution aligned with the worker lifecycle and prevents duplicate hook
runs if another process already owns the same ArchiveResult.
ArchiveResults are projected status rows, not queued work items. Re-running
a single result means resetting that row and queueing its parent snapshot
through the shared crawl runner with the corresponding plugin selected.
"""
from rich import print as rprint
from django.utils import timezone
from archivebox.core.models import ArchiveResult
from archivebox.services.runner import run_crawl
try:
archiveresult = ArchiveResult.objects.get(id=archiveresult_id)
@@ -57,16 +58,27 @@ def process_archiveresult_by_id(archiveresult_id: str) -> int:
rprint(f'[blue]Extracting {archiveresult.plugin} for {archiveresult.snapshot.url}[/blue]', file=sys.stderr)
try:
# Claim-before-tick is the required calling pattern for direct
# state-machine drivers. If another worker already owns this row,
# report that and exit without running duplicate extractor side effects.
if not archiveresult.tick_claimed(lock_seconds=120):
print(f'[yellow]Extraction already claimed by another process: {archiveresult.plugin}[/yellow]')
return 0
archiveresult.reset_for_retry()
snapshot = archiveresult.snapshot
snapshot.status = snapshot.StatusChoices.QUEUED
snapshot.retry_at = timezone.now()
snapshot.save(update_fields=['status', 'retry_at', 'modified_at'])
crawl = snapshot.crawl
if crawl.status != crawl.StatusChoices.STARTED:
crawl.status = crawl.StatusChoices.QUEUED
crawl.retry_at = timezone.now()
crawl.save(update_fields=['status', 'retry_at', 'modified_at'])
run_crawl(str(crawl.id), snapshot_ids=[str(snapshot.id)], selected_plugins=[archiveresult.plugin])
archiveresult.refresh_from_db()
if archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED:
print(f'[green]Extraction succeeded: {archiveresult.output_str}[/green]')
return 0
elif archiveresult.status == ArchiveResult.StatusChoices.NORESULTS:
print(f'[dim]Extraction completed with no results: {archiveresult.output_str}[/dim]')
return 0
elif archiveresult.status == ArchiveResult.StatusChoices.FAILED:
print(f'[red]Extraction failed: {archiveresult.output_str}[/red]', file=sys.stderr)
return 1
@@ -121,8 +133,9 @@ def run_plugins(
rprint('[yellow]No snapshots provided. Pass snapshot IDs as arguments or via stdin.[/yellow]', file=sys.stderr)
return 1
# Gather snapshot IDs to process
# Gather snapshot IDs and optional plugin constraints to process
snapshot_ids = set()
requested_plugins_by_snapshot: dict[str, set[str]] = defaultdict(set)
for record in records:
record_type = record.get('type')
@@ -142,6 +155,9 @@ def run_plugins(
snapshot_id = record.get('snapshot_id')
if snapshot_id:
snapshot_ids.add(snapshot_id)
plugin_name = record.get('plugin')
if plugin_name and not plugins_list:
requested_plugins_by_snapshot[str(snapshot_id)].add(str(plugin_name))
elif 'id' in record:
# Assume it's a snapshot ID
@@ -160,26 +176,15 @@ def run_plugins(
rprint(f'[yellow]Snapshot {snapshot_id} not found[/yellow]', file=sys.stderr)
continue
# Create pending ArchiveResults if needed
if plugins_list:
# Only create for specific plugins
for plugin_name in plugins_list:
result, created = ArchiveResult.objects.get_or_create(
snapshot=snapshot,
plugin=plugin_name,
defaults={
'status': ArchiveResult.StatusChoices.QUEUED,
'retry_at': timezone.now(),
}
)
if not created and result.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED]:
# Reset for retry
result.status = ArchiveResult.StatusChoices.QUEUED
result.retry_at = timezone.now()
result.save()
else:
# Create all pending plugins
snapshot.create_pending_archiveresults()
for plugin_name in requested_plugins_by_snapshot.get(str(snapshot.id), set()):
existing_result = snapshot.archiveresult_set.filter(plugin=plugin_name).order_by('-created_at').first()
if existing_result and existing_result.status in [
ArchiveResult.StatusChoices.FAILED,
ArchiveResult.StatusChoices.SKIPPED,
ArchiveResult.StatusChoices.NORESULTS,
ArchiveResult.StatusChoices.BACKOFF,
]:
existing_result.reset_for_retry()
# Reset snapshot status to allow processing
if snapshot.status == Snapshot.StatusChoices.SEALED:
@@ -207,10 +212,15 @@ def run_plugins(
snapshot_ids_by_crawl[str(snapshot.crawl_id)].add(str(snapshot.id))
for crawl_id, crawl_snapshot_ids in snapshot_ids_by_crawl.items():
selected_plugins = plugins_list or sorted({
plugin
for snapshot_id in crawl_snapshot_ids
for plugin in requested_plugins_by_snapshot.get(str(snapshot_id), set())
}) or None
run_crawl(
crawl_id,
snapshot_ids=sorted(crawl_snapshot_ids),
selected_plugins=plugins_list or None,
selected_plugins=selected_plugins,
)
# Output results as JSONL (when piped) or human-readable (when TTY)

View File

@@ -18,9 +18,13 @@ from archivebox.cli.archivebox_snapshot import list_snapshots
@click.option('--tag', '-t', help='Filter by tag name')
@click.option('--crawl-id', help='Filter by crawl ID')
@click.option('--limit', '-n', type=int, help='Limit number of results')
@click.option('--sort', '-o', type=str, help='Field to sort by, e.g. url, created_at, bookmarked_at, downloaded_at')
@click.option('--csv', '-C', type=str, help='Print output as CSV with the provided fields, e.g.: timestamp,url,title')
@click.option('--with-headers', is_flag=True, help='Include column headers in structured output')
def main(status: Optional[str], url__icontains: Optional[str], url__istartswith: Optional[str],
tag: Optional[str], crawl_id: Optional[str], limit: Optional[int]) -> None:
"""List Snapshots as JSONL."""
tag: Optional[str], crawl_id: Optional[str], limit: Optional[int],
sort: Optional[str], csv: Optional[str], with_headers: bool) -> None:
"""List Snapshots."""
sys.exit(list_snapshots(
status=status,
url__icontains=url__icontains,
@@ -28,6 +32,9 @@ def main(status: Optional[str], url__icontains: Optional[str], url__istartswith:
tag=tag,
crawl_id=crawl_id,
limit=limit,
sort=sort,
csv=csv,
with_headers=with_headers,
))

View File

@@ -42,6 +42,7 @@ import rich_click as click
from rich import print as rprint
from archivebox.cli.cli_utils import apply_filters
from archivebox.personas import importers as persona_importers
# =============================================================================
@@ -440,8 +441,6 @@ def create_personas(
browser_binary = get_browser_binary(import_from)
if browser_binary:
rprint(f'[dim]Using {import_from} binary: {browser_binary}[/dim]', file=sys.stderr)
else:
browser_binary = None
created_count = 0
for name in name_list:
@@ -450,7 +449,7 @@ def create_personas(
continue
# Validate persona name to prevent path traversal
is_valid, error_msg = validate_persona_name(name)
is_valid, error_msg = persona_importers.validate_persona_name(name)
if not is_valid:
rprint(f'[red]Invalid persona name "{name}": {error_msg}[/red]', file=sys.stderr)
continue
@@ -468,49 +467,29 @@ def create_personas(
# Import browser profile if requested
if import_from in CHROMIUM_BROWSERS and source_profile_dir is not None:
persona_chrome_dir = Path(persona.CHROME_USER_DATA_DIR)
# Copy the browser profile
rprint(f'[dim]Copying browser profile to {persona_chrome_dir}...[/dim]', file=sys.stderr)
try:
# Remove existing chrome_user_data if it exists
if persona_chrome_dir.exists():
shutil.rmtree(persona_chrome_dir)
# Copy the profile directory
# We copy the entire user data dir, not just Default profile
shutil.copytree(
source_profile_dir,
persona_chrome_dir,
symlinks=True,
ignore=shutil.ignore_patterns(
'Cache', 'Code Cache', 'GPUCache', 'ShaderCache',
'Service Worker', 'GCM Store', '*.log', 'Crashpad',
'BrowserMetrics', 'BrowserMetrics-spare.pma',
'SingletonLock', 'SingletonSocket', 'SingletonCookie',
),
import_source = persona_importers.resolve_browser_import_source(import_from, profile_dir=profile)
import_result = persona_importers.import_persona_from_source(
persona,
import_source,
copy_profile=True,
import_cookies=True,
capture_storage=False,
)
rprint('[green]Copied browser profile to persona[/green]', file=sys.stderr)
# Extract cookies via CDP
rprint('[dim]Extracting cookies via CDP...[/dim]', file=sys.stderr)
if extract_cookies_via_cdp(
persona_chrome_dir,
cookies_file,
profile_dir=profile,
chrome_binary=browser_binary,
):
rprint(f'[green]Extracted cookies to {cookies_file}[/green]', file=sys.stderr)
else:
rprint('[yellow]Could not extract cookies automatically.[/yellow]', file=sys.stderr)
rprint('[dim]You can manually export cookies using a browser extension.[/dim]', file=sys.stderr)
except Exception as e:
rprint(f'[red]Failed to copy browser profile: {e}[/red]', file=sys.stderr)
rprint(f'[red]Failed to import browser profile: {e}[/red]', file=sys.stderr)
return 1
if import_result.profile_copied:
rprint('[green]Copied browser profile to persona[/green]', file=sys.stderr)
if import_result.cookies_imported:
rprint(f'[green]Extracted cookies to {cookies_file}[/green]', file=sys.stderr)
elif not import_result.profile_copied:
rprint('[yellow]Could not import cookies automatically.[/yellow]', file=sys.stderr)
for warning in import_result.warnings:
rprint(f'[yellow]{warning}[/yellow]', file=sys.stderr)
if not is_tty:
write_record({
'id': str(persona.id) if hasattr(persona, 'id') else None,
@@ -616,7 +595,7 @@ def update_personas(name: Optional[str] = None) -> int:
# Apply updates from CLI flags
if name:
# Validate new name to prevent path traversal
is_valid, error_msg = validate_persona_name(name)
is_valid, error_msg = persona_importers.validate_persona_name(name)
if not is_valid:
rprint(f'[red]Invalid new persona name "{name}": {error_msg}[/red]', file=sys.stderr)
continue

View File

@@ -89,56 +89,6 @@ SNAPSHOT_MACHINE_DIAGRAM = """
└─────────────────────────────────────────────────────────────────────────────┘
"""
ARCHIVERESULT_MACHINE_DIAGRAM = """
┌─────────────────────────────────────────────────────────────────────────────┐
│ ArchiveResultMachine │
├─────────────────────────────────────────────────────────────────────────────┤
│ │
│ ┌─────────────┐ │
│ │ QUEUED │◄─────────────────┐ │
│ │ (initial) │ │ │
│ └──┬───────┬──┘ │ │
│ │ │ │ tick() unless can_start() │
│ │ │ exceeded_max_ │ │
│ │ │ attempts │ │
│ │ ▼ │ │
│ │ ┌──────────┐ │ │
│ │ │ SKIPPED │ │ │
│ │ │ (final) │ │ │
│ │ └──────────┘ │ │
│ │ tick() when │ │
│ │ can_start() │ │
│ ▼ │ │
│ ┌─────────────┐ │ │
│ │ STARTED │──────────────────┘ │
│ │ │◄─────────────────────────────────────────────────┐ │
│ │ enter: │ │ │ │
│ │ result.run()│ tick() unless │ │ │
│ │ (execute │ is_finished() │ │ │
│ │ hook via │──────────────────────┘ │ │
│ │ run_hook())│ │ │
│ └──────┬──────┘ │ │
│ │ │ │
│ │ tick() checks status set by hook output │ │
│ ├─────────────┬─────────────┬─────────────┐ │ │
│ ▼ ▼ ▼ ▼ │ │
│ ┌───────────┐ ┌───────────┐ ┌───────────┐ ┌───────────┐ │ │
│ │ SUCCEEDED │ │ FAILED │ │ SKIPPED │ │ BACKOFF │ │ │
│ │ (final) │ │ (final) │ │ (final) │ │ │ │ │
│ └───────────┘ └───────────┘ └───────────┘ └──┬──────┬─┘ │ │
│ │ │ │ │
│ exceeded_max_ │ │ can_start()│ │
│ attempts │ │ loops back │ │
│ ▼ │ └────────────┘ │
│ ┌──────────┐ │ │
│ │ SKIPPED │◄─┘ │
│ │ (final) │ │
│ └──────────┘ │
│ │
│ Each ArchiveResult runs ONE specific hook (stored in .hook_name field) │
└─────────────────────────────────────────────────────────────────────────────┘
"""
BINARY_MACHINE_DIAGRAM = """
┌─────────────────────────────────────────────────────────────────────────────┐
│ BinaryMachine │
@@ -193,8 +143,8 @@ def pluginmap(
"""
Show a map of all state machines and their associated plugin hooks.
Displays ASCII art diagrams of the core model state machines (Crawl, Snapshot,
ArchiveResult, Binary) and lists all auto-detected on_Modelname_xyz hooks
Displays ASCII art diagrams of the core queued model state machines (Crawl,
Snapshot, Binary) and lists all auto-detected on_Modelname_xyz hooks
that will run for each model's transitions.
"""
from rich.console import Console
@@ -257,17 +207,6 @@ def pluginmap(
prnt(f'[dim]User plugins: {USER_PLUGINS_DIR}[/dim]')
prnt()
# Show diagrams first (unless quiet mode)
if not quiet:
# Show ArchiveResult diagram separately since it's different
prnt(Panel(
ARCHIVERESULT_MACHINE_DIAGRAM,
title='[bold green]ArchiveResultMachine[/bold green]',
border_style='green',
expand=False,
))
prnt()
for event_name, info in model_events.items():
# Discover hooks for this event
hooks = discover_hooks(event_name, filter_disabled=not show_disabled)

View File

@@ -145,17 +145,25 @@ def process_stdin_records() -> int:
try:
archiveresult = ArchiveResult.objects.get(id=record_id)
except ArchiveResult.DoesNotExist:
archiveresult = ArchiveResult.from_json(record)
archiveresult = None
else:
# New archiveresult - create it
archiveresult = ArchiveResult.from_json(record)
archiveresult = None
snapshot_id = record.get('snapshot_id')
plugin_name = record.get('plugin')
snapshot = None
if archiveresult:
archiveresult.retry_at = timezone.now()
if archiveresult.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED, ArchiveResult.StatusChoices.BACKOFF]:
archiveresult.status = ArchiveResult.StatusChoices.QUEUED
archiveresult.save()
if archiveresult.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED, ArchiveResult.StatusChoices.NORESULTS, ArchiveResult.StatusChoices.BACKOFF]:
archiveresult.reset_for_retry()
snapshot = archiveresult.snapshot
plugin_name = plugin_name or archiveresult.plugin
elif snapshot_id:
try:
snapshot = Snapshot.objects.get(id=snapshot_id)
except Snapshot.DoesNotExist:
snapshot = None
if snapshot:
snapshot.retry_at = timezone.now()
if snapshot.status != Snapshot.StatusChoices.STARTED:
snapshot.status = Snapshot.StatusChoices.QUEUED
@@ -167,9 +175,9 @@ def process_stdin_records() -> int:
crawl.save(update_fields=['status', 'retry_at', 'modified_at'])
crawl_id = str(snapshot.crawl_id)
snapshot_ids_by_crawl[crawl_id].add(str(snapshot.id))
if archiveresult.plugin:
plugin_names_by_crawl[crawl_id].add(archiveresult.plugin)
output_records.append(archiveresult.to_json())
if plugin_name:
plugin_names_by_crawl[crawl_id].add(str(plugin_name))
output_records.append(record if not archiveresult else archiveresult.to_json())
queued_count += 1
elif record_type == TYPE_BINARY:
@@ -234,9 +242,11 @@ def run_runner(daemon: bool = False) -> int:
"""
from django.utils import timezone
from archivebox.machine.models import Machine, Process
from archivebox.services.runner import run_pending_crawls
from archivebox.services.runner import recover_orphaned_crawls, recover_orphaned_snapshots, run_pending_crawls
Process.cleanup_stale_running()
recover_orphaned_snapshots()
recover_orphaned_crawls()
Machine.current()
current = Process.current()
if current.process_type != Process.TypeChoices.ORCHESTRATOR:
@@ -305,6 +315,13 @@ def main(daemon: bool, crawl_id: str, snapshot_id: str, binary_id: str):
traceback.print_exc()
sys.exit(1)
if daemon:
if not sys.stdin.isatty():
exit_code = process_stdin_records()
if exit_code != 0:
sys.exit(exit_code)
sys.exit(run_runner(daemon=True))
if not sys.stdin.isatty():
sys.exit(process_stdin_records())
else:

View File

@@ -3,9 +3,7 @@
__package__ = 'archivebox.cli'
from typing import Iterable
import os
import sys
import subprocess
import rich_click as click
from rich import print
@@ -14,6 +12,41 @@ from archivebox.misc.util import docstring, enforce_types
from archivebox.config.common import SERVER_CONFIG
def stop_existing_background_runner(*, machine, process_model, supervisor=None, stop_worker_fn=None, log=print) -> int:
"""Stop any existing orchestrator process so the server can take ownership."""
process_model.cleanup_stale_running(machine=machine)
running_runners = list(process_model.objects.filter(
machine=machine,
status=process_model.StatusChoices.RUNNING,
process_type=process_model.TypeChoices.ORCHESTRATOR,
).order_by('created_at'))
if not running_runners:
return 0
log('[yellow][*] Stopping existing ArchiveBox background runner...[/yellow]')
if supervisor is not None and stop_worker_fn is not None:
for worker_name in ('worker_runner', 'worker_runner_watch'):
try:
stop_worker_fn(supervisor, worker_name)
except Exception:
pass
for proc in running_runners:
try:
proc.kill_tree(graceful_timeout=2.0)
except Exception:
try:
proc.terminate(graceful_timeout=2.0)
except Exception:
pass
process_model.cleanup_stale_running(machine=machine)
return len(running_runners)
@enforce_types
def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
reload: bool=False,
@@ -39,25 +72,6 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
if debug or reload:
SHELL_CONFIG.DEBUG = True
if run_in_debug:
os.environ['ARCHIVEBOX_RUNSERVER'] = '1'
if reload:
os.environ['ARCHIVEBOX_AUTORELOAD'] = '1'
from archivebox.config.common import STORAGE_CONFIG
pidfile = str(STORAGE_CONFIG.TMP_DIR / 'runserver.pid')
os.environ['ARCHIVEBOX_RUNSERVER_PIDFILE'] = pidfile
from django.utils.autoreload import DJANGO_AUTORELOAD_ENV
is_reloader_child = os.environ.get(DJANGO_AUTORELOAD_ENV) == 'true'
if not is_reloader_child:
env = os.environ.copy()
subprocess.Popen(
[sys.executable, '-m', 'archivebox', 'manage', 'runner_watch', f'--pidfile={pidfile}'],
env=env,
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
)
from django.contrib.auth.models import User
if not User.objects.filter(is_superuser=True).exclude(username='system').exists():
@@ -81,73 +95,62 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
except IndexError:
pass
from archivebox.workers.supervisord_util import (
get_existing_supervisord_process,
get_worker,
stop_worker,
start_server_workers,
is_port_in_use,
)
from archivebox.machine.models import Machine, Process
# Check if port is already in use
if is_port_in_use(host, int(port)):
print(f'[red][X] Error: Port {port} is already in use[/red]')
print(f' Another process (possibly daphne or runserver) is already listening on {host}:{port}')
print(' Stop the conflicting process or choose a different port')
sys.exit(1)
machine = Machine.current()
stop_existing_background_runner(
machine=machine,
process_model=Process,
supervisor=get_existing_supervisord_process(),
stop_worker_fn=stop_worker,
)
supervisor = get_existing_supervisord_process()
if supervisor:
server_worker_name = 'worker_runserver' if run_in_debug else 'worker_daphne'
server_proc = get_worker(supervisor, server_worker_name)
server_state = server_proc.get('statename') if isinstance(server_proc, dict) else None
if server_state == 'RUNNING':
runner_proc = get_worker(supervisor, 'worker_runner')
runner_watch_proc = get_worker(supervisor, 'worker_runner_watch')
runner_state = runner_proc.get('statename') if isinstance(runner_proc, dict) else None
runner_watch_state = runner_watch_proc.get('statename') if isinstance(runner_watch_proc, dict) else None
print('[red][X] Error: ArchiveBox server is already running[/red]')
print(f' [green]√[/green] Web server ({server_worker_name}) is RUNNING on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
if runner_state == 'RUNNING':
print(' [green]√[/green] Background runner (worker_runner) is RUNNING')
if runner_watch_state == 'RUNNING':
print(' [green]√[/green] Reload watcher (worker_runner_watch) is RUNNING')
print()
print('[yellow]To stop the existing server, run:[/yellow]')
print(' pkill -f "archivebox server"')
print(' pkill -f supervisord')
sys.exit(1)
if run_in_debug:
from django.core.management import call_command
print('[green][+] Starting ArchiveBox webserver in DEBUG mode...[/green]')
print(f' [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
print(f' [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]')
print(' > Writing ArchiveBox error log to ./logs/errors.log')
if not reload:
runserver_args.append('--noreload') # '--insecure'
if nothreading:
runserver_args.append('--nothreading')
call_command("runserver", *runserver_args)
else:
from archivebox.workers.supervisord_util import (
get_existing_supervisord_process,
get_worker,
start_server_workers,
is_port_in_use,
)
from archivebox.machine.models import Machine, Process
# Check if port is already in use
if is_port_in_use(host, int(port)):
print(f'[red][X] Error: Port {port} is already in use[/red]')
print(f' Another process (possibly daphne) is already listening on {host}:{port}')
print(' Stop the conflicting process or choose a different port')
sys.exit(1)
# Check if the background crawl runner is already running for this data directory
if Process.objects.filter(
machine=Machine.current(),
status=Process.StatusChoices.RUNNING,
process_type=Process.TypeChoices.ORCHESTRATOR,
).exists():
print('[red][X] Error: ArchiveBox background runner is already running for this data directory[/red]')
print(' Stop the existing runner before starting a new server')
print(' To stop: pkill -f "archivebox run --daemon"')
sys.exit(1)
# Check if supervisord is already running
supervisor = get_existing_supervisord_process()
if supervisor:
daphne_proc = get_worker(supervisor, 'worker_daphne')
daphne_state = daphne_proc.get('statename') if isinstance(daphne_proc, dict) else None
# If daphne is already running, error out
if daphne_state == 'RUNNING':
runner_proc = get_worker(supervisor, 'worker_runner')
runner_state = runner_proc.get('statename') if isinstance(runner_proc, dict) else None
print('[red][X] Error: ArchiveBox server is already running[/red]')
print(f' [green]√[/green] Web server (worker_daphne) is RUNNING on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
if runner_state == 'RUNNING':
print(' [green]√[/green] Background runner (worker_runner) is RUNNING')
print()
print('[yellow]To stop the existing server, run:[/yellow]')
print(' pkill -f "archivebox server"')
print(' pkill -f supervisord')
sys.exit(1)
# Otherwise, daphne is not running - fall through to start it
# No existing workers found - start new ones
print('[green][+] Starting ArchiveBox webserver...[/green]')
print(f' [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
print(f' [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]')
print(' > Writing ArchiveBox error log to ./logs/errors.log')
print()
start_server_workers(host=host, port=port, daemonize=daemonize)
print("\n[i][green][🟩] ArchiveBox server shut down gracefully.[/green][/i]")
print(f' [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
print(f' [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]')
print(' > Writing ArchiveBox error log to ./logs/errors.log')
print()
start_server_workers(host=host, port=port, daemonize=daemonize, debug=run_in_debug, reload=reload, nothreading=nothreading)
print("\n[i][green][🟩] ArchiveBox server shut down gracefully.[/green][/i]")
@click.command()

View File

@@ -172,6 +172,9 @@ def list_snapshots(
tag: Optional[str] = None,
crawl_id: Optional[str] = None,
limit: Optional[int] = None,
sort: Optional[str] = None,
csv: Optional[str] = None,
with_headers: bool = False,
) -> int:
"""
List Snapshots as JSONL with optional filters.
@@ -182,7 +185,11 @@ def list_snapshots(
from archivebox.misc.jsonl import write_record
from archivebox.core.models import Snapshot
is_tty = sys.stdout.isatty()
if with_headers and not csv:
rprint('[red]--with-headers requires --csv[/red]', file=sys.stderr)
return 2
is_tty = sys.stdout.isatty() and not csv
queryset = Snapshot.objects.all().order_by('-created_at')
@@ -199,7 +206,29 @@ def list_snapshots(
if tag:
queryset = queryset.filter(tags__name__iexact=tag)
if sort:
queryset = queryset.order_by(sort)
count = 0
if csv:
cols = [col.strip() for col in csv.split(',') if col.strip()]
if not cols:
rprint('[red]No CSV columns provided[/red]', file=sys.stderr)
return 2
rows: list[str] = []
if with_headers:
rows.append(','.join(cols))
for snapshot in queryset.iterator(chunk_size=500):
rows.append(snapshot.to_csv(cols=cols, separator=','))
count += 1
output = '\n'.join(rows)
if output:
sys.stdout.write(output)
if not output.endswith('\n'):
sys.stdout.write('\n')
rprint(f'[dim]Listed {count} snapshots[/dim]', file=sys.stderr)
return 0
for snapshot in queryset:
if is_tty:
status_color = {

View File

@@ -1,6 +1,7 @@
__package__ = "archivebox.config"
import re
import secrets
import sys
import shutil
from typing import ClassVar, Dict, Optional, List
@@ -8,7 +9,6 @@ from pathlib import Path
from rich import print
from pydantic import Field, field_validator
from django.utils.crypto import get_random_string
from archivebox.config.configset import BaseConfigSet
@@ -104,7 +104,7 @@ class ServerConfig(BaseConfigSet):
"danger-onedomain-fullreplay",
)
SECRET_KEY: str = Field(default_factory=lambda: get_random_string(50, "abcdefghijklmnopqrstuvwxyz0123456789_"))
SECRET_KEY: str = Field(default_factory=lambda: ''.join(secrets.choice("abcdefghijklmnopqrstuvwxyz0123456789_") for _ in range(50)))
BIND_ADDR: str = Field(default="127.0.0.1:8000")
LISTEN_HOST: str = Field(default="archivebox.localhost:8000")
ADMIN_BASE_URL: str = Field(default="")

View File

@@ -1,10 +1,13 @@
__package__ = 'archivebox.config'
import html
import json
import os
import shutil
import inspect
import re
from pathlib import Path
from typing import Any, Dict
from typing import Any, Callable, Dict
from urllib.parse import quote, urlencode
from django.http import HttpRequest
from django.utils import timezone
from django.utils.html import format_html
@@ -18,16 +21,27 @@ from archivebox.misc.util import parse_date
from archivebox.machine.models import Binary
ABX_PLUGINS_DOCS_BASE_URL = 'https://archivebox.github.io/abx-plugins/'
ABX_PLUGINS_GITHUB_BASE_URL = 'https://github.com/ArchiveBox/abx-plugins/tree/main/abx_plugins/plugins/'
LIVE_CONFIG_BASE_URL = '/admin/environment/config/'
ENVIRONMENT_BINARIES_BASE_URL = '/admin/environment/binaries/'
INSTALLED_BINARIES_BASE_URL = '/admin/machine/binary/'
# Common binaries to check for
KNOWN_BINARIES = [
'wget', 'curl', 'chromium', 'chrome', 'google-chrome', 'google-chrome-stable',
'node', 'npm', 'npx', 'yt-dlp', 'ytdlp', 'youtube-dl',
'node', 'npm', 'npx', 'yt-dlp',
'git', 'singlefile', 'readability-extractor', 'mercury-parser',
'python3', 'python', 'bash', 'zsh',
'ffmpeg', 'ripgrep', 'rg', 'sonic', 'archivebox',
]
CANONICAL_BINARY_ALIASES = {
'youtube-dl': 'yt-dlp',
'ytdlp': 'yt-dlp',
}
def is_superuser(request: HttpRequest) -> bool:
    """Return True only when the request's user object carries the is_superuser flag."""
    flag = getattr(request.user, 'is_superuser', False)
    return bool(flag)
@@ -38,6 +52,249 @@ def format_parsed_datetime(value: object) -> str:
return parsed.strftime("%Y-%m-%d %H:%M:%S") if parsed else ""
# Tokenizer for lightweight JSON syntax highlighting: a quoted string followed
# by a colon is a "key"; otherwise it is a plain string.  Booleans, null, and
# numbers each get their own named group.
JSON_TOKEN_RE = re.compile(
    r'(?P<key>"(?:\\u[a-fA-F0-9]{4}|\\[^u]|[^\\"])*")(?=\s*:)'
    r'|(?P<string>"(?:\\u[a-fA-F0-9]{4}|\\[^u]|[^\\"])*")'
    r'|(?P<boolean>\btrue\b|\bfalse\b)'
    r'|(?P<null>\bnull\b)'
    r'|(?P<number>-?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?)'
)


def render_code_block(text: str, *, highlighted: bool = False) -> str:
    """Render *text* inside a scrollable, styled <pre><code> block.

    The text is HTML-escaped first (quotes are left intact).  When
    ``highlighted=True``, JSON tokens matched by JSON_TOKEN_RE are wrapped in
    colored <span> tags for inline syntax highlighting.
    """
    escaped = html.escape(text, quote=False)
    if highlighted:
        token_styles = {
            'key': 'color: #0550ae;',
            'string': 'color: #0a7f45;',
            'boolean': 'color: #8250df; font-weight: 600;',
            'null': 'color: #6e7781; font-style: italic;',
            'number': 'color: #b35900;',
        }

        def _colorize(match: re.Match[str]) -> str:
            # Match.lastgroup names whichever named alternative matched.
            style = token_styles[match.lastgroup]
            return f'<span style="{style}">{match.group(0)}</span>'

        escaped = JSON_TOKEN_RE.sub(_colorize, escaped)
    return (
        '<pre style="max-height: 600px; overflow: auto; background: #f6f8fa; '
        'border: 1px solid #d0d7de; border-radius: 6px; padding: 12px; margin: 0;">'
        '<code style="font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, '
        '\'Liberation Mono\', monospace; white-space: pre; line-height: 1.5;">'
        f'{escaped}'
        '</code></pre>'
    )
def render_highlighted_json_block(value: Any) -> str:
    """Pretty-print *value* as indented JSON and wrap it in a highlighted code block."""
    pretty = json.dumps(value, indent=2, ensure_ascii=False)
    return render_code_block(pretty, highlighted=True)
def get_plugin_docs_url(plugin_name: str) -> str:
    """Return the anchor URL for *plugin_name* on the ABX plugins docs site."""
    return f'{ABX_PLUGINS_DOCS_BASE_URL}#{plugin_name}'
def get_plugin_hook_source_url(plugin_name: str, hook_name: str) -> str:
    """Return the GitHub source URL for one hook file inside a builtin ABX plugin."""
    return f'{ABX_PLUGINS_GITHUB_BASE_URL}{quote(plugin_name)}/{quote(hook_name)}'
def get_live_config_url(key: str) -> str:
    """Return the admin live-config detail URL for a config *key*."""
    return f'{LIVE_CONFIG_BASE_URL}{quote(key)}/'
def get_environment_binary_url(name: str) -> str:
    """Return the admin environment-binaries detail URL for a binary *name*."""
    return f'{ENVIRONMENT_BINARIES_BASE_URL}{quote(name)}/'
def get_installed_binary_change_url(name: str, binary: Any) -> str | None:
    """Return the admin change URL for an installed Binary record, or None if it has no id.

    NOTE(review): `_changelist_filters` is double-urlencoded here (the inner
    querystring is encoded, then embedded as a single encoded parameter) —
    presumably matching Django admin's nested preserved-filters convention;
    confirm against the admin changelist handling.
    """
    binary_id = getattr(binary, 'id', None)
    if not binary_id:
        return None
    # Prefer the record's own admin_change_url when present; otherwise build the default admin path.
    base_url = getattr(binary, 'admin_change_url', None) or f'{INSTALLED_BINARIES_BASE_URL}{binary_id}/change/'
    changelist_filters = urlencode({'q': canonical_binary_name(name)})
    return f'{base_url}?{urlencode({"_changelist_filters": changelist_filters})}'
def get_machine_admin_url() -> str | None:
    """Best-effort lookup of the current Machine's admin change URL.

    Returns None on any failure (import error, no current machine, etc.) so
    callers can simply skip rendering the link.
    """
    try:
        from archivebox.machine.models import Machine
    except Exception:
        return None
    try:
        return Machine.current().admin_change_url
    except Exception:
        return None
def render_code_tag_list(values: list[str]) -> str:
    """Render *values* as a wrapping row of pill-styled <code> tags.

    Returns a gray '(none)' placeholder when the list is empty.
    """
    if not values:
        return '<span style="color: #6e7781;">(none)</span>'
    rendered: list[str] = []
    for value in values:
        pill = format_html(
            '<code style="display: inline-block; margin: 0 6px 6px 0; padding: 2px 6px; '
            'background: #f6f8fa; border: 1px solid #d0d7de; border-radius: 999px;">{}</code>',
            value,
        )
        rendered.append(str(pill))
    return f'<div style="display: flex; flex-wrap: wrap;">{"".join(rendered)}</div>'
def render_plugin_metadata_html(config: dict[str, Any]) -> str:
    """Render a plugin's config.json metadata (title, description, requirements)
    as a stack of labeled rows for the plugin detail page.

    Required plugins/binaries are rendered as linked pills; MIME types as plain
    pills.  Missing fields fall back to '(none)'.
    """
    rows = (
        ('Title', config.get('title') or '(none)'),
        ('Description', config.get('description') or '(none)'),
        ('Required Plugins', mark_safe(render_link_tag_list(config.get('required_plugins') or [], get_plugin_docs_url))),
        ('Required Binaries', mark_safe(render_link_tag_list(config.get('required_binaries') or [], get_environment_binary_url))),
        ('Output MIME Types', mark_safe(render_code_tag_list(config.get('output_mimetypes') or []))),
    )
    rendered_rows = ''.join(
        str(format_html(
            '<div style="margin: 0 0 14px 0;">'
            '<div style="font-weight: 600; margin-bottom: 4px;">{}</div>'
            '<div>{}</div>'
            '</div>',
            label,
            value,
        ))
        for label, value in rows
    )
    return f'<div style="margin: 4px 0 0 0;">{rendered_rows}</div>'
def render_link_tag_list(values: list[str], url_resolver: Callable[[str], str] | None = None) -> str:
if not values:
return '<span style="color: #6e7781;">(none)</span>'
tags = []
for value in values:
if url_resolver is None:
tags.append(str(format_html(
'<code style="display: inline-block; margin: 0 6px 6px 0; padding: 2px 6px; '
'background: #f6f8fa; border: 1px solid #d0d7de; border-radius: 999px;">{}</code>',
value,
)))
else:
tags.append(str(format_html(
'<a href="{}" style="text-decoration: none;">'
'<code style="display: inline-block; margin: 0 6px 6px 0; padding: 2px 6px; '
'background: #f6f8fa; border: 1px solid #d0d7de; border-radius: 999px;">{}</code>'
'</a>',
url_resolver(value),
value,
)))
return f'<div style="display: flex; flex-wrap: wrap;">{"".join(tags)}</div>'
def render_property_links(prop_name: str, prop_info: dict[str, Any], machine_admin_url: str | None) -> str:
    """Build the inline action links displayed under one config property.

    Always links the computed value; conditionally adds the machine override
    editor, the `x-fallback` key, each `x-aliases` entry, and (for `*_BINARY`
    properties with a string default) the binary's environment detail page.
    """
    links = [
        str(format_html('<a href="{}">Computed value</a>', get_live_config_url(prop_name))),
    ]
    if machine_admin_url:
        links.append(str(format_html('<a href="{}">Edit override</a>', machine_admin_url)))
    fallback = prop_info.get('x-fallback')
    if isinstance(fallback, str) and fallback:
        links.append(str(format_html('<a href="{}">Fallback: <code>{}</code></a>', get_live_config_url(fallback), fallback)))
    aliases = prop_info.get('x-aliases') or []
    if isinstance(aliases, list):
        for alias in aliases:
            if isinstance(alias, str) and alias:
                links.append(str(format_html('<a href="{}">Alias: <code>{}</code></a>', get_live_config_url(alias), alias)))
    default = prop_info.get('default')
    if prop_name.endswith('_BINARY') and isinstance(default, str) and default:
        links.append(str(format_html('<a href="{}">Binary: <code>{}</code></a>', get_environment_binary_url(default), default)))
    return ' &nbsp; '.join(links)
def render_config_properties_html(properties: dict[str, Any], machine_admin_url: str | None) -> str:
    """Render a plugin's JSON-schema config properties as a stack of styled cards.

    Each card shows the property name (linked to its live-config page), its
    type, description, related action links, and default value if present.
    A header row links to the dependencies / installed-binaries admin pages.
    """
    header_links = [
        str(format_html('<a href="{}">Dependencies</a>', ENVIRONMENT_BINARIES_BASE_URL)),
        str(format_html('<a href="{}">Installed Binaries</a>', INSTALLED_BINARIES_BASE_URL)),
    ]
    if machine_admin_url:
        header_links.insert(0, str(format_html('<a href="{}">Machine Config Editor</a>', machine_admin_url)))
    cards = [
        f'<div style="margin: 0 0 16px 0;">{" &nbsp; | &nbsp; ".join(header_links)}</div>'
    ]
    for prop_name, prop_info in properties.items():
        prop_type = prop_info.get('type', 'unknown')
        # JSON schema allows a list of types (e.g. ["string", "null"]); join them for display.
        if isinstance(prop_type, list):
            prop_type = ' | '.join(str(type_name) for type_name in prop_type)
        prop_desc = prop_info.get('description', '')
        default_html = ''
        if 'default' in prop_info:
            default_html = str(format_html(
                '<div style="margin-top: 6px;"><b>Default:</b> <code>{}</code></div>',
                prop_info['default'],
            ))
        description_html = prop_desc or mark_safe('<span style="color: #6e7781;">(no description)</span>')
        cards.append(str(format_html(
            '<div style="margin: 0 0 14px 0; padding: 12px; background: #f6f8fa; border: 1px solid #d0d7de; border-radius: 6px;">'
            '<div style="margin-bottom: 6px;">'
            '<a href="{}" style="font-weight: 600;"><code>{}</code></a>'
            ' <span style="color: #6e7781;">({})</span>'
            '</div>'
            '<div style="margin-bottom: 6px;">{}</div>'
            '<div style="font-size: 0.95em;">{}</div>'
            '{}'
            '</div>',
            get_live_config_url(prop_name),
            prop_name,
            prop_type,
            description_html,
            mark_safe(render_property_links(prop_name, prop_info, machine_admin_url)),
            mark_safe(default_html),
        )))
    return ''.join(cards)
def render_hook_links_html(plugin_name: str, hooks: list[str], source: str) -> str:
    """List a plugin's hook files, linking each to its GitHub source when the
    plugin is builtin; user-plugin hooks are rendered as plain code tags.

    Returns a gray '(none)' placeholder when there are no hooks.
    """
    if not hooks:
        return '<span style="color: #6e7781;">(none)</span>'
    link_to_source = source == 'builtin'
    chunks: list[str] = []
    for hook_name in hooks:
        if link_to_source:
            rendered = format_html(
                '<div style="margin: 0 0 8px 0;">'
                '<a href="{}" target="_blank" rel="noopener noreferrer"><code>{}</code></a>'
                '</div>',
                get_plugin_hook_source_url(plugin_name, hook_name),
                hook_name,
            )
        else:
            rendered = format_html(
                '<div style="margin: 0 0 8px 0;"><code>{}</code></div>',
                hook_name,
            )
        chunks.append(str(rendered))
    return ''.join(chunks)
def render_binary_detail_description(name: str, merged: dict[str, Any], db_binary: Any) -> str:
    """Detail-page description for a binary: its abspath, plus a link to the
    persisted Binary admin record when one exists.

    *merged* is the dict produced by serialize_binary_record().
    """
    installed_binary_url = get_installed_binary_change_url(name, db_binary)
    if installed_binary_url:
        return str(format_html(
            '<code>{}</code><br/>'
            '<a href="{}">View Installed Binary Record</a>',
            merged['abspath'],
            installed_binary_url,
        ))
    return str(format_html('<code>{}</code>', merged['abspath']))
def obj_to_yaml(obj: Any, indent: int = 0) -> str:
indent_str = " " * indent
if indent == 0:
@@ -80,21 +337,41 @@ def obj_to_yaml(obj: Any, indent: int = 0) -> str:
return f" {str(obj)}"
def get_detected_binaries() -> Dict[str, Dict[str, Any]]:
"""Detect available binaries using shutil.which."""
binaries = {}
def canonical_binary_name(name: str) -> str:
    """Map legacy/alias binary names (e.g. youtube-dl, ytdlp) to their canonical name."""
    return CANONICAL_BINARY_ALIASES.get(name, name)
for name in KNOWN_BINARIES:
path = shutil.which(name)
if path:
binaries[name] = {
'name': name,
'abspath': path,
'version': None, # Could add version detection later
'is_available': True,
}
return binaries
def _binary_sort_key(binary: Binary) -> tuple[int, int, int, Any]:
    """Ranking key for Binary rows: installed > has version > has abspath,
    with most-recently-modified breaking ties.

    Used with max() to pick the best record among duplicates for one name.
    """
    return (
        int(binary.status == Binary.StatusChoices.INSTALLED),
        int(bool(binary.version)),
        int(bool(binary.abspath)),
        binary.modified_at,
    )
def get_db_binaries_by_name() -> Dict[str, Binary]:
    """Map each canonical binary name to its best Binary row.

    Rows are grouped under canonical_binary_name() and the winner per group is
    chosen by _binary_sort_key (installed/versioned/recent rows win).
    """
    by_name: Dict[str, list[Binary]] = {}
    for record in Binary.objects.all():
        key = canonical_binary_name(record.name)
        if key not in by_name:
            by_name[key] = []
        by_name[key].append(record)
    best: Dict[str, Binary] = {}
    for key, records in by_name.items():
        best[key] = max(records, key=_binary_sort_key)
    return best
def serialize_binary_record(name: str, binary: Binary | None) -> Dict[str, Any]:
    """Flatten a Binary row (or None) into a dict of plain display strings.

    Missing/None attributes become empty strings.  `is_available` is True only
    when the record has INSTALLED status AND a recorded abspath.
    """
    is_installed = bool(binary and binary.status == Binary.StatusChoices.INSTALLED)
    return {
        'name': canonical_binary_name(name),
        'version': str(getattr(binary, 'version', '') or ''),
        'binprovider': str(getattr(binary, 'binprovider', '') or ''),
        'abspath': str(getattr(binary, 'abspath', '') or ''),
        'sha256': str(getattr(binary, 'sha256', '') or ''),
        'status': str(getattr(binary, 'status', '') or ''),
        'is_available': is_installed and bool(getattr(binary, 'abspath', '') or ''),
    }
def get_filesystem_plugins() -> Dict[str, Dict[str, Any]]:
@@ -150,29 +427,18 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
"Found Abspath": [],
}
# Get binaries from database (previously detected/installed)
db_binaries = {b.name: b for b in Binary.objects.all()}
# Get currently detectable binaries
detected = get_detected_binaries()
# Merge and display
all_binary_names = sorted(set(list(db_binaries.keys()) + list(detected.keys())))
db_binaries = get_db_binaries_by_name()
all_binary_names = sorted(db_binaries.keys())
for name in all_binary_names:
db_binary = db_binaries.get(name)
detected_binary = detected.get(name)
merged = serialize_binary_record(name, db_binaries.get(name))
rows['Binary Name'].append(ItemLink(name, key=name))
if db_binary:
rows['Found Version'].append(f'{db_binary.version}' if db_binary.version else '✅ found')
rows['Provided By'].append(db_binary.binprovider or 'PATH')
rows['Found Abspath'].append(str(db_binary.abspath or ''))
elif detected_binary:
rows['Found Version'].append('✅ found')
rows['Provided By'].append('PATH')
rows['Found Abspath'].append(detected_binary['abspath'])
if merged['is_available']:
rows['Found Version'].append(f"{merged['version']}" if merged['version'] else '✅ found')
rows['Provided By'].append(merged['binprovider'] or '-')
rows['Found Abspath'].append(merged['abspath'] or '-')
else:
rows['Found Version'].append('❌ missing')
rows['Provided By'].append('-')
@@ -187,41 +453,22 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
@render_with_item_view
def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
assert is_superuser(request), 'Must be a superuser to view configuration settings.'
key = canonical_binary_name(key)
# Try database first
try:
binary = Binary.objects.get(name=key)
section: SectionData = {
"name": binary.name,
"description": str(binary.abspath or ''),
"fields": {
'name': binary.name,
'binprovider': binary.binprovider,
'abspath': str(binary.abspath),
'version': binary.version,
'sha256': binary.sha256,
},
"help_texts": {},
}
return ItemContext(
slug=key,
title=key,
data=[section],
)
except Binary.DoesNotExist:
pass
db_binary = get_db_binaries_by_name().get(key)
merged = serialize_binary_record(key, db_binary)
# Try to detect from PATH
path = shutil.which(key)
if path:
if merged['is_available']:
section: SectionData = {
"name": key,
"description": path,
"description": mark_safe(render_binary_detail_description(key, merged, db_binary)),
"fields": {
'name': key,
'binprovider': 'PATH',
'abspath': path,
'version': 'unknown',
'binprovider': merged['binprovider'] or '-',
'abspath': merged['abspath'] or 'not found',
'version': merged['version'] or 'unknown',
'sha256': merged['sha256'],
'status': merged['status'],
},
"help_texts": {},
}
@@ -233,12 +480,13 @@ def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
section: SectionData = {
"name": key,
"description": "Binary not found",
"description": "No persisted Binary record found",
"fields": {
'name': key,
'binprovider': 'not installed',
'abspath': 'not found',
'version': 'N/A',
'binprovider': merged['binprovider'] or 'not recorded',
'abspath': merged['abspath'] or 'not recorded',
'version': merged['version'] or 'N/A',
'status': merged['status'] or 'unrecorded',
},
"help_texts": {},
}
@@ -293,8 +541,6 @@ def plugins_list_view(request: HttpRequest, **kwargs) -> TableContext:
@render_with_item_view
def plugin_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
import json
assert is_superuser(request), 'Must be a superuser to view configuration settings.'
plugins = get_filesystem_plugins()
@@ -308,45 +554,61 @@ def plugin_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
)
# Base fields that all plugins have
docs_url = get_plugin_docs_url(plugin['name'])
machine_admin_url = get_machine_admin_url()
fields = {
"id": plugin['id'],
"name": plugin['name'],
"source": plugin['source'],
"path": plugin['path'],
"hooks": ', '.join(plugin['hooks']),
}
# Add config.json data if available
if plugin.get('config'):
config_json = json.dumps(plugin['config'], indent=2)
fields["config.json"] = mark_safe(
'<pre style="max-height: 600px; overflow-y: auto; background: #f5f5f5; '
f'padding: 10px; border-radius: 4px;"><code>{config_json}</code></pre>'
)
# Also extract and display individual config properties for easier viewing
if 'properties' in plugin['config']:
config_properties = plugin['config']['properties']
properties_summary = []
for prop_name, prop_info in config_properties.items():
prop_type = prop_info.get('type', 'unknown')
prop_desc = prop_info.get('description', '')
properties_summary.append(f"{prop_name} ({prop_type}): {prop_desc}")
if properties_summary:
fields["Config Properties"] = mark_safe('<br/>'.join(properties_summary))
section: SectionData = {
sections: list[SectionData] = [{
"name": plugin['name'],
"description": plugin['path'],
"description": format_html(
'<code>{}</code><br/><a href="{}" target="_blank" rel="noopener noreferrer">ABX Plugin Docs</a>',
plugin['path'],
docs_url,
),
"fields": fields,
"help_texts": {},
}
}]
if plugin['hooks']:
sections.append({
"name": "Hooks",
"description": mark_safe(render_hook_links_html(plugin['name'], plugin['hooks'], plugin['source'])),
"fields": {},
"help_texts": {},
})
if plugin.get('config'):
sections.append({
"name": "Plugin Metadata",
"description": mark_safe(render_plugin_metadata_html(plugin['config'])),
"fields": {},
"help_texts": {},
})
sections.append({
"name": "config.json",
"description": mark_safe(render_highlighted_json_block(plugin['config'])),
"fields": {},
"help_texts": {},
})
config_properties = plugin['config'].get('properties', {})
if config_properties:
sections.append({
"name": "Config Properties",
"description": mark_safe(render_config_properties_html(config_properties, machine_admin_url)),
"fields": {},
"help_texts": {},
})
return ItemContext(
slug=key,
title=plugin['name'],
data=[section],
data=sections,
)

View File

@@ -1,14 +1,23 @@
__package__ = 'archivebox.core'
import html
import json
import os
import shlex
from pathlib import Path
from urllib.parse import quote
from functools import reduce
from operator import and_
from django.contrib import admin
from django.db.models import Min, Q, TextField
from django.db.models.functions import Cast
from django.utils.html import format_html
from django.utils.safestring import mark_safe
from django.core.exceptions import ValidationError
from django.urls import reverse, resolve
from django.utils import timezone
from django.utils.text import smart_split
from archivebox.config import DATA_DIR
from archivebox.config.common import SERVER_CONFIG
@@ -16,11 +25,71 @@ from archivebox.misc.paginators import AccelleratedPaginator
from archivebox.base_models.admin import BaseModelAdmin
from archivebox.hooks import get_plugin_icon
from archivebox.core.host_utils import build_snapshot_url
from archivebox.core.widgets import InlineTagEditorWidget
from archivebox.core.views import LIVE_PLUGIN_BASE_URL
from archivebox.core.models import ArchiveResult, Snapshot
def _stringify_env_value(value) -> str:
if value is None:
return ''
if isinstance(value, str):
return value
return json.dumps(value, separators=(',', ':'))
def _quote_shell_string(value: str) -> str:
return "'" + str(value).replace("'", "'\"'\"'") + "'"
def _get_replay_source_url(result: ArchiveResult) -> str:
    """Prefer the SOURCE_URL recorded in the result's process env, falling back
    to the snapshot URL, always as a plain string ('' when neither exists)."""
    process = getattr(result, 'process', None)
    env = getattr(process, 'env', None) or {}
    return str(env.get('SOURCE_URL') or result.snapshot.url or '')
def build_abx_dl_display_command(result: ArchiveResult) -> str:
    """Human-readable `abx-dl` invocation for this result.

    Includes --plugins when a plugin or source URL is known, and the
    shell-quoted source URL when one is available.
    """
    source_url = _get_replay_source_url(result)
    plugin_name = str(result.plugin or '').strip()
    parts = ['abx-dl']
    if plugin_name or source_url:
        parts.append(f'--plugins={plugin_name}')
    if source_url:
        parts.append(_quote_shell_string(source_url))
    return ' '.join(parts)
def build_abx_dl_replay_command(result: ArchiveResult) -> str:
    """Copy-pasteable shell command that replays this result: cd into the
    snapshot dir, re-export the recorded process env (None values skipped,
    keys sorted), then run the display command."""
    base_cmd = build_abx_dl_display_command(result)
    env = getattr(getattr(result, 'process', None), 'env', None) or {}
    exported = ' '.join(
        f'{key}={shlex.quote(_stringify_env_value(val))}'
        for key, val in sorted(env.items())
        if val is not None
    )
    cwd = shlex.quote(str(result.snapshot_dir))
    if exported:
        return f'cd {cwd}; env {exported} {base_cmd}'
    return f'cd {cwd}; {base_cmd}'
def get_plugin_admin_url(plugin_name: str) -> str:
    """Resolve the live-admin URL for *plugin_name*, namespaced as builtin.* or user.*.

    Scans the known plugin dirs for a matching directory name; falls back to
    the builtin namespace when the plugin dir cannot be located or belongs to
    neither root.
    """
    # Imported lazily to avoid import cycles at module load time — TODO confirm.
    from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR, iter_plugin_dirs
    plugin_dir = next((path.resolve() for path in iter_plugin_dirs() if path.name == plugin_name), None)
    if plugin_dir:
        builtin_root = BUILTIN_PLUGINS_DIR.resolve()
        if plugin_dir.is_relative_to(builtin_root):
            return f'{LIVE_PLUGIN_BASE_URL}builtin.{quote(plugin_name)}/'
        user_root = USER_PLUGINS_DIR.resolve()
        if plugin_dir.is_relative_to(user_root):
            return f'{LIVE_PLUGIN_BASE_URL}user.{quote(plugin_name)}/'
    return f'{LIVE_PLUGIN_BASE_URL}builtin.{quote(plugin_name)}/'
def render_archiveresults_list(archiveresults_qs, limit=50):
"""Render a nice inline list view of archive results with status, plugin, output, and actions."""
@@ -35,6 +104,9 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
'failed': ('#991b1b', '#fee2e2'), # red
'queued': ('#6b7280', '#f3f4f6'), # gray
'started': ('#92400e', '#fef3c7'), # amber
'backoff': ('#92400e', '#fef3c7'),
'skipped': ('#475569', '#f1f5f9'),
'noresults': ('#475569', '#f1f5f9'),
}
rows = []
@@ -54,8 +126,10 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
if len(full_output) > 60:
output_display += '...'
# Get full command as tooltip
cmd_str = ' '.join(result.cmd) if isinstance(result.cmd, list) else str(result.cmd or '-')
display_cmd = build_abx_dl_display_command(result)
replay_cmd = build_abx_dl_replay_command(result)
cmd_str_escaped = html.escape(display_cmd)
cmd_attr = html.escape(replay_cmd, quote=True)
# Build output link - use embed_path() which checks output_files first
embed_path = result.embed_path() if hasattr(result, 'embed_path') else None
@@ -77,7 +151,7 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
<a href="{reverse('admin:core_archiveresult_change', args=[result.id])}"
style="color: #2563eb; text-decoration: none; font-family: ui-monospace, monospace; font-size: 11px;"
title="View/edit archive result">
<code>{str(result.id)[:8]}</code>
<code>{str(result.id)[-8:]}</code>
</a>
</td>
<td style="padding: 10px 12px; white-space: nowrap;">
@@ -140,7 +214,15 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
<div style="font-size: 11px; color: #64748b; margin-top: 8px;">
<b>Command:</b>
</div>
<pre style="margin: 0; padding: 8px; background: #1e293b; border-radius: 4px; color: #e2e8f0; font-size: 11px; white-space: pre-wrap; word-break: break-all;">{cmd_str}</pre>
<div style="position: relative; margin: 0; padding: 8px 56px 8px 8px; background: #1e293b; border-radius: 4px;">
<button type="button"
data-command="{cmd_attr}"
onclick="(function(btn){{var text=btn.dataset.command||''; if(navigator.clipboard&&navigator.clipboard.writeText){{navigator.clipboard.writeText(text);}} else {{var ta=document.createElement('textarea'); ta.value=text; document.body.appendChild(ta); ta.select(); document.execCommand('copy'); document.body.removeChild(ta);}}}})(this); return false;"
style="position: absolute; top: 6px; right: 6px; padding: 2px 8px; border: 0; border-radius: 4px; background: #334155; color: #e2e8f0; font-size: 11px; cursor: pointer;">
Copy
</button>
<code title="{cmd_attr}" style="display: block; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; color: #e2e8f0; font-size: 11px;">{cmd_str_escaped}</code>
</div>
</div>
</details>
</td>
@@ -165,7 +247,7 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
<table style="width: 100%; border-collapse: collapse; font-size: 14px;">
<thead>
<tr style="background: #f8fafc; border-bottom: 2px solid #e2e8f0;">
<th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">ID</th>
<th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Details</th>
<th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Status</th>
<th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; width: 32px;"></th>
<th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Plugin</th>
@@ -193,7 +275,7 @@ class ArchiveResultInline(admin.TabularInline):
extra = 0
sort_fields = ('end_ts', 'plugin', 'output_str', 'status', 'cmd_version')
readonly_fields = ('id', 'result_id', 'completed', 'command', 'version')
fields = ('start_ts', 'end_ts', *readonly_fields, 'plugin', 'cmd', 'cmd_version', 'pwd', 'status', 'retry_at', 'output_str')
fields = ('start_ts', 'end_ts', *readonly_fields, 'plugin', 'cmd', 'cmd_version', 'pwd', 'status', 'output_str')
# exclude = ('id',)
ordering = ('end_ts',)
show_change_link = True
@@ -259,10 +341,11 @@ class ArchiveResultInline(admin.TabularInline):
class ArchiveResultAdmin(BaseModelAdmin):
list_display = ('id', 'created_at', 'snapshot_info', 'tags_str', 'status', 'plugin_with_icon', 'cmd_str', 'output_str')
list_display = ('details_link', 'created_at', 'snapshot_info', 'tags_inline', 'status_badge', 'plugin_with_icon', 'process_link', 'machine_link', 'cmd_str', 'output_str_display')
list_display_links = None
sort_fields = ('id', 'created_at', 'plugin', 'status')
readonly_fields = ('cmd', 'cmd_version', 'pwd', 'cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'plugin_with_icon')
search_fields = ('id', 'snapshot__url', 'plugin', 'output_str', 'cmd_version', 'cmd', 'snapshot__timestamp')
readonly_fields = ('cmd', 'cmd_version', 'pwd', 'cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'plugin_with_icon', 'process_link')
search_fields = ()
autocomplete_fields = ['snapshot']
fieldsets = (
@@ -271,7 +354,7 @@ class ArchiveResultAdmin(BaseModelAdmin):
'classes': ('card', 'wide'),
}),
('Plugin', {
'fields': ('plugin', 'plugin_with_icon', 'status', 'retry_at'),
'fields': ('plugin_with_icon', 'process_link', 'status'),
'classes': ('card',),
}),
('Timing', {
@@ -305,8 +388,61 @@ class ArchiveResultAdmin(BaseModelAdmin):
self.request = request
return super().change_view(request, object_id, form_url, extra_context)
def get_queryset(self, request):
    """Changelist queryset with related rows prefetched to avoid N+1 queries.

    The `snapshot_first_tag` annotation supplies the ordering used by the
    `tags_inline` column.
    """
    return (
        super()
        .get_queryset(request)
        .select_related('snapshot', 'process')
        .prefetch_related('snapshot__tags')
        .annotate(snapshot_first_tag=Min('snapshot__tags__name'))
    )
def get_search_results(self, request, queryset, search_term):
    """Custom admin search across snapshot/crawl ids, URLs, tags, plugin,
    hook name, outputs, and the recorded process command.

    Returns (queryset, may_have_duplicates); tag joins can duplicate rows,
    hence the distinct() and the True flag.
    """
    if not search_term:
        return queryset, False
    # UUID/JSON columns are cast to text so icontains works on them.
    queryset = queryset.annotate(
        snapshot_id_text=Cast('snapshot__id', output_field=TextField()),
        snapshot_crawl_id_text=Cast('snapshot__crawl_id', output_field=TextField()),
        output_json_text=Cast('output_json', output_field=TextField()),
        cmd_text=Cast('process__cmd', output_field=TextField()),
    )
    # Split on whitespace respecting quotes, then strip matching surrounding quotes
    # so a quoted phrase is searched as one term.
    search_bits = [
        bit[1:-1] if len(bit) >= 2 and bit[0] == bit[-1] and bit[0] in {'"', "'"} else bit
        for bit in smart_split(search_term)
    ]
    search_bits = [bit.strip() for bit in search_bits if bit.strip()]
    if not search_bits:
        return queryset, False
    # Every term must match at least one field (AND of per-term OR filters).
    filters = []
    for bit in search_bits:
        filters.append(
            Q(snapshot_id_text__icontains=bit)
            | Q(snapshot__url__icontains=bit)
            | Q(snapshot__tags__name__icontains=bit)
            | Q(snapshot_crawl_id_text__icontains=bit)
            | Q(plugin__icontains=bit)
            | Q(hook_name__icontains=bit)
            | Q(output_str__icontains=bit)
            | Q(output_json_text__icontains=bit)
            | Q(cmd_text__icontains=bit)
        )
    return queryset.filter(reduce(and_, filters)).distinct(), True
@admin.display(description='Details', ordering='id')
def details_link(self, result):
    """Link to the result's change page, labeled with the last 8 chars of its id."""
    return format_html(
        '<a href="{}"><code>{}</code></a>',
        reverse('admin:core_archiveresult_change', args=[result.id]),
        str(result.id)[-8:],
    )
@admin.display(
description='Snapshot Info'
description='Snapshot',
ordering='snapshot__url',
)
def snapshot_info(self, result):
snapshot_id = str(result.snapshot_id)
@@ -325,20 +461,83 @@ class ArchiveResultAdmin(BaseModelAdmin):
def tags_str(self, result):
    """Delegate to the parent snapshot's tags_str() representation."""
    return result.snapshot.tags_str()
@admin.display(description='Tags', ordering='snapshot_first_tag')
def tags_inline(self, result):
    """Render a read-only inline tag editor for the result's snapshot tags.

    Ordering relies on the `snapshot_first_tag` annotation added in get_queryset.
    """
    widget = InlineTagEditorWidget(snapshot_id=str(result.snapshot_id), editable=False)
    tags_html = widget.render(
        name=f'tags_{result.snapshot_id}',
        value=result.snapshot.tags.all(),
        attrs={'id': f'tags_{result.snapshot_id}'},
        snapshot_id=str(result.snapshot_id),
    )
    return mark_safe(f'<span class="tags-inline-editor">{tags_html}</span>')
@admin.display(description='Status', ordering='status')
def status_badge(self, result):
    """Render the result status as a colored CSS badge (defaults to queued)."""
    current = result.status or ArchiveResult.StatusChoices.QUEUED
    label = result.get_status_display() or current
    return format_html(
        '<span class="status-badge {} status-{}">{}</span>',
        current,
        current,
        label,
    )
@admin.display(description='Plugin', ordering='plugin')
def plugin_with_icon(self, result):
icon = get_plugin_icon(result.plugin)
return format_html(
'<span title="{}">{}</span> {}',
'<a href="{}" title="{}">{}</a> <a href="{}"><code>{}</code></a>',
get_plugin_admin_url(result.plugin),
result.plugin,
icon,
get_plugin_admin_url(result.plugin),
result.plugin,
)
def cmd_str(self, result):
@admin.display(description='Process', ordering='process__pid')
def process_link(self, result):
if not result.process_id:
return '-'
process_label = result.process.pid if result.process and result.process.pid else '-'
return format_html(
'<pre>{}</pre>',
' '.join(result.cmd) if isinstance(result.cmd, list) else str(result.cmd),
'<a href="{}"><code>{}</code></a>',
reverse('admin:machine_process_change', args=[result.process_id]),
process_label,
)
@admin.display(description='Machine', ordering='process__machine__hostname')
def machine_link(self, result):
    """Link to the machine that ran this result's process, or '-' when unknown."""
    if not result.process_id or not result.process or not result.process.machine_id:
        return '-'
    machine = result.process.machine
    return format_html(
        '<a href="{}"><code>{}</code> {}</a>',
        reverse('admin:machine_machine_change', args=[machine.id]),
        str(machine.id)[:8],
        machine.hostname,
    )
@admin.display(description='Command')
def cmd_str(self, result):
    """Render the abx-dl command cell: a short display command with a Copy
    button that puts the full cd+env replay command on the clipboard.

    The onclick falls back to a temporary textarea + execCommand('copy') when
    the async Clipboard API is unavailable.
    """
    display_cmd = build_abx_dl_display_command(result)
    replay_cmd = build_abx_dl_replay_command(result)
    return format_html(
        '''
        <div style="position: relative; width: 300px; min-width: 300px; max-width: 300px; overflow: hidden; box-sizing: border-box;">
            <button type="button"
                    data-command="{}"
                    onclick="(function(btn){{var text=btn.dataset.command||''; if(navigator.clipboard&&navigator.clipboard.writeText){{navigator.clipboard.writeText(text);}} else {{var ta=document.createElement('textarea'); ta.value=text; document.body.appendChild(ta); ta.select(); document.execCommand('copy'); document.body.removeChild(ta);}}}})(this); return false;"
                    style="position: absolute; top: 6px; right: 6px; z-index: 1; padding: 2px 8px; border: 0; border-radius: 4px; background: #e2e8f0; color: #334155; font-size: 11px; cursor: pointer;">
                Copy
            </button>
            <code title="{}" style="display: block; width: 100%; max-width: 100%; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; padding: 8px 56px 8px 8px; background: #f8fafc; border: 1px solid #e2e8f0; border-radius: 6px; font-size: 11px; box-sizing: border-box;">
                {}
            </code>
        </div>
        ''',
        replay_cmd,
        replay_cmd,
        display_cmd,
    )
def output_display(self, result):
@@ -352,6 +551,27 @@ class ArchiveResultAdmin(BaseModelAdmin):
result.output_str,
)
@admin.display(description='Output', ordering='output_str')
def output_str_display(self, result):
    """Show the result output string, linked to its live embed path when one exists.

    Returns '-' for empty output.
    """
    output_text = str(result.output_str or '').strip()
    if not output_text:
        return '-'
    # embed_path() resolves the primary output file for this result, when available.
    live_path = result.embed_path() if hasattr(result, 'embed_path') else None
    if live_path:
        return format_html(
            '<a href="{}" title="{}"><code>{}</code></a>',
            build_snapshot_url(str(result.snapshot_id), live_path),
            output_text,
            output_text,
        )
    return format_html(
        '<span title="{}">{}</span>',
        output_text,
        output_text,
    )
def output_summary(self, result):
snapshot_dir = Path(DATA_DIR) / str(result.pwd).split('data/', 1)[-1]
output_html = format_html(

View File

@@ -61,12 +61,14 @@ def register_admin_site():
from archivebox.crawls.admin import register_admin as register_crawls_admin
from archivebox.api.admin import register_admin as register_api_admin
from archivebox.machine.admin import register_admin as register_machine_admin
from archivebox.personas.admin import register_admin as register_personas_admin
from archivebox.workers.admin import register_admin as register_workers_admin
register_core_admin(archivebox_admin)
register_crawls_admin(archivebox_admin)
register_api_admin(archivebox_admin)
register_machine_admin(archivebox_admin)
register_personas_admin(archivebox_admin)
register_workers_admin(archivebox_admin)
return archivebox_admin

View File

@@ -6,6 +6,7 @@ from pathlib import Path
from django.contrib import admin, messages
from django.urls import path
from django.shortcuts import get_object_or_404, redirect
from django.utils.html import format_html
from django.utils.safestring import mark_safe
from django.utils import timezone
@@ -14,6 +15,7 @@ from django.db.models.functions import Coalesce
from django import forms
from django.template import Template, RequestContext
from django.contrib.admin.helpers import ActionForm
from django.middleware.csrf import get_token
from archivebox.config import DATA_DIR
from archivebox.config.common import SERVER_CONFIG
@@ -24,7 +26,7 @@ from archivebox.search.admin import SearchResultsAdminMixin
from archivebox.core.host_utils import build_snapshot_url, build_web_url
from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin
from archivebox.workers.tasks import bg_archive_snapshots, bg_add
from archivebox.workers.tasks import bg_archive_snapshot, bg_archive_snapshots, bg_add
from archivebox.core.models import Tag, Snapshot, ArchiveResult
from archivebox.core.admin_archiveresults import render_archiveresults_list
@@ -215,10 +217,23 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
def get_urls(self):
urls = super().get_urls()
custom_urls = [
path('grid/', self.admin_site.admin_view(self.grid_view), name='grid')
path('grid/', self.admin_site.admin_view(self.grid_view), name='grid'),
path('<path:object_id>/redo-failed/', self.admin_site.admin_view(self.redo_failed_view), name='core_snapshot_redo_failed'),
]
return custom_urls + urls
def redo_failed_view(self, request, object_id):
snapshot = get_object_or_404(Snapshot, pk=object_id)
if request.method == 'POST':
queued = bg_archive_snapshot(snapshot, overwrite=False)
messages.success(
request,
f"Queued {queued} snapshot for re-archiving. The background runner will process it.",
)
return redirect(snapshot.admin_change_url)
# def get_queryset(self, request):
# # tags_qs = SnapshotTag.objects.all().select_related('tag')
# # prefetch = Prefetch('snapshottag_set', queryset=tags_qs)
@@ -312,6 +327,8 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
def admin_actions(self, obj):
summary_url = build_web_url(f'/{obj.archive_path}')
results_url = build_web_url(f'/{obj.archive_path}/index.html#all')
redo_failed_url = f'/admin/core/snapshot/{obj.pk}/redo-failed/'
csrf_token = get_token(self.request)
return format_html(
'''
<div style="display: flex; flex-wrap: wrap; gap: 12px; align-items: center;">
@@ -344,13 +361,15 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
onmouseout="this.style.background='#eff6ff';">
🆕 Archive Now
</a>
<a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #ecfdf5; border: 1px solid #a7f3d0; border-radius: 8px; color: #065f46; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
href="/admin/core/snapshot/?id__exact={}"
title="Redo failed extractors (missing outputs)"
onmouseover="this.style.background='#d1fae5';"
onmouseout="this.style.background='#ecfdf5';">
🔁 Redo Failed
</a>
<form action="{}" method="post" style="display: inline-flex; margin: 0;">
<input type="hidden" name="csrfmiddlewaretoken" value="{}">
<button type="submit" class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #ecfdf5; border: 1px solid #a7f3d0; border-radius: 8px; color: #065f46; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s; cursor: pointer;"
title="Redo failed extractors (missing outputs)"
onmouseover="this.style.background='#d1fae5';"
onmouseout="this.style.background='#ecfdf5';">
🔁 Redo Failed
</button>
</form>
<a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #fffbeb; border: 1px solid #fde68a; border-radius: 8px; color: #92400e; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
href="/admin/core/snapshot/?id__exact={}"
title="Re-run all extractors (overwrite existing)"
@@ -367,14 +386,15 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
</a>
</div>
<p style="margin-top: 12px; font-size: 12px; color: #64748b;">
<b>Tip:</b> Action buttons link to the list view with this snapshot pre-selected. Select it and use the action dropdown to execute.
<b>Tip:</b> Redo Failed runs immediately. The other action buttons link to the list view with this snapshot pre-selected.
</p>
''',
summary_url,
results_url,
obj.url,
obj.pk,
obj.pk,
redo_failed_url,
csrf_token,
obj.pk,
obj.pk,
)

View File

@@ -1,63 +1,74 @@
__package__ = 'archivebox.core'
from django.contrib import admin
from urllib.parse import quote
from django import forms
from django.contrib import admin, messages
from django.contrib.admin.options import IS_POPUP_VAR
from django.http import HttpRequest, HttpResponseRedirect
from django.urls import reverse
from django.utils.html import format_html
from django.utils.safestring import mark_safe
from archivebox.misc.paginators import AccelleratedPaginator
from archivebox.base_models.admin import BaseModelAdmin
from archivebox.core.models import SnapshotTag, Tag
from archivebox.core.tag_utils import (
TAG_HAS_SNAPSHOTS_CHOICES,
TAG_SORT_CHOICES,
build_tag_cards,
get_tag_creator_choices,
get_tag_year_choices,
normalize_created_by_filter,
normalize_created_year_filter,
normalize_has_snapshots_filter,
normalize_tag_sort,
)
from archivebox.core.host_utils import build_snapshot_url
class TagInline(admin.TabularInline):
model = SnapshotTag
# fk_name = 'snapshot'
fields = ('id', 'tag')
extra = 1
# min_num = 1
max_num = 1000
autocomplete_fields = (
'tag',
)
# class AutocompleteTags:
# model = Tag
# search_fields = ['name']
# name = 'name'
# # source_field = 'name'
# remote_field = Tag._meta.get_field('name')
# class AutocompleteTagsAdminStub:
# name = 'admin'
# class TaggedItemInline(admin.TabularInline):
# readonly_fields = ('object_link',)
# fields = ('id', 'tag', 'content_type', 'object_id', *readonly_fields)
# model = TaggedItem
# extra = 1
# show_change_link = True
# @admin.display(description='object')
# def object_link(self, obj):
# obj = obj.content_type.get_object_for_this_type(pk=obj.object_id)
# return format_html('<a href="/admin/{}/{}/{}/change"><b>[{}]</b></a>', obj._meta.app_label, obj._meta.model_name, obj.pk, str(obj))
class TagAdminForm(forms.ModelForm):
class Meta:
model = Tag
fields = '__all__'
widgets = {
'name': forms.TextInput(attrs={
'placeholder': 'research, receipts, product-design...',
'autocomplete': 'off',
'spellcheck': 'false',
'data-tag-name-input': '1',
}),
}
def clean_name(self):
name = (self.cleaned_data.get('name') or '').strip()
if not name:
raise forms.ValidationError('Tag name is required.')
return name
class TagAdmin(BaseModelAdmin):
list_display = ('created_at', 'created_by', 'id', 'name', 'num_snapshots', 'snapshots')
form = TagAdminForm
change_list_template = 'admin/core/tag/change_list.html'
change_form_template = 'admin/core/tag/change_form.html'
list_display = ('name', 'num_snapshots', 'created_at', 'created_by')
list_filter = ('created_at', 'created_by')
sort_fields = ('name', 'slug', 'id', 'created_by', 'created_at')
readonly_fields = ('slug', 'id', 'created_at', 'modified_at', 'snapshots')
search_fields = ('id', 'name', 'slug')
actions = ['delete_selected', 'merge_tags']
ordering = ['-created_at']
# inlines = [TaggedItemInline]
readonly_fields = ('slug', 'id', 'created_at', 'modified_at', 'snapshots')
actions = ['delete_selected']
ordering = ['name', 'id']
fieldsets = (
('Tag Info', {
('Tag', {
'fields': ('name', 'slug'),
'classes': ('card',),
}),
@@ -65,112 +76,137 @@ class TagAdmin(BaseModelAdmin):
'fields': ('id', 'created_by', 'created_at', 'modified_at'),
'classes': ('card',),
}),
('Snapshots', {
('Recent Snapshots', {
'fields': ('snapshots',),
'classes': ('card', 'wide'),
}),
)
paginator = AccelleratedPaginator
add_fieldsets = (
('Tag', {
'fields': ('name',),
'classes': ('card', 'wide'),
}),
('Metadata', {
'fields': ('created_by',),
'classes': ('card',),
}),
)
def get_fieldsets(self, request: HttpRequest, obj: Tag | None = None):
return self.fieldsets if obj else self.add_fieldsets
def num_snapshots(self, tag):
def changelist_view(self, request: HttpRequest, extra_context=None):
query = (request.GET.get('q') or '').strip()
sort = normalize_tag_sort((request.GET.get('sort') or 'created_desc').strip())
created_by = normalize_created_by_filter((request.GET.get('created_by') or '').strip())
year = normalize_created_year_filter((request.GET.get('year') or '').strip())
has_snapshots = normalize_has_snapshots_filter((request.GET.get('has_snapshots') or 'all').strip())
extra_context = {
**(extra_context or {}),
'initial_query': query,
'initial_sort': sort,
'initial_created_by': created_by,
'initial_year': year,
'initial_has_snapshots': has_snapshots,
'tag_sort_choices': TAG_SORT_CHOICES,
'tag_has_snapshots_choices': TAG_HAS_SNAPSHOTS_CHOICES,
'tag_created_by_choices': get_tag_creator_choices(),
'tag_year_choices': get_tag_year_choices(),
'initial_tag_cards': build_tag_cards(
query=query,
request=request,
sort=sort,
created_by=created_by,
year=year,
has_snapshots=has_snapshots,
),
'tag_search_api_url': reverse('api-1:search_tags'),
'tag_create_api_url': reverse('api-1:tags_create'),
}
return super().changelist_view(request, extra_context=extra_context)
def render_change_form(self, request, context, add=False, change=False, form_url='', obj=None):
current_name = (request.POST.get('name') or '').strip()
if not current_name and obj:
current_name = obj.name
similar_tag_cards = build_tag_cards(query=current_name, request=request, limit=12) if current_name else build_tag_cards(request=request, limit=12)
if obj:
similar_tag_cards = [card for card in similar_tag_cards if card['id'] != obj.pk]
context.update({
'tag_search_api_url': reverse('api-1:search_tags'),
'tag_similar_cards': similar_tag_cards,
'tag_similar_query': current_name,
})
return super().render_change_form(request, context, add=add, change=change, form_url=form_url, obj=obj)
def response_add(self, request: HttpRequest, obj: Tag, post_url_continue=None):
if IS_POPUP_VAR in request.POST or '_continue' in request.POST or '_addanother' in request.POST:
return super().response_add(request, obj, post_url_continue=post_url_continue)
self.message_user(request, f'Tag "{obj.name}" saved.', level=messages.SUCCESS)
return self._redirect_to_changelist(obj.name)
def response_change(self, request: HttpRequest, obj: Tag):
if IS_POPUP_VAR in request.POST or '_continue' in request.POST or '_addanother' in request.POST or '_saveasnew' in request.POST:
return super().response_change(request, obj)
self.message_user(request, f'Tag "{obj.name}" updated.', level=messages.SUCCESS)
return self._redirect_to_changelist(obj.name)
def _redirect_to_changelist(self, query: str = '') -> HttpResponseRedirect:
changelist_url = reverse('admin:core_tag_changelist')
if query:
changelist_url = f'{changelist_url}?q={quote(query)}'
return HttpResponseRedirect(changelist_url)
@admin.display(description='Snapshots')
def snapshots(self, tag: Tag):
snapshots = tag.snapshot_set.select_related('crawl__created_by').order_by('-downloaded_at', '-created_at', '-pk')[:10]
total_count = tag.snapshot_set.count()
if not snapshots:
return mark_safe(
f'<p style="margin:0;color:#64748b;">No snapshots use this tag yet. '
f'<a href="/admin/core/snapshot/?tags__id__exact={tag.id}">Open filtered snapshot list</a>.</p>'
)
cards = []
for snapshot in snapshots:
title = (snapshot.title or '').strip() or snapshot.url
cards.append(format_html(
'''
<a href="{}" style="display:flex;align-items:center;gap:10px;padding:10px 12px;border:1px solid #e2e8f0;border-radius:12px;background:#fff;text-decoration:none;color:#0f172a;">
<img src="{}" alt="" style="width:18px;height:18px;border-radius:4px;flex:0 0 auto;" onerror="this.style.display='none'">
<span style="min-width:0;">
<strong style="display:block;white-space:nowrap;overflow:hidden;text-overflow:ellipsis;">{}</strong>
<code style="display:block;color:#64748b;white-space:nowrap;overflow:hidden;text-overflow:ellipsis;">{}</code>
</span>
</a>
''',
reverse('admin:core_snapshot_change', args=[snapshot.pk]),
build_snapshot_url(str(snapshot.pk), 'favicon.ico'),
title[:120],
snapshot.url[:120],
))
cards.append(format_html(
'<a href="/admin/core/snapshot/?tags__id__exact={}" style="display:inline-flex;margin-top:10px;font-weight:600;">View all {} tagged snapshots</a>',
tag.id,
total_count,
))
return mark_safe('<div style="display:grid;gap:10px;">' + ''.join(cards) + '</div>')
@admin.display(description='Snapshots', ordering='num_snapshots')
def num_snapshots(self, tag: Tag):
count = getattr(tag, 'num_snapshots', tag.snapshot_set.count())
return format_html(
'<a href="/admin/core/snapshot/?tags__id__exact={}">{} total</a>',
tag.id,
tag.snapshot_set.count(),
count,
)
def snapshots(self, tag):
total_count = tag.snapshot_set.count()
return mark_safe('<br/>'.join(
format_html(
'<code><a href="/admin/core/snapshot/{}/change"><b>[{}]</b></a></code> {}',
snap.pk,
snap.downloaded_at.strftime('%Y-%m-%d %H:%M') if snap.downloaded_at else 'pending...',
snap.url[:64],
)
for snap in tag.snapshot_set.order_by('-downloaded_at')[:10]
) + (f'<br/><a href="/admin/core/snapshot/?tags__id__exact={tag.id}">{total_count} total snapshots...<a>'))
# def get_urls(self):
# urls = super().get_urls()
# custom_urls = [
# path(
# "merge-tags/",
# self.admin_site.admin_view(self.merge_tags_view),
# name="taggit_tag_merge_tags",
# ),
# ]
# return custom_urls + urls
# @admin.action(description="Merge selected tags")
# def merge_tags(self, request, queryset):
# selected = request.POST.getlist(admin.helpers.ACTION_CHECKBOX_NAME)
# if not selected:
# self.message_user(request, "Please select at least one tag.")
# return redirect(request.get_full_path())
# selected_tag_ids = ",".join(selected)
# redirect_url = f"{request.get_full_path()}merge-tags/"
# request.session["selected_tag_ids"] = selected_tag_ids
# return redirect(redirect_url)
# def merge_tags_view(self, request):
# selected_tag_ids = request.session.get("selected_tag_ids", "").split(",")
# if request.method == "POST":
# form = MergeTagsForm(request.POST)
# if form.is_valid():
# new_tag_name = form.cleaned_data["new_tag_name"]
# new_tag, created = Tag.objects.get_or_create(name=new_tag_name)
# with transaction.atomic():
# for tag_id in selected_tag_ids:
# tag = Tag.objects.get(id=tag_id)
# tagged_items = TaggedItem.objects.filter(tag=tag)
# for tagged_item in tagged_items:
# if TaggedItem.objects.filter(
# tag=new_tag,
# content_type=tagged_item.content_type,
# object_id=tagged_item.object_id,
# ).exists():
# # we have the new tag as well, so we can just
# # remove the tag association
# tagged_item.delete()
# else:
# # point this taggedItem to the new one
# tagged_item.tag = new_tag
# tagged_item.save()
# # delete the old tag
# if tag.id != new_tag.id:
# tag.delete()
# self.message_user(request, "Tags have been merged", level="success")
# # clear the selected_tag_ids from session after merge is complete
# request.session.pop("selected_tag_ids", None)
# return redirect("..")
# else:
# self.message_user(request, "Form is invalid.", level="error")
# context = {
# "form": MergeTagsForm(),
# "selected_tag_ids": selected_tag_ids,
# }
# return render(request, "admin/taggit/merge_tags_form.html", context)
# @admin.register(SnapshotTag, site=archivebox_admin)
# class SnapshotTagAdmin(BaseModelAdmin):
# list_display = ('id', 'snapshot', 'tag')
# sort_fields = ('id', 'snapshot', 'tag')
# search_fields = ('id', 'snapshot_id', 'tag_id')
# fields = ('snapshot', 'id')
# actions = ['delete_selected']
# ordering = ['-id']
def register_admin(admin_site):
admin_site.register(Tag, TagAdmin)

View File

@@ -1,12 +1,16 @@
__package__ = 'archivebox.core'
from django import forms
from django.utils.html import format_html
from archivebox.misc.util import URL_REGEX
from archivebox.misc.util import URL_REGEX, find_all_urls
from taggit.utils import edit_string_for_tags, parse_tags
from archivebox.base_models.admin import KeyValueWidget
from archivebox.crawls.schedule_utils import validate_schedule
from archivebox.hooks import get_plugins
from archivebox.config.common import SEARCH_BACKEND_CONFIG
from archivebox.core.widgets import TagEditorWidget, URLFiltersWidget
from archivebox.hooks import get_plugins, discover_plugin_configs, get_plugin_icon
from archivebox.personas.models import Persona
DEPTH_CHOICES = (
('0', 'depth = 0 (archive just these URLs)'),
@@ -22,6 +26,22 @@ def get_plugin_choices():
return [(name, name) for name in get_plugins()]
def get_plugin_choice_label(plugin_name: str, plugin_configs: dict[str, dict]) -> str:
schema = plugin_configs.get(plugin_name, {})
description = str(schema.get('description') or '').strip()
if not description:
return plugin_name
icon_html = get_plugin_icon(plugin_name)
return format_html(
'<span class="plugin-choice-icon">{}</span><span class="plugin-choice-name">{}</span><a class="plugin-choice-description" href="https://archivebox.github.io/abx-plugins/#{}" target="_blank" rel="noopener noreferrer">{}</a>',
icon_html,
plugin_name,
plugin_name,
description,
)
def get_choice_field(form: forms.Form, name: str) -> forms.ChoiceField:
field = form.fields[name]
if not isinstance(field, forms.ChoiceField):
@@ -31,22 +51,19 @@ def get_choice_field(form: forms.Form, name: str) -> forms.ChoiceField:
class AddLinkForm(forms.Form):
# Basic fields
url = forms.RegexField(
label="URLs (one per line)",
regex=URL_REGEX,
min_length=6,
url = forms.CharField(
label="URLs",
strip=True,
widget=forms.Textarea,
widget=forms.Textarea(attrs={
'data-url-regex': URL_REGEX.pattern,
}),
required=True
)
tag = forms.CharField(
label="Tags (comma separated tag1,tag2,tag3)",
label="Tags",
strip=True,
required=False,
widget=forms.TextInput(attrs={
'list': 'tag-datalist',
'autocomplete': 'off',
})
widget=TagEditorWidget(),
)
depth = forms.ChoiceField(
label="Archive depth",
@@ -58,11 +75,15 @@ class AddLinkForm(forms.Form):
label="Notes",
strip=True,
required=False,
widget=forms.Textarea(attrs={
'rows': 3,
'placeholder': 'Optional notes about this crawl (e.g., purpose, project name, context...)',
widget=forms.TextInput(attrs={
'placeholder': 'Optional notes about this crawl',
})
)
url_filters = forms.Field(
label="URL allowlist / denylist",
required=False,
widget=URLFiltersWidget(source_selector='textarea[name="url"]'),
)
# Plugin groups
chrome_plugins = forms.MultipleChoiceField(
@@ -111,24 +132,15 @@ class AddLinkForm(forms.Form):
'placeholder': 'e.g., daily, weekly, 0 */6 * * * (every 6 hours)',
})
)
persona = forms.CharField(
persona = forms.ModelChoiceField(
label="Persona (authentication profile)",
max_length=100,
initial='Default',
required=False,
)
overwrite = forms.BooleanField(
label="Overwrite existing snapshots",
initial=False,
required=False,
)
update = forms.BooleanField(
label="Update/retry previously failed URLs",
initial=False,
required=False,
queryset=Persona.objects.none(),
empty_label=None,
to_field_name='name',
)
index_only = forms.BooleanField(
label="Index only (don't archive yet)",
label="Index only dry run (add crawl but don't archive yet)",
initial=False,
required=False,
)
@@ -142,11 +154,13 @@ class AddLinkForm(forms.Form):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# Import at runtime to avoid circular imports
from archivebox.config.common import ARCHIVING_CONFIG
default_persona = Persona.get_or_create_default()
self.fields['persona'].queryset = Persona.objects.order_by('name')
self.fields['persona'].initial = default_persona.name
# Get all plugins
all_plugins = get_plugins()
plugin_configs = discover_plugin_configs()
# Define plugin groups
chrome_dependent = {
@@ -170,26 +184,28 @@ class AddLinkForm(forms.Form):
# Populate plugin field choices
get_choice_field(self, 'chrome_plugins').choices = [
(p, p) for p in sorted(all_plugins) if p in chrome_dependent
(p, get_plugin_choice_label(p, plugin_configs)) for p in sorted(all_plugins) if p in chrome_dependent
]
get_choice_field(self, 'archiving_plugins').choices = [
(p, p) for p in sorted(all_plugins) if p in archiving
(p, get_plugin_choice_label(p, plugin_configs)) for p in sorted(all_plugins) if p in archiving
]
get_choice_field(self, 'parsing_plugins').choices = [
(p, p) for p in sorted(all_plugins) if p in parsing
(p, get_plugin_choice_label(p, plugin_configs)) for p in sorted(all_plugins) if p in parsing
]
get_choice_field(self, 'search_plugins').choices = [
(p, p) for p in sorted(all_plugins) if p in search
(p, get_plugin_choice_label(p, plugin_configs)) for p in sorted(all_plugins) if p in search
]
get_choice_field(self, 'binary_plugins').choices = [
(p, p) for p in sorted(all_plugins) if p in binary
(p, get_plugin_choice_label(p, plugin_configs)) for p in sorted(all_plugins) if p in binary
]
get_choice_field(self, 'extension_plugins').choices = [
(p, p) for p in sorted(all_plugins) if p in extensions
(p, get_plugin_choice_label(p, plugin_configs)) for p in sorted(all_plugins) if p in extensions
]
# Set update default from config
self.fields['update'].initial = not ARCHIVING_CONFIG.ONLY_NEW
required_search_plugin = f'search_backend_{SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE}'.strip()
search_choices = [choice[0] for choice in get_choice_field(self, 'search_plugins').choices]
if required_search_plugin in search_choices:
get_choice_field(self, 'search_plugins').initial = [required_search_plugin]
def clean(self):
cleaned_data = super().clean() or {}
@@ -207,6 +223,23 @@ class AddLinkForm(forms.Form):
return cleaned_data
def clean_url(self):
value = self.cleaned_data.get('url') or ''
urls = '\n'.join(find_all_urls(value))
if not urls:
raise forms.ValidationError('Enter at least one valid URL.')
return urls
def clean_url_filters(self):
from archivebox.crawls.models import Crawl
value = self.cleaned_data.get('url_filters') or {}
return {
'allowlist': '\n'.join(Crawl.split_filter_patterns(value.get('allowlist', ''))),
'denylist': '\n'.join(Crawl.split_filter_patterns(value.get('denylist', ''))),
'same_domain_only': bool(value.get('same_domain_only')),
}
def clean_schedule(self):
schedule = (self.cleaned_data.get('schedule') or '').strip()
if not schedule:

View File

@@ -163,6 +163,10 @@ def get_api_base_url(request=None) -> str:
return _build_base_url_for_host(get_api_host(), request=request)
def get_public_base_url(request=None) -> str:
return _build_base_url_for_host(get_public_host(), request=request)
# Backwards-compat aliases (archive == web)
def get_archive_base_url(request=None) -> str:
return get_web_base_url(request=request)

View File

@@ -0,0 +1,15 @@
from django.db import migrations
class Migration(migrations.Migration):
dependencies = [
("core", "0031_add_archiveresult_snapshot_status_index"),
]
operations = [
migrations.RemoveField(
model_name="archiveresult",
name="retry_at",
),
]

View File

@@ -36,7 +36,7 @@ from archivebox.base_models.models import (
from archivebox.workers.models import ModelWithStateMachine, BaseStateMachine
from archivebox.workers.tasks import bg_archive_snapshot
from archivebox.crawls.models import Crawl
from archivebox.machine.models import NetworkInterface, Binary
from archivebox.machine.models import Binary
@@ -60,32 +60,41 @@ class Tag(ModelWithUUID):
def __str__(self):
return self.name
def _generate_unique_slug(self) -> str:
base_slug = slugify(self.name) or 'tag'
existing = Tag.objects.filter(slug__startswith=base_slug)
if self.pk:
existing = existing.exclude(pk=self.pk)
existing_slugs = set(existing.values_list("slug", flat=True))
slug = base_slug
i = 1
while slug in existing_slugs:
slug = f"{base_slug}_{i}"
i += 1
return slug
def save(self, *args, **kwargs):
is_new = self._state.adding
if is_new:
self.slug = slugify(self.name)
existing = set(Tag.objects.filter(slug__startswith=self.slug).values_list("slug", flat=True))
i = None
while True:
slug = f"{slugify(self.name)}_{i}" if i else slugify(self.name)
if slug not in existing:
self.slug = slug
break
i = (i or 0) + 1
existing_name = None
if self.pk:
existing_name = Tag.objects.filter(pk=self.pk).values_list('name', flat=True).first()
if not self.slug or existing_name != self.name:
self.slug = self._generate_unique_slug()
super().save(*args, **kwargs)
if is_new:
from archivebox.misc.logging_util import log_worker_event
log_worker_event(
worker_type='DB',
event='Created Tag',
indent_level=0,
metadata={
'id': self.id,
'name': self.name,
'slug': self.slug,
},
)
# if is_new:
# from archivebox.misc.logging_util import log_worker_event
# log_worker_event(
# worker_type='DB',
# event='Created Tag',
# indent_level=0,
# metadata={
# 'id': self.id,
# 'name': self.name,
# 'slug': self.slug,
# },
# )
@property
def api_url(self) -> str:
@@ -364,7 +373,6 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
return Binary.objects.filter(process_set__archiveresult__snapshot_id=self.id).distinct()
def save(self, *args, **kwargs):
is_new = self._state.adding
if not self.bookmarked_at:
self.bookmarked_at = self.created_at or timezone.now()
if not self.timestamp:
@@ -393,24 +401,25 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
super().save(*args, **kwargs)
self.ensure_legacy_archive_symlink()
if self.url not in self.crawl.urls:
existing_urls = {url for _raw_line, url in self.crawl._iter_url_lines() if url}
if self.crawl.url_passes_filters(self.url, snapshot=self) and self.url not in existing_urls:
self.crawl.urls += f'\n{self.url}'
self.crawl.save()
if is_new:
from archivebox.misc.logging_util import log_worker_event
log_worker_event(
worker_type='DB',
event='Created Snapshot',
indent_level=2,
url=self.url,
metadata={
'id': str(self.id),
'crawl_id': str(self.crawl_id),
'depth': self.depth,
'status': self.status,
},
)
# if is_new:
# from archivebox.misc.logging_util import log_worker_event
# log_worker_event(
# worker_type='DB',
# event='Created Snapshot',
# indent_level=2,
# url=self.url,
# metadata={
# 'id': str(self.id),
# 'crawl_id': str(self.crawl_id),
# 'depth': self.depth,
# 'status': self.status,
# },
# )
# =========================================================================
# Filesystem Migration Methods
@@ -1528,16 +1537,6 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
"""
Execute snapshot by creating pending ArchiveResults for all enabled hooks.
Called by: SnapshotMachine.enter_started()
Hook Lifecycle:
1. discover_hooks('Snapshot') → finds all plugin hooks
2. For each hook:
- Create ArchiveResult with status=QUEUED
- Store hook_name (e.g., 'on_Snapshot__50_wget.py')
3. ArchiveResults execute independently via ArchiveResultMachine
4. Hook execution happens in ArchiveResult.run(), NOT here
Returns:
list[ArchiveResult]: Newly created pending results
"""
@@ -1602,7 +1601,6 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
'url': self.url,
'title': self.title,
'tags': self.tags_str(),
'tags_str': self.tags_str(),
'bookmarked_at': self.bookmarked_at.isoformat() if self.bookmarked_at else None,
'created_at': self.created_at.isoformat() if self.created_at else None,
'timestamp': self.timestamp,
@@ -1672,7 +1670,9 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
# ID not found, fall through to create-by-URL logic
pass
url = record.get('url')
from archivebox.misc.util import fix_url_from_markdown
url = fix_url_from_markdown(str(record.get('url') or '').strip())
if not url:
return None
@@ -1807,7 +1807,6 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
defaults={
'plugin': plugin,
'status': ArchiveResult.INITIAL_STATE,
'retry_at': timezone.now(),
},
)
if archiveresult.status == ArchiveResult.INITIAL_STATE:
@@ -1853,11 +1852,12 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
failed = results.filter(status='failed').count()
running = results.filter(status='started').count()
skipped = results.filter(status='skipped').count()
noresults = results.filter(status='noresults').count()
total = results.count()
pending = total - succeeded - failed - running - skipped
pending = total - succeeded - failed - running - skipped - noresults
# Calculate percentage (succeeded + failed + skipped as completed)
completed = succeeded + failed + skipped
# Calculate percentage (succeeded + failed + skipped + noresults as completed)
completed = succeeded + failed + skipped + noresults
percent = int((completed / total * 100) if total > 0 else 0)
# Sum output sizes
@@ -1875,47 +1875,38 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
'running': running,
'pending': pending,
'skipped': skipped,
'noresults': noresults,
'percent': percent,
'output_size': output_size,
'is_sealed': is_sealed,
}
def retry_failed_archiveresults(self, retry_at: Optional[datetime] = None) -> int:
def retry_failed_archiveresults(self) -> int:
"""
Reset failed/skipped ArchiveResults to queued for retry.
This enables seamless retry of the entire extraction pipeline:
- Resets FAILED and SKIPPED results to QUEUED
- Sets retry_at so workers pick them up
- Plugins run in order (numeric prefix)
- Each plugin checks its dependencies at runtime
Dependency handling (e.g., chrome → screenshot):
- Plugins check if required outputs exist before running
- If dependency output missing → plugin returns 'skipped'
- On retry, if dependency now succeeds → dependent can run
Returns count of ArchiveResults reset.
"""
retry_at = retry_at or timezone.now()
count = self.archiveresult_set.filter(
status__in=[
ArchiveResult.StatusChoices.FAILED,
ArchiveResult.StatusChoices.SKIPPED,
ArchiveResult.StatusChoices.NORESULTS,
]
).update(
status=ArchiveResult.StatusChoices.QUEUED,
retry_at=retry_at,
output=None,
output_str='',
output_json=None,
output_files={},
output_size=0,
output_mimetypes='',
start_ts=None,
end_ts=None,
)
# Also reset the snapshot and current_step so it gets re-checked from the beginning
if count > 0:
self.status = self.StatusChoices.STARTED
self.retry_at = retry_at
self.retry_at = timezone.now()
self.current_step = 0 # Reset to step 0 for retry
self.save(update_fields=['status', 'retry_at', 'current_step', 'modified_at'])
@@ -2228,6 +2219,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
best_result = outputs[0]
context = {
**self.to_dict(extended=True),
'snapshot': self,
'title': htmlencode(self.title or (self.base_url if self.is_archived else TITLE_LOADING_MSG)),
'url_str': htmlencode(urldecode(self.base_url)),
'archive_url': urlencode(f'warc/{self.timestamp}' or (self.domain if self.is_archived else '')) or 'about:blank',
@@ -2275,8 +2267,8 @@ class SnapshotMachine(BaseStateMachine):
│ • discover_hooks('Snapshot') → finds all plugin hooks │
│ • create_pending_archiveresults() → creates ONE │
│ ArchiveResult per hook (NO execution yet) │
│ 2. ArchiveResults process independently with their own
state machines (see ArchiveResultMachine)
│ 2. The shared abx-dl runner executes hooks and the
projector updates ArchiveResult rows from events
│ 3. Advance through steps 0-9 as foreground hooks complete │
└─────────────────────────────────────────────────────────────┘
↓ tick() when is_finished()
@@ -2358,7 +2350,7 @@ class SnapshotMachine(BaseStateMachine):
cast(Any, crawl).sm.seal()
class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithStateMachine):
class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes):
class StatusChoices(models.TextChoices):
QUEUED = 'queued', 'Queued'
STARTED = 'started', 'Started'
@@ -2366,6 +2358,17 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
SUCCEEDED = 'succeeded', 'Succeeded'
FAILED = 'failed', 'Failed'
SKIPPED = 'skipped', 'Skipped'
NORESULTS = 'noresults', 'No Results'
INITIAL_STATE = StatusChoices.QUEUED
ACTIVE_STATE = StatusChoices.STARTED
FINAL_STATES = (
StatusChoices.SUCCEEDED,
StatusChoices.FAILED,
StatusChoices.SKIPPED,
StatusChoices.NORESULTS,
)
FINAL_OR_ACTIVE_STATES = (*FINAL_STATES, ACTIVE_STATE)
@classmethod
def get_plugin_choices(cls):
@@ -2404,16 +2407,10 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
start_ts = models.DateTimeField(default=None, null=True, blank=True)
end_ts = models.DateTimeField(default=None, null=True, blank=True)
status = ModelWithStateMachine.StatusField(choices=StatusChoices.choices, default=StatusChoices.QUEUED)
retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
status = models.CharField(max_length=16, choices=StatusChoices.choices, default=StatusChoices.QUEUED, db_index=True)
notes = models.TextField(blank=True, null=False, default='')
# output_dir is computed via @property from snapshot.output_dir / plugin
state_machine_name = 'archivebox.core.models.ArchiveResultMachine'
retry_at_field_name = 'retry_at'
state_field_name = 'status'
active_state = StatusChoices.STARTED
snapshot_id: uuid.UUID
process_id: uuid.UUID | None
@@ -2421,7 +2418,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
ModelWithOutputDir.Meta,
ModelWithConfig.Meta,
ModelWithNotes.Meta,
ModelWithStateMachine.Meta,
):
app_label = 'core'
verbose_name = 'Archive Result'
@@ -2516,40 +2512,24 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
return None
def save(self, *args, **kwargs):
    """Persist this ArchiveResult, lazily creating its backing Process row on first save.

    On the first save (``self._state.adding``) with no process attached yet, a
    queued Process row is created pointing at the plugin's output directory;
    its ``cmd`` is left empty here and filled in later by run().
    """
    is_new = self._state.adding
    # Create Process record if this is a new ArchiveResult and no process exists yet
    if is_new and not self.process_id:
        from archivebox.machine.models import Process, Machine
        process = Process.objects.create(
            machine=Machine.current(),
            pwd=str(Path(self.snapshot.output_dir) / self.plugin),
            cmd=[],  # Will be set by run()
            status='queued',
            timeout=120,
            env={},
        )
        self.process = process
    # Skip ModelWithOutputDir.save() to avoid creating index.json in plugin directories
    # Call the Django Model.save() directly instead
    models.Model.save(self, *args, **kwargs)
    if is_new:
        from archivebox.misc.logging_util import log_worker_event
        log_worker_event(
            worker_type='DB',
            event='Created ArchiveResult',
            indent_level=3,
            plugin=self.plugin,
            metadata={
                'id': str(self.id),
                'snapshot_id': str(self.snapshot_id),
                'snapshot_url': str(self.snapshot.url)[:64],
                'status': self.status,
            },
        )
    # NOTE(review): the commented block below duplicates the active logging call
    # above — looks like merge/diff residue; one of the two should be removed.
    # if is_new:
    #     from archivebox.misc.logging_util import log_worker_event
    #     log_worker_event(
    #         worker_type='DB',
    #         event='Created ArchiveResult',
    #         indent_level=3,
    #         plugin=self.plugin,
    #         metadata={
    #             'id': str(self.id),
    #             'snapshot_id': str(self.snapshot_id),
    #             'snapshot_url': str(self.snapshot.url)[:64],
    #             'status': self.status,
    #         },
    #     )
@cached_property
def snapshot_dir(self):
@@ -2566,6 +2546,28 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
def get_absolute_url(self):
return f'/{self.snapshot.archive_path}/{self.plugin}'
def reset_for_retry(self, *, save: bool = True) -> None:
    """Return this ArchiveResult to QUEUED and wipe all output/timing fields.

    Args:
        save: when True (default), immediately persist the cleared fields
            (plus modified_at) via update_fields; when False, only mutate
            the in-memory instance.
    """
    cleared_fields = {
        'status': self.StatusChoices.QUEUED,
        'output_str': '',
        'output_json': None,
        'output_files': {},
        'output_size': 0,
        'output_mimetypes': '',
        'start_ts': None,
        'end_ts': None,
    }
    for field_name, blank_value in cleared_fields.items():
        setattr(self, field_name, blank_value)
    if save:
        self.save(update_fields=[*cleared_fields, 'modified_at'])
@property
def plugin_module(self) -> Any | None:
# Hook scripts are now used instead of Python plugin modules
@@ -2723,11 +2725,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
return None
def create_output_dir(self):
    """Ensure the plugin's output directory exists under the snapshot dir and return it."""
    target = Path(self.snapshot_dir) / self.plugin
    target.mkdir(parents=True, exist_ok=True)
    return target
@property
def output_dir_name(self) -> str:
    """Subdirectory name (relative to the snapshot dir) holding this result's output — the plugin name."""
    return self.plugin
@@ -2782,134 +2779,17 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
def save_search_index(self):
pass
def cascade_health_update(self, success: bool):
    """Update health stats for parent Snapshot, Crawl, and execution infrastructure (Binary, Machine, NetworkInterface)."""
    # Archival hierarchy first: snapshot, then its crawl.
    snap = self.snapshot
    snap.increment_health_stats(success)
    snap.crawl.increment_health_stats(success)
    # Then the execution infrastructure, when linked.
    binary = self.binary
    if binary:
        binary.increment_health_stats(success)
        if binary.machine:
            binary.machine.increment_health_stats(success)
    if self.iface:
        self.iface.increment_health_stats(success)
def run(self):
    """
    Execute this ArchiveResult's hook and update status.

    If self.hook_name is set, runs only that specific hook.
    If self.hook_name is empty, discovers and runs all hooks for self.plugin (backwards compat).
    Updates status/output fields, queues discovered URLs, and triggers indexing.
    """
    from django.utils import timezone
    from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR, run_hook
    from archivebox.config.configset import get_config
    # Get merged config with proper context
    config = get_config(
        crawl=self.snapshot.crawl,
        snapshot=self.snapshot,
    )
    # Determine which hook(s) to run
    hooks = []
    if self.hook_name:
        # SPECIFIC HOOK MODE: Find the specific hook by name
        # (builtin dir is checked before user dir; first match wins)
        for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
            if not base_dir.exists():
                continue
            plugin_dir = base_dir / self.plugin
            if plugin_dir.exists():
                hook_path = plugin_dir / self.hook_name
                if hook_path.exists():
                    hooks.append(hook_path)
                    break
    else:
        # LEGACY MODE: Discover all hooks for this plugin (backwards compatibility)
        for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
            if not base_dir.exists():
                continue
            plugin_dir = base_dir / self.plugin
            if plugin_dir.exists():
                matches = list(plugin_dir.glob('on_Snapshot__*.*'))
                if matches:
                    hooks.extend(sorted(matches))
    if not hooks:
        # Nothing to execute: mark failed with an explanatory message.
        self.status = self.StatusChoices.FAILED
        if self.hook_name:
            self.output_str = f'Hook not found: {self.plugin}/{self.hook_name}'
        else:
            self.output_str = f'No hooks found for plugin: {self.plugin}'
        # NOTE(review): retry_at appears to be removed from this model elsewhere
        # in this changeset — confirm this attribute still exists.
        self.retry_at = None
        self.save()
        return
    # Output directory is plugin_dir for the hook output
    plugin_dir = Path(self.snapshot.output_dir) / self.plugin
    start_ts = timezone.now()
    process = None
    for hook in hooks:
        # Run hook using Process.launch() - returns Process model
        process = run_hook(
            hook,
            output_dir=plugin_dir,
            config=config,
            url=self.snapshot.url,
            snapshot_id=str(self.snapshot.id),
            crawl_id=str(self.snapshot.crawl.id),
            depth=self.snapshot.depth,
        )
        # Link ArchiveResult to Process
        # NOTE(review): in legacy multi-hook mode, `process` (and self.process)
        # is overwritten each iteration, so only the LAST hook's process is
        # inspected below — confirm this is intentional.
        self.process = process
        self.start_ts = start_ts
        self.save(update_fields=['process_id', 'start_ts', 'modified_at'])
    if not process:
        # No hooks ran
        self.status = self.StatusChoices.FAILED
        self.output_str = 'No hooks executed'
        self.save()
        return
    # Update status based on hook execution
    if process.status == process.StatusChoices.RUNNING:
        # BACKGROUND HOOK - still running, return immediately
        # Status is already STARTED from enter_started(), will be finalized by Snapshot.cleanup()
        return
    # FOREGROUND HOOK - completed, update from filesystem
    self.update_from_output()
    # Clean up empty output directory if no files were created
    if plugin_dir.exists() and not self.output_files:
        try:
            if not any(plugin_dir.iterdir()):
                plugin_dir.rmdir()
        except (OSError, RuntimeError):
            pass
def update_from_output(self):
"""
Update this ArchiveResult from filesystem logs and output files.
Used for:
- Foreground hooks that completed (called from ArchiveResult.run())
- Background hooks that completed (called from Snapshot.cleanup())
Used for Snapshot cleanup / orphan recovery when a hook's output exists
on disk but the projector did not finalize the row in the database.
Updates:
- status, output_str, output_json from ArchiveResult JSONL record
- output_files, output_size, output_mimetypes by walking filesystem
- end_ts, retry_at, cmd, cmd_version, binary FK
- end_ts, cmd, cmd_version, binary FK
- Processes side-effect records (Snapshot, Tag, etc.) via process_hook_records()
"""
import mimetypes
@@ -2924,7 +2804,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
self.status = self.StatusChoices.FAILED
self.output_str = 'Output directory not found'
self.end_ts = timezone.now()
self.retry_at = None
self.save()
return
@@ -2948,6 +2827,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
'succeeded': self.StatusChoices.SUCCEEDED,
'failed': self.StatusChoices.FAILED,
'skipped': self.StatusChoices.SKIPPED,
'noresults': self.StatusChoices.NORESULTS,
}
self.status = status_map.get(hook_data.get('status', 'failed'), self.StatusChoices.FAILED)
@@ -3011,7 +2891,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
# Update timestamps
self.end_ts = timezone.now()
self.retry_at = None
self.save()
@@ -3095,340 +2974,13 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
Uses proper config hierarchy: defaults -> file -> env -> machine -> user -> crawl -> snapshot
"""
import re
from archivebox.config.configset import get_config
# Get merged config with proper hierarchy
config = get_config(
user=self.created_by,
crawl=self.snapshot.crawl,
snapshot=self.snapshot,
)
# Get allowlist/denylist (can be string or list)
allowlist_raw = config.get('URL_ALLOWLIST', '')
denylist_raw = config.get('URL_DENYLIST', '')
# Normalize to list of patterns
def to_pattern_list(value):
if isinstance(value, list):
return value
if isinstance(value, str):
return [p.strip() for p in value.split(',') if p.strip()]
return []
allowlist = to_pattern_list(allowlist_raw)
denylist = to_pattern_list(denylist_raw)
# Denylist takes precedence
if denylist:
for pattern in denylist:
try:
if re.search(pattern, url):
return False
except re.error:
continue # Skip invalid regex patterns
# If allowlist exists, URL must match at least one pattern
if allowlist:
for pattern in allowlist:
try:
if re.search(pattern, url):
return True
except re.error:
continue # Skip invalid regex patterns
return False # No allowlist patterns matched
return True # No filters or passed filters
return self.snapshot.crawl.url_passes_filters(url, snapshot=self.snapshot)
@property
def output_dir(self) -> Path:
    """Absolute path where this plugin writes its results: <snapshot output_dir>/<plugin>."""
    return Path(self.snapshot.output_dir) / self.plugin
def is_background_hook(self) -> bool:
    """True when this ArchiveResult runs as a background hook (a hook.pid file exists in its pwd)."""
    if not self.pwd:
        # No working directory recorded → cannot be a background hook.
        return False
    return (Path(self.pwd) / 'hook.pid').exists()
# =============================================================================
# ArchiveResult State Machine
# =============================================================================
class ArchiveResultMachine(BaseStateMachine):
    """
    State machine for managing ArchiveResult (single plugin execution) lifecycle.

    Hook Lifecycle:
    ┌─────────────────────────────────────────────────────────────┐
    │ QUEUED State                                                │
    │ • Waiting for its turn to run                               │
    └─────────────────────────────────────────────────────────────┘
        ↓ tick() when can_start()
    ┌─────────────────────────────────────────────────────────────┐
    │ STARTED State → enter_started()                             │
    │ 1. archiveresult.run()                                      │
    │    • Find specific hook by hook_name                        │
    │    • run_hook(script, output_dir, ...) → subprocess         │
    │                                                             │
    │ 2a. FOREGROUND hook (returns HookResult):                   │
    │    • update_from_output() immediately                       │
    │      - Read stdout.log                                      │
    │      - Parse JSONL records                                  │
    │      - Extract 'ArchiveResult' record → update status       │
    │      - Walk output_dir → populate output_files              │
    │      - Call process_hook_records() for side effects         │
    │                                                             │
    │ 2b. BACKGROUND hook (returns None):                         │
    │    • Status stays STARTED                                   │
    │    • Continues running in background                        │
    │    • Killed by Snapshot.cleanup() when sealed               │
    └─────────────────────────────────────────────────────────────┘
        ↓ tick() checks status
    ┌─────────────────────────────────────────────────────────────┐
    │ SUCCEEDED / FAILED / SKIPPED / BACKOFF                      │
    │ • Set by hook's JSONL output during update_from_output()    │
    │ • Health stats incremented (num_uses_succeeded/failed)      │
    │ • Parent Snapshot health stats also updated                 │
    └─────────────────────────────────────────────────────────────┘

    https://github.com/ArchiveBox/ArchiveBox/wiki/ArchiveBox-Architecture-Diagrams
    """
    # Name of the attribute on this machine that holds the bound model instance.
    model_attr_name = 'archiveresult'

    # States
    # NOTE(review): assumes ArchiveResult.StatusChoices defines BACKOFF — confirm
    # it still exists on the model after this changeset.
    queued = State(value=ArchiveResult.StatusChoices.QUEUED, initial=True)
    started = State(value=ArchiveResult.StatusChoices.STARTED)
    backoff = State(value=ArchiveResult.StatusChoices.BACKOFF)
    succeeded = State(value=ArchiveResult.StatusChoices.SUCCEEDED, final=True)
    failed = State(value=ArchiveResult.StatusChoices.FAILED, final=True)
    skipped = State(value=ArchiveResult.StatusChoices.SKIPPED, final=True)

    # Tick Event - transitions based on conditions
    # Flow: queued → started → (succeeded|failed|skipped)
    #       queued → skipped (if exceeded max attempts)
    #       started → backoff → started (retry)
    tick = (
        queued.to(skipped, cond='is_exceeded_max_attempts')  # Check skip first
        | queued.to.itself(unless='can_start')
        | queued.to(started, cond='can_start')
        | started.to(succeeded, cond='is_succeeded')
        | started.to(failed, cond='is_failed')
        | started.to(skipped, cond='is_skipped')
        | started.to(backoff, cond='is_backoff')
        | backoff.to(skipped, cond='is_exceeded_max_attempts')  # Check skip from backoff too
        | backoff.to.itself(unless='can_start')
        | backoff.to(started, cond='can_start')
        # Removed redundant transitions: backoff.to(succeeded/failed/skipped)
        # Reason: backoff should always retry→started, then started→final states
    )

    archiveresult: ArchiveResult

    def can_start(self) -> bool:
        """Pure function - check if AR can start (has valid URL)."""
        return bool(self.archiveresult.snapshot.url)

    def is_exceeded_max_attempts(self) -> bool:
        """Check if snapshot has exceeded MAX_URL_ATTEMPTS failed results."""
        from archivebox.config.configset import get_config
        config = get_config(
            crawl=self.archiveresult.snapshot.crawl,
            snapshot=self.archiveresult.snapshot,
        )
        max_attempts = config.get('MAX_URL_ATTEMPTS', 50)
        # Count failed ArchiveResults for this snapshot (any plugin type)
        failed_count = self.archiveresult.snapshot.archiveresult_set.filter(
            status=ArchiveResult.StatusChoices.FAILED
        ).count()
        return failed_count >= max_attempts

    def is_succeeded(self) -> bool:
        """Check if extractor plugin succeeded (status was set by run())."""
        return self.archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED

    def is_failed(self) -> bool:
        """Check if extractor plugin failed (status was set by run())."""
        return self.archiveresult.status == ArchiveResult.StatusChoices.FAILED

    def is_skipped(self) -> bool:
        """Check if extractor plugin was skipped (status was set by run())."""
        return self.archiveresult.status == ArchiveResult.StatusChoices.SKIPPED

    def is_backoff(self) -> bool:
        """Check if we should backoff and retry later."""
        # Backoff if status is still started (plugin didn't complete) and output_str is empty
        return (
            self.archiveresult.status == ArchiveResult.StatusChoices.STARTED
            and not self.archiveresult.output_str
        )

    def is_finished(self) -> bool:
        """
        Check if extraction has completed (success, failure, or skipped).

        For background hooks in STARTED state, checks if their Process has
        finished and reaps them via update_from_output().
        """
        # If already in final state, return True
        if self.archiveresult.status in (
            ArchiveResult.StatusChoices.SUCCEEDED,
            ArchiveResult.StatusChoices.FAILED,
            ArchiveResult.StatusChoices.SKIPPED,
        ):
            return True
        # If in STARTED state with a Process, check if Process has finished running
        if self.archiveresult.status == ArchiveResult.StatusChoices.STARTED:
            if self.archiveresult.process_id:
                process = self.archiveresult.process
                # If process is NOT running anymore, reap the background hook
                if not process.is_running:
                    self.archiveresult.update_from_output()
                    # Check if now in final state after reaping
                    return self.archiveresult.status in (
                        ArchiveResult.StatusChoices.SUCCEEDED,
                        ArchiveResult.StatusChoices.FAILED,
                        ArchiveResult.StatusChoices.SKIPPED,
                    )
        return False

    @queued.enter
    def enter_queued(self):
        # Clear start_ts and bump retry_at so workers re-examine this AR promptly.
        self.archiveresult.update_and_requeue(
            retry_at=timezone.now(),
            status=ArchiveResult.StatusChoices.QUEUED,
            start_ts=None,
        )  # bump the snapshot's retry_at so they pickup any new changes

    @started.enter
    def enter_started(self):
        # Update Process with network interface
        if self.archiveresult.process_id:
            self.archiveresult.process.iface = NetworkInterface.current()
            self.archiveresult.process.save()
        # Lock the object and mark start time
        self.archiveresult.update_and_requeue(
            retry_at=timezone.now() + timedelta(seconds=120),  # 2 min timeout for plugin
            status=ArchiveResult.StatusChoices.STARTED,
            start_ts=timezone.now(),
        )
        # Run the plugin - this updates status, output, timestamps, etc.
        self.archiveresult.run()
        # Save the updated result
        self.archiveresult.save()

    @backoff.enter
    def enter_backoff(self):
        # Defer the next attempt by 60s and clear end_ts so the retry looks fresh.
        self.archiveresult.update_and_requeue(
            retry_at=timezone.now() + timedelta(seconds=60),
            status=ArchiveResult.StatusChoices.BACKOFF,
            end_ts=None,
        )

    def _check_and_seal_parent_snapshot(self):
        """
        Check if this is the last ArchiveResult to finish - if so, seal the parent Snapshot.

        Note: In the new architecture, the shared runner handles step advancement and sealing.
        This method is kept for direct model-driven edge cases.
        """
        import sys
        snapshot = self.archiveresult.snapshot
        # Check if all archiveresults are finished (in final states)
        remaining_active = snapshot.archiveresult_set.exclude(
            status__in=[
                ArchiveResult.StatusChoices.SUCCEEDED,
                ArchiveResult.StatusChoices.FAILED,
                ArchiveResult.StatusChoices.SKIPPED,
            ]
        ).count()
        if remaining_active == 0:
            print(f'[cyan] 🔒 All archiveresults finished for snapshot {snapshot.url}, sealing snapshot[/cyan]', file=sys.stderr)
            # Seal the parent snapshot
            cast(Any, snapshot).sm.seal()

    @succeeded.enter
    def enter_succeeded(self):
        import sys
        self.archiveresult.update_and_requeue(
            retry_at=None,
            status=ArchiveResult.StatusChoices.SUCCEEDED,
            end_ts=timezone.now(),
        )
        # Update health stats for ArchiveResult, Snapshot, and Crawl cascade
        self.archiveresult.cascade_health_update(success=True)
        print(f'[cyan] ✅ ArchiveResult succeeded: {self.archiveresult.plugin} for {self.archiveresult.snapshot.url}[/cyan]', file=sys.stderr)
        # Check if this is the last AR to finish - seal parent snapshot if so
        self._check_and_seal_parent_snapshot()

    @failed.enter
    def enter_failed(self):
        import sys
        print(f'[red] ❌ ArchiveResult.enter_failed() called for {self.archiveresult.plugin}[/red]', file=sys.stderr)
        self.archiveresult.update_and_requeue(
            retry_at=None,
            status=ArchiveResult.StatusChoices.FAILED,
            end_ts=timezone.now(),
        )
        # Update health stats for ArchiveResult, Snapshot, and Crawl cascade
        self.archiveresult.cascade_health_update(success=False)
        print(f'[red] ❌ ArchiveResult failed: {self.archiveresult.plugin} for {self.archiveresult.snapshot.url}[/red]', file=sys.stderr)
        # Check if this is the last AR to finish - seal parent snapshot if so
        self._check_and_seal_parent_snapshot()

    @skipped.enter
    def enter_skipped(self):
        import sys
        # Set output_str if not already set (e.g., when skipped due to max attempts)
        if not self.archiveresult.output_str and self.is_exceeded_max_attempts():
            from archivebox.config.configset import get_config
            config = get_config(
                crawl=self.archiveresult.snapshot.crawl,
                snapshot=self.archiveresult.snapshot,
            )
            max_attempts = config.get('MAX_URL_ATTEMPTS', 50)
            self.archiveresult.output_str = f'Skipped: snapshot exceeded MAX_URL_ATTEMPTS ({max_attempts} failures)'
        self.archiveresult.update_and_requeue(
            retry_at=None,
            status=ArchiveResult.StatusChoices.SKIPPED,
            end_ts=timezone.now(),
        )
        print(f'[dim] ⏭️ ArchiveResult skipped: {self.archiveresult.plugin} for {self.archiveresult.snapshot.url}[/dim]', file=sys.stderr)
        # Check if this is the last AR to finish - seal parent snapshot if so
        self._check_and_seal_parent_snapshot()
# =============================================================================
# State Machine Registration
# =============================================================================
@@ -3436,4 +2988,3 @@ class ArchiveResultMachine(BaseStateMachine):
# Manually register state machines with python-statemachine registry
# (normally auto-discovered from statemachines.py, but we define them here for clarity)
registry.register(SnapshotMachine)
registry.register(ArchiveResultMachine)

View File

@@ -232,11 +232,12 @@ SQLITE_CONNECTION_OPTIONS = {
# https://gcollazo.com/optimal-sqlite-settings-for-django/
# https://litestream.io/tips/#busy-timeout
# https://docs.djangoproject.com/en/5.1/ref/databases/#setting-pragma-options
"timeout": 10,
"timeout": 30,
"check_same_thread": False,
"transaction_mode": "IMMEDIATE",
"init_command": (
"PRAGMA foreign_keys=ON;"
"PRAGMA busy_timeout = 30000;"
"PRAGMA journal_mode = WAL;"
"PRAGMA synchronous = NORMAL;"
"PRAGMA temp_store = MEMORY;"

View File

@@ -0,0 +1,271 @@
from __future__ import annotations
import json
from collections import defaultdict
from typing import Any
from django.contrib.auth.models import User
from django.db.models import Count, F, Q, QuerySet
from django.db.models.functions import Lower
from django.http import HttpRequest
from django.urls import reverse
from archivebox.core.host_utils import build_snapshot_url, build_web_url
from archivebox.core.models import Snapshot, SnapshotTag, Tag
# Maximum number of snapshot previews embedded in each tag card.
TAG_SNAPSHOT_PREVIEW_LIMIT = 10
# (value, human label) pairs accepted by normalize_tag_sort() / the sort dropdown.
TAG_SORT_CHOICES = (
    ('name_asc', 'Name A-Z'),
    ('name_desc', 'Name Z-A'),
    ('created_desc', 'Created newest'),
    ('created_asc', 'Created oldest'),
    ('snapshots_desc', 'Most snapshots'),
    ('snapshots_asc', 'Fewest snapshots'),
)
# (value, human label) pairs accepted by normalize_has_snapshots_filter().
TAG_HAS_SNAPSHOTS_CHOICES = (
    ('all', 'All'),
    ('yes', 'Has snapshots'),
    ('no', 'No snapshots'),
)
def normalize_tag_name(name: str) -> str:
    """Strip surrounding whitespace from a tag name; None/falsy becomes ''."""
    if not name:
        return ''
    return name.strip()
def normalize_tag_sort(sort: str = 'created_desc') -> str:
    """Return sort unchanged if it is a recognized sort key, else the default 'created_desc'."""
    for key, _label in TAG_SORT_CHOICES:
        if sort == key:
            return sort
    return 'created_desc'
def normalize_has_snapshots_filter(value: str = 'all') -> str:
    """Return value unchanged if it is a recognized has-snapshots filter, else 'all'."""
    for key, _label in TAG_HAS_SNAPSHOTS_CHOICES:
        if value == key:
            return value
    return 'all'
def normalize_created_by_filter(created_by: str = '') -> str:
    """Validate a creator-id filter value.

    Returns the value only when it is a plain base-10 integer string that is
    safe to pass to int(); anything else collapses to '' (meaning: no filter).

    Bug fix: uses str.isdecimal() instead of str.isdigit(), because isdigit()
    also accepts characters like superscripts ('²') that int() rejects, which
    made the downstream int(created_by) in get_matching_tags() raise ValueError.
    """
    return created_by if str(created_by).isdecimal() else ''
def normalize_created_year_filter(year: str = '') -> str:
    """Validate a 4-digit year filter value; return '' for anything else.

    Bug fix: uses str.isdecimal() instead of str.isdigit() so only characters
    int() actually accepts can pass, preventing a ValueError in the downstream
    int(year) call inside get_matching_tags() (e.g. for '²²²²').
    """
    year = (year or '').strip()
    if len(year) == 4 and year.isdecimal():
        return year
    return ''
def get_matching_tags(
    query: str = '',
    sort: str = 'created_desc',
    created_by: str = '',
    year: str = '',
    has_snapshots: str = 'all',
) -> QuerySet[Tag]:
    """Return Tags matching the given search/filter params, annotated with num_snapshots.

    All params are normalized via the normalize_* helpers; unrecognized values
    fall back to their defaults rather than raising.
    """
    tags = Tag.objects.select_related('created_by').annotate(
        num_snapshots=Count('snapshot_set', distinct=True),
    )

    search = normalize_tag_name(query)
    if search:
        tags = tags.filter(Q(name__icontains=search) | Q(slug__icontains=search))

    creator_id = normalize_created_by_filter(created_by)
    if creator_id:
        tags = tags.filter(created_by_id=int(creator_id))

    created_year = normalize_created_year_filter(year)
    if created_year:
        tags = tags.filter(created_at__year=int(created_year))

    snapshot_filter = normalize_has_snapshots_filter(has_snapshots)
    if snapshot_filter == 'yes':
        tags = tags.filter(num_snapshots__gt=0)
    elif snapshot_filter == 'no':
        tags = tags.filter(num_snapshots=0)

    # One ordering tuple per sort key; normalize_tag_sort guarantees the key exists.
    orderings = {
        'name_asc': (Lower('name'), 'id'),
        'name_desc': (Lower('name').desc(), '-id'),
        'created_asc': (F('created_at').asc(nulls_first=True), 'id', Lower('name')),
        'snapshots_desc': (F('num_snapshots').desc(nulls_last=True), F('created_at').desc(nulls_last=True), '-id', Lower('name')),
        'snapshots_asc': (F('num_snapshots').asc(nulls_first=True), Lower('name'), 'id'),
        'created_desc': (F('created_at').desc(nulls_last=True), '-id', Lower('name')),
    }
    return tags.order_by(*orderings[normalize_tag_sort(sort)])
def get_tag_creator_choices() -> list[tuple[str, str]]:
    """Return (user_id, username) dropdown choices for users who have created tags."""
    rows = (
        Tag.objects
        .filter(created_by__isnull=False)
        .values_list('created_by_id', 'created_by__username')
        .order_by(Lower('created_by__username'), 'created_by_id')
        .distinct()
    )
    choices: list[tuple[str, str]] = []
    for user_id, username in rows:
        # Fall back to a synthetic label when the username is blank.
        choices.append((str(user_id), username or f'User {user_id}'))
    return choices
def get_tag_year_choices() -> list[str]:
    """Distinct years (newest first) in which tags were created, as strings."""
    creation_dates = Tag.objects.exclude(created_at__isnull=True).dates('created_at', 'year', order='DESC')
    return [str(d.year) for d in creation_dates]
def get_tag_by_ref(tag_ref: str | int) -> Tag:
    """Resolve a Tag by numeric pk or by slug.

    Slug lookup is case-insensitive exact match first, then substring match.
    Raises Tag.DoesNotExist when nothing matches.
    """
    if not isinstance(tag_ref, int):
        ref = str(tag_ref).strip()
        if not ref.isdigit():
            try:
                return Tag.objects.get(slug__iexact=ref)
            except Tag.DoesNotExist:
                return Tag.objects.get(slug__icontains=ref)
        tag_ref = int(ref)
    return Tag.objects.get(pk=tag_ref)
def get_or_create_tag(name: str, created_by: User | None = None) -> tuple[Tag, bool]:
    """Fetch a Tag by case-insensitive name, creating it if absent.

    Returns (tag, created); raises ValueError for an empty/blank name.
    """
    cleaned = normalize_tag_name(name)
    if not cleaned:
        raise ValueError('Tag name is required')
    match = Tag.objects.filter(name__iexact=cleaned).first()
    if match is not None:
        return match, False
    return Tag.objects.create(name=cleaned, created_by=created_by), True
def rename_tag(tag: Tag, name: str) -> Tag:
    """Rename a tag, rejecting empty names and case-insensitive duplicates.

    Saves only when the name actually changed; returns the (possibly updated) tag.
    Raises ValueError on a blank name or an existing conflicting tag.
    """
    cleaned = normalize_tag_name(name)
    if not cleaned:
        raise ValueError('Tag name is required')
    duplicate = Tag.objects.filter(name__iexact=cleaned).exclude(pk=tag.pk).first()
    if duplicate:
        raise ValueError(f'Tag "{duplicate.name}" already exists')
    if tag.name != cleaned:
        tag.name = cleaned
        tag.save()
    return tag
def delete_tag(tag: Tag) -> tuple[int, dict[str, int]]:
    """Delete the tag via the ORM; returns Django's (total_deleted, per-model counts)."""
    deletion_result = tag.delete()
    return deletion_result
def export_tag_urls(tag: Tag) -> str:
    """Newline-joined URLs of the tag's snapshots, newest first."""
    ordered = tag.snapshot_set.order_by('-downloaded_at', '-created_at', '-pk')
    return '\n'.join(ordered.values_list('url', flat=True))
def export_tag_snapshots_jsonl(tag: Tag) -> str:
    """One JSON object per line for each of the tag's snapshots, newest first."""
    ordered = tag.snapshot_set.order_by('-downloaded_at', '-created_at', '-pk').prefetch_related('tags')
    lines = [json.dumps(snap.to_json()) for snap in ordered]
    return '\n'.join(lines)
def _display_snapshot_title(snapshot: Snapshot) -> str:
title = (snapshot.title or '').strip()
url = (snapshot.url or '').strip()
if not title:
return url
normalized_title = title.lower()
if normalized_title == 'pending...' or normalized_title == url.lower():
return url
return title
def _build_snapshot_preview(snapshot: Snapshot, request: HttpRequest | None = None) -> dict[str, Any]:
    """Serialize a single snapshot into the small preview dict shown on a tag card."""
    snapshot_id = str(snapshot.pk)
    downloaded = snapshot.downloaded_at
    return {
        'id': snapshot_id,
        'title': _display_snapshot_title(snapshot),
        'url': snapshot.url,
        'favicon_url': build_snapshot_url(snapshot_id, 'favicon.ico', request=request),
        'admin_url': reverse('admin:core_snapshot_change', args=[snapshot.pk]),
        'archive_url': build_web_url(f'/{snapshot.archive_path_from_db}/index.html', request=request),
        'downloaded_at': downloaded.isoformat() if downloaded else None,
    }
def _build_snapshot_preview_map(tags: list[Tag], request: HttpRequest | None = None, preview_limit: int = TAG_SNAPSHOT_PREVIEW_LIMIT) -> dict[int, list[dict[str, Any]]]:
    """Map each tag's pk to up to preview_limit serialized snapshot previews (newest first)."""
    if not tags:
        return {}
    rows = (
        SnapshotTag.objects
        .filter(tag_id__in=[tag.pk for tag in tags])
        .select_related('snapshot__crawl__created_by')
        .order_by(
            'tag_id',
            F('snapshot__downloaded_at').desc(nulls_last=True),
            F('snapshot__created_at').desc(nulls_last=True),
            F('snapshot_id').desc(),
        )
    )
    preview_map: dict[int, list[dict[str, Any]]] = defaultdict(list)
    for row in rows:
        bucket = preview_map[row.tag_id]
        # Rows arrive grouped by tag and pre-sorted, so keep only the first N per tag.
        if len(bucket) < preview_limit:
            bucket.append(_build_snapshot_preview(row.snapshot, request=request))
    return preview_map
def build_tag_card(tag: Tag, snapshot_previews: list[dict[str, Any]] | None = None) -> dict[str, Any]:
    """Serialize a Tag into the dict consumed by the tags admin UI.

    Args:
        tag: the Tag row, ideally annotated with ``num_snapshots`` by
            get_matching_tags() so no extra COUNT query is needed.
        snapshot_previews: optional pre-built preview dicts for the card.

    Returns:
        dict with identity fields, snapshot count, admin/API URLs, and previews.
    """
    # Perf fix: getattr(tag, 'num_snapshots', tag.snapshot_set.count()) ran the
    # COUNT query eagerly even when the annotation was present (Python evaluates
    # the default argument before getattr runs). Only hit the DB when the
    # annotation is actually missing.
    count = getattr(tag, 'num_snapshots', None)
    if count is None:
        count = tag.snapshot_set.count()
    return {
        'id': tag.pk,
        'name': tag.name,
        'slug': tag.slug,
        'num_snapshots': count,
        'filter_url': f"{reverse('admin:core_snapshot_changelist')}?tags__id__exact={tag.pk}",
        'edit_url': reverse('admin:core_tag_change', args=[tag.pk]),
        'export_urls_url': reverse('api-1:tag_urls_export', args=[tag.pk]),
        'export_jsonl_url': reverse('api-1:tag_snapshots_export', args=[tag.pk]),
        'rename_url': reverse('api-1:rename_tag', args=[tag.pk]),
        'delete_url': reverse('api-1:delete_tag', args=[tag.pk]),
        'snapshots': snapshot_previews or [],
    }
def build_tag_cards(
    query: str = '',
    request: HttpRequest | None = None,
    limit: int | None = None,
    preview_limit: int = TAG_SNAPSHOT_PREVIEW_LIMIT,
    sort: str = 'created_desc',
    created_by: str = '',
    year: str = '',
    has_snapshots: str = 'all',
) -> list[dict[str, Any]]:
    """Fetch matching tags and serialize each into a card dict with snapshot previews."""
    matching = get_matching_tags(
        query=query,
        sort=sort,
        created_by=created_by,
        year=year,
        has_snapshots=has_snapshots,
    )
    if limit is not None:
        matching = matching[:limit]
    tags = list(matching)
    preview_map = _build_snapshot_preview_map(tags, request=request, preview_limit=preview_limit)
    cards = []
    for tag in tags:
        cards.append(build_tag_card(tag, snapshot_previews=preview_map.get(tag.pk, [])))
    return cards

View File

@@ -11,6 +11,7 @@ from archivebox.hooks import (
)
from archivebox.core.host_utils import (
get_admin_base_url,
get_public_base_url,
get_web_base_url,
get_snapshot_base_url,
build_snapshot_url,
@@ -166,6 +167,11 @@ def web_base_url(context) -> str:
return get_web_base_url(request=context.get('request'))
@register.simple_tag(takes_context=True)
def public_base_url(context) -> str:
    """Template tag: public-facing base URL derived from the current request (if any)."""
    return get_public_base_url(request=context.get('request'))
@register.simple_tag(takes_context=True)
def snapshot_base_url(context, snapshot) -> str:
snapshot_id = getattr(snapshot, 'id', snapshot)

View File

@@ -1,5 +1,6 @@
__package__ = 'archivebox.core'
import json
import os
import posixpath
from glob import glob, escape
@@ -7,7 +8,7 @@ from django.utils import timezone
import inspect
from typing import Callable, cast, get_type_hints
from pathlib import Path
from urllib.parse import urlparse
from urllib.parse import quote, urlparse
from django.shortcuts import render, redirect
from django.http import JsonResponse, HttpRequest, HttpResponse, Http404, HttpResponseForbidden
@@ -26,7 +27,7 @@ from admin_data_views.typing import TableContext, ItemContext, SectionData
from admin_data_views.utils import render_with_table_view, render_with_item_view, ItemLink
from archivebox.config import CONSTANTS, CONSTANTS_CONFIG, DATA_DIR, VERSION
from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG
from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG, SEARCH_BACKEND_CONFIG
from archivebox.config.configset import get_flat_config, get_config, get_all_configs
from archivebox.misc.util import base_url, htmlencode, ts_to_date_str, urldecode
from archivebox.misc.serve_static import serve_static_with_byterange_support
@@ -37,7 +38,18 @@ from archivebox.core.models import Snapshot
from archivebox.core.host_utils import build_snapshot_url
from archivebox.core.forms import AddLinkForm
from archivebox.crawls.models import Crawl
from archivebox.hooks import get_enabled_plugins, get_plugin_name
from archivebox.hooks import (
BUILTIN_PLUGINS_DIR,
USER_PLUGINS_DIR,
discover_plugin_configs,
get_enabled_plugins,
get_plugin_name,
iter_plugin_dirs,
)
ABX_PLUGINS_GITHUB_BASE_URL = 'https://github.com/ArchiveBox/abx-plugins/tree/main/abx_plugins/plugins/'
LIVE_PLUGIN_BASE_URL = '/admin/environment/plugins/'
def _files_index_target(snapshot: Snapshot, archivefile: str | None) -> str:
@@ -699,6 +711,9 @@ def _serve_responses_path(request, responses_root: Path, rel_path: str, show_ind
def _serve_snapshot_replay(request: HttpRequest, snapshot: Snapshot, path: str = ""):
rel_path = path or ""
show_indexes = bool(request.GET.get("files"))
if not show_indexes and (not rel_path or rel_path == "index.html"):
return SnapshotView.render_live_index(request, snapshot)
if not rel_path or rel_path.endswith("/"):
if show_indexes:
rel_path = rel_path.rstrip("/")
@@ -784,7 +799,6 @@ class SnapshotHostView(View):
raise Http404
return _serve_snapshot_replay(request, snapshot, path)
class SnapshotReplayView(View):
"""Serve snapshot directory contents on a one-domain replay path."""
@@ -915,8 +929,17 @@ class AddView(UserPassesTestMixin, FormView):
return custom_config
def get_context_data(self, **kwargs):
from archivebox.core.models import Tag
required_search_plugin = f'search_backend_{SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE}'.strip()
plugin_configs = discover_plugin_configs()
plugin_dependency_map = {
plugin_name: [
str(required_plugin).strip()
for required_plugin in (schema.get('required_plugins') or [])
if str(required_plugin).strip()
]
for plugin_name, schema in plugin_configs.items()
if isinstance(schema.get('required_plugins'), list) and schema.get('required_plugins')
}
return {
**super().get_context_data(**kwargs),
'title': "Create Crawl",
@@ -924,8 +947,9 @@ class AddView(UserPassesTestMixin, FormView):
'absolute_add_path': self.request.build_absolute_uri(self.request.path),
'VERSION': VERSION,
'FOOTER_INFO': SERVER_CONFIG.FOOTER_INFO,
'required_search_plugin': required_search_plugin,
'plugin_dependency_map_json': json.dumps(plugin_dependency_map, sort_keys=True),
'stdout': '',
'available_tags': list(Tag.objects.all().order_by('name').values_list('name', flat=True)),
}
def _create_crawl_from_form(self, form, *, created_by_id=None) -> Crawl:
@@ -937,11 +961,10 @@ class AddView(UserPassesTestMixin, FormView):
depth = int(form.cleaned_data["depth"])
plugins = ','.join(form.cleaned_data.get("plugins", []))
schedule = form.cleaned_data.get("schedule", "").strip()
persona = form.cleaned_data.get("persona", "Default")
overwrite = form.cleaned_data.get("overwrite", False)
update = form.cleaned_data.get("update", False)
persona = form.cleaned_data.get("persona")
index_only = form.cleaned_data.get("index_only", False)
notes = form.cleaned_data.get("notes", "")
url_filters = form.cleaned_data.get("url_filters") or {}
custom_config = self._get_custom_config_overrides(form)
from archivebox.config.permissions import HOSTNAME
@@ -957,6 +980,7 @@ class AddView(UserPassesTestMixin, FormView):
# 1. save the provided urls to sources/2024-11-05__23-59-59__web_ui_add_by_user_<user_pk>.txt
sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__web_ui_add_by_user_{created_by_id}.txt'
sources_file.parent.mkdir(parents=True, exist_ok=True)
sources_file.write_text(urls if isinstance(urls, str) else '\n'.join(urls))
# 2. create a new Crawl with the URLs from the file
@@ -964,16 +988,18 @@ class AddView(UserPassesTestMixin, FormView):
urls_content = sources_file.read_text()
# Build complete config
config = {
'ONLY_NEW': not update,
'INDEX_ONLY': index_only,
'OVERWRITE': overwrite,
'DEPTH': depth,
'PLUGINS': plugins or '',
'DEFAULT_PERSONA': persona or 'Default',
'DEFAULT_PERSONA': (persona.name if persona else 'Default'),
}
# Merge custom config overrides
config.update(custom_config)
if url_filters.get('allowlist'):
config['URL_ALLOWLIST'] = url_filters['allowlist']
if url_filters.get('denylist'):
config['URL_DENYLIST'] = url_filters['denylist']
crawl = Crawl.objects.create(
urls=urls_content,
@@ -999,6 +1025,8 @@ class AddView(UserPassesTestMixin, FormView):
crawl.schedule = crawl_schedule
crawl.save(update_fields=['schedule'])
crawl.create_snapshots_from_urls()
# 4. start the Orchestrator & wait until it completes
# ... orchestrator will create the root Snapshot, which creates pending ArchiveResults, which gets run by the ArchiveResultActors ...
# from archivebox.crawls.actors import CrawlActor
@@ -1011,7 +1039,7 @@ class AddView(UserPassesTestMixin, FormView):
urls = form.cleaned_data["url"]
schedule = form.cleaned_data.get("schedule", "").strip()
rough_url_count = urls.count('://')
rough_url_count = len([url for url in urls.splitlines() if url.strip()])
# Build success message with schedule link if created
schedule_msg = ""
@@ -1080,10 +1108,6 @@ class WebAddView(AddView):
'persona': defaults_form.fields['persona'].initial or 'Default',
'config': {},
}
if defaults_form.fields['update'].initial:
form_data['update'] = 'on'
if defaults_form.fields['overwrite'].initial:
form_data['overwrite'] = 'on'
if defaults_form.fields['index_only'].initial:
form_data['index_only'] = 'on'
@@ -1118,6 +1142,41 @@ def live_progress_view(request):
from archivebox.core.models import Snapshot, ArchiveResult
from archivebox.machine.models import Process, Machine
def hook_details(hook_name: str, plugin: str = "setup") -> tuple[str, str, str, str]:
normalized_hook_name = Path(hook_name).name if hook_name else ""
if not normalized_hook_name:
return (plugin, plugin, "unknown", "")
phase = "unknown"
if normalized_hook_name.startswith("on_Crawl__"):
phase = "crawl"
elif normalized_hook_name.startswith("on_Snapshot__"):
phase = "snapshot"
elif normalized_hook_name.startswith("on_Binary__"):
phase = "binary"
label = normalized_hook_name
if "__" in normalized_hook_name:
label = normalized_hook_name.split("__", 1)[1]
label = label.rsplit(".", 1)[0]
if len(label) > 3 and label[:2].isdigit() and label[2] == "_":
label = label[3:]
label = label.replace("_", " ").strip() or plugin
return (plugin, label, phase, normalized_hook_name)
def process_label(cmd: list[str] | None) -> tuple[str, str, str, str]:
hook_path = ""
if isinstance(cmd, list) and cmd:
first = cmd[0]
if isinstance(first, str):
hook_path = first
if not hook_path:
return ("", "setup", "unknown", "")
return hook_details(Path(hook_path).name, plugin=Path(hook_path).parent.name or "setup")
machine = Machine.current()
orchestrator_proc = Process.objects.filter(
machine=machine,
@@ -1188,8 +1247,19 @@ def live_progress_view(request):
Process.TypeChoices.BINARY,
],
)
recent_processes = Process.objects.filter(
machine=machine,
process_type__in=[
Process.TypeChoices.HOOK,
Process.TypeChoices.BINARY,
],
modified_at__gte=timezone.now() - timedelta(minutes=10),
).order_by("-modified_at")
crawl_process_pids: dict[str, int] = {}
snapshot_process_pids: dict[str, int] = {}
process_records_by_crawl: dict[str, list[dict[str, object]]] = {}
process_records_by_snapshot: dict[str, list[dict[str, object]]] = {}
seen_process_records: set[str] = set()
for proc in running_processes:
env = proc.env or {}
if not isinstance(env, dict):
@@ -1197,11 +1267,48 @@ def live_progress_view(request):
crawl_id = env.get('CRAWL_ID')
snapshot_id = env.get('SNAPSHOT_ID')
_plugin, _label, phase, _hook_name = process_label(proc.cmd)
if crawl_id and proc.pid:
crawl_process_pids.setdefault(str(crawl_id), proc.pid)
if snapshot_id and proc.pid:
if phase == "snapshot" and snapshot_id and proc.pid:
snapshot_process_pids.setdefault(str(snapshot_id), proc.pid)
for proc in recent_processes:
env = proc.env or {}
if not isinstance(env, dict):
env = {}
crawl_id = env.get("CRAWL_ID")
snapshot_id = env.get("SNAPSHOT_ID")
if not crawl_id and not snapshot_id:
continue
plugin, label, phase, hook_name = process_label(proc.cmd)
record_scope = str(snapshot_id) if phase == "snapshot" and snapshot_id else str(crawl_id)
proc_key = f"{record_scope}:{plugin}:{label}:{proc.status}:{proc.exit_code}"
if proc_key in seen_process_records:
continue
seen_process_records.add(proc_key)
status = "started" if proc.status == Process.StatusChoices.RUNNING else ("failed" if proc.exit_code not in (None, 0) else "succeeded")
payload: dict[str, object] = {
"id": str(proc.id),
"plugin": plugin,
"label": label,
"hook_name": hook_name,
"status": status,
"phase": phase,
"source": "process",
"process_id": str(proc.id),
}
if status == "started" and proc.pid:
payload["pid"] = proc.pid
if phase == "snapshot" and snapshot_id:
process_records_by_snapshot.setdefault(str(snapshot_id), []).append(payload)
elif crawl_id:
process_records_by_crawl.setdefault(str(crawl_id), []).append(payload)
active_crawls_qs = Crawl.objects.filter(
status__in=[Crawl.StatusChoices.QUEUED, Crawl.StatusChoices.STARTED]
).prefetch_related(
@@ -1234,6 +1341,11 @@ def live_progress_view(request):
# Calculate crawl progress
crawl_progress = int((completed_snapshots / total_snapshots) * 100) if total_snapshots > 0 else 0
crawl_setup_plugins = list(process_records_by_crawl.get(str(crawl.id), []))
crawl_setup_total = len(crawl_setup_plugins)
crawl_setup_completed = sum(1 for item in crawl_setup_plugins if item.get("status") == "succeeded")
crawl_setup_failed = sum(1 for item in crawl_setup_plugins if item.get("status") == "failed")
crawl_setup_pending = sum(1 for item in crawl_setup_plugins if item.get("status") == "queued")
# Get active snapshots for this crawl (already prefetched)
active_snapshots_for_crawl = []
@@ -1241,28 +1353,21 @@ def live_progress_view(request):
# Get archive results for this snapshot (already prefetched)
snapshot_results = snapshot.archiveresult_set.all()
# Count in memory instead of DB queries
total_plugins = len(snapshot_results)
completed_plugins = sum(1 for ar in snapshot_results if ar.status == ArchiveResult.StatusChoices.SUCCEEDED)
failed_plugins = sum(1 for ar in snapshot_results if ar.status == ArchiveResult.StatusChoices.FAILED)
pending_plugins = sum(1 for ar in snapshot_results if ar.status == ArchiveResult.StatusChoices.QUEUED)
# Calculate snapshot progress using per-plugin progress
now = timezone.now()
plugin_progress_values: list[int] = []
all_plugins: list[dict[str, object]] = []
seen_plugin_keys: set[str] = set()
# Get all extractor plugins for this snapshot (already prefetched, sort in Python)
# Order: started first, then queued, then completed
def plugin_sort_key(ar):
status_order = {
ArchiveResult.StatusChoices.STARTED: 0,
ArchiveResult.StatusChoices.QUEUED: 1,
ArchiveResult.StatusChoices.SUCCEEDED: 2,
ArchiveResult.StatusChoices.FAILED: 3,
ArchiveResult.StatusChoices.NORESULTS: 3,
ArchiveResult.StatusChoices.FAILED: 4,
}
return (status_order.get(ar.status, 4), ar.plugin)
return (status_order.get(ar.status, 5), ar.plugin, ar.hook_name or "")
all_plugins = []
for ar in sorted(snapshot_results, key=plugin_sort_key):
status = ar.status
progress_value = 0
@@ -1270,6 +1375,7 @@ def live_progress_view(request):
ArchiveResult.StatusChoices.SUCCEEDED,
ArchiveResult.StatusChoices.FAILED,
ArchiveResult.StatusChoices.SKIPPED,
ArchiveResult.StatusChoices.NORESULTS,
):
progress_value = 100
elif status == ArchiveResult.StatusChoices.STARTED:
@@ -1284,20 +1390,49 @@ def live_progress_view(request):
progress_value = 0
plugin_progress_values.append(progress_value)
plugin, label, phase, hook_name = hook_details(ar.hook_name or ar.plugin, plugin=ar.plugin)
plugin_payload = {
'id': str(ar.id),
'plugin': ar.plugin,
'label': label,
'hook_name': hook_name,
'phase': phase,
'status': status,
'process_id': str(ar.process_id) if ar.process_id else None,
}
if status == ArchiveResult.StatusChoices.STARTED and ar.process_id and ar.process:
plugin_payload['pid'] = ar.process.pid
if status == ArchiveResult.StatusChoices.STARTED:
plugin_payload['progress'] = progress_value
plugin_payload['timeout'] = ar.timeout or 120
plugin_payload['source'] = 'archiveresult'
all_plugins.append(plugin_payload)
seen_plugin_keys.add(
str(ar.process_id) if ar.process_id else f"{ar.plugin}:{hook_name}"
)
snapshot_progress = int(sum(plugin_progress_values) / total_plugins) if total_plugins > 0 else 0
for proc_payload in process_records_by_snapshot.get(str(snapshot.id), []):
proc_key = str(proc_payload.get("process_id") or f"{proc_payload.get('plugin')}:{proc_payload.get('hook_name')}")
if proc_key in seen_plugin_keys:
continue
seen_plugin_keys.add(proc_key)
all_plugins.append(proc_payload)
proc_status = proc_payload.get("status")
if proc_status in ("succeeded", "failed", "skipped"):
plugin_progress_values.append(100)
elif proc_status == "started":
plugin_progress_values.append(1)
else:
plugin_progress_values.append(0)
total_plugins = len(all_plugins)
completed_plugins = sum(1 for item in all_plugins if item.get("status") == "succeeded")
failed_plugins = sum(1 for item in all_plugins if item.get("status") == "failed")
pending_plugins = sum(1 for item in all_plugins if item.get("status") == "queued")
snapshot_progress = int(sum(plugin_progress_values) / len(plugin_progress_values)) if plugin_progress_values else 0
active_snapshots_for_crawl.append({
'id': str(snapshot.id),
@@ -1334,6 +1469,11 @@ def live_progress_view(request):
'started_snapshots': started_snapshots,
'failed_snapshots': 0,
'pending_snapshots': pending_snapshots,
'setup_plugins': crawl_setup_plugins,
'setup_total_plugins': crawl_setup_total,
'setup_completed_plugins': crawl_setup_completed,
'setup_failed_plugins': crawl_setup_failed,
'setup_pending_plugins': crawl_setup_pending,
'active_snapshots': active_snapshots_for_crawl,
'can_start': can_start,
'urls_preview': urls_preview,
@@ -1461,7 +1601,11 @@ def find_config_source(key: str, merged_config: dict) -> str:
"""Determine where a config value comes from."""
from archivebox.machine.models import Machine
# Check if it's from archivebox.machine.config
# Environment variables override all persistent config sources.
if key in os.environ:
return 'Environment'
# Machine.config overrides ArchiveBox.conf.
try:
machine = Machine.current()
if machine.config and key in machine.config:
@@ -1469,10 +1613,6 @@ def find_config_source(key: str, merged_config: dict) -> str:
except Exception:
pass
# Check if it's from environment variable
if key in os.environ:
return 'Environment'
# Check if it's from archivebox.config.file
from archivebox.config.configset import BaseConfigSet
file_config = BaseConfigSet.load_from_file(CONSTANTS.CONFIG_FILE)
@@ -1483,6 +1623,43 @@ def find_config_source(key: str, merged_config: dict) -> str:
return 'Default'
def find_plugin_for_config_key(key: str) -> str | None:
for plugin_name, schema in discover_plugin_configs().items():
if key in (schema.get('properties') or {}):
return plugin_name
return None
def get_config_definition_link(key: str) -> tuple[str, str]:
plugin_name = find_plugin_for_config_key(key)
if not plugin_name:
return (
f'https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig+{quote(key)}&type=code',
'archivebox/config',
)
plugin_dir = next((path.resolve() for path in iter_plugin_dirs() if path.name == plugin_name), None)
if plugin_dir:
builtin_root = BUILTIN_PLUGINS_DIR.resolve()
if plugin_dir.is_relative_to(builtin_root):
return (
f'{ABX_PLUGINS_GITHUB_BASE_URL}{quote(plugin_name)}/config.json',
f'abx_plugins/plugins/{plugin_name}/config.json',
)
user_root = USER_PLUGINS_DIR.resolve()
if plugin_dir.is_relative_to(user_root):
return (
f'{LIVE_PLUGIN_BASE_URL}user.{quote(plugin_name)}/',
f'data/custom_plugins/{plugin_name}/config.json',
)
return (
f'{LIVE_PLUGIN_BASE_URL}builtin.{quote(plugin_name)}/',
f'abx_plugins/plugins/{plugin_name}/config.json',
)
@render_with_table_view
def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
CONFIGS = get_all_configs()
@@ -1566,17 +1743,6 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
# Determine all sources for this config value
sources_info = []
# Default value
default_val = find_config_default(key)
if default_val:
sources_info.append(('Default', default_val, 'gray'))
# Config file value
if CONSTANTS.CONFIG_FILE.exists():
file_config = BaseConfigSet.load_from_file(CONSTANTS.CONFIG_FILE)
if key in file_config:
sources_info.append(('Config File', file_config[key], 'green'))
# Environment variable
if key in os.environ:
sources_info.append(('Environment', os.environ[key] if key_is_safe(key) else '********', 'blue'))
@@ -1592,6 +1758,17 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
except Exception:
pass
# Config file value
if CONSTANTS.CONFIG_FILE.exists():
file_config = BaseConfigSet.load_from_file(CONSTANTS.CONFIG_FILE)
if key in file_config:
sources_info.append(('Config File', file_config[key], 'green'))
# Default value
default_val = find_config_default(key)
if default_val:
sources_info.append(('Default', default_val, 'gray'))
# Final computed value
final_value = merged_config.get(key, FLAT_CONFIG.get(key, CONFIGS.get(key, None)))
if not key_is_safe(key):
@@ -1614,6 +1791,8 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
section_header = mark_safe(f'[DYNAMIC CONFIG] &nbsp; <b><code style="color: lightgray">{key}</code></b> &nbsp; <small>(read-only, calculated at runtime)</small>')
definition_url, definition_label = get_config_definition_link(key)
section_data = cast(SectionData, {
"name": section_header,
"description": None,
@@ -1621,7 +1800,7 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
'Key': key,
'Type': find_config_type(key),
'Value': final_value,
'Source': find_config_source(key, merged_config),
'Currently read from': find_config_source(key, merged_config),
},
"help_texts": {
'Key': mark_safe(f'''
@@ -1631,14 +1810,14 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
</span>
'''),
'Type': mark_safe(f'''
<a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig+{key}&type=code">
See full definition in <code>archivebox/config</code>...
<a href="{definition_url}" target="_blank" rel="noopener noreferrer">
See full definition in <code>{definition_label}</code>...
</a>
'''),
'Value': mark_safe(f'''
{'<b style="color: red">Value is redacted for your security. (Passwords, secrets, API tokens, etc. cannot be viewed in the Web UI)</b><br/><br/>' if not key_is_safe(key) else ''}
<br/><hr/><br/>
<b>Configuration Sources (in priority order):</b><br/><br/>
<b>Configuration Sources (highest priority first):</b><br/><br/>
{sources_html}
<br/><br/>
<p style="display: {"block" if key in FLAT_CONFIG and key not in CONSTANTS_CONFIG else "none"}">
@@ -1651,15 +1830,15 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
}"</code>
</p>
'''),
'Source': mark_safe(f'''
'Currently read from': mark_safe(f'''
The value shown in the "Value" field comes from the <b>{find_config_source(key, merged_config)}</b> source.
<br/><br/>
Priority order (highest to lowest):
<ol>
<li><b style="color: blue">Environment</b> - Environment variables</li>
<li><b style="color: purple">Machine</b> - Machine-specific overrides (e.g., resolved binary paths)
{f'<br/><a href="{machine_admin_url}">→ Edit <code>{key}</code> in Machine.config for this server</a>' if machine_admin_url else ''}
</li>
<li><b style="color: blue">Environment</b> - Environment variables</li>
<li><b style="color: green">Config File</b> - data/ArchiveBox.conf</li>
<li><b style="color: gray">Default</b> - Default value from code</li>
</ol>

View File

@@ -131,7 +131,46 @@ class TagEditorWidget(forms.Widget):
}};
window.updateHiddenInput_{widget_id} = function() {{
document.getElementById('{widget_id}').value = currentTags_{widget_id}.join(',');
var hiddenInput = document.getElementById('{widget_id}');
if (!hiddenInput) {{
return;
}}
hiddenInput.value = currentTags_{widget_id}.join(',');
hiddenInput.dispatchEvent(new Event('input', {{ bubbles: true }}));
hiddenInput.dispatchEvent(new Event('change', {{ bubbles: true }}));
}};
function normalizeTags_{widget_id}(value) {{
var rawTags = Array.isArray(value) ? value : String(value || '').split(',');
var seen = {{}};
return rawTags
.map(function(tag) {{ return String(tag || '').trim(); }})
.filter(function(tag) {{
if (!tag) return false;
var normalized = tag.toLowerCase();
if (seen[normalized]) return false;
seen[normalized] = true;
return true;
}})
.sort(function(a, b) {{
return a.toLowerCase().localeCompare(b.toLowerCase());
}});
}}
window.setTags_{widget_id} = function(value, options) {{
currentTags_{widget_id} = normalizeTags_{widget_id}(value);
rebuildPills_{widget_id}();
if (!(options && options.skipHiddenUpdate)) {{
updateHiddenInput_{widget_id}();
}}
}};
window.syncTagEditorFromHidden_{widget_id} = function() {{
var hiddenInput = document.getElementById('{widget_id}');
if (!hiddenInput) {{
return;
}}
setTags_{widget_id}(hiddenInput.value, {{ skipHiddenUpdate: true }});
}};
function computeTagStyle_{widget_id}(tagName) {{
@@ -190,9 +229,7 @@ class TagEditorWidget(forms.Widget):
// Add to current tags
currentTags_{widget_id}.push(tagName);
currentTags_{widget_id}.sort(function(a, b) {{
return a.toLowerCase().localeCompare(b.toLowerCase());
}});
currentTags_{widget_id} = normalizeTags_{widget_id}(currentTags_{widget_id});
// Rebuild pills
rebuildPills_{widget_id}();
@@ -252,6 +289,14 @@ class TagEditorWidget(forms.Widget):
}}
}});
document.getElementById('{widget_id}').addEventListener('change', function() {{
syncTagEditorFromHidden_{widget_id}();
}});
document.getElementById('{widget_id}').addEventListener('archivebox:sync-tags', function() {{
syncTagEditorFromHidden_{widget_id}();
}});
window.handleTagKeydown_{widget_id} = function(event) {{
var input = event.target;
var value = input.value.trim();
@@ -320,6 +365,8 @@ class TagEditorWidget(forms.Widget):
var input = document.querySelector('input[name="csrfmiddlewaretoken"]');
return input ? input.value : '';
}}
syncTagEditorFromHidden_{widget_id}();
}})();
</script>
'''
@@ -327,15 +374,232 @@ class TagEditorWidget(forms.Widget):
return mark_safe(html)
class URLFiltersWidget(forms.Widget):
"""Render URL allowlist / denylist controls with same-domain autofill."""
template_name = ""
def __init__(self, attrs=None, *, source_selector='textarea[name="url"]'):
self.source_selector = source_selector
super().__init__(attrs)
def render(self, name, value, attrs=None, renderer=None):
value = value if isinstance(value, dict) else {}
widget_id_raw = attrs.get('id', name) if attrs else name
widget_id = re.sub(r'[^A-Za-z0-9_]', '_', str(widget_id_raw)) or name
allowlist = escape(value.get('allowlist', '') or '')
denylist = escape(value.get('denylist', '') or '')
return mark_safe(f'''
<div id="{widget_id}_container" class="url-filters-widget">
<input type="hidden" name="{name}" value="">
<div class="url-filters-grid">
<div class="url-filters-column">
<div class="url-filter-label-row">
<label for="{widget_id}_allowlist" class="url-filter-label"><span class="url-filter-label-main">🟢 URL_ALLOWLIST</span></label>
<span class="url-filter-label-note">Regex patterns or domains to exclude, one pattern per line.</span>
</div>
<textarea id="{widget_id}_allowlist"
name="{name}_allowlist"
rows="2"
placeholder="^https?://([^/]+\\.)?(example\\.com|example\\.org)([:/]|$)">{allowlist}</textarea>
</div>
<div class="url-filters-column">
<div class="url-filter-label-row">
<label for="{widget_id}_denylist" class="url-filter-label"><span class="url-filter-label-main">⛔ URL_DENYLIST</span></label>
<span class="url-filter-label-note">Regex patterns or domains to exclude, one pattern per line.</span>
</div>
<textarea id="{widget_id}_denylist"
name="{name}_denylist"
rows="2"
placeholder="^https?://([^/]+\\.)?(cdn\\.example\\.com|analytics\\.example\\.org)([:/]|$)">{denylist}</textarea>
</div>
</div>
<label class="url-filters-toggle" for="{widget_id}_same_domain_only">
<input type="checkbox" id="{widget_id}_same_domain_only" name="{name}_same_domain_only" value="1">
<span>Same domain only</span>
</label>
<div class="help-text">These values can be one regex pattern or domain per line. URL_DENYLIST takes precedence over URL_ALLOWLIST.</div>
<script>
(function() {{
var allowlistField = document.getElementById('{widget_id}_allowlist');
var denylistField = document.getElementById('{widget_id}_denylist');
var sameDomainOnly = document.getElementById('{widget_id}_same_domain_only');
var sourceField = document.querySelector({json.dumps(self.source_selector)});
var lastAutoGeneratedAllowlist = '';
if (!allowlistField || !sameDomainOnly || !sourceField) {{
return;
}}
function extractUrl(line) {{
var trimmed = String(line || '').trim();
if (!trimmed || trimmed.charAt(0) === '#') {{
return '';
}}
if (trimmed.charAt(0) === '{{') {{
try {{
var record = JSON.parse(trimmed);
return String(record.url || '').trim();
}} catch (error) {{
return '';
}}
}}
return trimmed;
}}
function escapeRegex(text) {{
return String(text || '').replace(/[.*+?^${{}}()|[\\]\\\\]/g, '\\\\$&');
}}
function buildHostRegex(domains) {{
if (!domains.length) {{
return '';
}}
return '^https?://(' + domains.map(escapeRegex).join('|') + ')([:/]|$)';
}}
function getConfigEditorRows() {{
return document.getElementById('id_config_rows');
}}
function getConfigUpdater() {{
return window.updateHiddenField_id_config || null;
}}
function findConfigRow(key) {{
var rows = getConfigEditorRows();
if (!rows) {{
return null;
}}
var matches = Array.prototype.filter.call(rows.querySelectorAll('.key-value-row'), function(row) {{
var keyInput = row.querySelector('.kv-key');
return keyInput && keyInput.value.trim() === key;
}});
return matches.length ? matches[0] : null;
}}
function addConfigRow() {{
if (typeof window.addKeyValueRow_id_config === 'function') {{
window.addKeyValueRow_id_config();
var rows = getConfigEditorRows();
return rows ? rows.lastElementChild : null;
}}
return null;
}}
function setConfigRow(key, value) {{
var rows = getConfigEditorRows();
var updater = getConfigUpdater();
if (!rows || !updater) {{
return;
}}
var row = findConfigRow(key);
if (!value) {{
if (row) {{
row.remove();
updater();
}}
return;
}}
if (!row) {{
row = addConfigRow();
}}
if (!row) {{
return;
}}
var keyInput = row.querySelector('.kv-key');
var valueInput = row.querySelector('.kv-value');
if (!keyInput || !valueInput) {{
return;
}}
keyInput.value = key;
valueInput.value = value;
keyInput.dispatchEvent(new Event('input', {{ bubbles: true }}));
valueInput.dispatchEvent(new Event('input', {{ bubbles: true }}));
updater();
}}
function syncConfigEditor() {{
setConfigRow('URL_ALLOWLIST', allowlistField.value.trim());
setConfigRow('URL_DENYLIST', denylistField ? denylistField.value.trim() : '');
}}
function syncAllowlistFromUrls() {{
if (!sameDomainOnly.checked) {{
if (allowlistField.value.trim() === lastAutoGeneratedAllowlist) {{
allowlistField.value = '';
syncConfigEditor();
}}
lastAutoGeneratedAllowlist = '';
return;
}}
var seen = Object.create(null);
var domains = [];
sourceField.value.split(/\\n+/).forEach(function(line) {{
var url = extractUrl(line);
if (!url) {{
return;
}}
try {{
var parsed = new URL(url);
var domain = String(parsed.hostname || '').toLowerCase();
if (!domain || seen[domain]) {{
return;
}}
seen[domain] = true;
domains.push(domain);
}} catch (error) {{
return;
}}
}});
lastAutoGeneratedAllowlist = buildHostRegex(domains);
allowlistField.value = lastAutoGeneratedAllowlist;
syncConfigEditor();
}}
sameDomainOnly.addEventListener('change', syncAllowlistFromUrls);
sourceField.addEventListener('input', syncAllowlistFromUrls);
sourceField.addEventListener('change', syncAllowlistFromUrls);
allowlistField.addEventListener('input', syncConfigEditor);
allowlistField.addEventListener('change', syncConfigEditor);
if (denylistField) {{
denylistField.addEventListener('input', syncConfigEditor);
denylistField.addEventListener('change', syncConfigEditor);
}}
if (document.readyState === 'loading') {{
document.addEventListener('DOMContentLoaded', syncConfigEditor, {{ once: true }});
}} else {{
syncConfigEditor();
}}
}})();
</script>
</div>
''')
def value_from_datadict(self, data, files, name):
return {
'allowlist': data.get(f'{name}_allowlist', ''),
'denylist': data.get(f'{name}_denylist', ''),
'same_domain_only': data.get(f'{name}_same_domain_only') in ('1', 'on', 'true'),
}
class InlineTagEditorWidget(TagEditorWidget):
"""
Inline version of TagEditorWidget for use in list views.
Includes AJAX save functionality for immediate persistence.
"""
def __init__(self, attrs=None, snapshot_id=None):
def __init__(self, attrs=None, snapshot_id=None, editable=True):
super().__init__(attrs, snapshot_id)
self.snapshot_id = snapshot_id
self.editable = editable
def render(self, name, value, attrs=None, renderer=None, snapshot_id=None):
"""Render inline tag editor with AJAX save."""
@@ -361,20 +625,24 @@ class InlineTagEditorWidget(TagEditorWidget):
# Build pills HTML with filter links
pills_html = ''
for td in tag_data:
remove_button = ''
if self.editable:
remove_button = (
f'<button type="button" class="tag-remove-btn" '
f'data-tag-id="{td["id"]}" data-tag-name="{self._escape(td["name"])}">&times;</button>'
)
pills_html += f'''
<span class="tag-pill" data-tag="{self._escape(td['name'])}" data-tag-id="{td['id']}" style="{self._tag_style(td['name'])}">
<a href="/admin/core/snapshot/?tags__id__exact={td['id']}" class="tag-link">{self._escape(td['name'])}</a>
<button type="button" class="tag-remove-btn" data-tag-id="{td['id']}" data-tag-name="{self._escape(td['name'])}">&times;</button>
{remove_button}
</span>
'''
tags_json = escape(json.dumps(tag_data))
html = f'''
<span id="{widget_id}_container" class="tag-editor-inline" data-snapshot-id="{snapshot_id}" data-tags="{tags_json}">
<span id="{widget_id}_pills" class="tag-pills-inline">
{pills_html}
</span>
input_html = ''
readonly_class = ' readonly' if not self.editable else ''
if self.editable:
input_html = f'''
<input type="text"
id="{widget_id}_input"
class="tag-inline-input-sm"
@@ -384,6 +652,14 @@ class InlineTagEditorWidget(TagEditorWidget):
data-inline-tag-input="1"
>
<datalist id="{widget_id}_datalist"></datalist>
'''
html = f'''
<span id="{widget_id}_container" class="tag-editor-inline{readonly_class}" data-snapshot-id="{snapshot_id}" data-tags="{tags_json}" data-readonly="{int(not self.editable)}">
<span id="{widget_id}_pills" class="tag-pills-inline">
{pills_html}
</span>
{input_html}
</span>
'''

View File

@@ -1,8 +1,11 @@
__package__ = 'archivebox.crawls'
from django import forms
from django.utils.html import format_html, format_html_join
from django.http import JsonResponse, HttpRequest, HttpResponseNotAllowed
from django.shortcuts import get_object_or_404, redirect
from django.urls import path, reverse
from django.utils.html import escape, format_html, format_html_join
from django.utils import timezone
from django.utils.safestring import mark_safe
from django.contrib import admin, messages
from django.db.models import Count, Q
@@ -13,16 +16,19 @@ from django_object_actions import action
from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin
from archivebox.core.models import Snapshot
from archivebox.core.widgets import TagEditorWidget
from archivebox.crawls.models import Crawl, CrawlSchedule
def render_snapshots_list(snapshots_qs, limit=20):
def render_snapshots_list(snapshots_qs, limit=20, crawl=None):
"""Render a nice inline list view of snapshots with status, title, URL, and progress."""
snapshots = snapshots_qs.order_by('-created_at')[:limit].annotate(
total_results=Count('archiveresult'),
succeeded_results=Count('archiveresult', filter=Q(archiveresult__status='succeeded')),
failed_results=Count('archiveresult', filter=Q(archiveresult__status='failed')),
started_results=Count('archiveresult', filter=Q(archiveresult__status='started')),
skipped_results=Count('archiveresult', filter=Q(archiveresult__status='skipped')),
)
if not snapshots:
@@ -43,17 +49,57 @@ def render_snapshots_list(snapshots_qs, limit=20):
# Calculate progress
total = snapshot.total_results
done = snapshot.succeeded_results + snapshot.failed_results
succeeded = snapshot.succeeded_results
failed = snapshot.failed_results
running = snapshot.started_results
skipped = snapshot.skipped_results
done = succeeded + failed + skipped
pending = max(total - done - running, 0)
progress_pct = int((done / total) * 100) if total > 0 else 0
progress_text = f'{done}/{total}' if total > 0 else '-'
progress_title = (
f'{succeeded} succeeded, {failed} failed, {running} running, '
f'{pending} pending, {skipped} skipped'
)
progress_color = '#28a745'
if failed:
progress_color = '#dc3545'
elif running:
progress_color = '#17a2b8'
elif pending:
progress_color = '#ffc107'
# Truncate title and URL
title = (snapshot.title or 'Untitled')[:60]
if len(snapshot.title or '') > 60:
snapshot_title = snapshot.title or 'Untitled'
title = snapshot_title[:60]
if len(snapshot_title) > 60:
title += '...'
url_display = snapshot.url[:50]
if len(snapshot.url) > 50:
url_display += '...'
delete_button = ''
exclude_button = ''
if crawl is not None:
delete_url = reverse('admin:crawls_crawl_snapshot_delete', args=[crawl.pk, snapshot.pk])
exclude_url = reverse('admin:crawls_crawl_snapshot_exclude_domain', args=[crawl.pk, snapshot.pk])
delete_button = f'''
<button type="button"
class="crawl-snapshots-action"
data-post-url="{escape(delete_url)}"
data-confirm="Delete this snapshot from the crawl?"
title="Delete this snapshot from the crawl and remove its URL from the crawl queue."
aria-label="Delete snapshot"
style="border: 1px solid #ddd; background: #fff; color: #666; border-radius: 4px; width: 28px; height: 28px; cursor: pointer;">🗑</button>
'''
exclude_button = f'''
<button type="button"
class="crawl-snapshots-action"
data-post-url="{escape(exclude_url)}"
data-confirm="Exclude this domain from the crawl? This removes matching queued URLs, deletes pending matching snapshots, and blocks future matches."
title="Exclude this domain from this crawl. This removes matching URLs from the crawl queue, deletes pending matching snapshots, and blocks future matches."
aria-label="Exclude domain from crawl"
style="border: 1px solid #ddd; background: #fff; color: #666; border-radius: 4px; width: 28px; height: 28px; cursor: pointer;">⊘</button>
'''
# Format date
date_str = snapshot.created_at.strftime('%Y-%m-%d %H:%M') if snapshot.created_at else '-'
@@ -74,18 +120,18 @@ def render_snapshots_list(snapshots_qs, limit=20):
</td>
<td style="padding: 6px 8px; max-width: 300px;">
<a href="{snapshot.admin_change_url}" style="color: #417690; text-decoration: none; font-weight: 500;"
title="{snapshot.title or 'Untitled'}">{title}</a>
title="{escape(snapshot_title)}">{escape(title)}</a>
</td>
<td style="padding: 6px 8px; max-width: 250px;">
<a href="{snapshot.url}" target="_blank"
<a href="{escape(snapshot.url)}" target="_blank"
style="color: #666; text-decoration: none; font-family: monospace; font-size: 11px;"
title="{snapshot.url}">{url_display}</a>
title="{escape(snapshot.url)}">{escape(url_display)}</a>
</td>
<td style="padding: 6px 8px; white-space: nowrap; text-align: center;">
<div style="display: inline-flex; align-items: center; gap: 6px;">
<div style="display: inline-flex; align-items: center; gap: 6px;" title="{escape(progress_title)}">
<div style="width: 60px; height: 6px; background: #eee; border-radius: 3px; overflow: hidden;">
<div style="width: {progress_pct}%; height: 100%;
background: {'#28a745' if snapshot.failed_results == 0 else '#ffc107' if snapshot.succeeded_results > 0 else '#dc3545'};
background: {progress_color};
transition: width 0.3s;"></div>
</div>
<a href="/admin/core/archiveresult/?snapshot__id__exact={snapshot.id}"
@@ -96,6 +142,7 @@ def render_snapshots_list(snapshots_qs, limit=20):
<td style="padding: 6px 8px; white-space: nowrap; color: #888; font-size: 11px;">
{date_str}
</td>
{"<td style=\"padding: 6px 8px; white-space: nowrap; text-align: right;\"><div style=\"display: inline-flex; gap: 6px;\">%s%s</div></td>" % (exclude_button, delete_button) if crawl is not None else ""}
</tr>
''')
@@ -111,7 +158,7 @@ def render_snapshots_list(snapshots_qs, limit=20):
'''
return mark_safe(f'''
<div style="border: 1px solid #ddd; border-radius: 6px; overflow: hidden; max-width: 100%;">
<div data-crawl-snapshots-list style="border: 1px solid #ddd; border-radius: 6px; overflow: hidden; max-width: 100%;">
<table style="width: 100%; border-collapse: collapse; font-size: 13px;">
<thead>
<tr style="background: #f5f5f5; border-bottom: 2px solid #ddd;">
@@ -121,6 +168,7 @@ def render_snapshots_list(snapshots_qs, limit=20):
<th style="padding: 8px; text-align: left; font-weight: 600; color: #333;">URL</th>
<th style="padding: 8px; text-align: center; font-weight: 600; color: #333;">Progress</th>
<th style="padding: 8px; text-align: left; font-weight: 600; color: #333;">Created</th>
{'<th style="padding: 8px; text-align: right; font-weight: 600; color: #333;">Actions</th>' if crawl is not None else ''}
</tr>
</thead>
<tbody>
@@ -129,11 +177,197 @@ def render_snapshots_list(snapshots_qs, limit=20):
</tbody>
</table>
</div>
{'''
<script>
(function() {
if (window.__archiveboxCrawlSnapshotActionsBound) {
return;
}
window.__archiveboxCrawlSnapshotActionsBound = true;
function getCookie(name) {
var cookieValue = null;
if (!document.cookie) {
return cookieValue;
}
var cookies = document.cookie.split(';');
for (var i = 0; i < cookies.length; i++) {
var cookie = cookies[i].trim();
if (cookie.substring(0, name.length + 1) === (name + '=')) {
cookieValue = decodeURIComponent(cookie.substring(name.length + 1));
break;
}
}
return cookieValue;
}
document.addEventListener('click', function(event) {
var button = event.target.closest('.crawl-snapshots-action');
if (!button) {
return;
}
event.preventDefault();
var confirmMessage = button.getAttribute('data-confirm');
if (confirmMessage && !window.confirm(confirmMessage)) {
return;
}
button.disabled = true;
fetch(button.getAttribute('data-post-url'), {
method: 'POST',
credentials: 'same-origin',
headers: {
'X-CSRFToken': getCookie('csrftoken') || '',
'X-Requested-With': 'XMLHttpRequest'
}
}).then(function(response) {
return response.json().then(function(data) {
if (!response.ok) {
throw new Error(data.error || 'Request failed');
}
return data;
});
}).then(function() {
window.location.reload();
}).catch(function(error) {
button.disabled = false;
window.alert(error.message || 'Request failed');
});
});
})();
</script>
''' if crawl is not None else ''}
''')
class URLFiltersWidget(forms.Widget):
    """Paired allowlist/denylist editor for per-crawl URL filters.

    Renders two <textarea>s plus a "Same domain only" checkbox. Inline JS
    mirrors the hostnames of URLs typed into the separate #id_urls field into
    the allowlist textarea while the checkbox is checked. The posted value is
    reassembled into a dict by value_from_datadict().
    """

    def render(self, name, value, attrs=None, renderer=None):
        # value is normally the dict built by value_from_datadict(); any other
        # initial value (e.g. '' or None) is treated as empty filters.
        value = value if isinstance(value, dict) else {}
        widget_id = (attrs or {}).get('id', name)
        # escape(): pattern text is user input rendered into HTML textareas.
        allowlist = escape(value.get('allowlist', '') or '')
        denylist = escape(value.get('denylist', '') or '')
        # NOTE: the hidden input posts an empty value under the bare field
        # name; value_from_datadict() only reads the *_allowlist/_denylist/
        # _same_domain_only keys.
        return mark_safe(f'''
            <div id="{widget_id}_container" style="min-width: 420px;">
                <input type="hidden" name="{name}" value="">
                <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 10px;">
                    <div>
                        <label for="{widget_id}_allowlist" style="display: block; font-weight: 600; margin-bottom: 4px;">Allowlist</label>
                        <textarea id="{widget_id}_allowlist" name="{name}_allowlist" rows="3"
                                  style="width: 100%; font-family: monospace; font-size: 12px;"
                                  placeholder="example.com&#10;*.example.com">{allowlist}</textarea>
                    </div>
                    <div>
                        <label for="{widget_id}_denylist" style="display: block; font-weight: 600; margin-bottom: 4px;">Denylist</label>
                        <textarea id="{widget_id}_denylist" name="{name}_denylist" rows="3"
                                  style="width: 100%; font-family: monospace; font-size: 12px;"
                                  placeholder="static.example.com">{denylist}</textarea>
                    </div>
                </div>
                <label style="display: inline-flex; align-items: center; gap: 6px; margin-top: 8px; font-weight: 500;">
                    <input type="checkbox" id="{widget_id}_same_domain_only" name="{name}_same_domain_only" value="1">
                    Same domain only
                </label>
                <p style="color: #666; font-size: 11px; margin: 6px 0 0 0;">
                    Enter domains, wildcards, or regex patterns. Denylist takes precedence over allowlist.
                </p>
                <script>
                (function() {{
                    if (window.__archiveboxUrlFilterEditors && window.__archiveboxUrlFilterEditors['{widget_id}']) {{
                        return;
                    }}
                    window.__archiveboxUrlFilterEditors = window.__archiveboxUrlFilterEditors || {{}};
                    window.__archiveboxUrlFilterEditors['{widget_id}'] = true;
                    var urlsField = document.getElementById('id_urls');
                    var allowlistField = document.getElementById('{widget_id}_allowlist');
                    var sameDomainOnly = document.getElementById('{widget_id}_same_domain_only');
                    function extractUrl(line) {{
                        var trimmed = (line || '').trim();
                        if (!trimmed || trimmed.charAt(0) === '#') {{
                            return '';
                        }}
                        if (trimmed.charAt(0) === '{{') {{
                            try {{
                                var record = JSON.parse(trimmed);
                                return String(record.url || '').trim();
                            }} catch (error) {{
                                return '';
                            }}
                        }}
                        return trimmed;
                    }}
                    function syncAllowlistFromUrls() {{
                        if (!urlsField || !allowlistField || !sameDomainOnly || !sameDomainOnly.checked) {{
                            return;
                        }}
                        var domains = [];
                        var seen = Object.create(null);
                        urlsField.value.split(/\\n+/).forEach(function(line) {{
                            var url = extractUrl(line);
                            if (!url) {{
                                return;
                            }}
                            try {{
                                var parsed = new URL(url);
                                var domain = (parsed.hostname || '').toLowerCase();
                                if (domain && !seen[domain]) {{
                                    seen[domain] = true;
                                    domains.push(domain);
                                }}
                            }} catch (error) {{
                                return;
                            }}
                        }});
                        allowlistField.value = domains.join('\\n');
                    }}
                    if (sameDomainOnly) {{
                        sameDomainOnly.addEventListener('change', syncAllowlistFromUrls);
                    }}
                    if (urlsField) {{
                        urlsField.addEventListener('input', syncAllowlistFromUrls);
                        urlsField.addEventListener('change', syncAllowlistFromUrls);
                    }}
                }})();
                </script>
            </div>
        ''')

    def value_from_datadict(self, data, files, name):
        """Reassemble the three sub-inputs into the dict this field works with."""
        return {
            'allowlist': data.get(f'{name}_allowlist', ''),
            'denylist': data.get(f'{name}_denylist', ''),
            # Checkbox posts '1' when checked; browsers may also send 'on'.
            'same_domain_only': data.get(f'{name}_same_domain_only') in ('1', 'on', 'true'),
        }
class URLFiltersField(forms.Field):
    """Form field backed by URLFiltersWidget; its value is always a dict."""

    widget = URLFiltersWidget

    def to_python(self, value):
        """Pass dict values through; coerce anything else to empty filters."""
        if not isinstance(value, dict):
            return {'allowlist': '', 'denylist': '', 'same_domain_only': False}
        return value
class CrawlAdminForm(forms.ModelForm):
"""Custom form for Crawl admin to render urls field as textarea."""
tags_editor = forms.CharField(
label='Tags',
required=False,
widget=TagEditorWidget(),
help_text='Type tag names and press Enter or Space to add. Click × to remove.',
)
url_filters = URLFiltersField(
label='URL Filters',
required=False,
help_text='Set URL_ALLOWLIST / URL_DENYLIST for this crawl.',
)
class Meta:
model = Crawl
@@ -144,8 +378,62 @@ class CrawlAdminForm(forms.ModelForm):
'style': 'width: 100%; font-family: monospace; font-size: 13px;',
'placeholder': 'https://example.com\nhttps://example2.com\n# Comments start with #',
}),
'notes': forms.Textarea(attrs={
'rows': 1,
'style': 'width: 100%; min-height: 0; resize: vertical;',
}),
}
def __init__(self, *args, **kwargs):
    """Seed the custom tags/url-filter form fields from a saved instance."""
    super().__init__(*args, **kwargs)
    # Only a persisted instance has config/tags worth mirroring into initial.
    config = dict(self.instance.config or {}) if self.instance and self.instance.pk else {}
    if self.instance and self.instance.pk:
        self.initial['tags_editor'] = self.instance.tags_str
        self.initial['url_filters'] = {
            'allowlist': config.get('URL_ALLOWLIST', ''),
            'denylist': config.get('URL_DENYLIST', ''),
            # UI-only toggle; always starts unchecked on the edit form.
            'same_domain_only': False,
        }
def clean_tags_editor(self):
    """Dedupe the comma-separated tag list case-insensitively, keeping order.

    The first spelling of each tag wins (e.g. 'Foo' beats a later 'foo').
    """
    raw = self.cleaned_data.get('tags_editor', '')
    unique: list[str] = []
    lowered_seen: set[str] = set()
    for candidate in raw.split(','):
        candidate = candidate.strip()
        if not candidate:
            continue
        key = candidate.lower()
        if key not in lowered_seen:
            lowered_seen.add(key)
            unique.append(candidate)
    return ','.join(unique)
def clean_url_filters(self):
    """Normalize the widget dict into newline-joined, deduped pattern strings."""
    value = self.cleaned_data.get('url_filters') or {}
    return {
        'allowlist': '\n'.join(Crawl.split_filter_patterns(value.get('allowlist', ''))),
        'denylist': '\n'.join(Crawl.split_filter_patterns(value.get('denylist', ''))),
        # UI-only flag; save() does not pass it to set_url_filters().
        'same_domain_only': bool(value.get('same_domain_only')),
    }
def save(self, commit=True):
    """Persist the crawl plus the custom tags/url-filter fields.

    Mirrors ModelForm.save() semantics: with commit=False the caller must
    save the instance (and m2m relations) later.
    """
    instance = super().save(commit=False)
    instance.tags_str = self.cleaned_data.get('tags_editor', '')
    url_filters = self.cleaned_data.get('url_filters') or {}
    instance.set_url_filters(
        url_filters.get('allowlist', ''),
        url_filters.get('denylist', ''),
    )
    if commit:
        instance.save()
        # Re-apply filters so already-queued URLs/snapshots honor the new lists.
        instance.apply_crawl_config_filters()
        # _save_m2m is set by super().save(commit=False); guard in case it isn't.
        save_m2m = getattr(self, '_save_m2m', None)
        if callable(save_m2m):
            save_m2m()
    return instance
class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
form = CrawlAdminForm
@@ -161,11 +449,11 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
'classes': ('card', 'wide'),
}),
('Info', {
'fields': ('label', 'notes', 'tags_str'),
'fields': ('label', 'notes', 'tags_editor'),
'classes': ('card',),
}),
('Settings', {
'fields': ('max_depth', 'config'),
'fields': (('max_depth', 'url_filters'), 'config'),
'classes': ('card',),
}),
('Status', {
@@ -185,6 +473,28 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
'classes': ('card', 'wide'),
}),
)
add_fieldsets = (
('URLs', {
'fields': ('urls',),
'classes': ('card', 'wide'),
}),
('Info', {
'fields': ('label', 'notes', 'tags_editor'),
'classes': ('card',),
}),
('Settings', {
'fields': (('max_depth', 'url_filters'), 'config'),
'classes': ('card',),
}),
('Status', {
'fields': ('status', 'retry_at'),
'classes': ('card',),
}),
('Relations', {
'fields': ('schedule', 'created_by'),
'classes': ('card',),
}),
)
list_filter = ('max_depth', 'schedule', 'created_by', 'status', 'retry_at')
ordering = ['-created_at', '-retry_at']
@@ -199,6 +509,25 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
num_snapshots_cached=Count('snapshot_set')
)
def get_fieldsets(self, request, obj=None):
    """Use the trimmed add-form layout until the object exists."""
    if obj:
        return self.fieldsets
    return self.add_fieldsets
def get_urls(self):
    """Register per-snapshot delete / exclude-domain admin endpoints.

    Custom URLs are prepended so they take precedence over the default
    '<object_id>/...' admin change patterns.
    """
    urls = super().get_urls()
    custom_urls = [
        path(
            '<path:object_id>/snapshot/<path:snapshot_id>/delete/',
            self.admin_site.admin_view(self.delete_snapshot_view),
            name='crawls_crawl_snapshot_delete',
        ),
        path(
            '<path:object_id>/snapshot/<path:snapshot_id>/exclude-domain/',
            self.admin_site.admin_view(self.exclude_domain_view),
            name='crawls_crawl_snapshot_exclude_domain',
        ),
    ]
    return custom_urls + urls
@admin.action(description='Delete selected crawls')
def delete_selected_batched(self, request, queryset):
"""Delete crawls in a single transaction to avoid SQLite concurrency issues."""
@@ -218,8 +547,6 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
@action(label='Recrawl', description='Create a new crawl with the same settings')
def recrawl(self, request, obj):
"""Duplicate this crawl as a new crawl with the same URLs and settings."""
from django.utils import timezone
from django.shortcuts import redirect
# Validate URLs (required for crawl to start)
if not obj.urls:
@@ -252,7 +579,37 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
return getattr(obj, 'num_snapshots_cached', obj.snapshot_set.count())
def snapshots(self, obj):
return render_snapshots_list(obj.snapshot_set.all())
return render_snapshots_list(obj.snapshot_set.all(), crawl=obj)
def delete_snapshot_view(self, request: HttpRequest, object_id: str, snapshot_id: str):
    """POST-only admin endpoint: remove one snapshot from a crawl.

    Cancels running hooks if the snapshot is STARTED, removes its URL from
    the crawl queue, deletes the snapshot, and returns a JSON summary.
    404s if the snapshot does not belong to this crawl.
    """
    if request.method != 'POST':
        return HttpResponseNotAllowed(['POST'])
    crawl = get_object_or_404(Crawl, pk=object_id)
    snapshot = get_object_or_404(Snapshot, pk=snapshot_id, crawl=crawl)
    if snapshot.status == Snapshot.StatusChoices.STARTED:
        snapshot.cancel_running_hooks()
    removed_urls = crawl.prune_url(snapshot.url)
    # BUGFIX: capture the id before delete() — Django resets the instance's
    # pk attribute to None after deletion, which made the response report
    # 'snapshot_id': 'None'.
    deleted_snapshot_id = str(snapshot.id)
    snapshot.delete()
    return JsonResponse({
        'ok': True,
        'snapshot_id': deleted_snapshot_id,
        'removed_urls': removed_urls,
    })
def exclude_domain_view(self, request: HttpRequest, object_id: str, snapshot_id: str):
    """POST-only admin endpoint: exclude a snapshot's domain from its crawl.

    Delegates to Crawl.exclude_domain(), which adds the domain to the crawl's
    denylist and prunes matching queued URLs/snapshots. 404s if the snapshot
    does not belong to this crawl.
    """
    if request.method != 'POST':
        return HttpResponseNotAllowed(['POST'])
    crawl = get_object_or_404(Crawl, pk=object_id)
    snapshot = get_object_or_404(Snapshot, pk=snapshot_id, crawl=crawl)
    result = crawl.exclude_domain(snapshot.url)
    # result carries domain/created/removed_urls/deleted_snapshots counts.
    return JsonResponse({
        'ok': True,
        **result,
    })
@admin.display(description='Schedule', ordering='schedule')
def schedule_str(self, obj):

View File

@@ -2,9 +2,12 @@ __package__ = 'archivebox.crawls'
from typing import TYPE_CHECKING
import uuid
import json
import re
from datetime import timedelta
from archivebox.uuid_compat import uuid7
from pathlib import Path
from urllib.parse import urlparse
from django.db import models
from django.core.validators import MaxValueValidator, MinValueValidator
@@ -141,22 +144,21 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
return f'[...{short_id}] {first_url[:120]}'
def save(self, *args, **kwargs):
is_new = self._state.adding
super().save(*args, **kwargs)
if is_new:
from archivebox.misc.logging_util import log_worker_event
first_url = self.get_urls_list()[0] if self.get_urls_list() else ''
log_worker_event(
worker_type='DB',
event='Created Crawl',
indent_level=1,
metadata={
'id': str(self.id),
'first_url': first_url[:64],
'max_depth': self.max_depth,
'status': self.status,
},
)
# if is_new:
# from archivebox.misc.logging_util import log_worker_event
# first_url = self.get_urls_list()[0] if self.get_urls_list() else ''
# log_worker_event(
# worker_type='DB',
# event='Created Crawl',
# indent_level=1,
# metadata={
# 'id': str(self.id),
# 'first_url': first_url[:64],
# 'max_depth': self.max_depth,
# 'status': self.status,
# },
# )
@property
def api_url(self) -> str:
@@ -248,6 +250,222 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
if url.strip() and not url.strip().startswith('#')
]
@staticmethod
def normalize_domain(value: str) -> str:
candidate = (value or '').strip().lower()
if not candidate:
return ''
if '://' not in candidate and '/' not in candidate:
candidate = f'https://{candidate.lstrip(".")}'
try:
parsed = urlparse(candidate)
hostname = parsed.hostname or ''
if not hostname:
return ''
if parsed.port:
return f'{hostname}_{parsed.port}'
return hostname
except Exception:
return ''
@staticmethod
def split_filter_patterns(value) -> list[str]:
patterns = []
seen = set()
if isinstance(value, list):
raw_values = value
elif isinstance(value, str):
raw_values = value.splitlines()
else:
raw_values = []
for raw_value in raw_values:
pattern = str(raw_value or '').strip()
if not pattern or pattern in seen:
continue
seen.add(pattern)
patterns.append(pattern)
return patterns
@classmethod
def _pattern_matches_url(cls, url: str, pattern: str) -> bool:
    """Return True if `pattern` matches `url`.

    Patterns made only of word chars / '.' / '*' / ':' / '-' are treated as
    domain patterns matched against the URL's host; anything else is treated
    as a regular expression searched against the full URL. Invalid regexes
    and unparseable hosts never match.
    """
    normalized_pattern = str(pattern or '').strip()
    if not normalized_pattern:
        return False
    # Domain-style pattern (no regex metacharacters beyond '*').
    if re.fullmatch(r'[\w.*:-]+', normalized_pattern):
        # '*.example.com' matches subdomains only, not the apex domain.
        wildcard_only_subdomains = normalized_pattern.startswith('*.')
        normalized_domain = cls.normalize_domain(
            normalized_pattern[2:] if wildcard_only_subdomains else normalized_pattern
        )
        normalized_url_domain = cls.normalize_domain(url)
        if not normalized_domain or not normalized_url_domain:
            return False
        # normalize_domain() encodes ports as 'host_port'; compare hosts
        # separately so 'example.com' still matches 'example.com_8080'.
        pattern_host = normalized_domain.split('_', 1)[0]
        url_host = normalized_url_domain.split('_', 1)[0]
        if wildcard_only_subdomains:
            return url_host.endswith(f'.{pattern_host}')
        # Exact host(+port) match first, then apex/subdomain match ignoring ports.
        if normalized_url_domain == normalized_domain:
            return True
        return url_host == pattern_host or url_host.endswith(f'.{pattern_host}')
    # Fallback: treat the pattern as a regex over the whole URL.
    try:
        return bool(re.search(normalized_pattern, url))
    except re.error:
        return False
def get_url_allowlist(self, *, use_effective_config: bool = False, snapshot=None) -> list[str]:
    """Return URL_ALLOWLIST patterns from this crawl's config.

    With use_effective_config=True, read from the merged effective config
    (global + crawl + snapshot) instead of the crawl's own config dict.
    """
    if use_effective_config:
        from archivebox.config.configset import get_config
        source = get_config(crawl=self, snapshot=snapshot)
    else:
        source = self.config or {}
    return self.split_filter_patterns(source.get('URL_ALLOWLIST', ''))
def get_url_denylist(self, *, use_effective_config: bool = False, snapshot=None) -> list[str]:
    """Return URL_DENYLIST patterns from this crawl's config.

    With use_effective_config=True, read from the merged effective config
    (global + crawl + snapshot) instead of the crawl's own config dict.
    """
    if use_effective_config:
        from archivebox.config.configset import get_config
        source = get_config(crawl=self, snapshot=snapshot)
    else:
        source = self.config or {}
    return self.split_filter_patterns(source.get('URL_DENYLIST', ''))
def url_passes_filters(self, url: str, *, snapshot=None, use_effective_config: bool = True) -> bool:
    """Check `url` against this crawl's deny/allow lists.

    Deny always wins; an empty allowlist means allow-everything-not-denied.
    """
    deny = self.get_url_denylist(use_effective_config=use_effective_config, snapshot=snapshot)
    if any(self._pattern_matches_url(url, pattern) for pattern in deny):
        return False
    allow = self.get_url_allowlist(use_effective_config=use_effective_config, snapshot=snapshot)
    if not allow:
        return True
    return any(self._pattern_matches_url(url, pattern) for pattern in allow)
def set_url_filters(self, allowlist, denylist) -> None:
    """Write URL_ALLOWLIST/URL_DENYLIST into self.config (no DB save here).

    Empty pattern lists remove the corresponding key entirely so the config
    dict stays minimal.
    """
    updated = dict(self.config or {})
    for key, raw_patterns in (('URL_ALLOWLIST', allowlist), ('URL_DENYLIST', denylist)):
        patterns = self.split_filter_patterns(raw_patterns)
        if patterns:
            updated[key] = '\n'.join(patterns)
        else:
            updated.pop(key, None)
    self.config = updated
def apply_crawl_config_filters(self) -> dict[str, int]:
    """Re-apply this crawl's own URL filters to its queue and snapshots.

    Removes queued URLs that no longer pass the filters, then deletes any
    QUEUED/STARTED snapshots whose URL is now filtered out (cancelling
    running hooks first). Finished snapshots are left alone. Uses only the
    crawl's own config (use_effective_config=False), not the merged config.

    Returns:
        {'removed_urls': int, 'deleted_snapshots': int}
    """
    from archivebox.core.models import Snapshot

    removed_urls = self.prune_urls(
        lambda url: not self.url_passes_filters(url, use_effective_config=False)
    )
    # Only queued/started snapshots are candidates for deletion.
    filtered_snapshots = [
        snapshot
        for snapshot in self.snapshot_set.filter(
            status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED],
        ).only('pk', 'url', 'status')
        if not self.url_passes_filters(snapshot.url, snapshot=snapshot, use_effective_config=False)
    ]
    deleted_snapshots = 0
    if filtered_snapshots:
        # Stop in-flight work before deleting STARTED snapshots.
        started_snapshots = [
            snapshot for snapshot in filtered_snapshots
            if snapshot.status == Snapshot.StatusChoices.STARTED
        ]
        for snapshot in started_snapshots:
            snapshot.cancel_running_hooks()
        filtered_snapshot_ids = [snapshot.pk for snapshot in filtered_snapshots]
        # BUGFIX: QuerySet.delete() returns (total_rows, per_model_counts);
        # total_rows includes cascade-deleted related rows (ArchiveResults,
        # M2M through rows, ...), which inflated the reported count. Use the
        # per-model count for the Snapshot model itself.
        _total_deleted, per_model_counts = self.snapshot_set.filter(pk__in=filtered_snapshot_ids).delete()
        deleted_snapshots = per_model_counts.get(Snapshot._meta.label, 0)
    return {
        'removed_urls': len(removed_urls),
        'deleted_snapshots': deleted_snapshots,
    }
def _iter_url_lines(self) -> list[tuple[str, str]]:
entries: list[tuple[str, str]] = []
for raw_line in (self.urls or '').splitlines():
stripped = raw_line.strip()
if not stripped:
continue
if stripped.startswith('#'):
entries.append((raw_line.rstrip(), ''))
continue
try:
entry = json.loads(stripped)
entries.append((raw_line.rstrip(), str(entry.get('url', '') or '').strip()))
except json.JSONDecodeError:
entries.append((raw_line.rstrip(), stripped))
return entries
def prune_urls(self, predicate) -> list[str]:
    """Drop URL lines matching `predicate`; return the removed URLs.

    Comment lines and lines without a parseable URL are always kept. The
    rewritten self.urls is only persisted when it actually changed.
    """
    surviving: list[str] = []
    removed: list[str] = []
    for raw_line, url in self._iter_url_lines():
        if url and predicate(url):
            removed.append(url)
        else:
            surviving.append(raw_line)
    rebuilt = '\n'.join(surviving)
    if rebuilt != (self.urls or ''):
        self.urls = rebuilt
        self.save(update_fields=['urls', 'modified_at'])
    return removed
def prune_url(self, url: str) -> int:
    """Remove one exact URL from the crawl queue; return how many were removed."""
    wanted = (url or '').strip()
    return len(self.prune_urls(lambda candidate: candidate == wanted))
def exclude_domain(self, domain: str) -> dict[str, int | str | bool]:
    """Add a domain to this crawl's denylist and prune matching work.

    Accepts a bare domain or a full URL. Returns the normalized domain,
    whether a new denylist entry was created, and the counts of pruned
    queued URLs and deleted snapshots.
    """
    normalized_domain = self.normalize_domain(domain)
    if not normalized_domain:
        # Unparseable input: report a no-op rather than raising.
        return {
            'domain': '',
            'created': False,
            'removed_urls': 0,
            'deleted_snapshots': 0,
        }
    domains = self.get_url_denylist(use_effective_config=False)
    created = normalized_domain not in domains
    if created:
        domains.append(normalized_domain)
        self.set_url_filters(
            self.get_url_allowlist(use_effective_config=False),
            domains,
        )
        self.save(update_fields=['config', 'modified_at'])
    # Prune even when the entry already existed, to catch stragglers.
    filter_result = self.apply_crawl_config_filters()
    return {
        'domain': normalized_domain,
        'created': created,
        'removed_urls': filter_result['removed_urls'],
        'deleted_snapshots': filter_result['deleted_snapshots'],
    }
def get_system_task(self) -> str | None:
urls = self.get_urls_list()
if len(urls) != 1:
@@ -284,11 +502,13 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
Returns:
True if URL was added, False if skipped (duplicate or depth exceeded)
"""
import json
from archivebox.misc.util import fix_url_from_markdown
url = entry.get('url', '')
url = fix_url_from_markdown(str(entry.get('url', '') or '').strip())
if not url:
return False
if not self.url_passes_filters(url):
return False
depth = entry.get('depth', 1)
@@ -301,20 +521,13 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
return False
# Check if already in urls (parse existing JSONL entries)
existing_urls = set()
for line in self.urls.splitlines():
if not line.strip():
continue
try:
existing_entry = json.loads(line)
existing_urls.add(existing_entry.get('url', ''))
except json.JSONDecodeError:
existing_urls.add(line.strip())
existing_urls = {url for _raw_line, url in self._iter_url_lines() if url}
if url in existing_urls:
return False
# Append as JSONL
entry = {**entry, 'url': url}
jsonl_entry = json.dumps(entry)
self.urls = (self.urls.rstrip() + '\n' + jsonl_entry).lstrip('\n')
self.save(update_fields=['urls', 'modified_at'])
@@ -327,15 +540,11 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
Returns:
List of newly created Snapshot objects
"""
import sys
import json
from archivebox.core.models import Snapshot
from archivebox.misc.util import fix_url_from_markdown
created_snapshots = []
print(f'[cyan]DEBUG create_snapshots_from_urls: self.urls={repr(self.urls)}[/cyan]', file=sys.stderr)
print(f'[cyan]DEBUG create_snapshots_from_urls: lines={self.urls.splitlines()}[/cyan]', file=sys.stderr)
for line in self.urls.splitlines():
if not line.strip():
continue
@@ -343,13 +552,13 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
# Parse JSONL or plain URL
try:
entry = json.loads(line)
url = entry.get('url', '')
url = fix_url_from_markdown(str(entry.get('url', '') or '').strip())
depth = entry.get('depth', 0)
title = entry.get('title')
timestamp = entry.get('timestamp')
tags = entry.get('tags', '')
except json.JSONDecodeError:
url = line.strip()
url = fix_url_from_markdown(line.strip())
depth = 0
title = None
timestamp = None
@@ -357,6 +566,8 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
if not url:
continue
if not self.url_passes_filters(url):
continue
# Skip if depth exceeds max_depth
if depth > self.max_depth:

View File

@@ -64,6 +64,7 @@ from abx_plugins import get_plugins_dir
from django.conf import settings
from django.utils.safestring import mark_safe
from archivebox.config.constants import CONSTANTS
from archivebox.misc.util import fix_url_from_markdown
if TYPE_CHECKING:
from archivebox.machine.models import Process
@@ -266,7 +267,7 @@ def run_hook(
if process.status == 'exited':
records = process.get_records() # Get parsed JSONL output
"""
from archivebox.machine.models import Process, Machine
from archivebox.machine.models import Process, Machine, NetworkInterface
from archivebox.config.constants import CONSTANTS
import sys
@@ -280,6 +281,8 @@ def run_hook(
# Get current machine
machine = Machine.current()
iface = NetworkInterface.current(refresh=True)
machine = iface.machine
# Auto-detect parent process if not explicitly provided
# This enables automatic hierarchy tracking: Worker -> Hook
@@ -294,6 +297,7 @@ def run_hook(
# Create a failed Process record for hooks that don't exist
process = Process.objects.create(
machine=machine,
iface=iface,
parent=parent,
process_type=Process.TypeChoices.HOOK,
pwd=str(output_dir),
@@ -449,6 +453,7 @@ def run_hook(
# Create Process record
process = Process.objects.create(
machine=machine,
iface=iface,
parent=parent,
process_type=Process.TypeChoices.HOOK,
pwd=str(output_dir),
@@ -458,6 +463,7 @@ def run_hook(
# Copy the env dict we already built (includes os.environ + all customizations)
process.env = env.copy()
process.hydrate_binary_from_context(plugin_name=script.parent.name, hook_path=str(script))
# Save env before launching
process.save()
@@ -472,6 +478,7 @@ def run_hook(
# Create a failed Process record for exceptions
process = Process.objects.create(
machine=machine,
iface=iface,
process_type=Process.TypeChoices.HOOK,
pwd=str(output_dir),
cmd=cmd,
@@ -544,6 +551,9 @@ def collect_urls_from_plugins(snapshot_dir: Path) -> List[Dict[str, Any]]:
text = urls_file.read_text()
for entry in Process.parse_records_from_text(text):
if entry.get('url'):
entry['url'] = fix_url_from_markdown(str(entry['url']).strip())
if not entry['url']:
continue
# Track which parser plugin found this URL
entry['plugin'] = subdir.name
urls.append(entry)
@@ -615,11 +625,30 @@ def get_enabled_plugins(config: Optional[Dict[str, Any]] = None) -> List[str]:
from archivebox.config.configset import get_config
config = get_config()
def normalize_enabled_plugins(value: Any) -> List[str]:
    """Coerce an ENABLED_PLUGINS config value into a clean list of names.

    Accepts None, a CSV string, a JSON-array string, or an iterable; entries
    are stringified, stripped, and empty ones dropped.
    """
    if value is None:
        return []
    if isinstance(value, str):
        text = value.strip()
        if not text:
            return []
        if text.startswith('['):
            # JSON-style list; fall back to CSV parsing if it doesn't decode
            # to a list.
            try:
                decoded = json.loads(text)
            except json.JSONDecodeError:
                decoded = None
            if isinstance(decoded, list):
                return [str(item).strip() for item in decoded if str(item).strip()]
        return [item.strip() for item in text.split(',') if item.strip()]
    if isinstance(value, (list, tuple, set)):
        return [str(item).strip() for item in value if str(item).strip()]
    fallback = str(value).strip()
    return [fallback] if fallback else []
# Support explicit ENABLED_PLUGINS override (legacy)
if 'ENABLED_PLUGINS' in config:
return config['ENABLED_PLUGINS']
return normalize_enabled_plugins(config['ENABLED_PLUGINS'])
if 'ENABLED_EXTRACTORS' in config:
return config['ENABLED_EXTRACTORS']
return normalize_enabled_plugins(config['ENABLED_EXTRACTORS'])
# Filter all plugins by enabled status
all_plugins = get_plugins()
@@ -1042,6 +1071,14 @@ def process_hook_records(records: List[Dict[str, Any]], overrides: Dict[str, Any
if record_type == 'Snapshot':
from archivebox.core.models import Snapshot
if record.get('url'):
record = {
**record,
'url': fix_url_from_markdown(str(record['url']).strip()),
}
if not record['url']:
continue
# Check if discovered snapshot exceeds crawl max_depth
snapshot_depth = record.get('depth', 0)
crawl = overrides.get('crawl')

View File

@@ -113,7 +113,7 @@ class BinaryAdmin(BaseModelAdmin):
sort_fields = ('id', 'created_at', 'machine_info', 'name', 'binprovider', 'version', 'abspath', 'sha256', 'status')
search_fields = ('id', 'machine__id', 'name', 'binprovider', 'version', 'abspath', 'sha256')
readonly_fields = ('created_at', 'modified_at')
readonly_fields = ('created_at', 'modified_at', 'output_dir')
fieldsets = (
('Binary Info', {
@@ -166,7 +166,7 @@ class ProcessAdmin(BaseModelAdmin):
sort_fields = ('id', 'created_at', 'status', 'exit_code', 'pid')
search_fields = ('id', 'machine__id', 'binary__name', 'cmd', 'pwd', 'stdout', 'stderr')
readonly_fields = ('created_at', 'modified_at', 'machine', 'binary', 'iface', 'archiveresult_link')
readonly_fields = ('created_at', 'modified_at', 'machine', 'binary_link', 'iface_link', 'archiveresult_link')
fieldsets = (
('Process Info', {
@@ -178,7 +178,7 @@ class ProcessAdmin(BaseModelAdmin):
'classes': ('card', 'wide'),
}),
('Execution', {
'fields': ('binary', 'iface', 'pid', 'exit_code', 'url'),
'fields': ('binary_link', 'iface_link', 'pid', 'exit_code', 'url'),
'classes': ('card',),
}),
('Timing', {
@@ -216,6 +216,21 @@ class ProcessAdmin(BaseModelAdmin):
process.binary.id, process.binary.name, process.binary.version,
)
@admin.display(description='Binary', ordering='binary__name')
def binary_link(self, process):
    """Alias of binary_info for use as a readonly 'Binary' admin field/column."""
    return self.binary_info(process)
@admin.display(description='Network Interface', ordering='iface__id')
def iface_link(self, process):
    """Render a link to the process's NetworkInterface admin page, or '-' if unset."""
    if not process.iface:
        return '-'
    return format_html(
        '<a href="/admin/machine/networkinterface/{}/change"><code>{}</code> {}</a>',
        process.iface.id,
        str(process.iface.id)[:8],  # shortened id for display
        # Best available human-readable label: iface name, else public IP, else local IP.
        process.iface.iface or process.iface.ip_public or process.iface.ip_local,
    )
@admin.display(description='ArchiveResult')
def archiveresult_link(self, process):
if not hasattr(process, 'archiveresult'):

View File

@@ -49,6 +49,89 @@ BINARY_RECHECK_INTERVAL = 1 * 30 * 60
PROCESS_RECHECK_INTERVAL = 60 # Re-validate every 60 seconds
PID_REUSE_WINDOW = timedelta(hours=24) # Max age for considering a PID match valid
START_TIME_TOLERANCE = 5.0 # Seconds tolerance for start time matching
LEGACY_MACHINE_CONFIG_KEYS = frozenset({"CHROMIUM_VERSION"})
def _find_existing_binary_for_reference(machine: 'Machine', reference: str) -> 'Binary | None':
    """Resolve *reference* (an abspath or a binary name) to the freshest Binary on *machine*.

    Lookup order: exact abspath match, then basename-of-reference as name,
    then the raw reference string as name. Returns None when nothing matches.
    """
    reference = str(reference or '').strip()
    if not reference:
        return None

    machine_binaries = Binary.objects.filter(machine=machine)

    # 1) Exact absolute-path match wins outright.
    by_abspath = machine_binaries.filter(abspath=reference).order_by('-modified_at').first()
    if by_abspath is not None:
        return by_abspath

    # 2) Fall back to matching the basename of the reference against Binary.name.
    basename = Path(reference).name
    if basename:
        by_basename = machine_binaries.filter(name=basename).order_by('-modified_at').first()
        if by_basename is not None:
            return by_basename

    # 3) Last resort: treat the whole reference string as the binary name.
    return machine_binaries.filter(name=reference).order_by('-modified_at').first()
def _get_process_binary_env_keys(plugin_name: str, hook_path: str, env: dict[str, Any] | None) -> list[str]:
env = env or {}
plugin_name = str(plugin_name or '').strip()
hook_path = str(hook_path or '').strip()
plugin_key = plugin_name.upper().replace('-', '_')
keys: list[str] = []
seen: set[str] = set()
def add(key: str) -> None:
if key and key not in seen and env.get(key):
seen.add(key)
keys.append(key)
if plugin_key:
add(f'{plugin_key}_BINARY')
try:
from archivebox.hooks import discover_plugin_configs
plugin_schema = discover_plugin_configs().get(plugin_name, {})
schema_keys = [
key
for key in (plugin_schema.get('properties') or {})
if key.endswith('_BINARY')
]
except Exception:
schema_keys = []
schema_keys.sort(key=lambda key: (
key != f'{plugin_key}_BINARY',
key.endswith('_NODE_BINARY'),
key.endswith('_CHROME_BINARY'),
key,
))
for key in schema_keys:
add(key)
if plugin_name.startswith('search_backend_'):
backend_name = plugin_name.removeprefix('search_backend_').upper().replace('-', '_')
configured_engine = str(env.get('SEARCH_BACKEND_ENGINE') or '').strip().upper().replace('-', '_')
if backend_name and backend_name == configured_engine:
add(f'{backend_name}_BINARY')
hook_suffix = Path(hook_path).suffix.lower()
if hook_suffix == '.js':
if plugin_key:
add(f'{plugin_key}_NODE_BINARY')
add('NODE_BINARY')
return keys
def _sanitize_machine_config(config: dict[str, Any] | None) -> dict[str, Any]:
    """Return a copy of *config* with deprecated machine config keys removed.

    Non-dict input (including None) yields an empty dict; the input is never
    mutated.
    """
    if not isinstance(config, dict):
        return {}
    return {key: value for key, value in config.items() if key not in LEGACY_MACHINE_CONFIG_KEYS}
class MachineManager(models.Manager):
@@ -89,13 +172,13 @@ class Machine(ModelWithHealthStats):
global _CURRENT_MACHINE
if _CURRENT_MACHINE:
if timezone.now() < _CURRENT_MACHINE.modified_at + timedelta(seconds=MACHINE_RECHECK_INTERVAL):
return cls._hydrate_config_from_sibling(_CURRENT_MACHINE)
return cls._sanitize_config(cls._hydrate_config_from_sibling(_CURRENT_MACHINE))
_CURRENT_MACHINE = None
_CURRENT_MACHINE, _ = cls.objects.update_or_create(
guid=get_host_guid(),
defaults={'hostname': socket.gethostname(), **get_os_info(), **get_vm_info(), 'stats': get_host_stats()},
)
return cls._hydrate_config_from_sibling(_CURRENT_MACHINE)
return cls._sanitize_config(cls._hydrate_config_from_sibling(_CURRENT_MACHINE))
@classmethod
def _hydrate_config_from_sibling(cls, machine: 'Machine') -> 'Machine':
@@ -115,6 +198,15 @@ class Machine(ModelWithHealthStats):
machine.save(update_fields=['config', 'modified_at'])
return machine
    @classmethod
    def _sanitize_config(cls, machine: 'Machine') -> 'Machine':
        """Strip legacy keys from machine.config, persisting only when something changed.

        Saving bumps modified_at too, which resets the cached-machine recheck
        window used by Machine.current().
        """
        sanitized = _sanitize_machine_config(machine.config)
        current = machine.config or {}
        if sanitized != current:
            machine.config = sanitized
            machine.save(update_fields=['config', 'modified_at'])
        return machine
def to_json(self) -> dict:
"""
Convert Machine model instance to a JSON-serializable dict.
@@ -152,11 +244,10 @@ class Machine(ModelWithHealthStats):
Returns:
Machine instance or None
"""
config_patch = record.get('config')
if isinstance(config_patch, dict) and config_patch:
config_patch = _sanitize_machine_config(record.get('config'))
if config_patch:
machine = Machine.current()
if not machine.config:
machine.config = {}
machine.config = _sanitize_machine_config(machine.config)
machine.config.update(config_patch)
machine.save(update_fields=['config'])
return machine
@@ -194,13 +285,17 @@ class NetworkInterface(ModelWithHealthStats):
unique_together = (('machine', 'ip_public', 'ip_local', 'mac_address', 'dns_server'),)
@classmethod
def current(cls) -> 'NetworkInterface':
def current(cls, refresh: bool = False) -> 'NetworkInterface':
global _CURRENT_INTERFACE
machine = Machine.current()
if _CURRENT_INTERFACE:
if timezone.now() < _CURRENT_INTERFACE.modified_at + timedelta(seconds=NETWORK_INTERFACE_RECHECK_INTERVAL):
if (
not refresh
and _CURRENT_INTERFACE.machine_id == machine.id
and timezone.now() < _CURRENT_INTERFACE.modified_at + timedelta(seconds=NETWORK_INTERFACE_RECHECK_INTERVAL)
):
return _CURRENT_INTERFACE
_CURRENT_INTERFACE = None
machine = Machine.current()
net_info = get_host_network()
_CURRENT_INTERFACE, _ = cls.objects.update_or_create(
machine=machine, ip_public=net_info.pop('ip_public'), ip_local=net_info.pop('ip_local'),
@@ -747,14 +842,17 @@ class ProcessManager(models.Manager):
Called during migration and when creating new ArchiveResults.
"""
iface = kwargs.get('iface') or NetworkInterface.current()
# Defaults from ArchiveResult if not provided
defaults = {
'machine': Machine.current(),
'machine': iface.machine,
'pwd': kwargs.get('pwd') or str(archiveresult.snapshot.output_dir / archiveresult.plugin),
'cmd': kwargs.get('cmd') or [],
'status': 'queued',
'timeout': kwargs.get('timeout', 120),
'env': kwargs.get('env', {}),
'iface': iface,
}
defaults.update(kwargs)
@@ -971,6 +1069,28 @@ class Process(models.Model):
record['timeout'] = self.timeout
return record
def hydrate_binary_from_context(self, *, plugin_name: str = '', hook_path: str = '') -> 'Binary | None':
machine = self.machine if self.machine_id else Machine.current()
references: list[str] = []
for key in _get_process_binary_env_keys(plugin_name, hook_path, self.env):
value = str(self.env.get(key) or '').strip()
if value and value not in references:
references.append(value)
if self.cmd:
cmd_0 = str(self.cmd[0]).strip()
if cmd_0 and cmd_0 not in references:
references.append(cmd_0)
for reference in references:
binary = _find_existing_binary_for_reference(machine, reference)
if binary:
self.binary = binary
return binary
return None
@classmethod
def parse_records_from_text(cls, text: str) -> list[dict]:
"""Parse JSONL records from raw text using the shared JSONL parser."""
@@ -1044,6 +1164,7 @@ class Process(models.Model):
current_pid = os.getpid()
machine = Machine.current()
iface = NetworkInterface.current()
# Check cache validity
if _CURRENT_PROCESS:
@@ -1053,6 +1174,9 @@ class Process(models.Model):
and _CURRENT_PROCESS.machine_id == machine.id
and timezone.now() < _CURRENT_PROCESS.modified_at + timedelta(seconds=PROCESS_RECHECK_INTERVAL)
):
if _CURRENT_PROCESS.iface_id != iface.id:
_CURRENT_PROCESS.iface = iface
_CURRENT_PROCESS.save(update_fields=['iface', 'modified_at'])
_CURRENT_PROCESS.ensure_log_files()
return _CURRENT_PROCESS
_CURRENT_PROCESS = None
@@ -1080,6 +1204,9 @@ class Process(models.Model):
db_start_time = existing.started_at.timestamp()
if abs(db_start_time - os_start_time) < START_TIME_TOLERANCE:
_CURRENT_PROCESS = existing
if existing.iface_id != iface.id:
existing.iface = iface
existing.save(update_fields=['iface', 'modified_at'])
_CURRENT_PROCESS.ensure_log_files()
return existing
@@ -1112,6 +1239,7 @@ class Process(models.Model):
pid=current_pid,
started_at=started_at,
status=cls.StatusChoices.RUNNING,
iface=iface,
)
_CURRENT_PROCESS.ensure_log_files()
return _CURRENT_PROCESS
@@ -1176,7 +1304,9 @@ class Process(models.Model):
if 'supervisord' in argv_str:
return cls.TypeChoices.SUPERVISORD
elif 'archivebox run' in argv_str or 'runner_watch' in argv_str:
elif 'runner_watch' in argv_str:
return cls.TypeChoices.WORKER
elif 'archivebox run' in argv_str:
return cls.TypeChoices.ORCHESTRATOR
elif 'archivebox' in argv_str:
return cls.TypeChoices.CLI
@@ -1321,14 +1451,17 @@ class Process(models.Model):
if self.cmd:
try:
os_cmdline = os_proc.cmdline()
# Check if first arg (binary) matches
if os_cmdline and self.cmd:
os_binary = os_cmdline[0] if os_cmdline else ''
db_binary = self.cmd[0] if self.cmd else ''
# Match by basename (handles /usr/bin/python3 vs python3)
if os_binary and db_binary:
if Path(os_binary).name != Path(db_binary).name:
return None # Different binary, PID reused
if db_binary:
db_binary_name = Path(db_binary).name
cmd_matches = any(
arg == db_binary or Path(arg).name == db_binary_name
for arg in os_cmdline
if arg
)
if not cmd_matches:
return None # Different command, PID reused
except (psutil.AccessDenied, psutil.ZombieProcess):
pass # Can't check cmdline, trust start time match

View File

@@ -4,6 +4,7 @@ import re
import requests
import json as pyjson
import http.cookiejar
from dateparser import parse as dateparser
from typing import List, Optional, Any, Callable
from pathlib import Path
@@ -13,7 +14,6 @@ from hashlib import sha256
from urllib.parse import urlparse, quote, unquote
from html import escape, unescape
from datetime import datetime, timezone
from dateparser import parse as dateparser
from requests.exceptions import RequestException, ReadTimeout
from base32_crockford import encode as base32_encode
@@ -122,9 +122,35 @@ def fix_url_from_markdown(url_str: str) -> str:
return url_str
def split_comma_separated_urls(url: str):
    """Yield ``(offset, url)`` pieces, splitting a string of comma-joined URLs.

    Splits only where a ``,`` directly precedes the next ``http(s)://`` so that
    commas embedded inside a URL (e.g. in query parameters) are left alone.
    *offset* is the index of each yielded piece within the original input.
    """
    offset = 0
    remaining = url
    while True:
        # Find the next scheme occurrence past position 0 (skip our own scheme).
        candidates = [
            idx
            for idx in (remaining.find('http://', 1), remaining.find('https://', 1))
            if idx != -1
        ]
        if not candidates:
            # No further scheme: everything left is one URL.
            yield offset, remaining
            return
        split_at = min(candidates)
        if remaining[split_at - 1] != ',':
            # Scheme appears mid-URL (e.g. a url= query param), not as a separator.
            yield offset, remaining
            return
        yield offset, remaining[:split_at - 1]
        offset += split_at
        remaining = remaining[split_at:]
def find_all_urls(urls_str: str):
    """Yield every URL found in *urls_str*, splitting comma-joined matches.

    When one regex match contains several comma-separated URLs, each piece is
    yielded individually; the later pieces' start positions are remembered so
    that overlapping regex matches are not yielded twice.
    """
    already_emitted_starts: set[int] = set()
    for match in re.finditer(URL_REGEX, urls_str):
        if match.start() in already_emitted_starts:
            continue
        cleaned = fix_url_from_markdown(match.group(1))
        for offset, piece in split_comma_separated_urls(cleaned):
            if offset:
                already_emitted_starts.add(match.start() + offset)
            yield piece
def is_static_file(url: str):
@@ -214,7 +240,25 @@ def parse_date(date: Any) -> datetime | None:
date = str(date)
if isinstance(date, str):
parsed_date = dateparser(date, settings={'TIMEZONE': 'UTC'})
normalized = date.strip()
if not normalized:
raise ValueError(f'Tried to parse invalid date string! {date}')
try:
return datetime.fromtimestamp(float(normalized), tz=timezone.utc)
except (TypeError, ValueError, OSError):
pass
try:
iso_date = normalized.replace('Z', '+00:00')
parsed_date = datetime.fromisoformat(iso_date)
if parsed_date.tzinfo is None:
return parsed_date.replace(tzinfo=timezone.utc)
return parsed_date.astimezone(timezone.utc)
except ValueError:
pass
parsed_date = dateparser(normalized, settings={'TIMEZONE': 'UTC'})
if parsed_date is None:
raise ValueError(f'Tried to parse invalid date string! {date}')
return parsed_date.astimezone(timezone.utc)
@@ -408,6 +452,7 @@ assert fix_url_from_markdown('https://wikipedia.org/en/some_article_(Disambiguat
URL_REGEX_TESTS = [
('https://example.com', ['https://example.com']),
('https://sweeting.me,https://google.com', ['https://sweeting.me', 'https://google.com']),
('http://abc-file234example.com/abc?def=abc&23423=sdfsdf#abc=234&234=a234', ['http://abc-file234example.com/abc?def=abc&23423=sdfsdf#abc=234&234=a234']),
('https://twitter.com/share?url=https://akaao.success-corp.co.jp&text=ア@サ!ト&hashtags=ア%オ,元+ア.ア-オ_イ*シ$ロ abc', ['https://twitter.com/share?url=https://akaao.success-corp.co.jp&text=ア@サ!ト&hashtags=ア%オ,元+ア.ア-オ_イ*シ$ロ', 'https://akaao.success-corp.co.jp&text=ア@サ!ト&hashtags=ア%オ,元+ア.ア-オ_イ*シ$ロ']),

View File

@@ -1,2 +1,169 @@
__package__ = "archivebox.personas"
# Register your models here.
import shutil
from django.contrib import admin, messages
from django.utils.html import format_html, format_html_join
from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin
from archivebox.personas.forms import PersonaAdminForm
from archivebox.personas.importers import discover_local_browser_profiles
from archivebox.personas.models import Persona
class PersonaAdmin(ConfigEditorMixin, BaseModelAdmin):
    """Admin for Persona objects.

    Besides standard CRUD, the add/change form can bootstrap a persona by
    importing browser state (profile dir, cookies.txt, auth.json) — the actual
    import is delegated to PersonaAdminForm.apply_import() and runs in
    save_model() after the row has been persisted.
    """

    form = PersonaAdminForm
    change_form_template = "admin/personas/persona/change_form.html"
    list_display = ("name", "created_by", "created_at", "chrome_profile_state", "cookies_state", "auth_state")
    search_fields = ("name", "created_by__username")
    list_filter = ("created_by",)
    ordering = ["name"]
    list_per_page = 100
    readonly_fields = ("id", "created_at", "persona_paths", "import_artifact_status")

    # Fieldsets for the "add" form: persona basics + the one-shot import options.
    add_fieldsets = (
        ("Persona", {
            "fields": ("name", "created_by"),
            "classes": ("card",),
        }),
        ("Browser Import", {
            "fields": (
                "import_mode",
                "import_discovered_profile",
                "import_source",
                "import_profile_name",
                "import_copy_profile",
                "import_extract_cookies",
                "import_capture_storage",
            ),
            "classes": ("card", "wide"),
        }),
        ("Advanced", {
            "fields": ("config",),
            "classes": ("card", "wide"),
        }),
    )
    # Change form shows everything from "add" plus artifact status and metadata.
    change_fieldsets = add_fieldsets + (
        ("Artifacts", {
            "fields": ("persona_paths", "import_artifact_status"),
            "classes": ("card", "wide"),
        }),
        ("Timestamps", {
            "fields": ("id", "created_at"),
            "classes": ("card",),
        }),
    )

    @admin.display(description="Chrome Profile")
    def chrome_profile_state(self, obj: Persona) -> str:
        """List-column flag: does this persona have a chrome_user_data dir on disk?"""
        return "yes" if (obj.path / "chrome_user_data").exists() else "no"

    @admin.display(description="cookies.txt")
    def cookies_state(self, obj: Persona) -> str:
        """List-column flag: does this persona have a cookies.txt file?"""
        return "yes" if obj.COOKIES_FILE else "no"

    @admin.display(description="auth.json")
    def auth_state(self, obj: Persona) -> str:
        """List-column flag: does this persona have an auth.json storage file?"""
        return "yes" if obj.AUTH_STORAGE_FILE else "no"

    @admin.display(description="Persona Paths")
    def persona_paths(self, obj: Persona) -> str:
        """Read-only HTML panel listing every on-disk path belonging to this persona."""
        return format_html(
            "<div class='abx-persona-path-list'>"
            "<div><strong>Persona root</strong><code>{}</code></div>"
            "<div><strong>chrome_user_data</strong><code>{}</code></div>"
            "<div><strong>chrome_extensions</strong><code>{}</code></div>"
            "<div><strong>chrome_downloads</strong><code>{}</code></div>"
            "<div><strong>cookies.txt</strong><code>{}</code></div>"
            "<div><strong>auth.json</strong><code>{}</code></div>"
            "</div>",
            obj.path,
            obj.CHROME_USER_DATA_DIR,
            obj.CHROME_EXTENSIONS_DIR,
            obj.CHROME_DOWNLOADS_DIR,
            obj.COOKIES_FILE or (obj.path / "cookies.txt"),
            obj.AUTH_STORAGE_FILE or (obj.path / "auth.json"),
        )

    @admin.display(description="Import Artifacts")
    def import_artifact_status(self, obj: Persona) -> str:
        """Read-only HTML panel showing present/missing state for each import artifact."""
        entries = [
            ("Browser profile", (obj.path / "chrome_user_data").exists(), obj.CHROME_USER_DATA_DIR),
            ("cookies.txt", bool(obj.COOKIES_FILE), obj.COOKIES_FILE or (obj.path / "cookies.txt")),
            ("auth.json", bool(obj.AUTH_STORAGE_FILE), obj.AUTH_STORAGE_FILE or (obj.path / "auth.json")),
        ]
        return format_html(
            "<div class='abx-persona-artifacts'>{}</div>",
            format_html_join(
                "",
                "<div class='abx-persona-artifact'><strong>{}</strong><span class='{}'>{}</span><code>{}</code></div>",
                (
                    (
                        label,
                        "abx-artifact-state abx-artifact-state--yes" if enabled else "abx-artifact-state abx-artifact-state--no",
                        "present" if enabled else "missing",
                        path,
                    )
                    for label, enabled, path in entries
                ),
            ),
        )

    def get_fieldsets(self, request, obj=None):
        """Use the richer change fieldsets when editing an existing persona."""
        return self.change_fieldsets if obj else self.add_fieldsets

    def render_change_form(self, request, context, add=False, change=False, form_url="", obj=None):
        """Expose the count of auto-discovered local browser profiles to the template."""
        context["detected_profile_count"] = len(discover_local_browser_profiles())
        return super().render_change_form(request, context, add=add, change=change, form_url=form_url, obj=obj)

    def save_model(self, request, obj, form, change):
        """Save the persona, move its directory on rename, then run any requested import.

        Raises:
            FileExistsError: if renaming would clobber an existing persona directory.
        """
        # Detect a rename so the on-disk persona directory can be moved to match.
        old_path = None
        new_path = None
        if change:
            previous = Persona.objects.get(pk=obj.pk)
            if previous.name != obj.name:
                old_path = previous.path
                new_path = obj.path
        super().save_model(request, obj, form, change)
        if old_path and new_path and old_path != new_path and old_path.exists():
            if new_path.exists():
                # Refuse to overwrite another persona's data directory.
                raise FileExistsError(f"Cannot rename Persona directory because the destination already exists: {new_path}")
            shutil.move(str(old_path), str(new_path))
        obj.ensure_dirs()
        # Run the browser-state import chosen on the form (None = no import requested).
        import_result = form.apply_import(obj)
        if import_result is None:
            return
        # Summarize which artifacts were actually produced for the admin message.
        completed_actions = []
        if import_result.profile_copied:
            completed_actions.append("profile copied")
        if import_result.cookies_imported:
            completed_actions.append("cookies.txt generated")
        if import_result.storage_captured:
            completed_actions.append("auth.json captured")
        if import_result.user_agent_imported:
            completed_actions.append("USER_AGENT copied")
        if completed_actions:
            messages.success(
                request,
                f'Imported {", ".join(completed_actions)} from {import_result.source.display_label}.',
            )
        else:
            messages.warning(
                request,
                f"Persona saved, but no browser artifacts were imported from {import_result.source.display_label}.",
            )
        for warning in import_result.warnings:
            messages.warning(request, warning)
def register_admin(admin_site: admin.AdminSite) -> None:
    """Register the Persona model (with PersonaAdmin) on the given admin site."""
    admin_site.register(Persona, PersonaAdmin)

View File

@@ -0,0 +1,210 @@
#!/usr/bin/env node
/**
* Export cookies and open-tab storage from a Chromium profile or live CDP URL.
*
* Environment variables:
* ARCHIVEBOX_ABX_PLUGINS_DIR Absolute path to abx_plugins/plugins
* CHROME_USER_DATA_DIR Local Chromium user-data directory to launch
* CHROME_CDP_URL Existing browser CDP URL to attach to
* COOKIES_OUTPUT_FILE Optional output path for Netscape cookies.txt
* AUTH_STORAGE_OUTPUT_FILE Optional output path for auth.json
* CHROME_BINARY Optional browser binary override
* NODE_MODULES_DIR Optional node_modules path for puppeteer-core
*/
const fs = require('fs');
const os = require('os');
const path = require('path');
// Locate the ArchiveBox plugins dir; required to resolve the shared JS helpers below.
const pluginsDir = process.env.ARCHIVEBOX_ABX_PLUGINS_DIR || process.env.ABX_PLUGINS_DIR;
if (!pluginsDir) {
    console.error('ARCHIVEBOX_ABX_PLUGINS_DIR is required');
    process.exit(1);
}
// Shared base/chrome helpers; puppeteer-core is resolved via the plugins' node_modules
// after ensureNodeModuleResolution() patches this module's resolution paths.
const baseUtils = require(path.join(pluginsDir, 'base', 'utils.js'));
baseUtils.ensureNodeModuleResolution(module);
const chromeUtils = require(path.join(pluginsDir, 'chrome', 'chrome_utils.js'));
const puppeteer = require('puppeteer-core');
/**
 * Serialize a CDP cookie object into one tab-separated Netscape cookies.txt line.
 * Fields: domain, includeSubdomains, path, secure, expiry, name, value.
 * Non-host-only cookies get a leading dot added to their domain.
 */
function cookieToNetscape(cookie) {
    const domain = (!cookie.domain.startsWith('.') && !cookie.hostOnly)
        ? '.' + cookie.domain
        : cookie.domain;
    const fields = [
        domain,
        domain.startsWith('.') ? 'TRUE' : 'FALSE',
        cookie.path || '/',
        cookie.secure ? 'TRUE' : 'FALSE',
        (cookie.expires && cookie.expires > 0) ? Math.floor(cookie.expires).toString() : '0',
        cookie.name,
        cookie.value,
    ];
    return fields.join('\t');
}
/**
 * Write cookies to outputPath in Netscape cookies.txt format,
 * creating parent directories as needed. Always ends with a trailing newline.
 */
function writeCookiesFile(cookies, outputPath) {
    const header = [
        '# Netscape HTTP Cookie File',
        '# https://curl.se/docs/http-cookies.html',
        '# This file was generated by ArchiveBox persona cookie extraction',
        '#',
        '# Format: domain\\tincludeSubdomains\\tpath\\tsecure\\texpiry\\tname\\tvalue',
        '',
    ];
    const lines = header.concat(cookies.map(cookieToNetscape));
    fs.mkdirSync(path.dirname(outputPath), { recursive: true });
    fs.writeFileSync(outputPath, lines.join('\n') + '\n');
}
/**
 * Collect localStorage/sessionStorage from every inspectable open tab.
 *
 * Skips blank pages and browser-internal URLs (chrome:, edge:, devtools:),
 * plus any page whose evaluate() fails. Returns
 * { localStorage, sessionStorage }, each keyed by page origin and containing
 * only origins with at least one stored entry.
 */
async function collectStorage(browser) {
    const localStorage = {};
    const sessionStorage = {};
    const pages = await browser.pages();
    for (const page of pages) {
        try {
            const url = page.url();
            if (!url || url === 'about:blank') continue;
            if (url.startsWith('chrome:') || url.startsWith('edge:') || url.startsWith('devtools:')) continue;
            // Read both storage areas in one in-page evaluation.
            const payload = await page.evaluate(() => ({
                origin: window.location.origin,
                localStorage: Object.fromEntries(Object.entries(window.localStorage)),
                sessionStorage: Object.fromEntries(Object.entries(window.sessionStorage)),
            }));
            // Opaque origins serialize as the string "null" — skip those.
            if (!payload.origin || payload.origin === 'null') continue;
            if (Object.keys(payload.localStorage || {}).length > 0) {
                localStorage[payload.origin] = payload.localStorage;
            }
            if (Object.keys(payload.sessionStorage || {}).length > 0) {
                sessionStorage[payload.origin] = payload.sessionStorage;
            }
        } catch (error) {
            // Ignore pages that cannot be inspected via evaluate().
        }
    }
    return { localStorage, sessionStorage };
}
/**
 * Open a browser connection for state export.
 *
 * Prefers attaching to an existing browser at CHROME_CDP_URL; otherwise
 * launches a headless Chromium against CHROME_USER_DATA_DIR (binary from
 * CHROME_BINARY or auto-detected).
 *
 * Returns { browser, cleanup, sourceDescription }. cleanup() disconnects,
 * and for a locally-launched browser also kills the process and removes the
 * temporary output dir. Throws if neither env var is usable or launch fails.
 */
async function openBrowser() {
    const cdpUrl = process.env.CHROME_CDP_URL || '';
    if (cdpUrl) {
        // Attach-only mode: never kill a browser we did not launch.
        const browser = await chromeUtils.connectToBrowserEndpoint(puppeteer, cdpUrl, { defaultViewport: null });
        return {
            browser,
            async cleanup() {
                try {
                    await browser.disconnect();
                } catch (error) {}
            },
            sourceDescription: cdpUrl,
        };
    }
    const userDataDir = process.env.CHROME_USER_DATA_DIR;
    if (!userDataDir) {
        throw new Error('Either CHROME_USER_DATA_DIR or CHROME_CDP_URL is required');
    }
    if (!fs.existsSync(userDataDir)) {
        throw new Error(`User data directory does not exist: ${userDataDir}`);
    }
    // Temp dir for launch artifacts (logs, etc.); removed again in cleanup().
    const outputDir = fs.mkdtempSync(path.join(os.tmpdir(), 'abx-browser-state-'));
    const binary = process.env.CHROME_BINARY || chromeUtils.findAnyChromiumBinary();
    if (!binary) {
        throw new Error('Could not find a Chromium binary for browser state export');
    }
    const launched = await chromeUtils.launchChromium({
        binary,
        outputDir,
        userDataDir,
        headless: true,
        killZombies: false,
    });
    if (!launched.success) {
        throw new Error(launched.error || 'Chrome launch failed');
    }
    const browser = await chromeUtils.connectToBrowserEndpoint(puppeteer, launched.cdpUrl, { defaultViewport: null });
    return {
        browser,
        async cleanup() {
            // Best-effort teardown: disconnect, kill our launched browser, remove temp dir.
            try {
                await browser.disconnect();
            } catch (error) {}
            try {
                await chromeUtils.killChrome(launched.pid, outputDir);
            } catch (error) {}
            try {
                fs.rmSync(outputDir, { recursive: true, force: true });
            } catch (error) {}
        },
        sourceDescription: userDataDir,
    };
}
/**
 * Entry point: export cookies (Netscape cookies.txt) and/or auth storage
 * (auth.json with cookies + per-origin local/session storage + user agent)
 * from the browser selected by the environment.
 *
 * Requires at least one of COOKIES_OUTPUT_FILE / AUTH_STORAGE_OUTPUT_FILE.
 * Progress is reported on stderr; cleanup always runs.
 */
async function main() {
    const cookiesOutput = process.env.COOKIES_OUTPUT_FILE || '';
    const authOutput = process.env.AUTH_STORAGE_OUTPUT_FILE || '';
    if (!cookiesOutput && !authOutput) {
        throw new Error('COOKIES_OUTPUT_FILE or AUTH_STORAGE_OUTPUT_FILE is required');
    }
    const { browser, cleanup, sourceDescription } = await openBrowser();
    try {
        // Cookies are fetched via CDP Storage.getCookies on a browser-level session.
        const session = await browser.target().createCDPSession();
        const browserVersion = await session.send('Browser.getVersion');
        const cookieResult = await session.send('Storage.getCookies');
        const cookies = cookieResult?.cookies || [];
        const { localStorage, sessionStorage } = await collectStorage(browser);
        const userAgent = browserVersion?.userAgent || '';
        if (cookiesOutput) {
            writeCookiesFile(cookies, cookiesOutput);
        }
        if (authOutput) {
            fs.mkdirSync(path.dirname(authOutput), { recursive: true });
            fs.writeFileSync(
                authOutput,
                JSON.stringify(
                    {
                        TYPE: 'auth',
                        SOURCE: sourceDescription,
                        captured_at: new Date().toISOString(),
                        user_agent: userAgent,
                        cookies,
                        localStorage,
                        sessionStorage,
                    },
                    null,
                    2,
                ) + '\n',
            );
        }
        console.error(
            `[+] Exported ${cookies.length} cookies` +
            `${authOutput ? ` and ${Object.keys(localStorage).length + Object.keys(sessionStorage).length} storage origins` : ''}` +
            `${userAgent ? ' with browser USER_AGENT' : ''}` +
            ` from ${sourceDescription}`,
        );
    } finally {
        await cleanup();
    }
}
// Run, exiting non-zero with the error message on any failure.
main().catch((error) => {
    console.error(`ERROR: ${error.message}`);
    process.exit(1);
});

View File

@@ -0,0 +1,176 @@
__package__ = "archivebox.personas"
from typing import Any
from django import forms
from django.utils.safestring import mark_safe
from archivebox.personas.importers import (
PersonaImportResult,
PersonaImportSource,
discover_local_browser_profiles,
import_persona_from_source,
resolve_custom_import_source,
validate_persona_name,
)
from archivebox.personas.models import Persona
def _mode_label(title: str, description: str) -> str:
    """Render an import-mode radio choice label as a title + description HTML snippet.

    NOTE(review): *title* and *description* are interpolated unescaped into
    mark_safe() output — callers must only pass trusted literal strings.
    """
    html = (
        '<span class="abx-import-mode-option">'
        f'<strong>{title}</strong>'
        f'<span>{description}</span>'
        '</span>'
    )
    return mark_safe(html)
class PersonaAdminForm(forms.ModelForm):
    """ModelForm for Persona with extra one-shot browser-import fields.

    The import_* fields are not model fields: clean() resolves them into a
    PersonaImportSource, and apply_import() (called by the admin after save)
    runs the actual import via the shared backend helpers.
    """

    import_mode = forms.ChoiceField(
        required=False,
        initial="none",
        label="Bootstrap this persona",
        widget=forms.RadioSelect,
        choices=(
            ("none", _mode_label("Blank Persona", "Create the persona without importing browser state yet.")),
            ("discovered", _mode_label("Use a detected profile", "Pick from Chromium profiles auto-discovered on this host.")),
            ("custom", _mode_label("Use a custom path or CDP URL", "Paste an absolute Chromium path or attach to a live browser debugging endpoint.")),
        ),
        help_text="These options run after the Persona row is saved, using the same backend import helpers as the CLI.",
    )
    import_discovered_profile = forms.ChoiceField(
        required=False,
        label="Autodiscovered profiles",
        widget=forms.RadioSelect,
        choices=(),  # populated per-request in __init__ from discovery results
        help_text="Detected from local Chrome, Chromium, Brave, and Edge profile roots.",
    )
    import_source = forms.CharField(
        required=False,
        label="Absolute path or CDP URL",
        widget=forms.TextInput(
            attrs={
                "placeholder": "/Users/alice/Library/Application Support/Google/Chrome or ws://127.0.0.1:9222/devtools/browser/...",
                "style": "width: 100%; font-family: monospace;",
            }
        ),
        help_text="Accepts an absolute Chromium user-data dir, an exact profile dir, or a live HTTP/WS CDP endpoint.",
    )
    import_profile_name = forms.CharField(
        required=False,
        label="Profile directory name",
        widget=forms.TextInput(
            attrs={
                "placeholder": "Default or Profile 1",
                "style": "width: 100%; font-family: monospace;",
            }
        ),
        help_text="Only used when the custom path points at a browser root containing multiple profiles.",
    )
    import_copy_profile = forms.BooleanField(
        required=False,
        initial=True,
        label="Copy browser profile into this persona",
        help_text="Copies the chosen Chromium user-data tree into `chrome_user_data` for future archiving runs.",
    )
    import_extract_cookies = forms.BooleanField(
        required=False,
        initial=True,
        label="Generate `cookies.txt`",
        help_text="Extracts cookies through Chrome DevTools Protocol and writes a Netscape cookie jar for wget/curl-based plugins.",
    )
    import_capture_storage = forms.BooleanField(
        required=False,
        initial=True,
        label="Capture open-tab storage into `auth.json`",
        help_text="Snapshots currently open tab `localStorage` / `sessionStorage` values by origin. This is most useful for live CDP imports.",
    )

    class Meta:
        model = Persona
        fields = ("name", "created_by", "config")

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        """Discover local browser profiles and build the discovered-profile choices."""
        super().__init__(*args, **kwargs)
        self.discovered_profiles = discover_local_browser_profiles()
        # Resolved by clean(); consumed by apply_import() after save.
        self._resolved_import_source: PersonaImportSource | None = None
        self.fields["import_mode"].widget.attrs["class"] = "abx-import-mode"
        self.fields["import_discovered_profile"].widget.attrs["class"] = "abx-profile-picker"
        if self.discovered_profiles:
            self.fields["import_discovered_profile"].choices = [
                (profile.choice_value, profile.as_choice_label()) for profile in self.discovered_profiles
            ]
        else:
            self.fields["import_discovered_profile"].choices = []
            self.fields["import_discovered_profile"].help_text = (
                "No local Chromium profiles were detected on this host right now. "
                "Use the custom path/CDP option if the browser data lives elsewhere."
            )

    def clean_name(self) -> str:
        """Validate the persona name via the shared backend validator."""
        name = str(self.cleaned_data.get("name") or "").strip()
        is_valid, error_message = validate_persona_name(name)
        if not is_valid:
            raise forms.ValidationError(error_message)
        return name

    def clean(self) -> dict[str, Any]:
        """Resolve the chosen import mode into a PersonaImportSource.

        "none" means no import; "discovered" decodes the selected choice value;
        "custom" resolves a path/CDP URL. Also enforces that at least one
        import action is selected, and that CDP imports do not request a
        profile copy (not possible against a remote endpoint).
        """
        cleaned_data = super().clean()
        self._resolved_import_source = None
        import_mode = str(cleaned_data.get("import_mode") or "none").strip() or "none"
        if import_mode == "none":
            return cleaned_data
        if import_mode == "discovered":
            selection = str(cleaned_data.get("import_discovered_profile") or "").strip()
            if not selection:
                self.add_error("import_discovered_profile", "Choose one of the discovered profiles to import.")
                return cleaned_data
            try:
                self._resolved_import_source = PersonaImportSource.from_choice_value(selection)
            except ValueError as err:
                self.add_error("import_discovered_profile", str(err))
                return cleaned_data
        elif import_mode == "custom":
            raw_value = str(cleaned_data.get("import_source") or "").strip()
            if not raw_value:
                self.add_error("import_source", "Provide an absolute Chromium profile path or a CDP URL.")
                return cleaned_data
            try:
                self._resolved_import_source = resolve_custom_import_source(
                    raw_value,
                    profile_dir=str(cleaned_data.get("import_profile_name") or "").strip() or None,
                )
            except ValueError as err:
                self.add_error("import_source", str(err))
                return cleaned_data
        else:
            self.add_error("import_mode", "Choose how this Persona should be bootstrapped.")
            return cleaned_data
        copy_profile = bool(cleaned_data.get("import_copy_profile"))
        import_cookies = bool(cleaned_data.get("import_extract_cookies"))
        capture_storage = bool(cleaned_data.get("import_capture_storage"))
        if self._resolved_import_source.kind == "cdp":
            if not (import_cookies or capture_storage):
                self.add_error(
                    "import_extract_cookies",
                    "CDP imports can only capture cookies and/or open-tab storage. Profile copying is not available for a remote browser endpoint.",
                )
        elif not (copy_profile or import_cookies or capture_storage):
            raise forms.ValidationError("Select at least one import action.")
        return cleaned_data

    def apply_import(self, persona: Persona) -> PersonaImportResult | None:
        """Run the import resolved during clean(); returns None when no import was requested."""
        if not self._resolved_import_source:
            return None
        return import_persona_from_source(
            persona,
            self._resolved_import_source,
            copy_profile=bool(self.cleaned_data.get("import_copy_profile")),
            import_cookies=bool(self.cleaned_data.get("import_extract_cookies")),
            capture_storage=bool(self.cleaned_data.get("import_capture_storage")),
        )

View File

@@ -0,0 +1,845 @@
"""
Shared persona browser discovery/import helpers.
These helpers are used by both the CLI and the Django admin so Persona import
behavior stays consistent regardless of where it is triggered from.
"""
from __future__ import annotations
import json
import os
import platform
import shutil
import subprocess
import tempfile
from dataclasses import dataclass, field
from pathlib import Path
from typing import TYPE_CHECKING, Optional
from urllib.parse import urlparse
from django.utils.html import format_html
from django.utils.safestring import SafeString
if TYPE_CHECKING:
from archivebox.personas.models import Persona
BROWSER_LABELS = {
"chrome": "Google Chrome",
"chromium": "Chromium",
"brave": "Brave",
"edge": "Microsoft Edge",
"custom": "Custom Path",
"persona": "Persona Template",
}
BROWSER_PROFILE_DIR_NAMES = (
"Default",
"Profile ",
"Guest Profile",
)
VOLATILE_PROFILE_COPY_PATTERNS = (
"Cache",
"Code Cache",
"GPUCache",
"ShaderCache",
"Service Worker",
"GCM Store",
"*.log",
"Crashpad",
"BrowserMetrics",
"BrowserMetrics-spare.pma",
"SingletonLock",
"SingletonSocket",
"SingletonCookie",
)
PERSONA_PROFILE_DIR_CANDIDATES = (
"chrome_profile",
"chrome_user_data",
)
@dataclass(frozen=True)
class PersonaImportSource:
    """Immutable description of where persona browser state is imported from.

    Two kinds are used by the visible code: "browser-profile" (a local
    Chromium user-data dir plus an optional profile subdir) and "cdp"
    (a live DevTools endpoint URL).
    """

    kind: str
    browser: str = "custom"            # key into BROWSER_LABELS
    source_name: str | None = None     # human-readable origin name, if any
    user_data_dir: Path | None = None  # Chromium user-data root (browser-profile kind)
    profile_dir: str | None = None     # profile subdir name, e.g. "Default"
    browser_binary: str | None = None  # explicit browser binary override
    cdp_url: str | None = None         # DevTools endpoint URL (cdp kind)

    @property
    def browser_label(self) -> str:
        """Human-readable browser name, falling back to a title-cased key."""
        return BROWSER_LABELS.get(self.browser, self.browser.title())

    @property
    def profile_path(self) -> Path | None:
        """Full path to the specific profile dir, or None if either part is missing."""
        if not self.user_data_dir or not self.profile_dir:
            return None
        return self.user_data_dir / self.profile_dir

    @property
    def display_label(self) -> str:
        """Short label for admin messages, e.g. 'Google Chrome: Work / Default'."""
        if self.kind == "cdp":
            return self.cdp_url or "CDP URL"
        profile_suffix = f" / {self.profile_dir}" if self.profile_dir else ""
        source_prefix = f": {self.source_name}" if self.source_name else ""
        return f"{self.browser_label}{source_prefix}{profile_suffix}"

    @property
    def choice_value(self) -> str:
        """Stable JSON encoding used as the form <option> value; decoded by from_choice_value()."""
        return json.dumps(
            {
                "kind": self.kind,
                "browser": self.browser,
                "source_name": self.source_name or "",
                "user_data_dir": str(self.user_data_dir) if self.user_data_dir else "",
                "profile_dir": self.profile_dir or "",
                "browser_binary": self.browser_binary or "",
                "cdp_url": self.cdp_url or "",
            },
            sort_keys=True,
        )

    def as_choice_label(self) -> SafeString:
        """HTML label for the discovered-profile radio choice (label, binary note, path)."""
        path_str = str(self.profile_path or self.user_data_dir or self.cdp_url or "")
        binary_suffix = f"Using {self.browser_binary}" if self.browser_binary else "Will auto-detect a Chromium binary"
        return format_html(
            '<span class="abx-profile-option">'
            '<strong>{}</strong>'
            '<span class="abx-profile-option__meta">{}</span>'
            '<code>{}</code>'
            "</span>",
            self.display_label,
            binary_suffix,
            path_str,
        )

    @classmethod
    def from_choice_value(cls, value: str) -> "PersonaImportSource":
        """Decode a choice_value JSON payload back into a resolved source.

        Only "browser-profile" payloads are accepted here; the result is
        re-resolved through resolve_browser_profile_source() rather than
        trusted verbatim.

        Raises:
            ValueError: on malformed JSON or a non-browser-profile payload.
        """
        try:
            payload = json.loads(value)
        except json.JSONDecodeError as err:
            raise ValueError("Invalid discovered profile selection.") from err
        if payload.get("kind") != "browser-profile":
            raise ValueError("Invalid discovered profile selection.")
        user_data_dir = Path(str(payload.get("user_data_dir") or "")).expanduser()
        profile_dir = str(payload.get("profile_dir") or "").strip()
        browser = str(payload.get("browser") or "custom").strip().lower() or "custom"
        source_name = str(payload.get("source_name") or "").strip() or None
        browser_binary = str(payload.get("browser_binary") or "").strip() or None
        return resolve_browser_profile_source(
            browser=browser,
            source_name=source_name,
            user_data_dir=user_data_dir,
            profile_dir=profile_dir,
            browser_binary=browser_binary,
        )
@dataclass
class PersonaImportResult:
    """Mutable record of what a persona import run accomplished.

    Non-fatal problems are accumulated in `warnings` rather than raised.
    """

    source: PersonaImportSource  # where the browser state came from
    profile_copied: bool = False
    cookies_imported: bool = False
    storage_captured: bool = False
    user_agent_imported: bool = False
    warnings: list[str] = field(default_factory=list)

    @property
    def did_work(self) -> bool:
        """True when at least one import step actually produced something."""
        return any(
            (
                self.profile_copied,
                self.cookies_imported,
                self.storage_captured,
                self.user_agent_imported,
            )
        )
def get_chrome_user_data_dir() -> Optional[Path]:
    """Locate the default Chrome/Chromium user data dir that contains profiles.

    Returns None when no known location exists (or none has any profiles).
    """
    home = Path.home()
    local_app_data = Path(os.environ.get("LOCALAPPDATA", home / "AppData" / "Local"))
    candidates_by_system: dict[str, list[Path]] = {
        "Darwin": [
            home / "Library" / "Application Support" / "Google" / "Chrome",
            home / "Library" / "Application Support" / "Chromium",
        ],
        "Linux": [
            home / ".config" / "google-chrome",
            home / ".config" / "chromium",
            home / ".config" / "chrome",
            home / "snap" / "chromium" / "common" / "chromium",
        ],
        "Windows": [
            local_app_data / "Google" / "Chrome" / "User Data",
            local_app_data / "Chromium" / "User Data",
        ],
    }
    for root in candidates_by_system.get(platform.system(), []):
        # Only return a directory that actually holds at least one profile.
        if root.exists() and _list_profile_names(root):
            return root
    return None
def get_brave_user_data_dir() -> Optional[Path]:
    """Locate the default Brave user data dir that contains profiles, or None."""
    home = Path.home()
    local_app_data = Path(os.environ.get("LOCALAPPDATA", home / "AppData" / "Local"))
    candidates_by_system: dict[str, list[Path]] = {
        "Darwin": [
            home / "Library" / "Application Support" / "BraveSoftware" / "Brave-Browser",
        ],
        "Linux": [
            home / ".config" / "BraveSoftware" / "Brave-Browser",
        ],
        "Windows": [
            local_app_data / "BraveSoftware" / "Brave-Browser" / "User Data",
        ],
    }
    for root in candidates_by_system.get(platform.system(), []):
        # Only return a directory that actually holds at least one profile.
        if root.exists() and _list_profile_names(root):
            return root
    return None
def get_edge_user_data_dir() -> Optional[Path]:
    """Locate the default Microsoft Edge user data dir that contains profiles, or None."""
    home = Path.home()
    local_app_data = Path(os.environ.get("LOCALAPPDATA", home / "AppData" / "Local"))
    candidates_by_system: dict[str, list[Path]] = {
        "Darwin": [
            home / "Library" / "Application Support" / "Microsoft Edge",
        ],
        "Linux": [
            home / ".config" / "microsoft-edge",
            home / ".config" / "microsoft-edge-beta",
            home / ".config" / "microsoft-edge-dev",
        ],
        "Windows": [
            local_app_data / "Microsoft" / "Edge" / "User Data",
        ],
    }
    for root in candidates_by_system.get(platform.system(), []):
        # Only return a directory that actually holds at least one profile.
        if root.exists() and _list_profile_names(root):
            return root
    return None
def get_browser_binary(browser: str) -> Optional[str]:
    """Return the first existing known binary path for *browser*, or None.

    *browser* is matched case-insensitively against chrome/chromium/brave/edge;
    unknown browsers (and unknown platforms) yield None.
    """
    home = Path.home()
    local_app_data = Path(os.environ.get("LOCALAPPDATA", home / "AppData" / "Local"))
    binaries_by_system: dict[str, dict[str, list[str]]] = {
        "Darwin": {
            "chrome": ["/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"],
            "chromium": ["/Applications/Chromium.app/Contents/MacOS/Chromium"],
            "brave": ["/Applications/Brave Browser.app/Contents/MacOS/Brave Browser"],
            "edge": ["/Applications/Microsoft Edge.app/Contents/MacOS/Microsoft Edge"],
        },
        "Linux": {
            "chrome": ["/usr/bin/google-chrome", "/usr/bin/google-chrome-stable", "/usr/bin/google-chrome-beta", "/usr/bin/google-chrome-unstable"],
            "chromium": ["/usr/bin/chromium", "/usr/bin/chromium-browser"],
            "brave": ["/usr/bin/brave-browser", "/usr/bin/brave-browser-beta", "/usr/bin/brave-browser-nightly"],
            "edge": ["/usr/bin/microsoft-edge", "/usr/bin/microsoft-edge-stable", "/usr/bin/microsoft-edge-beta", "/usr/bin/microsoft-edge-dev"],
        },
        "Windows": {
            "chrome": [
                str(local_app_data / "Google" / "Chrome" / "Application" / "chrome.exe"),
                "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe",
                "C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe",
            ],
            "chromium": [str(local_app_data / "Chromium" / "Application" / "chrome.exe")],
            "brave": [
                str(local_app_data / "BraveSoftware" / "Brave-Browser" / "Application" / "brave.exe"),
                "C:\\Program Files\\BraveSoftware\\Brave-Browser\\Application\\brave.exe",
                "C:\\Program Files (x86)\\BraveSoftware\\Brave-Browser\\Application\\brave.exe",
            ],
            "edge": [
                str(local_app_data / "Microsoft" / "Edge" / "Application" / "msedge.exe"),
                "C:\\Program Files\\Microsoft\\Edge\\Application\\msedge.exe",
                "C:\\Program Files (x86)\\Microsoft\\Edge\\Application\\msedge.exe",
            ],
        },
    }
    candidates = binaries_by_system.get(platform.system(), {}).get(browser.lower(), [])
    for candidate in candidates:
        if candidate and Path(candidate).exists():
            return candidate
    return None
# Maps browser key -> zero-arg function that locates its default user data dir.
# "chrome" and "chromium" share one finder (it checks both install locations).
BROWSER_PROFILE_FINDERS = {
    "chrome": get_chrome_user_data_dir,
    "chromium": get_chrome_user_data_dir,
    "brave": get_brave_user_data_dir,
    "edge": get_edge_user_data_dir,
}
# All supported Chromium-based browser keys, in finder order.
CHROMIUM_BROWSERS = tuple(BROWSER_PROFILE_FINDERS.keys())
# Header lines written at the top of generated cookies.txt files
# (Netscape/curl-compatible cookie jar format); trailing "" yields a blank line.
NETSCAPE_COOKIE_HEADER = [
    "# Netscape HTTP Cookie File",
    "# https://curl.se/docs/http-cookies.html",
    "# This file was generated by ArchiveBox persona cookie extraction",
    "#",
    "# Format: domain\\tincludeSubdomains\\tpath\\tsecure\\texpiry\\tname\\tvalue",
    "",
]
def validate_persona_name(name: str) -> tuple[bool, str]:
    """Reject persona names that are empty or could escape the personas dir.

    Returns (True, "") when valid, otherwise (False, reason).
    """
    checks = (
        (not name or not name.strip(), "Persona name cannot be empty"),
        ("/" in name or "\\" in name, "Persona name cannot contain path separators (/ or \\)"),
        (".." in name, "Persona name cannot contain parent directory references (..)"),
        (name.startswith("."), "Persona name cannot start with a dot (.)"),
        (any(ch in name for ch in "\x00\n\r"), "Persona name contains invalid characters"),
    )
    for failed, reason in checks:
        if failed:
            return False, reason
    return True, ""
def discover_local_browser_profiles() -> list[PersonaImportSource]:
    """Enumerate importable Chromium profiles: installed browsers + persona templates."""
    sources: list[PersonaImportSource] = []
    for browser, find_user_data_dir in BROWSER_PROFILE_FINDERS.items():
        root = find_user_data_dir()
        if root is None:
            continue
        binary = get_browser_binary(browser)
        for profile_name in _list_profile_names(root):
            try:
                source = resolve_browser_profile_source(
                    browser=browser,
                    user_data_dir=root,
                    profile_dir=profile_name,
                    browser_binary=binary,
                )
            except ValueError:
                # Directory no longer resolves to a valid profile; skip it.
                continue
            sources.append(source)
    sources.extend(discover_persona_template_profiles())
    return sources
def discover_persona_template_profiles(personas_dir: Path | None = None) -> list[PersonaImportSource]:
    """Find importable chrome profiles saved inside persona template directories.

    Searches *personas_dir* when given, otherwise the configured PERSONAS_DIR
    plus ~/.config/abx/personas, de-duplicating resolved roots.
    """
    from archivebox.config.constants import CONSTANTS

    if personas_dir is not None:
        roots = [personas_dir.expanduser()]
    else:
        roots = [
            CONSTANTS.PERSONAS_DIR.expanduser(),
            Path.home() / ".config" / "abx" / "personas",
        ]

    sources: list[PersonaImportSource] = []
    visited: set[Path] = set()
    for root in roots:
        resolved = root.resolve()
        if resolved in visited or not resolved.is_dir():
            continue
        visited.add(resolved)
        persona_dirs = sorted(
            (entry for entry in resolved.iterdir() if entry.is_dir()),
            key=lambda entry: entry.name.lower(),
        )
        for persona_dir in persona_dirs:
            for candidate_name in PERSONA_PROFILE_DIR_CANDIDATES:
                data_dir = persona_dir / candidate_name
                if not data_dir.is_dir():
                    continue
                for profile_name in _list_profile_names(data_dir):
                    try:
                        sources.append(
                            resolve_browser_profile_source(
                                browser="persona",
                                source_name=persona_dir.name,
                                user_data_dir=data_dir,
                                profile_dir=profile_name,
                                browser_binary=get_browser_binary("chrome"),
                            )
                        )
                    except ValueError:
                        continue
    return sources
def resolve_browser_import_source(browser: str, profile_dir: str | None = None) -> PersonaImportSource:
    """Resolve a browser key (and optional profile dir name) into an import source.

    Raises ValueError for unsupported browsers, missing user data dirs, or
    when no profile can be found.
    """
    browser = browser.lower().strip()
    finder = BROWSER_PROFILE_FINDERS.get(browser)
    if finder is None:
        supported = ", ".join(BROWSER_PROFILE_FINDERS)
        raise ValueError(f"Unknown browser: {browser}. Supported browsers: {supported}")
    user_data_dir = finder()
    if not user_data_dir:
        raise ValueError(f"Could not find {browser} profile directory")
    profile_name = profile_dir or pick_default_profile_dir(user_data_dir)
    if not profile_name:
        raise ValueError(f"Could not find a profile in {user_data_dir}")
    return resolve_browser_profile_source(
        browser=browser,
        user_data_dir=user_data_dir,
        profile_dir=profile_name,
        browser_binary=get_browser_binary(browser),
    )
def resolve_browser_profile_source(
    browser: str,
    user_data_dir: Path,
    profile_dir: str,
    source_name: str | None = None,
    browser_binary: str | None = None,
) -> PersonaImportSource:
    """Validate a user-data-dir + profile-dir pair and wrap it as a source.

    Raises ValueError when the root is missing, the profile name is blank,
    or the profile directory does not look like a Chromium profile.
    """
    root = user_data_dir.expanduser()
    if not root.is_absolute():
        root = root.resolve()
    if not root.exists():
        raise ValueError(f"Profile root does not exist: {root}")
    if not profile_dir.strip():
        raise ValueError("Profile directory name cannot be empty.")
    candidate = root / profile_dir
    if not _looks_like_profile_dir(candidate):
        raise ValueError(f"Profile directory does not look valid: {candidate}")
    return PersonaImportSource(
        kind="browser-profile",
        browser=browser,
        source_name=source_name,
        user_data_dir=root,
        profile_dir=profile_dir,
        browser_binary=browser_binary,
    )
def resolve_custom_import_source(raw_value: str, profile_dir: str | None = None) -> PersonaImportSource:
    """Resolve a user-supplied path or CDP URL into a PersonaImportSource.

    Accepts a ws(s)/http(s) CDP URL, an exact profile directory path, or a
    user data dir root (optionally disambiguated by *profile_dir*).
    Raises ValueError for anything that cannot be resolved.
    """
    value = raw_value.strip()
    if not value:
        raise ValueError("Provide an absolute browser profile path or a CDP URL.")
    if _looks_like_cdp_url(value):
        return PersonaImportSource(kind="cdp", cdp_url=value)

    candidate = Path(value).expanduser()
    if not candidate.is_absolute():
        raise ValueError("Custom browser path must be an absolute path.")
    if not candidate.exists():
        raise ValueError(f"Custom browser path does not exist: {candidate}")

    requested_profile = (profile_dir or "").strip()
    if _looks_like_profile_dir(candidate):
        # The path points directly at a profile directory.
        if requested_profile and requested_profile != candidate.name:
            raise ValueError("Profile name does not match the provided profile directory path.")
        return resolve_browser_profile_source(
            browser="custom",
            user_data_dir=candidate.parent.resolve(),
            profile_dir=candidate.name,
        )

    # Otherwise treat the path as a user data dir root and pick a profile.
    chosen_profile = requested_profile or pick_default_profile_dir(candidate)
    if not chosen_profile:
        raise ValueError(
            "Could not find a Chromium profile in that directory. "
            "Provide an exact profile directory path or fill in the profile name field."
        )
    return resolve_browser_profile_source(
        browser="custom",
        user_data_dir=candidate.resolve(),
        profile_dir=chosen_profile,
    )
def pick_default_profile_dir(user_data_dir: Path) -> str | None:
    """Prefer the "Default" profile, else the first discovered profile name, else None."""
    names = _list_profile_names(user_data_dir)
    if "Default" in names:
        return "Default"
    return names[0] if names else None
def import_persona_from_source(
    persona: "Persona",
    source: PersonaImportSource,
    *,
    copy_profile: bool = True,
    import_cookies: bool = True,
    capture_storage: bool = False,
) -> PersonaImportResult:
    """Import browser state from *source* into *persona*.

    Depending on the flags and source kind this may copy the whole Chromium
    user data dir, extract cookies into cookies.txt, capture local/session
    storage into auth.json, and adopt the browser's user agent into the
    persona config. Non-fatal problems are collected in result.warnings
    instead of being raised.
    """
    persona.ensure_dirs()
    result = PersonaImportResult(source=source)
    persona_chrome_dir = Path(persona.CHROME_USER_DATA_DIR)
    cookies_file = persona.path / "cookies.txt"
    auth_file = persona.path / "auth.json"
    # User data dir that the local export step should launch against.
    launch_user_data_dir: Path | None = None
    if source.kind == "browser-profile":
        if copy_profile and source.user_data_dir:
            resolved_source_root = source.user_data_dir.resolve()
            resolved_persona_root = persona_chrome_dir.resolve()
            if resolved_source_root == resolved_persona_root:
                # Copying the dir onto itself would destroy it (the copy helper
                # rmtree's the destination first), so skip with a warning.
                result.warnings.append("Skipped profile copy because the selected source is already this persona's chrome_user_data directory.")
            else:
                copy_browser_user_data_dir(resolved_source_root, resolved_persona_root)
                persona.cleanup_chrome_profile(resolved_persona_root)
                result.profile_copied = True
            # Either way, launch against the persona's own copy from here on.
            launch_user_data_dir = resolved_persona_root
        else:
            launch_user_data_dir = source.user_data_dir
    elif copy_profile:
        result.warnings.append("Profile copying is only available for local Chromium profile paths. CDP imports can only pull cookies and open-tab storage.")
    if source.kind == "cdp":
        export_success, auth_payload, export_message = export_browser_state(
            cdp_url=source.cdp_url,
            cookies_output_file=cookies_file if import_cookies else None,
            auth_output_file=auth_file if capture_storage else None,
        )
    else:
        export_success, auth_payload, export_message = export_browser_state(
            user_data_dir=launch_user_data_dir,
            profile_dir=source.profile_dir,
            chrome_binary=source.browser_binary,
            cookies_output_file=cookies_file if import_cookies else None,
            auth_output_file=auth_file if capture_storage else None,
        )
    if not export_success:
        result.warnings.append(export_message or "Browser import failed.")
        return result
    # Mark what actually landed on disk rather than trusting flags alone.
    if import_cookies and cookies_file.exists():
        result.cookies_imported = True
    if capture_storage and auth_file.exists():
        result.storage_captured = True
    if _apply_imported_user_agent(persona, auth_payload):
        result.user_agent_imported = True
    return result
def copy_browser_user_data_dir(source_dir: Path, destination_dir: Path) -> None:
    """Replace *destination_dir* with a copy of *source_dir*.

    Files matching VOLATILE_PROFILE_COPY_PATTERNS (caches, logs, crash dumps,
    Singleton* lock files, ...) are not copied. Symlinks are preserved as-is.
    """
    destination_dir.parent.mkdir(parents=True, exist_ok=True)
    # Remove any previous copy first so stale files don't linger.
    shutil.rmtree(destination_dir, ignore_errors=True)
    skip_volatile = shutil.ignore_patterns(*VOLATILE_PROFILE_COPY_PATTERNS)
    shutil.copytree(source_dir, destination_dir, symlinks=True, ignore=skip_volatile)
def export_browser_state(
    *,
    user_data_dir: Path | None = None,
    cdp_url: str | None = None,
    profile_dir: str | None = None,
    chrome_binary: str | None = None,
    cookies_output_file: Path | None = None,
    auth_output_file: Path | None = None,
) -> tuple[bool, dict | None, str]:
    """Drive the export_browser_state.js Node helper to dump browser state.

    One of *user_data_dir* (local profile) or *cdp_url* (remote browser) must
    be provided. Cookies are written (or merged, when the file already exists)
    into *cookies_output_file* in Netscape format; localStorage/sessionStorage/
    user agent into *auth_output_file* as JSON.

    Returns (success, auth_payload, message) where auth_payload is the parsed
    auth.json dict when one was produced, and message is the helper's output
    (or an error description on failure).
    """
    if not user_data_dir and not cdp_url:
        return False, None, "Missing browser source."

    from abx_plugins import get_plugins_dir
    from archivebox.config.common import STORAGE_CONFIG

    state_script = Path(__file__).with_name("export_browser_state.js")
    if not state_script.exists():
        return False, None, f"Browser state export script not found at {state_script}"

    node_modules_dir = STORAGE_CONFIG.LIB_DIR / "npm" / "node_modules"
    chrome_plugin_dir = Path(get_plugins_dir()).resolve()

    # The Node helper is configured entirely through environment variables.
    env = os.environ.copy()
    env["NODE_MODULES_DIR"] = str(node_modules_dir)
    env["ARCHIVEBOX_ABX_PLUGINS_DIR"] = str(chrome_plugin_dir)
    if user_data_dir:
        env["CHROME_USER_DATA_DIR"] = str(user_data_dir)
    if cdp_url:
        env["CHROME_CDP_URL"] = cdp_url
        env["CHROME_IS_LOCAL"] = "false"
    if chrome_binary:
        env["CHROME_BINARY"] = str(chrome_binary)
    if profile_dir:
        # Append --profile-directory to CHROME_ARGS_EXTRA, preserving any
        # pre-existing args (accepted as a JSON list or comma-separated text).
        extra_arg = f"--profile-directory={profile_dir}"
        existing_extra = env.get("CHROME_ARGS_EXTRA", "").strip()
        args_list: list[str] = []
        if existing_extra:
            if existing_extra.startswith("["):
                try:
                    parsed = json.loads(existing_extra)
                    if isinstance(parsed, list):
                        args_list.extend(str(x) for x in parsed)
                except Exception:
                    args_list.extend([s.strip() for s in existing_extra.split(",") if s.strip()])
            else:
                args_list.extend([s.strip() for s in existing_extra.split(",") if s.strip()])
        args_list.append(extra_arg)
        env["CHROME_ARGS_EXTRA"] = json.dumps(args_list)

    # When an output file already exists we export into a temp file first and
    # merge afterwards so existing entries are preserved.
    temp_dir: Path | None = None
    tmp_cookies_file: Path | None = None
    tmp_auth_file: Path | None = None
    try:
        if cookies_output_file and cookies_output_file.exists():
            temp_dir = Path(tempfile.mkdtemp(prefix="ab_browser_state_"))
            tmp_cookies_file = temp_dir / "cookies.txt"
            env["COOKIES_OUTPUT_FILE"] = str(tmp_cookies_file)
        elif cookies_output_file:
            env["COOKIES_OUTPUT_FILE"] = str(cookies_output_file)
        if auth_output_file and auth_output_file.exists():
            temp_dir = temp_dir or Path(tempfile.mkdtemp(prefix="ab_browser_state_"))
            tmp_auth_file = temp_dir / "auth.json"
            env["AUTH_STORAGE_OUTPUT_FILE"] = str(tmp_auth_file)
        elif auth_output_file:
            env["AUTH_STORAGE_OUTPUT_FILE"] = str(auth_output_file)
        else:
            # Always capture auth storage (into a temp file) so the user agent
            # can be read even when the caller didn't ask to keep auth.json.
            temp_dir = temp_dir or Path(tempfile.mkdtemp(prefix="ab_browser_state_"))
            tmp_auth_file = temp_dir / "auth.json"
            env["AUTH_STORAGE_OUTPUT_FILE"] = str(tmp_auth_file)

        try:
            result = subprocess.run(
                ["node", str(state_script)],
                env=env,
                capture_output=True,
                text=True,
                timeout=120,
            )
        except subprocess.TimeoutExpired:
            return False, None, "Browser state export timed out."
        except FileNotFoundError:
            return False, None, "Node.js was not found, so ArchiveBox could not extract browser state."
        except Exception as err:
            return False, None, f"Browser state export failed: {err}"

        if result.returncode != 0:
            message = (result.stderr or result.stdout or "").strip() or "Browser state export failed."
            return False, None, message

        auth_payload: dict | None = None
        if cookies_output_file and tmp_cookies_file and tmp_cookies_file.exists():
            _merge_netscape_cookies(cookies_output_file, tmp_cookies_file)
        if auth_output_file and tmp_auth_file and tmp_auth_file.exists():
            _merge_auth_storage(auth_output_file, tmp_auth_file)
            auth_payload = _load_auth_storage(tmp_auth_file)
        elif auth_output_file and auth_output_file.exists():
            auth_payload = _load_auth_storage(auth_output_file)
        elif tmp_auth_file and tmp_auth_file.exists():
            auth_payload = _load_auth_storage(tmp_auth_file)

        return True, auth_payload, (result.stderr or result.stdout or "").strip()
    finally:
        # BUGFIX: previously the temp dir leaked on every early-return failure
        # path (timeout, missing node, nonzero exit); always clean it up.
        if temp_dir and temp_dir.exists():
            shutil.rmtree(temp_dir, ignore_errors=True)
def _list_profile_names(user_data_dir: Path) -> list[str]:
    """Return case-insensitively sorted names of subdirs that look like profiles.

    Chromium's internal "System Profile" directory is always excluded.
    Returns [] when *user_data_dir* is missing or not a directory.
    """
    if not user_data_dir.is_dir():
        return []
    names: list[str] = []
    for child in sorted(user_data_dir.iterdir(), key=lambda path: path.name.lower()):
        if not child.is_dir() or child.name == "System Profile":
            continue
        # _looks_like_profile_dir already accepts the conventional names
        # ("Default", "Profile N", "Guest Profile") as well as dirs containing
        # profile marker files, so a single check covers both cases — the old
        # separate branch for named profiles was redundant.
        if _looks_like_profile_dir(child):
            names.append(child.name)
    return names
def _looks_like_profile_dir(path: Path) -> bool:
    """Heuristic: is *path* a Chromium profile directory?

    True when it contains any known profile marker file/dir, or when its name
    matches one of the conventional profile name prefixes.
    """
    if not path.is_dir():
        return False
    marker_names = (
        "Preferences",
        "History",
        "Cookies",
        "Network/Cookies",
        "Local Storage",
        "Session Storage",
    )
    if any((path / marker).exists() for marker in marker_names):
        return True
    # Fall back to the conventional profile directory names (prefix match).
    return any(path.name.startswith(prefix) for prefix in BROWSER_PROFILE_DIR_NAMES)
def _looks_like_cdp_url(value: str) -> bool:
parsed = urlparse(value)
return parsed.scheme in {"ws", "wss", "http", "https"} and bool(parsed.netloc)
def _parse_netscape_cookies(path: Path) -> dict[tuple[str, str, str], tuple[str, str, str, str, str, str, str]]:
cookies: dict[tuple[str, str, str], tuple[str, str, str, str, str, str, str]] = {}
if not path.exists():
return cookies
for line in path.read_text().splitlines():
if not line or line.startswith("#"):
continue
parts = line.split("\t")
if len(parts) < 7:
continue
domain, include_subdomains, cookie_path, secure, expiry, name, value = parts[:7]
cookies[(domain, cookie_path, name)] = (domain, include_subdomains, cookie_path, secure, expiry, name, value)
return cookies
def _write_netscape_cookies(
    path: Path,
    cookies: dict[tuple[str, str, str], tuple[str, str, str, str, str, str, str]],
) -> None:
    """Serialize cookies (as produced by _parse_netscape_cookies) to Netscape format."""
    body = ["\t".join(fields) for fields in cookies.values()]
    path.write_text("\n".join(NETSCAPE_COOKIE_HEADER + body) + "\n")
def _merge_netscape_cookies(existing_file: Path, new_file: Path) -> None:
    """Merge cookies from *new_file* into *existing_file*; new entries win on conflict."""
    merged = _parse_netscape_cookies(existing_file)
    merged.update(_parse_netscape_cookies(new_file))
    _write_netscape_cookies(existing_file, merged)
def _merge_auth_storage(existing_file: Path, new_file: Path) -> None:
    """Merge the auth.json payload in *new_file* into *existing_file*.

    localStorage/sessionStorage merge per-origin (new origins win), cookies
    dedupe by (domain, path, name) with new entries winning, and the newest
    non-empty user_agent is kept. The merged payload is written back to
    *existing_file* as pretty-printed, key-sorted JSON.
    """
    old_payload = _load_auth_storage(existing_file)
    new_payload = _load_auth_storage(new_file)
    local_storage = old_payload.setdefault("localStorage", {})
    session_storage = old_payload.setdefault("sessionStorage", {})
    local_storage.update(new_payload.get("localStorage") or {})
    session_storage.update(new_payload.get("sessionStorage") or {})
    merged = {
        **old_payload,
        **new_payload,
        "cookies": _merge_cookie_dicts(old_payload.get("cookies") or [], new_payload.get("cookies") or []),
        "localStorage": local_storage,
        "sessionStorage": session_storage,
        "user_agent": new_payload.get("user_agent") or old_payload.get("user_agent") or "",
    }
    existing_file.write_text(json.dumps(merged, indent=2, sort_keys=True) + "\n")
def _load_auth_storage(path: Path) -> dict:
if not path.exists():
return {
"TYPE": "auth",
"cookies": [],
"localStorage": {},
"sessionStorage": {},
}
try:
payload = json.loads(path.read_text())
except json.JSONDecodeError:
return {
"TYPE": "auth",
"cookies": [],
"localStorage": {},
"sessionStorage": {},
}
if not isinstance(payload, dict):
return {
"TYPE": "auth",
"cookies": [],
"localStorage": {},
"sessionStorage": {},
}
return payload
def _merge_cookie_dicts(existing: list[dict], new: list[dict]) -> list[dict]:
merged: dict[tuple[str, str, str], dict] = {}
for cookie in existing:
key = (str(cookie.get("domain") or ""), str(cookie.get("path") or "/"), str(cookie.get("name") or ""))
merged[key] = cookie
for cookie in new:
key = (str(cookie.get("domain") or ""), str(cookie.get("path") or "/"), str(cookie.get("name") or ""))
merged[key] = cookie
return list(merged.values())
def _apply_imported_user_agent(persona: "Persona", auth_payload: dict | None) -> bool:
if not auth_payload:
return False
user_agent = str(auth_payload.get("user_agent") or "").strip()
if not user_agent:
return False
config = dict(persona.config or {})
if config.get("USER_AGENT") == user_agent:
return False
config["USER_AGENT"] = user_agent
persona.config = config
persona.save(update_fields=["config"])
return True

View File

@@ -117,6 +117,12 @@ class Persona(ModelWithConfig):
cookies_path = self.path / 'cookies.txt'
return str(cookies_path) if cookies_path.exists() else ''
@property
def AUTH_STORAGE_FILE(self) -> str:
"""Derived path to auth.json for this persona (if it exists)."""
auth_path = self.path / 'auth.json'
return str(auth_path) if auth_path.exists() else ''
def get_derived_config(self) -> dict:
"""
Get config dict with derived paths filled in.
@@ -127,6 +133,7 @@ class Persona(ModelWithConfig):
- CHROME_EXTENSIONS_DIR (derived from persona path)
- CHROME_DOWNLOADS_DIR (derived from persona path)
- COOKIES_FILE (derived from persona path, if file exists)
- AUTH_STORAGE_FILE (derived from persona path, if file exists)
- ACTIVE_PERSONA (set to this persona's name)
"""
derived = dict(self.config or {})
@@ -140,6 +147,8 @@ class Persona(ModelWithConfig):
derived['CHROME_DOWNLOADS_DIR'] = self.CHROME_DOWNLOADS_DIR
if 'COOKIES_FILE' not in derived and self.COOKIES_FILE:
derived['COOKIES_FILE'] = self.COOKIES_FILE
if 'AUTH_STORAGE_FILE' not in derived and self.AUTH_STORAGE_FILE:
derived['AUTH_STORAGE_FILE'] = self.AUTH_STORAGE_FILE
# Always set ACTIVE_PERSONA to this persona's name
derived['ACTIVE_PERSONA'] = self.name

View File

@@ -1,5 +1,6 @@
from __future__ import annotations
import json
import mimetypes
from collections import defaultdict
from pathlib import Path
@@ -7,9 +8,10 @@ from pathlib import Path
from asgiref.sync import sync_to_async
from django.utils import timezone
from abx_dl.events import ArchiveResultEvent
from abx_dl.events import ArchiveResultEvent, ProcessCompletedEvent
from abx_dl.services.base import BaseService
from .db import run_db_op
from .process_service import ProcessService, parse_event_datetime
@@ -48,22 +50,93 @@ def _collect_output_metadata(plugin_dir: Path) -> tuple[dict[str, dict], int, st
def _normalize_status(status: str) -> str:
if status == "noresult":
return "skipped"
return "noresults"
return status or "failed"
def _has_content_files(output_files: list[str]) -> bool:
return any(Path(path).suffix not in {".log", ".pid", ".sh"} for path in output_files)
def _iter_archiveresult_records(stdout: str) -> list[dict]:
records: list[dict] = []
for raw_line in stdout.splitlines():
line = raw_line.strip()
if not line.startswith("{"):
continue
try:
record = json.loads(line)
except json.JSONDecodeError:
continue
if record.get("type") == "ArchiveResult":
records.append(record)
return records
class ArchiveResultService(BaseService):
LISTENS_TO = [ArchiveResultEvent]
LISTENS_TO = [ArchiveResultEvent, ProcessCompletedEvent]
EMITS = []
def __init__(self, bus, *, process_service: ProcessService):
self.process_service = process_service
super().__init__(bus)
async def on_ArchiveResultEvent(self, event: ArchiveResultEvent) -> None:
await sync_to_async(self._project, thread_sensitive=True)(event)
async def on_ArchiveResultEvent__Outer(self, event: ArchiveResultEvent) -> None:
snapshot_output_dir = await run_db_op(self._get_snapshot_output_dir, event.snapshot_id)
if snapshot_output_dir is None:
return
plugin_dir = Path(snapshot_output_dir) / event.plugin
output_files, output_size, output_mimetypes = await sync_to_async(_collect_output_metadata)(plugin_dir)
await run_db_op(self._project, event, output_files, output_size, output_mimetypes)
def _project(self, event: ArchiveResultEvent) -> None:
async def on_ProcessCompletedEvent__Outer(self, event: ProcessCompletedEvent) -> None:
if not event.snapshot_id or not event.hook_name.startswith("on_Snapshot"):
return
plugin_dir = Path(event.output_dir)
output_files, output_size, output_mimetypes = await sync_to_async(_collect_output_metadata)(plugin_dir)
records = _iter_archiveresult_records(event.stdout)
if records:
for record in records:
await run_db_op(
self._project_from_process_completed,
event,
record,
output_files,
output_size,
output_mimetypes,
)
return
synthetic_record = {
"plugin": event.plugin_name,
"hook_name": event.hook_name,
"status": "failed" if event.exit_code != 0 else ("succeeded" if _has_content_files(event.output_files) else "skipped"),
"output_str": event.stderr if event.exit_code != 0 else "",
"error": event.stderr if event.exit_code != 0 else "",
}
await run_db_op(
self._project_from_process_completed,
event,
synthetic_record,
output_files,
output_size,
output_mimetypes,
)
def _get_snapshot_output_dir(self, snapshot_id: str) -> str | None:
from archivebox.core.models import Snapshot
snapshot = Snapshot.objects.filter(id=snapshot_id).only("output_dir").first()
return str(snapshot.output_dir) if snapshot is not None else None
def _project(
self,
event: ArchiveResultEvent,
output_files: dict[str, dict],
output_size: int,
output_mimetypes: str,
) -> None:
from archivebox.core.models import ArchiveResult, Snapshot
from archivebox.machine.models import Process
@@ -86,8 +159,6 @@ class ArchiveResultService(BaseService):
},
)
plugin_dir = Path(snapshot.output_dir) / event.plugin
output_files, output_size, output_mimetypes = _collect_output_metadata(plugin_dir)
result.process = process or result.process
result.status = _normalize_status(event.status)
result.output_str = event.output_str
@@ -97,7 +168,28 @@ class ArchiveResultService(BaseService):
result.output_mimetypes = output_mimetypes
result.start_ts = parse_event_datetime(event.start_ts) or result.start_ts or timezone.now()
result.end_ts = parse_event_datetime(event.end_ts) or timezone.now()
result.retry_at = None
if event.error:
result.notes = event.error
result.save()
def _project_from_process_completed(
self,
event: ProcessCompletedEvent,
record: dict,
output_files: dict[str, dict],
output_size: int,
output_mimetypes: str,
) -> None:
archive_result_event = ArchiveResultEvent(
snapshot_id=record.get("snapshot_id") or event.snapshot_id,
plugin=record.get("plugin") or event.plugin_name,
hook_name=record.get("hook_name") or event.hook_name,
status=record.get("status") or "",
process_id=event.process_id,
output_str=record.get("output_str") or "",
output_json=record.get("output_json") if isinstance(record.get("output_json"), dict) else None,
start_ts=event.start_ts,
end_ts=event.end_ts,
error=record.get("error") or (event.stderr if event.exit_code != 0 else ""),
)
self._project(archive_result_event, output_files, output_size, output_mimetypes)

View File

@@ -1,19 +1,23 @@
from __future__ import annotations
from asgiref.sync import sync_to_async
import asyncio
from abx_dl.events import BinaryEvent, BinaryInstalledEvent
from abx_dl.services.base import BaseService
from .db import run_db_op
class BinaryService(BaseService):
LISTENS_TO = [BinaryEvent, BinaryInstalledEvent]
EMITS = []
async def on_BinaryEvent(self, event: BinaryEvent) -> None:
await sync_to_async(self._project_binary, thread_sensitive=True)(event)
async def on_BinaryEvent__Outer(self, event: BinaryEvent) -> None:
await run_db_op(self._project_binary, event)
async def on_BinaryInstalledEvent(self, event: BinaryInstalledEvent) -> None:
await sync_to_async(self._project_installed_binary, thread_sensitive=True)(event)
async def on_BinaryInstalledEvent__Outer(self, event: BinaryInstalledEvent) -> None:
resolved = await asyncio.to_thread(self._resolve_installed_binary_metadata, event)
await run_db_op(self._project_installed_binary, event, resolved)
def _project_binary(self, event: BinaryEvent) -> None:
from archivebox.machine.models import Binary, Machine
@@ -44,7 +48,39 @@ class BinaryService(BaseService):
},
)
def _project_installed_binary(self, event: BinaryInstalledEvent) -> None:
def _resolve_installed_binary_metadata(self, event: BinaryInstalledEvent) -> dict[str, str]:
resolved = {
"abspath": event.abspath or "",
"version": event.version or "",
"sha256": event.sha256 or "",
"binproviders": event.binproviders or "",
"binprovider": event.binprovider or "",
}
if resolved["abspath"] and resolved["version"] and resolved["binprovider"]:
return resolved
try:
from abx_dl.dependencies import load_binary
allowed_providers = resolved["binproviders"] or resolved["binprovider"] or "env,pip,npm,brew,apt"
spec = {
"name": event.name,
"binproviders": allowed_providers,
"overrides": event.overrides or {},
}
binary = load_binary(spec)
resolved["abspath"] = str(getattr(binary, "abspath", None) or resolved["abspath"] or "")
resolved["version"] = str(getattr(binary, "version", None) or resolved["version"] or "")
resolved["sha256"] = str(getattr(binary, "sha256", None) or resolved["sha256"] or "")
provider_name = getattr(getattr(binary, "loaded_binprovider", None), "name", None)
if provider_name:
resolved["binprovider"] = str(provider_name)
except Exception:
pass
return resolved
def _project_installed_binary(self, event: BinaryInstalledEvent, resolved: dict[str, str]) -> None:
from archivebox.machine.models import Binary, Machine
machine = Machine.current()
@@ -55,10 +91,14 @@ class BinaryService(BaseService):
"status": Binary.StatusChoices.QUEUED,
},
)
binary.abspath = event.abspath or binary.abspath
binary.version = event.version or binary.version
binary.sha256 = event.sha256 or binary.sha256
binary.binprovider = event.binprovider or binary.binprovider
binary.abspath = resolved["abspath"] or binary.abspath
binary.version = resolved["version"] or binary.version
binary.sha256 = resolved["sha256"] or binary.sha256
if resolved["binproviders"]:
binary.binproviders = resolved["binproviders"]
binary.binprovider = resolved["binprovider"] or binary.binprovider
if event.overrides and binary.overrides != event.overrides:
binary.overrides = event.overrides
binary.status = Binary.StatusChoices.INSTALLED
binary.retry_at = None
binary.save(update_fields=["abspath", "version", "sha256", "binprovider", "status", "retry_at", "modified_at"])
binary.save(update_fields=["abspath", "version", "sha256", "binproviders", "binprovider", "overrides", "status", "retry_at", "modified_at"])

View File

@@ -1,11 +1,10 @@
from __future__ import annotations
from asgiref.sync import sync_to_async
from django.utils import timezone
from abx_dl.events import CrawlCleanupEvent, CrawlCompletedEvent, CrawlSetupEvent, CrawlStartEvent
from abx_dl.services.base import BaseService
from .db import run_db_op
class CrawlService(BaseService):
LISTENS_TO = [CrawlSetupEvent, CrawlStartEvent, CrawlCleanupEvent, CrawlCompletedEvent]
@@ -15,17 +14,17 @@ class CrawlService(BaseService):
self.crawl_id = crawl_id
super().__init__(bus)
async def on_CrawlSetupEvent(self, event: CrawlSetupEvent) -> None:
await sync_to_async(self._mark_started, thread_sensitive=True)()
async def on_CrawlSetupEvent__Outer(self, event: CrawlSetupEvent) -> None:
await run_db_op(self._mark_started)
async def on_CrawlStartEvent(self, event: CrawlStartEvent) -> None:
await sync_to_async(self._mark_started, thread_sensitive=True)()
async def on_CrawlStartEvent__Outer(self, event: CrawlStartEvent) -> None:
await run_db_op(self._mark_started)
async def on_CrawlCleanupEvent(self, event: CrawlCleanupEvent) -> None:
await sync_to_async(self._mark_started, thread_sensitive=True)()
async def on_CrawlCleanupEvent__Outer(self, event: CrawlCleanupEvent) -> None:
await run_db_op(self._mark_started)
async def on_CrawlCompletedEvent(self, event: CrawlCompletedEvent) -> None:
await sync_to_async(self._mark_completed, thread_sensitive=True)()
async def on_CrawlCompletedEvent__Outer(self, event: CrawlCompletedEvent) -> None:
await run_db_op(self._mark_completed)
def _mark_started(self) -> None:
from archivebox.crawls.models import Crawl

16
archivebox/services/db.py Normal file
View File

@@ -0,0 +1,16 @@
from __future__ import annotations
from asgiref.sync import sync_to_async
from django.db import close_old_connections
def _run_db_op(func, *args, **kwargs):
close_old_connections()
try:
return func(*args, **kwargs)
finally:
close_old_connections()
async def run_db_op(func, *args, **kwargs):
return await sync_to_async(_run_db_op, thread_sensitive=True)(func, *args, **kwargs)

View File

@@ -0,0 +1 @@
from abx_dl.cli import LiveBusUI

View File

@@ -1,16 +1,17 @@
from __future__ import annotations
from asgiref.sync import sync_to_async
from abx_dl.events import MachineEvent
from abx_dl.services.base import BaseService
from .db import run_db_op
class MachineService(BaseService):
LISTENS_TO = [MachineEvent]
EMITS = []
async def on_MachineEvent(self, event: MachineEvent) -> None:
await sync_to_async(self._project, thread_sensitive=True)(event)
async def on_MachineEvent__Outer(self, event: MachineEvent) -> None:
await run_db_op(self._project, event)
def _project(self, event: MachineEvent) -> None:
from archivebox.machine.models import Machine

View File

@@ -3,12 +3,13 @@ from __future__ import annotations
from datetime import datetime
from typing import TYPE_CHECKING
from asgiref.sync import sync_to_async
from django.utils import timezone
from abx_dl.events import ProcessCompletedEvent, ProcessStartedEvent
from abx_dl.services.base import BaseService
from .db import run_db_op
if TYPE_CHECKING:
from archivebox.machine.models import Process
@@ -33,27 +34,33 @@ class ProcessService(BaseService):
self.process_ids: dict[str, str] = {}
super().__init__(bus)
async def on_ProcessStartedEvent(self, event: ProcessStartedEvent) -> None:
await sync_to_async(self._project_started, thread_sensitive=True)(event)
async def on_ProcessStartedEvent__Outer(self, event: ProcessStartedEvent) -> None:
await run_db_op(self._project_started, event)
async def on_ProcessCompletedEvent(self, event: ProcessCompletedEvent) -> None:
await sync_to_async(self._project_completed, thread_sensitive=True)(event)
async def on_ProcessCompletedEvent__Outer(self, event: ProcessCompletedEvent) -> None:
await run_db_op(self._project_completed, event)
def get_db_process_id(self, process_id: str) -> str | None:
return self.process_ids.get(process_id)
def _get_or_create_process(self, event: ProcessStartedEvent | ProcessCompletedEvent) -> "Process":
from archivebox.machine.models import Machine, Process
from archivebox.machine.models import NetworkInterface, Process
db_process_id = self.process_ids.get(event.process_id)
iface = NetworkInterface.current(refresh=True)
if db_process_id:
process = Process.objects.filter(id=db_process_id).first()
if process is not None:
if process.iface_id != iface.id or process.machine_id != iface.machine_id:
process.iface = iface
process.machine = iface.machine
process.save(update_fields=["iface", "machine", "modified_at"])
return process
process_type = Process.TypeChoices.BINARY if event.hook_name.startswith("on_Binary") else Process.TypeChoices.HOOK
process = Process.objects.create(
machine=Machine.current(),
machine=iface.machine,
iface=iface,
process_type=process_type,
pwd=event.output_dir,
cmd=[event.hook_path, *event.hook_args],
@@ -77,12 +84,14 @@ class ProcessService(BaseService):
process.started_at = parse_event_datetime(event.start_ts) or process.started_at or timezone.now()
process.status = process.StatusChoices.RUNNING
process.retry_at = None
process.hydrate_binary_from_context(plugin_name=event.plugin_name, hook_path=event.hook_path)
process.save()
def _project_completed(self, event: ProcessCompletedEvent) -> None:
process = self._get_or_create_process(event)
process.pwd = event.output_dir
process.cmd = [event.hook_path, *event.hook_args]
if not process.cmd:
process.cmd = [event.hook_path, *event.hook_args]
process.env = event.env
process.pid = event.pid or process.pid
process.started_at = parse_event_datetime(event.start_ts) or process.started_at
@@ -92,4 +101,5 @@ class ProcessService(BaseService):
process.exit_code = event.exit_code
process.status = process.StatusChoices.EXITED
process.retry_at = None
process.hydrate_binary_from_context(plugin_name=event.plugin_name, hook_path=event.hook_path)
process.save()

View File

@@ -3,16 +3,21 @@ from __future__ import annotations
import asyncio
import json
import os
import shutil
import subprocess
import sys
import time
from contextlib import nullcontext
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Any
from django.utils import timezone
from rich.console import Console
from abx_dl.events import BinaryEvent
from abx_dl.models import INSTALL_URL, Snapshot as AbxSnapshot, discover_plugins
from abx_dl.orchestrator import create_bus, download, install_plugins as abx_install_plugins, setup_services as setup_abx_services
from abx_dl.models import INSTALL_URL, Plugin, Snapshot as AbxSnapshot, discover_plugins, filter_plugins
from abx_dl.orchestrator import create_bus, download, install_plugins as abx_install_plugins, prepare_install_plugins, setup_services as setup_abx_services
from .archive_result_service import ArchiveResultService
from .binary_service import BinaryService
@@ -21,6 +26,7 @@ from .machine_service import MachineService
from .process_service import ProcessService
from .snapshot_service import SnapshotService
from .tag_service import TagService
from .live_ui import LiveBusUI
def _bus_name(prefix: str, identifier: str) -> str:
@@ -35,6 +41,19 @@ def _selected_plugins_from_config(config: dict[str, Any]) -> list[str] | None:
return [name.strip() for name in raw.split(",") if name.strip()]
def _count_selected_hooks(plugins: dict[str, Plugin], selected_plugins: list[str] | None) -> int:
selected = filter_plugins(plugins, selected_plugins) if selected_plugins else plugins
total = 0
for plugin in selected.values():
total += len(list(plugin.get_crawl_hooks()))
total += len(list(plugin.get_snapshot_hooks()))
return total
def _runner_debug(message: str) -> None:
print(f"[runner] {message}", file=sys.stderr, flush=True)
def _attach_bus_trace(bus) -> None:
trace_target = (os.environ.get("ARCHIVEBOX_BUS_TRACE") or "").strip()
if not trace_target:
@@ -78,10 +97,51 @@ async def _stop_bus_trace(bus) -> None:
bus._archivebox_trace_task = None
def ensure_background_runner(*, allow_under_pytest: bool = False) -> bool:
if os.environ.get("PYTEST_CURRENT_TEST") and not allow_under_pytest:
return False
from archivebox.config import CONSTANTS
from archivebox.machine.models import Machine, Process
Process.cleanup_stale_running()
machine = Machine.current()
if Process.objects.filter(
machine=machine,
status=Process.StatusChoices.RUNNING,
process_type=Process.TypeChoices.ORCHESTRATOR,
).exists():
return False
log_path = CONSTANTS.LOGS_DIR / "errors.log"
log_path.parent.mkdir(parents=True, exist_ok=True)
env = os.environ.copy()
env.setdefault("DATA_DIR", str(CONSTANTS.DATA_DIR))
with log_path.open("a", encoding="utf-8") as log_handle:
subprocess.Popen(
[sys.executable, "-m", "archivebox", "run", "--daemon"],
cwd=str(CONSTANTS.DATA_DIR),
env=env,
stdin=subprocess.DEVNULL,
stdout=log_handle,
stderr=log_handle,
start_new_session=True,
)
return True
class CrawlRunner:
MAX_CONCURRENT_SNAPSHOTS = 8
def __init__(self, crawl, *, snapshot_ids: list[str] | None = None, selected_plugins: list[str] | None = None):
def __init__(
self,
crawl,
*,
snapshot_ids: list[str] | None = None,
selected_plugins: list[str] | None = None,
process_discovered_snapshots_inline: bool = True,
):
self.crawl = crawl
self.bus = create_bus(name=_bus_name("ArchiveBox", str(crawl.id)), total_timeout=3600.0)
self.plugins = discover_plugins()
@@ -90,7 +150,12 @@ class CrawlRunner:
self.binary_service = BinaryService(self.bus)
self.tag_service = TagService(self.bus)
self.crawl_service = CrawlService(self.bus, crawl_id=str(crawl.id))
self.snapshot_service = SnapshotService(self.bus, crawl_id=str(crawl.id), schedule_snapshot=self.enqueue_snapshot)
self.process_discovered_snapshots_inline = process_discovered_snapshots_inline
self.snapshot_service = SnapshotService(
self.bus,
crawl_id=str(crawl.id),
schedule_snapshot=self.enqueue_snapshot if process_discovered_snapshots_inline else self.leave_snapshot_queued,
)
self.archive_result_service = ArchiveResultService(self.bus, process_service=self.process_service)
self.selected_plugins = selected_plugins
self.initial_snapshot_ids = snapshot_ids
@@ -100,6 +165,29 @@ class CrawlRunner:
self.persona = None
self.base_config: dict[str, Any] = {}
self.primary_url = ""
self._live_stream = None
def _create_projector_bus(self, *, identifier: str, config_overrides: dict[str, Any]):
bus = create_bus(name=_bus_name("ArchiveBox", identifier), total_timeout=3600.0)
process_service = ProcessService(bus)
MachineService(bus)
BinaryService(bus)
TagService(bus)
CrawlService(bus, crawl_id=str(self.crawl.id))
SnapshotService(
bus,
crawl_id=str(self.crawl.id),
schedule_snapshot=self.enqueue_snapshot if self.process_discovered_snapshots_inline else self.leave_snapshot_queued,
)
ArchiveResultService(bus, process_service=process_service)
abx_services = setup_abx_services(
bus,
plugins=self.plugins,
config_overrides=config_overrides,
auto_install=True,
emit_jsonl=False,
)
return bus, abx_services
async def run(self) -> None:
from asgiref.sync import sync_to_async
@@ -107,35 +195,63 @@ class CrawlRunner:
try:
await sync_to_async(self._prepare, thread_sensitive=True)()
_attach_bus_trace(self.bus)
self.abx_services = setup_abx_services(
self.bus,
plugins=self.plugins,
config_overrides=self.base_config,
auto_install=True,
emit_jsonl=False,
)
if self.crawl.get_system_task() == INSTALL_URL:
await self._run_install_crawl()
else:
snapshot_ids = await sync_to_async(self._initial_snapshot_ids, thread_sensitive=True)()
if snapshot_ids:
root_snapshot_id = snapshot_ids[0]
await self._run_crawl_setup(root_snapshot_id)
for snapshot_id in snapshot_ids:
await self.enqueue_snapshot(snapshot_id)
await self._wait_for_snapshot_tasks()
await self._run_crawl_cleanup(root_snapshot_id)
if self.abx_services is not None:
await self.abx_services.process.wait_for_background_monitors()
live_ui = self._create_live_ui()
with live_ui if live_ui is not None else nullcontext():
_attach_bus_trace(self.bus)
self.abx_services = setup_abx_services(
self.bus,
plugins=self.plugins,
config_overrides=self.base_config,
auto_install=True,
emit_jsonl=False,
)
if self.crawl.get_system_task() == INSTALL_URL:
await self._run_install_crawl()
else:
snapshot_ids = await sync_to_async(self._initial_snapshot_ids, thread_sensitive=True)()
if snapshot_ids:
root_snapshot_id = snapshot_ids[0]
_runner_debug(f"crawl {self.crawl.id} starting crawl setup root_snapshot={root_snapshot_id}")
await self._run_crawl_setup(root_snapshot_id)
_runner_debug(f"crawl {self.crawl.id} finished crawl setup root_snapshot={root_snapshot_id}")
for snapshot_id in snapshot_ids:
await self.enqueue_snapshot(snapshot_id)
_runner_debug(f"crawl {self.crawl.id} waiting for snapshot tasks count={len(self.snapshot_tasks)}")
await self._wait_for_snapshot_tasks()
_runner_debug(f"crawl {self.crawl.id} finished waiting for snapshot tasks")
_runner_debug(f"crawl {self.crawl.id} starting django crawl.cleanup()")
await sync_to_async(self.crawl.cleanup, thread_sensitive=True)()
_runner_debug(f"crawl {self.crawl.id} finished django crawl.cleanup()")
_runner_debug(f"crawl {self.crawl.id} starting abx crawl cleanup root_snapshot={root_snapshot_id}")
await self._run_crawl_cleanup(root_snapshot_id)
_runner_debug(f"crawl {self.crawl.id} finished abx crawl cleanup root_snapshot={root_snapshot_id}")
if self.abx_services is not None:
_runner_debug(f"crawl {self.crawl.id} waiting for main bus background monitors")
await self.abx_services.process.wait_for_background_monitors()
_runner_debug(f"crawl {self.crawl.id} finished waiting for main bus background monitors")
finally:
await _stop_bus_trace(self.bus)
await self.bus.stop()
if self._live_stream is not None:
try:
self._live_stream.close()
except Exception:
pass
self._live_stream = None
await sync_to_async(self._cleanup_persona, thread_sensitive=True)()
crawl = await sync_to_async(Crawl.objects.get, thread_sensitive=True)(id=self.crawl.id)
if crawl.status != Crawl.StatusChoices.SEALED:
crawl.status = Crawl.StatusChoices.SEALED
crawl.retry_at = None
crawl_is_finished = await sync_to_async(crawl.is_finished, thread_sensitive=True)()
if crawl_is_finished:
if crawl.status != Crawl.StatusChoices.SEALED:
crawl.status = Crawl.StatusChoices.SEALED
crawl.retry_at = None
await sync_to_async(crawl.save, thread_sensitive=True)(update_fields=["status", "retry_at", "modified_at"])
else:
if crawl.status == Crawl.StatusChoices.SEALED:
crawl.status = Crawl.StatusChoices.QUEUED
elif crawl.status != Crawl.StatusChoices.STARTED:
crawl.status = Crawl.StatusChoices.STARTED
crawl.retry_at = crawl.retry_at or timezone.now()
await sync_to_async(crawl.save, thread_sensitive=True)(update_fields=["status", "retry_at", "modified_at"])
async def enqueue_snapshot(self, snapshot_id: str) -> None:
@@ -145,17 +261,36 @@ class CrawlRunner:
task = asyncio.create_task(self._run_snapshot(snapshot_id))
self.snapshot_tasks[snapshot_id] = task
async def leave_snapshot_queued(self, snapshot_id: str) -> None:
return None
async def _wait_for_snapshot_tasks(self) -> None:
while True:
active = [task for task in self.snapshot_tasks.values() if not task.done()]
if not active:
pending_tasks: list[asyncio.Task[None]] = []
for snapshot_id, task in list(self.snapshot_tasks.items()):
if task.done():
if self.snapshot_tasks.get(snapshot_id) is task:
self.snapshot_tasks.pop(snapshot_id, None)
task.result()
continue
pending_tasks.append(task)
if not pending_tasks:
return
await asyncio.gather(*active)
done, _pending = await asyncio.wait(pending_tasks, return_when=asyncio.FIRST_COMPLETED)
for task in done:
task.result()
def _prepare(self) -> None:
from archivebox.config.configset import get_config
from archivebox.machine.models import NetworkInterface, Process
self.primary_url = self.crawl.get_urls_list()[0] if self.crawl.get_urls_list() else ""
current_iface = NetworkInterface.current(refresh=True)
current_process = Process.current()
if current_process.iface_id != current_iface.id or current_process.machine_id != current_iface.machine_id:
current_process.iface = current_iface
current_process.machine = current_iface.machine
current_process.save(update_fields=["iface", "machine", "modified_at"])
self.persona = self.crawl.resolve_persona()
self.base_config = get_config(crawl=self.crawl)
if self.selected_plugins is None:
@@ -168,6 +303,52 @@ class CrawlRunner:
if self.persona:
self.persona.cleanup_runtime_for_crawl(self.crawl)
def _create_live_ui(self) -> LiveBusUI | None:
stdout_is_tty = sys.stdout.isatty()
stderr_is_tty = sys.stderr.isatty()
interactive_tty = stdout_is_tty or stderr_is_tty
if not interactive_tty:
return None
stream = sys.stderr if stderr_is_tty else sys.stdout
if os.path.exists("/dev/tty"):
try:
self._live_stream = open("/dev/tty", "w", buffering=1, encoding=getattr(stream, "encoding", None) or "utf-8")
stream = self._live_stream
except OSError:
self._live_stream = None
try:
terminal_size = os.get_terminal_size(stream.fileno())
terminal_width = terminal_size.columns
terminal_height = terminal_size.lines
except (AttributeError, OSError, ValueError):
terminal_size = shutil.get_terminal_size(fallback=(160, 40))
terminal_width = terminal_size.columns
terminal_height = terminal_size.lines
ui_console = Console(
file=stream,
force_terminal=True,
width=terminal_width,
height=terminal_height,
_environ={
"COLUMNS": str(terminal_width),
"LINES": str(terminal_height),
},
)
plugins_label = ", ".join(self.selected_plugins) if self.selected_plugins else f"all ({len(self.plugins)} available)"
live_ui = LiveBusUI(
self.bus,
total_hooks=_count_selected_hooks(self.plugins, self.selected_plugins),
timeout_seconds=int(self.base_config.get("TIMEOUT") or 60),
ui_console=ui_console,
interactive_tty=True,
)
live_ui.print_intro(
url=self.primary_url or INSTALL_URL,
output_dir=Path(self.crawl.output_dir),
plugins_label=plugins_label,
)
return live_ui
def _create_root_snapshots(self) -> list[str]:
created = self.crawl.create_snapshots_from_urls()
snapshots = created or list(self.crawl.snapshot_set.filter(depth=0).order_by("created_at"))
@@ -290,18 +471,34 @@ class CrawlRunner:
parent_snapshot_id=snapshot["parent_snapshot_id"],
crawl_id=str(self.crawl.id),
)
await download(
url=snapshot["url"],
plugins=self.plugins,
output_dir=Path(snapshot["output_dir"]),
selected_plugins=self.selected_plugins,
snapshot_bus, snapshot_services = self._create_projector_bus(
identifier=f"{self.crawl.id}_{snapshot['id']}",
config_overrides=snapshot["config"],
bus=self.bus,
emit_jsonl=False,
snapshot=abx_snapshot,
skip_crawl_setup=True,
skip_crawl_cleanup=True,
)
try:
_attach_bus_trace(snapshot_bus)
_runner_debug(f"snapshot {snapshot_id} starting download()")
await download(
url=snapshot["url"],
plugins=self.plugins,
output_dir=Path(snapshot["output_dir"]),
selected_plugins=self.selected_plugins,
config_overrides=snapshot["config"],
bus=snapshot_bus,
emit_jsonl=False,
snapshot=abx_snapshot,
skip_crawl_setup=True,
skip_crawl_cleanup=True,
)
_runner_debug(f"snapshot {snapshot_id} finished download(), waiting for background monitors")
await snapshot_services.process.wait_for_background_monitors()
_runner_debug(f"snapshot {snapshot_id} finished waiting for background monitors")
finally:
current_task = asyncio.current_task()
if current_task is not None and self.snapshot_tasks.get(snapshot_id) is current_task:
self.snapshot_tasks.pop(snapshot_id, None)
await _stop_bus_trace(snapshot_bus)
await snapshot_bus.stop()
def _load_snapshot_run_data(self, snapshot_id: str):
from archivebox.core.models import Snapshot
@@ -322,11 +519,24 @@ class CrawlRunner:
}
def run_crawl(crawl_id: str, *, snapshot_ids: list[str] | None = None, selected_plugins: list[str] | None = None) -> None:
def run_crawl(
crawl_id: str,
*,
snapshot_ids: list[str] | None = None,
selected_plugins: list[str] | None = None,
process_discovered_snapshots_inline: bool = True,
) -> None:
from archivebox.crawls.models import Crawl
crawl = Crawl.objects.get(id=crawl_id)
asyncio.run(CrawlRunner(crawl, snapshot_ids=snapshot_ids, selected_plugins=selected_plugins).run())
asyncio.run(
CrawlRunner(
crawl,
snapshot_ids=snapshot_ids,
selected_plugins=selected_plugins,
process_discovered_snapshots_inline=process_discovered_snapshots_inline,
).run()
)
async def _run_binary(binary_id: str) -> None:
@@ -397,28 +607,203 @@ async def _run_install(plugin_names: list[str] | None = None) -> None:
BinaryService(bus)
TagService(bus)
ArchiveResultService(bus, process_service=process_service)
live_stream = None
try:
_attach_bus_trace(bus)
await abx_install_plugins(
plugin_names=plugin_names,
plugins=plugins,
config_overrides=config,
emit_jsonl=False,
bus=bus,
)
await abx_services.process.wait_for_background_monitors()
selected_plugins = prepare_install_plugins(plugins, plugin_names=plugin_names)
plugins_label = ", ".join(plugin_names) if plugin_names else f"all ({len(plugins)} available)"
timeout_seconds = int(config.get("TIMEOUT") or 60)
stdout_is_tty = sys.stdout.isatty()
stderr_is_tty = sys.stderr.isatty()
interactive_tty = stdout_is_tty or stderr_is_tty
ui_console = None
live_ui = None
if interactive_tty:
stream = sys.stderr if stderr_is_tty else sys.stdout
if os.path.exists("/dev/tty"):
try:
live_stream = open("/dev/tty", "w", buffering=1, encoding=getattr(stream, "encoding", None) or "utf-8")
stream = live_stream
except OSError:
live_stream = None
try:
terminal_size = os.get_terminal_size(stream.fileno())
terminal_width = terminal_size.columns
terminal_height = terminal_size.lines
except (AttributeError, OSError, ValueError):
terminal_size = shutil.get_terminal_size(fallback=(160, 40))
terminal_width = terminal_size.columns
terminal_height = terminal_size.lines
ui_console = Console(
file=stream,
force_terminal=True,
width=terminal_width,
height=terminal_height,
_environ={
"COLUMNS": str(terminal_width),
"LINES": str(terminal_height),
},
)
with TemporaryDirectory(prefix="archivebox-install-") as temp_dir:
output_dir = Path(temp_dir)
if ui_console is not None:
live_ui = LiveBusUI(
bus,
total_hooks=_count_selected_hooks(selected_plugins, None),
timeout_seconds=timeout_seconds,
ui_console=ui_console,
interactive_tty=interactive_tty,
)
live_ui.print_intro(
url=INSTALL_URL,
output_dir=output_dir,
plugins_label=plugins_label,
)
with live_ui if live_ui is not None else nullcontext():
_attach_bus_trace(bus)
results = await abx_install_plugins(
plugin_names=plugin_names,
plugins=plugins,
output_dir=output_dir,
config_overrides=config,
emit_jsonl=False,
bus=bus,
)
await abx_services.process.wait_for_background_monitors()
if live_ui is not None:
live_ui.print_summary(results, output_dir=output_dir)
finally:
await _stop_bus_trace(bus)
await bus.stop()
try:
if live_stream is not None:
live_stream.close()
except Exception:
pass
def run_install(*, plugin_names: list[str] | None = None) -> None:
asyncio.run(_run_install(plugin_names=plugin_names))
def recover_orphaned_crawls() -> int:
from archivebox.crawls.models import Crawl
from archivebox.core.models import Snapshot
from archivebox.machine.models import Process
active_crawl_ids: set[str] = set()
running_processes = Process.objects.filter(
status=Process.StatusChoices.RUNNING,
process_type__in=[
Process.TypeChoices.WORKER,
Process.TypeChoices.HOOK,
Process.TypeChoices.BINARY,
],
).only("env")
for proc in running_processes:
env = proc.env or {}
if not isinstance(env, dict):
continue
crawl_id = env.get("CRAWL_ID")
if crawl_id:
active_crawl_ids.add(str(crawl_id))
recovered = 0
now = timezone.now()
orphaned_crawls = Crawl.objects.filter(
status=Crawl.StatusChoices.STARTED,
retry_at__isnull=True,
).prefetch_related("snapshot_set")
for crawl in orphaned_crawls:
if str(crawl.id) in active_crawl_ids:
continue
snapshots = list(crawl.snapshot_set.all())
if not snapshots or all(snapshot.status == Snapshot.StatusChoices.SEALED for snapshot in snapshots):
crawl.status = Crawl.StatusChoices.SEALED
crawl.retry_at = None
crawl.save(update_fields=["status", "retry_at", "modified_at"])
recovered += 1
continue
crawl.retry_at = now
crawl.save(update_fields=["retry_at", "modified_at"])
recovered += 1
return recovered
def recover_orphaned_snapshots() -> int:
from archivebox.crawls.models import Crawl
from archivebox.core.models import ArchiveResult, Snapshot
from archivebox.machine.models import Process
active_snapshot_ids: set[str] = set()
running_processes = Process.objects.filter(
status=Process.StatusChoices.RUNNING,
process_type__in=[
Process.TypeChoices.WORKER,
Process.TypeChoices.HOOK,
Process.TypeChoices.BINARY,
],
).only("env")
for proc in running_processes:
env = proc.env or {}
if not isinstance(env, dict):
continue
snapshot_id = env.get("SNAPSHOT_ID")
if snapshot_id:
active_snapshot_ids.add(str(snapshot_id))
recovered = 0
now = timezone.now()
orphaned_snapshots = (
Snapshot.objects
.filter(status=Snapshot.StatusChoices.STARTED, retry_at__isnull=True)
.select_related("crawl")
.prefetch_related("archiveresult_set")
)
for snapshot in orphaned_snapshots:
if str(snapshot.id) in active_snapshot_ids:
continue
results = list(snapshot.archiveresult_set.all())
if results and all(result.status in ArchiveResult.FINAL_STATES for result in results):
snapshot.status = Snapshot.StatusChoices.SEALED
snapshot.retry_at = None
snapshot.downloaded_at = snapshot.downloaded_at or now
snapshot.save(update_fields=["status", "retry_at", "downloaded_at", "modified_at"])
crawl = snapshot.crawl
if crawl.is_finished() and crawl.status != Crawl.StatusChoices.SEALED:
crawl.status = Crawl.StatusChoices.SEALED
crawl.retry_at = None
crawl.save(update_fields=["status", "retry_at", "modified_at"])
recovered += 1
continue
snapshot.status = Snapshot.StatusChoices.QUEUED
snapshot.retry_at = now
snapshot.save(update_fields=["status", "retry_at", "modified_at"])
crawl = snapshot.crawl
crawl.status = Crawl.StatusChoices.QUEUED
crawl.retry_at = now
crawl.save(update_fields=["status", "retry_at", "modified_at"])
recovered += 1
return recovered
def run_pending_crawls(*, daemon: bool = False, crawl_id: str | None = None) -> int:
from archivebox.crawls.models import Crawl, CrawlSchedule
from archivebox.core.models import Snapshot
from archivebox.machine.models import Binary
while True:
@@ -436,10 +821,48 @@ def run_pending_crawls(*, daemon: bool = False, crawl_id: str | None = None) ->
.first()
)
if binary is not None:
if not binary.claim_processing_lock(lock_seconds=60):
continue
run_binary(str(binary.id))
continue
pending = Crawl.objects.filter(retry_at__lte=timezone.now()).exclude(status=Crawl.StatusChoices.SEALED)
queued_crawls = Crawl.objects.filter(
retry_at__lte=timezone.now(),
status=Crawl.StatusChoices.QUEUED,
)
if crawl_id:
queued_crawls = queued_crawls.filter(id=crawl_id)
queued_crawls = queued_crawls.order_by("retry_at", "created_at")
queued_crawl = queued_crawls.first()
if queued_crawl is not None:
if not queued_crawl.claim_processing_lock(lock_seconds=60):
continue
run_crawl(str(queued_crawl.id), process_discovered_snapshots_inline=False)
continue
if crawl_id is None:
snapshot = (
Snapshot.objects.filter(retry_at__lte=timezone.now())
.exclude(status=Snapshot.StatusChoices.SEALED)
.select_related("crawl")
.order_by("retry_at", "created_at")
.first()
)
if snapshot is not None:
if not snapshot.claim_processing_lock(lock_seconds=60):
continue
run_crawl(
str(snapshot.crawl_id),
snapshot_ids=[str(snapshot.id)],
process_discovered_snapshots_inline=False,
)
continue
pending = Crawl.objects.filter(
retry_at__lte=timezone.now(),
status=Crawl.StatusChoices.STARTED,
)
if crawl_id:
pending = pending.filter(id=crawl_id)
pending = pending.order_by("retry_at", "created_at")
@@ -451,4 +874,7 @@ def run_pending_crawls(*, daemon: bool = False, crawl_id: str | None = None) ->
continue
return 0
run_crawl(str(crawl.id))
if not crawl.claim_processing_lock(lock_seconds=60):
continue
run_crawl(str(crawl.id), process_discovered_snapshots_inline=False)

View File

@@ -1,13 +1,13 @@
from __future__ import annotations
import re
from asgiref.sync import sync_to_async
from django.utils import timezone
from abx_dl.events import SnapshotCompletedEvent, SnapshotEvent
from abx_dl.services.base import BaseService
from .db import run_db_op
class SnapshotService(BaseService):
LISTENS_TO = [SnapshotEvent, SnapshotCompletedEvent]
@@ -18,13 +18,17 @@ class SnapshotService(BaseService):
self.schedule_snapshot = schedule_snapshot
super().__init__(bus)
async def on_SnapshotEvent(self, event: SnapshotEvent) -> None:
snapshot_id = await sync_to_async(self._project_snapshot, thread_sensitive=True)(event)
async def on_SnapshotEvent__Outer(self, event: SnapshotEvent) -> None:
snapshot_id = await run_db_op(self._project_snapshot, event)
if snapshot_id:
await sync_to_async(self._ensure_crawl_symlink)(snapshot_id)
if snapshot_id and event.depth > 0:
await self.schedule_snapshot(snapshot_id)
async def on_SnapshotCompletedEvent(self, event: SnapshotCompletedEvent) -> None:
await sync_to_async(self._seal_snapshot, thread_sensitive=True)(event.snapshot_id)
async def on_SnapshotCompletedEvent__Outer(self, event: SnapshotCompletedEvent) -> None:
snapshot_id = await run_db_op(self._seal_snapshot, event.snapshot_id)
if snapshot_id:
await sync_to_async(self._write_snapshot_details)(snapshot_id)
def _project_snapshot(self, event: SnapshotEvent) -> str | None:
from archivebox.core.models import Snapshot
@@ -39,7 +43,6 @@ class SnapshotService(BaseService):
snapshot.status = Snapshot.StatusChoices.STARTED
snapshot.retry_at = None
snapshot.save(update_fields=["status", "retry_at", "modified_at"])
snapshot.ensure_crawl_symlink()
return str(snapshot.id)
if event.depth > crawl.max_depth:
@@ -73,56 +76,36 @@ class SnapshotService(BaseService):
if snapshot.status != Snapshot.StatusChoices.SEALED:
snapshot.status = Snapshot.StatusChoices.QUEUED
snapshot.save(update_fields=["status", "retry_at", "modified_at"])
snapshot.ensure_crawl_symlink()
return str(snapshot.id)
def _url_passes_filters(self, crawl, parent_snapshot, url: str) -> bool:
from archivebox.config.configset import get_config
return crawl.url_passes_filters(url, snapshot=parent_snapshot)
config = get_config(
user=getattr(crawl, "created_by", None),
crawl=crawl,
snapshot=parent_snapshot,
)
def to_pattern_list(value):
if isinstance(value, list):
return value
if isinstance(value, str):
return [pattern.strip() for pattern in value.split(",") if pattern.strip()]
return []
allowlist = to_pattern_list(config.get("URL_ALLOWLIST", ""))
denylist = to_pattern_list(config.get("URL_DENYLIST", ""))
for pattern in denylist:
try:
if re.search(pattern, url):
return False
except re.error:
continue
if allowlist:
for pattern in allowlist:
try:
if re.search(pattern, url):
return True
except re.error:
continue
return False
return True
def _seal_snapshot(self, snapshot_id: str) -> None:
def _seal_snapshot(self, snapshot_id: str) -> str | None:
from archivebox.core.models import Snapshot
snapshot = Snapshot.objects.filter(id=snapshot_id).first()
if snapshot is None:
return
return None
snapshot.status = Snapshot.StatusChoices.SEALED
snapshot.retry_at = None
snapshot.downloaded_at = snapshot.downloaded_at or timezone.now()
snapshot.save(update_fields=["status", "retry_at", "downloaded_at", "modified_at"])
return str(snapshot.id)
def _ensure_crawl_symlink(self, snapshot_id: str) -> None:
from archivebox.core.models import Snapshot
snapshot = Snapshot.objects.filter(id=snapshot_id).select_related("crawl", "crawl__created_by").first()
if snapshot is not None:
snapshot.ensure_crawl_symlink()
def _write_snapshot_details(self, snapshot_id: str) -> None:
from archivebox.core.models import Snapshot
snapshot = Snapshot.objects.filter(id=snapshot_id).select_related("crawl", "crawl__created_by").first()
if snapshot is None:
return
snapshot.write_index_jsonl()
snapshot.write_json_details()
snapshot.write_html_details()

View File

@@ -1,16 +1,17 @@
from __future__ import annotations
from asgiref.sync import sync_to_async
from abx_dl.events import TagEvent
from abx_dl.services.base import BaseService
from .db import run_db_op
class TagService(BaseService):
LISTENS_TO = [TagEvent]
EMITS = []
async def on_TagEvent(self, event: TagEvent) -> None:
await sync_to_async(self._project, thread_sensitive=True)(event)
async def on_TagEvent__Outer(self, event: TagEvent) -> None:
await run_db_op(self._project, event)
def _project(self, event: TagEvent) -> None:
from archivebox.core.models import Snapshot, Tag

View File

@@ -1083,8 +1083,11 @@
width: 100% !important;
}
body.filters-collapsed.change-list #changelist .changelist-form-container > div {
body.filters-collapsed.change-list #changelist .changelist-form-container > div,
body.filters-collapsed.change-list #changelist .changelist-form-container > form {
max-width: 100% !important;
width: 100% !important;
flex: 1 1 100% !important;
}
/* Actions bar */
@@ -1372,7 +1375,8 @@
order: 2;
align-self: flex-start;
}
body.change-list #changelist .changelist-form-container > div {
body.change-list #changelist .changelist-form-container > div,
body.change-list #changelist .changelist-form-container > form {
flex: 1 1 auto;
min-width: 0;
order: 1;

View File

@@ -0,0 +1,268 @@
{% extends "admin/change_form.html" %}
{% block bodyclass %}{{ block.super }} app-core model-tag tag-form-page{% endblock %}
{% block extrastyle %}
{{ block.super }}
<style>
.tag-form-hero {
margin: 0 0 20px;
padding: 22px 24px;
border-radius: 20px;
border: 1px solid #dbe4ee;
background:
radial-gradient(circle at top right, rgba(245, 158, 11, 0.12), transparent 30%),
linear-gradient(135deg, #fff7ed 0%, #ffffff 48%, #eff6ff 100%);
box-shadow: 0 12px 30px rgba(15, 23, 42, 0.06);
display: grid;
gap: 16px;
grid-template-columns: minmax(0, 1.7fr) minmax(260px, 1fr);
}
.tag-form-hero h2 {
margin: 0 0 8px;
font-size: 28px;
line-height: 1.05;
color: #111827;
}
.tag-form-hero p {
margin: 0;
color: #475569;
font-size: 14px;
max-width: 70ch;
}
.tag-form-hero__meta {
display: grid;
gap: 10px;
}
.tag-form-hero__meta div {
padding: 14px 16px;
border-radius: 14px;
border: 1px solid rgba(203, 213, 225, 0.85);
background: rgba(255, 255, 255, 0.88);
}
.tag-form-hero__meta span {
display: block;
margin-bottom: 8px;
font-size: 11px;
font-weight: 700;
text-transform: uppercase;
letter-spacing: 0.08em;
color: #64748b;
}
.tag-similar-panel {
margin-top: 18px;
padding: 18px;
border-radius: 18px;
border: 1px solid #dbe4ee;
background: #fff;
box-shadow: 0 10px 24px rgba(15, 23, 42, 0.05);
}
.tag-similar-panel h3 {
margin: 0 0 6px;
font-size: 16px;
color: #111827;
}
.tag-similar-panel p {
margin: 0 0 14px;
font-size: 13px;
color: #64748b;
}
.tag-similar-list {
display: grid;
gap: 10px;
grid-template-columns: repeat(auto-fit, minmax(240px, 1fr));
}
.tag-similar-card {
display: grid;
gap: 8px;
padding: 14px 16px;
border-radius: 16px;
border: 1px solid #dbe4ee;
background: #f8fafc;
text-decoration: none;
color: #0f172a;
}
.tag-similar-card strong {
font-size: 15px;
line-height: 1.1;
}
.tag-similar-card span {
font-size: 12px;
color: #64748b;
}
.tag-similar-card__snapshots {
display: flex;
flex-wrap: wrap;
gap: 8px;
}
.tag-similar-snapshot {
display: inline-flex;
align-items: center;
gap: 6px;
min-width: 0;
max-width: 100%;
padding: 6px 8px;
border-radius: 999px;
background: #fff;
border: 1px solid #dbe4ee;
font-size: 11px;
color: #334155;
}
.tag-similar-snapshot img {
width: 14px;
height: 14px;
border-radius: 4px;
flex: 0 0 auto;
}
.tag-similar-empty {
padding: 16px;
border-radius: 16px;
border: 1px dashed #cbd5e1;
background: #f8fafc;
color: #64748b;
font-size: 13px;
}
@media (max-width: 920px) {
.tag-form-hero {
grid-template-columns: 1fr;
}
}
</style>
{% endblock %}
{% block form_top %}
<section class="tag-form-hero">
<div>
<h2>{% if add %}New Tag{% else %}Edit Tag{% endif %}</h2>
<p>Similar tags are shown below while typing.</p>
</div>
<div class="tag-form-hero__meta">
<div>
<span>Matches</span>
<strong>Current tags</strong>
</div>
<div>
<span>Links</span>
<strong>Open filtered snapshots</strong>
</div>
</div>
</section>
{{ block.super }}
{% endblock %}
{% block after_field_sets %}
{{ block.super }}
<section
id="tag-similar-panel"
class="tag-similar-panel"
data-search-url="{{ tag_search_api_url }}"
>
<h3>Similar Tags</h3>
<p>Updates while typing.</p>
<div id="tag-similar-list" class="tag-similar-list"></div>
</section>
{{ tag_similar_cards|json_script:"abx-tag-similar-data" }}
<script>
document.addEventListener('DOMContentLoaded', function () {
const panel = document.getElementById('tag-similar-panel');
const list = document.getElementById('tag-similar-list');
const nameInput = document.querySelector('input[data-tag-name-input="1"]');
if (!panel || !list || !nameInput) return;
const searchUrl = panel.dataset.searchUrl;
let similarCards = JSON.parse(document.getElementById('abx-tag-similar-data').textContent || '[]');
let timeoutId = null;
function escapeHtml(value) {
const div = document.createElement('div');
div.textContent = value == null ? '' : String(value);
return div.innerHTML;
}
function getApiKey() {
return (window.ARCHIVEBOX_API_KEY || '').trim();
}
function withApiKey(url) {
const apiKey = getApiKey();
if (!apiKey) return url;
const separator = url.includes('?') ? '&' : '?';
return url + separator + 'api_key=' + encodeURIComponent(apiKey);
}
function buildHeaders() {
const headers = {};
const apiKey = getApiKey();
if (apiKey) headers['X-ArchiveBox-API-Key'] = apiKey;
return headers;
}
function render(cards) {
  // Render the "similar tags" cards, hiding the tag whose name matches
  // what is currently typed into the form (case-insensitive).
  const currentName = (nameInput.value || '').trim().toLowerCase();
  const filtered = (cards || []).filter(function (card) {
    return (card.name || '').toLowerCase() !== currentName;
  });
  if (!filtered.length) {
    list.innerHTML = '<div class="tag-similar-empty">No similar tags.</div>';
    return;
  }
  list.innerHTML = filtered.map(function (card) {
    const snapshots = (card.snapshots || []).slice(0, 3).map(function (snapshot) {
      // FIX: single quotes inside this single-quoted string must be
      // escaped as \' — the original \\' closed the string literal
      // early and broke the script.
      return '' +
        '<span class="tag-similar-snapshot">' +
        '<img src="' + escapeHtml(snapshot.favicon_url) + '" alt="" onerror="this.style.display=\'none\'">' +
        '<span>' + escapeHtml(snapshot.title) + '</span>' +
        '</span>';
    }).join('');
    return '' +
      '<a class="tag-similar-card" href="' + escapeHtml(card.filter_url) + '">' +
      '<strong>' + escapeHtml(card.name) + '</strong>' +
      '<span>' + escapeHtml(card.num_snapshots) + ' snapshots · slug: ' + escapeHtml(card.slug) + '</span>' +
      '<div class="tag-similar-card__snapshots">' + (snapshots || '<span class="tag-similar-snapshot">No snapshots</span>') + '</div>' +
      '</a>';
  }).join('');
}
async function fetchSimilar(query) {
const response = await fetch(withApiKey(searchUrl + '?q=' + encodeURIComponent(query || '')), {
headers: buildHeaders(),
credentials: 'same-origin',
});
if (!response.ok) return [];
const payload = await response.json();
return payload.tags || [];
}
nameInput.addEventListener('input', function () {
window.clearTimeout(timeoutId);
timeoutId = window.setTimeout(async function () {
similarCards = await fetchSimilar((nameInput.value || '').trim());
render(similarCards);
}, 140);
});
render(similarCards);
});
</script>
{% endblock %}

View File

@@ -0,0 +1,997 @@
{% extends "admin/change_list.html" %}
{% block bodyclass %}{{ block.super }} app-core model-tag change-list tag-admin-page{% endblock %}
{% block object-tools %}{% endblock %}
{% block extrastyle %}
{{ block.super }}
<style>
.tag-admin-shell {
display: grid;
gap: 12px;
}
.tag-admin-toolbar {
display: flex;
flex-wrap: wrap;
gap: 12px;
align-items: start;
}
.tag-admin-panel {
flex: 1 1 320px;
padding: 12px;
border-radius: 16px;
border: 1px solid #dbe4ee;
background: #fff;
box-shadow: 0 8px 18px rgba(15, 23, 42, 0.05);
}
.tag-admin-panel--search {
flex: 3 1 360px;
}
.tag-admin-panel--filters {
flex: 3 1 440px;
}
.tag-admin-panel--create {
flex: 1 1 280px;
}
.tag-admin-panel h2 {
margin: 0 0 12px;
font-size: 16px;
color: #0f172a;
}
.tag-create-form,
.tag-search-form {
display: grid;
gap: 10px;
}
.tag-input-row {
display: flex;
gap: 10px;
align-items: center;
}
.tag-create-form .tag-input-row {
display: grid;
grid-template-columns: minmax(0, 1fr) auto;
align-items: center;
}
.tag-input-row input {
flex: 1 1 auto;
min-width: 0;
height: 40px;
box-sizing: border-box;
padding: 0 12px;
line-height: 1.2;
border-radius: 10px;
border: 1px solid #cbd5e1;
background: #f8fafc;
font-size: 13px;
color: #0f172a;
}
.tag-input-row input:focus {
outline: none;
border-color: #0ea5e9;
box-shadow: 0 0 0 4px rgba(14, 165, 233, 0.14);
background: #fff;
}
.tag-button,
.tag-chip-button {
border: 0;
border-radius: 10px;
cursor: pointer;
font-weight: 700;
transition: transform 0.12s ease, box-shadow 0.12s ease, opacity 0.12s ease;
}
.tag-button:hover,
.tag-chip-button:hover {
transform: translateY(-1px);
box-shadow: 0 8px 20px rgba(15, 23, 42, 0.08);
}
.tag-button:disabled,
.tag-chip-button:disabled {
cursor: wait;
opacity: 0.6;
transform: none;
box-shadow: none;
}
.tag-button {
flex: 0 0 auto;
height: 40px;
padding: 0 12px;
background: linear-gradient(135deg, #0f766e 0%, #0ea5e9 100%);
color: #fff;
white-space: nowrap;
font-size: 12px;
}
.tag-toolbar-meta {
display: flex;
align-items: center;
justify-content: space-between;
gap: 10px;
flex-wrap: wrap;
font-size: 12px;
color: #64748b;
}
.tag-toolbar-meta strong {
color: #0f172a;
}
.tag-help {
margin: 0;
font-size: 12px;
color: #64748b;
}
.tag-filter-grid {
display: grid;
gap: 10px;
grid-template-columns: repeat(3, minmax(0, 1fr));
}
.tag-select-field {
display: grid;
gap: 4px;
min-width: 0;
font-size: 11px;
font-weight: 700;
color: #475569;
}
.tag-select-field select {
width: 100%;
min-width: 0;
height: 40px;
box-sizing: border-box;
padding: 0 10px;
line-height: 1.2;
border-radius: 10px;
border: 1px solid #cbd5e1;
background: #f8fafc;
color: #0f172a;
font-size: 12px;
vertical-align: middle;
}
.tag-select-field select:focus {
outline: none;
border-color: #0ea5e9;
box-shadow: 0 0 0 4px rgba(14, 165, 233, 0.14);
background: #fff;
}
.tag-grid {
display: grid;
gap: 12px;
grid-template-columns: repeat(auto-fit, minmax(240px, 1fr));
}
.tag-card {
position: relative;
display: grid;
gap: 10px;
padding: 10px;
border-radius: 16px;
border: 1px solid #dbe4ee;
background:
linear-gradient(180deg, rgba(255, 255, 255, 0.96) 0%, rgba(248, 250, 252, 0.94) 100%);
box-shadow: 0 8px 18px rgba(15, 23, 42, 0.05);
transition: transform 0.14s ease, border-color 0.14s ease, box-shadow 0.14s ease;
cursor: pointer;
}
.tag-card:hover {
transform: translateY(-2px);
border-color: #93c5fd;
box-shadow: 0 14px 26px rgba(15, 23, 42, 0.08);
}
.tag-card__header {
display: flex;
justify-content: space-between;
gap: 10px;
align-items: flex-start;
}
.tag-card__title {
flex: 1 1 auto;
min-width: 0;
display: grid;
gap: 4px;
}
.tag-card__title strong,
.tag-card__rename strong {
display: block;
font-size: 17px;
line-height: 1.1;
color: #111827;
word-break: break-word;
}
.tag-card__count {
display: inline-flex;
align-items: center;
white-space: nowrap;
padding: 3px 8px;
border-radius: 999px;
background: #e0f2fe;
color: #075985;
font-size: 11px;
font-weight: 700;
}
.tag-card__actions {
flex: 0 0 auto;
display: flex;
flex-wrap: wrap;
justify-content: flex-end;
align-items: center;
gap: 6px;
}
.tag-chip-button {
height: 30px;
padding: 0 8px;
background: #fff;
border: 1px solid #dbe4ee;
color: #334155;
font-size: 11px;
}
.tag-chip-button.is-danger {
background: #fff1f2;
border-color: #fecdd3;
color: #be123c;
}
.tag-card__rename {
display: none;
gap: 6px;
align-items: center;
flex-wrap: wrap;
margin-top: 2px;
}
.tag-card.is-editing .tag-card__display {
display: none;
}
.tag-card.is-editing .tag-card__rename {
display: flex;
}
.tag-card.is-editing .tag-card__header {
display: grid;
grid-template-columns: minmax(0, 1fr);
}
.tag-card.is-editing .tag-card__actions {
justify-content: flex-start;
}
.tag-card__rename input {
flex: 1 1 220px;
min-width: 0;
height: 34px;
padding: 0 10px;
border-radius: 10px;
border: 1px solid #cbd5e1;
background: #fff;
font-size: 12px;
}
.tag-card__snapshots {
display: grid;
gap: 8px;
grid-template-columns: repeat(auto-fit, minmax(130px, 1fr));
}
.tag-snapshot-badge {
display: flex;
align-items: center;
gap: 8px;
min-width: 0;
padding: 6px 8px;
border-radius: 12px;
border: 1px solid #dbe4ee;
background: rgba(255, 255, 255, 0.86);
text-decoration: none;
color: #0f172a;
}
.tag-snapshot-badge img {
width: 16px;
height: 16px;
border-radius: 4px;
flex: 0 0 auto;
background: #f8fafc;
}
.tag-snapshot-badge span {
min-width: 0;
white-space: nowrap;
overflow: hidden;
text-overflow: ellipsis;
font-size: 11px;
font-weight: 600;
}
.tag-card__empty {
padding: 14px;
border-radius: 14px;
border: 1px dashed #cbd5e1;
background: #f8fafc;
color: #64748b;
font-size: 13px;
}
.tag-toast {
position: sticky;
top: 12px;
z-index: 30;
display: none;
width: fit-content;
max-width: min(100%, 420px);
padding: 12px 14px;
border-radius: 14px;
font-size: 13px;
font-weight: 700;
box-shadow: 0 14px 30px rgba(15, 23, 42, 0.12);
}
.tag-toast.is-visible {
display: block;
}
.tag-toast.is-success {
background: #dcfce7;
color: #166534;
}
.tag-toast.is-error {
background: #fee2e2;
color: #991b1b;
}
.tag-empty-state {
padding: 24px 18px;
border-radius: 16px;
border: 1px dashed #cbd5e1;
background: #fff;
text-align: center;
color: #64748b;
font-size: 13px;
}
</style>
{% endblock %}
{% block content %}
<div id="content-main">
<div
id="abx-tag-admin"
class="tag-admin-shell"
data-search-url="{{ tag_search_api_url }}"
data-create-url="{{ tag_create_api_url }}"
>
<section class="tag-admin-toolbar">
<div class="tag-admin-panel tag-admin-panel--search">
<div class="tag-search-form">
<div class="tag-input-row">
<input
id="tag-live-search"
type="search"
placeholder="Search by tag name"
value="{{ initial_query }}"
autocomplete="off"
>
</div>
<div class="tag-toolbar-meta">
<span id="tag-query-label">{% if initial_query %}“{{ initial_query }}”{% else %}All tags{% endif %}</span>
</div>
</div>
</div>
<div class="tag-admin-panel tag-admin-panel--filters">
<div class="tag-filter-grid">
<label class="tag-select-field" for="tag-sort-select">
<span>Sort</span>
<select id="tag-sort-select">
{% for value, label in tag_sort_choices %}
<option value="{{ value }}"{% if value == initial_sort %} selected{% endif %}>{{ label }}</option>
{% endfor %}
</select>
</label>
<label class="tag-select-field" for="tag-created-by-select">
<span>Created By</span>
<select id="tag-created-by-select">
<option value="">All users</option>
{% for value, label in tag_created_by_choices %}
<option value="{{ value }}"{% if value == initial_created_by %} selected{% endif %}>{{ label }}</option>
{% endfor %}
</select>
</label>
<label class="tag-select-field" for="tag-year-select">
<span>Year</span>
<select id="tag-year-select">
<option value="">All years</option>
{% for value in tag_year_choices %}
<option value="{{ value }}"{% if value == initial_year %} selected{% endif %}>{{ value }}</option>
{% endfor %}
</select>
</label>
</div>
</div>
<div class="tag-admin-panel tag-admin-panel--create">
<form id="tag-create-form" class="tag-create-form">
{% csrf_token %}
<div class="tag-input-row">
<input
id="tag-create-name"
type="text"
name="name"
placeholder="New tag name"
autocomplete="off"
value=""
>
<button class="tag-button" type="submit">Create</button>
</div>
</form>
</div>
</section>
<div id="tag-toast" class="tag-toast" aria-live="polite"></div>
<div id="tag-card-grid" class="tag-grid">
{% if initial_tag_cards %}
{% for card in initial_tag_cards %}
<article
class="tag-card"
data-id="{{ card.id }}"
data-filter-url="{{ card.filter_url }}"
data-rename-url="{{ card.rename_url }}"
data-delete-url="{{ card.delete_url }}"
data-export-urls-url="{{ card.export_urls_url }}"
data-export-jsonl-url="{{ card.export_jsonl_url }}"
>
<div class="tag-card__header">
<div class="tag-card__title">
<div class="tag-card__display">
<strong><a href="{{ card.filter_url }}" style="color:inherit;text-decoration:none;">{{ card.name }}</a></strong>
</div>
<div class="tag-card__rename">
<input type="text" value="{{ card.name }}" aria-label="Rename tag {{ card.name }}">
<button type="button" class="tag-chip-button" data-action="save-edit">Save</button>
<button type="button" class="tag-chip-button" data-action="cancel-edit">Cancel</button>
</div>
</div>
<div class="tag-card__actions">
<button type="button" class="tag-chip-button" data-action="edit" aria-label="Rename tag" title="Rename tag"></button>
<button type="button" class="tag-chip-button" data-action="copy-urls">Copy URLs</button>
<button type="button" class="tag-chip-button" data-action="download-jsonl">JSONL</button>
<button type="button" class="tag-chip-button is-danger" data-action="delete">Delete</button>
<span class="tag-card__count">{{ card.num_snapshots }}</span>
</div>
</div>
<div class="tag-card__snapshots">
{% if card.snapshots %}
{% for snapshot in card.snapshots %}
<a class="tag-snapshot-badge" href="{{ snapshot.admin_url }}" title="{{ snapshot.url }}">
<img src="{{ snapshot.favicon_url }}" alt="" onerror="this.style.display='none'">
<span>{{ snapshot.title }}</span>
</a>
{% endfor %}
{% else %}
<div class="tag-card__empty">No snapshots attached yet.</div>
{% endif %}
</div>
</article>
{% endfor %}
{% else %}
<div class="tag-empty-state">No tags.</div>
{% endif %}
</div>
</div>
</div>
{{ initial_tag_cards|json_script:"abx-tag-cards-data" }}
<script>
document.addEventListener('DOMContentLoaded', function () {
const shell = document.getElementById('abx-tag-admin');
if (!shell) return;
const initialCards = JSON.parse(document.getElementById('abx-tag-cards-data').textContent || '[]');
const searchUrl = shell.dataset.searchUrl;
const createUrl = shell.dataset.createUrl;
const searchInput = document.getElementById('tag-live-search');
const sortSelect = document.getElementById('tag-sort-select');
const createdBySelect = document.getElementById('tag-created-by-select');
const yearSelect = document.getElementById('tag-year-select');
const createForm = document.getElementById('tag-create-form');
const createInput = document.getElementById('tag-create-name');
const grid = document.getElementById('tag-card-grid');
const queryLabel = document.getElementById('tag-query-label');
const toast = document.getElementById('tag-toast');
let cards = initialCards;
let searchTimeout = null;
let activeQuery = (searchInput?.value || '').trim();
function escapeHtml(value) {
  // Route untrusted text through a detached element so the browser
  // entity-encodes it, making it safe to interpolate into innerHTML.
  const scratch = document.createElement('div');
  scratch.textContent = value == null ? '' : String(value);
  return scratch.innerHTML;
}
function slugify(value) {
  // Lowercase, collapse every run of non-alphanumerics into a single
  // dash, strip leading/trailing dashes; fall back to 'tag' when the
  // result is empty.
  const text = String(value || '').toLowerCase().trim();
  const slug = text.replace(/[^a-z0-9]+/g, '-').replace(/^-+|-+$/g, '');
  return slug || 'tag';
}
function getCSRFToken() {
const input = document.querySelector('input[name="csrfmiddlewaretoken"]');
if (input) return input.value;
const cookies = document.cookie.split(';');
for (const cookieRaw of cookies) {
const cookie = cookieRaw.trim();
if (cookie.startsWith('csrftoken=')) return cookie.slice('csrftoken='.length);
}
return '';
}
function getApiKey() {
return (window.ARCHIVEBOX_API_KEY || '').trim();
}
function withApiKey(url) {
  // Append ?api_key=... (or &api_key=... when a query string already
  // exists); returns the url unchanged when no key is configured.
  const apiKey = getApiKey();
  if (!apiKey) {
    return url;
  }
  const joiner = url.includes('?') ? '&' : '?';
  return url + joiner + 'api_key=' + encodeURIComponent(apiKey);
}
function buildHeaders(isJsonBody) {
const headers = {};
if (isJsonBody) headers['Content-Type'] = 'application/json';
const csrfToken = getCSRFToken();
if (csrfToken) headers['X-CSRFToken'] = csrfToken;
const apiKey = getApiKey();
if (apiKey) headers['X-ArchiveBox-API-Key'] = apiKey;
return headers;
}
function setToast(message, tone) {
toast.textContent = message;
toast.className = 'tag-toast is-visible ' + (tone === 'error' ? 'is-error' : 'is-success');
window.clearTimeout(setToast._timer);
setToast._timer = window.setTimeout(function () {
toast.className = 'tag-toast';
toast.textContent = '';
}, 2600);
}
function getCurrentState(overrides) {
const next = overrides || {};
return {
query: typeof next.query === 'string' ? next.query.trim() : (searchInput?.value || '').trim(),
sort: typeof next.sort === 'string' ? next.sort : (sortSelect?.value || 'created_desc'),
created_by: typeof next.created_by === 'string' ? next.created_by : (createdBySelect?.value || ''),
year: typeof next.year === 'string' ? next.year : (yearSelect?.value || ''),
};
}
function syncSearchState(state) {
if (searchInput) searchInput.value = state.query;
if (sortSelect) sortSelect.value = state.sort;
if (createdBySelect) createdBySelect.value = state.created_by;
if (yearSelect) yearSelect.value = state.year;
}
function syncLocation(state) {
const url = new URL(window.location.href);
if (state.query) {
url.searchParams.set('q', state.query);
} else {
url.searchParams.delete('q');
}
if (state.sort && state.sort !== 'created_desc') {
url.searchParams.set('sort', state.sort);
} else {
url.searchParams.delete('sort');
}
if (state.created_by) {
url.searchParams.set('created_by', state.created_by);
} else {
url.searchParams.delete('created_by');
}
if (state.year) {
url.searchParams.set('year', state.year);
} else {
url.searchParams.delete('year');
}
window.history.replaceState({}, '', url.toString());
}
function setMeta(state, count) {
const baseLabel = state.query ? '"' + state.query + '"' : 'All tags';
queryLabel.textContent = baseLabel + ' · ' + count + ' shown';
activeQuery = state.query;
}
function renderCards(nextCards, state) {
cards = Array.isArray(nextCards) ? nextCards : [];
setMeta(state || getCurrentState(), cards.length);
if (!cards.length) {
grid.innerHTML = '<div class="tag-empty-state">No tags.</div>';
return;
}
grid.innerHTML = cards.map(function (card) {
const snapshotHtml = (card.snapshots || []).length
? card.snapshots.map(function (snapshot) {
return '' +
'<a class="tag-snapshot-badge" href="' + escapeHtml(snapshot.admin_url) + '" title="' + escapeHtml(snapshot.url) + '">' +
'<img src="' + escapeHtml(snapshot.favicon_url) + '" alt="" onerror="this.hidden=true">' +
'<span>' + escapeHtml(snapshot.title) + '</span>' +
'</a>';
}).join('')
: '<div class="tag-card__empty">No snapshots attached yet.</div>';
return '' +
'<article class="tag-card" data-id="' + escapeHtml(card.id) + '" data-filter-url="' + escapeHtml(card.filter_url) + '" data-rename-url="' + escapeHtml(card.rename_url) + '" data-delete-url="' + escapeHtml(card.delete_url) + '" data-export-urls-url="' + escapeHtml(card.export_urls_url) + '" data-export-jsonl-url="' + escapeHtml(card.export_jsonl_url) + '">' +
'<div class="tag-card__header">' +
'<div class="tag-card__title">' +
'<div class="tag-card__display">' +
'<strong>' + escapeHtml(card.name) + '</strong>' +
'</div>' +
'<div class="tag-card__rename">' +
'<input type="text" value="' + escapeHtml(card.name) + '" aria-label="Rename tag ' + escapeHtml(card.name) + '">' +
'<button type="button" class="tag-chip-button" data-action="save-edit">Save</button>' +
'<button type="button" class="tag-chip-button" data-action="cancel-edit">Cancel</button>' +
'</div>' +
'</div>' +
'<div class="tag-card__actions">' +
'<button type="button" class="tag-chip-button" data-action="edit" aria-label="Rename tag" title="Rename tag">✎</button>' +
'<button type="button" class="tag-chip-button" data-action="copy-urls">Copy URLs</button>' +
'<button type="button" class="tag-chip-button" data-action="download-jsonl">JSONL</button>' +
'<button type="button" class="tag-chip-button is-danger" data-action="delete">Delete</button>' +
'<span class="tag-card__count">' + escapeHtml(card.num_snapshots) + '</span>' +
'</div>' +
'</div>' +
'<div class="tag-card__snapshots">' + snapshotHtml + '</div>' +
'</article>';
}).join('');
}
async function fetchCards(state) {
const params = new URLSearchParams();
if (state.query) params.set('q', state.query);
if (state.sort) params.set('sort', state.sort);
if (state.created_by) params.set('created_by', state.created_by);
if (state.year) params.set('year', state.year);
const url = withApiKey(searchUrl + '?' + params.toString());
const response = await fetch(url, {
headers: buildHeaders(false),
credentials: 'same-origin',
});
if (!response.ok) {
const message = await response.text();
throw new Error(message || 'Failed to load matching tags');
}
const payload = await response.json();
return {
tags: payload.tags || [],
state: {
query: state.query,
sort: payload.sort || state.sort,
created_by: payload.created_by || '',
year: payload.year || '',
},
};
}
async function refreshCards(overrides) {
const requestedState = getCurrentState(overrides);
const result = await fetchCards(requestedState);
syncSearchState(result.state);
renderCards(result.tags, result.state);
syncLocation(result.state);
return result.tags;
}
async function submitJson(url, method, payload) {
const response = await fetch(withApiKey(url), {
method: method,
headers: buildHeaders(true),
credentials: 'same-origin',
body: JSON.stringify(payload || {}),
});
if (!response.ok) {
let message = 'Request failed';
try {
const data = await response.json();
message = data.detail || data.message || message;
} catch (_err) {
message = await response.text() || message;
}
throw new Error(message);
}
if (response.status === 204) return {};
return response.json();
}
async function copyTextFromUrl(url) {
const response = await fetch(withApiKey(url), {
headers: buildHeaders(false),
credentials: 'same-origin',
});
if (!response.ok) throw new Error('Failed to export URLs');
const text = await response.text();
await copyTextToClipboard(text);
return text;
}
async function copyTextToClipboard(text) {
if (navigator.clipboard && window.isSecureContext) {
try {
await navigator.clipboard.writeText(text);
return;
} catch (_error) {
}
}
const textarea = document.createElement('textarea');
textarea.value = text;
textarea.setAttribute('readonly', '');
textarea.style.position = 'fixed';
textarea.style.top = '-9999px';
textarea.style.left = '-9999px';
document.body.appendChild(textarea);
textarea.focus();
textarea.select();
const copied = document.execCommand('copy');
document.body.removeChild(textarea);
if (!copied) {
throw new Error('Clipboard write failed');
}
}
function getDownloadFilename(response, fallbackFilename) {
  // Extract a filename from the Content-Disposition header, preferring
  // the RFC 5987 "filename*=UTF-8''..." form, then the plain
  // filename="..." form, then the caller's fallback.
  const disposition = response.headers.get('Content-Disposition') || '';
  // FIX: /filename\*=/ matches the literal '*'. The original /filename\\*=/
  // matched "filename" followed by zero-or-more literal backslashes, so the
  // UTF-8 branch never matched a standard header.
  const utf8Match = disposition.match(/filename\*=UTF-8''([^;]+)/i);
  if (utf8Match && utf8Match[1]) {
    return decodeURIComponent(utf8Match[1]);
  }
  const filenameMatch = disposition.match(/filename="?([^";]+)"?/i);
  if (filenameMatch && filenameMatch[1]) {
    return filenameMatch[1];
  }
  return fallbackFilename;
}
async function downloadFileFromUrl(url, fallbackFilename) {
const response = await fetch(withApiKey(url), {
headers: buildHeaders(false),
credentials: 'same-origin',
});
if (!response.ok) {
let message = 'Download failed';
try {
const data = await response.json();
message = data.detail || data.message || message;
} catch (_err) {
message = await response.text() || message;
}
throw new Error(message);
}
const blob = await response.blob();
const downloadUrl = URL.createObjectURL(blob);
const link = document.createElement('a');
link.href = downloadUrl;
link.download = getDownloadFilename(response, fallbackFilename);
document.body.appendChild(link);
link.click();
link.remove();
window.setTimeout(function () {
URL.revokeObjectURL(downloadUrl);
}, 1000);
}
createForm?.addEventListener('submit', async function (event) {
event.preventDefault();
const name = (createInput.value || '').trim();
if (!name) {
setToast('Enter a tag name first.', 'error');
createInput.focus();
return;
}
const button = createForm.querySelector('button[type="submit"]');
button.disabled = true;
try {
const result = await submitJson(createUrl, 'POST', { name: name });
createInput.value = '';
await refreshCards({ query: result.tag_name || name });
setToast(result.created ? 'Tag created.' : 'Existing tag loaded.', 'success');
} catch (error) {
setToast(error.message || 'Failed to create tag.', 'error');
} finally {
button.disabled = false;
}
});
searchInput?.addEventListener('input', function () {
window.clearTimeout(searchTimeout);
searchTimeout = window.setTimeout(async function () {
try {
await refreshCards();
} catch (error) {
setToast(error.message || 'Failed to search tags.', 'error');
}
}, 150);
});
[sortSelect, createdBySelect, yearSelect].forEach(function (field) {
field?.addEventListener('change', async function () {
try {
await refreshCards();
} catch (error) {
setToast(error.message || 'Failed to update tag filters.', 'error');
}
});
});
grid.addEventListener('click', async function (event) {
const actionButton = event.target.closest('[data-action]');
const snapshotLink = event.target.closest('.tag-snapshot-badge');
if (snapshotLink) return;
const cardEl = event.target.closest('.tag-card');
if (!cardEl) return;
if (!actionButton) {
window.location.href = cardEl.dataset.filterUrl;
return;
}
event.preventDefault();
event.stopPropagation();
const action = actionButton.dataset.action;
if (action === 'edit') {
cardEl.classList.add('is-editing');
const input = cardEl.querySelector('.tag-card__rename input');
if (input) {
input.focus();
input.select();
}
return;
}
if (action === 'cancel-edit') {
cardEl.classList.remove('is-editing');
return;
}
if (action === 'save-edit') {
const input = cardEl.querySelector('.tag-card__rename input');
const nextName = (input?.value || '').trim();
if (!nextName) {
setToast('Tag name is required.', 'error');
input?.focus();
return;
}
actionButton.disabled = true;
try {
await submitJson(cardEl.dataset.renameUrl, 'POST', { name: nextName });
await refreshCards();
setToast('Tag renamed.', 'success');
} catch (error) {
setToast(error.message || 'Rename failed.', 'error');
} finally {
actionButton.disabled = false;
}
return;
}
if (action === 'delete') {
const tagName = cardEl.querySelector('.tag-card__display strong')?.textContent || 'this tag';
if (!window.confirm('Delete "' + tagName + '"? This only removes the tag and its tag links.')) return;
actionButton.disabled = true;
try {
await fetch(withApiKey(cardEl.dataset.deleteUrl), {
method: 'DELETE',
headers: buildHeaders(false),
credentials: 'same-origin',
}).then(async function (response) {
if (!response.ok) {
let message = 'Delete failed';
try {
const payload = await response.json();
message = payload.detail || message;
} catch (_err) {
message = await response.text() || message;
}
throw new Error(message);
}
});
await refreshCards();
setToast('Tag deleted.', 'success');
} catch (error) {
setToast(error.message || 'Delete failed.', 'error');
} finally {
actionButton.disabled = false;
}
return;
}
if (action === 'copy-urls') {
actionButton.disabled = true;
try {
await copyTextFromUrl(cardEl.dataset.exportUrlsUrl);
} catch (error) {
setToast(error.message || 'Failed to copy URLs.', 'error');
} finally {
actionButton.disabled = false;
}
return;
}
if (action === 'download-jsonl') {
actionButton.disabled = true;
try {
const tagName = cardEl.querySelector('.tag-card__display strong')?.textContent || 'tag';
await downloadFileFromUrl(cardEl.dataset.exportJsonlUrl, 'tag-' + slugify(tagName) + '-snapshots.jsonl');
} catch (error) {
setToast(error.message || 'Failed to download JSONL.', 'error');
} finally {
actionButton.disabled = false;
}
}
});
grid.addEventListener('keydown', function (event) {
if (event.key !== 'Enter') return;
const input = event.target.closest('.tag-card__rename input');
if (!input) return;
event.preventDefault();
const saveButton = input.closest('.tag-card__rename')?.querySelector('[data-action="save-edit"]');
saveButton?.click();
});
const initialState = getCurrentState();
renderCards(cards, initialState);
syncLocation(initialState);
});
</script>
{% endblock %}

View File

@@ -0,0 +1,249 @@
{% extends "admin/change_form.html" %}
{% block bodyclass %}{{ block.super }} app-personas model-persona{% endblock %}
{% block extrastyle %}
{{ block.super }}
<style>
.persona-import-hero {
margin: 0 0 22px;
padding: 22px 24px;
border-radius: 18px;
border: 1px solid #d8dee9;
background:
radial-gradient(circle at top right, rgba(67, 97, 238, 0.10), transparent 32%),
linear-gradient(135deg, #fff7ed 0%, #ffffff 45%, #ecfeff 100%);
box-shadow: 0 10px 30px rgba(15, 23, 42, 0.06);
display: grid;
gap: 18px;
grid-template-columns: minmax(0, 1.8fr) minmax(280px, 1fr);
align-items: start;
}
.persona-import-hero h2 {
margin: 0 0 8px;
font-size: 28px;
line-height: 1.1;
color: #111827;
}
.persona-import-hero p {
margin: 0;
color: #475569;
max-width: 70ch;
font-size: 14px;
}
.persona-import-hero__meta {
display: grid;
gap: 10px;
grid-template-columns: repeat(auto-fit, minmax(160px, 1fr));
}
.persona-import-hero__stat {
padding: 14px 16px;
border-radius: 14px;
background: rgba(255, 255, 255, 0.86);
border: 1px solid rgba(203, 213, 225, 0.85);
}
.persona-import-hero__stat span {
display: block;
font-size: 11px;
text-transform: uppercase;
letter-spacing: 0.08em;
font-weight: 700;
color: #64748b;
margin-bottom: 8px;
}
.persona-import-hero__stat strong,
.persona-import-hero__stat code {
font-size: 18px;
color: #0f172a;
}
.field-import_mode ul,
.field-import_discovered_profile ul {
margin: 0;
padding: 0;
list-style: none;
display: grid;
gap: 12px;
}
.field-import_mode ul {
grid-template-columns: repeat(auto-fit, minmax(220px, 1fr));
}
.field-import_discovered_profile ul {
grid-template-columns: repeat(auto-fit, minmax(260px, 1fr));
max-height: 460px;
overflow: auto;
padding-right: 4px;
}
.field-import_mode li,
.field-import_discovered_profile li {
margin: 0;
}
.field-import_mode label,
.field-import_discovered_profile label {
display: flex;
gap: 12px;
align-items: flex-start;
min-height: 100%;
padding: 14px 16px;
border-radius: 14px;
border: 1px solid #dbe4ee;
background: #fff;
box-shadow: 0 1px 2px rgba(15, 23, 42, 0.04);
cursor: pointer;
transition: transform 0.15s ease, border-color 0.15s ease, box-shadow 0.15s ease;
}
.field-import_mode label:hover,
.field-import_discovered_profile label:hover {
transform: translateY(-1px);
border-color: #7c3aed;
box-shadow: 0 8px 20px rgba(124, 58, 237, 0.10);
}
.field-import_mode input[type="radio"],
.field-import_discovered_profile input[type="radio"] {
margin-top: 3px;
flex: 0 0 auto;
}
.abx-import-mode-option,
.abx-profile-option {
display: grid;
gap: 6px;
}
.abx-import-mode-option strong,
.abx-profile-option strong {
color: #0f172a;
font-size: 15px;
}
.abx-import-mode-option span:last-child,
.abx-profile-option__meta {
color: #64748b;
font-size: 12px;
line-height: 1.5;
}
.abx-profile-option code {
font-size: 11px;
line-height: 1.5;
white-space: normal;
overflow-wrap: anywhere;
color: #334155;
background: #f8fafc;
border: 1px solid #e2e8f0;
border-radius: 10px;
padding: 8px 10px;
}
.abx-persona-path-list,
.abx-persona-artifacts {
display: grid;
gap: 10px;
}
.abx-persona-path-list div,
.abx-persona-artifact {
display: grid;
gap: 6px;
padding: 12px 14px;
border-radius: 12px;
border: 1px solid #e2e8f0;
background: #f8fafc;
}
.abx-persona-path-list code,
.abx-persona-artifact code {
white-space: normal;
overflow-wrap: anywhere;
font-size: 12px;
}
.abx-artifact-state {
display: inline-flex;
width: fit-content;
align-items: center;
border-radius: 999px;
padding: 2px 10px;
font-size: 11px;
font-weight: 700;
text-transform: uppercase;
letter-spacing: 0.04em;
}
.abx-artifact-state--yes {
background: #dcfce7;
color: #166534;
}
.abx-artifact-state--no {
background: #fee2e2;
color: #991b1b;
}
@media (max-width: 960px) {
.persona-import-hero {
grid-template-columns: 1fr;
}
}
</style>
{% endblock %}
{% block extrahead %}
{{ block.super }}
<script>
// Toggle the persona-import form rows so only the fields relevant to the
// currently selected import mode ("discovered" vs "custom") are visible.
document.addEventListener('DOMContentLoaded', function () {
    const radios = Array.from(document.querySelectorAll('input[name="import_mode"]'));
    // Map each import mode to the form rows it should reveal; rows may be
    // null when the admin form omits a field, so guard before styling.
    const rowsByMode = {
        discovered: [document.querySelector('.form-row.field-import_discovered_profile')],
        custom: [
            document.querySelector('.form-row.field-import_source'),
            document.querySelector('.form-row.field-import_profile_name'),
        ],
    };
    function syncRows() {
        const checkedRadio = radios.find(function (radio) { return radio.checked; });
        const mode = (checkedRadio && checkedRadio.value) || 'none';
        Object.keys(rowsByMode).forEach(function (rowMode) {
            rowsByMode[rowMode].forEach(function (row) {
                if (row) row.style.display = rowMode === mode ? '' : 'none';
            });
        });
    }
    radios.forEach(function (radio) { radio.addEventListener('change', syncRows); });
    // Apply visibility once on load so the initial state matches the selection.
    syncRows();
});
</script>
{% endblock %}
{% block form_top %}
<section class="persona-import-hero">
<div>
<h2>Bootstrap a persona from a real browser session</h2>
<p>
Pick a local Chromium profile, paste an absolute profile path, or attach to a live CDP endpoint.
The form saves the Persona normally, then imports profile files, cookies, and optional tab storage into
the Persona's own directories.
</p>
</div>
<div class="persona-import-hero__meta">
<div class="persona-import-hero__stat">
<span>Detected profiles</span>
<strong>{{ detected_profile_count }}</strong>
</div>
<div class="persona-import-hero__stat">
<span>Persona artifacts</span>
<code>chrome_user_data</code>
<code>cookies.txt</code>
<code>auth.json</code>
</div>
</div>
</section>
{{ block.super }}
{% endblock %}

View File

@@ -706,14 +706,14 @@
? Math.max(0, Math.min(100, extractor.progress))
: null;
const progressStyle = progress !== null ? ` style="width: ${progress}%;"` : '';
const pidHtml = extractor.pid ? `<span class="pid-label compact">pid ${extractor.pid}</span>` : '';
const pidHtml = extractor.status === 'started' && extractor.pid ? `<span class="pid-label compact">pid ${extractor.pid}</span>` : '';
return `
<span class="extractor-badge ${extractor.status || 'queued'}">
<span class="progress-fill"${progressStyle}></span>
<span class="badge-content">
<span class="badge-icon">${icon}</span>
<span>${extractor.plugin || 'unknown'}</span>
<span>${extractor.label || extractor.plugin || 'unknown'}</span>
${pidHtml}
</span>
</span>
@@ -742,6 +742,23 @@
`;
}
const hasProcessEntries = (snapshot.all_plugins || []).some(extractor => extractor.source === 'process');
const hasArchiveResults = (snapshot.all_plugins || []).some(extractor => extractor.source === 'archiveresult');
const processOnly = hasProcessEntries && !hasArchiveResults;
const runningProcessCount = (snapshot.all_plugins || []).filter(extractor => extractor.source === 'process' && extractor.status === 'started').length;
const failedProcessCount = (snapshot.all_plugins || []).filter(extractor => extractor.source === 'process' && extractor.status === 'failed').length;
const snapshotMeta = (snapshot.total_plugins || 0) > 0
? processOnly
? runningProcessCount > 0
? `Running ${runningProcessCount}/${snapshot.total_plugins || 0} setup hooks`
: failedProcessCount > 0
? `${failedProcessCount} setup hook${failedProcessCount === 1 ? '' : 's'} failed`
: `${snapshot.completed_plugins || 0}/${snapshot.total_plugins || 0} setup hooks`
: hasProcessEntries
? `${snapshot.completed_plugins || 0}/${snapshot.total_plugins || 0} tasks${(snapshot.failed_plugins || 0) > 0 ? ` <span style="color:#f85149">(${snapshot.failed_plugins} failed)</span>` : ''}${runningProcessCount > 0 ? ` <span style="color:#d29922">(${runningProcessCount} hooks running)</span>` : ''}`
: `${snapshot.completed_plugins || 0}/${snapshot.total_plugins || 0} extractors${(snapshot.failed_plugins || 0) > 0 ? ` <span style="color:#f85149">(${snapshot.failed_plugins} failed)</span>` : ''}`
: 'Waiting for extractors...';
return `
<div class="snapshot-item">
<div class="snapshot-header">
@@ -750,9 +767,7 @@
<div class="snapshot-info">
<div class="snapshot-url">${formatUrl(snapshot.url)}</div>
<div class="snapshot-meta">
${(snapshot.total_plugins || 0) > 0
? `${snapshot.completed_plugins || 0}/${snapshot.total_plugins || 0} extractors${(snapshot.failed_plugins || 0) > 0 ? ` <span style="color:#f85149">(${snapshot.failed_plugins} failed)</span>` : ''}`
: 'Waiting for extractors...'}
${snapshotMeta}
</div>
</div>
${snapshotPidHtml}
@@ -762,7 +777,7 @@
</div>
<div class="snapshot-progress">
<div class="progress-bar-container">
<div class="progress-bar snapshot ${snapshot.status === 'started' && (snapshot.progress || 0) === 0 ? 'indeterminate' : ''}"
<div class="progress-bar snapshot ${((processOnly && runningProcessCount > 0) || (snapshot.status === 'started' && (snapshot.progress || 0) === 0)) ? 'indeterminate' : ''}"
style="width: ${snapshot.progress || 0}%"></div>
</div>
</div>
@@ -784,6 +799,29 @@
if (crawl.active_snapshots && crawl.active_snapshots.length > 0) {
snapshotsHtml = crawl.active_snapshots.map(s => renderSnapshot(s, crawl.id)).join('');
}
let setupHtml = '';
if (crawl.setup_plugins && crawl.setup_plugins.length > 0) {
const setupSummary = `${crawl.setup_completed_plugins || 0}/${crawl.setup_total_plugins || 0} setup tasks${(crawl.setup_failed_plugins || 0) > 0 ? ` <span style="color:#f85149">(${crawl.setup_failed_plugins} failed)</span>` : ''}`;
const sortedSetup = [...crawl.setup_plugins].sort((a, b) =>
(a.plugin || '').localeCompare(b.plugin || '')
);
setupHtml = `
<div class="snapshot-item">
<div class="snapshot-header">
<div class="snapshot-header-link">
<span class="snapshot-icon">&#9881;</span>
<div class="snapshot-info">
<div class="snapshot-url">Crawl Setup</div>
<div class="snapshot-meta">${setupSummary}</div>
</div>
</div>
</div>
<div class="extractor-list">
${sortedSetup.map(e => renderExtractor(e)).join('')}
</div>
</div>
`;
}
// Show warning if crawl is stuck (queued but can't start)
let warningHtml = '';
@@ -847,6 +885,7 @@
${warningHtml}
<div class="crawl-body">
<div class="snapshot-list">
${setupHtml}
${snapshotsHtml}
</div>
</div>

File diff suppressed because it is too large. [Load Diff]

View File

@@ -1,4 +1,4 @@
{% load static tz admin_urls %}
{% load static tz admin_urls core_tags %}
<!DOCTYPE html>
<html lang="en">
@@ -9,6 +9,10 @@
<link rel="stylesheet" href="{% static 'admin/css/base.css' %}">
<link rel="stylesheet" href="{% static 'admin.css' %}">
<link rel="stylesheet" href="{% static 'bootstrap.min.css' %}">
{% api_token as api_token %}
<script>
window.ARCHIVEBOX_API_KEY = "{{ api_token|escapejs }}";
</script>
<script src="{% static 'jquery.min.js' %}"></script>
{% block extra_head %}

View File

@@ -6,7 +6,7 @@
<a href="/admin/core/tag/">Tags</a> |
<a href="/admin/core/archiveresult/?o=-1">Log</a> &nbsp; &nbsp;
<a href="{% url 'Docs' %}" target="_blank" rel="noopener noreferrer">Docs</a> |
<a href="/api">API</a> |
<a href="/api/v1/docs">API</a> |
<a href="{% url 'public-index' %}">Public</a> |
<a href="/admin/">Admin</a>
&nbsp; &nbsp;

View File

@@ -456,6 +456,9 @@
text-overflow: ellipsis;
white-space: nowrap;
}
.thumb-card:has([data-compact]) .card-text {
display: none;
}
.thumb-card:has([data-compact]) .thumbnail-text-header,
.thumb-card:has([data-compact]) .thumbnail-compact-icon,
.thumb-card:has([data-compact]) .thumbnail-compact-label {
@@ -620,8 +623,9 @@
<div class="header-top container-fluid">
<div class="row nav">
<div class="col-lg-2" style="line-height: 50px; vertical-align: middle">
<a href="../../index.html" class="header-archivebox" title="Go to Main Index...">
<img src="/static/archive.png" alt="Archive Icon">
{% public_base_url as public_base %}
<a href="{% if public_base %}{{ public_base }}/public/{% else %}/{% endif %}" class="header-archivebox" title="Go to Public Index...">
<img src="{% if public_base %}{{ public_base }}/static/archive.png{% else %}/static/archive.png{% endif %}" alt="Archive Icon">
ArchiveBox
</a>
</div>
@@ -683,12 +687,10 @@
<div class="info-chunk">
<h5>🗃&nbsp; Snapshot: <a href="{% admin_base_url %}/admin/core/snapshot/{{snapshot_id|default:id}}/change/"><code style="color: rgba(255,255,255,0.6); font-weight: 200; font-size: 12px; background-color: #1a1a1a"><b>[{{timestamp}}]</b> <small>{{snapshot_id|default:id|truncatechars:24}}</small></code></a></h5>
<a href="{% snapshot_url snapshot 'index.json' %}" title="JSON summary of archived link.">JSON</a> |
<a href="{% snapshot_url snapshot 'warc/' %}" title="Any WARC archives for the page">WARC</a> |
<a href="{% snapshot_url snapshot 'media/' %}" title="Audio, Video, and Subtitle files.">Media</a> |
<a href="{% snapshot_url snapshot 'git/' %}" title="Any git repos at the url">Git</a> |
<a href="{% snapshot_base_url snapshot %}/?files=1" title="Browse the full SNAP_DIR for this snapshot">See all files...</a> |
<a href="{% admin_base_url %}/admin/core/snapshot/?q={{snapshot_id|default:id}}" title="Go to the Snapshot admin to update, overwrite, or delete this Snapshot">Actions</a> |
<a href="{% admin_base_url %}/admin/core/snapshot/{{snapshot_id|default:id}}/change/" title="Edit this snapshot in the Admin UI">Admin</a> |
<a href="{% snapshot_base_url snapshot %}/?files=1" title="Webserver-provided index of files directory.">See all files...</a><br/>
<a href="https://web.archive.org/web/{{url}}" title="Search for a copy of the URL saved in Archive.org" target="_blank" rel="noreferrer">Archive.org</a><br/>
</div>
</div>
</div>
@@ -713,12 +715,12 @@
<a href="{{display_url}}" data-no-preview="1" title="Download output file" download>⬇️</a>
{% endif %}
</div>
<a href="{{ display_url }}" target="preview">
<h4 class="card-title">{% plugin_icon result_info.name %} {{ result_info.name|plugin_name|truncatechars:20 }}</h4>
</a>
<a href="{{ display_url }}" title="Open in new tab..." target="_blank" rel="noopener">
<p class="card-text"><code>{{ result_info.path }}</code></p>
</a>
<a href="{{ display_url }}" target="preview">
<h4 class="card-title">{{ result_info.name|title }}</h4>
</a>
{% if result_info.result %}
{% with plugin_base=result_info.name|plugin_name %}
{% if plugin_base == 'ytdlp' or plugin_base == 'yt-dlp' or plugin_base == 'youtube-dl' %}

View File

@@ -902,9 +902,9 @@
<div class="header-top">
<div class="header-nav">
<div class="header-col header-left" style="line-height: 58px; vertical-align: middle">
<a href="/" class="header-archivebox" title="Go to Main Index...">
{% web_base_url as web_base %}
<img src="{% if web_base %}//{{ web_base|cut:'http://'|cut:'https://' }}/static/archive.png{% else %}{% static 'archive.png' %}{% endif %}" alt="Archive Icon">
{% public_base_url as public_base %}
<a href="{% if public_base %}{{ public_base }}/public/{% else %}/{% endif %}" class="header-archivebox" title="Go to Public Index...">
<img src="{% if public_base %}{{ public_base }}/static/archive.png{% else %}{% static 'archive.png' %}{% endif %}" alt="Archive Icon">
ArchiveBox
</a>
</div>
@@ -996,8 +996,7 @@
<br/>
<div class="external-links">
📁 &nbsp;
<a href="{% snapshot_base_url snapshot %}/?files=1" title="Browse files for this snapshot" target="_blank">FILES</a> &nbsp;|&nbsp; 🗃️
<a href="{% snapshot_url snapshot warc_path %}" title="Download the ArchiveBox-generated WARC file" target="_blank">WARC</a> &nbsp;|&nbsp;
<a href="{% snapshot_base_url snapshot %}/?files=1" title="Browse the full SNAP_DIR for this snapshot" target="_blank">See all files...</a> &nbsp;|&nbsp;
<a href="https://web.archive.org/web/{{url}}" title="Search for a copy of the URL saved in Archive.org" target="_blank" rel="noreferrer">🏛️ Archive.org</a>
<!--<a href="https://archive.md/{{url}}" title="Search for a copy of the URL saved in Archive.today" target="_blank" rel="noreferrer">Archive.today</a> &nbsp;|&nbsp; -->
<!--<a href="https://ghostarchive.org/search?term={{url}}" title="Search for a copy of the URL saved in GhostArchive.org" target="_blank" rel="noreferrer">More...</a>-->
@@ -1010,7 +1009,7 @@
{% for result in archiveresults %}
{% with display_path=result.path|default:result.result.embed_path display_url='' %}
{% with display_path=result.path display_url='' %}
{% if display_path %}{% snapshot_url snapshot display_path as display_url %}{% endif %}
<div class="thumb-card{% if forloop.first %} selected-card{% endif %}"{% if display_url %} data-preview-url="{{display_url}}"{% endif %}>
<div class="thumb-body">

View File

@@ -78,6 +78,7 @@ textarea, select, input[type="text"] {
box-shadow: 4px 4px 4px rgba(0,0,0,0.02);
width: 100%;
padding: 8px 12px;
font-family: inherit;
font-size: 14px;
}
@@ -85,6 +86,10 @@ textarea {
min-height: 300px;
}
input[type="text"] {
min-height: 42px;
}
textarea[rows="3"] {
min-height: 80px;
}
@@ -153,6 +158,13 @@ select {
margin-bottom: 20px;
}
.settings-row {
display: grid;
grid-template-columns: minmax(260px, 340px) minmax(420px, 1fr);
gap: 18px;
align-items: start;
}
.form-field label {
display: block;
font-size: 16px;
@@ -160,6 +172,234 @@ select {
margin-bottom: 8px;
}
.field-header {
display: flex;
align-items: center;
gap: 10px;
margin-bottom: 8px;
}
.field-header label {
margin-bottom: 0;
}
.url-workbench {
display: grid;
grid-template-columns: minmax(0, 1fr) minmax(280px, 360px);
gap: 18px;
align-items: start;
}
.url-editor-column {
min-width: 0;
}
.url-editor-shell {
position: relative;
}
.url-editor-shell textarea[name="url"] {
position: relative;
z-index: 2;
background: transparent;
color: #1f2937;
-webkit-text-fill-color: #1f2937;
caret-color: #1f2937;
min-height: 240px;
height: 240px;
line-height: 1.5;
resize: vertical;
}
.url-editor-shell textarea[name="url"]::selection {
background: rgba(0, 72, 130, 0.18);
}
.url-highlight-layer {
position: absolute;
inset: 2px;
z-index: 1;
margin: 0;
padding: 8px 12px;
overflow: auto;
pointer-events: none;
white-space: pre-wrap;
overflow-wrap: anywhere;
word-break: break-word;
font-family: inherit;
font-size: 14px;
line-height: 1.5;
color: transparent;
background: transparent;
border-radius: 2px;
scrollbar-width: none;
}
.url-highlight-layer::-webkit-scrollbar {
display: none;
}
.url-highlight-segment {
border-radius: 3px;
}
.detected-urls-panel {
display: flex;
flex-direction: column;
min-height: 240px;
padding: 12px 14px;
background: linear-gradient(180deg, #fff 0%, #f6f8fb 100%);
border: 1px solid #d7e2eb;
border-radius: 8px;
overflow: hidden;
}
.detected-urls-header {
display: flex;
align-items: baseline;
justify-content: space-between;
gap: 12px;
margin-bottom: 10px;
}
.detected-urls-summary {
font-size: 12px;
color: #5f6c78;
}
.detected-urls-list {
flex: 1;
min-height: 0;
display: grid;
align-content: start;
gap: 8px;
overflow: auto;
padding-right: 4px;
}
.detected-urls-empty {
padding: 8px 0;
color: #6b7280;
font-size: 13px;
line-height: 1.5;
}
.detected-url-item {
display: grid;
gap: 8px;
padding: 10px 12px;
border-left: 4px solid var(--detected-url-border, #d0d7de);
border-radius: 6px;
background: linear-gradient(90deg, var(--detected-url-bg, rgba(0, 0, 0, 0.03)), rgba(255, 255, 255, 0.96) 28%);
}
.detected-url-topline {
display: flex;
align-items: center;
justify-content: space-between;
gap: 8px;
}
.detected-url-controls {
display: flex;
flex-wrap: nowrap;
gap: 6px;
min-width: 0;
}
.detected-url-number {
width: 20px;
height: 20px;
display: inline-flex;
align-items: center;
justify-content: center;
border-radius: 999px;
background: rgba(15, 23, 42, 0.08);
color: #24303b;
font-size: 10px;
font-weight: 700;
}
.detected-url-body {
min-width: 0;
}
.detected-url-value {
display: block;
font-size: 12px;
line-height: 1.45;
color: #1f2937;
overflow-wrap: anywhere;
}
.detected-url-toggle-btn {
flex: 0 0 auto;
display: inline-flex;
align-items: center;
justify-content: center;
padding: 4px 8px;
min-height: 24px;
border: 1px solid rgba(148, 163, 184, 0.4);
border-radius: 999px;
background: rgba(148, 163, 184, 0.12);
color: #64748b;
font-size: 11px;
font-weight: 700;
line-height: 1;
white-space: nowrap;
transition: background-color 120ms ease, border-color 120ms ease, color 120ms ease;
cursor: pointer;
}
.detected-url-toggle-btn:hover {
background: rgba(15, 23, 42, 0.08);
}
.detected-url-toggle-btn-inactive:hover {
border-color: rgba(180, 35, 24, 0.28);
background: rgba(180, 35, 24, 0.10);
color: #b42318;
}
.detected-url-toggle-btn-active:hover {
border-color: rgba(22, 101, 52, 0.28);
background: rgba(22, 101, 52, 0.10);
color: #166534;
}
.detected-url-toggle-btn-disabled,
.detected-url-toggle-btn-disabled:hover {
border-color: rgba(203, 213, 225, 0.55);
background: rgba(226, 232, 240, 0.45);
color: #94a3b8;
cursor: not-allowed;
}
.detected-url-message {
margin-top: 4px;
font-size: 11px;
color: #617080;
line-height: 1.45;
}
.detected-url-allowlisted .detected-url-value {
color: #166534;
}
.detected-url-denied .detected-url-value {
color: #b42318;
text-decoration: line-through;
text-decoration-thickness: 1.5px;
}
.detected-url-denied .detected-url-message {
color: #b42318;
}
.detected-url-filtered .detected-url-value {
color: #6b7280;
}
.form-field .help-text {
font-size: 12px;
color: #666;
@@ -173,7 +413,137 @@ select {
margin-top: 4px;
}
/* Checkbox fields (for overwrite, update, index_only) */
.tag-editor-container {
display: flex;
flex-wrap: wrap;
align-items: center;
gap: 6px;
padding: 8px 12px;
min-height: 44px;
background: #fff;
border: 2px solid #004882;
border-radius: 4px;
box-shadow: 4px 4px 4px rgba(0,0,0,0.02);
cursor: text;
}
.tag-editor-container:focus-within {
border-color: #2c7ec1;
}
.tag-pills {
display: flex;
flex-wrap: wrap;
gap: 6px;
align-items: center;
}
.tag-pill {
display: inline-flex;
align-items: center;
gap: 4px;
padding: 4px 8px 4px 10px;
background: var(--tag-bg, #e2e8f0);
color: var(--tag-fg, #1e293b);
border-radius: 16px;
border: 1px solid var(--tag-border, #cbd5e1);
font-size: 13px;
font-weight: 500;
}
.tag-remove-btn {
display: inline-flex;
align-items: center;
justify-content: center;
width: 16px;
height: 16px;
padding: 0;
margin: 0;
border: 1px solid rgba(15, 23, 42, 0.12);
border-radius: 50%;
background: rgba(15, 23, 42, 0.08);
color: inherit;
font-size: 14px;
line-height: 1;
cursor: pointer;
}
.tag-inline-input {
flex: 1;
min-width: 120px;
padding: 4px 0;
border: none !important;
box-shadow: none !important;
outline: none;
background: transparent;
}
.tag-inline-input::placeholder {
color: #7c8b98;
}
.url-filters-widget textarea {
min-height: 58px;
font-family: monospace;
font-size: 13px;
}
.url-filters-field > label {
display: none;
}
.url-filters-grid {
display: grid;
grid-template-columns: repeat(2, minmax(0, 1fr));
gap: 12px;
}
.url-filter-label-row {
display: flex;
align-items: baseline;
flex-wrap: nowrap;
gap: 10px;
width: 100%;
margin-bottom: 6px;
}
.url-filters-column .url-filter-label {
display: block;
font-size: 14px;
margin-bottom: 0;
}
.url-filter-label-main {
font-weight: 600;
white-space: nowrap;
}
.url-filter-label-note {
display: inline-block;
flex: 0 0 auto;
margin-left: auto;
font-size: 12px;
color: #7a7a7a;
font-weight: 400;
font-style: italic;
text-align: right;
white-space: nowrap;
}
.url-filters-toggle {
display: inline-flex !important;
align-items: center;
gap: 8px;
margin-top: 10px;
font-size: 14px !important;
font-weight: 600;
}
.url-filters-toggle input[type="checkbox"] {
width: auto;
margin: 0;
}
.checkbox-field {
display: flex;
align-items: center;
@@ -193,7 +563,6 @@ select {
/* URL Counter */
.url-counter {
display: inline-block;
margin-top: 8px;
padding: 4px 10px;
font-size: 13px;
font-weight: 600;
@@ -209,13 +578,27 @@ select {
border-color: #c3e6cb;
}
@media (max-width: 1020px) {
.settings-row {
grid-template-columns: 1fr;
}
.url-workbench {
grid-template-columns: 1fr;
}
.url-filters-grid {
grid-template-columns: 1fr;
}
}
/* Plugin Presets */
.plugin-presets {
display: flex;
flex-wrap: wrap;
align-items: center;
gap: 8px;
margin-bottom: 20px;
margin-bottom: 18px;
padding: 15px;
background-color: #f8f9fa;
border: 1px solid #dee2e6;
@@ -254,11 +637,18 @@ select {
/* Plugin groups */
.plugin-group {
margin-bottom: 20px;
padding: 15px;
padding: 14px 16px;
background-color: white;
border: 1px solid #ddd;
border-radius: 6px;
min-width: 0;
}
.plugin-groups-grid {
display: grid;
grid-template-columns: repeat(2, minmax(280px, 1fr));
gap: 16px;
align-items: start;
}
.plugin-group-header {
@@ -268,6 +658,7 @@ select {
margin-bottom: 12px;
padding-bottom: 8px;
border-bottom: 2px solid #004882;
gap: 12px;
}
.plugin-group-header label {
@@ -277,6 +668,12 @@ select {
margin: 0;
}
.plugin-group-note {
font-size: 12px;
color: #7a7a7a;
white-space: nowrap;
}
.select-all-btn {
padding: 4px 12px;
font-size: 12px;
@@ -293,42 +690,105 @@ select {
.plugin-checkboxes {
display: grid;
grid-template-columns: repeat(auto-fill, minmax(200px, 1fr));
gap: 8px;
grid-template-columns: 1fr;
gap: 6px;
}
.plugin-checkboxes ul {
list-style-type: none;
padding: 0;
margin: 0;
display: contents;
.plugin-checkboxes > div {
display: grid;
grid-template-columns: repeat(2, minmax(0, 1fr));
gap: 6px 10px;
}
.plugin-checkboxes li {
.plugin-checkboxes > div > div {
display: flex;
align-items: center;
gap: 8px;
padding: 6px;
padding: 6px 8px;
border: 1px solid #e3e8ef;
background-color: #fff;
border-radius: 4px;
transition: background-color 0.2s;
}
.plugin-checkboxes li:hover {
.plugin-checkboxes > div > div:hover {
background-color: #f5f5f5;
}
.plugin-checkboxes input[type="checkbox"] {
grid-column: 2;
grid-row: 1 / span 2;
margin: 0;
margin-top: 2px;
width: auto;
flex: 0 0 auto;
}
.plugin-checkboxes label {
#add-form .plugin-checkboxes label {
display: grid !important;
grid-template-columns: 18px 16px minmax(0, 1fr);
column-gap: 8px;
row-gap: 3px;
align-items: start;
width: 100%;
margin: 0;
font-size: 14px;
font-weight: normal;
cursor: pointer;
}
.plugin-choice-name {
grid-column: 3;
grid-row: 1;
font-weight: 500;
color: #1f2937;
}
#add-form .plugin-choice-icon {
grid-column: 1;
grid-row: 1 / span 2;
display: inline-flex;
align-items: center;
justify-content: center;
color: #7a7a7a;
flex: 0 0 auto;
}
#add-form .plugin-choice-icon .abx-output-icon {
display: inline-flex;
align-items: center;
justify-content: center;
}
#add-form .plugin-choice-icon svg {
width: 18px;
height: 18px;
}
#add-form .plugin-choice-description {
grid-column: 3;
grid-row: 2;
margin-left: 0;
display: inline-block;
font-size: 12px;
color: #7a7a7a !important;
text-decoration: none !important;
text-align: left;
}
#add-form .plugin-checkboxes label a.plugin-choice-description:link,
#add-form .plugin-checkboxes label a.plugin-choice-description:visited,
#add-form .plugin-checkboxes label a.plugin-choice-description:active {
color: #7a7a7a !important;
text-decoration: none !important;
}
#add-form .plugin-checkboxes label a.plugin-choice-description:hover,
#add-form .plugin-checkboxes label a.plugin-choice-description:focus {
color: #4b5563 !important;
text-decoration: underline !important;
}
/* Advanced section (collapsible) */
.advanced-section {
background-color: white;
@@ -388,6 +848,14 @@ input:focus, select:focus, textarea:focus, button:focus {
grid-template-columns: 1fr;
}
.plugin-groups-grid {
grid-template-columns: 1fr;
}
.plugin-checkboxes > div {
grid-template-columns: 1fr;
}
.plugin-group-header {
flex-direction: column;
align-items: flex-start;

View File

@@ -477,6 +477,10 @@ body.model-snapshot.change-list #content .object-tools {
max-width: 220px;
}
#content td.field-tags_inline .tag-editor-inline.readonly {
padding-right: 0;
}
#content th.field-tags_inline,
#content td.field-tags_inline {
max-width: 220px;
@@ -610,6 +614,56 @@ body.model-snapshot.change-list #content .object-tools {
border-radius: 4px;
}
body.model-archiveresult.change-list #result_list td.field-cmd_str {
width: 300px !important;
max-width: 300px !important;
min-width: 300px !important;
}
body.model-archiveresult.change-list #result_list td.field-cmd_str > div,
body.model-archiveresult.change-list #result_list td.field-cmd_str code {
max-width: 300px !important;
}
body.model-archiveresult.change-list #result_list {
table-layout: fixed;
width: 100%;
}
body.model-archiveresult.change-list #result_list th.column-cmd_str,
body.model-archiveresult.change-list #result_list td.field-cmd_str {
width: 300px !important;
max-width: 300px !important;
min-width: 300px !important;
overflow: hidden !important;
box-sizing: border-box;
}
body.model-archiveresult.change-list #result_list th.column-process_link,
body.model-archiveresult.change-list #result_list td.field-process_link {
width: 72px;
white-space: nowrap;
}
body.model-archiveresult.change-list #result_list th.column-machine_link,
body.model-archiveresult.change-list #result_list td.field-machine_link {
width: 180px;
}
body.model-archiveresult.change-list #result_list td.field-snapshot_info a {
display: block;
overflow: hidden;
text-overflow: ellipsis;
}
body.model-archiveresult.change-list #result_list td.field-cmd_str > div,
body.model-archiveresult.change-list #result_list td.field-cmd_str code {
width: 300px !important;
min-width: 300px !important;
max-width: 300px !important;
box-sizing: border-box;
}
body.filters-collapsed #content #changelist-filter {
display: none !important;
}
@@ -637,10 +691,49 @@ body.filters-collapsed .filtered div.xfull {
font-variant: small-caps;
}
#result_list tbody td.field-status {
#result_list tbody td.field-status,
#result_list tbody td.field-status_badge {
font-variant: small-caps;
}
body.model-archiveresult.filters-collapsed.change-list #changelist .changelist-form-container {
gap: 0 !important;
}
body.model-archiveresult.filters-collapsed.change-list #changelist .changelist-form-container > div,
body.model-archiveresult.filters-collapsed.change-list #changelist .results,
body.model-archiveresult.filters-collapsed.change-list #changelist .paginator,
body.model-archiveresult.filters-collapsed.change-list #changelist #toolbar,
body.model-archiveresult.filters-collapsed.change-list #changelist #changelist-form,
body.model-archiveresult.filters-collapsed.change-list #changelist #result_list {
width: 100% !important;
max-width: 100% !important;
margin-right: 0 !important;
}
body.model-archiveresult.change-list #result_list tbody tr {
transition: background-color 0.15s ease, opacity 0.15s ease;
}
body.model-archiveresult.change-list #result_list tbody tr:has(td.field-status_badge .status-badge.started),
body.model-archiveresult.change-list #result_list tbody tr:has(td.field-status_badge .status-badge.backoff) {
background: rgba(251, 191, 36, 0.14);
}
body.model-archiveresult.change-list #result_list tbody tr:has(td.field-status_badge .status-badge.failed) {
background: rgba(239, 68, 68, 0.12);
}
body.model-archiveresult.change-list #result_list tbody tr:has(td.field-status_badge .status-badge.succeeded) {
background: rgba(34, 197, 94, 0.11);
}
body.model-archiveresult.change-list #result_list tbody tr:has(td.field-status_badge .status-badge.skipped),
body.model-archiveresult.change-list #result_list tbody tr:has(td.field-status_badge .status-badge.noresults) {
background: rgba(148, 163, 184, 0.10);
opacity: 0.82;
}
.inline-group .tabular td.original p {
margin-top: -28px;
}
@@ -697,6 +790,7 @@ tbody .output-link:hover {opacity: 1;}
.status-badge.failed { background: #fee2e2; color: #ef4444; }
.status-badge.backoff { background: #fef3c7; color: #f59e0b; }
.status-badge.skipped { background: #f3f4f6; color: #6b7280; }
.status-badge.noresults { background: #f1f5f9; color: #64748b; }
/* Progress Bar */
.snapshot-progress-bar {

View File

@@ -0,0 +1,195 @@
import re
import pytest
from django.contrib.auth import get_user_model
from django.urls import reverse
from archivebox.config.common import SERVER_CONFIG, SEARCH_BACKEND_CONFIG
from archivebox.core.models import Tag
from archivebox.crawls.models import Crawl
pytestmark = pytest.mark.django_db
User = get_user_model()
WEB_HOST = 'web.archivebox.localhost:8000'
ADMIN_HOST = 'admin.archivebox.localhost:8000'
@pytest.fixture
def admin_user(db):
    """Return a superuser account for authenticating against the /add/ view."""
    credentials = {
        'username': 'addviewadmin',
        'email': 'addviewadmin@test.com',
        'password': 'testpassword',
    }
    return User.objects.create_superuser(**credentials)
def test_add_view_renders_tag_editor_and_url_filter_fields(client, admin_user, monkeypatch):
    """Public /add/ form should render the tag editor, URL filter inputs, and
    URL-detection widgets, and omit the removed overwrite/update checkboxes."""
    monkeypatch.setattr(SERVER_CONFIG, 'PUBLIC_ADD_VIEW', True)
    response = client.get(reverse('add'), HTTP_HOST=WEB_HOST)
    body = response.content.decode()
    assert response.status_code == 200
    # Tagging and crawl-scoping controls are present
    assert 'tag-editor-container' in body
    assert 'name="url_filters_allowlist"' in body
    assert 'name="url_filters_denylist"' in body
    assert 'Same domain only' in body
    assert 'name="persona"' in body
    # Legacy checkboxes were removed from the form
    assert 'Overwrite existing snapshots' not in body
    assert 'Update/retry previously failed URLs' not in body
    assert 'Index only dry run (add crawl but don&#x27;t archive yet)' in body
    # Notes field rendered as a plain text input
    assert 'name="notes"' in body
    assert '<input type="text" name="notes"' in body
    # Persona selector must appear before the plugin section
    assert body.index('name="persona"') < body.index('<h3>Crawl Plugins</h3>')
    # Client-side URL detection / highlighting widgets
    assert 'data-url-regex=' in body
    assert 'id="url-highlight-layer"' in body
    assert 'id="detected-urls-list"' in body
    assert 'detected-url-toggle-btn' in body
def test_add_view_checks_configured_search_backend_by_default(client, monkeypatch):
    """The configured search backend's plugin checkbox should be pre-checked."""
    monkeypatch.setattr(SERVER_CONFIG, 'PUBLIC_ADD_VIEW', True)
    monkeypatch.setattr(SEARCH_BACKEND_CONFIG, 'SEARCH_BACKEND_ENGINE', 'sqlite')
    response = client.get(reverse('add'), HTTP_HOST=WEB_HOST)
    body = response.content.decode()
    assert response.status_code == 200
    # The checkbox for the active backend is rendered already checked
    assert re.search(
        r'<input type="checkbox" name="search_plugins" value="search_backend_sqlite"[^>]* checked\b',
        body,
    )
    # JS guard pins the required search plugin client-side
    assert "const requiredSearchPlugin = 'search_backend_sqlite';" in body
def test_add_view_creates_crawl_with_tag_and_url_filter_overrides(client, admin_user, monkeypatch):
    """POSTing the add form should persist tags, notes, persona, and URL
    allow/deny filter overrides onto the newly created Crawl."""
    monkeypatch.setattr(SERVER_CONFIG, 'PUBLIC_ADD_VIEW', True)
    client.force_login(admin_user)
    response = client.post(
        reverse('add'),
        data={
            'url': 'https://example.com\nhttps://cdn.example.com/asset.js',
            'tag': 'alpha,beta',
            'depth': '1',
            'url_filters_allowlist': 'example.com\n*.example.com',
            'url_filters_denylist': 'cdn.example.com',
            'notes': 'Created from /add/',
            'schedule': '',
            'persona': 'Default',
            'index_only': '',
            'config': '{}',
        },
        HTTP_HOST=WEB_HOST,
    )
    # Successful submit redirects away from the form
    assert response.status_code == 302
    crawl = Crawl.objects.order_by('-created_at').first()
    assert crawl is not None
    assert crawl.tags_str == 'alpha,beta'
    assert crawl.notes == 'Created from /add/'
    assert crawl.config.get('DEFAULT_PERSONA') == 'Default'
    # Filter textarea values are stored newline-separated in crawl config
    assert crawl.config['URL_ALLOWLIST'] == 'example.com\n*.example.com'
    assert crawl.config['URL_DENYLIST'] == 'cdn.example.com'
    # Removed legacy options must not leak into config
    assert 'OVERWRITE' not in crawl.config
    assert 'ONLY_NEW' not in crawl.config
def test_add_view_extracts_urls_from_mixed_text_input(client, admin_user, monkeypatch):
    """URLs embedded in CSV, markdown, JSON, and prose should be extracted in
    input order into one-URL-per-line ``crawl.urls``."""
    monkeypatch.setattr(SERVER_CONFIG, 'PUBLIC_ADD_VIEW', True)
    client.force_login(admin_user)
    response = client.post(
        reverse('add'),
        data={
            'url': '\n'.join([
                'https://sweeting.me,https://google.com',
                'Notes: [ArchiveBox](https://github.com/ArchiveBox/ArchiveBox), https://news.ycombinator.com',
                '[Wiki](https://en.wikipedia.org/wiki/Classification_(machine_learning))',
                '{"items":["https://example.com/three"]}',
                'csv,https://example.com/four',
            ]),
            'tag': '',
            'depth': '0',
            'url_filters_allowlist': '',
            'url_filters_denylist': '',
            'notes': '',
            'schedule': '',
            'persona': 'Default',
            'index_only': '',
            'config': '{}',
        },
        HTTP_HOST=WEB_HOST,
    )
    assert response.status_code == 302
    crawl = Crawl.objects.order_by('-created_at').first()
    assert crawl is not None
    # Extraction must handle commas, markdown links (incl. parens in the
    # wiki URL), JSON strings, and CSV cells
    assert crawl.urls == '\n'.join([
        'https://sweeting.me',
        'https://google.com',
        'https://github.com/ArchiveBox/ArchiveBox',
        'https://news.ycombinator.com',
        'https://en.wikipedia.org/wiki/Classification_(machine_learning)',
        'https://example.com/three',
        'https://example.com/four',
    ])
def test_add_view_exposes_api_token_for_tag_widget_autocomplete(client, admin_user, monkeypatch):
    """Logged-in add view should embed an API key for tag autocomplete."""
    monkeypatch.setattr(SERVER_CONFIG, 'PUBLIC_ADD_VIEW', True)
    client.force_login(admin_user)
    resp = client.get(reverse('add'), HTTP_HOST=WEB_HOST)
    assert resp.status_code == 200
    assert b'window.ARCHIVEBOX_API_KEY' in resp.content
def test_tags_autocomplete_requires_auth_when_public_snapshots_list_disabled(client, settings):
    """Anonymous tag autocomplete must be rejected when the index is private."""
    settings.PUBLIC_SNAPSHOTS_LIST = False
    settings.PUBLIC_INDEX = False
    Tag.objects.create(name='archive')
    response = client.get(
        reverse('api-1:tags_autocomplete'),
        {'q': 'a'},
        HTTP_HOST=ADMIN_HOST,
    )
    # Unauthenticated request is denied outright
    assert response.status_code == 401
def test_tags_autocomplete_allows_public_access_when_public_snapshots_list_enabled(client, settings):
    """Anonymous tag autocomplete works when the snapshot list is public."""
    settings.PUBLIC_SNAPSHOTS_LIST = True
    settings.PUBLIC_INDEX = False
    Tag.objects.create(name='archive')
    response = client.get(
        reverse('api-1:tags_autocomplete'),
        {'q': 'a'},
        HTTP_HOST=ADMIN_HOST,
    )
    assert response.status_code == 200
    # Matching tag is returned in the JSON payload
    assert response.json()['tags'][0]['name'] == 'archive'
def test_tags_autocomplete_allows_authenticated_user_when_public_snapshots_list_disabled(client, admin_user, settings):
    """Logged-in users can use tag autocomplete even with a private index."""
    settings.PUBLIC_SNAPSHOTS_LIST = False
    settings.PUBLIC_INDEX = False
    Tag.objects.create(name='archive')
    client.force_login(admin_user)
    response = client.get(
        reverse('api-1:tags_autocomplete'),
        {'q': 'a'},
        HTTP_HOST=ADMIN_HOST,
    )
    assert response.status_code == 200
    assert response.json()['tags'][0]['name'] == 'archive'

View File

@@ -0,0 +1,151 @@
from archivebox.base_models.admin import KeyValueWidget
def test_key_value_widget_renders_enum_autocomplete_metadata(monkeypatch):
    """Widget should embed enum values and per-widget JS helpers for
    enum-constrained config keys."""
    # Replace the real config-schema lookup with a single enum option
    monkeypatch.setattr(
        KeyValueWidget,
        '_get_config_options',
        lambda self: {
            'CHROME_WAIT_FOR': {
                'plugin': 'chrome',
                'type': 'string',
                'default': 'networkidle2',
                'description': 'Page load completion condition',
                'enum': ['domcontentloaded', 'load', 'networkidle0', 'networkidle2'],
            },
        },
    )
    html = str(
        KeyValueWidget().render(
            'config',
            {'CHROME_WAIT_FOR': 'load'},
            attrs={'id': 'id_config'},
        )
    )
    # Enum metadata serialized into the markup
    assert '"enum": ["domcontentloaded", "load", "networkidle0", "networkidle2"]' in html
    assert 'class="kv-value-options"' in html
    assert 'class="kv-help"' in html
    # JS helper functions are namespaced by the widget id
    assert 'configureValueInput_id_config' in html
    assert 'describeMeta_id_config' in html
    assert 'validateValueAgainstMeta_id_config' in html
def test_key_value_widget_renders_numeric_and_pattern_constraints(monkeypatch):
    """Widget should embed min/max and regex-pattern constraints plus the
    validation/coercion JS helpers."""
    monkeypatch.setattr(
        KeyValueWidget,
        '_get_config_options',
        lambda self: {
            'TIMEOUT': {
                'plugin': 'base',
                'type': 'integer',
                'default': 60,
                'description': 'Timeout in seconds',
                'minimum': 5,
                'maximum': 120,
            },
            'CHROME_RESOLUTION': {
                'plugin': 'chrome',
                'type': 'string',
                'default': '1440,2000',
                'description': 'Viewport resolution',
                'pattern': '^\\d+,\\d+$',
            },
        },
    )
    html = str(KeyValueWidget().render('config', {}, attrs={'id': 'id_config'}))
    # Numeric bounds serialized as-is
    assert '"minimum": 5' in html
    assert '"maximum": 120' in html
    # Pattern is double-escaped once more for the JSON-in-HTML embedding
    assert '"pattern": "^\\\\d+,\\\\d+$"' in html
    assert 'Expected: ' in html
    assert 'Example: ' in html
    assert 'setValueValidationState_id_config' in html
    assert 'coerceValueForStorage_id_config' in html
def test_key_value_widget_accepts_common_boolean_spellings(monkeypatch):
    """Boolean config keys should accept True/False plus '1'/'0' spellings."""
    monkeypatch.setattr(
        KeyValueWidget,
        '_get_config_options',
        lambda self: {
            'DEBUG': {
                'plugin': 'base',
                'type': 'boolean',
                'default': False,
                'description': 'Enable debug mode',
            },
        },
    )
    html = str(KeyValueWidget().render('config', {'DEBUG': 'True'}, attrs={'id': 'id_config'}))
    # Dropdown offers canonical spellings; JS normalizes case and 1/0 forms
    assert "enumValues = ['True', 'False']" in html
    assert "raw.toLowerCase()" in html
    assert "lowered === 'true' || raw === '1'" in html
    assert "lowered === 'false' || raw === '0'" in html
def test_key_value_widget_shows_array_and_object_examples_and_binary_rules(monkeypatch):
    """Widget help text should show type-appropriate examples, and *_BINARY
    keys should get binary-path validation."""
    monkeypatch.setattr(
        KeyValueWidget,
        '_get_config_options',
        lambda self: {
            'WGET_ARGS_EXTRA': {
                'plugin': 'wget',
                'type': 'array',
                'default': [],
                'description': 'Extra arguments to append to wget command',
            },
            'SAVE_ALLOWLIST': {
                'plugin': 'base',
                'type': 'object',
                'default': {},
                'description': 'Regex allowlist mapped to enabled methods',
            },
            'WGET_BINARY': {
                'plugin': 'wget',
                'type': 'string',
                'default': 'wget',
                'description': 'Path to wget binary',
            },
        },
    )
    html = str(KeyValueWidget().render('config', {}, attrs={'id': 'id_config'}))
    # Per-type example snippets in the rendered help
    assert 'Example: ["--extra-arg"]' in html
    assert 'Example: {"^https://example\\\\.com": ["wget"]}' in html
    assert 'Example: wget or /usr/bin/wget' in html
    # Binary-path validation is keyed off the _BINARY suffix
    assert 'validateBinaryValue_id_config' in html
    assert "meta.key.endsWith('_BINARY')" in html
    assert "Binary paths cannot contain quotes" in html
def test_key_value_widget_falls_back_to_binary_validation_for_unknown_binary_keys(monkeypatch):
    """Keys ending in _BINARY that are absent from the schema should still get
    binary-path validation via the JS fallback."""
    monkeypatch.setattr(
        KeyValueWidget,
        '_get_config_options',
        lambda self: {
            'CHROME_BINARY': {
                'plugin': 'base',
                'type': 'string',
                'default': '',
                'description': 'Resolved Chromium/Chrome binary path shared across plugins',
            },
        },
    )
    # NODE_BINARY is intentionally NOT in the schema above
    html = str(
        KeyValueWidget().render(
            'config',
            {'NODE_BINARY': '/opt/homebrew/bin/node'},
            attrs={'id': 'id_config'},
        )
    )
    assert 'function getMetaForKey_id_config' in html
    assert "if (key.endsWith('_BINARY'))" in html
    assert 'Path to binary executable' in html

View File

@@ -0,0 +1,127 @@
import pytest
from django.contrib.admin.sites import AdminSite
from uuid import uuid4
pytestmark = pytest.mark.django_db
def _create_snapshot():
    """Create a minimal Crawl plus a started Snapshot for admin-link tests."""
    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.crawls.models import Crawl
    from archivebox.core.models import Snapshot
    parent_crawl = Crawl.objects.create(
        urls="https://example.com",
        created_by_id=get_or_create_system_user_pk(),
    )
    snapshot = Snapshot.objects.create(
        url="https://example.com",
        crawl=parent_crawl,
        status=Snapshot.StatusChoices.STARTED,
    )
    return snapshot
def _create_machine():
    """Create a Machine row with unique guid/hw_uuid and dummy hardware info."""
    from archivebox.machine.models import Machine
    return Machine.objects.create(
        guid=f'test-guid-{uuid4()}',  # unique per call to avoid collisions
        hostname='test-host',
        hw_in_docker=False,
        hw_in_vm=False,
        hw_manufacturer='Test',
        hw_product='Test Product',
        hw_uuid=f'test-hw-{uuid4()}',
        os_arch='arm64',
        os_family='darwin',
        os_platform='macOS',
        os_release='14.0',
        os_kernel='Darwin',
        stats={},
        config={},
    )
def _create_iface(machine):
    """Create a NetworkInterface row attached to *machine* with dummy geo/ISP data."""
    from archivebox.machine.models import NetworkInterface
    return NetworkInterface.objects.create(
        machine=machine,
        mac_address='00:11:22:33:44:66',
        ip_public='203.0.113.11',  # TEST-NET-3 documentation address
        ip_local='10.0.0.11',
        dns_server='1.1.1.1',
        hostname='test-host',
        iface='en0',
        isp='Test ISP',
        city='Test City',
        region='Test Region',
        country='Test Country',
    )
def test_archiveresult_admin_links_plugin_and_process():
    """ArchiveResult admin columns should link to the plugin page and the
    owning Process change view."""
    from archivebox.core.admin_archiveresults import ArchiveResultAdmin
    from archivebox.core.models import ArchiveResult
    from archivebox.machine.models import Process
    snapshot = _create_snapshot()
    iface = _create_iface(_create_machine())
    process = Process.objects.create(
        machine=iface.machine,
        iface=iface,
        process_type=Process.TypeChoices.HOOK,
        pwd=str(snapshot.output_dir / 'wget'),
        cmd=['/tmp/on_Snapshot__06_wget.finite.bg.py', '--url=https://example.com'],
        status=Process.StatusChoices.EXITED,
    )
    result = ArchiveResult.objects.create(
        snapshot=snapshot,
        plugin='wget',
        hook_name='on_Snapshot__06_wget.finite.bg.py',
        process=process,
        status=ArchiveResult.StatusChoices.SUCCEEDED,
    )
    admin = ArchiveResultAdmin(ArchiveResult, AdminSite())
    # Render the two custom list_display callables directly
    plugin_html = str(admin.plugin_with_icon(result))
    process_html = str(admin.process_link(result))
    assert '/admin/environment/plugins/builtin.wget/' in plugin_html
    assert f'/admin/machine/process/{process.id}/change' in process_html
def test_process_admin_links_binary_and_iface():
    """Process admin columns should link to the Binary and NetworkInterface
    change views."""
    from archivebox.machine.admin import ProcessAdmin
    from archivebox.machine.models import Binary, Process
    machine = _create_machine()
    iface = _create_iface(machine)
    binary = Binary.objects.create(
        machine=machine,
        name='wget',
        abspath='/usr/local/bin/wget',
        version='1.21.2',
        binprovider='env',
        binproviders='env',
        status=Binary.StatusChoices.INSTALLED,
    )
    process = Process.objects.create(
        machine=machine,
        iface=iface,
        binary=binary,
        process_type=Process.TypeChoices.HOOK,
        pwd='/tmp/wget',
        cmd=['/tmp/on_Snapshot__06_wget.finite.bg.py', '--url=https://example.com'],
        status=Process.StatusChoices.EXITED,
    )
    admin = ProcessAdmin(Process, AdminSite())
    binary_html = str(admin.binary_link(process))
    iface_html = str(admin.iface_link(process))
    assert f'/admin/machine/binary/{binary.id}/change' in binary_html
    assert f'/admin/machine/networkinterface/{iface.id}/change' in iface_html

View File

@@ -9,11 +9,13 @@ Tests cover:
"""
import pytest
import uuid
from typing import cast
from django.test import override_settings
from django.urls import reverse
from django.contrib.auth import get_user_model
from django.contrib.auth.models import UserManager
from django.utils import timezone
pytestmark = pytest.mark.django_db
@@ -195,6 +197,232 @@ class TestAdminSnapshotListView:
assert b'snapshot-view-list' in response.content
assert b'snapshot-view-grid' in response.content
def test_binary_change_view_renders(self, client, admin_user, db):
    """Binary admin change form should load without FieldError."""
    from archivebox.machine.models import Machine, Binary
    machine = Machine.objects.create(
        guid=f'test-guid-{uuid.uuid4()}',
        hostname='test-host',
        hw_in_docker=False,
        hw_in_vm=False,
        hw_manufacturer='Test',
        hw_product='Test Product',
        hw_uuid=f'test-hw-{uuid.uuid4()}',
        os_arch='x86_64',
        os_family='darwin',
        os_platform='darwin',
        os_release='test',
        os_kernel='test-kernel',
        stats={},
    )
    binary = Binary.objects.create(
        machine=machine,
        name='gallery-dl',
        binproviders='env',
        binprovider='env',
        abspath='/opt/homebrew/bin/gallery-dl',
        version='1.26.9',
        sha256='abc123',
        status=Binary.StatusChoices.INSTALLED,
    )
    client.login(username='testadmin', password='testpassword')
    # Hand-built URL rather than reverse() — exercises the raw admin route
    url = f'/admin/machine/binary/{binary.pk}/change/'
    response = client.get(url, HTTP_HOST=ADMIN_HOST)
    assert response.status_code == 200
    assert b'gallery-dl' in response.content
def test_change_view_renders_real_redo_failed_action(self, client, admin_user, snapshot):
    """Snapshot change page should expose the per-object redo-failed action URL."""
    client.login(username='testadmin', password='testpassword')
    change_url = reverse('admin:core_snapshot_change', args=[snapshot.pk])
    resp = client.get(change_url, HTTP_HOST=ADMIN_HOST)
    assert resp.status_code == 200
    expected_action = f'/admin/core/snapshot/{snapshot.pk}/redo-failed/'.encode()
    assert expected_action in resp.content
def test_redo_failed_action_requeues_snapshot(self, client, admin_user, snapshot, monkeypatch):
    """Redo-failed admin action should queue a background re-archive and
    redirect back to the snapshot change page."""
    import archivebox.core.admin_snapshots as admin_snapshots
    queued = []
    # Stub out the real background task and record its call arguments
    def fake_bg_archive_snapshot(obj, overwrite=False, methods=None):
        queued.append((str(obj.pk), overwrite, methods))
        return 1
    monkeypatch.setattr(admin_snapshots, 'bg_archive_snapshot', fake_bg_archive_snapshot)
    client.login(username='testadmin', password='testpassword')
    url = reverse('admin:core_snapshot_redo_failed', args=[snapshot.pk])
    response = client.post(url, HTTP_HOST=ADMIN_HOST)
    assert response.status_code == 302
    # One call, defaults preserved (no overwrite, all methods)
    assert queued == [(str(snapshot.pk), False, None)]
    assert response['Location'].endswith(f'/admin/core/snapshot/{snapshot.pk}/change/')
class TestArchiveResultAdminListView:
    """Tests for the ArchiveResult admin changelist rendering."""

    def test_list_view_renders_readonly_tags_and_noresults_status(self, client, admin_user, snapshot):
        """Changelist should show snapshot tags as read-only and render the
        human-readable NORESULTS status label."""
        from archivebox.core.models import ArchiveResult, Tag
        tag = Tag.objects.create(name='Alpha Research')
        snapshot.tags.add(tag)
        ArchiveResult.objects.create(
            snapshot=snapshot,
            plugin='title',
            status=ArchiveResult.StatusChoices.NORESULTS,
            output_str='No title found',
        )
        client.login(username='testadmin', password='testpassword')
        response = client.get(reverse('admin:core_archiveresult_changelist'), HTTP_HOST=ADMIN_HOST)
        assert response.status_code == 200
        assert b'Alpha Research' in response.content
        assert b'tag-editor-inline readonly' in response.content
        assert b'No Results' in response.content

    def test_archiveresult_model_has_no_retry_at_field(self):
        """Guard against the removed retry_at field reappearing on the model."""
        from archivebox.core.models import ArchiveResult
        assert 'retry_at' not in {field.name for field in ArchiveResult._meta.fields}
class TestLiveProgressView:
    """Tests for the live_progress JSON endpoint's merging of Process rows
    and ArchiveResult rows into per-crawl/per-snapshot progress payloads."""

    def test_live_progress_routes_crawl_process_rows_to_crawl_setup(self, client, admin_user, snapshot, db):
        """on_Crawl__* hook processes should appear under the crawl's
        setup_plugins, not under any snapshot's plugin list."""
        import archivebox.machine.models as machine_models
        from archivebox.machine.models import Machine, Process
        machine_models._CURRENT_MACHINE = None  # reset cached current-machine singleton
        machine = Machine.current()
        Process.objects.create(
            machine=machine,
            process_type=Process.TypeChoices.HOOK,
            status=Process.StatusChoices.RUNNING,
            pid=43210,
            cmd=['/plugins/chrome/on_Crawl__91_chrome_wait.js', '--url=https://example.com'],
            env={
                'CRAWL_ID': str(snapshot.crawl_id),
                'SNAPSHOT_ID': str(snapshot.id),
            },
            started_at=timezone.now(),
        )
        client.login(username='testadmin', password='testpassword')
        response = client.get(reverse('live_progress'), HTTP_HOST=ADMIN_HOST)
        assert response.status_code == 200
        payload = response.json()
        active_crawl = next(crawl for crawl in payload['active_crawls'] if crawl['id'] == str(snapshot.crawl_id))
        setup_entry = next(item for item in active_crawl['setup_plugins'] if item['source'] == 'process')
        active_snapshot = next(item for item in active_crawl['active_snapshots'] if item['id'] == str(snapshot.id))
        assert setup_entry['label'] == 'chrome wait'
        assert setup_entry['status'] == 'started'
        assert active_crawl['worker_pid'] == 43210
        # Crawl-level hook must not leak into the snapshot's plugin list
        assert active_snapshot['all_plugins'] == []

    def test_live_progress_uses_snapshot_process_rows_before_archiveresults(self, client, admin_user, snapshot, db):
        """With no ArchiveResult rows yet, live Process rows alone should
        populate the snapshot's plugin list."""
        import archivebox.machine.models as machine_models
        from archivebox.machine.models import Machine, Process
        machine_models._CURRENT_MACHINE = None  # reset cached current-machine singleton
        machine = Machine.current()
        Process.objects.create(
            machine=machine,
            process_type=Process.TypeChoices.HOOK,
            status=Process.StatusChoices.RUNNING,
            pid=43211,
            cmd=['/plugins/title/on_Snapshot__10_title.py', '--url=https://example.com'],
            env={
                'CRAWL_ID': str(snapshot.crawl_id),
                'SNAPSHOT_ID': str(snapshot.id),
            },
            started_at=timezone.now(),
        )
        client.login(username='testadmin', password='testpassword')
        response = client.get(reverse('live_progress'), HTTP_HOST=ADMIN_HOST)
        assert response.status_code == 200
        payload = response.json()
        active_crawl = next(crawl for crawl in payload['active_crawls'] if crawl['id'] == str(snapshot.crawl_id))
        active_snapshot = next(item for item in active_crawl['active_snapshots'] if item['id'] == str(snapshot.id))
        assert active_snapshot['all_plugins'][0]['source'] == 'process'
        assert active_snapshot['all_plugins'][0]['label'] == 'title'
        assert active_snapshot['all_plugins'][0]['status'] == 'started'
        assert active_snapshot['worker_pid'] == 43211

    def test_live_progress_merges_process_rows_with_archiveresults_when_present(self, client, admin_user, snapshot, db):
        """Process rows and ArchiveResult rows for the same snapshot should be
        merged, not deduplicated away."""
        import archivebox.machine.models as machine_models
        from archivebox.core.models import ArchiveResult
        from archivebox.machine.models import Machine, Process
        machine_models._CURRENT_MACHINE = None  # reset cached current-machine singleton
        machine = Machine.current()
        Process.objects.create(
            machine=machine,
            process_type=Process.TypeChoices.HOOK,
            status=Process.StatusChoices.RUNNING,
            pid=54321,
            cmd=['/plugins/chrome/on_Snapshot__11_chrome_wait.js', '--url=https://example.com'],
            env={
                'CRAWL_ID': str(snapshot.crawl_id),
                'SNAPSHOT_ID': str(snapshot.id),
            },
            started_at=timezone.now(),
        )
        ArchiveResult.objects.create(
            snapshot=snapshot,
            plugin='title',
            status=ArchiveResult.StatusChoices.STARTED,
        )
        client.login(username='testadmin', password='testpassword')
        response = client.get(reverse('live_progress'), HTTP_HOST=ADMIN_HOST)
        assert response.status_code == 200
        payload = response.json()
        active_crawl = next(crawl for crawl in payload['active_crawls'] if crawl['id'] == str(snapshot.crawl_id))
        active_snapshot = next(item for item in active_crawl['active_snapshots'] if item['id'] == str(snapshot.id))
        sources = {item['source'] for item in active_snapshot['all_plugins']}
        plugins = {item['plugin'] for item in active_snapshot['all_plugins']}
        assert sources == {'archiveresult', 'process'}
        assert 'title' in plugins
        assert 'chrome' in plugins

    def test_live_progress_omits_pid_for_exited_process_rows(self, client, admin_user, snapshot, db):
        """Exited processes should report a final status and drop the pid key."""
        import archivebox.machine.models as machine_models
        from archivebox.machine.models import Machine, Process
        machine_models._CURRENT_MACHINE = None  # reset cached current-machine singleton
        machine = Machine.current()
        Process.objects.create(
            machine=machine,
            process_type=Process.TypeChoices.HOOK,
            status=Process.StatusChoices.EXITED,
            exit_code=0,
            pid=99999,
            cmd=['/plugins/title/on_Snapshot__10_title.py', '--url=https://example.com'],
            env={
                'CRAWL_ID': str(snapshot.crawl_id),
                'SNAPSHOT_ID': str(snapshot.id),
            },
            started_at=timezone.now(),
            ended_at=timezone.now(),
        )
        client.login(username='testadmin', password='testpassword')
        response = client.get(reverse('live_progress'), HTTP_HOST=ADMIN_HOST)
        assert response.status_code == 200
        payload = response.json()
        active_crawl = next(crawl for crawl in payload['active_crawls'] if crawl['id'] == str(snapshot.crawl_id))
        active_snapshot = next(item for item in active_crawl['active_snapshots'] if item['id'] == str(snapshot.id))
        process_entry = next(item for item in active_snapshot['all_plugins'] if item['source'] == 'process')
        # exit_code 0 maps to a succeeded status; live pid is no longer shown
        assert process_entry['status'] == 'succeeded'
        assert 'pid' not in process_entry
class TestAdminSnapshotSearch:
"""Tests for admin snapshot search functionality."""

View File

@@ -0,0 +1,305 @@
from pathlib import Path
from uuid import uuid4
import pytest
from django.db import connection
from abx_dl.events import ProcessCompletedEvent, ProcessStartedEvent
from abx_dl.orchestrator import create_bus
pytestmark = pytest.mark.django_db
def _cleanup_machine_process_rows() -> None:
    """Delete all machine_process rows via raw SQL so later tests start clean."""
    with connection.cursor() as cur:
        cur.execute("DELETE FROM machine_process")
def _create_snapshot():
    """Create a minimal Crawl plus a started Snapshot for projection tests."""
    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.crawls.models import Crawl
    from archivebox.core.models import Snapshot
    crawl = Crawl.objects.create(
        urls="https://example.com",
        created_by_id=get_or_create_system_user_pk(),
    )
    return Snapshot.objects.create(
        url="https://example.com",
        crawl=crawl,
        status=Snapshot.StatusChoices.STARTED,
    )
def _create_machine():
    """Create a Machine row with unique guid/hw_uuid and dummy hardware info."""
    from archivebox.machine.models import Machine
    return Machine.objects.create(
        guid=f'test-guid-{uuid4()}',  # unique per call to avoid collisions
        hostname='test-host',
        hw_in_docker=False,
        hw_in_vm=False,
        hw_manufacturer='Test',
        hw_product='Test Product',
        hw_uuid=f'test-hw-{uuid4()}',
        os_arch='arm64',
        os_family='darwin',
        os_platform='macOS',
        os_release='14.0',
        os_kernel='Darwin',
        stats={},
        config={},
    )
def _create_iface(machine):
    """Create a NetworkInterface row attached to *machine* with dummy geo/ISP data."""
    from archivebox.machine.models import NetworkInterface
    return NetworkInterface.objects.create(
        machine=machine,
        mac_address='00:11:22:33:44:55',
        ip_public='203.0.113.10',  # TEST-NET-3 documentation address
        ip_local='10.0.0.10',
        dns_server='1.1.1.1',
        hostname='test-host',
        iface='en0',
        isp='Test ISP',
        city='Test City',
        region='Test Region',
        country='Test Country',
    )
def test_process_completed_projects_inline_archiveresult():
    """A ProcessCompletedEvent whose stdout carries an inline ArchiveResult
    JSONL record should be projected into a SUCCEEDED ArchiveResult row."""
    from archivebox.core.models import ArchiveResult
    from archivebox.services.archive_result_service import ArchiveResultService, _collect_output_metadata
    from archivebox.services.process_service import ProcessService
    snapshot = _create_snapshot()
    # Real output file on disk so metadata collection has something to find
    plugin_dir = Path(snapshot.output_dir) / "wget"
    plugin_dir.mkdir(parents=True, exist_ok=True)
    (plugin_dir / "index.html").write_text("<html>ok</html>")
    bus = create_bus(name="test_inline_archiveresult")
    process_service = ProcessService(bus)
    service = ArchiveResultService(bus, process_service=process_service)
    event = ProcessCompletedEvent(
        plugin_name="wget",
        hook_name="on_Snapshot__06_wget.finite.bg",
        stdout='{"snapshot_id":"%s","type":"ArchiveResult","status":"succeeded","output_str":"wget/index.html"}\n' % snapshot.id,
        stderr="",
        exit_code=0,
        output_dir=str(plugin_dir),
        output_files=["index.html"],
        process_id="proc-inline",
        snapshot_id=str(snapshot.id),
        start_ts="2026-03-22T12:00:00+00:00",
        end_ts="2026-03-22T12:00:01+00:00",
    )
    output_files, output_size, output_mimetypes = _collect_output_metadata(plugin_dir)
    # Call the projector directly with the parsed inline record
    service._project_from_process_completed(
        event,
        {
            "snapshot_id": str(snapshot.id),
            "plugin": "wget",
            "hook_name": "on_Snapshot__06_wget.finite.bg",
            "status": "succeeded",
            "output_str": "wget/index.html",
        },
        output_files,
        output_size,
        output_mimetypes,
    )
    result = ArchiveResult.objects.get(snapshot=snapshot, plugin="wget", hook_name="on_Snapshot__06_wget.finite.bg")
    assert result.status == ArchiveResult.StatusChoices.SUCCEEDED
    assert result.output_str == "wget/index.html"
    assert "index.html" in result.output_files
    # Remove Process rows created as a side effect so later tests start clean
    _cleanup_machine_process_rows()
def test_process_completed_projects_synthetic_failed_archiveresult():
    """A failed hook (non-zero exit, no inline record) should be projected as
    a FAILED ArchiveResult carrying the stderr message."""
    from archivebox.core.models import ArchiveResult
    from archivebox.services.archive_result_service import ArchiveResultService, _collect_output_metadata
    from archivebox.services.process_service import ProcessService
    snapshot = _create_snapshot()
    plugin_dir = Path(snapshot.output_dir) / "chrome"
    plugin_dir.mkdir(parents=True, exist_ok=True)
    bus = create_bus(name="test_synthetic_archiveresult")
    process_service = ProcessService(bus)
    service = ArchiveResultService(bus, process_service=process_service)
    event = ProcessCompletedEvent(
        plugin_name="chrome",
        hook_name="on_Snapshot__11_chrome_wait",
        stdout="",
        stderr="Hook timed out after 60 seconds",
        exit_code=-1,
        output_dir=str(plugin_dir),
        output_files=[],
        process_id="proc-failed",
        snapshot_id=str(snapshot.id),
        start_ts="2026-03-22T12:00:00+00:00",
        end_ts="2026-03-22T12:01:00+00:00",
    )
    output_files, output_size, output_mimetypes = _collect_output_metadata(plugin_dir)
    # Synthetic record built by the caller (no inline JSONL from the hook)
    service._project_from_process_completed(
        event,
        {
            "plugin": "chrome",
            "hook_name": "on_Snapshot__11_chrome_wait",
            "status": "failed",
            "output_str": "Hook timed out after 60 seconds",
            "error": "Hook timed out after 60 seconds",
        },
        output_files,
        output_size,
        output_mimetypes,
    )
    result = ArchiveResult.objects.get(snapshot=snapshot, plugin="chrome", hook_name="on_Snapshot__11_chrome_wait")
    assert result.status == ArchiveResult.StatusChoices.FAILED
    assert result.output_str == "Hook timed out after 60 seconds"
    assert "Hook timed out" in result.notes
    _cleanup_machine_process_rows()
def test_process_completed_projects_noresults_archiveresult():
    """An inline record with status 'noresults' should be projected into a
    NORESULTS ArchiveResult row."""
    from archivebox.core.models import ArchiveResult
    from archivebox.services.archive_result_service import ArchiveResultService, _collect_output_metadata
    from archivebox.services.process_service import ProcessService
    snapshot = _create_snapshot()
    plugin_dir = Path(snapshot.output_dir) / "title"
    plugin_dir.mkdir(parents=True, exist_ok=True)
    bus = create_bus(name="test_noresults_archiveresult")
    process_service = ProcessService(bus)
    service = ArchiveResultService(bus, process_service=process_service)
    event = ProcessCompletedEvent(
        plugin_name="title",
        hook_name="on_Snapshot__54_title.js",
        stdout='{"snapshot_id":"%s","type":"ArchiveResult","status":"noresults","output_str":"No title found"}\n' % snapshot.id,
        stderr="",
        exit_code=0,
        output_dir=str(plugin_dir),
        output_files=[],
        process_id="proc-noresults",
        snapshot_id=str(snapshot.id),
        start_ts="2026-03-22T12:00:00+00:00",
        end_ts="2026-03-22T12:00:01+00:00",
    )
    output_files, output_size, output_mimetypes = _collect_output_metadata(plugin_dir)
    service._project_from_process_completed(
        event,
        {
            "snapshot_id": str(snapshot.id),
            "plugin": "title",
            "hook_name": "on_Snapshot__54_title.js",
            "status": "noresults",
            "output_str": "No title found",
        },
        output_files,
        output_size,
        output_mimetypes,
    )
    result = ArchiveResult.objects.get(snapshot=snapshot, plugin="title", hook_name="on_Snapshot__54_title.js")
    assert result.status == ArchiveResult.StatusChoices.NORESULTS
    assert result.output_str == "No title found"
    _cleanup_machine_process_rows()
def test_process_started_hydrates_binary_and_iface_from_existing_binary_records(monkeypatch):
    """A started hook process should be linked to the existing Binary record
    matching its *_BINARY env var, and to the current NetworkInterface."""
    from archivebox.machine.models import Binary, NetworkInterface
    from archivebox.services.process_service import ProcessService
    machine = _create_machine()
    iface = _create_iface(machine)
    # Pin the "current" interface to our fixture row
    monkeypatch.setattr(NetworkInterface, 'current', classmethod(lambda cls, refresh=False: iface))
    binary = Binary.objects.create(
        machine=machine,
        name='postlight-parser',
        abspath='/tmp/postlight-parser',
        version='2.2.3',
        binprovider='npm',
        binproviders='npm',
        status=Binary.StatusChoices.INSTALLED,
    )
    bus = create_bus(name="test_process_started_binary_hydration")
    service = ProcessService(bus)
    event = ProcessStartedEvent(
        plugin_name="mercury",
        hook_name="on_Snapshot__57_mercury.py",
        hook_path="/plugins/mercury/on_Snapshot__57_mercury.py",
        hook_args=["--url=https://example.com"],
        output_dir="/tmp/mercury",
        env={
            # MERCURY_BINARY matches the Binary row above; NODE_BINARY does not
            "MERCURY_BINARY": binary.abspath,
            "NODE_BINARY": "/tmp/node",
        },
        timeout=60,
        pid=4321,
        process_id="proc-mercury",
        snapshot_id="",
        start_ts="2026-03-22T12:00:00+00:00",
    )
    service._project_started(event)
    process = service._get_or_create_process(event)
    assert process.binary_id == binary.id
    assert process.iface_id == iface.id
def test_process_started_uses_node_binary_for_js_hooks_without_plugin_binary(monkeypatch):
    """A .js hook with no plugin-specific *_BINARY should fall back to linking
    the NODE_BINARY record."""
    from archivebox.machine.models import Binary, NetworkInterface
    from archivebox.services.process_service import ProcessService
    machine = _create_machine()
    iface = _create_iface(machine)
    monkeypatch.setattr(NetworkInterface, 'current', classmethod(lambda cls, refresh=False: iface))
    node = Binary.objects.create(
        machine=machine,
        name='node',
        abspath='/tmp/node',
        version='22.0.0',
        binprovider='env',
        binproviders='env',
        status=Binary.StatusChoices.INSTALLED,
    )
    bus = create_bus(name="test_process_started_node_fallback")
    service = ProcessService(bus)
    event = ProcessStartedEvent(
        plugin_name="parse_dom_outlinks",
        hook_name="on_Snapshot__75_parse_dom_outlinks.js",
        hook_path="/plugins/parse_dom_outlinks/on_Snapshot__75_parse_dom_outlinks.js",
        hook_args=["--url=https://example.com"],
        output_dir="/tmp/parse-dom-outlinks",
        env={
            "NODE_BINARY": node.abspath,  # only the runtime binary is declared
        },
        timeout=60,
        pid=9876,
        process_id="proc-parse-dom-outlinks",
        snapshot_id="",
        start_ts="2026-03-22T12:00:00+00:00",
    )
    service._project_started(event)
    process = service._get_or_create_process(event)
    assert process.binary_id == node.id
    assert process.iface_id == iface.id

View File

@@ -44,6 +44,27 @@ def test_add_single_url_creates_snapshot_in_db(tmp_path, process, disable_extrac
assert snapshots[0][0] == 'https://example.com'
def test_add_bg_creates_root_snapshot_rows_immediately(tmp_path, process, disable_extractors_dict):
    """Background add should create root snapshots immediately so the queue is visible in the DB."""
    os.chdir(tmp_path)
    result = subprocess.run(
        ['archivebox', 'add', '--bg', '--depth=0', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )
    assert result.returncode == 0
    # Inspect the index DB directly — no workers have run yet
    conn = sqlite3.connect("index.sqlite3")
    c = conn.cursor()
    snapshots = c.execute("SELECT url, status FROM core_snapshot").fetchall()
    conn.close()
    assert len(snapshots) == 1
    assert snapshots[0][0] == 'https://example.com'
    # Row exists but stays 'queued' until a worker picks it up
    assert snapshots[0][1] == 'queued'
def test_add_creates_crawl_record(tmp_path, process, disable_extractors_dict):
"""Test that add command creates a Crawl record in the database."""
os.chdir(tmp_path)
@@ -217,6 +238,32 @@ def test_add_records_selected_persona_on_crawl(tmp_path, process, disable_extrac
assert persona_id
assert default_persona == 'Default'
assert (tmp_path / "personas" / "Default" / "chrome_user_data").is_dir()
def test_add_records_url_filter_overrides_on_crawl(tmp_path, process, disable_extractors_dict):
    """CLI --domain-allowlist/--domain-denylist flags should be persisted as
    URL_ALLOWLIST/URL_DENYLIST in the Crawl's config JSON."""
    os.chdir(tmp_path)
    result = subprocess.run(
        [
            'archivebox', 'add', '--index-only', '--depth=0',
            '--domain-allowlist=example.com,*.example.com',
            '--domain-denylist=static.example.com',
            'https://example.com',
        ],
        capture_output=True,
        env=disable_extractors_dict,
    )
    assert result.returncode == 0
    conn = sqlite3.connect("index.sqlite3")
    c = conn.cursor()
    # Pull the stored overrides straight out of the config JSON column
    allowlist, denylist = c.execute(
        "SELECT json_extract(config, '$.URL_ALLOWLIST'), json_extract(config, '$.URL_DENYLIST') FROM crawls_crawl LIMIT 1"
    ).fetchone()
    conn.close()
    assert allowlist == 'example.com,*.example.com'
    assert denylist == 'static.example.com'
    # NOTE(review): persona-dir assertion seems unrelated to URL filters —
    # presumably verifying add also bootstraps the Default persona; confirm intent
    assert (tmp_path / "personas" / "Default" / "chrome_extensions").is_dir()

View File

@@ -16,6 +16,13 @@ from archivebox.tests.conftest import (
create_test_url,
)
PROJECTOR_TEST_ENV = {
'PLUGINS': 'favicon',
'SAVE_FAVICON': 'True',
'USE_COLOR': 'False',
'SHOW_PROGRESS': 'False',
}
class TestArchiveResultCreate:
"""Tests for `archivebox archiveresult create`."""
@@ -38,13 +45,14 @@ class TestArchiveResultCreate:
assert code == 0, f"Command failed: {stderr}"
records = parse_jsonl_output(stdout2)
# Should have the Snapshot passed through and ArchiveResult created
# Should have the Snapshot passed through and an ArchiveResult request emitted
types = [r.get('type') for r in records]
assert 'Snapshot' in types
assert 'ArchiveResult' in types
ar = next(r for r in records if r['type'] == 'ArchiveResult')
assert ar['plugin'] == 'title'
assert 'id' not in ar
def test_create_with_specific_plugin(self, initialized_archive):
"""Create archive result for specific plugin."""
@@ -122,15 +130,33 @@ class TestArchiveResultList:
def test_list_filter_by_status(self, initialized_archive):
"""Filter archive results by status."""
# Create snapshot and archive result
# Create snapshot and materialize an archive result via the runner
url = create_test_url()
stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], data_dir=initialized_archive)
snapshot = parse_jsonl_output(stdout1)[0]
run_archivebox_cmd(
['archiveresult', 'create', '--plugin=title'],
stdout2, _, _ = run_archivebox_cmd(
['archiveresult', 'create', '--plugin=favicon'],
stdin=json.dumps(snapshot),
data_dir=initialized_archive,
)
run_archivebox_cmd(
['run'],
stdin=stdout2,
data_dir=initialized_archive,
timeout=120,
env=PROJECTOR_TEST_ENV,
)
created = parse_jsonl_output(
run_archivebox_cmd(
['archiveresult', 'list', '--plugin=favicon'],
data_dir=initialized_archive,
)[0]
)[0]
run_archivebox_cmd(
['archiveresult', 'update', '--status=queued'],
stdin=json.dumps(created),
data_dir=initialized_archive,
)
stdout, stderr, code = run_archivebox_cmd(
['archiveresult', 'list', '--status=queued'],
@@ -147,21 +173,28 @@ class TestArchiveResultList:
url = create_test_url()
stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], data_dir=initialized_archive)
snapshot = parse_jsonl_output(stdout1)[0]
run_archivebox_cmd(
['archiveresult', 'create', '--plugin=title'],
stdout2, _, _ = run_archivebox_cmd(
['archiveresult', 'create', '--plugin=favicon'],
stdin=json.dumps(snapshot),
data_dir=initialized_archive,
)
run_archivebox_cmd(
['run'],
stdin=stdout2,
data_dir=initialized_archive,
timeout=120,
env=PROJECTOR_TEST_ENV,
)
stdout, stderr, code = run_archivebox_cmd(
['archiveresult', 'list', '--plugin=title'],
['archiveresult', 'list', '--plugin=favicon'],
data_dir=initialized_archive,
)
assert code == 0
records = parse_jsonl_output(stdout)
for r in records:
assert r['plugin'] == 'title'
assert r['plugin'] == 'favicon'
def test_list_with_limit(self, initialized_archive):
"""Limit number of results."""
@@ -170,11 +203,18 @@ class TestArchiveResultList:
url = create_test_url()
stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], data_dir=initialized_archive)
snapshot = parse_jsonl_output(stdout1)[0]
run_archivebox_cmd(
['archiveresult', 'create', '--plugin=title'],
stdout2, _, _ = run_archivebox_cmd(
['archiveresult', 'create', '--plugin=favicon'],
stdin=json.dumps(snapshot),
data_dir=initialized_archive,
)
run_archivebox_cmd(
['run'],
stdin=stdout2,
data_dir=initialized_archive,
timeout=120,
env=PROJECTOR_TEST_ENV,
)
stdout, stderr, code = run_archivebox_cmd(
['archiveresult', 'list', '--limit=2'],
@@ -196,11 +236,22 @@ class TestArchiveResultUpdate:
snapshot = parse_jsonl_output(stdout1)[0]
stdout2, _, _ = run_archivebox_cmd(
['archiveresult', 'create', '--plugin=title'],
['archiveresult', 'create', '--plugin=favicon'],
stdin=json.dumps(snapshot),
data_dir=initialized_archive,
)
ar = next(r for r in parse_jsonl_output(stdout2) if r.get('type') == 'ArchiveResult')
stdout_run, _, _ = run_archivebox_cmd(
['run'],
stdin=stdout2,
data_dir=initialized_archive,
timeout=120,
env=PROJECTOR_TEST_ENV,
)
stdout_list, _, _ = run_archivebox_cmd(
['archiveresult', 'list', '--plugin=favicon'],
data_dir=initialized_archive,
)
ar = parse_jsonl_output(stdout_list)[0]
stdout3, stderr, code = run_archivebox_cmd(
['archiveresult', 'update', '--status=failed'],
@@ -225,11 +276,22 @@ class TestArchiveResultDelete:
snapshot = parse_jsonl_output(stdout1)[0]
stdout2, _, _ = run_archivebox_cmd(
['archiveresult', 'create', '--plugin=title'],
['archiveresult', 'create', '--plugin=favicon'],
stdin=json.dumps(snapshot),
data_dir=initialized_archive,
)
ar = next(r for r in parse_jsonl_output(stdout2) if r.get('type') == 'ArchiveResult')
stdout_run, _, _ = run_archivebox_cmd(
['run'],
stdin=stdout2,
data_dir=initialized_archive,
timeout=120,
env=PROJECTOR_TEST_ENV,
)
stdout_list, _, _ = run_archivebox_cmd(
['archiveresult', 'list', '--plugin=favicon'],
data_dir=initialized_archive,
)
ar = parse_jsonl_output(stdout_list)[0]
stdout, stderr, code = run_archivebox_cmd(
['archiveresult', 'delete'],
@@ -247,11 +309,22 @@ class TestArchiveResultDelete:
snapshot = parse_jsonl_output(stdout1)[0]
stdout2, _, _ = run_archivebox_cmd(
['archiveresult', 'create', '--plugin=title'],
['archiveresult', 'create', '--plugin=favicon'],
stdin=json.dumps(snapshot),
data_dir=initialized_archive,
)
ar = next(r for r in parse_jsonl_output(stdout2) if r.get('type') == 'ArchiveResult')
stdout_run, _, _ = run_archivebox_cmd(
['run'],
stdin=stdout2,
data_dir=initialized_archive,
timeout=120,
env=PROJECTOR_TEST_ENV,
)
stdout_list, _, _ = run_archivebox_cmd(
['archiveresult', 'list', '--plugin=favicon'],
data_dir=initialized_archive,
)
ar = parse_jsonl_output(stdout_list)[0]
stdout, stderr, code = run_archivebox_cmd(
['archiveresult', 'delete', '--yes'],

View File

@@ -83,7 +83,7 @@ class TestCrawlCreate:
assert code == 0
records = parse_jsonl_output(stdout)
assert 'test-tag' in records[0].get('tags_str', '')
assert 'test-tag' in records[0].get('tags', '')
def test_create_pass_through_other_types(self, initialized_archive):
"""Pass-through records of other types unchanged."""

View File

@@ -173,6 +173,20 @@ def test_collect_urls_from_plugins_reads_only_parser_outputs(tmp_path):
assert collect_urls_from_plugins(tmp_path / "nonexistent") == []
def test_collect_urls_from_plugins_trims_markdown_suffixes(tmp_path):
    """Markdown punctuation glued onto a scraped URL should be stripped on collection."""
    from archivebox.hooks import collect_urls_from_plugins

    parser_dir = tmp_path / "parse_html_urls"
    parser_dir.mkdir()
    (parser_dir / "urls.jsonl").write_text(
        '{"url":"https://docs.sweeting.me/s/youtube-favorites)**"}\n',
        encoding="utf-8",
    )

    results = collect_urls_from_plugins(tmp_path)

    assert len(results) == 1
    assert results[0]["url"] == "https://docs.sweeting.me/s/youtube-favorites"
def test_crawl_create_stdout_pipes_into_run(initialized_archive):
"""`archivebox crawl create | archivebox run` should queue and materialize snapshots."""
url = create_test_url()
@@ -269,8 +283,13 @@ def test_archiveresult_list_stdout_pipes_into_run(initialized_archive):
)
assert ar_create_code == 0, ar_create_stderr
created_records = parse_jsonl_output(ar_create_stdout)
archiveresult = next(record for record in created_records if record.get("type") == "ArchiveResult")
run_archivebox_cmd(
["run"],
stdin=ar_create_stdout,
data_dir=initialized_archive,
timeout=120,
env=PIPE_TEST_ENV,
)
list_stdout, list_stderr, list_code = run_archivebox_cmd(
["archiveresult", "list", "--plugin=favicon"],
@@ -278,6 +297,8 @@ def test_archiveresult_list_stdout_pipes_into_run(initialized_archive):
)
assert list_code == 0, list_stderr
_assert_stdout_is_jsonl_only(list_stdout)
listed_records = parse_jsonl_output(list_stdout)
archiveresult = next(record for record in listed_records if record.get("type") == "ArchiveResult")
run_stdout, run_stderr, run_code = run_archivebox_cmd(
["run"],

View File

@@ -8,6 +8,9 @@ Tests cover:
"""
import json
import sys
import pytest
from archivebox.tests.conftest import (
run_archivebox_cmd,
@@ -266,3 +269,182 @@ class TestRunEmpty:
assert code == 0
assert 'No records to process' in stderr
class TestRunDaemonMode:
    """`archivebox run --daemon` must drain piped-in stdin records before starting the runner."""

    def test_run_daemon_processes_stdin_before_runner(self, monkeypatch):
        """When stdin is a pipe, records are processed first, then the daemon runner starts."""
        from archivebox.cli import archivebox_run

        class FakeStdin:
            # Pretend stdin is a pipe (not a TTY) so the CLI tries to read records from it.
            def isatty(self):
                return False

        monkeypatch.setattr(sys, "stdin", FakeStdin())
        calls = []
        # Both stubs record their invocation order; `or 0` makes them return success.
        monkeypatch.setattr(
            archivebox_run,
            "process_stdin_records",
            lambda: calls.append("stdin") or 0,
        )
        monkeypatch.setattr(
            archivebox_run,
            "run_runner",
            lambda daemon=False: calls.append(f"runner:{daemon}") or 0,
        )
        with pytest.raises(SystemExit) as exit_info:
            archivebox_run.main.callback(daemon=True, crawl_id=None, snapshot_id=None, binary_id=None)
        assert exit_info.value.code == 0
        # stdin processing must happen before the runner, and daemon=True must be forwarded.
        assert calls == ["stdin", "runner:True"]

    def test_run_daemon_skips_runner_if_stdin_processing_fails(self, monkeypatch):
        """A non-zero exit from stdin processing aborts the command before the runner starts."""
        from archivebox.cli import archivebox_run

        class FakeStdin:
            def isatty(self):
                return False

        monkeypatch.setattr(sys, "stdin", FakeStdin())
        # Simulate stdin processing failing with exit code 1.
        monkeypatch.setattr(archivebox_run, "process_stdin_records", lambda: 1)
        # If the runner is invoked at all, this generator-throw raises AssertionError.
        monkeypatch.setattr(
            archivebox_run,
            "run_runner",
            lambda daemon=False: (_ for _ in ()).throw(AssertionError("runner should not start after stdin failure")),
        )
        with pytest.raises(SystemExit) as exit_info:
            archivebox_run.main.callback(daemon=True, crawl_id=None, snapshot_id=None, binary_id=None)
        assert exit_info.value.code == 1
@pytest.mark.django_db
class TestRecoverOrphanedCrawls:
    """Recovery of Crawls left in STARTED state with no live worker processes."""

    def test_recover_orphaned_crawl_requeues_started_crawl_without_active_processes(self):
        """A STARTED crawl with a queued snapshot and no processes gets a retry_at set."""
        from archivebox.base_models.models import get_or_create_system_user_pk
        from archivebox.crawls.models import Crawl
        from archivebox.core.models import Snapshot
        from archivebox.services.runner import recover_orphaned_crawls

        crawl = Crawl.objects.create(
            urls='https://example.com',
            created_by_id=get_or_create_system_user_pk(),
            status=Crawl.StatusChoices.STARTED,
            retry_at=None,  # orphaned: nothing scheduled to retry it
        )
        Snapshot.objects.create(
            url='https://example.com',
            crawl=crawl,
            status=Snapshot.StatusChoices.QUEUED,
            retry_at=None,
        )
        recovered = recover_orphaned_crawls()
        crawl.refresh_from_db()
        assert recovered == 1
        # Status stays STARTED but retry_at is repopulated so the runner picks it back up.
        assert crawl.status == Crawl.StatusChoices.STARTED
        assert crawl.retry_at is not None

    def test_recover_orphaned_crawl_skips_active_child_processes(self):
        """A crawl with a RUNNING hook process tied to it must not be touched."""
        import archivebox.machine.models as machine_models
        from django.utils import timezone
        from archivebox.base_models.models import get_or_create_system_user_pk
        from archivebox.crawls.models import Crawl
        from archivebox.core.models import Snapshot
        from archivebox.machine.models import Machine, Process
        from archivebox.services.runner import recover_orphaned_crawls

        crawl = Crawl.objects.create(
            urls='https://example.com',
            created_by_id=get_or_create_system_user_pk(),
            status=Crawl.StatusChoices.STARTED,
            retry_at=None,
        )
        snapshot = Snapshot.objects.create(
            url='https://example.com',
            crawl=crawl,
            status=Snapshot.StatusChoices.QUEUED,
            retry_at=None,
        )
        # Reset the module-level cache so Machine.current() resolves fresh for this test DB.
        machine_models._CURRENT_MACHINE = None
        machine = Machine.current()
        # A live hook process whose env links it to this crawl/snapshot.
        Process.objects.create(
            machine=machine,
            process_type=Process.TypeChoices.HOOK,
            status=Process.StatusChoices.RUNNING,
            cmd=['/plugins/chrome/on_Crawl__91_chrome_wait.js'],
            env={
                'CRAWL_ID': str(crawl.id),
                'SNAPSHOT_ID': str(snapshot.id),
            },
            started_at=timezone.now(),
        )
        recovered = recover_orphaned_crawls()
        crawl.refresh_from_db()
        assert recovered == 0
        assert crawl.retry_at is None

    def test_recover_orphaned_crawl_seals_when_all_snapshots_are_already_sealed(self):
        """If every child snapshot is SEALED, recovery seals the crawl instead of requeueing."""
        from archivebox.base_models.models import get_or_create_system_user_pk
        from archivebox.crawls.models import Crawl
        from archivebox.core.models import Snapshot
        from archivebox.services.runner import recover_orphaned_crawls

        crawl = Crawl.objects.create(
            urls='https://example.com',
            created_by_id=get_or_create_system_user_pk(),
            status=Crawl.StatusChoices.STARTED,
            retry_at=None,
        )
        Snapshot.objects.create(
            url='https://example.com',
            crawl=crawl,
            status=Snapshot.StatusChoices.SEALED,
            retry_at=None,
        )
        recovered = recover_orphaned_crawls()
        crawl.refresh_from_db()
        assert recovered == 1
        assert crawl.status == Crawl.StatusChoices.SEALED
        assert crawl.retry_at is None
@pytest.mark.django_db
class TestRecoverOrphanedSnapshots:
    """Recovery of Snapshots left in STARTED state with no live worker processes."""

    def test_recover_orphaned_snapshot_requeues_started_snapshot_without_active_processes(self):
        """An orphaned STARTED snapshot is requeued, and its sealed parent crawl reopens."""
        from archivebox.base_models.models import get_or_create_system_user_pk
        from archivebox.crawls.models import Crawl
        from archivebox.core.models import Snapshot
        from archivebox.services.runner import recover_orphaned_snapshots

        crawl = Crawl.objects.create(
            urls='https://example.com',
            created_by_id=get_or_create_system_user_pk(),
            status=Crawl.StatusChoices.SEALED,  # parent already sealed before recovery runs
            retry_at=None,
        )
        snapshot = Snapshot.objects.create(
            url='https://example.com',
            crawl=crawl,
            status=Snapshot.StatusChoices.STARTED,
            retry_at=None,
        )
        recovered = recover_orphaned_snapshots()
        snapshot.refresh_from_db()
        crawl.refresh_from_db()
        assert recovered == 1
        assert snapshot.status == Snapshot.StatusChoices.QUEUED
        assert snapshot.retry_at is not None
        # Recovery also reopens the parent crawl so the requeued snapshot gets processed.
        assert crawl.status == Crawl.StatusChoices.QUEUED
        assert crawl.retry_at is not None

View File

@@ -6,6 +6,15 @@ Verify server can start (basic smoke tests only, no full server testing).
import os
import subprocess
import sys
from unittest.mock import Mock
def test_sqlite_connections_use_explicit_30_second_busy_timeout():
    """Both the driver-level timeout and the busy_timeout PRAGMA must be pinned to 30s."""
    from archivebox.core.settings import SQLITE_CONNECTION_OPTIONS

    options = SQLITE_CONNECTION_OPTIONS["OPTIONS"]
    assert options["timeout"] == 30
    assert "PRAGMA busy_timeout = 30000;" in options["init_command"]
def test_server_shows_usage_info(tmp_path, process):
@@ -39,3 +48,64 @@ def test_server_init_flag(tmp_path, process):
assert result.returncode == 0
assert '--init' in result.stdout or 'init' in result.stdout.lower()
def test_runner_worker_uses_current_interpreter():
    """The supervised runner must invoke the active Python interpreter, not whatever is on PATH."""
    from archivebox.workers.supervisord_util import RUNNER_WORKER

    expected_command = f"{sys.executable} -m archivebox run --daemon"
    assert RUNNER_WORKER["command"] == expected_command
def test_reload_workers_use_current_interpreter_and_supervisord_managed_runner():
    """Reload-mode workers must spawn via sys.executable and wire the pidfile watcher."""
    from archivebox.workers.supervisord_util import RUNNER_WATCH_WORKER, RUNSERVER_WORKER

    pidfile = "/tmp/runserver.pid"
    runserver = RUNSERVER_WORKER("127.0.0.1", "8000", reload=True, pidfile=pidfile)
    watcher = RUNNER_WATCH_WORKER(pidfile)

    assert runserver["name"] == "worker_runserver"
    assert runserver["command"] == f"{sys.executable} -m archivebox manage runserver 127.0.0.1:8000"
    # The runserver env must flag reload mode and expose the pidfile for the watcher.
    for env_setting in (
        'ARCHIVEBOX_RUNSERVER="1"',
        'ARCHIVEBOX_AUTORELOAD="1"',
        'ARCHIVEBOX_RUNSERVER_PIDFILE="/tmp/runserver.pid"',
    ):
        assert env_setting in runserver["environment"]
    assert watcher["name"] == "worker_runner_watch"
    assert watcher["command"] == f"{sys.executable} -m archivebox manage runner_watch --pidfile=/tmp/runserver.pid"
def test_stop_existing_background_runner_cleans_up_and_stops_orchestrators():
    """Stopping the background runner should kill every running orchestrator process,
    fall back to terminate() when kill_tree() raises, stop both supervisord workers,
    and clean up stale Process rows once per runner."""
    from archivebox.cli.archivebox_server import stop_existing_background_runner
    # First runner: kill_tree succeeds.
    runner_a = Mock()
    runner_a.kill_tree = Mock()
    runner_a.terminate = Mock()
    # Second runner: kill_tree raises, forcing the terminate() fallback path.
    runner_b = Mock()
    runner_b.kill_tree = Mock(side_effect=RuntimeError("boom"))
    runner_b.terminate = Mock()
    # Stub Process model: filter(...).order_by(...) yields the two fake runners.
    process_model = Mock()
    process_model.StatusChoices.RUNNING = "running"
    process_model.TypeChoices.ORCHESTRATOR = "orchestrator"
    queryset = Mock()
    queryset.order_by.return_value = [runner_a, runner_b]
    process_model.objects.filter.return_value = queryset
    supervisor = Mock()
    stop_worker = Mock()
    log = Mock()
    stopped = stop_existing_background_runner(
        machine=Mock(),
        process_model=process_model,
        supervisor=supervisor,
        stop_worker_fn=stop_worker,
        log=log,
    )
    assert stopped == 2
    # Stale-process cleanup runs once per runner found.
    assert process_model.cleanup_stale_running.call_count == 2
    # Both supervisord-managed workers are stopped.
    stop_worker.assert_any_call(supervisor, "worker_runner")
    stop_worker.assert_any_call(supervisor, "worker_runner_watch")
    runner_a.kill_tree.assert_called_once_with(graceful_timeout=2.0)
    # runner_b fell back to terminate() after kill_tree raised.
    runner_b.terminate.assert_called_once_with(graceful_timeout=2.0)
    log.assert_called_once()

View File

@@ -74,7 +74,7 @@ class TestSnapshotCreate:
assert code == 0
records = parse_jsonl_output(stdout)
assert 'test-tag' in records[0].get('tags_str', '')
assert 'test-tag' in records[0].get('tags', '')
def test_create_pass_through_other_types(self, initialized_archive):
"""Pass-through records of other types unchanged."""

View File

@@ -0,0 +1,326 @@
from datetime import timedelta
from types import SimpleNamespace
import pytest
from django.test import RequestFactory
from django.utils import timezone
from archivebox.config import views as config_views
from archivebox.core import views as core_views
from archivebox.machine.models import Binary
pytestmark = pytest.mark.django_db
def test_get_db_binaries_by_name_collapses_youtube_dl_aliases(monkeypatch):
    """A legacy youtube-dl row and a yt-dlp row collapse to one canonical yt-dlp entry."""
    now = timezone.now()
    legacy_alias = SimpleNamespace(
        name='youtube-dl',
        version='',
        binprovider='',
        abspath='/usr/bin/youtube-dl',
        status=Binary.StatusChoices.INSTALLED,
        modified_at=now,
    )
    canonical = SimpleNamespace(
        name='yt-dlp',
        version='2026.03.01',
        binprovider='pip',
        abspath='/usr/bin/yt-dlp',
        status=Binary.StatusChoices.INSTALLED,
        modified_at=now + timedelta(seconds=1),  # newer than the alias row
    )
    monkeypatch.setattr(
        config_views.Binary, 'objects',
        SimpleNamespace(all=lambda: [legacy_alias, canonical]),
    )

    binaries = config_views.get_db_binaries_by_name()

    assert 'yt-dlp' in binaries
    assert 'youtube-dl' not in binaries
    assert binaries['yt-dlp'].version == '2026.03.01'
def test_binaries_list_view_uses_db_version_and_hides_youtube_dl_alias(monkeypatch):
    """The list view renders the canonical yt-dlp key even when the DB row is named youtube-dl."""
    request = RequestFactory().get('/admin/environment/binaries/')
    request.user = SimpleNamespace(is_superuser=True)
    # Persisted record carries the stale alias name but is keyed under 'yt-dlp'.
    db_binary = SimpleNamespace(
        name='youtube-dl',
        version='2026.03.01',
        binprovider='pip',
        abspath='/usr/bin/yt-dlp',
        status=Binary.StatusChoices.INSTALLED,
        sha256='',
        modified_at=timezone.now(),
    )
    monkeypatch.setattr(config_views, 'get_db_binaries_by_name', lambda: {'yt-dlp': db_binary})
    # __wrapped__ skips the view's decorator so we can inspect the raw context dict.
    context = config_views.binaries_list_view.__wrapped__(request)
    assert len(context['table']['Binary Name']) == 1
    assert str(context['table']['Binary Name'][0].link_item) == 'yt-dlp'
    assert context['table']['Found Version'][0] == '✅ 2026.03.01'
    assert context['table']['Provided By'][0] == 'pip'
    assert context['table']['Found Abspath'][0] == '/usr/bin/yt-dlp'
def test_binaries_list_view_only_shows_persisted_records(monkeypatch):
    """With no persisted Binary rows, every table column in the list view is empty."""
    monkeypatch.setattr(config_views, 'get_db_binaries_by_name', lambda: {})
    request = RequestFactory().get('/admin/environment/binaries/')
    request.user = SimpleNamespace(is_superuser=True)

    context = config_views.binaries_list_view.__wrapped__(request)

    table = context['table']
    for column in ('Binary Name', 'Found Version', 'Provided By', 'Found Abspath'):
        assert table[column] == []
def test_binary_detail_view_uses_canonical_db_record(monkeypatch):
    """Requesting the alias URL (youtube-dl) must render the canonical yt-dlp DB record."""
    request = RequestFactory().get('/admin/environment/binaries/youtube-dl/')
    request.user = SimpleNamespace(is_superuser=True)
    db_binary = SimpleNamespace(
        id='019d14cc-6c40-7793-8ff1-0f8bb050e8a3',
        name='yt-dlp',
        version='2026.03.01',
        binprovider='pip',
        abspath='/usr/bin/yt-dlp',
        sha256='abc123',
        status=Binary.StatusChoices.INSTALLED,
        modified_at=timezone.now(),
    )
    monkeypatch.setattr(config_views, 'get_db_binaries_by_name', lambda: {'yt-dlp': db_binary})
    # View is invoked with the alias key, but should resolve to the canonical record.
    context = config_views.binary_detail_view.__wrapped__(request, key='youtube-dl')
    section = context['data'][0]
    assert context['title'] == 'yt-dlp'
    assert section['fields']['name'] == 'yt-dlp'
    assert section['fields']['version'] == '2026.03.01'
    assert section['fields']['binprovider'] == 'pip'
    assert section['fields']['abspath'] == '/usr/bin/yt-dlp'
    # Description links to the Django admin change page, filtered to the canonical name.
    assert '/admin/machine/binary/019d14cc-6c40-7793-8ff1-0f8bb050e8a3/change/?_changelist_filters=q%3Dyt-dlp' in section['description']
def test_binary_detail_view_marks_unrecorded_binary(monkeypatch):
    """A binary with no persisted record shows explicit 'unrecorded' placeholder values."""
    monkeypatch.setattr(config_views, 'get_db_binaries_by_name', lambda: {})
    request = RequestFactory().get('/admin/environment/binaries/wget/')
    request.user = SimpleNamespace(is_superuser=True)

    context = config_views.binary_detail_view.__wrapped__(request, key='wget')

    section = context['data'][0]
    assert section['description'] == 'No persisted Binary record found'
    assert section['fields']['status'] == 'unrecorded'
    assert section['fields']['binprovider'] == 'not recorded'
def test_plugin_detail_view_renders_config_in_dedicated_sections(monkeypatch):
    """The plugin detail view must render five sections (summary, hooks, metadata,
    raw config.json, and config properties) with cross-links to docs and admin pages."""
    request = RequestFactory().get('/admin/environment/plugins/builtin.example/')
    request.user = SimpleNamespace(is_superuser=True)
    # Synthetic JSON-schema-style plugin config exercising required_*, output, and x-* keys.
    plugin_config = {
        'title': 'Example Plugin',
        'description': 'Example config used to verify plugin metadata rendering.',
        'type': 'object',
        'required_plugins': ['chrome'],
        'required_binaries': ['example-cli'],
        'output_mimetypes': ['text/plain', 'application/json'],
        'properties': {
            'EXAMPLE_ENABLED': {
                'type': 'boolean',
                'description': 'Enable the example plugin.',
                'x-fallback': 'CHECK_SSL_VALIDITY',
            },
            'EXAMPLE_BINARY': {
                'type': 'string',
                'default': 'gallery-dl',
                'description': 'Filesystem path for example output.',
                'x-aliases': ['USE_EXAMPLE_BINARY'],
            },
        },
    }
    monkeypatch.setattr(config_views, 'get_filesystem_plugins', lambda: {
        'builtin.example': {
            'id': 'builtin.example',
            'name': 'example',
            'source': 'builtin',
            'path': '/plugins/example',
            'hooks': ['on_Snapshot__01_example.py'],
            'config': plugin_config,
        }
    })
    monkeypatch.setattr(config_views, 'get_machine_admin_url', lambda: '/admin/machine/machine/test-machine/change/')
    context = config_views.plugin_detail_view.__wrapped__(request, key='builtin.example')
    assert context['title'] == 'example'
    assert len(context['data']) == 5
    summary_section, hooks_section, metadata_section, config_section, properties_section = context['data']
    # Section 1: summary fields plus path and external docs links in the description.
    assert summary_section['fields'] == {
        'id': 'builtin.example',
        'name': 'example',
        'source': 'builtin',
    }
    assert '/plugins/example' in summary_section['description']
    assert 'https://archivebox.github.io/abx-plugins/#example' in summary_section['description']
    # Section 2: hooks listed as GitHub source links.
    assert hooks_section['name'] == 'Hooks'
    assert hooks_section['fields'] == {}
    assert 'https://github.com/ArchiveBox/abx-plugins/tree/main/abx_plugins/plugins/example/on_Snapshot__01_example.py' in hooks_section['description']
    assert 'on_Snapshot__01_example.py' in hooks_section['description']
    # Section 3: metadata (title/description, required plugins/binaries, output mimetypes).
    assert metadata_section['name'] == 'Plugin Metadata'
    assert metadata_section['fields'] == {}
    assert 'Example Plugin' in metadata_section['description']
    assert 'Example config used to verify plugin metadata rendering.' in metadata_section['description']
    assert 'https://archivebox.github.io/abx-plugins/#chrome' in metadata_section['description']
    assert '/admin/environment/binaries/example-cli/' in metadata_section['description']
    assert 'text/plain' in metadata_section['description']
    assert 'application/json' in metadata_section['description']
    # Section 4: syntax-highlighted raw config.json dump.
    assert config_section['name'] == 'config.json'
    assert config_section['fields'] == {}
    assert '<pre style=' in config_section['description']
    assert 'EXAMPLE_ENABLED' in config_section['description']
    assert '<span style="color: #0550ae;">"properties"</span>' in config_section['description']
    # Section 5: per-property table linking config keys, fallbacks, aliases, and binaries.
    assert properties_section['name'] == 'Config Properties'
    assert properties_section['fields'] == {}
    assert '/admin/machine/machine/test-machine/change/' in properties_section['description']
    assert '/admin/machine/binary/' in properties_section['description']
    assert '/admin/environment/binaries/' in properties_section['description']
    assert 'EXAMPLE_ENABLED' in properties_section['description']
    assert 'boolean' in properties_section['description']
    assert 'Enable the example plugin.' in properties_section['description']
    assert '/admin/environment/config/EXAMPLE_ENABLED/' in properties_section['description']
    assert '/admin/environment/config/CHECK_SSL_VALIDITY/' in properties_section['description']
    assert '/admin/environment/config/USE_EXAMPLE_BINARY/' in properties_section['description']
    assert '/admin/environment/binaries/gallery-dl/' in properties_section['description']
    assert 'EXAMPLE_BINARY' in properties_section['description']
def test_get_config_definition_link_keeps_core_config_search_link(monkeypatch):
    """Config keys not owned by any plugin fall back to a GitHub code-search link."""
    monkeypatch.setattr(core_views, 'find_plugin_for_config_key', lambda key: None)

    url, label = core_views.get_config_definition_link('CHECK_SSL_VALIDITY')

    assert 'github.com/search' in url
    assert 'CHECK_SSL_VALIDITY' in url
    assert label == 'archivebox/config'
def test_get_config_definition_link_uses_plugin_config_json_for_plugin_options(monkeypatch):
    """Plugin-owned config keys should deep-link to that plugin's config.json on GitHub."""
    plugin_dir = core_views.BUILTIN_PLUGINS_DIR / 'parse_dom_outlinks'
    monkeypatch.setattr(core_views, 'find_plugin_for_config_key', lambda key: 'parse_dom_outlinks')
    monkeypatch.setattr(core_views, 'iter_plugin_dirs', lambda: [plugin_dir])
    url, label = core_views.get_config_definition_link('PARSE_DOM_OUTLINKS_ENABLED')
    assert url == 'https://github.com/ArchiveBox/abx-plugins/tree/main/abx_plugins/plugins/parse_dom_outlinks/config.json'
    assert label == 'abx_plugins/plugins/parse_dom_outlinks/config.json'
def test_live_config_value_view_renames_source_field_and_uses_plugin_definition_link(monkeypatch):
    """The config-value view should expose 'Currently read from' (not 'Source') and
    embed the plugin-specific definition link in the Type help text."""
    request = RequestFactory().get('/admin/environment/config/PARSE_DOM_OUTLINKS_ENABLED/')
    request.user = SimpleNamespace(is_superuser=True)
    # Stub out every config lookup so the view sees exactly one plugin-owned key.
    monkeypatch.setattr(core_views, 'get_all_configs', lambda: {})
    monkeypatch.setattr(core_views, 'get_flat_config', lambda: {})
    monkeypatch.setattr(core_views, 'get_config', lambda: {'PARSE_DOM_OUTLINKS_ENABLED': True})
    monkeypatch.setattr(core_views, 'find_config_default', lambda key: 'True')
    monkeypatch.setattr(core_views, 'find_config_type', lambda key: 'bool')
    monkeypatch.setattr(core_views, 'find_config_source', lambda key, merged: 'Default')
    monkeypatch.setattr(core_views, 'key_is_safe', lambda key: True)
    monkeypatch.setattr(core_views.CONSTANTS, 'CONFIG_FILE', SimpleNamespace(exists=lambda: False))
    from archivebox.machine.models import Machine
    from archivebox.config.configset import BaseConfigSet
    # No machine-level or file-level overrides for this key.
    monkeypatch.setattr(Machine, 'current', classmethod(lambda cls: SimpleNamespace(id='machine-id', config={})))
    monkeypatch.setattr(BaseConfigSet, 'load_from_file', classmethod(lambda cls, path: {}))
    monkeypatch.setattr(
        core_views,
        'get_config_definition_link',
        lambda key: (
            'https://github.com/ArchiveBox/abx-plugins/tree/main/abx_plugins/plugins/parse_dom_outlinks/config.json',
            'abx_plugins/plugins/parse_dom_outlinks/config.json',
        ),
    )
    context = core_views.live_config_value_view.__wrapped__(request, key='PARSE_DOM_OUTLINKS_ENABLED')
    section = context['data'][0]
    # Field was renamed from 'Source' to the clearer 'Currently read from'.
    assert 'Currently read from' in section['fields']
    assert 'Source' not in section['fields']
    assert section['fields']['Currently read from'] == 'Default'
    assert 'abx_plugins/plugins/parse_dom_outlinks/config.json' in section['help_texts']['Type']
def test_find_config_source_prefers_environment_over_machine_and_file(monkeypatch):
    """When a key is set in env, machine config, AND config file, env wins."""
    monkeypatch.setenv('CHECK_SSL_VALIDITY', 'false')
    from archivebox.machine.models import Machine
    from archivebox.config.configset import BaseConfigSet
    # Machine config and config file both define the same key with a different value.
    monkeypatch.setattr(
        Machine,
        'current',
        classmethod(lambda cls: SimpleNamespace(id='machine-id', config={'CHECK_SSL_VALIDITY': 'true'})),
    )
    monkeypatch.setattr(
        BaseConfigSet,
        'load_from_file',
        classmethod(lambda cls, path: {'CHECK_SSL_VALIDITY': 'true'}),
    )
    assert core_views.find_config_source('CHECK_SSL_VALIDITY', {'CHECK_SSL_VALIDITY': False}) == 'Environment'
def test_live_config_value_view_priority_text_matches_runtime_precedence(monkeypatch):
    """The help text must list sources in the same precedence order the runtime uses:
    Environment > Machine > Config File > Default."""
    request = RequestFactory().get('/admin/environment/config/CHECK_SSL_VALIDITY/')
    request.user = SimpleNamespace(is_superuser=True)
    monkeypatch.setattr(core_views, 'get_all_configs', lambda: {})
    monkeypatch.setattr(core_views, 'get_flat_config', lambda: {'CHECK_SSL_VALIDITY': True})
    monkeypatch.setattr(core_views, 'get_config', lambda: {'CHECK_SSL_VALIDITY': False})
    monkeypatch.setattr(core_views, 'find_config_default', lambda key: 'True')
    monkeypatch.setattr(core_views, 'find_config_type', lambda key: 'bool')
    monkeypatch.setattr(core_views, 'key_is_safe', lambda key: True)
    from archivebox.machine.models import Machine
    from archivebox.config.configset import BaseConfigSet
    # Set the key at EVERY level so precedence resolution is exercised.
    monkeypatch.setattr(
        Machine,
        'current',
        classmethod(lambda cls: SimpleNamespace(id='machine-id', config={'CHECK_SSL_VALIDITY': 'true'})),
    )
    monkeypatch.setattr(
        BaseConfigSet,
        'load_from_file',
        classmethod(lambda cls, path: {'CHECK_SSL_VALIDITY': 'true'}),
    )
    monkeypatch.setattr(core_views.CONSTANTS, 'CONFIG_FILE', SimpleNamespace(exists=lambda: True))
    monkeypatch.setenv('CHECK_SSL_VALIDITY', 'false')
    context = core_views.live_config_value_view.__wrapped__(request, key='CHECK_SSL_VALIDITY')
    section = context['data'][0]
    assert section['fields']['Currently read from'] == 'Environment'
    help_text = section['help_texts']['Currently read from']
    # The listed order in the help text must match runtime precedence.
    assert help_text.index('Environment') < help_text.index('Machine') < help_text.index('Config File') < help_text.index('Default')
    assert 'Configuration Sources (highest priority first):' in section['help_texts']['Value']

View File

@@ -0,0 +1,220 @@
from typing import cast
import pytest
from django.contrib.auth import get_user_model
from django.contrib.auth.models import UserManager
from django.urls import reverse
from archivebox.crawls.admin import CrawlAdminForm
from archivebox.crawls.models import Crawl
from archivebox.core.models import Snapshot
pytestmark = pytest.mark.django_db
User = get_user_model()
ADMIN_HOST = 'admin.archivebox.localhost:8000'
@pytest.fixture
def admin_user(db):
    """Superuser account used to authenticate against the crawls admin."""
    manager = cast(UserManager, User.objects)
    return manager.create_superuser(
        username='crawladmin',
        email='crawladmin@test.com',
        password='testpassword',
    )
@pytest.fixture
def crawl(admin_user):
    """A two-URL crawl tagged 'alpha,beta', owned by the admin fixture user."""
    seed_urls = 'https://example.com\nhttps://example.org'
    return Crawl.objects.create(urls=seed_urls, tags_str='alpha,beta', created_by=admin_user)
def test_crawl_admin_change_view_renders_tag_editor_widget(client, admin_user, crawl):
    """The crawl change page should render the custom tag-editor widget with existing tags."""
    client.login(username='crawladmin', password='testpassword')
    change_url = reverse('admin:crawls_crawl_change', args=[crawl.pk])

    response = client.get(change_url, HTTP_HOST=ADMIN_HOST)

    assert response.status_code == 200
    for fragment in (b'name="tags_editor"', b'tag-editor-container', b'alpha', b'beta'):
        assert fragment in response.content
def test_crawl_admin_add_view_renders_url_filter_alias_fields(client, admin_user):
    """The crawl add form should expose allowlist/denylist fields and the same-domain preset."""
    client.login(username='crawladmin', password='testpassword')
    add_url = reverse('admin:crawls_crawl_add')

    response = client.get(add_url, HTTP_HOST=ADMIN_HOST)

    assert response.status_code == 200
    for fragment in (
        b'name="url_filters_allowlist"',
        b'name="url_filters_denylist"',
        b'Same domain only',
    ):
        assert fragment in response.content
def test_crawl_admin_form_saves_tags_editor_to_tags_str(crawl, admin_user):
    """Saving the admin form should dedupe tags (case-insensitively, keeping first casing)
    into tags_str and persist the URL filter textareas into the crawl's config."""
    form = CrawlAdminForm(
        data={
            'created_at': crawl.created_at.strftime('%Y-%m-%d %H:%M:%S'),
            'urls': crawl.urls,
            'config': '{}',
            'max_depth': '0',
            # 'Alpha' duplicates 'alpha' and should be collapsed on save.
            'tags_editor': 'alpha, beta, Alpha, gamma',
            'url_filters_allowlist': 'example.com\n*.example.com',
            'url_filters_denylist': 'static.example.com',
            'persona_id': '',
            'label': '',
            'notes': '',
            'schedule': '',
            'status': crawl.status,
            'retry_at': crawl.retry_at.strftime('%Y-%m-%d %H:%M:%S'),
            'created_by': str(admin_user.pk),
            'num_uses_failed': '0',
            'num_uses_succeeded': '0',
        },
        instance=crawl,
    )
    assert form.is_valid(), form.errors
    updated = form.save()
    updated.refresh_from_db()
    assert updated.tags_str == 'alpha,beta,gamma'
    # Filter textareas land in config as newline-separated strings.
    assert updated.config['URL_ALLOWLIST'] == 'example.com\n*.example.com'
    assert updated.config['URL_DENYLIST'] == 'static.example.com'
def test_crawl_admin_delete_snapshot_action_removes_snapshot_and_url(client, admin_user):
    """The admin delete-snapshot action removes both the Snapshot row and its URL
    from the parent crawl's urls list."""
    crawl = Crawl.objects.create(
        urls='https://example.com/remove-me',
        created_by=admin_user,
    )
    snapshot = Snapshot.objects.create(
        crawl=crawl,
        url='https://example.com/remove-me',
    )
    client.login(username='crawladmin', password='testpassword')
    response = client.post(
        reverse('admin:crawls_crawl_snapshot_delete', args=[crawl.pk, snapshot.pk]),
        HTTP_HOST=ADMIN_HOST,
    )
    assert response.status_code == 200
    assert response.json()['ok'] is True
    assert not Snapshot.objects.filter(pk=snapshot.pk).exists()
    crawl.refresh_from_db()
    # The URL is pruned from the crawl so it won't be re-snapshotted later.
    assert 'https://example.com/remove-me' not in crawl.urls
def test_crawl_admin_exclude_domain_action_prunes_urls_and_pending_snapshots(client, admin_user):
    """The exclude-domain admin action should add the snapshot's domain to the crawl
    denylist, prune that domain's URLs from crawl.urls, delete its queued snapshots,
    and leave already-sealed snapshots on other domains untouched."""
    crawl = Crawl.objects.create(
        urls='\n'.join([
            'https://cdn.example.com/asset.js',
            'https://cdn.example.com/second.js',
            'https://example.com/root',
        ]),
        created_by=admin_user,
    )
    # Queued snapshot on the excluded domain: expected to be deleted.
    queued_snapshot = Snapshot.objects.create(
        crawl=crawl,
        url='https://cdn.example.com/asset.js',
        status=Snapshot.StatusChoices.QUEUED,
    )
    # Sealed snapshot on a different domain: expected to survive.
    preserved_snapshot = Snapshot.objects.create(
        crawl=crawl,
        url='https://example.com/root',
        status=Snapshot.StatusChoices.SEALED,
    )
    client.login(username='crawladmin', password='testpassword')
    response = client.post(
        reverse('admin:crawls_crawl_snapshot_exclude_domain', args=[crawl.pk, queued_snapshot.pk]),
        HTTP_HOST=ADMIN_HOST,
    )
    assert response.status_code == 200
    payload = response.json()
    assert payload['ok'] is True
    assert payload['domain'] == 'cdn.example.com'
    crawl.refresh_from_db()
    # The domain landed on the crawl's own (non-effective) denylist.
    assert crawl.get_url_denylist(use_effective_config=False) == ['cdn.example.com']
    assert 'https://cdn.example.com/asset.js' not in crawl.urls
    assert 'https://cdn.example.com/second.js' not in crawl.urls
    assert 'https://example.com/root' in crawl.urls
    assert not Snapshot.objects.filter(pk=queued_snapshot.pk).exists()
    assert Snapshot.objects.filter(pk=preserved_snapshot.pk).exists()
def test_snapshot_from_json_trims_markdown_suffixes_on_discovered_urls(crawl):
    """Snapshot.from_json() should strip trailing markdown punctuation from discovered URLs."""
    record = {'url': 'https://docs.sweeting.me/s/youtube-favorites)**'}
    snapshot = Snapshot.from_json(
        record,
        overrides={'crawl': crawl},
        queue_for_extraction=False,
    )
    assert snapshot is not None
    # The trailing ')**' markdown residue was trimmed off.
    assert snapshot.url == 'https://docs.sweeting.me/s/youtube-favorites'
def test_create_snapshots_from_urls_respects_url_allowlist_and_denylist(admin_user):
    """Only URLs matching the allowlist and not the denylist become snapshots."""
    seed_urls = [
        'https://example.com/root',
        'https://static.example.com/app.js',
        'https://other.test/page',
    ]
    crawl = Crawl.objects.create(
        urls='\n'.join(seed_urls),
        created_by=admin_user,
        config={
            'URL_ALLOWLIST': 'example.com',
            'URL_DENYLIST': 'static.example.com',
        },
    )
    snapshots = crawl.create_snapshots_from_urls()
    # static.example.com is denylisted; other.test is not allowlisted.
    assert [snap.url for snap in snapshots] == ['https://example.com/root']
def test_url_filter_regex_lists_preserve_commas_and_split_on_newlines_only(admin_user):
    """Allow/deny list config values must split into patterns on newlines only,
    so regex patterns containing literal commas survive intact."""
    crawl = Crawl.objects.create(
        urls='\n'.join([
            'https://example.com/root',
            'https://example.com/path,with,commas',
            'https://other.test/page',
        ]),
        created_by=admin_user,
        config={
            # Two patterns joined by a newline; the commas inside each are literal regex chars.
            'URL_ALLOWLIST': r'^https://example\.com/(root|path,with,commas)$' + '\n' + r'^https://other\.test/page$',
            'URL_DENYLIST': r'^https://example\.com/path,with,commas$',
        },
    )
    assert crawl.get_url_allowlist(use_effective_config=False) == [
        r'^https://example\.com/(root|path,with,commas)$',
        r'^https://other\.test/page$',
    ]
    assert crawl.get_url_denylist(use_effective_config=False) == [
        r'^https://example\.com/path,with,commas$',
    ]
    created = crawl.create_snapshots_from_urls()
    # The comma-containing URL is allowlisted but then excluded by the denylist.
    assert [snapshot.url for snapshot in created] == [
        'https://example.com/root',
        'https://other.test/page',
    ]

View File

@@ -14,7 +14,7 @@ Tests cover:
import os
from datetime import timedelta
from typing import cast
from unittest.mock import patch
from unittest.mock import Mock, patch
import pytest
from django.test import TestCase
@@ -89,11 +89,45 @@ class TestMachineModel(TestCase):
assert result is not None
self.assertEqual(result.config.get('WGET_BINARY'), '/usr/bin/wget')
    def test_machine_from_jsonl_strips_legacy_chromium_version(self):
        """Machine.from_json() should ignore legacy browser version keys."""
        Machine.current()  # Ensure machine exists
        record = {
            'config': {
                'WGET_BINARY': '/usr/bin/wget',
                # Legacy key expected to be stripped during import.
                'CHROMIUM_VERSION': '123.4.5',
            },
        }
        result = Machine.from_json(record)
        self.assertIsNotNone(result)
        assert result is not None  # narrow Optional for the type checker
        # Regular config keys are kept; the legacy key is dropped.
        self.assertEqual(result.config.get('WGET_BINARY'), '/usr/bin/wget')
        self.assertNotIn('CHROMIUM_VERSION', result.config)
def test_machine_from_jsonl_invalid(self):
"""Machine.from_json() should return None for invalid records."""
result = Machine.from_json({'invalid': 'record'})
self.assertIsNone(result)
    def test_machine_current_strips_legacy_chromium_version(self):
        """Machine.current() should clean legacy browser version keys from persisted config."""
        import archivebox.machine.models as models
        machine = Machine.current()
        machine.config = {
            'CHROME_BINARY': '/tmp/chromium',
            # Legacy key expected to be removed when Machine.current() re-reads the row.
            'CHROMIUM_VERSION': '123.4.5',
        }
        machine.save(update_fields=['config'])
        # Seed the module-level cache so Machine.current() operates on this instance.
        models._CURRENT_MACHINE = machine
        refreshed = Machine.current()
        self.assertEqual(refreshed.config.get('CHROME_BINARY'), '/tmp/chromium')
        self.assertNotIn('CHROMIUM_VERSION', refreshed.config)
def test_machine_manager_current(self):
"""Machine.objects.current() should return current machine."""
machine = Machine.current()
@@ -131,6 +165,36 @@ class TestNetworkInterfaceModel(TestCase):
interface = NetworkInterface.current()
self.assertIsNotNone(interface)
    def test_networkinterface_current_refresh_creates_new_interface_when_properties_change(self):
        """Refreshing should persist a new NetworkInterface row when the host network fingerprint changes."""
        import archivebox.machine.models as models
        # First observed host network state.
        first = {
            'mac_address': 'aa:bb:cc:dd:ee:01',
            'ip_public': '1.1.1.1',
            'ip_local': '192.168.1.10',
            'dns_server': '8.8.8.8',
            'hostname': 'host-a',
            'iface': 'en0',
            'isp': 'ISP A',
            'city': 'City',
            'region': 'Region',
            'country': 'Country',
        }
        # Same host, but public/local IPs changed between the two refreshes.
        second = {
            **first,
            'ip_public': '2.2.2.2',
            'ip_local': '10.0.0.5',
        }
        with patch.object(models, 'get_host_network', side_effect=[first, second]):
            interface1 = NetworkInterface.current(refresh=True)
            interface2 = NetworkInterface.current(refresh=True)
            # A new row was created for the changed fingerprint, on the same machine.
            self.assertNotEqual(interface1.id, interface2.id)
            self.assertEqual(interface1.machine_id, interface2.machine_id)
            self.assertEqual(NetworkInterface.objects.filter(machine=interface1.machine).count(), 2)
class TestBinaryModel(TestCase):
"""Test the Binary model."""
@@ -360,6 +424,8 @@ class TestProcessCurrent(TestCase):
self.assertEqual(proc.pid, os.getpid())
self.assertEqual(proc.status, Process.StatusChoices.RUNNING)
self.assertIsNotNone(proc.machine)
self.assertIsNotNone(proc.iface)
self.assertEqual(proc.iface.machine_id, proc.machine_id)
self.assertIsNotNone(proc.started_at)
def test_process_current_caches(self):
@@ -375,6 +441,12 @@ class TestProcessCurrent(TestCase):
result = Process._detect_process_type()
self.assertEqual(result, Process.TypeChoices.ORCHESTRATOR)
def test_process_detect_type_runner_watch(self):
"""runner_watch should be classified as a worker, not the orchestrator itself."""
with patch('sys.argv', ['archivebox', 'manage', 'runner_watch', '--pidfile=/tmp/runserver.pid']):
result = Process._detect_process_type()
self.assertEqual(result, Process.TypeChoices.WORKER)
def test_process_detect_type_cli(self):
"""_detect_process_type should detect CLI commands."""
with patch('sys.argv', ['archivebox', 'add', 'http://example.com']):
@@ -387,6 +459,27 @@ class TestProcessCurrent(TestCase):
result = Process._detect_process_type()
self.assertEqual(result, Process.TypeChoices.BINARY)
    def test_process_proc_allows_interpreter_wrapped_script(self):
        """Process.proc should accept a script recorded in DB when wrapped by an interpreter in psutil."""
        proc = Process.objects.create(
            machine=Machine.current(),
            # The DB records the script itself as argv[0]...
            cmd=['/tmp/on_Crawl__90_chrome_launch.daemon.bg.js', '--url=https://example.com/'],
            pid=12345,
            status=Process.StatusChoices.RUNNING,
            started_at=timezone.now(),
        )
        os_proc = Mock()
        # ...while the OS-level cmdline shows the same script wrapped by `node`.
        os_proc.create_time.return_value = proc.started_at.timestamp()
        os_proc.cmdline.return_value = [
            'node',
            '/tmp/on_Crawl__90_chrome_launch.daemon.bg.js',
            '--url=https://example.com/',
        ]
        with patch('archivebox.machine.models.psutil.Process', return_value=os_proc):
            # proc.proc must match the interpreter-wrapped process, not reject it.
            self.assertIs(proc.proc, os_proc)
class TestProcessHierarchy(TestCase):
"""Test Process parent/child relationships."""

View File

@@ -0,0 +1,191 @@
import pytest
from typing import cast
from django.contrib.auth import get_user_model
from django.contrib.auth.models import UserManager
from django.urls import reverse
from archivebox.personas.importers import (
PersonaImportResult,
discover_persona_template_profiles,
import_persona_from_source,
resolve_browser_profile_source,
resolve_custom_import_source,
)
pytestmark = pytest.mark.django_db
User = get_user_model()
ADMIN_HOST = "admin.archivebox.localhost:8000"
@pytest.fixture
def admin_user(db):
    """Superuser used to authenticate against the persona admin views."""
    manager = cast(UserManager, User.objects)
    return manager.create_superuser(
        username="personaadmin",
        email="personaadmin@test.com",
        password="testpassword",
    )
def _make_profile_source(tmp_path):
    """Create a fake Chrome user-data dir on disk and resolve it into an import source."""
    user_data_dir = tmp_path / "Chrome User Data"
    default_dir = user_data_dir / "Default"
    default_dir.mkdir(parents=True)
    # An empty Preferences file is enough to make the dir look like a real profile.
    (default_dir / "Preferences").write_text("{}")
    return resolve_browser_profile_source(
        browser="chrome",
        user_data_dir=user_data_dir,
        profile_dir="Default",
        browser_binary="/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
    )
def test_resolve_custom_import_source_accepts_exact_profile_dir(tmp_path):
    """Pointing at a concrete profile dir should resolve to a browser-profile source."""
    user_data_dir = tmp_path / "Brave User Data"
    target_profile = user_data_dir / "Profile 2"
    target_profile.mkdir(parents=True)
    (target_profile / "Preferences").write_text("{}")
    resolved = resolve_custom_import_source(str(target_profile))
    assert resolved.kind == "browser-profile"
    # The parent user-data dir and the profile name were split out correctly.
    assert resolved.user_data_dir == user_data_dir.resolve()
    assert resolved.profile_dir == "Profile 2"
def test_resolve_custom_import_source_accepts_cdp_url():
    """A websocket CDP URL should resolve to a cdp-kind import source."""
    cdp_url = "ws://127.0.0.1:9222/devtools/browser/test-session"
    resolved = resolve_custom_import_source(cdp_url)
    assert resolved.kind == "cdp"
    assert resolved.cdp_url == cdp_url
def test_discover_persona_template_profiles_finds_chrome_profile_dirs(tmp_path):
    """Discovery should surface <personas_dir>/<name>/chrome_profile/<profile> dirs
    as persona-browser template sources."""
    personas_dir = tmp_path / "personas"
    chrome_profile = personas_dir / "ExistingPersona" / "chrome_profile"
    default_profile = chrome_profile / "Default"
    default_profile.mkdir(parents=True)
    # Preferences file makes the dir look like a real Chrome profile (presumably what discovery keys on).
    (default_profile / "Preferences").write_text("{}")
    discovered = discover_persona_template_profiles(personas_dir=personas_dir)
    assert len(discovered) == 1
    assert discovered[0].browser == "persona"
    assert discovered[0].source_name == "ExistingPersona"
    assert discovered[0].profile_dir == "Default"
    assert discovered[0].user_data_dir == chrome_profile.resolve()
def test_discover_persona_template_profiles_finds_home_abx_personas(monkeypatch, tmp_path):
    """When the configured personas dir is missing, discovery should fall back to
    ~/.config/abx/personas chrome_profile dirs."""
    from archivebox.config.constants import CONSTANTS
    # Point the configured personas dir at a nonexistent path...
    monkeypatch.setattr(CONSTANTS, "PERSONAS_DIR", tmp_path / "missing-data-personas")
    # ...and fake the home dir so the ~/.config/abx fallback lands inside tmp_path.
    monkeypatch.setattr("archivebox.personas.importers.Path.home", lambda: tmp_path)
    chrome_profile = tmp_path / ".config" / "abx" / "personas" / "HomePersona" / "chrome_profile"
    default_profile = chrome_profile / "Default"
    default_profile.mkdir(parents=True)
    (default_profile / "Preferences").write_text("{}")
    discovered = discover_persona_template_profiles()
    assert len(discovered) == 1
    assert discovered[0].browser == "persona"
    assert discovered[0].source_name == "HomePersona"
    assert discovered[0].profile_dir == "Default"
    assert discovered[0].user_data_dir == chrome_profile.resolve()
def test_persona_admin_add_view_renders_import_ui(client, admin_user, monkeypatch, tmp_path):
    """The persona admin add page should render the browser-profile import UI."""
    source = _make_profile_source(tmp_path)
    # Patch discovery in both the form and the admin so the fake profile is offered.
    monkeypatch.setattr("archivebox.personas.forms.discover_local_browser_profiles", lambda: [source])
    monkeypatch.setattr("archivebox.personas.admin.discover_local_browser_profiles", lambda: [source])
    client.login(username="personaadmin", password="testpassword")
    response = client.get(reverse("admin:personas_persona_add"), HTTP_HOST=ADMIN_HOST)
    assert response.status_code == 200
    for marker in (
        b"Bootstrap a persona from a real browser session",
        b"Google Chrome / Default",
        b"auth.json",
    ):
        assert marker in response.content
def test_import_persona_from_source_copies_user_agent_to_persona_config(admin_user, monkeypatch, tmp_path):
    """Importing a source should copy the user agent reported by export_browser_state
    into persona.config['USER_AGENT'] and flag it on the result."""
    from archivebox.personas.models import Persona
    source = _make_profile_source(tmp_path)
    persona = Persona.objects.create(name="AgentPersona", created_by=admin_user)
    def fake_export_browser_state(**kwargs):
        # Presumably (success, state, message) — confirm against export_browser_state's signature.
        return True, {"user_agent": "Mozilla/5.0 Test Imported UA"}, "ok"
    monkeypatch.setattr("archivebox.personas.importers.export_browser_state", fake_export_browser_state)
    # All copy/cookie/storage steps disabled: only the user-agent import runs.
    result = import_persona_from_source(
        persona,
        source,
        copy_profile=False,
        import_cookies=False,
        capture_storage=False,
    )
    persona.refresh_from_db()
    assert result.user_agent_imported is True
    assert persona.config["USER_AGENT"] == "Mozilla/5.0 Test Imported UA"
def test_persona_admin_add_post_runs_shared_importer(client, admin_user, monkeypatch, tmp_path):
    """Submitting the persona add form with import fields should invoke the shared
    import_persona_from_source() with the selected source and the checkbox kwargs."""
    from archivebox.personas.models import Persona
    source = _make_profile_source(tmp_path)
    monkeypatch.setattr("archivebox.personas.forms.discover_local_browser_profiles", lambda: [source])
    monkeypatch.setattr("archivebox.personas.admin.discover_local_browser_profiles", lambda: [source])
    calls = {}
    def fake_import(persona, selected_source, **kwargs):
        # Record how the importer was invoked and fabricate the files it would create.
        calls["persona_name"] = persona.name
        calls["source"] = selected_source
        calls["kwargs"] = kwargs
        (persona.path / "cookies.txt").parent.mkdir(parents=True, exist_ok=True)
        (persona.path / "cookies.txt").write_text("# Netscape HTTP Cookie File\n")
        (persona.path / "auth.json").write_text('{"TYPE":"auth","cookies":[],"localStorage":{},"sessionStorage":{}}\n')
        return PersonaImportResult(
            source=selected_source,
            profile_copied=True,
            cookies_imported=True,
            storage_captured=True,
        )
    monkeypatch.setattr("archivebox.personas.forms.import_persona_from_source", fake_import)
    client.login(username="personaadmin", password="testpassword")
    response = client.post(
        reverse("admin:personas_persona_add"),
        {
            "name": "ImportedPersona",
            "created_by": str(admin_user.pk),
            "config": "{}",
            "import_mode": "discovered",
            "import_discovered_profile": source.choice_value,
            # All three import checkboxes checked -> all three kwargs True.
            "import_copy_profile": "on",
            "import_extract_cookies": "on",
            "import_capture_storage": "on",
            "_save": "Save",
        },
        HTTP_HOST=ADMIN_HOST,
    )
    # A successful admin save redirects.
    assert response.status_code == 302
    persona = Persona.objects.get(name="ImportedPersona")
    assert calls["persona_name"] == "ImportedPersona"
    assert calls["source"].profile_dir == "Default"
    assert calls["kwargs"] == {
        "copy_profile": True,
        "import_cookies": True,
        "capture_storage": True,
    }
    # The persona now points at the files the importer wrote.
    assert persona.COOKIES_FILE.endswith("cookies.txt")
    assert persona.AUTH_STORAGE_FILE.endswith("auth.json")

View File

@@ -0,0 +1,640 @@
import asyncio
import subprocess
from types import SimpleNamespace
import pytest
from django.test import RequestFactory
pytestmark = pytest.mark.django_db
class _DummyBus:
    """Minimal stand-in for the runner's event bus: records its name, no-op async stop()."""
    def __init__(self, name: str):
        self.name = name

    async def stop(self):
        # Nothing to tear down for the fake bus.
        return None
class _DummyService:
    """No-op replacement for the runner's service classes; accepts any constructor args."""
    def __init__(self, *args, **kwargs):
        pass
class _DummyAbxServices:
    """Fake abx services container exposing only process.wait_for_background_monitors."""
    def __init__(self):
        self.process = SimpleNamespace(wait_for_background_monitors=self._wait)

    async def _wait(self):
        # No background monitors exist in tests — resolve immediately.
        return None
async def _call_sync(func, *args, **kwargs):
return func(*args, **kwargs)
def test_run_snapshot_uses_isolated_bus_per_snapshot(monkeypatch):
    """Each snapshot run must get its own event bus (in addition to the shared
    crawl bus), with the snapshot's own id/url threaded through the download config."""
    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.crawls.models import Crawl
    from archivebox.core.models import Snapshot
    from archivebox.services import runner as runner_module
    crawl = Crawl.objects.create(
        urls='https://blog.sweeting.me\nhttps://sweeting.me',
        created_by_id=get_or_create_system_user_pk(),
    )
    snapshot_a = Snapshot.objects.create(
        url='https://blog.sweeting.me',
        crawl=crawl,
        status=Snapshot.StatusChoices.QUEUED,
    )
    snapshot_b = Snapshot.objects.create(
        url='https://sweeting.me',
        crawl=crawl,
        status=Snapshot.StatusChoices.QUEUED,
    )
    created_buses: list[_DummyBus] = []
    def fake_create_bus(*, name, total_timeout=3600.0, **kwargs):
        # Record every bus the runner creates so we can count them below.
        bus = _DummyBus(name)
        created_buses.append(bus)
        return bus
    monkeypatch.setattr(runner_module, 'create_bus', fake_create_bus)
    monkeypatch.setattr(runner_module, 'discover_plugins', lambda: {})
    # Stub out all runner services so no real work happens.
    monkeypatch.setattr(runner_module, 'ProcessService', _DummyService)
    monkeypatch.setattr(runner_module, 'MachineService', _DummyService)
    monkeypatch.setattr(runner_module, 'BinaryService', _DummyService)
    monkeypatch.setattr(runner_module, 'TagService', _DummyService)
    monkeypatch.setattr(runner_module, 'CrawlService', _DummyService)
    monkeypatch.setattr(runner_module, 'SnapshotService', _DummyService)
    monkeypatch.setattr(runner_module, 'ArchiveResultService', _DummyService)
    monkeypatch.setattr(runner_module, 'setup_abx_services', lambda *args, **kwargs: _DummyAbxServices())
    download_calls = []
    async def fake_download(*, url, bus, config_overrides, snapshot, **kwargs):
        # Capture which bus/config each snapshot's download received.
        download_calls.append(
            {
                'url': url,
                'bus': bus,
                'snapshot_id': config_overrides['SNAPSHOT_ID'],
                'source_url': config_overrides['SOURCE_URL'],
                'abx_snapshot_id': snapshot.id,
            }
        )
        await asyncio.sleep(0)
        return []
    monkeypatch.setattr(runner_module, 'download', fake_download)
    crawl_runner = runner_module.CrawlRunner(crawl)
    # Pre-baked run data for both snapshots, bypassing the DB loader.
    snapshot_data = {
        str(snapshot_a.id): {
            'id': str(snapshot_a.id),
            'url': snapshot_a.url,
            'title': snapshot_a.title,
            'timestamp': snapshot_a.timestamp,
            'bookmarked_at': snapshot_a.bookmarked_at.isoformat() if snapshot_a.bookmarked_at else "",
            'created_at': snapshot_a.created_at.isoformat() if snapshot_a.created_at else "",
            'tags': snapshot_a.tags_str(),
            'depth': snapshot_a.depth,
            'parent_snapshot_id': str(snapshot_a.parent_snapshot_id) if snapshot_a.parent_snapshot_id else None,
            'output_dir': str(snapshot_a.output_dir),
            'config': crawl_runner._snapshot_config(snapshot_a),
        },
        str(snapshot_b.id): {
            'id': str(snapshot_b.id),
            'url': snapshot_b.url,
            'title': snapshot_b.title,
            'timestamp': snapshot_b.timestamp,
            'bookmarked_at': snapshot_b.bookmarked_at.isoformat() if snapshot_b.bookmarked_at else "",
            'created_at': snapshot_b.created_at.isoformat() if snapshot_b.created_at else "",
            'tags': snapshot_b.tags_str(),
            'depth': snapshot_b.depth,
            'parent_snapshot_id': str(snapshot_b.parent_snapshot_id) if snapshot_b.parent_snapshot_id else None,
            'output_dir': str(snapshot_b.output_dir),
            'config': crawl_runner._snapshot_config(snapshot_b),
        },
    }
    monkeypatch.setattr(crawl_runner, '_load_snapshot_run_data', lambda snapshot_id: snapshot_data[snapshot_id])
    async def run_both():
        # Run both snapshots concurrently to exercise bus isolation.
        await asyncio.gather(
            crawl_runner._run_snapshot(str(snapshot_a.id)),
            crawl_runner._run_snapshot(str(snapshot_b.id)),
        )
    asyncio.run(run_both())
    assert len(download_calls) == 2
    assert {call['snapshot_id'] for call in download_calls} == {str(snapshot_a.id), str(snapshot_b.id)}
    assert {call['source_url'] for call in download_calls} == {snapshot_a.url, snapshot_b.url}
    # Two distinct bus objects => each snapshot ran on its own isolated bus.
    assert len({id(call['bus']) for call in download_calls}) == 2
    assert len(created_buses) == 3  # 1 crawl bus + 2 isolated snapshot buses
def test_ensure_background_runner_starts_when_none_running(monkeypatch):
    """ensure_background_runner() should spawn `archivebox run --daemon` via Popen
    when no orchestrator process is currently recorded as running."""
    import archivebox.machine.models as machine_models
    from archivebox.services import runner as runner_module
    popen_calls = []
    class DummyPopen:
        # Records constructor args instead of actually spawning a subprocess.
        def __init__(self, args, **kwargs):
            popen_calls.append((args, kwargs))
    monkeypatch.setattr(machine_models.Process, 'cleanup_stale_running', classmethod(lambda cls, machine=None: 0))
    monkeypatch.setattr(machine_models.Machine, 'current', classmethod(lambda cls: SimpleNamespace(id='machine-1')))
    # No orchestrator rows exist -> exists() returns False.
    monkeypatch.setattr(
        machine_models.Process.objects,
        'filter',
        lambda **kwargs: SimpleNamespace(exists=lambda: False),
    )
    monkeypatch.setattr(runner_module.subprocess, 'Popen', DummyPopen)
    started = runner_module.ensure_background_runner(allow_under_pytest=True)
    assert started is True
    assert len(popen_calls) == 1
    assert popen_calls[0][0] == [runner_module.sys.executable, '-m', 'archivebox', 'run', '--daemon']
    # stdin must be detached (DEVNULL) for the daemonized runner.
    assert popen_calls[0][1]['stdin'] is subprocess.DEVNULL
def test_ensure_background_runner_skips_when_orchestrator_running(monkeypatch):
    """ensure_background_runner() must not spawn anything when an orchestrator
    process already exists for this machine."""
    import archivebox.machine.models as machine_models
    from archivebox.services import runner as runner_module
    monkeypatch.setattr(machine_models.Process, 'cleanup_stale_running', classmethod(lambda cls, machine=None: 0))
    monkeypatch.setattr(machine_models.Machine, 'current', classmethod(lambda cls: SimpleNamespace(id='machine-1')))
    # An orchestrator row already exists -> exists() returns True.
    monkeypatch.setattr(
        machine_models.Process.objects,
        'filter',
        lambda **kwargs: SimpleNamespace(exists=lambda: True),
    )
    # Any Popen call raises, proving no subprocess is spawned.
    monkeypatch.setattr(
        runner_module.subprocess,
        'Popen',
        lambda *args, **kwargs: (_ for _ in ()).throw(AssertionError('runner should not be spawned')),
    )
    started = runner_module.ensure_background_runner(allow_under_pytest=True)
    assert started is False
def test_runner_prepare_refreshes_network_interface_and_attaches_current_process(monkeypatch):
    """CrawlRunner._prepare() should call NetworkInterface.current(refresh=True)
    exactly once and save the refreshed iface + machine onto the current Process."""
    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.crawls.models import Crawl
    from archivebox.services import runner as runner_module
    crawl = Crawl.objects.create(
        urls='https://example.com',
        created_by_id=get_or_create_system_user_pk(),
    )
    class _Iface:
        # Fake NetworkInterface row.
        id = 'iface-1'
        machine = SimpleNamespace(id='machine-1')
        machine_id = 'machine-1'
    saved_updates = []
    class _Proc:
        # Fake Process row; records the update_fields of each save() call.
        iface_id = None
        machine_id = 'machine-1'
        iface = None
        machine = None
        def save(self, *, update_fields):
            saved_updates.append(tuple(update_fields))
    proc = _Proc()
    monkeypatch.setattr(runner_module, 'discover_plugins', lambda: {})
    monkeypatch.setattr(runner_module, 'create_bus', lambda **kwargs: _DummyBus(kwargs['name']))
    # Stub out all runner services so no real work happens.
    monkeypatch.setattr(runner_module, 'ProcessService', _DummyService)
    monkeypatch.setattr(runner_module, 'MachineService', _DummyService)
    monkeypatch.setattr(runner_module, 'BinaryService', _DummyService)
    monkeypatch.setattr(runner_module, 'TagService', _DummyService)
    monkeypatch.setattr(runner_module, 'CrawlService', _DummyService)
    monkeypatch.setattr(runner_module, 'SnapshotService', _DummyService)
    monkeypatch.setattr(runner_module, 'ArchiveResultService', _DummyService)
    from archivebox.machine.models import NetworkInterface, Process
    from archivebox.config import configset as configset_module
    refresh_calls = []
    # Record each refresh= value passed to NetworkInterface.current().
    monkeypatch.setattr(NetworkInterface, 'current', classmethod(lambda cls, refresh=False: refresh_calls.append(refresh) or _Iface()))
    monkeypatch.setattr(Process, 'current', classmethod(lambda cls: proc))
    monkeypatch.setattr(configset_module, 'get_config', lambda **kwargs: {})
    crawl_runner = runner_module.CrawlRunner(crawl)
    crawl_runner._prepare()
    assert refresh_calls == [True]
    assert proc.iface is not None
    assert proc.machine == proc.iface.machine
    assert saved_updates == [('iface', 'machine', 'modified_at')]
def test_create_crawl_api_queues_crawl_without_spawning_runner(monkeypatch):
    """POSTing a new crawl via the API should queue it (queued status + retry_at set)."""
    from django.contrib.auth import get_user_model
    from archivebox.api.v1_crawls import CrawlCreateSchema, create_crawl

    superuser = get_user_model().objects.create_superuser(
        username='runner-api-admin',
        email='runner-api-admin@example.com',
        password='testpassword',
    )
    request = RequestFactory().post('/api/v1/crawls')
    request.user = superuser
    payload = CrawlCreateSchema(
        urls=['https://example.com'],
        max_depth=0,
        tags=[],
        tags_str='',
        label='',
        notes='',
        config={},
    )
    crawl = create_crawl(request, payload)
    # The crawl exists and sits in the queue awaiting the background runner.
    assert str(crawl.id)
    assert crawl.status == 'queued'
    assert crawl.retry_at is not None
def test_crawl_runner_does_not_seal_unfinished_crawl(monkeypatch):
    """CrawlRunner.run() must not mark a crawl SEALED while crawl.is_finished()
    reports False; it should leave it retryable (retry_at set) instead."""
    import asgiref.sync
    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.crawls.models import Crawl
    from archivebox.core.models import Snapshot
    from archivebox.services import runner as runner_module
    crawl = Crawl.objects.create(
        urls='https://example.com',
        created_by_id=get_or_create_system_user_pk(),
        status=Crawl.StatusChoices.STARTED,
    )
    snapshot = Snapshot.objects.create(
        url='https://example.com',
        crawl=crawl,
        status=Snapshot.StatusChoices.STARTED,
    )
    monkeypatch.setattr(runner_module, '_attach_bus_trace', lambda bus: None)
    monkeypatch.setattr(runner_module, '_stop_bus_trace', lambda bus: asyncio.sleep(0))
    monkeypatch.setattr(runner_module, 'setup_abx_services', lambda *args, **kwargs: _DummyAbxServices())
    monkeypatch.setenv("DJANGO_ALLOW_ASYNC_UNSAFE", "true")
    # Run sync_to_async callables inline so the test stays single-threaded.
    monkeypatch.setattr(
        asgiref.sync,
        'sync_to_async',
        lambda func, thread_sensitive=True: (lambda *args, **kwargs: _call_sync(func, *args, **kwargs)),
    )
    monkeypatch.setattr(Crawl.objects, 'get', lambda id: crawl)
    # The crawl reports itself unfinished, so sealing would be a bug.
    monkeypatch.setattr(crawl, 'is_finished', lambda: False)
    monkeypatch.setattr(crawl, 'save', lambda *args, **kwargs: None)
    # Stub out every runner phase so only the finalization logic executes.
    monkeypatch.setattr(runner_module.CrawlRunner, '_prepare', lambda self: None)
    monkeypatch.setattr(runner_module.CrawlRunner, '_create_live_ui', lambda self: None)
    monkeypatch.setattr(runner_module.CrawlRunner, '_initial_snapshot_ids', lambda self: [str(snapshot.id)])
    monkeypatch.setattr(runner_module.CrawlRunner, '_run_crawl_setup', lambda self, snapshot_id: asyncio.sleep(0))
    monkeypatch.setattr(runner_module.CrawlRunner, 'enqueue_snapshot', lambda self, snapshot_id: asyncio.sleep(0))
    monkeypatch.setattr(runner_module.CrawlRunner, '_wait_for_snapshot_tasks', lambda self: asyncio.sleep(0))
    monkeypatch.setattr(runner_module.CrawlRunner, '_run_crawl_cleanup', lambda self, snapshot_id: asyncio.sleep(0))
    monkeypatch.setattr(runner_module.CrawlRunner, '_cleanup_persona', lambda self: None)
    asyncio.run(runner_module.CrawlRunner(crawl, snapshot_ids=[str(snapshot.id)]).run())
    assert crawl.status != Crawl.StatusChoices.SEALED
    assert crawl.retry_at is not None
def test_crawl_runner_finalizes_with_sync_to_async_for_is_finished(monkeypatch):
    """During finalization the runner must call crawl.is_finished() through
    sync_to_async rather than invoking the ORM directly on the event loop."""
    import asgiref.sync
    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.crawls.models import Crawl
    from archivebox.core.models import Snapshot
    from archivebox.services import runner as runner_module
    crawl = Crawl.objects.create(
        urls='https://example.com',
        created_by_id=get_or_create_system_user_pk(),
        status=Crawl.StatusChoices.STARTED,
    )
    snapshot = Snapshot.objects.create(
        url='https://example.com',
        crawl=crawl,
        status=Snapshot.StatusChoices.STARTED,
    )
    monkeypatch.setattr(runner_module, 'create_bus', lambda *args, **kwargs: _DummyBus('runner'))
    monkeypatch.setattr(runner_module, 'discover_plugins', lambda: {})
    # Stub out all runner services so no real work happens.
    monkeypatch.setattr(runner_module, 'ProcessService', _DummyService)
    monkeypatch.setattr(runner_module, 'MachineService', _DummyService)
    monkeypatch.setattr(runner_module, 'BinaryService', _DummyService)
    monkeypatch.setattr(runner_module, 'TagService', _DummyService)
    monkeypatch.setattr(runner_module, 'CrawlService', _DummyService)
    monkeypatch.setattr(runner_module, 'SnapshotService', _DummyService)
    monkeypatch.setattr(runner_module, 'ArchiveResultService', _DummyService)
    monkeypatch.setattr(runner_module, '_attach_bus_trace', lambda bus: None)
    monkeypatch.setattr(runner_module, '_stop_bus_trace', lambda bus: asyncio.sleep(0))
    monkeypatch.setattr(runner_module, 'setup_abx_services', lambda *args, **kwargs: _DummyAbxServices())
    monkeypatch.setattr(Crawl.objects, 'get', lambda id: crawl)
    monkeypatch.setattr(crawl, 'save', lambda *args, **kwargs: None)
    monkeypatch.setattr(crawl, 'cleanup', lambda: None)
    # Stub out every runner phase so only finalization executes.
    monkeypatch.setattr(runner_module.CrawlRunner, '_prepare', lambda self: None)
    monkeypatch.setattr(runner_module.CrawlRunner, '_create_live_ui', lambda self: None)
    monkeypatch.setattr(runner_module.CrawlRunner, '_initial_snapshot_ids', lambda self: [str(snapshot.id)])
    monkeypatch.setattr(runner_module.CrawlRunner, '_run_crawl_setup', lambda self, snapshot_id: asyncio.sleep(0))
    monkeypatch.setattr(runner_module.CrawlRunner, 'enqueue_snapshot', lambda self, snapshot_id: asyncio.sleep(0))
    monkeypatch.setattr(runner_module.CrawlRunner, '_wait_for_snapshot_tasks', lambda self: asyncio.sleep(0))
    monkeypatch.setattr(runner_module.CrawlRunner, '_run_crawl_cleanup', lambda self, snapshot_id: asyncio.sleep(0))
    monkeypatch.setattr(runner_module.CrawlRunner, '_cleanup_persona', lambda self: None)
    sync_to_async_wrapped: list[str] = []
    sync_to_async_active = False
    def fake_sync_to_async(func, thread_sensitive=True):
        # Flags sync_to_async_active while func runs so guarded_is_finished
        # can prove it was invoked through the wrapper.
        async def wrapper(*args, **kwargs):
            nonlocal sync_to_async_active
            sync_to_async_wrapped.append(getattr(func, '__name__', repr(func)))
            previous = sync_to_async_active
            sync_to_async_active = True
            try:
                return func(*args, **kwargs)
            finally:
                sync_to_async_active = previous
        return wrapper
    def guarded_is_finished():
        # Fails loudly if is_finished() is called outside sync_to_async.
        assert sync_to_async_active is True
        return False
    monkeypatch.setattr(asgiref.sync, 'sync_to_async', fake_sync_to_async)
    monkeypatch.setattr(crawl, 'is_finished', guarded_is_finished)
    asyncio.run(runner_module.CrawlRunner(crawl, snapshot_ids=[str(snapshot.id)]).run())
    crawl.refresh_from_db()
    # Unfinished crawl stays STARTED and retryable.
    assert crawl.status == Crawl.StatusChoices.STARTED
    assert crawl.retry_at is not None
    assert 'guarded_is_finished' in sync_to_async_wrapped
def test_wait_for_snapshot_tasks_surfaces_already_failed_task():
    """_wait_for_snapshot_tasks must re-raise the exception of a task that already failed."""
    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.crawls.models import Crawl
    from archivebox.services import runner as runner_module

    crawl = Crawl.objects.create(
        urls='https://example.com',
        created_by_id=get_or_create_system_user_pk(),
    )
    runner = runner_module.CrawlRunner(crawl)

    async def run_test():
        # Register a future that has already failed before waiting.
        failed = asyncio.get_running_loop().create_future()
        failed.set_exception(RuntimeError('snapshot failed'))
        runner.snapshot_tasks['snap-1'] = failed
        with pytest.raises(RuntimeError, match='snapshot failed'):
            await runner._wait_for_snapshot_tasks()

    asyncio.run(run_test())
def test_wait_for_snapshot_tasks_returns_after_completed_tasks_are_pruned():
    """_wait_for_snapshot_tasks should return once all tasks finish and prune the registry."""
    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.crawls.models import Crawl
    from archivebox.services import runner as runner_module

    crawl = Crawl.objects.create(
        urls='https://example.com',
        created_by_id=get_or_create_system_user_pk(),
    )
    runner = runner_module.CrawlRunner(crawl)

    async def run_test():
        async def finish_snapshot() -> None:
            # Completes on the next loop tick.
            await asyncio.sleep(0)

        runner.snapshot_tasks['snap-1'] = asyncio.create_task(finish_snapshot())
        # Must return well within the timeout once the task completes.
        await asyncio.wait_for(runner._wait_for_snapshot_tasks(), timeout=0.5)
        assert runner.snapshot_tasks == {}

    asyncio.run(run_test())
def test_crawl_runner_calls_crawl_cleanup_after_snapshot_phase(monkeypatch):
    """After the snapshot phase, crawl.cleanup() must run before the abx
    crawl-cleanup hooks (_run_crawl_cleanup)."""
    import asgiref.sync
    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.crawls.models import Crawl
    from archivebox.core.models import Snapshot
    from archivebox.services import runner as runner_module
    crawl = Crawl.objects.create(
        urls='https://example.com',
        created_by_id=get_or_create_system_user_pk(),
        status=Crawl.StatusChoices.STARTED,
    )
    snapshot = Snapshot.objects.create(
        url='https://example.com',
        crawl=crawl,
        status=Snapshot.StatusChoices.STARTED,
    )
    monkeypatch.setattr(runner_module, '_attach_bus_trace', lambda bus: None)
    monkeypatch.setattr(runner_module, '_stop_bus_trace', lambda bus: asyncio.sleep(0))
    monkeypatch.setattr(runner_module, 'setup_abx_services', lambda *args, **kwargs: _DummyAbxServices())
    monkeypatch.setenv("DJANGO_ALLOW_ASYNC_UNSAFE", "true")
    # Run sync_to_async callables inline so no worker threads are involved.
    monkeypatch.setattr(
        asgiref.sync,
        'sync_to_async',
        lambda func, thread_sensitive=True: (lambda *args, **kwargs: _call_sync(func, *args, **kwargs)),
    )
    monkeypatch.setattr(Crawl.objects, 'get', lambda id: crawl)
    monkeypatch.setattr(crawl, 'is_finished', lambda: False)
    monkeypatch.setattr(crawl, 'save', lambda *args, **kwargs: None)
    # Stub every runner phase except the two cleanup steps under test.
    monkeypatch.setattr(runner_module.CrawlRunner, '_prepare', lambda self: None)
    monkeypatch.setattr(runner_module.CrawlRunner, '_create_live_ui', lambda self: None)
    monkeypatch.setattr(runner_module.CrawlRunner, '_initial_snapshot_ids', lambda self: [str(snapshot.id)])
    monkeypatch.setattr(runner_module.CrawlRunner, '_run_crawl_setup', lambda self, snapshot_id: asyncio.sleep(0))
    monkeypatch.setattr(runner_module.CrawlRunner, 'enqueue_snapshot', lambda self, snapshot_id: asyncio.sleep(0))
    monkeypatch.setattr(runner_module.CrawlRunner, '_wait_for_snapshot_tasks', lambda self: asyncio.sleep(0))
    monkeypatch.setattr(runner_module.CrawlRunner, '_cleanup_persona', lambda self: None)
    cleanup_calls = []
    # Record the order in which the two cleanup steps fire.
    monkeypatch.setattr(runner_module.CrawlRunner, '_run_crawl_cleanup', lambda self, snapshot_id: cleanup_calls.append('abx_cleanup') or asyncio.sleep(0))
    monkeypatch.setattr(crawl, 'cleanup', lambda: cleanup_calls.append('crawl_cleanup'))
    asyncio.run(runner_module.CrawlRunner(crawl, snapshot_ids=[str(snapshot.id)]).run())
    assert cleanup_calls == ['crawl_cleanup', 'abx_cleanup']
def test_abx_process_service_background_monitor_finishes_after_process_exit(monkeypatch, tmp_path):
    """_monitor_background_process() must return (not hang) once the background
    process exits, remove the pid file, and emit a ProcessCompletedEvent."""
    from abx_dl.models import Process as AbxProcess, now_iso
    from abx_dl.services.process_service import ProcessService
    from abx_dl.events import ProcessCompletedEvent
    # Build a bare service instance without running ProcessService.__init__.
    service = object.__new__(ProcessService)
    service.emit_jsonl = False
    emitted_events = []
    async def fake_emit_event(event, *, detach_from_parent):
        emitted_events.append((event, detach_from_parent))
    async def fake_stream_stdout(**kwargs):
        # Blocks until cancelled — simulates a live daemon stdout stream.
        try:
            await asyncio.Event().wait()
        except asyncio.CancelledError:
            return ["daemon output\n"]
    service._emit_event = fake_emit_event
    monkeypatch.setattr(service, '_stream_stdout', fake_stream_stdout)
    class FakeAsyncProcess:
        # Mimics an asyncio subprocess that exits cleanly on the first wait().
        def __init__(self):
            self.pid = 42424
            self.returncode = None
        async def wait(self):
            await asyncio.sleep(0)
            self.returncode = 0
            return 0
    plugin_output_dir = tmp_path / 'chrome'
    plugin_output_dir.mkdir()
    stdout_file = plugin_output_dir / 'on_Crawl__90_chrome_launch.daemon.bg.stdout.log'
    stderr_file = plugin_output_dir / 'on_Crawl__90_chrome_launch.daemon.bg.stderr.log'
    stderr_file.write_text('')
    pid_file = plugin_output_dir / 'on_Crawl__90_chrome_launch.daemon.bg.pid'
    pid_file.write_text('12345')
    proc = AbxProcess(
        cmd=['hook'],
        pwd=str(plugin_output_dir),
        timeout=60,
        started_at=now_iso(),
        plugin='chrome',
        hook_name='on_Crawl__90_chrome_launch.daemon.bg',
    )
    process = FakeAsyncProcess()
    event = SimpleNamespace(
        plugin_name='chrome',
        hook_name='on_Crawl__90_chrome_launch.daemon.bg',
        hook_path='hook',
        hook_args=['--url=https://example.org/'],
        env={},
        output_dir=str(plugin_output_dir),
        timeout=60,
        snapshot_id='snap-1',
        is_background=True,
    )
    async def run_test():
        # If the monitor never notices process exit, this times out at 0.5s.
        await asyncio.wait_for(
            service._monitor_background_process(
                event=event,
                proc=proc,
                process=process,
                plugin_output_dir=plugin_output_dir,
                stdout_file=stdout_file,
                stderr_file=stderr_file,
                pid_file=pid_file,
                files_before=set(),
            ),
            timeout=0.5,
        )
    asyncio.run(run_test())
    # The pid file was cleaned up and a completion event was emitted.
    assert pid_file.exists() is False
    assert any(isinstance(event, ProcessCompletedEvent) for event, _ in emitted_events)
def test_run_pending_crawls_runs_due_snapshot_in_place(monkeypatch):
    """A due QUEUED snapshot inside a SEALED crawl is scheduled inline exactly once."""
    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.crawls.models import Crawl
    from archivebox.core.models import Snapshot
    from archivebox.services import runner as runner_module

    sealed_crawl = Crawl.objects.create(
        urls='https://example.com',
        created_by_id=get_or_create_system_user_pk(),
        status=Crawl.StatusChoices.SEALED,
    )
    due_snapshot = Snapshot.objects.create(
        url='https://example.com',
        crawl=sealed_crawl,
        status=Snapshot.StatusChoices.QUEUED,
        retry_at=runner_module.timezone.now(),
    )

    # Pretend both records can always be claimed by this worker.
    monkeypatch.setattr(type(due_snapshot), 'claim_processing_lock', lambda self, lock_seconds=60: True)
    monkeypatch.setattr(type(sealed_crawl), 'claim_processing_lock', lambda self, lock_seconds=60: True)

    observed_calls: list[tuple[str, list[str] | None, bool]] = []

    def fake_run_crawl(crawl_id, snapshot_ids=None, selected_plugins=None, process_discovered_snapshots_inline=True):
        # Record the invocation, then seal the snapshot so the scheduler stops retrying it.
        observed_calls.append((crawl_id, snapshot_ids, process_discovered_snapshots_inline))
        due_snapshot.status = Snapshot.StatusChoices.SEALED
        due_snapshot.retry_at = None
        due_snapshot.save(update_fields=['status', 'retry_at', 'modified_at'])

    monkeypatch.setattr(runner_module, 'run_crawl', fake_run_crawl)

    assert runner_module.run_pending_crawls(daemon=False) == 0
    assert observed_calls == [(str(sealed_crawl.id), [str(due_snapshot.id)], False)]
def test_run_pending_crawls_prioritizes_new_queued_crawl_before_snapshot_backlog(monkeypatch):
    """A freshly QUEUED crawl is scheduled before an older crawl's snapshot backlog."""
    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.crawls.models import Crawl
    from archivebox.core.models import Snapshot
    from archivebox.services import runner as runner_module
    # Older crawl is already STARTED and has a snapshot waiting to be retried.
    older_crawl = Crawl.objects.create(
        urls='https://older.example.com',
        created_by_id=get_or_create_system_user_pk(),
        status=Crawl.StatusChoices.STARTED,
    )
    older_snapshot = Snapshot.objects.create(
        url='https://older.example.com',
        crawl=older_crawl,
        status=Snapshot.StatusChoices.QUEUED,
        retry_at=runner_module.timezone.now(),
    )
    # Newer crawl is QUEUED and also due now — it should win scheduling priority.
    newer_crawl = Crawl.objects.create(
        urls='https://newer.example.com',
        created_by_id=get_or_create_system_user_pk(),
        status=Crawl.StatusChoices.QUEUED,
        retry_at=runner_module.timezone.now(),
    )
    # Pretend every record can always be claimed by this worker.
    monkeypatch.setattr(type(older_snapshot), 'claim_processing_lock', lambda self, lock_seconds=60: True)
    monkeypatch.setattr(type(older_crawl), 'claim_processing_lock', lambda self, lock_seconds=60: True)
    monkeypatch.setattr(type(newer_crawl), 'claim_processing_lock', lambda self, lock_seconds=60: True)
    run_calls: list[tuple[str, list[str] | None, bool]] = []
    class _StopScheduling(Exception):
        # Raised by the fake to halt the scheduler after the first dispatch,
        # so only the very first scheduling decision is observed.
        pass
    def fake_run_crawl(crawl_id, snapshot_ids=None, selected_plugins=None, process_discovered_snapshots_inline=True):
        run_calls.append((crawl_id, snapshot_ids, process_discovered_snapshots_inline))
        raise _StopScheduling
    monkeypatch.setattr(runner_module, 'run_crawl', fake_run_crawl)
    with pytest.raises(_StopScheduling):
        runner_module.run_pending_crawls(daemon=False)
    # The new QUEUED crawl must be dispatched first (whole crawl, no snapshot list).
    assert run_calls == [(str(newer_crawl.id), None, False)]

View File

@@ -0,0 +1,205 @@
import json
from datetime import datetime
from typing import cast
import pytest
from django.contrib.auth import get_user_model
from django.contrib.auth.models import UserManager
from django.urls import reverse
from django.utils import timezone
pytestmark = pytest.mark.django_db
User = get_user_model()
ADMIN_HOST = 'admin.archivebox.localhost:8000'
@pytest.fixture
def admin_user(db):
    """Superuser account used to exercise the admin-only tag UI and API."""
    manager = cast(UserManager, User.objects)
    return manager.create_superuser(
        username='tagadmin',
        email='tagadmin@test.com',
        password='testpassword',
    )
@pytest.fixture
def api_token(admin_user):
    """Plaintext API token string for authenticating requests as admin_user."""
    from archivebox.api.auth import get_or_create_api_token
    token = get_or_create_api_token(admin_user)
    # Guard against silent auth-setup failures before any request is made.
    assert token is not None
    return token.token
@pytest.fixture
def crawl(admin_user):
    """Minimal parent Crawl that the tagged snapshots hang off of."""
    from archivebox.crawls.models import Crawl
    return Crawl.objects.create(urls='https://example.com', created_by=admin_user)
@pytest.fixture
def tagged_data(crawl, admin_user):
    """One tag attached to two snapshots; returns (tag, [snapshot, snapshot])."""
    from archivebox.core.models import Snapshot, Tag

    tag = Tag.objects.create(name='Alpha Research', created_by=admin_user)
    snapshots = [
        Snapshot.objects.create(url='https://example.com/one', title='Example One', crawl=crawl),
        Snapshot.objects.create(url='https://example.com/two', title='Example Two', crawl=crawl),
    ]
    for snapshot in snapshots:
        snapshot.tags.add(tag)
    return tag, snapshots
def test_tag_admin_changelist_renders_custom_ui(client, admin_user, tagged_data):
    """The tag changelist page includes the custom search/filter widgets and tag cards."""
    client.login(username='tagadmin', password='testpassword')
    response = client.get(reverse('admin:core_tag_changelist'), HTTP_HOST=ADMIN_HOST)
    assert response.status_code == 200
    expected_fragments = (
        b'id="tag-live-search"',
        b'id="tag-sort-select"',
        b'id="tag-created-by-select"',
        b'id="tag-year-select"',
        b'id="tag-has-snapshots-select"',
        b'Alpha Research',
        b'class="tag-card"',
    )
    for fragment in expected_fragments:
        assert fragment in response.content
def test_tag_admin_add_view_renders_similar_tag_reference(client, admin_user):
    """The tag add form embeds the live 'Similar Tags' reference widget."""
    client.login(username='tagadmin', password='testpassword')
    response = client.get(reverse('admin:core_tag_add'), HTTP_HOST=ADMIN_HOST)
    assert response.status_code == 200
    for fragment in (b'Similar Tags', b'data-tag-name-input="1"'):
        assert fragment in response.content
def test_tag_search_api_returns_card_payload(client, api_token, tagged_data):
    """search_tags returns default filter state plus full card data per tag."""
    tag, snapshots = tagged_data
    response = client.get(
        reverse('api-1:search_tags'),
        {'q': 'Alpha', 'api_key': api_token},
        HTTP_HOST=ADMIN_HOST,
    )
    assert response.status_code == 200
    payload = response.json()
    # With no explicit filters, the endpoint echoes its documented defaults.
    assert payload['sort'] == 'created_desc'
    assert payload['created_by'] == ''
    assert payload['year'] == ''
    assert payload['has_snapshots'] == 'all'
    # The matched tag card carries identity, counts, and nested snapshot previews.
    assert payload['tags'][0]['id'] == tag.id
    assert payload['tags'][0]['name'] == 'Alpha Research'
    assert payload['tags'][0]['num_snapshots'] == 2
    assert payload['tags'][0]['snapshots'][0]['title'] in {'Example One', 'Example Two'}
    # Action URLs: JSONL export and the snapshot-changelist filter link.
    assert payload['tags'][0]['export_jsonl_url'].endswith(f'/api/v1/core/tag/{tag.id}/snapshots.jsonl')
    assert payload['tags'][0]['filter_url'].endswith(f'/admin/core/snapshot/?tags__id__exact={tag.id}')
    assert {snapshot['url'] for snapshot in payload['tags'][0]['snapshots']} == {snap.url for snap in snapshots}
def test_tag_search_api_respects_sort_and_filters(client, api_token, admin_user, crawl, tagged_data):
    """Combining sort + created_by + year + has_snapshots narrows to a single tag."""
    from archivebox.core.models import Snapshot, Tag
    other_user = cast(UserManager, User.objects).create_user(
        username='tagother',
        email='tagother@test.com',
        password='unused',
    )
    tag_with_snapshots = tagged_data[0]
    # Two tags owned by other_user: one stays empty, one gets a snapshot,
    # so only 'Zulu Empty' survives the has_snapshots=no + year=2024 filters.
    empty_tag = Tag.objects.create(name='Zulu Empty', created_by=other_user)
    alpha_tag = Tag.objects.create(name='Alpha Empty', created_by=other_user)
    Snapshot.objects.create(
        url='https://example.com/three',
        title='Example Three',
        crawl=crawl,
    ).tags.add(alpha_tag)
    # Backdate created_at via queryset update() to bypass auto_now_add behavior.
    Tag.objects.filter(pk=empty_tag.pk).update(created_at=timezone.make_aware(datetime(2024, 1, 1, 12, 0, 0)))
    Tag.objects.filter(pk=alpha_tag.pk).update(created_at=timezone.make_aware(datetime(2025, 1, 1, 12, 0, 0)))
    Tag.objects.filter(pk=tag_with_snapshots.pk).update(created_at=timezone.make_aware(datetime(2026, 1, 1, 12, 0, 0)))
    response = client.get(
        reverse('api-1:search_tags'),
        {
            'sort': 'name_desc',
            'created_by': str(other_user.pk),
            'year': '2024',
            'has_snapshots': 'no',
            'api_key': api_token,
        },
        HTTP_HOST=ADMIN_HOST,
    )
    assert response.status_code == 200
    payload = response.json()
    # The endpoint echoes back the filter state it applied...
    assert payload['sort'] == 'name_desc'
    assert payload['created_by'] == str(other_user.pk)
    assert payload['year'] == '2024'
    assert payload['has_snapshots'] == 'no'
    # ...and only the empty 2024 tag owned by other_user matches.
    assert [tag['name'] for tag in payload['tags']] == ['Zulu Empty']
def test_tag_rename_api_updates_slug(client, api_token, tagged_data):
    """Renaming a tag through the API also regenerates its slug."""
    tag, _snapshots = tagged_data
    rename_url = f"{reverse('api-1:rename_tag', args=[tag.id])}?api_key={api_token}"
    response = client.post(
        rename_url,
        data=json.dumps({'name': 'Alpha Archive'}),
        content_type='application/json',
        HTTP_HOST=ADMIN_HOST,
    )
    assert response.status_code == 200
    tag.refresh_from_db()
    assert (tag.name, tag.slug) == ('Alpha Archive', 'alpha-archive')
def test_tag_snapshots_export_returns_jsonl(client, api_token, tagged_data):
    """The per-tag snapshot export streams NDJSON with a download filename."""
    tag, _snapshots = tagged_data
    response = client.get(
        reverse('api-1:tag_snapshots_export', args=[tag.id]),
        {'api_key': api_token},
        HTTP_HOST=ADMIN_HOST,
    )
    assert response.status_code == 200
    assert response['Content-Type'].startswith('application/x-ndjson')
    assert f'tag-{tag.slug}-snapshots.jsonl' in response['Content-Disposition']
    exported = response.content.decode()
    for needle in ('"type": "Snapshot"', '"tags": "Alpha Research"'):
        assert needle in exported
def test_tag_urls_export_returns_plain_text_urls(client, api_token, tagged_data):
    """The per-tag URL export is plain text with one snapshot URL per line."""
    tag, snapshots = tagged_data
    response = client.get(
        reverse('api-1:tag_urls_export', args=[tag.id]),
        {'api_key': api_token},
        HTTP_HOST=ADMIN_HOST,
    )
    assert response.status_code == 200
    assert response['Content-Type'].startswith('text/plain')
    assert f'tag-{tag.slug}-urls.txt' in response['Content-Disposition']
    exported_urls = {line for line in response.content.decode().splitlines() if line}
    assert exported_urls == {snapshot.url for snapshot in snapshots}

View File

@@ -55,6 +55,7 @@ def _build_script(body: str) -> str:
get_admin_host,
get_api_host,
get_web_host,
get_public_host,
get_snapshot_host,
get_original_host,
get_listen_subdomain,
@@ -198,6 +199,7 @@ class TestUrlRouting:
web_host = get_web_host()
admin_host = get_admin_host()
api_host = get_api_host()
public_host = get_public_host()
snapshot_host = get_snapshot_host(snapshot_id)
original_host = get_original_host(domain)
base_host = SERVER_CONFIG.LISTEN_HOST
@@ -208,6 +210,7 @@ class TestUrlRouting:
assert web_host == "web.archivebox.localhost:8000"
assert admin_host == "admin.archivebox.localhost:8000"
assert api_host == "api.archivebox.localhost:8000"
assert public_host == "public.archivebox.localhost:8000"
assert snapshot_host == f"{snapshot_id}.archivebox.localhost:8000"
assert original_host == f"{domain}.archivebox.localhost:8000"
assert get_listen_subdomain(web_host) == "web"
@@ -302,6 +305,20 @@ class TestUrlRouting:
assert resp.status_code == 200
assert response_body(resp) == response_file.read_bytes()
resp = client.get("/index.html", HTTP_HOST=snapshot_host)
assert resp.status_code == 200
snapshot_html = response_body(resp).decode("utf-8", "ignore")
assert f"http://{snapshot_host}/" in snapshot_html
assert "See all files..." in snapshot_html
assert ">WARC<" not in snapshot_html
assert ">Media<" not in snapshot_html
assert ">Git<" not in snapshot_html
resp = client.get("/?files=1", HTTP_HOST=snapshot_host)
assert resp.status_code == 200
files_html = response_body(resp).decode("utf-8", "ignore")
assert output_rel.split("/", 1)[0] in files_html
print("OK")
"""
)
@@ -479,6 +496,7 @@ class TestUrlRouting:
snapshot_host = get_snapshot_host(snapshot_id)
admin_host = get_admin_host()
web_host = get_web_host()
public_host = get_public_host()
client = Client()
@@ -491,10 +509,17 @@ class TestUrlRouting:
assert resp.status_code == 200
live_html = response_body(resp).decode("utf-8", "ignore")
assert f"http://{snapshot_host}/" in live_html
assert "http://web.archivebox.localhost:8000" in live_html
assert f"http://{public_host}/static/archive.png" in live_html
assert ">WARC<" not in live_html
assert ">Media<" not in live_html
assert ">Git<" not in live_html
static_html = Path(snapshot.output_dir, "index.html").read_text(encoding="utf-8", errors="ignore")
assert f"http://{snapshot_host}/" in static_html
assert f"http://{public_host}/static/archive.png" in static_html
assert ">WARC<" not in static_html
assert ">Media<" not in static_html
assert ">Git<" not in static_html
client.login(username="testadmin", password="testpassword")
resp = client.get(f"/admin/core/snapshot/{snapshot_id}/change/", HTTP_HOST=admin_host)

View File

@@ -19,12 +19,19 @@ class Command(BaseCommand):
def handle(self, *args, **kwargs):
import os
import subprocess
import sys
import time
import psutil
from archivebox.config.common import STORAGE_CONFIG
from archivebox.machine.models import Machine, Process
from archivebox.workers.supervisord_util import (
RUNNER_WORKER,
get_existing_supervisord_process,
get_worker,
start_worker,
stop_worker,
)
pidfile = kwargs.get("pidfile") or os.environ.get("ARCHIVEBOX_RUNSERVER_PIDFILE")
if not pidfile:
@@ -32,11 +39,38 @@ class Command(BaseCommand):
interval = max(0.2, float(kwargs.get("interval", 1.0)))
last_pid = None
runner_proc: subprocess.Popen[bytes] | None = None
def stop_duplicate_watchers() -> None:
current_pid = os.getpid()
for proc in psutil.process_iter(["pid", "cmdline"]):
if proc.info["pid"] == current_pid:
continue
cmdline = proc.info.get("cmdline") or []
if not cmdline:
continue
if "runner_watch" not in " ".join(cmdline):
continue
if not any(str(arg) == f"--pidfile={pidfile}" or str(arg) == pidfile for arg in cmdline):
continue
try:
proc.terminate()
proc.wait(timeout=2.0)
except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.TimeoutExpired):
try:
proc.kill()
except (psutil.NoSuchProcess, psutil.AccessDenied):
pass
def get_supervisor():
supervisor = get_existing_supervisord_process()
if supervisor is None:
raise RuntimeError("runner_watch requires a running supervisord process")
return supervisor
stop_duplicate_watchers()
start_worker(get_supervisor(), RUNNER_WORKER, lazy=True)
def restart_runner() -> None:
nonlocal runner_proc
Process.cleanup_stale_running()
machine = Machine.current()
@@ -55,29 +89,18 @@ class Command(BaseCommand):
except Exception:
continue
if runner_proc and runner_proc.poll() is None:
try:
runner_proc.terminate()
runner_proc.wait(timeout=2.0)
except Exception:
try:
runner_proc.kill()
except Exception:
pass
supervisor = get_supervisor()
runner_proc = subprocess.Popen(
[sys.executable, '-m', 'archivebox', 'run', '--daemon'],
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
start_new_session=True,
)
try:
stop_worker(supervisor, RUNNER_WORKER["name"])
except Exception:
pass
start_worker(supervisor, RUNNER_WORKER)
def runner_running() -> bool:
return Process.objects.filter(
machine=Machine.current(),
status=Process.StatusChoices.RUNNING,
process_type=Process.TypeChoices.ORCHESTRATOR,
).exists()
proc = get_worker(get_supervisor(), RUNNER_WORKER["name"])
return bool(proc and proc.get("statename") == "RUNNING")
while True:
try:

View File

@@ -6,6 +6,7 @@ import socket
import psutil
import shutil
import subprocess
import shlex
from typing import Dict, cast, Iterator
from pathlib import Path
@@ -29,24 +30,63 @@ WORKERS_DIR_NAME = "workers"
# Global reference to supervisord process for cleanup
_supervisord_proc = None
def _shell_join(args: list[str]) -> str:
return shlex.join(args)
RUNNER_WORKER = {
"name": "worker_runner",
"command": "archivebox run --daemon",
"autostart": "true",
"command": _shell_join([sys.executable, "-m", "archivebox", "run", "--daemon"]),
"autostart": "false",
"autorestart": "true",
"stdout_logfile": "logs/worker_runner.log",
"redirect_stderr": "true",
}
RUNNER_WATCH_WORKER = lambda pidfile: {
"name": "worker_runner_watch",
"command": _shell_join([sys.executable, "-m", "archivebox", "manage", "runner_watch", f"--pidfile={pidfile}"]),
"autostart": "false",
"autorestart": "true",
"stdout_logfile": "logs/worker_runner_watch.log",
"redirect_stderr": "true",
}
SERVER_WORKER = lambda host, port: {
"name": "worker_daphne",
"command": f"{sys.executable} -m daphne --bind={host} --port={port} --application-close-timeout=600 archivebox.core.asgi:application",
"command": _shell_join([sys.executable, "-m", "daphne", f"--bind={host}", f"--port={port}", "--application-close-timeout=600", "archivebox.core.asgi:application"]),
"autostart": "false",
"autorestart": "true",
"stdout_logfile": "logs/worker_daphne.log",
"redirect_stderr": "true",
}
def RUNSERVER_WORKER(host: str, port: str, *, reload: bool, pidfile: str | None = None, nothreading: bool = False):
command = [sys.executable, "-m", "archivebox", "manage", "runserver", f"{host}:{port}"]
if not reload:
command.append("--noreload")
if nothreading:
command.append("--nothreading")
environment = ['ARCHIVEBOX_RUNSERVER="1"']
if reload:
assert pidfile, "RUNSERVER_WORKER requires a pidfile when reload=True"
environment.extend([
'ARCHIVEBOX_AUTORELOAD="1"',
f'ARCHIVEBOX_RUNSERVER_PIDFILE="{pidfile}"',
])
return {
"name": "worker_runserver",
"command": _shell_join(command),
"environment": ",".join(environment),
"autostart": "false",
"autorestart": "true",
"stdout_logfile": "logs/worker_runserver.log",
"redirect_stderr": "true",
}
def is_port_in_use(host: str, port: int) -> bool:
"""Check if a port is already in use."""
try:
@@ -511,16 +551,30 @@ def watch_worker(supervisor, daemon_name, interval=5):
def start_server_workers(host='0.0.0.0', port='8000', daemonize=False):
def start_server_workers(host='0.0.0.0', port='8000', daemonize=False, debug=False, reload=False, nothreading=False):
from archivebox.config.common import STORAGE_CONFIG
supervisor = get_or_create_supervisord_process(daemonize=daemonize)
bg_workers = [RUNNER_WORKER]
if debug:
pidfile = str(STORAGE_CONFIG.TMP_DIR / 'runserver.pid') if reload else None
server_worker = RUNSERVER_WORKER(host=host, port=port, reload=reload, pidfile=pidfile, nothreading=nothreading)
bg_workers: list[tuple[dict[str, str], bool]] = (
[(RUNNER_WORKER, True), (RUNNER_WATCH_WORKER(pidfile), False)] if reload else [(RUNNER_WORKER, False)]
)
log_files = ['logs/worker_runserver.log', 'logs/worker_runner.log']
if reload:
log_files.insert(1, 'logs/worker_runner_watch.log')
else:
server_worker = SERVER_WORKER(host=host, port=port)
bg_workers = [(RUNNER_WORKER, False)]
log_files = ['logs/worker_daphne.log', 'logs/worker_runner.log']
print()
start_worker(supervisor, SERVER_WORKER(host=host, port=port))
start_worker(supervisor, server_worker)
print()
for worker in bg_workers:
start_worker(supervisor, worker)
for worker, lazy in bg_workers:
start_worker(supervisor, worker, lazy=lazy)
print()
if not daemonize:
@@ -529,7 +583,7 @@ def start_server_workers(host='0.0.0.0', port='8000', daemonize=False):
sys.stdout.write('Tailing worker logs (Ctrl+C to stop)...\n\n')
sys.stdout.flush()
tail_multiple_worker_logs(
log_files=['logs/worker_daphne.log', 'logs/worker_runner.log'],
log_files=log_files,
follow=True,
proc=_supervisord_proc, # Stop tailing when supervisord exits
)

View File

@@ -50,10 +50,11 @@ def bg_archive_snapshots(snapshots, kwargs: dict | None = None) -> int:
Snapshot.objects.filter(id=snapshot.id).update(
status=Snapshot.StatusChoices.QUEUED,
retry_at=timezone.now(),
downloaded_at=None,
)
crawl_id = getattr(snapshot, 'crawl_id', None)
if crawl_id:
Crawl.objects.filter(id=crawl_id).exclude(status=Crawl.StatusChoices.SEALED).update(
Crawl.objects.filter(id=crawl_id).update(
status=Crawl.StatusChoices.QUEUED,
retry_at=timezone.now(),
)
@@ -75,10 +76,11 @@ def bg_archive_snapshot(snapshot, overwrite: bool = False, methods: list | None
Snapshot.objects.filter(id=snapshot.id).update(
status=Snapshot.StatusChoices.QUEUED,
retry_at=timezone.now(),
downloaded_at=None,
)
crawl_id = getattr(snapshot, 'crawl_id', None)
if crawl_id:
Crawl.objects.filter(id=crawl_id).exclude(status=Crawl.StatusChoices.SEALED).update(
Crawl.objects.filter(id=crawl_id).update(
status=Crawl.StatusChoices.QUEUED,
retry_at=timezone.now(),
)

View File

@@ -1,36 +1,373 @@
#!/usr/bin/env bash
### Bash Environment Setup
# http://redsymbol.net/articles/unofficial-bash-strict-mode/
# https://www.gnu.org/software/bash/manual/html_node/The-Set-Builtin.html
# set -o xtrace
set -o errexit
set -o errtrace
set -o nounset
set -o pipefail
IFS=$'\n'
set -Eeuo pipefail
IFS=$'\n\t'
REPO_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. && pwd )"
cd "$REPO_DIR"
REPO_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
WORKSPACE_DIR="$(cd "${REPO_DIR}/.." && pwd)"
cd "${REPO_DIR}"
TAG_PREFIX="v"
PYPI_PACKAGE="archivebox"
# Run the linters and tests
# ./bin/lint.sh
# ./bin/test.sh
source_optional_env() {
if [[ -f "${REPO_DIR}/.env" ]]; then
set -a
# shellcheck disable=SC1091
source "${REPO_DIR}/.env"
set +a
fi
}
# # Run all the build scripts
# ./bin/build_git.sh
# ./bin/build_docs.sh
# ./bin/build_pip.sh
# ./bin/build_docker.sh
repo_slug() {
python3 - <<'PY'
import re
import subprocess
# Push release to public repositories
# ./bin/release_docs.sh
./bin/release_git.sh "$@"
./bin/release_pip.sh "$@"
./bin/release_deb.sh "$@"
./bin/release_brew.sh "$@"
./bin/release_docker.sh "$@"
remote = subprocess.check_output(
['git', 'remote', 'get-url', 'origin'],
text=True,
).strip()
VERSION="$(grep '^version = ' "${REPO_DIR}/pyproject.toml" | awk -F'"' '{print $2}')"
echo "[√] Done. Published version v$VERSION"
patterns = [
r'github\.com[:/](?P<slug>[^/]+/[^/.]+)(?:\.git)?$',
r'github\.com/(?P<slug>[^/]+/[^/.]+)(?:\.git)?$',
]
for pattern in patterns:
match = re.search(pattern, remote)
if match:
print(match.group('slug'))
raise SystemExit(0)
raise SystemExit(f'Unable to parse GitHub repo slug from remote: {remote}')
PY
}
# Resolve the repo's default branch name.
# Precedence: explicit $DEFAULT_BRANCH env var, the locally cached
# refs/remotes/origin/HEAD symbolic ref, then a live `git remote show` query.
default_branch() {
    if [[ -n "${DEFAULT_BRANCH:-}" ]]; then
        echo "${DEFAULT_BRANCH}"
        return 0
    fi
    if git symbolic-ref refs/remotes/origin/HEAD >/dev/null 2>&1; then
        git symbolic-ref refs/remotes/origin/HEAD | sed 's#^refs/remotes/origin/##'
        return 0
    fi
    # Slowest fallback: queries the remote over the network.
    git remote show origin | sed -n '/HEAD branch/s/.*: //p' | head -n 1
}
# Print the highest version found across pyproject.toml and etc/package.json
# (the two files can drift; the max wins). Accepts X.Y.Z and X.Y.Z[-]rcN.
current_version() {
    python3 - <<'PY'
from pathlib import Path
import json
import re

versions = []
pyproject_text = Path('pyproject.toml').read_text()
pyproject_match = re.search(r'^version = "([^"]+)"$', pyproject_text, re.MULTILINE)
if pyproject_match:
    versions.append(pyproject_match.group(1))
package_json = json.loads(Path('etc/package.json').read_text())
if 'version' in package_json:
    versions.append(package_json['version'])

def parse(version: str) -> tuple[int, int, int, int, int]:
    # Sort key: the 4th element ranks final releases above rc builds of the
    # same X.Y.Z; the 5th breaks ties between rc builds by rc number.
    match = re.fullmatch(r'(\d+)\.(\d+)\.(\d+)(?:-?rc(\d*))?$', version)
    if not match:
        raise SystemExit(f'Unsupported version format: {version}')
    major, minor, patch, rc = match.groups()
    rc_value = int(rc) if rc else (0 if 'rc' in version else 10_000)
    return (int(major), int(minor), int(patch), 0 if 'rc' in version else 1, rc_value)

print(max(versions, key=parse))
PY
}
# Bump the project version in-place in pyproject.toml + etc/package.json and
# print the new value. rc versions bump the rc number (1.2.3rc1 -> 1.2.3rc2);
# final versions bump the patch (1.2.3 -> 1.2.4).
bump_version() {
    python3 - <<'PY'
from pathlib import Path
import json
import re

def parse(version: str) -> tuple[int, int, int, int, int]:
    # Same ordering key as current_version(): finals outrank rcs of the same patch.
    match = re.fullmatch(r'(\d+)\.(\d+)\.(\d+)(?:-?rc(\d*))?$', version)
    if not match:
        raise SystemExit(f'Unsupported version format: {version}')
    major, minor, patch, rc = match.groups()
    rc_value = int(rc) if rc else (0 if 'rc' in version else 10_000)
    return (int(major), int(minor), int(patch), 0 if 'rc' in version else 1, rc_value)

pyproject_path = Path('pyproject.toml')
pyproject_text = pyproject_path.read_text()
pyproject_match = re.search(r'^version = "([^"]+)"$', pyproject_text, re.MULTILINE)
if not pyproject_match:
    raise SystemExit('Failed to find version in pyproject.toml')
package_path = Path('etc/package.json')
package_json = json.loads(package_path.read_text())
if 'version' not in package_json:
    raise SystemExit('Failed to find version in etc/package.json')

# The two files can disagree; bump from whichever is ahead.
current_version = max([pyproject_match.group(1), package_json['version']], key=parse)
match = re.fullmatch(r'(\d+)\.(\d+)\.(\d+)(?:-?rc(\d*))?$', current_version)
major, minor, patch, rc = match.groups()
if 'rc' in current_version:
    rc_number = int(rc or '0') + 1
    next_version = f'{major}.{minor}.{patch}rc{rc_number}'
else:
    next_version = f'{major}.{minor}.{int(patch) + 1}'

# Write the new version back to both files (first match only in pyproject).
pyproject_path.write_text(
    re.sub(r'^version = "[^"]+"$', f'version = "{next_version}"', pyproject_text, count=1, flags=re.MULTILINE)
)
package_json['version'] = next_version
package_path.write_text(json.dumps(package_json, indent=2) + '\n')
print(next_version)
PY
}
# Print the version declared in <repo_dir>/pyproject.toml; returns 1 when the
# file is missing (callers append `|| true` for optional sibling checkouts).
read_repo_version() {
    local repo_dir="$1"
    if [[ ! -f "${repo_dir}/pyproject.toml" ]]; then
        return 1
    fi
    python3 - "${repo_dir}/pyproject.toml" <<'PY'
from pathlib import Path
import re
import sys

text = Path(sys.argv[1]).read_text()
match = re.search(r'^version = "([^"]+)"$', text, re.MULTILINE)
if not match:
    raise SystemExit('Failed to find version')
print(match.group(1))
PY
}
# Pin the abx* dependency floors in pyproject.toml to whatever versions the
# sibling workspace checkouts currently declare. Repos that are not checked
# out yield an empty version string and are skipped.
update_internal_dependencies() {
    local abxbus_version abx_pkg_version abx_plugins_version abx_dl_version
    abxbus_version="$(read_repo_version "${WORKSPACE_DIR}/abxbus" || true)"
    abx_pkg_version="$(read_repo_version "${WORKSPACE_DIR}/abx-pkg" || true)"
    abx_plugins_version="$(read_repo_version "${WORKSPACE_DIR}/abx-plugins" || true)"
    abx_dl_version="$(read_repo_version "${WORKSPACE_DIR}/abx-dl" || true)"
    python3 - "${abxbus_version}" "${abx_pkg_version}" "${abx_plugins_version}" "${abx_dl_version}" <<'PY'
from pathlib import Path
import re
import sys

path = Path('pyproject.toml')
text = path.read_text()
for name, version in (
    ('abxbus', sys.argv[1]),
    ('abx-pkg', sys.argv[2]),
    ('abx-plugins', sys.argv[3]),
    ('abx-dl', sys.argv[4]),
):
    if version:
        # Rewrites e.g. "abx-pkg>=1.9.14" -> "abx-pkg>=<version>"
        text = re.sub(rf'("{re.escape(name)}>=)[^"]+(")', rf'\g<1>{version}\2', text)
path.write_text(text)
PY
}
# Compare two version strings; prints gt/eq/lt for $1 relative to $2.
# NOTE(review): this parse() scores a final release's rc slot as 0, while
# current_version()/bump_version() use a 10_000 sentinel — harmless today
# because the 4th tuple element already ranks finals above rcs, but the two
# implementations should be kept in sync.
compare_versions() {
    python3 - "$1" "$2" <<'PY'
import re
import sys

def parse(version: str) -> tuple[int, int, int, int, int]:
    match = re.fullmatch(r'(\d+)\.(\d+)\.(\d+)(?:-?rc(\d*))?$', version)
    if not match:
        raise SystemExit(f'Unsupported version format: {version}')
    major, minor, patch, rc = match.groups()
    return (int(major), int(minor), int(patch), 0 if 'rc' in version else 1, int(rc or '0'))

left, right = sys.argv[1], sys.argv[2]
if parse(left) > parse(right):
    print('gt')
elif parse(left) == parse(right):
    print('eq')
else:
    print('lt')
PY
}
# Print the highest version among the repo's GitHub releases (tag prefix
# stripped), or an empty string when there are no releases / the API call fails.
latest_release_version() {
    local slug="$1"
    local raw_tags
    raw_tags="$(gh api "repos/${slug}/releases?per_page=100" --jq '.[].tag_name' || true)"
    # Tag list is passed via env vars to avoid shell-quoting issues in the heredoc.
    RELEASE_TAGS="${raw_tags}" TAG_PREFIX_VALUE="${TAG_PREFIX}" python3 - <<'PY'
import os
import re

def parse(version: str) -> tuple[int, int, int, int, int]:
    # Unparseable tags sort below everything instead of aborting the script.
    match = re.fullmatch(r'(\d+)\.(\d+)\.(\d+)(?:-?rc(\d*))?$', version)
    if not match:
        return (-1, -1, -1, -1, -1)
    major, minor, patch, rc = match.groups()
    return (int(major), int(minor), int(patch), 0 if 'rc' in version else 1, int(rc or '0'))

prefix = os.environ.get('TAG_PREFIX_VALUE', '')
versions = [line.strip() for line in os.environ.get('RELEASE_TAGS', '').splitlines() if line.strip()]
if prefix:
    versions = [version[len(prefix):] if version.startswith(prefix) else version for version in versions]
if not versions:
    print('')
else:
    print(max(versions, key=parse))
PY
}
# Block until the GitHub Actions runs triggered by a commit/event finish.
# Polls up to ~5 minutes (30 x 10s) for runs to appear, then watches each one
# to completion; `--exit-status` makes a failed run abort the script (errexit).
wait_for_runs() {
    local slug="$1"
    local event="$2"
    local sha="$3"
    local label="$4"
    local runs_json
    local attempts=0
    while :; do
        runs_json="$(GH_FORCE_TTY=0 GH_PAGER=cat gh run list --repo "${slug}" --event "${event}" --commit "${sha}" --limit 20 --json databaseId,status,conclusion,workflowName)"
        if [[ "$(jq 'length' <<<"${runs_json}")" -gt 0 ]]; then
            break
        fi
        attempts=$((attempts + 1))
        if [[ "${attempts}" -ge 30 ]]; then
            echo "Timed out waiting for ${label} workflows to start" >&2
            return 1
        fi
        sleep 10
    done
    # Watch each discovered run serially until it completes.
    while read -r run_id; do
        gh run watch "${run_id}" --repo "${slug}" --exit-status
    done < <(jq -r '.[].databaseId' <<<"${runs_json}")
}
# Poll PyPI until the given package version is live, or fail after 30 tries
# (~5 minutes). No sleep after the final failed attempt.
wait_for_pypi() {
    local pkg="$1"
    local want="$2"
    local try live
    for try in $(seq 1 30); do
        live="$(curl -fsSL "https://pypi.org/pypi/${pkg}/json" | jq -r '.info.version')"
        if [[ "${live}" == "${want}" ]]; then
            return 0
        fi
        if (( try < 30 )); then
            sleep 10
        fi
    done
    echo "Timed out waiting for ${pkg}==${want} on PyPI" >&2
    return 1
}
# Sanity-check the tree before tagging: fresh dependency resolution plus a
# full build of the distributable artifacts into dist/.
run_checks() {
    uv sync --all-extras --all-groups --no-cache --upgrade
    uv build --all
}
# CI guard (runs on every push): fail if the checked-in version has fallen
# behind the newest published GitHub release. Only enforced on the default
# branch; other branches are allowed to lag.
validate_release_state() {
    local slug="$1"
    local branch="$2"
    local current latest relation
    if [[ "$(git branch --show-current)" != "${branch}" ]]; then
        echo "Skipping release-state validation on non-default branch $(git branch --show-current)"
        return 0
    fi
    current="$(current_version)"
    latest="$(latest_release_version "${slug}")"
    if [[ -z "${latest}" ]]; then
        # A repo with no releases yet can't be behind anything.
        echo "No published releases found for ${slug}; release state is valid"
        return 0
    fi
    relation="$(compare_versions "${current}" "${latest}")"
    if [[ "${relation}" == "lt" ]]; then
        echo "Current version ${current} is behind latest published version ${latest}" >&2
        return 1
    fi
    echo "Release state is valid: local=${current} latest=${latest}"
}
# Create (and implicitly tag) a GitHub release for the current HEAD commit.
# rc versions (e.g. 1.2.3rc1) are marked as prereleases.
create_release() {
    local slug="$1"
    local version="$2"
    local prerelease_args=()
    if [[ "${version}" == *rc* ]]; then
        prerelease_args+=(--prerelease)
    fi
    # ${arr[@]+...} guards the expansion: under `set -u` (enabled at the top
    # of this script), expanding an EMPTY array is a fatal "unbound variable"
    # error on bash < 4.4 (e.g. the default macOS /bin/bash 3.2), which would
    # break every non-rc release run there.
    gh release create "${TAG_PREFIX}${version}" \
        --repo "${slug}" \
        --target "$(git rev-parse HEAD)" \
        --title "${TAG_PREFIX}${version}" \
        --generate-notes \
        ${prerelease_args[@]+"${prerelease_args[@]}"}
}
# Upload dist/* to PyPI and block until the new version is visible there.
# Credential precedence: UV_PUBLISH_TOKEN > PYPI_TOKEN > PYPI_PAT_SECRET;
# inside GitHub Actions with no token, fall back to OIDC trusted publishing.
publish_artifacts() {
    local version="$1"
    local pypi_token="${UV_PUBLISH_TOKEN:-${PYPI_TOKEN:-${PYPI_PAT_SECRET:-}}}"
    if [[ -n "${pypi_token}" ]]; then
        UV_PUBLISH_TOKEN="${pypi_token}" uv publish --username=__token__ dist/*
    elif [[ -n "${GITHUB_ACTIONS:-}" ]]; then
        uv publish --trusted-publishing always dist/*
    else
        echo "Missing PyPI credentials: set UV_PUBLISH_TOKEN or PYPI_TOKEN" >&2
        return 1
    fi
    wait_for_pypi "${PYPI_PACKAGE}" "${version}"
}
# Entry point. On CI `push` events this only validates release state; any
# other invocation (workflow_dispatch / local) performs the full release:
# update deps -> bump version -> build -> commit+push -> wait for CI ->
# publish to PyPI -> create GitHub release -> verify.
main() {
    local slug branch version latest relation
    source_optional_env
    slug="$(repo_slug)"
    branch="$(default_branch)"
    if [[ "${GITHUB_EVENT_NAME:-}" == "push" ]]; then
        validate_release_state "${slug}" "${branch}"
        return 0
    fi
    # Real releases may only be cut from the default branch.
    if [[ "$(git branch --show-current)" != "${branch}" ]]; then
        echo "Release must run from ${branch}, found $(git branch --show-current)" >&2
        return 1
    fi
    update_internal_dependencies
    version="$(bump_version)"
    run_checks
    git add -A
    git commit -m "release: ${TAG_PREFIX}${version}"
    git push origin "${branch}"
    # Ensure the push-triggered CI is green before doing anything irreversible.
    wait_for_runs "${slug}" push "$(git rev-parse HEAD)" "push"
    publish_artifacts "${version}"
    create_release "${slug}" "${version}"
    # Double-check GitHub now reports exactly the version we just released.
    latest="$(latest_release_version "${slug}")"
    relation="$(compare_versions "${latest}" "${version}")"
    if [[ "${relation}" != "eq" ]]; then
        echo "GitHub release version mismatch: expected ${version}, got ${latest}" >&2
        return 1
    fi
    echo "Released ${PYPI_PACKAGE} ${version}"
}
main "$@"

View File

@@ -1,6 +1,6 @@
[project]
name = "archivebox"
version = "0.9.10rc1"
version = "0.9.10rc2"
requires-python = ">=3.13"
description = "Self-hosted internet archiving solution."
authors = [{name = "Nick Sweeting", email = "pyproject.toml@archivebox.io"}]
@@ -78,9 +78,10 @@ dependencies = [
"w3lib>=2.2.1", # used for parsing content-type encoding from http response headers & html tags
### Extractor dependencies (optional binaries detected at runtime via shutil.which)
### Binary/Package Management
"abx-pkg>=1.9.14", # for: detecting, versioning, and installing binaries via apt/brew/pip/npm
"abx-plugins>=1.9.18", # shared ArchiveBox plugin package with install_args-only overrides
"abx-dl>=1.10.13", # shared ArchiveBox downloader package with install_args-only overrides
"abxbus>=2.4.2", # explicit direct dep so local dev env resolves sibling abxbus repo, matching abx-dl EventBus API
"abx-pkg>=1.9.18", # for: detecting, versioning, and installing binaries via apt/brew/pip/npm
"abx-plugins>=1.10.14", # shared ArchiveBox plugin package with install_args-only overrides
"abx-dl>=1.10.14", # shared ArchiveBox downloader package with install_args-only overrides
### UUID7 backport for Python <3.14
"uuid7>=0.1.0; python_version < '3.14'", # provides the uuid_extensions module on Python 3.13
]
@@ -156,9 +157,11 @@ environments = ["sys_platform == 'darwin'", "sys_platform == 'linux'"]
package = true
# compile-bytecode = true
[tool.uv.pip]
python-version = "3.13"
# compile-bytecode = true
[tool.uv.sources]
abxbus = { path = "../abxbus", editable = true }
abx-pkg = { path = "../abx-pkg", editable = true }
abx-plugins = { path = "../abx-plugins", editable = true }
abx-dl = { path = "../abx-dl", editable = true }
[build-system]
requires = ["pdm-backend"]

154
uv.lock generated
View File

@@ -14,8 +14,8 @@ supported-markers = [
[[package]]
name = "abx-dl"
version = "1.10.13"
source = { registry = "https://pypi.org/simple" }
version = "1.10.14"
source = { editable = "../abx-dl" }
dependencies = [
{ name = "abx-pkg", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "abx-plugins", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
@@ -27,44 +27,110 @@ dependencies = [
{ name = "rich", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "rich-click", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/10/de/d9d5a398f053f899fc62d45b9d21eb85412c6ca7d32099c25b9b43f84e32/abx_dl-1.10.13.tar.gz", hash = "sha256:f9fef6119691e07e1792593ed5bcd8de2f84df9d01e77966006d743593c611aa", size = 58200, upload-time = "2026-03-21T18:47:20.901Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/e7/11/670fbdc0afe2274893b63774643f6bb44f09d4975d3968cf394384af1306/abx_dl-1.10.13-py3-none-any.whl", hash = "sha256:cd4aab469563b1c7d9f9202161d94ba7de62cf31fbe924f6fe6f51ad051f4d70", size = 62597, upload-time = "2026-03-21T18:47:19.573Z" },
[package.metadata]
requires-dist = [
{ name = "abx-pkg", editable = "../abx-pkg" },
{ name = "abx-plugins", editable = "../abx-plugins" },
{ name = "abxbus", editable = "../abxbus" },
{ name = "flake8", marker = "extra == 'dev'", specifier = ">=7.1.1" },
{ name = "flask", marker = "extra == 'dev'", specifier = ">=3.0" },
{ name = "mypy", marker = "extra == 'dev'", specifier = ">=1.11.2" },
{ name = "platformdirs", specifier = ">=4.0.0" },
{ name = "psutil", specifier = ">=7.2.1" },
{ name = "pydantic", specifier = ">=2.0.0" },
{ name = "pytest", marker = "extra == 'dev'", specifier = ">=8.0.0" },
{ name = "pytest-xdist", marker = "extra == 'dev'", specifier = ">=3.5.0" },
{ name = "requests", specifier = ">=2.28.0" },
{ name = "rich", specifier = ">=13.0.0" },
{ name = "rich-click", specifier = ">=1.8.0" },
{ name = "ruff", marker = "extra == 'dev'", specifier = ">=0.6.6" },
]
provides-extras = ["dev"]
[package.metadata.requires-dev]
dev = [
{ name = "prek", specifier = ">=0.3.6" },
{ name = "pyright", specifier = ">=1.1.408" },
{ name = "ruff", specifier = ">=0.15.7" },
{ name = "ty", specifier = ">=0.0.24" },
]
[[package]]
name = "abx-pkg"
version = "1.9.14"
source = { registry = "https://pypi.org/simple" }
version = "1.9.18"
source = { editable = "../abx-pkg" }
dependencies = [
{ name = "pip", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "platformdirs", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "pydantic", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "typing-extensions", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/f9/6e/4465d44686b40ab0361d153160e2bd0167f588756518084308a8e8d08d8c/abx_pkg-1.9.14.tar.gz", hash = "sha256:b94d42cdbc6dde88635903cf14977b34e552d807a72c03d60f27f075deb59952", size = 146811, upload-time = "2026-03-21T07:44:12.158Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/56/af/8e65a23d44e5ccc069c344a7a626f575498b3c1a3ccacb17e941b36ecd35/abx_pkg-1.9.14-py3-none-any.whl", hash = "sha256:cf89dc4c5737e2078cb05fa7e33683718d540391a018445b6e54aa22666f25e0", size = 63511, upload-time = "2026-03-21T07:44:11.038Z" },
[package.metadata]
requires-dist = [
{ name = "abx-pkg", extras = ["rich", "pyinfra", "ansible"], marker = "extra == 'all'" },
{ name = "ansible", marker = "extra == 'ansible'", specifier = ">=12.3.0" },
{ name = "ansible-core", marker = "extra == 'ansible'", specifier = ">=2.0.0" },
{ name = "ansible-runner", marker = "extra == 'ansible'", specifier = ">=2.4.2" },
{ name = "pip", specifier = ">=26.0.1" },
{ name = "platformdirs", specifier = ">=4.9.2" },
{ name = "pydantic", specifier = ">=2.12.5" },
{ name = "pyinfra", marker = "extra == 'pyinfra'", specifier = ">=3.6.1" },
{ name = "rich", marker = "extra == 'rich'", specifier = ">=14.0.0" },
{ name = "typing-extensions", specifier = ">=4.15.0" },
]
provides-extras = ["rich", "pyinfra", "ansible", "all"]
[package.metadata.requires-dev]
dev = [
{ name = "django", specifier = ">=4.0" },
{ name = "django-admin-data-views", specifier = ">=0.3.1" },
{ name = "django-jsonform", specifier = ">=2.22.0" },
{ name = "django-pydantic-field", specifier = ">=0.3.9" },
{ name = "django-stubs", specifier = ">=5.0.0" },
{ name = "mypy", specifier = ">=1.19.1" },
{ name = "prek", specifier = ">=0.3.6" },
{ name = "pyright" },
{ name = "pytest", specifier = ">=9.0.2" },
{ name = "rich", specifier = ">=14.0.0" },
{ name = "ruff", specifier = ">=0.15.7" },
{ name = "ty", specifier = ">=0.0.24" },
]
[[package]]
name = "abx-plugins"
version = "1.10.13"
source = { registry = "https://pypi.org/simple" }
version = "1.10.14"
source = { editable = "../abx-plugins" }
dependencies = [
{ name = "abx-pkg", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "pydantic-settings", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "rich-click", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/2b/ea/7e70fa30a1e52039decd8b755b22549b8c51fb9d97cf54751b6fd1af7f2d/abx_plugins-1.10.13.tar.gz", hash = "sha256:945623afc6436894d26e8e27ce6101032b0c42655d5cbfaeeaa8a57913d0d46a", size = 525322, upload-time = "2026-03-21T17:39:10.142Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/29/25/d5338a5a7a1958916e7104727046ec01744da3fb28b1e30934480ab57f65/abx_plugins-1.10.13-py3-none-any.whl", hash = "sha256:79353763baf685871d52ea7e5fa8d0249937ec9edb2f63c7768b0c0a98d5518e", size = 731961, upload-time = "2026-03-21T17:39:11.713Z" },
[package.metadata]
requires-dist = [
{ name = "abx-pkg", editable = "../abx-pkg" },
{ name = "feedparser", marker = "extra == 'dev'", specifier = ">=6.0.0" },
{ name = "jinja2", marker = "extra == 'dev'", specifier = ">=3.1.0" },
{ name = "pydantic-settings", specifier = ">=2.0.0" },
{ name = "pyright", marker = "extra == 'dev'", specifier = ">=1.1.408" },
{ name = "pytest", marker = "extra == 'dev'", specifier = ">=9.0.2" },
{ name = "pytest-httpserver", marker = "extra == 'dev'", specifier = ">=1.1.0" },
{ name = "requests", marker = "extra == 'dev'", specifier = ">=2.28.0" },
{ name = "rich-click", specifier = ">=1.9.7" },
{ name = "ruff", marker = "extra == 'dev'", specifier = ">=0.15.2" },
{ name = "ty", marker = "extra == 'dev'", specifier = ">=0.0.18" },
]
provides-extras = ["dev"]
[package.metadata.requires-dev]
dev = [{ name = "prek", specifier = ">=0.3.6" }]
[[package]]
name = "abxbus"
version = "2.4.2"
source = { registry = "https://pypi.org/simple" }
version = "2.4.7"
source = { editable = "../abxbus" }
dependencies = [
{ name = "aiofiles", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "anyio", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
@@ -73,9 +139,41 @@ dependencies = [
{ name = "typing-extensions", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "uuid7", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/14/e5/ddf5dab0db243ddd9b193a4461a2d07f3d554b595c77e58af0beceb60eb2/abxbus-2.4.2.tar.gz", hash = "sha256:1c8056655decc81d28a8622f313109df9da36bde77175b0388a0ab9300b878a8", size = 114123, upload-time = "2026-03-20T21:09:35.643Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/86/c8/7815696415e66a7753112062a1357457f1cdd52d623964942f9086872dcb/abxbus-2.4.2-py3-none-any.whl", hash = "sha256:bd2058280fea91a021b604fdc32c4e4e690dfdee848fa50ea746cd786581f923", size = 110208, upload-time = "2026-03-20T21:09:33.942Z" },
[package.metadata]
requires-dist = [
{ name = "aiofiles", specifier = ">=24.1.0" },
{ name = "anyio", specifier = ">=4.9.0" },
{ name = "asyncpg", marker = "extra == 'bridges'", specifier = ">=0.31.0" },
{ name = "asyncpg", marker = "extra == 'postgres'", specifier = ">=0.31.0" },
{ name = "nats-py", marker = "extra == 'bridges'", specifier = ">=2.13.1" },
{ name = "nats-py", marker = "extra == 'nats'", specifier = ">=2.13.1" },
{ name = "portalocker", specifier = ">=2.7.0" },
{ name = "pydantic", specifier = ">=2.11.5" },
{ name = "redis", marker = "extra == 'bridges'", specifier = ">=7.1.1" },
{ name = "redis", marker = "extra == 'redis'", specifier = ">=7.1.1" },
{ name = "typing-extensions", specifier = ">=4.12.2" },
{ name = "uuid7", specifier = ">=0.1.0" },
]
provides-extras = ["postgres", "nats", "redis", "bridges"]
[package.metadata.requires-dev]
dev = [
{ name = "build", specifier = ">=1.2.2" },
{ name = "codespell", specifier = ">=2.4.1" },
{ name = "fastapi", specifier = ">=0.118.0" },
{ name = "ipdb", specifier = ">=0.13.13" },
{ name = "prek", specifier = ">=0.3.3" },
{ name = "psutil", specifier = ">=7.0.0" },
{ name = "pyright", specifier = ">=1.1.404" },
{ name = "pytest", specifier = ">=8.3.5" },
{ name = "pytest-asyncio", specifier = ">=1.1.0" },
{ name = "pytest-cov", specifier = ">=6.2.1" },
{ name = "pytest-httpserver", specifier = ">=1.0.8" },
{ name = "pytest-timeout", specifier = ">=2.4.0" },
{ name = "pytest-xdist", specifier = ">=3.7.0" },
{ name = "ruff", specifier = ">=0.15.1" },
{ name = "ty", specifier = ">=0.0.1a19" },
]
[[package]]
@@ -119,12 +217,13 @@ wheels = [
[[package]]
name = "archivebox"
version = "0.9.10rc1"
version = "0.9.10rc2"
source = { editable = "." }
dependencies = [
{ name = "abx-dl", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "abx-pkg", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "abx-plugins", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "abxbus", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "atomicwrites", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "base32-crockford", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "click", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
@@ -213,9 +312,10 @@ dev = [
[package.metadata]
requires-dist = [
{ name = "abx-dl", specifier = ">=1.10.13" },
{ name = "abx-pkg", specifier = ">=1.9.14" },
{ name = "abx-plugins", specifier = ">=1.9.18" },
{ name = "abx-dl", editable = "../abx-dl" },
{ name = "abx-pkg", editable = "../abx-pkg" },
{ name = "abx-plugins", editable = "../abx-plugins" },
{ name = "abxbus", editable = "../abxbus" },
{ name = "archivebox", extras = ["sonic", "ldap", "debug"], marker = "extra == 'all'" },
{ name = "atomicwrites", specifier = "==1.4.1" },
{ name = "base32-crockford", specifier = ">=0.3.0" },
@@ -1856,16 +1956,16 @@ wheels = [
[[package]]
name = "pytest-cov"
version = "7.0.0"
version = "7.1.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "coverage", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "pluggy", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "pytest", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/5e/f7/c933acc76f5208b3b00089573cf6a2bc26dc80a8aece8f52bb7d6b1855ca/pytest_cov-7.0.0.tar.gz", hash = "sha256:33c97eda2e049a0c5298e91f519302a1334c26ac65c1a483d6206fd458361af1", size = 54328, upload-time = "2025-09-09T10:57:02.113Z" }
sdist = { url = "https://files.pythonhosted.org/packages/b1/51/a849f96e117386044471c8ec2bd6cfebacda285da9525c9106aeb28da671/pytest_cov-7.1.0.tar.gz", hash = "sha256:30674f2b5f6351aa09702a9c8c364f6a01c27aae0c1366ae8016160d1efc56b2", size = 55592, upload-time = "2026-03-21T20:11:16.284Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/ee/49/1377b49de7d0c1ce41292161ea0f721913fa8722c19fb9c1e3aa0367eecb/pytest_cov-7.0.0-py3-none-any.whl", hash = "sha256:3b8e9558b16cc1479da72058bdecf8073661c7f57f7d3c5f22a1c23507f2d861", size = 22424, upload-time = "2025-09-09T10:57:00.695Z" },
{ url = "https://files.pythonhosted.org/packages/9d/7a/d968e294073affff457b041c2be9868a40c1c71f4a35fcc1e45e5493067b/pytest_cov-7.1.0-py3-none-any.whl", hash = "sha256:a0461110b7865f9a271aa1b51e516c9a95de9d696734a2f71e3e78f46e1d4678", size = 22876, upload-time = "2026-03-21T20:11:14.438Z" },
]
[[package]]