mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
WIP: checkpoint working tree before rebasing onto dev
This commit is contained in:
45
.github/workflows/release-runner.yml
vendored
Normal file
45
.github/workflows/release-runner.yml
vendored
Normal file
@@ -0,0 +1,45 @@
|
||||
name: Release State
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- '**'
|
||||
workflow_dispatch:
|
||||
|
||||
permissions:
|
||||
contents: write
|
||||
id-token: write
|
||||
|
||||
jobs:
|
||||
release-state:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
submodules: true
|
||||
ref: ${{ github.ref_name }}
|
||||
|
||||
- uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.13"
|
||||
|
||||
- uses: astral-sh/setup-uv@v6
|
||||
with:
|
||||
enable-cache: true
|
||||
|
||||
- uses: actions/setup-node@v4
|
||||
with:
|
||||
node-version: 22
|
||||
|
||||
- name: Configure git identity
|
||||
run: |
|
||||
git config user.name "github-actions[bot]"
|
||||
git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
|
||||
|
||||
- name: Run release script
|
||||
env:
|
||||
DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
|
||||
GH_TOKEN: ${{ github.token }}
|
||||
PYPI_PAT_SECRET: ${{ secrets.PYPI_PAT_SECRET }}
|
||||
run: ./bin/release.sh
|
||||
1
.github/workflows/release.yml
vendored
1
.github/workflows/release.yml
vendored
@@ -9,7 +9,6 @@ name: Release
|
||||
# This workflow ensures the correct ordering during a release.
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
release:
|
||||
types: [published]
|
||||
|
||||
|
||||
@@ -6,8 +6,9 @@ from django.views.generic.base import RedirectView
|
||||
from .v1_api import urls as v1_api_urls
|
||||
|
||||
urlpatterns = [
|
||||
path("", RedirectView.as_view(url='/api/v1')),
|
||||
path("", RedirectView.as_view(url='/api/v1/docs')),
|
||||
|
||||
path("v1/", RedirectView.as_view(url='/api/v1/docs')),
|
||||
path("v1/", v1_api_urls),
|
||||
path("v1", RedirectView.as_view(url='/api/v1/docs')),
|
||||
|
||||
|
||||
@@ -6,7 +6,8 @@ from typing import List, Optional, Union, Any, Annotated
|
||||
from datetime import datetime
|
||||
|
||||
from django.db.models import Model, Q
|
||||
from django.http import HttpRequest
|
||||
from django.conf import settings
|
||||
from django.http import HttpRequest, HttpResponse
|
||||
from django.core.exceptions import ValidationError
|
||||
from django.contrib.auth import get_user_model
|
||||
from django.contrib.auth.models import User
|
||||
@@ -18,6 +19,22 @@ from ninja.pagination import paginate, PaginationBase
|
||||
from ninja.errors import HttpError
|
||||
|
||||
from archivebox.core.models import Snapshot, ArchiveResult, Tag
|
||||
from archivebox.api.auth import auth_using_token
|
||||
from archivebox.config.common import SERVER_CONFIG
|
||||
from archivebox.core.tag_utils import (
|
||||
build_tag_cards,
|
||||
delete_tag as delete_tag_record,
|
||||
export_tag_snapshots_jsonl,
|
||||
export_tag_urls,
|
||||
get_matching_tags,
|
||||
get_or_create_tag,
|
||||
get_tag_by_ref,
|
||||
normalize_created_by_filter,
|
||||
normalize_created_year_filter,
|
||||
normalize_has_snapshots_filter,
|
||||
normalize_tag_sort,
|
||||
rename_tag as rename_tag_record,
|
||||
)
|
||||
from archivebox.crawls.models import Crawl
|
||||
from archivebox.api.v1_crawls import CrawlSchema
|
||||
|
||||
@@ -404,7 +421,7 @@ class TagSchema(Schema):
|
||||
def get_tags(request: HttpRequest):
|
||||
setattr(request, 'with_snapshots', False)
|
||||
setattr(request, 'with_archiveresults', False)
|
||||
return Tag.objects.all().distinct()
|
||||
return get_matching_tags()
|
||||
|
||||
|
||||
@router.get("/tag/{tag_id}", response=TagSchema, url_name="get_tag")
|
||||
@@ -412,9 +429,9 @@ def get_tag(request: HttpRequest, tag_id: str, with_snapshots: bool = True):
|
||||
setattr(request, 'with_snapshots', with_snapshots)
|
||||
setattr(request, 'with_archiveresults', False)
|
||||
try:
|
||||
return Tag.objects.get(id__icontains=tag_id)
|
||||
return get_tag_by_ref(tag_id)
|
||||
except (Tag.DoesNotExist, ValidationError):
|
||||
return Tag.objects.get(slug__icontains=tag_id)
|
||||
raise HttpError(404, 'Tag not found')
|
||||
|
||||
|
||||
@router.get("/any/{id}", response=Union[SnapshotSchema, ArchiveResultSchema, TagSchema, CrawlSchema], url_name="get_any", summary="Get any object by its ID")
|
||||
@@ -459,6 +476,55 @@ class TagCreateResponseSchema(Schema):
|
||||
created: bool
|
||||
|
||||
|
||||
class TagSearchSnapshotSchema(Schema):
|
||||
id: str
|
||||
title: str
|
||||
url: str
|
||||
favicon_url: str
|
||||
admin_url: str
|
||||
archive_url: str
|
||||
downloaded_at: Optional[str] = None
|
||||
|
||||
|
||||
class TagSearchCardSchema(Schema):
|
||||
id: int
|
||||
name: str
|
||||
slug: str
|
||||
num_snapshots: int
|
||||
filter_url: str
|
||||
edit_url: str
|
||||
export_urls_url: str
|
||||
export_jsonl_url: str
|
||||
rename_url: str
|
||||
delete_url: str
|
||||
snapshots: List[TagSearchSnapshotSchema]
|
||||
|
||||
|
||||
class TagSearchResponseSchema(Schema):
|
||||
tags: List[TagSearchCardSchema]
|
||||
sort: str
|
||||
created_by: str
|
||||
year: str
|
||||
has_snapshots: str
|
||||
|
||||
|
||||
class TagUpdateSchema(Schema):
|
||||
name: str
|
||||
|
||||
|
||||
class TagUpdateResponseSchema(Schema):
|
||||
success: bool
|
||||
tag_id: int
|
||||
tag_name: str
|
||||
slug: str
|
||||
|
||||
|
||||
class TagDeleteResponseSchema(Schema):
|
||||
success: bool
|
||||
tag_id: int
|
||||
deleted_count: int
|
||||
|
||||
|
||||
class TagSnapshotRequestSchema(Schema):
|
||||
snapshot_id: str
|
||||
tag_name: Optional[str] = None
|
||||
@@ -471,41 +537,82 @@ class TagSnapshotResponseSchema(Schema):
|
||||
tag_name: str
|
||||
|
||||
|
||||
@router.get("/tags/autocomplete/", response=TagAutocompleteSchema, url_name="tags_autocomplete")
|
||||
@router.get("/tags/search/", response=TagSearchResponseSchema, url_name="search_tags")
|
||||
def search_tags(
|
||||
request: HttpRequest,
|
||||
q: str = "",
|
||||
sort: str = 'created_desc',
|
||||
created_by: str = '',
|
||||
year: str = '',
|
||||
has_snapshots: str = 'all',
|
||||
):
|
||||
"""Return detailed tag cards for admin/live-search UIs."""
|
||||
normalized_sort = normalize_tag_sort(sort)
|
||||
normalized_created_by = normalize_created_by_filter(created_by)
|
||||
normalized_year = normalize_created_year_filter(year)
|
||||
normalized_has_snapshots = normalize_has_snapshots_filter(has_snapshots)
|
||||
return {
|
||||
'tags': build_tag_cards(
|
||||
query=q,
|
||||
request=request,
|
||||
sort=normalized_sort,
|
||||
created_by=normalized_created_by,
|
||||
year=normalized_year,
|
||||
has_snapshots=normalized_has_snapshots,
|
||||
),
|
||||
'sort': normalized_sort,
|
||||
'created_by': normalized_created_by,
|
||||
'year': normalized_year,
|
||||
'has_snapshots': normalized_has_snapshots,
|
||||
}
|
||||
|
||||
|
||||
def _public_tag_listing_enabled() -> bool:
|
||||
explicit = getattr(settings, 'PUBLIC_SNAPSHOTS_LIST', None)
|
||||
if explicit is not None:
|
||||
return bool(explicit)
|
||||
return bool(getattr(settings, 'PUBLIC_INDEX', SERVER_CONFIG.PUBLIC_INDEX))
|
||||
|
||||
|
||||
def _request_has_tag_autocomplete_access(request: HttpRequest) -> bool:
|
||||
user = getattr(request, 'user', None)
|
||||
if getattr(user, 'is_authenticated', False):
|
||||
return True
|
||||
|
||||
token = request.GET.get('api_key') or request.headers.get('X-ArchiveBox-API-Key')
|
||||
auth_header = request.headers.get('Authorization', '')
|
||||
if not token and auth_header.lower().startswith('bearer '):
|
||||
token = auth_header.split(None, 1)[1].strip()
|
||||
|
||||
if token and auth_using_token(token=token, request=request):
|
||||
return True
|
||||
|
||||
return _public_tag_listing_enabled()
|
||||
|
||||
|
||||
@router.get("/tags/autocomplete/", response=TagAutocompleteSchema, url_name="tags_autocomplete", auth=None)
|
||||
def tags_autocomplete(request: HttpRequest, q: str = ""):
|
||||
"""Return tags matching the query for autocomplete."""
|
||||
if not q:
|
||||
# Return all tags if no query (limited to 50)
|
||||
tags = Tag.objects.all().order_by('name')[:50]
|
||||
else:
|
||||
tags = Tag.objects.filter(name__icontains=q).order_by('name')[:20]
|
||||
if not _request_has_tag_autocomplete_access(request):
|
||||
raise HttpError(401, 'Authentication required')
|
||||
|
||||
tags = get_matching_tags(q)[:50 if not q else 20]
|
||||
|
||||
return {
|
||||
'tags': [{'id': tag.pk, 'name': tag.name, 'slug': tag.slug} for tag in tags]
|
||||
'tags': [{'id': tag.pk, 'name': tag.name, 'slug': tag.slug, 'num_snapshots': getattr(tag, 'num_snapshots', 0)} for tag in tags]
|
||||
}
|
||||
|
||||
|
||||
@router.post("/tags/create/", response=TagCreateResponseSchema, url_name="tags_create")
|
||||
def tags_create(request: HttpRequest, data: TagCreateSchema):
|
||||
"""Create a new tag or return existing one."""
|
||||
name = data.name.strip()
|
||||
if not name:
|
||||
raise HttpError(400, 'Tag name is required')
|
||||
|
||||
tag, created = Tag.objects.get_or_create(
|
||||
name__iexact=name,
|
||||
defaults={
|
||||
'name': name,
|
||||
'created_by': request.user if request.user.is_authenticated else None,
|
||||
}
|
||||
)
|
||||
|
||||
# If found by case-insensitive match, use that tag
|
||||
if not created:
|
||||
existing_tag = Tag.objects.filter(name__iexact=name).first()
|
||||
if existing_tag is None:
|
||||
raise HttpError(500, 'Failed to load existing tag after get_or_create')
|
||||
tag = existing_tag
|
||||
try:
|
||||
tag, created = get_or_create_tag(
|
||||
data.name,
|
||||
created_by=request.user if request.user.is_authenticated else None,
|
||||
)
|
||||
except ValueError as err:
|
||||
raise HttpError(400, str(err)) from err
|
||||
|
||||
return {
|
||||
'success': True,
|
||||
@@ -515,6 +622,62 @@ def tags_create(request: HttpRequest, data: TagCreateSchema):
|
||||
}
|
||||
|
||||
|
||||
@router.post("/tag/{tag_id}/rename", response=TagUpdateResponseSchema, url_name="rename_tag")
|
||||
def rename_tag(request: HttpRequest, tag_id: int, data: TagUpdateSchema):
|
||||
try:
|
||||
tag = rename_tag_record(get_tag_by_ref(tag_id), data.name)
|
||||
except Tag.DoesNotExist as err:
|
||||
raise HttpError(404, 'Tag not found') from err
|
||||
except ValueError as err:
|
||||
raise HttpError(400, str(err)) from err
|
||||
|
||||
return {
|
||||
'success': True,
|
||||
'tag_id': tag.pk,
|
||||
'tag_name': tag.name,
|
||||
'slug': tag.slug,
|
||||
}
|
||||
|
||||
|
||||
@router.delete("/tag/{tag_id}", response=TagDeleteResponseSchema, url_name="delete_tag")
|
||||
def delete_tag(request: HttpRequest, tag_id: int):
|
||||
try:
|
||||
tag = get_tag_by_ref(tag_id)
|
||||
except Tag.DoesNotExist as err:
|
||||
raise HttpError(404, 'Tag not found') from err
|
||||
|
||||
deleted_count, _ = delete_tag_record(tag)
|
||||
return {
|
||||
'success': True,
|
||||
'tag_id': int(tag_id),
|
||||
'deleted_count': deleted_count,
|
||||
}
|
||||
|
||||
|
||||
@router.get("/tag/{tag_id}/urls.txt", url_name="tag_urls_export")
|
||||
def tag_urls_export(request: HttpRequest, tag_id: int):
|
||||
try:
|
||||
tag = get_tag_by_ref(tag_id)
|
||||
except Tag.DoesNotExist as err:
|
||||
raise HttpError(404, 'Tag not found') from err
|
||||
|
||||
response = HttpResponse(export_tag_urls(tag), content_type='text/plain; charset=utf-8')
|
||||
response['Content-Disposition'] = f'attachment; filename="tag-{tag.slug}-urls.txt"'
|
||||
return response
|
||||
|
||||
|
||||
@router.get("/tag/{tag_id}/snapshots.jsonl", url_name="tag_snapshots_export")
|
||||
def tag_snapshots_export(request: HttpRequest, tag_id: int):
|
||||
try:
|
||||
tag = get_tag_by_ref(tag_id)
|
||||
except Tag.DoesNotExist as err:
|
||||
raise HttpError(404, 'Tag not found') from err
|
||||
|
||||
response = HttpResponse(export_tag_snapshots_jsonl(tag), content_type='application/x-ndjson; charset=utf-8')
|
||||
response['Content-Disposition'] = f'attachment; filename="tag-{tag.slug}-snapshots.jsonl"'
|
||||
return response
|
||||
|
||||
|
||||
@router.post("/tags/add-to-snapshot/", response=TagSnapshotResponseSchema, url_name="tags_add_to_snapshot")
|
||||
def tags_add_to_snapshot(request: HttpRequest, data: TagSnapshotRequestSchema):
|
||||
"""Add a tag to a snapshot. Creates the tag if it doesn't exist."""
|
||||
@@ -534,24 +697,16 @@ def tags_add_to_snapshot(request: HttpRequest, data: TagSnapshotRequestSchema):
|
||||
|
||||
# Get or create the tag
|
||||
if data.tag_name:
|
||||
name = data.tag_name.strip()
|
||||
if not name:
|
||||
raise HttpError(400, 'Tag name is required')
|
||||
|
||||
tag, _ = Tag.objects.get_or_create(
|
||||
name__iexact=name,
|
||||
defaults={
|
||||
'name': name,
|
||||
'created_by': request.user if request.user.is_authenticated else None,
|
||||
}
|
||||
)
|
||||
# If found by case-insensitive match, use that tag
|
||||
existing_tag = Tag.objects.filter(name__iexact=name).first()
|
||||
if existing_tag is not None:
|
||||
tag = existing_tag
|
||||
try:
|
||||
tag, _ = get_or_create_tag(
|
||||
data.tag_name,
|
||||
created_by=request.user if request.user.is_authenticated else None,
|
||||
)
|
||||
except ValueError as err:
|
||||
raise HttpError(400, str(err)) from err
|
||||
elif data.tag_id:
|
||||
try:
|
||||
tag = Tag.objects.get(pk=data.tag_id)
|
||||
tag = get_tag_by_ref(data.tag_id)
|
||||
except Tag.DoesNotExist:
|
||||
raise HttpError(404, 'Tag not found')
|
||||
else:
|
||||
|
||||
@@ -4,7 +4,7 @@ __package__ = 'archivebox.base_models'
|
||||
|
||||
import json
|
||||
from collections.abc import Mapping
|
||||
from typing import TypedDict
|
||||
from typing import NotRequired, TypedDict
|
||||
|
||||
from django import forms
|
||||
from django.contrib import admin
|
||||
@@ -17,9 +17,13 @@ from django_object_actions import DjangoObjectActions
|
||||
|
||||
class ConfigOption(TypedDict):
|
||||
plugin: str
|
||||
type: str
|
||||
type: str | list[str]
|
||||
default: object
|
||||
description: str
|
||||
enum: NotRequired[list[object]]
|
||||
pattern: NotRequired[str]
|
||||
minimum: NotRequired[int | float]
|
||||
maximum: NotRequired[int | float]
|
||||
|
||||
|
||||
class KeyValueWidget(forms.Widget):
|
||||
@@ -44,12 +48,16 @@ class KeyValueWidget(forms.Widget):
|
||||
options: dict[str, ConfigOption] = {}
|
||||
for plugin_name, schema in plugin_configs.items():
|
||||
for key, prop in schema.get('properties', {}).items():
|
||||
options[key] = {
|
||||
option: ConfigOption = {
|
||||
'plugin': plugin_name,
|
||||
'type': prop.get('type', 'string'),
|
||||
'default': prop.get('default', ''),
|
||||
'description': prop.get('description', ''),
|
||||
}
|
||||
for schema_key in ('enum', 'pattern', 'minimum', 'maximum'):
|
||||
if schema_key in prop:
|
||||
option[schema_key] = prop[schema_key]
|
||||
options[key] = option
|
||||
return options
|
||||
except Exception:
|
||||
return {}
|
||||
@@ -98,14 +106,12 @@ class KeyValueWidget(forms.Widget):
|
||||
'''
|
||||
|
||||
# Render existing key-value pairs
|
||||
row_idx = 0
|
||||
for key, val in data.items():
|
||||
val_str = json.dumps(val) if not isinstance(val, str) else val
|
||||
html += self._render_row(widget_id, row_idx, key, val_str)
|
||||
row_idx += 1
|
||||
html += self._render_row(widget_id, key, val_str)
|
||||
|
||||
# Always add one empty row for new entries
|
||||
html += self._render_row(widget_id, row_idx, '', '')
|
||||
html += self._render_row(widget_id, '', '')
|
||||
|
||||
html += f'''
|
||||
</div>
|
||||
@@ -114,22 +120,450 @@ class KeyValueWidget(forms.Widget):
|
||||
style="padding: 4px 12px; cursor: pointer; background: #417690; color: white; border: none; border-radius: 4px;">
|
||||
+ Add Row
|
||||
</button>
|
||||
<span id="{widget_id}_hint" style="font-size: 11px; color: #666; font-style: italic;"></span>
|
||||
</div>
|
||||
<input type="hidden" name="{name}" id="{widget_id}" value="">
|
||||
<script>
|
||||
(function() {{
|
||||
var configMeta_{widget_id} = {config_meta_json};
|
||||
var rowCounter_{widget_id} = 0;
|
||||
|
||||
function showKeyHint_{widget_id}(key) {{
|
||||
var hint = document.getElementById('{widget_id}_hint');
|
||||
var meta = configMeta_{widget_id}[key];
|
||||
if (meta) {{
|
||||
hint.innerHTML = '<b>' + key + '</b>: ' + (meta.description || meta.type) +
|
||||
(meta.default !== '' ? ' <span style="color:#888">(default: ' + meta.default + ')</span>' : '');
|
||||
}} else {{
|
||||
hint.textContent = key ? 'Custom key: ' + key : '';
|
||||
function stringifyValue_{widget_id}(value) {{
|
||||
return typeof value === 'string' ? value : JSON.stringify(value);
|
||||
}}
|
||||
|
||||
function getTypes_{widget_id}(meta) {{
|
||||
if (!meta || meta.type === undefined || meta.type === null) {{
|
||||
return [];
|
||||
}}
|
||||
return Array.isArray(meta.type) ? meta.type : [meta.type];
|
||||
}}
|
||||
|
||||
function getMetaForKey_{widget_id}(key) {{
|
||||
if (!key) {{
|
||||
return null;
|
||||
}}
|
||||
|
||||
var explicitMeta = configMeta_{widget_id}[key];
|
||||
if (explicitMeta) {{
|
||||
return Object.assign({{ key: key }}, explicitMeta);
|
||||
}}
|
||||
|
||||
if (key.endsWith('_BINARY')) {{
|
||||
return {{
|
||||
key: key,
|
||||
plugin: 'custom',
|
||||
type: 'string',
|
||||
default: '',
|
||||
description: 'Path to binary executable',
|
||||
}};
|
||||
}}
|
||||
|
||||
if (isRegexConfigKey_{widget_id}(key)) {{
|
||||
return {{
|
||||
key: key,
|
||||
plugin: 'custom',
|
||||
type: 'string',
|
||||
default: '',
|
||||
description: 'Regex pattern list',
|
||||
}};
|
||||
}}
|
||||
|
||||
return null;
|
||||
}}
|
||||
|
||||
function describeMeta_{widget_id}(meta) {{
|
||||
if (!meta) {{
|
||||
return '';
|
||||
}}
|
||||
|
||||
var details = '';
|
||||
if (Array.isArray(meta.enum) && meta.enum.length) {{
|
||||
details = 'Allowed: ' + meta.enum.map(stringifyValue_{widget_id}).join(', ');
|
||||
}} else {{
|
||||
var types = getTypes_{widget_id}(meta);
|
||||
if (types.length) {{
|
||||
details = 'Expected: ' + types.join(' or ');
|
||||
}}
|
||||
}}
|
||||
|
||||
if (meta.minimum !== undefined || meta.maximum !== undefined) {{
|
||||
var bounds = [];
|
||||
if (meta.minimum !== undefined) bounds.push('min ' + meta.minimum);
|
||||
if (meta.maximum !== undefined) bounds.push('max ' + meta.maximum);
|
||||
details += (details ? ' ' : '') + '(' + bounds.join(', ') + ')';
|
||||
}}
|
||||
|
||||
return [meta.description || '', details].filter(Boolean).join(' ');
|
||||
}}
|
||||
|
||||
function getExampleInput_{widget_id}(key, meta) {{
|
||||
var types = getTypes_{widget_id}(meta);
|
||||
if (key.endsWith('_BINARY')) {{
|
||||
return 'Example: wget or /usr/bin/wget';
|
||||
}}
|
||||
if (key.endsWith('_ARGS_EXTRA') || key.endsWith('_ARGS')) {{
|
||||
return 'Example: ["--extra-arg"]';
|
||||
}}
|
||||
if (types.includes('array')) {{
|
||||
return 'Example: ["value"]';
|
||||
}}
|
||||
if (types.includes('object')) {{
|
||||
if (key === 'SAVE_ALLOWLIST' || key === 'SAVE_DENYLIST') {{
|
||||
return 'Example: {{"^https://example\\\\.com": ["wget"]}}';
|
||||
}}
|
||||
return 'Example: {{"key": "value"}}';
|
||||
}}
|
||||
return '';
|
||||
}}
|
||||
|
||||
function isRegexConfigKey_{widget_id}(key) {{
|
||||
return key === 'URL_ALLOWLIST' ||
|
||||
key === 'URL_DENYLIST' ||
|
||||
key === 'SAVE_ALLOWLIST' ||
|
||||
key === 'SAVE_DENYLIST' ||
|
||||
key.endsWith('_PATTERN') ||
|
||||
key.includes('REGEX');
|
||||
}}
|
||||
|
||||
function isSimpleFilterPattern_{widget_id}(pattern) {{
|
||||
return /^[\\w.*:-]+$/.test(pattern);
|
||||
}}
|
||||
|
||||
function validateRegexPattern_{widget_id}(pattern) {{
|
||||
if (!pattern || isSimpleFilterPattern_{widget_id}(pattern)) {{
|
||||
return '';
|
||||
}}
|
||||
|
||||
try {{
|
||||
new RegExp(pattern);
|
||||
}} catch (error) {{
|
||||
return error && error.message ? error.message : 'Invalid regex';
|
||||
}}
|
||||
return '';
|
||||
}}
|
||||
|
||||
function validateRegexConfig_{widget_id}(key, raw, typeName) {{
|
||||
if (typeName === 'object') {{
|
||||
var parsed;
|
||||
try {{
|
||||
parsed = JSON.parse(raw);
|
||||
}} catch (error) {{
|
||||
return {{ ok: false, value: raw, message: 'Must be valid JSON' }};
|
||||
}}
|
||||
if (!parsed || typeof parsed !== 'object' || Array.isArray(parsed)) {{
|
||||
return {{ ok: false, value: parsed, message: 'Must be a JSON object' }};
|
||||
}}
|
||||
for (var regexKey in parsed) {{
|
||||
var objectRegexError = validateRegexPattern_{widget_id}(regexKey);
|
||||
if (objectRegexError) {{
|
||||
return {{ ok: false, value: parsed, message: 'Invalid regex key "' + regexKey + '": ' + objectRegexError }};
|
||||
}}
|
||||
}}
|
||||
return {{ ok: true, value: parsed, message: '' }};
|
||||
}}
|
||||
|
||||
var patterns = raw.split(/[\\n,]+/).map(function(pattern) {{
|
||||
return pattern.trim();
|
||||
}}).filter(Boolean);
|
||||
for (var i = 0; i < patterns.length; i++) {{
|
||||
var regexError = validateRegexPattern_{widget_id}(patterns[i]);
|
||||
if (regexError) {{
|
||||
return {{ ok: false, value: raw, message: 'Invalid regex "' + patterns[i] + '": ' + regexError }};
|
||||
}}
|
||||
}}
|
||||
return {{ ok: true, value: raw, message: '' }};
|
||||
}}
|
||||
|
||||
function validateBinaryValue_{widget_id}(raw) {{
|
||||
if (!raw) {{
|
||||
return {{ ok: true, value: raw, message: '' }};
|
||||
}}
|
||||
|
||||
if (/['"`]/.test(raw)) {{
|
||||
return {{ ok: false, value: raw, message: 'Binary paths cannot contain quotes' }};
|
||||
}}
|
||||
|
||||
if (/[;&|<>$(){{}}\\[\\]!]/.test(raw)) {{
|
||||
return {{ ok: false, value: raw, message: 'Binary paths can only be a binary name or absolute path' }};
|
||||
}}
|
||||
|
||||
if (raw.startsWith('/')) {{
|
||||
if (/^[A-Za-z0-9_./+\\- ]+$/.test(raw)) {{
|
||||
return {{ ok: true, value: raw, message: '' }};
|
||||
}}
|
||||
return {{ ok: false, value: raw, message: 'Absolute paths may only contain path-safe characters' }};
|
||||
}}
|
||||
|
||||
if (/^[A-Za-z0-9_.+-]+$/.test(raw)) {{
|
||||
return {{ ok: true, value: raw, message: '' }};
|
||||
}}
|
||||
|
||||
return {{ ok: false, value: raw, message: 'Enter a binary name like wget or an absolute path like /usr/bin/wget' }};
|
||||
}}
|
||||
|
||||
function parseValue_{widget_id}(raw) {{
|
||||
try {{
|
||||
if (raw === 'true') return true;
|
||||
if (raw === 'false') return false;
|
||||
if (raw === 'null') return null;
|
||||
if (raw !== '' && !isNaN(raw)) return Number(raw);
|
||||
if ((raw.startsWith('{{') && raw.endsWith('}}')) ||
|
||||
(raw.startsWith('[') && raw.endsWith(']')) ||
|
||||
(raw.startsWith('"') && raw.endsWith('"'))) {{
|
||||
return JSON.parse(raw);
|
||||
}}
|
||||
}} catch (error) {{
|
||||
return raw;
|
||||
}}
|
||||
return raw;
|
||||
}}
|
||||
|
||||
function sameValue_{widget_id}(left, right) {{
|
||||
return left === right || JSON.stringify(left) === JSON.stringify(right);
|
||||
}}
|
||||
|
||||
function parseTypedValue_{widget_id}(raw, typeName, meta) {{
|
||||
var numberValue;
|
||||
var parsed;
|
||||
|
||||
if (typeName && meta && meta.key && isRegexConfigKey_{widget_id}(meta.key)) {{
|
||||
return validateRegexConfig_{widget_id}(meta.key, raw, typeName);
|
||||
}}
|
||||
|
||||
if (typeName === 'string' && meta && meta.key && meta.key.endsWith('_BINARY')) {{
|
||||
return validateBinaryValue_{widget_id}(raw);
|
||||
}}
|
||||
|
||||
if (typeName === 'string') {{
|
||||
if (meta.pattern) {{
|
||||
try {{
|
||||
if (!(new RegExp(meta.pattern)).test(raw)) {{
|
||||
return {{ ok: false, value: raw, message: 'Must match pattern ' + meta.pattern }};
|
||||
}}
|
||||
}} catch (error) {{}}
|
||||
}}
|
||||
return {{ ok: true, value: raw, message: '' }};
|
||||
}}
|
||||
|
||||
if (typeName === 'integer') {{
|
||||
if (!/^-?\\d+$/.test(raw)) {{
|
||||
return {{ ok: false, value: raw, message: 'Must be an integer' }};
|
||||
}}
|
||||
numberValue = Number(raw);
|
||||
if (meta.minimum !== undefined && numberValue < meta.minimum) {{
|
||||
return {{ ok: false, value: numberValue, message: 'Must be at least ' + meta.minimum }};
|
||||
}}
|
||||
if (meta.maximum !== undefined && numberValue > meta.maximum) {{
|
||||
return {{ ok: false, value: numberValue, message: 'Must be at most ' + meta.maximum }};
|
||||
}}
|
||||
return {{ ok: true, value: numberValue, message: '' }};
|
||||
}}
|
||||
|
||||
if (typeName === 'number') {{
|
||||
if (raw === '' || isNaN(raw)) {{
|
||||
return {{ ok: false, value: raw, message: 'Must be a number' }};
|
||||
}}
|
||||
numberValue = Number(raw);
|
||||
if (meta.minimum !== undefined && numberValue < meta.minimum) {{
|
||||
return {{ ok: false, value: numberValue, message: 'Must be at least ' + meta.minimum }};
|
||||
}}
|
||||
if (meta.maximum !== undefined && numberValue > meta.maximum) {{
|
||||
return {{ ok: false, value: numberValue, message: 'Must be at most ' + meta.maximum }};
|
||||
}}
|
||||
return {{ ok: true, value: numberValue, message: '' }};
|
||||
}}
|
||||
|
||||
if (typeName === 'boolean') {{
|
||||
var lowered = raw.toLowerCase();
|
||||
if (lowered === 'true' || raw === '1') return {{ ok: true, value: true, message: '' }};
|
||||
if (lowered === 'false' || raw === '0') return {{ ok: true, value: false, message: '' }};
|
||||
return {{ ok: false, value: raw, message: 'Must be true or false' }};
|
||||
}}
|
||||
|
||||
if (typeName === 'null') {{
|
||||
return raw === 'null'
|
||||
? {{ ok: true, value: null, message: '' }}
|
||||
: {{ ok: false, value: raw, message: 'Must be null' }};
|
||||
}}
|
||||
|
||||
if (typeName === 'array' || typeName === 'object') {{
|
||||
try {{
|
||||
parsed = JSON.parse(raw);
|
||||
}} catch (error) {{
|
||||
return {{ ok: false, value: raw, message: 'Must be valid JSON' }};
|
||||
}}
|
||||
|
||||
if (typeName === 'array' && Array.isArray(parsed)) {{
|
||||
return {{ ok: true, value: parsed, message: '' }};
|
||||
}}
|
||||
if (typeName === 'object' && parsed && typeof parsed === 'object' && !Array.isArray(parsed)) {{
|
||||
return {{ ok: true, value: parsed, message: '' }};
|
||||
}}
|
||||
|
||||
return {{
|
||||
ok: false,
|
||||
value: parsed,
|
||||
message: typeName === 'array' ? 'Must be a JSON array' : 'Must be a JSON object',
|
||||
}};
|
||||
}}
|
||||
|
||||
return {{ ok: true, value: parseValue_{widget_id}(raw), message: '' }};
|
||||
}}
|
||||
|
||||
function validateValueAgainstMeta_{widget_id}(raw, meta) {{
|
||||
if (!meta || raw === '') {{
|
||||
return {{ state: 'neutral', value: raw, message: '' }};
|
||||
}}
|
||||
|
||||
var enumValues = Array.isArray(meta.enum) ? meta.enum : [];
|
||||
var types = getTypes_{widget_id}(meta);
|
||||
if (!types.length) {{
|
||||
types = ['string'];
|
||||
}}
|
||||
|
||||
var error = 'Invalid value';
|
||||
for (var i = 0; i < types.length; i++) {{
|
||||
var candidate = parseTypedValue_{widget_id}(raw, types[i], meta);
|
||||
if (!candidate.ok) {{
|
||||
error = candidate.message || error;
|
||||
continue;
|
||||
}}
|
||||
if (enumValues.length && !enumValues.some(function(enumValue) {{
|
||||
return sameValue_{widget_id}(enumValue, candidate.value) || stringifyValue_{widget_id}(enumValue) === raw;
|
||||
}})) {{
|
||||
error = 'Must be one of: ' + enumValues.map(stringifyValue_{widget_id}).join(', ');
|
||||
continue;
|
||||
}}
|
||||
return {{ state: 'valid', value: candidate.value, message: '' }};
|
||||
}}
|
||||
|
||||
return {{ state: 'invalid', value: raw, message: error }};
|
||||
}}
|
||||
|
||||
function ensureRowId_{widget_id}(row) {{
|
||||
if (!row.dataset.rowId) {{
|
||||
row.dataset.rowId = String(rowCounter_{widget_id}++);
|
||||
}}
|
||||
return row.dataset.rowId;
|
||||
}}
|
||||
|
||||
function setRowHelp_{widget_id}(row) {{
|
||||
var keyInput = row.querySelector('.kv-key');
|
||||
var help = row.querySelector('.kv-help');
|
||||
if (!keyInput || !help) {{
|
||||
return;
|
||||
}}
|
||||
|
||||
var key = keyInput.value.trim();
|
||||
if (!key) {{
|
||||
help.textContent = '';
|
||||
return;
|
||||
}}
|
||||
|
||||
var meta = getMetaForKey_{widget_id}(key);
|
||||
if (meta) {{
|
||||
var extra = isRegexConfigKey_{widget_id}(key)
|
||||
? ((meta.type === 'object' || (Array.isArray(meta.type) && meta.type.includes('object')))
|
||||
? ' Expected: JSON object with regex keys.'
|
||||
: ' Expected: valid regex.')
|
||||
: '';
|
||||
var example = getExampleInput_{widget_id}(key, meta);
|
||||
help.textContent = [describeMeta_{widget_id}(meta) + extra, example].filter(Boolean).join(' ');
|
||||
}} else {{
|
||||
help.textContent = 'Custom key';
|
||||
}}
|
||||
}}
|
||||
|
||||
function configureValueInput_{widget_id}(row) {{
|
||||
var keyInput = row.querySelector('.kv-key');
|
||||
var valueInput = row.querySelector('.kv-value');
|
||||
var datalist = row.querySelector('.kv-value-options');
|
||||
if (!keyInput || !valueInput || !datalist) {{
|
||||
return;
|
||||
}}
|
||||
|
||||
var rowId = ensureRowId_{widget_id}(row);
|
||||
datalist.id = '{widget_id}_value_options_' + rowId;
|
||||
|
||||
var meta = getMetaForKey_{widget_id}(keyInput.value.trim());
|
||||
var enumValues = Array.isArray(meta && meta.enum) ? meta.enum : [];
|
||||
var types = getTypes_{widget_id}(meta);
|
||||
if (!enumValues.length && types.includes('boolean')) {{
|
||||
enumValues = ['True', 'False'];
|
||||
}}
|
||||
if (enumValues.length) {{
|
||||
datalist.innerHTML = enumValues.map(function(enumValue) {{
|
||||
return '<option value="' + stringifyValue_{widget_id}(enumValue).replace(/"/g, '"') + '"></option>';
|
||||
}}).join('');
|
||||
valueInput.setAttribute('list', datalist.id);
|
||||
}} else {{
|
||||
datalist.innerHTML = '';
|
||||
valueInput.removeAttribute('list');
|
||||
}}
|
||||
}}
|
||||
|
||||
function setValueValidationState_{widget_id}(input, state, message) {{
|
||||
if (!input) {{
|
||||
return;
|
||||
}}
|
||||
|
||||
if (state === 'valid') {{
|
||||
input.style.borderColor = '#2da44e';
|
||||
input.style.boxShadow = '0 0 0 1px rgba(45, 164, 78, 0.18)';
|
||||
input.style.backgroundColor = '#f6ffed';
|
||||
}} else if (state === 'invalid') {{
|
||||
input.style.borderColor = '#cf222e';
|
||||
input.style.boxShadow = '0 0 0 1px rgba(207, 34, 46, 0.18)';
|
||||
input.style.backgroundColor = '#fff8f8';
|
||||
}} else {{
|
||||
input.style.borderColor = '#ccc';
|
||||
input.style.boxShadow = 'none';
|
||||
input.style.backgroundColor = '';
|
||||
}}
|
||||
input.title = message || '';
|
||||
}}
|
||||
|
||||
function applyValueValidation_{widget_id}(row) {{
|
||||
var keyInput = row.querySelector('.kv-key');
|
||||
var valueInput = row.querySelector('.kv-value');
|
||||
if (!keyInput || !valueInput) {{
|
||||
return;
|
||||
}}
|
||||
|
||||
var key = keyInput.value.trim();
|
||||
if (!key) {{
|
||||
setValueValidationState_{widget_id}(valueInput, 'neutral', '');
|
||||
return;
|
||||
}}
|
||||
|
||||
var meta = getMetaForKey_{widget_id}(key);
|
||||
if (!meta) {{
|
||||
setValueValidationState_{widget_id}(valueInput, 'neutral', '');
|
||||
return;
|
||||
}}
|
||||
|
||||
var validation = validateValueAgainstMeta_{widget_id}(valueInput.value.trim(), meta);
|
||||
setValueValidationState_{widget_id}(valueInput, validation.state, validation.message);
|
||||
}}
|
||||
|
||||
function coerceValueForStorage_{widget_id}(key, raw) {{
|
||||
var meta = getMetaForKey_{widget_id}(key);
|
||||
if (!meta) {{
|
||||
return parseValue_{widget_id}(raw);
|
||||
}}
|
||||
|
||||
var validation = validateValueAgainstMeta_{widget_id}(raw, meta);
|
||||
return validation.state === 'valid' ? validation.value : raw;
|
||||
}}
|
||||
|
||||
function initializeRows_{widget_id}() {{
|
||||
var container = document.getElementById('{widget_id}_rows');
|
||||
container.querySelectorAll('.key-value-row').forEach(function(row) {{
|
||||
ensureRowId_{widget_id}(row);
|
||||
configureValueInput_{widget_id}(row);
|
||||
setRowHelp_{widget_id}(row);
|
||||
applyValueValidation_{widget_id}(row);
|
||||
}});
|
||||
}}
|
||||
|
||||
function updateHiddenField_{widget_id}() {{
|
||||
@@ -142,20 +576,7 @@ class KeyValueWidget(forms.Widget):
|
||||
if (keyInput && valInput && keyInput.value.trim()) {{
|
||||
var key = keyInput.value.trim();
|
||||
var val = valInput.value.trim();
|
||||
// Try to parse as JSON (for booleans, numbers, etc)
|
||||
try {{
|
||||
if (val === 'true') result[key] = true;
|
||||
else if (val === 'false') result[key] = false;
|
||||
else if (val === 'null') result[key] = null;
|
||||
else if (!isNaN(val) && val !== '') result[key] = Number(val);
|
||||
else if ((val.startsWith('{{') && val.endsWith('}}')) ||
|
||||
(val.startsWith('[') && val.endsWith(']')) ||
|
||||
(val.startsWith('"') && val.endsWith('"')))
|
||||
result[key] = JSON.parse(val);
|
||||
else result[key] = val;
|
||||
}} catch(e) {{
|
||||
result[key] = val;
|
||||
}}
|
||||
result[key] = coerceValueForStorage_{widget_id}(key, val);
|
||||
}}
|
||||
}});
|
||||
document.getElementById('{widget_id}').value = JSON.stringify(result);
|
||||
@@ -163,60 +584,85 @@ class KeyValueWidget(forms.Widget):
|
||||
|
||||
window.addKeyValueRow_{widget_id} = function() {{
|
||||
var container = document.getElementById('{widget_id}_rows');
|
||||
var rows = container.querySelectorAll('.key-value-row');
|
||||
var newIdx = rows.length;
|
||||
var newRow = document.createElement('div');
|
||||
newRow.className = 'key-value-row';
|
||||
newRow.style.cssText = 'display: flex; gap: 8px; margin-bottom: 6px; align-items: center;';
|
||||
newRow.innerHTML = '<input type="text" class="kv-key" placeholder="KEY" list="{widget_id}_keys" ' +
|
||||
'style="flex: 1; padding: 6px 8px; border: 1px solid #ccc; border-radius: 4px; font-family: monospace; font-size: 12px;" ' +
|
||||
'onchange="updateHiddenField_{widget_id}()" oninput="updateHiddenField_{widget_id}(); showKeyHint_{widget_id}(this.value)" onfocus="showKeyHint_{widget_id}(this.value)">' +
|
||||
newRow.style.cssText = 'margin-bottom: 6px;';
|
||||
newRow.innerHTML = '<div style="display: flex; gap: 8px; align-items: center;">' +
|
||||
'<input type="text" class="kv-key" placeholder="KEY" list="{widget_id}_keys" ' +
|
||||
'style="flex: 1; padding: 6px 8px; border: 1px solid #ccc; border-radius: 4px; font-family: monospace; font-size: 12px;">' +
|
||||
'<input type="text" class="kv-value" placeholder="value" ' +
|
||||
'style="flex: 2; padding: 6px 8px; border: 1px solid #ccc; border-radius: 4px; font-family: monospace; font-size: 12px;" ' +
|
||||
'onchange="updateHiddenField_{widget_id}()" oninput="updateHiddenField_{widget_id}()">' +
|
||||
'style="flex: 2; padding: 6px 8px; border: 1px solid #ccc; border-radius: 4px; font-family: monospace; font-size: 12px;">' +
|
||||
'<datalist class="kv-value-options"></datalist>' +
|
||||
'<button type="button" onclick="removeKeyValueRow_{widget_id}(this)" ' +
|
||||
'style="padding: 4px 10px; cursor: pointer; background: #ba2121; color: white; border: none; border-radius: 4px; font-weight: bold;">−</button>';
|
||||
'style="padding: 4px 10px; cursor: pointer; background: #ba2121; color: white; border: none; border-radius: 4px; font-weight: bold;">−</button>' +
|
||||
'</div>' +
|
||||
'<div class="kv-help" style="margin-top: 4px; font-size: 11px; color: #666; font-style: italic;"></div>';
|
||||
container.appendChild(newRow);
|
||||
ensureRowId_{widget_id}(newRow);
|
||||
configureValueInput_{widget_id}(newRow);
|
||||
setRowHelp_{widget_id}(newRow);
|
||||
applyValueValidation_{widget_id}(newRow);
|
||||
updateHiddenField_{widget_id}();
|
||||
newRow.querySelector('.kv-key').focus();
|
||||
}};
|
||||
|
||||
window.removeKeyValueRow_{widget_id} = function(btn) {{
|
||||
var row = btn.parentElement;
|
||||
var row = btn.closest('.key-value-row');
|
||||
row.remove();
|
||||
updateHiddenField_{widget_id}();
|
||||
}};
|
||||
|
||||
window.showKeyHint_{widget_id} = showKeyHint_{widget_id};
|
||||
window.updateHiddenField_{widget_id} = updateHiddenField_{widget_id};
|
||||
|
||||
// Initialize on load
|
||||
document.addEventListener('DOMContentLoaded', function() {{
|
||||
initializeRows_{widget_id}();
|
||||
updateHiddenField_{widget_id}();
|
||||
}});
|
||||
// Also run immediately in case DOM is already ready
|
||||
if (document.readyState !== 'loading') {{
|
||||
initializeRows_{widget_id}();
|
||||
updateHiddenField_{widget_id}();
|
||||
}}
|
||||
|
||||
// Update on any input change
|
||||
document.getElementById('{widget_id}_rows').addEventListener('input', updateHiddenField_{widget_id});
|
||||
var rowsEl_{widget_id} = document.getElementById('{widget_id}_rows');
|
||||
|
||||
rowsEl_{widget_id}.addEventListener('input', function(event) {{
|
||||
var row = event.target.closest('.key-value-row');
|
||||
if (!row) {{
|
||||
return;
|
||||
}}
|
||||
|
||||
if (event.target.classList.contains('kv-key')) {{
|
||||
configureValueInput_{widget_id}(row);
|
||||
setRowHelp_{widget_id}(row);
|
||||
}}
|
||||
|
||||
if (event.target.classList.contains('kv-key') || event.target.classList.contains('kv-value')) {{
|
||||
applyValueValidation_{widget_id}(row);
|
||||
updateHiddenField_{widget_id}();
|
||||
}}
|
||||
}});
|
||||
}})();
|
||||
</script>
|
||||
</div>
|
||||
'''
|
||||
return mark_safe(html)
|
||||
|
||||
def _render_row(self, widget_id: str, idx: int, key: str, value: str) -> str:
|
||||
def _render_row(self, widget_id: str, key: str, value: str) -> str:
|
||||
return f'''
|
||||
<div class="key-value-row" style="display: flex; gap: 8px; margin-bottom: 6px; align-items: center;">
|
||||
<input type="text" class="kv-key" value="{self._escape(key)}" placeholder="KEY" list="{widget_id}_keys"
|
||||
style="flex: 1; padding: 6px 8px; border: 1px solid #ccc; border-radius: 4px; font-family: monospace; font-size: 12px;"
|
||||
onchange="updateHiddenField_{widget_id}()" oninput="updateHiddenField_{widget_id}(); showKeyHint_{widget_id}(this.value)" onfocus="showKeyHint_{widget_id}(this.value)">
|
||||
<input type="text" class="kv-value" value="{self._escape(value)}" placeholder="value"
|
||||
style="flex: 2; padding: 6px 8px; border: 1px solid #ccc; border-radius: 4px; font-family: monospace; font-size: 12px;"
|
||||
onchange="updateHiddenField_{widget_id}()" oninput="updateHiddenField_{widget_id}()">
|
||||
<button type="button" onclick="removeKeyValueRow_{widget_id}(this)"
|
||||
style="padding: 4px 10px; cursor: pointer; background: #ba2121; color: white; border: none; border-radius: 4px; font-weight: bold;">−</button>
|
||||
<div class="key-value-row" style="margin-bottom: 6px;">
|
||||
<div style="display: flex; gap: 8px; align-items: center;">
|
||||
<input type="text" class="kv-key" value="{self._escape(key)}" placeholder="KEY" list="{widget_id}_keys"
|
||||
style="flex: 1; padding: 6px 8px; border: 1px solid #ccc; border-radius: 4px; font-family: monospace; font-size: 12px;">
|
||||
<input type="text" class="kv-value" value="{self._escape(value)}" placeholder="value"
|
||||
style="flex: 2; padding: 6px 8px; border: 1px solid #ccc; border-radius: 4px; font-family: monospace; font-size: 12px;">
|
||||
<datalist class="kv-value-options"></datalist>
|
||||
<button type="button" onclick="removeKeyValueRow_{widget_id}(this)"
|
||||
style="padding: 4px 10px; cursor: pointer; background: #ba2121; color: white; border: none; border-radius: 4px; font-weight: bold;">−</button>
|
||||
</div>
|
||||
<div class="kv-help" style="margin-top: 4px; font-size: 11px; color: #666; font-style: italic;"></div>
|
||||
</div>
|
||||
'''
|
||||
|
||||
|
||||
@@ -47,11 +47,13 @@ def _collect_input_urls(args: tuple[str, ...]) -> list[str]:
|
||||
def add(urls: str | list[str],
|
||||
depth: int | str=0,
|
||||
tag: str='',
|
||||
url_allowlist: str='',
|
||||
url_denylist: str='',
|
||||
parser: str="auto",
|
||||
plugins: str="",
|
||||
persona: str='Default',
|
||||
overwrite: bool=False,
|
||||
update: bool=not ARCHIVING_CONFIG.ONLY_NEW,
|
||||
update: bool | None=None,
|
||||
index_only: bool=False,
|
||||
bg: bool=False,
|
||||
created_by_id: int | None=None) -> tuple['Crawl', QuerySet['Snapshot']]:
|
||||
@@ -85,6 +87,8 @@ def add(urls: str | list[str],
|
||||
|
||||
created_by_id = created_by_id or get_or_create_system_user_pk()
|
||||
started_at = timezone.now()
|
||||
if update is None:
|
||||
update = not ARCHIVING_CONFIG.ONLY_NEW
|
||||
|
||||
# 1. Save the provided URLs to sources/2024-11-05__23-59-59__cli_add.txt
|
||||
sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__cli_add.txt'
|
||||
@@ -120,6 +124,8 @@ def add(urls: str | list[str],
|
||||
'PLUGINS': plugins,
|
||||
'DEFAULT_PERSONA': persona_name,
|
||||
'PARSER': parser,
|
||||
**({'URL_ALLOWLIST': url_allowlist} if url_allowlist else {}),
|
||||
**({'URL_DENYLIST': url_denylist} if url_denylist else {}),
|
||||
}
|
||||
)
|
||||
|
||||
@@ -150,6 +156,9 @@ def add(urls: str | list[str],
|
||||
snapshot.ensure_crawl_symlink()
|
||||
return crawl, crawl.snapshot_set.all()
|
||||
|
||||
if bg:
|
||||
crawl.create_snapshots_from_urls()
|
||||
|
||||
# 5. Start the crawl runner to process the queue
|
||||
# The runner will:
|
||||
# - Process Crawl -> create Snapshots from all URLs
|
||||
@@ -192,8 +201,7 @@ def add(urls: str | list[str],
|
||||
except Exception:
|
||||
rel_output_str = str(crawl.output_dir)
|
||||
|
||||
# Build admin URL from SERVER_CONFIG
|
||||
bind_addr = SERVER_CONFIG.BIND_ADDR
|
||||
bind_addr = SERVER_CONFIG.BIND_ADDR or '127.0.0.1:8000'
|
||||
if bind_addr.startswith('http://') or bind_addr.startswith('https://'):
|
||||
base_url = bind_addr
|
||||
else:
|
||||
@@ -218,11 +226,13 @@ def add(urls: str | list[str],
|
||||
@click.command()
|
||||
@click.option('--depth', '-d', type=click.Choice([str(i) for i in range(5)]), default='0', help='Recursively archive linked pages up to N hops away')
|
||||
@click.option('--tag', '-t', default='', help='Comma-separated list of tags to add to each snapshot e.g. tag1,tag2,tag3')
|
||||
@click.option('--url-allowlist', '--domain-allowlist', default='', help='Comma-separated URL/domain allowlist for this crawl')
|
||||
@click.option('--url-denylist', '--domain-denylist', default='', help='Comma-separated URL/domain denylist for this crawl')
|
||||
@click.option('--parser', default='auto', help='Parser for reading input URLs (auto, txt, html, rss, json, jsonl, netscape, ...)')
|
||||
@click.option('--plugins', '-p', default='', help='Comma-separated list of plugins to run e.g. title,favicon,screenshot,singlefile,...')
|
||||
@click.option('--persona', default='Default', help='Authentication profile to use when archiving')
|
||||
@click.option('--overwrite', '-F', is_flag=True, help='Overwrite existing data if URLs have been archived previously')
|
||||
@click.option('--update', is_flag=True, default=ARCHIVING_CONFIG.ONLY_NEW, help='Retry any previously skipped/failed URLs when re-adding them')
|
||||
@click.option('--update', is_flag=True, default=None, help='Retry any previously skipped/failed URLs when re-adding them')
|
||||
@click.option('--index-only', is_flag=True, help='Just add the URLs to the index without archiving them now')
|
||||
@click.option('--bg', is_flag=True, help='Run archiving in background (queue work and return immediately)')
|
||||
@click.argument('urls', nargs=-1, type=click.Path())
|
||||
|
||||
@@ -42,6 +42,16 @@ from rich import print as rprint
|
||||
from archivebox.cli.cli_utils import apply_filters
|
||||
|
||||
|
||||
def build_archiveresult_request(snapshot_id: str, plugin: str, hook_name: str = '', status: str = 'queued') -> dict:
|
||||
return {
|
||||
'type': 'ArchiveResult',
|
||||
'snapshot_id': str(snapshot_id),
|
||||
'plugin': plugin,
|
||||
'hook_name': hook_name,
|
||||
'status': status,
|
||||
}
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# CREATE
|
||||
# =============================================================================
|
||||
@@ -52,21 +62,21 @@ def create_archiveresults(
|
||||
status: str = 'queued',
|
||||
) -> int:
|
||||
"""
|
||||
Create ArchiveResults for Snapshots.
|
||||
Create ArchiveResult request records for Snapshots.
|
||||
|
||||
Reads Snapshot records from stdin and creates ArchiveResult entries.
|
||||
Reads Snapshot records from stdin and emits ArchiveResult request JSONL.
|
||||
Pass-through: Non-Snapshot/ArchiveResult records are output unchanged.
|
||||
If --plugin is specified, only creates results for that plugin.
|
||||
Otherwise, creates results for all pending plugins.
|
||||
If --plugin is specified, only emits requests for that plugin.
|
||||
Otherwise, emits requests for all enabled snapshot hooks.
|
||||
|
||||
Exit codes:
|
||||
0: Success
|
||||
1: Failure
|
||||
"""
|
||||
from django.utils import timezone
|
||||
|
||||
from archivebox.config.configset import get_config
|
||||
from archivebox.hooks import discover_hooks
|
||||
from archivebox.misc.jsonl import read_stdin, write_record, TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
|
||||
from archivebox.core.models import Snapshot, ArchiveResult
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
is_tty = sys.stdout.isatty()
|
||||
|
||||
@@ -135,33 +145,20 @@ def create_archiveresults(
|
||||
created_count = 0
|
||||
for snapshot in snapshots:
|
||||
if plugin:
|
||||
# Create for specific plugin only
|
||||
result, created = ArchiveResult.objects.get_or_create(
|
||||
snapshot=snapshot,
|
||||
plugin=plugin,
|
||||
defaults={
|
||||
'status': status,
|
||||
'retry_at': timezone.now(),
|
||||
}
|
||||
)
|
||||
if not created and result.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED]:
|
||||
# Reset for retry
|
||||
result.status = status
|
||||
result.retry_at = timezone.now()
|
||||
result.save()
|
||||
|
||||
if not is_tty:
|
||||
write_record(result.to_json())
|
||||
write_record(build_archiveresult_request(snapshot.id, plugin, status=status))
|
||||
created_count += 1
|
||||
else:
|
||||
# Create all pending plugins
|
||||
snapshot.create_pending_archiveresults()
|
||||
for result in snapshot.archiveresult_set.filter(status=ArchiveResult.StatusChoices.QUEUED):
|
||||
config = get_config(crawl=snapshot.crawl, snapshot=snapshot)
|
||||
hooks = discover_hooks('Snapshot', config=config)
|
||||
for hook_path in hooks:
|
||||
hook_name = hook_path.name
|
||||
plugin_name = hook_path.parent.name
|
||||
if not is_tty:
|
||||
write_record(result.to_json())
|
||||
write_record(build_archiveresult_request(snapshot.id, plugin_name, hook_name=hook_name, status=status))
|
||||
created_count += 1
|
||||
|
||||
rprint(f'[green]Created/queued {created_count} archive results[/green]', file=sys.stderr)
|
||||
rprint(f'[green]Created {created_count} archive result request records[/green]', file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -205,6 +202,7 @@ def list_archiveresults(
|
||||
'succeeded': 'green',
|
||||
'failed': 'red',
|
||||
'skipped': 'dim',
|
||||
'noresults': 'dim',
|
||||
'backoff': 'magenta',
|
||||
}.get(result.status, 'dim')
|
||||
rprint(f'[{status_color}]{result.status:10}[/{status_color}] {result.plugin:15} [dim]{result.id}[/dim] {result.snapshot.url[:40]}')
|
||||
@@ -233,8 +231,6 @@ def update_archiveresults(
|
||||
0: Success
|
||||
1: No input or error
|
||||
"""
|
||||
from django.utils import timezone
|
||||
|
||||
from archivebox.misc.jsonl import read_stdin, write_record
|
||||
from archivebox.core.models import ArchiveResult
|
||||
|
||||
@@ -257,7 +253,6 @@ def update_archiveresults(
|
||||
# Apply updates from CLI flags
|
||||
if status:
|
||||
result.status = status
|
||||
result.retry_at = timezone.now()
|
||||
|
||||
result.save()
|
||||
updated_count += 1
|
||||
|
||||
@@ -38,15 +38,16 @@ import rich_click as click
|
||||
|
||||
def process_archiveresult_by_id(archiveresult_id: str) -> int:
|
||||
"""
|
||||
Run extraction for a single ArchiveResult by ID (used by workers).
|
||||
Re-run extraction for a single ArchiveResult by ID.
|
||||
|
||||
Triggers the ArchiveResult's state machine tick() to run the extractor
|
||||
plugin, but only after claiming ownership via retry_at. This keeps direct
|
||||
CLI execution aligned with the worker lifecycle and prevents duplicate hook
|
||||
runs if another process already owns the same ArchiveResult.
|
||||
ArchiveResults are projected status rows, not queued work items. Re-running
|
||||
a single result means resetting that row and queueing its parent snapshot
|
||||
through the shared crawl runner with the corresponding plugin selected.
|
||||
"""
|
||||
from rich import print as rprint
|
||||
from django.utils import timezone
|
||||
from archivebox.core.models import ArchiveResult
|
||||
from archivebox.services.runner import run_crawl
|
||||
|
||||
try:
|
||||
archiveresult = ArchiveResult.objects.get(id=archiveresult_id)
|
||||
@@ -57,16 +58,27 @@ def process_archiveresult_by_id(archiveresult_id: str) -> int:
|
||||
rprint(f'[blue]Extracting {archiveresult.plugin} for {archiveresult.snapshot.url}[/blue]', file=sys.stderr)
|
||||
|
||||
try:
|
||||
# Claim-before-tick is the required calling pattern for direct
|
||||
# state-machine drivers. If another worker already owns this row,
|
||||
# report that and exit without running duplicate extractor side effects.
|
||||
if not archiveresult.tick_claimed(lock_seconds=120):
|
||||
print(f'[yellow]Extraction already claimed by another process: {archiveresult.plugin}[/yellow]')
|
||||
return 0
|
||||
archiveresult.reset_for_retry()
|
||||
snapshot = archiveresult.snapshot
|
||||
snapshot.status = snapshot.StatusChoices.QUEUED
|
||||
snapshot.retry_at = timezone.now()
|
||||
snapshot.save(update_fields=['status', 'retry_at', 'modified_at'])
|
||||
|
||||
crawl = snapshot.crawl
|
||||
if crawl.status != crawl.StatusChoices.STARTED:
|
||||
crawl.status = crawl.StatusChoices.QUEUED
|
||||
crawl.retry_at = timezone.now()
|
||||
crawl.save(update_fields=['status', 'retry_at', 'modified_at'])
|
||||
|
||||
run_crawl(str(crawl.id), snapshot_ids=[str(snapshot.id)], selected_plugins=[archiveresult.plugin])
|
||||
archiveresult.refresh_from_db()
|
||||
|
||||
if archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED:
|
||||
print(f'[green]Extraction succeeded: {archiveresult.output_str}[/green]')
|
||||
return 0
|
||||
elif archiveresult.status == ArchiveResult.StatusChoices.NORESULTS:
|
||||
print(f'[dim]Extraction completed with no results: {archiveresult.output_str}[/dim]')
|
||||
return 0
|
||||
elif archiveresult.status == ArchiveResult.StatusChoices.FAILED:
|
||||
print(f'[red]Extraction failed: {archiveresult.output_str}[/red]', file=sys.stderr)
|
||||
return 1
|
||||
@@ -121,8 +133,9 @@ def run_plugins(
|
||||
rprint('[yellow]No snapshots provided. Pass snapshot IDs as arguments or via stdin.[/yellow]', file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Gather snapshot IDs to process
|
||||
# Gather snapshot IDs and optional plugin constraints to process
|
||||
snapshot_ids = set()
|
||||
requested_plugins_by_snapshot: dict[str, set[str]] = defaultdict(set)
|
||||
for record in records:
|
||||
record_type = record.get('type')
|
||||
|
||||
@@ -142,6 +155,9 @@ def run_plugins(
|
||||
snapshot_id = record.get('snapshot_id')
|
||||
if snapshot_id:
|
||||
snapshot_ids.add(snapshot_id)
|
||||
plugin_name = record.get('plugin')
|
||||
if plugin_name and not plugins_list:
|
||||
requested_plugins_by_snapshot[str(snapshot_id)].add(str(plugin_name))
|
||||
|
||||
elif 'id' in record:
|
||||
# Assume it's a snapshot ID
|
||||
@@ -160,26 +176,15 @@ def run_plugins(
|
||||
rprint(f'[yellow]Snapshot {snapshot_id} not found[/yellow]', file=sys.stderr)
|
||||
continue
|
||||
|
||||
# Create pending ArchiveResults if needed
|
||||
if plugins_list:
|
||||
# Only create for specific plugins
|
||||
for plugin_name in plugins_list:
|
||||
result, created = ArchiveResult.objects.get_or_create(
|
||||
snapshot=snapshot,
|
||||
plugin=plugin_name,
|
||||
defaults={
|
||||
'status': ArchiveResult.StatusChoices.QUEUED,
|
||||
'retry_at': timezone.now(),
|
||||
}
|
||||
)
|
||||
if not created and result.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED]:
|
||||
# Reset for retry
|
||||
result.status = ArchiveResult.StatusChoices.QUEUED
|
||||
result.retry_at = timezone.now()
|
||||
result.save()
|
||||
else:
|
||||
# Create all pending plugins
|
||||
snapshot.create_pending_archiveresults()
|
||||
for plugin_name in requested_plugins_by_snapshot.get(str(snapshot.id), set()):
|
||||
existing_result = snapshot.archiveresult_set.filter(plugin=plugin_name).order_by('-created_at').first()
|
||||
if existing_result and existing_result.status in [
|
||||
ArchiveResult.StatusChoices.FAILED,
|
||||
ArchiveResult.StatusChoices.SKIPPED,
|
||||
ArchiveResult.StatusChoices.NORESULTS,
|
||||
ArchiveResult.StatusChoices.BACKOFF,
|
||||
]:
|
||||
existing_result.reset_for_retry()
|
||||
|
||||
# Reset snapshot status to allow processing
|
||||
if snapshot.status == Snapshot.StatusChoices.SEALED:
|
||||
@@ -207,10 +212,15 @@ def run_plugins(
|
||||
snapshot_ids_by_crawl[str(snapshot.crawl_id)].add(str(snapshot.id))
|
||||
|
||||
for crawl_id, crawl_snapshot_ids in snapshot_ids_by_crawl.items():
|
||||
selected_plugins = plugins_list or sorted({
|
||||
plugin
|
||||
for snapshot_id in crawl_snapshot_ids
|
||||
for plugin in requested_plugins_by_snapshot.get(str(snapshot_id), set())
|
||||
}) or None
|
||||
run_crawl(
|
||||
crawl_id,
|
||||
snapshot_ids=sorted(crawl_snapshot_ids),
|
||||
selected_plugins=plugins_list or None,
|
||||
selected_plugins=selected_plugins,
|
||||
)
|
||||
|
||||
# Output results as JSONL (when piped) or human-readable (when TTY)
|
||||
|
||||
@@ -18,9 +18,13 @@ from archivebox.cli.archivebox_snapshot import list_snapshots
|
||||
@click.option('--tag', '-t', help='Filter by tag name')
|
||||
@click.option('--crawl-id', help='Filter by crawl ID')
|
||||
@click.option('--limit', '-n', type=int, help='Limit number of results')
|
||||
@click.option('--sort', '-o', type=str, help='Field to sort by, e.g. url, created_at, bookmarked_at, downloaded_at')
|
||||
@click.option('--csv', '-C', type=str, help='Print output as CSV with the provided fields, e.g.: timestamp,url,title')
|
||||
@click.option('--with-headers', is_flag=True, help='Include column headers in structured output')
|
||||
def main(status: Optional[str], url__icontains: Optional[str], url__istartswith: Optional[str],
|
||||
tag: Optional[str], crawl_id: Optional[str], limit: Optional[int]) -> None:
|
||||
"""List Snapshots as JSONL."""
|
||||
tag: Optional[str], crawl_id: Optional[str], limit: Optional[int],
|
||||
sort: Optional[str], csv: Optional[str], with_headers: bool) -> None:
|
||||
"""List Snapshots."""
|
||||
sys.exit(list_snapshots(
|
||||
status=status,
|
||||
url__icontains=url__icontains,
|
||||
@@ -28,6 +32,9 @@ def main(status: Optional[str], url__icontains: Optional[str], url__istartswith:
|
||||
tag=tag,
|
||||
crawl_id=crawl_id,
|
||||
limit=limit,
|
||||
sort=sort,
|
||||
csv=csv,
|
||||
with_headers=with_headers,
|
||||
))
|
||||
|
||||
|
||||
|
||||
@@ -42,6 +42,7 @@ import rich_click as click
|
||||
from rich import print as rprint
|
||||
|
||||
from archivebox.cli.cli_utils import apply_filters
|
||||
from archivebox.personas import importers as persona_importers
|
||||
|
||||
|
||||
# =============================================================================
|
||||
@@ -440,8 +441,6 @@ def create_personas(
|
||||
browser_binary = get_browser_binary(import_from)
|
||||
if browser_binary:
|
||||
rprint(f'[dim]Using {import_from} binary: {browser_binary}[/dim]', file=sys.stderr)
|
||||
else:
|
||||
browser_binary = None
|
||||
|
||||
created_count = 0
|
||||
for name in name_list:
|
||||
@@ -450,7 +449,7 @@ def create_personas(
|
||||
continue
|
||||
|
||||
# Validate persona name to prevent path traversal
|
||||
is_valid, error_msg = validate_persona_name(name)
|
||||
is_valid, error_msg = persona_importers.validate_persona_name(name)
|
||||
if not is_valid:
|
||||
rprint(f'[red]Invalid persona name "{name}": {error_msg}[/red]', file=sys.stderr)
|
||||
continue
|
||||
@@ -468,49 +467,29 @@ def create_personas(
|
||||
|
||||
# Import browser profile if requested
|
||||
if import_from in CHROMIUM_BROWSERS and source_profile_dir is not None:
|
||||
persona_chrome_dir = Path(persona.CHROME_USER_DATA_DIR)
|
||||
|
||||
# Copy the browser profile
|
||||
rprint(f'[dim]Copying browser profile to {persona_chrome_dir}...[/dim]', file=sys.stderr)
|
||||
|
||||
try:
|
||||
# Remove existing chrome_user_data if it exists
|
||||
if persona_chrome_dir.exists():
|
||||
shutil.rmtree(persona_chrome_dir)
|
||||
|
||||
# Copy the profile directory
|
||||
# We copy the entire user data dir, not just Default profile
|
||||
shutil.copytree(
|
||||
source_profile_dir,
|
||||
persona_chrome_dir,
|
||||
symlinks=True,
|
||||
ignore=shutil.ignore_patterns(
|
||||
'Cache', 'Code Cache', 'GPUCache', 'ShaderCache',
|
||||
'Service Worker', 'GCM Store', '*.log', 'Crashpad',
|
||||
'BrowserMetrics', 'BrowserMetrics-spare.pma',
|
||||
'SingletonLock', 'SingletonSocket', 'SingletonCookie',
|
||||
),
|
||||
import_source = persona_importers.resolve_browser_import_source(import_from, profile_dir=profile)
|
||||
import_result = persona_importers.import_persona_from_source(
|
||||
persona,
|
||||
import_source,
|
||||
copy_profile=True,
|
||||
import_cookies=True,
|
||||
capture_storage=False,
|
||||
)
|
||||
rprint('[green]Copied browser profile to persona[/green]', file=sys.stderr)
|
||||
|
||||
# Extract cookies via CDP
|
||||
rprint('[dim]Extracting cookies via CDP...[/dim]', file=sys.stderr)
|
||||
|
||||
if extract_cookies_via_cdp(
|
||||
persona_chrome_dir,
|
||||
cookies_file,
|
||||
profile_dir=profile,
|
||||
chrome_binary=browser_binary,
|
||||
):
|
||||
rprint(f'[green]Extracted cookies to {cookies_file}[/green]', file=sys.stderr)
|
||||
else:
|
||||
rprint('[yellow]Could not extract cookies automatically.[/yellow]', file=sys.stderr)
|
||||
rprint('[dim]You can manually export cookies using a browser extension.[/dim]', file=sys.stderr)
|
||||
|
||||
except Exception as e:
|
||||
rprint(f'[red]Failed to copy browser profile: {e}[/red]', file=sys.stderr)
|
||||
rprint(f'[red]Failed to import browser profile: {e}[/red]', file=sys.stderr)
|
||||
return 1
|
||||
|
||||
if import_result.profile_copied:
|
||||
rprint('[green]Copied browser profile to persona[/green]', file=sys.stderr)
|
||||
if import_result.cookies_imported:
|
||||
rprint(f'[green]Extracted cookies to {cookies_file}[/green]', file=sys.stderr)
|
||||
elif not import_result.profile_copied:
|
||||
rprint('[yellow]Could not import cookies automatically.[/yellow]', file=sys.stderr)
|
||||
|
||||
for warning in import_result.warnings:
|
||||
rprint(f'[yellow]{warning}[/yellow]', file=sys.stderr)
|
||||
|
||||
if not is_tty:
|
||||
write_record({
|
||||
'id': str(persona.id) if hasattr(persona, 'id') else None,
|
||||
@@ -616,7 +595,7 @@ def update_personas(name: Optional[str] = None) -> int:
|
||||
# Apply updates from CLI flags
|
||||
if name:
|
||||
# Validate new name to prevent path traversal
|
||||
is_valid, error_msg = validate_persona_name(name)
|
||||
is_valid, error_msg = persona_importers.validate_persona_name(name)
|
||||
if not is_valid:
|
||||
rprint(f'[red]Invalid new persona name "{name}": {error_msg}[/red]', file=sys.stderr)
|
||||
continue
|
||||
|
||||
@@ -89,56 +89,6 @@ SNAPSHOT_MACHINE_DIAGRAM = """
|
||||
└─────────────────────────────────────────────────────────────────────────────┘
|
||||
"""
|
||||
|
||||
ARCHIVERESULT_MACHINE_DIAGRAM = """
|
||||
┌─────────────────────────────────────────────────────────────────────────────┐
|
||||
│ ArchiveResultMachine │
|
||||
├─────────────────────────────────────────────────────────────────────────────┤
|
||||
│ │
|
||||
│ ┌─────────────┐ │
|
||||
│ │ QUEUED │◄─────────────────┐ │
|
||||
│ │ (initial) │ │ │
|
||||
│ └──┬───────┬──┘ │ │
|
||||
│ │ │ │ tick() unless can_start() │
|
||||
│ │ │ exceeded_max_ │ │
|
||||
│ │ │ attempts │ │
|
||||
│ │ ▼ │ │
|
||||
│ │ ┌──────────┐ │ │
|
||||
│ │ │ SKIPPED │ │ │
|
||||
│ │ │ (final) │ │ │
|
||||
│ │ └──────────┘ │ │
|
||||
│ │ tick() when │ │
|
||||
│ │ can_start() │ │
|
||||
│ ▼ │ │
|
||||
│ ┌─────────────┐ │ │
|
||||
│ │ STARTED │──────────────────┘ │
|
||||
│ │ │◄─────────────────────────────────────────────────┐ │
|
||||
│ │ enter: │ │ │ │
|
||||
│ │ result.run()│ tick() unless │ │ │
|
||||
│ │ (execute │ is_finished() │ │ │
|
||||
│ │ hook via │──────────────────────┘ │ │
|
||||
│ │ run_hook())│ │ │
|
||||
│ └──────┬──────┘ │ │
|
||||
│ │ │ │
|
||||
│ │ tick() checks status set by hook output │ │
|
||||
│ ├─────────────┬─────────────┬─────────────┐ │ │
|
||||
│ ▼ ▼ ▼ ▼ │ │
|
||||
│ ┌───────────┐ ┌───────────┐ ┌───────────┐ ┌───────────┐ │ │
|
||||
│ │ SUCCEEDED │ │ FAILED │ │ SKIPPED │ │ BACKOFF │ │ │
|
||||
│ │ (final) │ │ (final) │ │ (final) │ │ │ │ │
|
||||
│ └───────────┘ └───────────┘ └───────────┘ └──┬──────┬─┘ │ │
|
||||
│ │ │ │ │
|
||||
│ exceeded_max_ │ │ can_start()│ │
|
||||
│ attempts │ │ loops back │ │
|
||||
│ ▼ │ └────────────┘ │
|
||||
│ ┌──────────┐ │ │
|
||||
│ │ SKIPPED │◄─┘ │
|
||||
│ │ (final) │ │
|
||||
│ └──────────┘ │
|
||||
│ │
|
||||
│ Each ArchiveResult runs ONE specific hook (stored in .hook_name field) │
|
||||
└─────────────────────────────────────────────────────────────────────────────┘
|
||||
"""
|
||||
|
||||
BINARY_MACHINE_DIAGRAM = """
|
||||
┌─────────────────────────────────────────────────────────────────────────────┐
|
||||
│ BinaryMachine │
|
||||
@@ -193,8 +143,8 @@ def pluginmap(
|
||||
"""
|
||||
Show a map of all state machines and their associated plugin hooks.
|
||||
|
||||
Displays ASCII art diagrams of the core model state machines (Crawl, Snapshot,
|
||||
ArchiveResult, Binary) and lists all auto-detected on_Modelname_xyz hooks
|
||||
Displays ASCII art diagrams of the core queued model state machines (Crawl,
|
||||
Snapshot, Binary) and lists all auto-detected on_Modelname_xyz hooks
|
||||
that will run for each model's transitions.
|
||||
"""
|
||||
from rich.console import Console
|
||||
@@ -257,17 +207,6 @@ def pluginmap(
|
||||
prnt(f'[dim]User plugins: {USER_PLUGINS_DIR}[/dim]')
|
||||
prnt()
|
||||
|
||||
# Show diagrams first (unless quiet mode)
|
||||
if not quiet:
|
||||
# Show ArchiveResult diagram separately since it's different
|
||||
prnt(Panel(
|
||||
ARCHIVERESULT_MACHINE_DIAGRAM,
|
||||
title='[bold green]ArchiveResultMachine[/bold green]',
|
||||
border_style='green',
|
||||
expand=False,
|
||||
))
|
||||
prnt()
|
||||
|
||||
for event_name, info in model_events.items():
|
||||
# Discover hooks for this event
|
||||
hooks = discover_hooks(event_name, filter_disabled=not show_disabled)
|
||||
|
||||
@@ -145,17 +145,25 @@ def process_stdin_records() -> int:
|
||||
try:
|
||||
archiveresult = ArchiveResult.objects.get(id=record_id)
|
||||
except ArchiveResult.DoesNotExist:
|
||||
archiveresult = ArchiveResult.from_json(record)
|
||||
archiveresult = None
|
||||
else:
|
||||
# New archiveresult - create it
|
||||
archiveresult = ArchiveResult.from_json(record)
|
||||
archiveresult = None
|
||||
|
||||
snapshot_id = record.get('snapshot_id')
|
||||
plugin_name = record.get('plugin')
|
||||
snapshot = None
|
||||
if archiveresult:
|
||||
archiveresult.retry_at = timezone.now()
|
||||
if archiveresult.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED, ArchiveResult.StatusChoices.BACKOFF]:
|
||||
archiveresult.status = ArchiveResult.StatusChoices.QUEUED
|
||||
archiveresult.save()
|
||||
if archiveresult.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED, ArchiveResult.StatusChoices.NORESULTS, ArchiveResult.StatusChoices.BACKOFF]:
|
||||
archiveresult.reset_for_retry()
|
||||
snapshot = archiveresult.snapshot
|
||||
plugin_name = plugin_name or archiveresult.plugin
|
||||
elif snapshot_id:
|
||||
try:
|
||||
snapshot = Snapshot.objects.get(id=snapshot_id)
|
||||
except Snapshot.DoesNotExist:
|
||||
snapshot = None
|
||||
|
||||
if snapshot:
|
||||
snapshot.retry_at = timezone.now()
|
||||
if snapshot.status != Snapshot.StatusChoices.STARTED:
|
||||
snapshot.status = Snapshot.StatusChoices.QUEUED
|
||||
@@ -167,9 +175,9 @@ def process_stdin_records() -> int:
|
||||
crawl.save(update_fields=['status', 'retry_at', 'modified_at'])
|
||||
crawl_id = str(snapshot.crawl_id)
|
||||
snapshot_ids_by_crawl[crawl_id].add(str(snapshot.id))
|
||||
if archiveresult.plugin:
|
||||
plugin_names_by_crawl[crawl_id].add(archiveresult.plugin)
|
||||
output_records.append(archiveresult.to_json())
|
||||
if plugin_name:
|
||||
plugin_names_by_crawl[crawl_id].add(str(plugin_name))
|
||||
output_records.append(record if not archiveresult else archiveresult.to_json())
|
||||
queued_count += 1
|
||||
|
||||
elif record_type == TYPE_BINARY:
|
||||
@@ -234,9 +242,11 @@ def run_runner(daemon: bool = False) -> int:
|
||||
"""
|
||||
from django.utils import timezone
|
||||
from archivebox.machine.models import Machine, Process
|
||||
from archivebox.services.runner import run_pending_crawls
|
||||
from archivebox.services.runner import recover_orphaned_crawls, recover_orphaned_snapshots, run_pending_crawls
|
||||
|
||||
Process.cleanup_stale_running()
|
||||
recover_orphaned_snapshots()
|
||||
recover_orphaned_crawls()
|
||||
Machine.current()
|
||||
current = Process.current()
|
||||
if current.process_type != Process.TypeChoices.ORCHESTRATOR:
|
||||
@@ -305,6 +315,13 @@ def main(daemon: bool, crawl_id: str, snapshot_id: str, binary_id: str):
|
||||
traceback.print_exc()
|
||||
sys.exit(1)
|
||||
|
||||
if daemon:
|
||||
if not sys.stdin.isatty():
|
||||
exit_code = process_stdin_records()
|
||||
if exit_code != 0:
|
||||
sys.exit(exit_code)
|
||||
sys.exit(run_runner(daemon=True))
|
||||
|
||||
if not sys.stdin.isatty():
|
||||
sys.exit(process_stdin_records())
|
||||
else:
|
||||
|
||||
@@ -3,9 +3,7 @@
|
||||
__package__ = 'archivebox.cli'
|
||||
|
||||
from typing import Iterable
|
||||
import os
|
||||
import sys
|
||||
import subprocess
|
||||
|
||||
import rich_click as click
|
||||
from rich import print
|
||||
@@ -14,6 +12,41 @@ from archivebox.misc.util import docstring, enforce_types
|
||||
from archivebox.config.common import SERVER_CONFIG
|
||||
|
||||
|
||||
def stop_existing_background_runner(*, machine, process_model, supervisor=None, stop_worker_fn=None, log=print) -> int:
|
||||
"""Stop any existing orchestrator process so the server can take ownership."""
|
||||
process_model.cleanup_stale_running(machine=machine)
|
||||
|
||||
running_runners = list(process_model.objects.filter(
|
||||
machine=machine,
|
||||
status=process_model.StatusChoices.RUNNING,
|
||||
process_type=process_model.TypeChoices.ORCHESTRATOR,
|
||||
).order_by('created_at'))
|
||||
|
||||
if not running_runners:
|
||||
return 0
|
||||
|
||||
log('[yellow][*] Stopping existing ArchiveBox background runner...[/yellow]')
|
||||
|
||||
if supervisor is not None and stop_worker_fn is not None:
|
||||
for worker_name in ('worker_runner', 'worker_runner_watch'):
|
||||
try:
|
||||
stop_worker_fn(supervisor, worker_name)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
for proc in running_runners:
|
||||
try:
|
||||
proc.kill_tree(graceful_timeout=2.0)
|
||||
except Exception:
|
||||
try:
|
||||
proc.terminate(graceful_timeout=2.0)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
process_model.cleanup_stale_running(machine=machine)
|
||||
return len(running_runners)
|
||||
|
||||
|
||||
@enforce_types
|
||||
def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
|
||||
reload: bool=False,
|
||||
@@ -39,25 +72,6 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
|
||||
if debug or reload:
|
||||
SHELL_CONFIG.DEBUG = True
|
||||
|
||||
if run_in_debug:
|
||||
os.environ['ARCHIVEBOX_RUNSERVER'] = '1'
|
||||
if reload:
|
||||
os.environ['ARCHIVEBOX_AUTORELOAD'] = '1'
|
||||
from archivebox.config.common import STORAGE_CONFIG
|
||||
pidfile = str(STORAGE_CONFIG.TMP_DIR / 'runserver.pid')
|
||||
os.environ['ARCHIVEBOX_RUNSERVER_PIDFILE'] = pidfile
|
||||
|
||||
from django.utils.autoreload import DJANGO_AUTORELOAD_ENV
|
||||
is_reloader_child = os.environ.get(DJANGO_AUTORELOAD_ENV) == 'true'
|
||||
if not is_reloader_child:
|
||||
env = os.environ.copy()
|
||||
subprocess.Popen(
|
||||
[sys.executable, '-m', 'archivebox', 'manage', 'runner_watch', f'--pidfile={pidfile}'],
|
||||
env=env,
|
||||
stdout=subprocess.DEVNULL,
|
||||
stderr=subprocess.DEVNULL,
|
||||
)
|
||||
|
||||
from django.contrib.auth.models import User
|
||||
|
||||
if not User.objects.filter(is_superuser=True).exclude(username='system').exists():
|
||||
@@ -81,73 +95,62 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
|
||||
except IndexError:
|
||||
pass
|
||||
|
||||
from archivebox.workers.supervisord_util import (
|
||||
get_existing_supervisord_process,
|
||||
get_worker,
|
||||
stop_worker,
|
||||
start_server_workers,
|
||||
is_port_in_use,
|
||||
)
|
||||
from archivebox.machine.models import Machine, Process
|
||||
|
||||
# Check if port is already in use
|
||||
if is_port_in_use(host, int(port)):
|
||||
print(f'[red][X] Error: Port {port} is already in use[/red]')
|
||||
print(f' Another process (possibly daphne or runserver) is already listening on {host}:{port}')
|
||||
print(' Stop the conflicting process or choose a different port')
|
||||
sys.exit(1)
|
||||
|
||||
machine = Machine.current()
|
||||
stop_existing_background_runner(
|
||||
machine=machine,
|
||||
process_model=Process,
|
||||
supervisor=get_existing_supervisord_process(),
|
||||
stop_worker_fn=stop_worker,
|
||||
)
|
||||
|
||||
supervisor = get_existing_supervisord_process()
|
||||
if supervisor:
|
||||
server_worker_name = 'worker_runserver' if run_in_debug else 'worker_daphne'
|
||||
server_proc = get_worker(supervisor, server_worker_name)
|
||||
server_state = server_proc.get('statename') if isinstance(server_proc, dict) else None
|
||||
if server_state == 'RUNNING':
|
||||
runner_proc = get_worker(supervisor, 'worker_runner')
|
||||
runner_watch_proc = get_worker(supervisor, 'worker_runner_watch')
|
||||
runner_state = runner_proc.get('statename') if isinstance(runner_proc, dict) else None
|
||||
runner_watch_state = runner_watch_proc.get('statename') if isinstance(runner_watch_proc, dict) else None
|
||||
print('[red][X] Error: ArchiveBox server is already running[/red]')
|
||||
print(f' [green]√[/green] Web server ({server_worker_name}) is RUNNING on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
|
||||
if runner_state == 'RUNNING':
|
||||
print(' [green]√[/green] Background runner (worker_runner) is RUNNING')
|
||||
if runner_watch_state == 'RUNNING':
|
||||
print(' [green]√[/green] Reload watcher (worker_runner_watch) is RUNNING')
|
||||
print()
|
||||
print('[yellow]To stop the existing server, run:[/yellow]')
|
||||
print(' pkill -f "archivebox server"')
|
||||
print(' pkill -f supervisord')
|
||||
sys.exit(1)
|
||||
|
||||
if run_in_debug:
|
||||
from django.core.management import call_command
|
||||
print('[green][+] Starting ArchiveBox webserver in DEBUG mode...[/green]')
|
||||
print(f' [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
|
||||
print(f' [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]')
|
||||
print(' > Writing ArchiveBox error log to ./logs/errors.log')
|
||||
if not reload:
|
||||
runserver_args.append('--noreload') # '--insecure'
|
||||
if nothreading:
|
||||
runserver_args.append('--nothreading')
|
||||
call_command("runserver", *runserver_args)
|
||||
else:
|
||||
from archivebox.workers.supervisord_util import (
|
||||
get_existing_supervisord_process,
|
||||
get_worker,
|
||||
start_server_workers,
|
||||
is_port_in_use,
|
||||
)
|
||||
from archivebox.machine.models import Machine, Process
|
||||
|
||||
# Check if port is already in use
|
||||
if is_port_in_use(host, int(port)):
|
||||
print(f'[red][X] Error: Port {port} is already in use[/red]')
|
||||
print(f' Another process (possibly daphne) is already listening on {host}:{port}')
|
||||
print(' Stop the conflicting process or choose a different port')
|
||||
sys.exit(1)
|
||||
|
||||
# Check if the background crawl runner is already running for this data directory
|
||||
if Process.objects.filter(
|
||||
machine=Machine.current(),
|
||||
status=Process.StatusChoices.RUNNING,
|
||||
process_type=Process.TypeChoices.ORCHESTRATOR,
|
||||
).exists():
|
||||
print('[red][X] Error: ArchiveBox background runner is already running for this data directory[/red]')
|
||||
print(' Stop the existing runner before starting a new server')
|
||||
print(' To stop: pkill -f "archivebox run --daemon"')
|
||||
sys.exit(1)
|
||||
|
||||
# Check if supervisord is already running
|
||||
supervisor = get_existing_supervisord_process()
|
||||
if supervisor:
|
||||
daphne_proc = get_worker(supervisor, 'worker_daphne')
|
||||
daphne_state = daphne_proc.get('statename') if isinstance(daphne_proc, dict) else None
|
||||
|
||||
# If daphne is already running, error out
|
||||
if daphne_state == 'RUNNING':
|
||||
runner_proc = get_worker(supervisor, 'worker_runner')
|
||||
runner_state = runner_proc.get('statename') if isinstance(runner_proc, dict) else None
|
||||
print('[red][X] Error: ArchiveBox server is already running[/red]')
|
||||
print(f' [green]√[/green] Web server (worker_daphne) is RUNNING on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
|
||||
if runner_state == 'RUNNING':
|
||||
print(' [green]√[/green] Background runner (worker_runner) is RUNNING')
|
||||
print()
|
||||
print('[yellow]To stop the existing server, run:[/yellow]')
|
||||
print(' pkill -f "archivebox server"')
|
||||
print(' pkill -f supervisord')
|
||||
sys.exit(1)
|
||||
# Otherwise, daphne is not running - fall through to start it
|
||||
|
||||
# No existing workers found - start new ones
|
||||
print('[green][+] Starting ArchiveBox webserver...[/green]')
|
||||
print(f' [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
|
||||
print(f' [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]')
|
||||
print(' > Writing ArchiveBox error log to ./logs/errors.log')
|
||||
print()
|
||||
start_server_workers(host=host, port=port, daemonize=daemonize)
|
||||
print("\n[i][green][🟩] ArchiveBox server shut down gracefully.[/green][/i]")
|
||||
print(f' [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
|
||||
print(f' [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]')
|
||||
print(' > Writing ArchiveBox error log to ./logs/errors.log')
|
||||
print()
|
||||
start_server_workers(host=host, port=port, daemonize=daemonize, debug=run_in_debug, reload=reload, nothreading=nothreading)
|
||||
print("\n[i][green][🟩] ArchiveBox server shut down gracefully.[/green][/i]")
|
||||
|
||||
|
||||
@click.command()
|
||||
|
||||
@@ -172,6 +172,9 @@ def list_snapshots(
|
||||
tag: Optional[str] = None,
|
||||
crawl_id: Optional[str] = None,
|
||||
limit: Optional[int] = None,
|
||||
sort: Optional[str] = None,
|
||||
csv: Optional[str] = None,
|
||||
with_headers: bool = False,
|
||||
) -> int:
|
||||
"""
|
||||
List Snapshots as JSONL with optional filters.
|
||||
@@ -182,7 +185,11 @@ def list_snapshots(
|
||||
from archivebox.misc.jsonl import write_record
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
is_tty = sys.stdout.isatty()
|
||||
if with_headers and not csv:
|
||||
rprint('[red]--with-headers requires --csv[/red]', file=sys.stderr)
|
||||
return 2
|
||||
|
||||
is_tty = sys.stdout.isatty() and not csv
|
||||
|
||||
queryset = Snapshot.objects.all().order_by('-created_at')
|
||||
|
||||
@@ -199,7 +206,29 @@ def list_snapshots(
|
||||
if tag:
|
||||
queryset = queryset.filter(tags__name__iexact=tag)
|
||||
|
||||
if sort:
|
||||
queryset = queryset.order_by(sort)
|
||||
|
||||
count = 0
|
||||
if csv:
|
||||
cols = [col.strip() for col in csv.split(',') if col.strip()]
|
||||
if not cols:
|
||||
rprint('[red]No CSV columns provided[/red]', file=sys.stderr)
|
||||
return 2
|
||||
rows: list[str] = []
|
||||
if with_headers:
|
||||
rows.append(','.join(cols))
|
||||
for snapshot in queryset.iterator(chunk_size=500):
|
||||
rows.append(snapshot.to_csv(cols=cols, separator=','))
|
||||
count += 1
|
||||
output = '\n'.join(rows)
|
||||
if output:
|
||||
sys.stdout.write(output)
|
||||
if not output.endswith('\n'):
|
||||
sys.stdout.write('\n')
|
||||
rprint(f'[dim]Listed {count} snapshots[/dim]', file=sys.stderr)
|
||||
return 0
|
||||
|
||||
for snapshot in queryset:
|
||||
if is_tty:
|
||||
status_color = {
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
__package__ = "archivebox.config"
|
||||
|
||||
import re
|
||||
import secrets
|
||||
import sys
|
||||
import shutil
|
||||
from typing import ClassVar, Dict, Optional, List
|
||||
@@ -8,7 +9,6 @@ from pathlib import Path
|
||||
|
||||
from rich import print
|
||||
from pydantic import Field, field_validator
|
||||
from django.utils.crypto import get_random_string
|
||||
|
||||
from archivebox.config.configset import BaseConfigSet
|
||||
|
||||
@@ -104,7 +104,7 @@ class ServerConfig(BaseConfigSet):
|
||||
"danger-onedomain-fullreplay",
|
||||
)
|
||||
|
||||
SECRET_KEY: str = Field(default_factory=lambda: get_random_string(50, "abcdefghijklmnopqrstuvwxyz0123456789_"))
|
||||
SECRET_KEY: str = Field(default_factory=lambda: ''.join(secrets.choice("abcdefghijklmnopqrstuvwxyz0123456789_") for _ in range(50)))
|
||||
BIND_ADDR: str = Field(default="127.0.0.1:8000")
|
||||
LISTEN_HOST: str = Field(default="archivebox.localhost:8000")
|
||||
ADMIN_BASE_URL: str = Field(default="")
|
||||
|
||||
@@ -1,10 +1,13 @@
|
||||
__package__ = 'archivebox.config'
|
||||
|
||||
import html
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import inspect
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict
|
||||
from typing import Any, Callable, Dict
|
||||
from urllib.parse import quote, urlencode
|
||||
from django.http import HttpRequest
|
||||
from django.utils import timezone
|
||||
from django.utils.html import format_html
|
||||
@@ -18,16 +21,27 @@ from archivebox.misc.util import parse_date
|
||||
|
||||
from archivebox.machine.models import Binary
|
||||
|
||||
ABX_PLUGINS_DOCS_BASE_URL = 'https://archivebox.github.io/abx-plugins/'
|
||||
ABX_PLUGINS_GITHUB_BASE_URL = 'https://github.com/ArchiveBox/abx-plugins/tree/main/abx_plugins/plugins/'
|
||||
LIVE_CONFIG_BASE_URL = '/admin/environment/config/'
|
||||
ENVIRONMENT_BINARIES_BASE_URL = '/admin/environment/binaries/'
|
||||
INSTALLED_BINARIES_BASE_URL = '/admin/machine/binary/'
|
||||
|
||||
|
||||
# Common binaries to check for
|
||||
KNOWN_BINARIES = [
|
||||
'wget', 'curl', 'chromium', 'chrome', 'google-chrome', 'google-chrome-stable',
|
||||
'node', 'npm', 'npx', 'yt-dlp', 'ytdlp', 'youtube-dl',
|
||||
'node', 'npm', 'npx', 'yt-dlp',
|
||||
'git', 'singlefile', 'readability-extractor', 'mercury-parser',
|
||||
'python3', 'python', 'bash', 'zsh',
|
||||
'ffmpeg', 'ripgrep', 'rg', 'sonic', 'archivebox',
|
||||
]
|
||||
|
||||
CANONICAL_BINARY_ALIASES = {
|
||||
'youtube-dl': 'yt-dlp',
|
||||
'ytdlp': 'yt-dlp',
|
||||
}
|
||||
|
||||
|
||||
def is_superuser(request: HttpRequest) -> bool:
|
||||
return bool(getattr(request.user, 'is_superuser', False))
|
||||
@@ -38,6 +52,249 @@ def format_parsed_datetime(value: object) -> str:
|
||||
return parsed.strftime("%Y-%m-%d %H:%M:%S") if parsed else ""
|
||||
|
||||
|
||||
JSON_TOKEN_RE = re.compile(
|
||||
r'(?P<key>"(?:\\u[a-fA-F0-9]{4}|\\[^u]|[^\\"])*")(?=\s*:)'
|
||||
r'|(?P<string>"(?:\\u[a-fA-F0-9]{4}|\\[^u]|[^\\"])*")'
|
||||
r'|(?P<boolean>\btrue\b|\bfalse\b)'
|
||||
r'|(?P<null>\bnull\b)'
|
||||
r'|(?P<number>-?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?)'
|
||||
)
|
||||
|
||||
|
||||
def render_code_block(text: str, *, highlighted: bool = False) -> str:
|
||||
code = html.escape(text, quote=False)
|
||||
|
||||
if highlighted:
|
||||
def _wrap_token(match: re.Match[str]) -> str:
|
||||
styles = {
|
||||
'key': 'color: #0550ae;',
|
||||
'string': 'color: #0a7f45;',
|
||||
'boolean': 'color: #8250df; font-weight: 600;',
|
||||
'null': 'color: #6e7781; font-style: italic;',
|
||||
'number': 'color: #b35900;',
|
||||
}
|
||||
token_type = next(name for name, value in match.groupdict().items() if value is not None)
|
||||
return f'<span style="{styles[token_type]}">{match.group(0)}</span>'
|
||||
|
||||
code = JSON_TOKEN_RE.sub(_wrap_token, code)
|
||||
|
||||
return (
|
||||
'<pre style="max-height: 600px; overflow: auto; background: #f6f8fa; '
|
||||
'border: 1px solid #d0d7de; border-radius: 6px; padding: 12px; margin: 0;">'
|
||||
'<code style="font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, '
|
||||
'\'Liberation Mono\', monospace; white-space: pre; line-height: 1.5;">'
|
||||
f'{code}'
|
||||
'</code></pre>'
|
||||
)
|
||||
|
||||
|
||||
def render_highlighted_json_block(value: Any) -> str:
|
||||
return render_code_block(json.dumps(value, indent=2, ensure_ascii=False), highlighted=True)
|
||||
|
||||
|
||||
def get_plugin_docs_url(plugin_name: str) -> str:
|
||||
return f'{ABX_PLUGINS_DOCS_BASE_URL}#{plugin_name}'
|
||||
|
||||
|
||||
def get_plugin_hook_source_url(plugin_name: str, hook_name: str) -> str:
|
||||
return f'{ABX_PLUGINS_GITHUB_BASE_URL}{quote(plugin_name)}/{quote(hook_name)}'
|
||||
|
||||
|
||||
def get_live_config_url(key: str) -> str:
|
||||
return f'{LIVE_CONFIG_BASE_URL}{quote(key)}/'
|
||||
|
||||
|
||||
def get_environment_binary_url(name: str) -> str:
|
||||
return f'{ENVIRONMENT_BINARIES_BASE_URL}{quote(name)}/'
|
||||
|
||||
|
||||
def get_installed_binary_change_url(name: str, binary: Any) -> str | None:
|
||||
binary_id = getattr(binary, 'id', None)
|
||||
if not binary_id:
|
||||
return None
|
||||
|
||||
base_url = getattr(binary, 'admin_change_url', None) or f'{INSTALLED_BINARIES_BASE_URL}{binary_id}/change/'
|
||||
changelist_filters = urlencode({'q': canonical_binary_name(name)})
|
||||
return f'{base_url}?{urlencode({"_changelist_filters": changelist_filters})}'
|
||||
|
||||
|
||||
def get_machine_admin_url() -> str | None:
|
||||
try:
|
||||
from archivebox.machine.models import Machine
|
||||
return Machine.current().admin_change_url
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def render_code_tag_list(values: list[str]) -> str:
|
||||
if not values:
|
||||
return '<span style="color: #6e7781;">(none)</span>'
|
||||
|
||||
tags = ''.join(
|
||||
str(format_html(
|
||||
'<code style="display: inline-block; margin: 0 6px 6px 0; padding: 2px 6px; '
|
||||
'background: #f6f8fa; border: 1px solid #d0d7de; border-radius: 999px;">{}</code>',
|
||||
value,
|
||||
))
|
||||
for value in values
|
||||
)
|
||||
return f'<div style="display: flex; flex-wrap: wrap;">{tags}</div>'
|
||||
|
||||
|
||||
def render_plugin_metadata_html(config: dict[str, Any]) -> str:
|
||||
rows = (
|
||||
('Title', config.get('title') or '(none)'),
|
||||
('Description', config.get('description') or '(none)'),
|
||||
('Required Plugins', mark_safe(render_link_tag_list(config.get('required_plugins') or [], get_plugin_docs_url))),
|
||||
('Required Binaries', mark_safe(render_link_tag_list(config.get('required_binaries') or [], get_environment_binary_url))),
|
||||
('Output MIME Types', mark_safe(render_code_tag_list(config.get('output_mimetypes') or []))),
|
||||
)
|
||||
|
||||
rendered_rows = ''.join(
|
||||
str(format_html(
|
||||
'<div style="margin: 0 0 14px 0;">'
|
||||
'<div style="font-weight: 600; margin-bottom: 4px;">{}</div>'
|
||||
'<div>{}</div>'
|
||||
'</div>',
|
||||
label,
|
||||
value,
|
||||
))
|
||||
for label, value in rows
|
||||
)
|
||||
return f'<div style="margin: 4px 0 0 0;">{rendered_rows}</div>'
|
||||
|
||||
|
||||
def render_link_tag_list(values: list[str], url_resolver: Callable[[str], str] | None = None) -> str:
|
||||
if not values:
|
||||
return '<span style="color: #6e7781;">(none)</span>'
|
||||
|
||||
tags = []
|
||||
for value in values:
|
||||
if url_resolver is None:
|
||||
tags.append(str(format_html(
|
||||
'<code style="display: inline-block; margin: 0 6px 6px 0; padding: 2px 6px; '
|
||||
'background: #f6f8fa; border: 1px solid #d0d7de; border-radius: 999px;">{}</code>',
|
||||
value,
|
||||
)))
|
||||
else:
|
||||
tags.append(str(format_html(
|
||||
'<a href="{}" style="text-decoration: none;">'
|
||||
'<code style="display: inline-block; margin: 0 6px 6px 0; padding: 2px 6px; '
|
||||
'background: #f6f8fa; border: 1px solid #d0d7de; border-radius: 999px;">{}</code>'
|
||||
'</a>',
|
||||
url_resolver(value),
|
||||
value,
|
||||
)))
|
||||
return f'<div style="display: flex; flex-wrap: wrap;">{"".join(tags)}</div>'
|
||||
|
||||
|
||||
def render_property_links(prop_name: str, prop_info: dict[str, Any], machine_admin_url: str | None) -> str:
|
||||
links = [
|
||||
str(format_html('<a href="{}">Computed value</a>', get_live_config_url(prop_name))),
|
||||
]
|
||||
if machine_admin_url:
|
||||
links.append(str(format_html('<a href="{}">Edit override</a>', machine_admin_url)))
|
||||
|
||||
fallback = prop_info.get('x-fallback')
|
||||
if isinstance(fallback, str) and fallback:
|
||||
links.append(str(format_html('<a href="{}">Fallback: <code>{}</code></a>', get_live_config_url(fallback), fallback)))
|
||||
|
||||
aliases = prop_info.get('x-aliases') or []
|
||||
if isinstance(aliases, list):
|
||||
for alias in aliases:
|
||||
if isinstance(alias, str) and alias:
|
||||
links.append(str(format_html('<a href="{}">Alias: <code>{}</code></a>', get_live_config_url(alias), alias)))
|
||||
|
||||
default = prop_info.get('default')
|
||||
if prop_name.endswith('_BINARY') and isinstance(default, str) and default:
|
||||
links.append(str(format_html('<a href="{}">Binary: <code>{}</code></a>', get_environment_binary_url(default), default)))
|
||||
|
||||
return ' '.join(links)
|
||||
|
||||
|
||||
def render_config_properties_html(properties: dict[str, Any], machine_admin_url: str | None) -> str:
|
||||
header_links = [
|
||||
str(format_html('<a href="{}">Dependencies</a>', ENVIRONMENT_BINARIES_BASE_URL)),
|
||||
str(format_html('<a href="{}">Installed Binaries</a>', INSTALLED_BINARIES_BASE_URL)),
|
||||
]
|
||||
if machine_admin_url:
|
||||
header_links.insert(0, str(format_html('<a href="{}">Machine Config Editor</a>', machine_admin_url)))
|
||||
|
||||
cards = [
|
||||
f'<div style="margin: 0 0 16px 0;">{" | ".join(header_links)}</div>'
|
||||
]
|
||||
|
||||
for prop_name, prop_info in properties.items():
|
||||
prop_type = prop_info.get('type', 'unknown')
|
||||
if isinstance(prop_type, list):
|
||||
prop_type = ' | '.join(str(type_name) for type_name in prop_type)
|
||||
prop_desc = prop_info.get('description', '')
|
||||
|
||||
default_html = ''
|
||||
if 'default' in prop_info:
|
||||
default_html = str(format_html(
|
||||
'<div style="margin-top: 6px;"><b>Default:</b> <code>{}</code></div>',
|
||||
prop_info['default'],
|
||||
))
|
||||
|
||||
description_html = prop_desc or mark_safe('<span style="color: #6e7781;">(no description)</span>')
|
||||
cards.append(str(format_html(
|
||||
'<div style="margin: 0 0 14px 0; padding: 12px; background: #f6f8fa; border: 1px solid #d0d7de; border-radius: 6px;">'
|
||||
'<div style="margin-bottom: 6px;">'
|
||||
'<a href="{}" style="font-weight: 600;"><code>{}</code></a>'
|
||||
' <span style="color: #6e7781;">({})</span>'
|
||||
'</div>'
|
||||
'<div style="margin-bottom: 6px;">{}</div>'
|
||||
'<div style="font-size: 0.95em;">{}</div>'
|
||||
'{}'
|
||||
'</div>',
|
||||
get_live_config_url(prop_name),
|
||||
prop_name,
|
||||
prop_type,
|
||||
description_html,
|
||||
mark_safe(render_property_links(prop_name, prop_info, machine_admin_url)),
|
||||
mark_safe(default_html),
|
||||
)))
|
||||
|
||||
return ''.join(cards)
|
||||
|
||||
|
||||
def render_hook_links_html(plugin_name: str, hooks: list[str], source: str) -> str:
|
||||
if not hooks:
|
||||
return '<span style="color: #6e7781;">(none)</span>'
|
||||
|
||||
items = []
|
||||
for hook_name in hooks:
|
||||
if source == 'builtin':
|
||||
items.append(str(format_html(
|
||||
'<div style="margin: 0 0 8px 0;">'
|
||||
'<a href="{}" target="_blank" rel="noopener noreferrer"><code>{}</code></a>'
|
||||
'</div>',
|
||||
get_plugin_hook_source_url(plugin_name, hook_name),
|
||||
hook_name,
|
||||
)))
|
||||
else:
|
||||
items.append(str(format_html(
|
||||
'<div style="margin: 0 0 8px 0;"><code>{}</code></div>',
|
||||
hook_name,
|
||||
)))
|
||||
return ''.join(items)
|
||||
|
||||
|
||||
def render_binary_detail_description(name: str, merged: dict[str, Any], db_binary: Any) -> str:
|
||||
installed_binary_url = get_installed_binary_change_url(name, db_binary)
|
||||
|
||||
if installed_binary_url:
|
||||
return str(format_html(
|
||||
'<code>{}</code><br/>'
|
||||
'<a href="{}">View Installed Binary Record</a>',
|
||||
merged['abspath'],
|
||||
installed_binary_url,
|
||||
))
|
||||
|
||||
return str(format_html('<code>{}</code>', merged['abspath']))
|
||||
|
||||
|
||||
def obj_to_yaml(obj: Any, indent: int = 0) -> str:
|
||||
indent_str = " " * indent
|
||||
if indent == 0:
|
||||
@@ -80,21 +337,41 @@ def obj_to_yaml(obj: Any, indent: int = 0) -> str:
|
||||
return f" {str(obj)}"
|
||||
|
||||
|
||||
def get_detected_binaries() -> Dict[str, Dict[str, Any]]:
|
||||
"""Detect available binaries using shutil.which."""
|
||||
binaries = {}
|
||||
def canonical_binary_name(name: str) -> str:
|
||||
return CANONICAL_BINARY_ALIASES.get(name, name)
|
||||
|
||||
for name in KNOWN_BINARIES:
|
||||
path = shutil.which(name)
|
||||
if path:
|
||||
binaries[name] = {
|
||||
'name': name,
|
||||
'abspath': path,
|
||||
'version': None, # Could add version detection later
|
||||
'is_available': True,
|
||||
}
|
||||
|
||||
return binaries
|
||||
def _binary_sort_key(binary: Binary) -> tuple[int, int, int, Any]:
|
||||
return (
|
||||
int(binary.status == Binary.StatusChoices.INSTALLED),
|
||||
int(bool(binary.version)),
|
||||
int(bool(binary.abspath)),
|
||||
binary.modified_at,
|
||||
)
|
||||
|
||||
|
||||
def get_db_binaries_by_name() -> Dict[str, Binary]:
|
||||
grouped: Dict[str, list[Binary]] = {}
|
||||
for binary in Binary.objects.all():
|
||||
grouped.setdefault(canonical_binary_name(binary.name), []).append(binary)
|
||||
|
||||
return {
|
||||
name: max(records, key=_binary_sort_key)
|
||||
for name, records in grouped.items()
|
||||
}
|
||||
|
||||
|
||||
def serialize_binary_record(name: str, binary: Binary | None) -> Dict[str, Any]:
|
||||
is_installed = bool(binary and binary.status == Binary.StatusChoices.INSTALLED)
|
||||
return {
|
||||
'name': canonical_binary_name(name),
|
||||
'version': str(getattr(binary, 'version', '') or ''),
|
||||
'binprovider': str(getattr(binary, 'binprovider', '') or ''),
|
||||
'abspath': str(getattr(binary, 'abspath', '') or ''),
|
||||
'sha256': str(getattr(binary, 'sha256', '') or ''),
|
||||
'status': str(getattr(binary, 'status', '') or ''),
|
||||
'is_available': is_installed and bool(getattr(binary, 'abspath', '') or ''),
|
||||
}
|
||||
|
||||
|
||||
def get_filesystem_plugins() -> Dict[str, Dict[str, Any]]:
|
||||
@@ -150,29 +427,18 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
|
||||
"Found Abspath": [],
|
||||
}
|
||||
|
||||
# Get binaries from database (previously detected/installed)
|
||||
db_binaries = {b.name: b for b in Binary.objects.all()}
|
||||
|
||||
# Get currently detectable binaries
|
||||
detected = get_detected_binaries()
|
||||
|
||||
# Merge and display
|
||||
all_binary_names = sorted(set(list(db_binaries.keys()) + list(detected.keys())))
|
||||
db_binaries = get_db_binaries_by_name()
|
||||
all_binary_names = sorted(db_binaries.keys())
|
||||
|
||||
for name in all_binary_names:
|
||||
db_binary = db_binaries.get(name)
|
||||
detected_binary = detected.get(name)
|
||||
merged = serialize_binary_record(name, db_binaries.get(name))
|
||||
|
||||
rows['Binary Name'].append(ItemLink(name, key=name))
|
||||
|
||||
if db_binary:
|
||||
rows['Found Version'].append(f'✅ {db_binary.version}' if db_binary.version else '✅ found')
|
||||
rows['Provided By'].append(db_binary.binprovider or 'PATH')
|
||||
rows['Found Abspath'].append(str(db_binary.abspath or ''))
|
||||
elif detected_binary:
|
||||
rows['Found Version'].append('✅ found')
|
||||
rows['Provided By'].append('PATH')
|
||||
rows['Found Abspath'].append(detected_binary['abspath'])
|
||||
if merged['is_available']:
|
||||
rows['Found Version'].append(f"✅ {merged['version']}" if merged['version'] else '✅ found')
|
||||
rows['Provided By'].append(merged['binprovider'] or '-')
|
||||
rows['Found Abspath'].append(merged['abspath'] or '-')
|
||||
else:
|
||||
rows['Found Version'].append('❌ missing')
|
||||
rows['Provided By'].append('-')
|
||||
@@ -187,41 +453,22 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
|
||||
@render_with_item_view
|
||||
def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
|
||||
assert is_superuser(request), 'Must be a superuser to view configuration settings.'
|
||||
key = canonical_binary_name(key)
|
||||
|
||||
# Try database first
|
||||
try:
|
||||
binary = Binary.objects.get(name=key)
|
||||
section: SectionData = {
|
||||
"name": binary.name,
|
||||
"description": str(binary.abspath or ''),
|
||||
"fields": {
|
||||
'name': binary.name,
|
||||
'binprovider': binary.binprovider,
|
||||
'abspath': str(binary.abspath),
|
||||
'version': binary.version,
|
||||
'sha256': binary.sha256,
|
||||
},
|
||||
"help_texts": {},
|
||||
}
|
||||
return ItemContext(
|
||||
slug=key,
|
||||
title=key,
|
||||
data=[section],
|
||||
)
|
||||
except Binary.DoesNotExist:
|
||||
pass
|
||||
db_binary = get_db_binaries_by_name().get(key)
|
||||
merged = serialize_binary_record(key, db_binary)
|
||||
|
||||
# Try to detect from PATH
|
||||
path = shutil.which(key)
|
||||
if path:
|
||||
if merged['is_available']:
|
||||
section: SectionData = {
|
||||
"name": key,
|
||||
"description": path,
|
||||
"description": mark_safe(render_binary_detail_description(key, merged, db_binary)),
|
||||
"fields": {
|
||||
'name': key,
|
||||
'binprovider': 'PATH',
|
||||
'abspath': path,
|
||||
'version': 'unknown',
|
||||
'binprovider': merged['binprovider'] or '-',
|
||||
'abspath': merged['abspath'] or 'not found',
|
||||
'version': merged['version'] or 'unknown',
|
||||
'sha256': merged['sha256'],
|
||||
'status': merged['status'],
|
||||
},
|
||||
"help_texts": {},
|
||||
}
|
||||
@@ -233,12 +480,13 @@ def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
|
||||
|
||||
section: SectionData = {
|
||||
"name": key,
|
||||
"description": "Binary not found",
|
||||
"description": "No persisted Binary record found",
|
||||
"fields": {
|
||||
'name': key,
|
||||
'binprovider': 'not installed',
|
||||
'abspath': 'not found',
|
||||
'version': 'N/A',
|
||||
'binprovider': merged['binprovider'] or 'not recorded',
|
||||
'abspath': merged['abspath'] or 'not recorded',
|
||||
'version': merged['version'] or 'N/A',
|
||||
'status': merged['status'] or 'unrecorded',
|
||||
},
|
||||
"help_texts": {},
|
||||
}
|
||||
@@ -293,8 +541,6 @@ def plugins_list_view(request: HttpRequest, **kwargs) -> TableContext:
|
||||
|
||||
@render_with_item_view
|
||||
def plugin_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
|
||||
import json
|
||||
|
||||
assert is_superuser(request), 'Must be a superuser to view configuration settings.'
|
||||
|
||||
plugins = get_filesystem_plugins()
|
||||
@@ -308,45 +554,61 @@ def plugin_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
|
||||
)
|
||||
|
||||
# Base fields that all plugins have
|
||||
docs_url = get_plugin_docs_url(plugin['name'])
|
||||
machine_admin_url = get_machine_admin_url()
|
||||
fields = {
|
||||
"id": plugin['id'],
|
||||
"name": plugin['name'],
|
||||
"source": plugin['source'],
|
||||
"path": plugin['path'],
|
||||
"hooks": ', '.join(plugin['hooks']),
|
||||
}
|
||||
|
||||
# Add config.json data if available
|
||||
if plugin.get('config'):
|
||||
config_json = json.dumps(plugin['config'], indent=2)
|
||||
fields["config.json"] = mark_safe(
|
||||
'<pre style="max-height: 600px; overflow-y: auto; background: #f5f5f5; '
|
||||
f'padding: 10px; border-radius: 4px;"><code>{config_json}</code></pre>'
|
||||
)
|
||||
|
||||
# Also extract and display individual config properties for easier viewing
|
||||
if 'properties' in plugin['config']:
|
||||
config_properties = plugin['config']['properties']
|
||||
properties_summary = []
|
||||
for prop_name, prop_info in config_properties.items():
|
||||
prop_type = prop_info.get('type', 'unknown')
|
||||
prop_desc = prop_info.get('description', '')
|
||||
properties_summary.append(f"• {prop_name} ({prop_type}): {prop_desc}")
|
||||
|
||||
if properties_summary:
|
||||
fields["Config Properties"] = mark_safe('<br/>'.join(properties_summary))
|
||||
|
||||
section: SectionData = {
|
||||
sections: list[SectionData] = [{
|
||||
"name": plugin['name'],
|
||||
"description": plugin['path'],
|
||||
"description": format_html(
|
||||
'<code>{}</code><br/><a href="{}" target="_blank" rel="noopener noreferrer">ABX Plugin Docs</a>',
|
||||
plugin['path'],
|
||||
docs_url,
|
||||
),
|
||||
"fields": fields,
|
||||
"help_texts": {},
|
||||
}
|
||||
}]
|
||||
|
||||
if plugin['hooks']:
|
||||
sections.append({
|
||||
"name": "Hooks",
|
||||
"description": mark_safe(render_hook_links_html(plugin['name'], plugin['hooks'], plugin['source'])),
|
||||
"fields": {},
|
||||
"help_texts": {},
|
||||
})
|
||||
|
||||
if plugin.get('config'):
|
||||
sections.append({
|
||||
"name": "Plugin Metadata",
|
||||
"description": mark_safe(render_plugin_metadata_html(plugin['config'])),
|
||||
"fields": {},
|
||||
"help_texts": {},
|
||||
})
|
||||
|
||||
sections.append({
|
||||
"name": "config.json",
|
||||
"description": mark_safe(render_highlighted_json_block(plugin['config'])),
|
||||
"fields": {},
|
||||
"help_texts": {},
|
||||
})
|
||||
|
||||
config_properties = plugin['config'].get('properties', {})
|
||||
if config_properties:
|
||||
sections.append({
|
||||
"name": "Config Properties",
|
||||
"description": mark_safe(render_config_properties_html(config_properties, machine_admin_url)),
|
||||
"fields": {},
|
||||
"help_texts": {},
|
||||
})
|
||||
|
||||
return ItemContext(
|
||||
slug=key,
|
||||
title=plugin['name'],
|
||||
data=[section],
|
||||
data=sections,
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -1,14 +1,23 @@
|
||||
__package__ = 'archivebox.core'
|
||||
|
||||
import html
|
||||
import json
|
||||
import os
|
||||
import shlex
|
||||
from pathlib import Path
|
||||
from urllib.parse import quote
|
||||
from functools import reduce
|
||||
from operator import and_
|
||||
|
||||
from django.contrib import admin
|
||||
from django.db.models import Min, Q, TextField
|
||||
from django.db.models.functions import Cast
|
||||
from django.utils.html import format_html
|
||||
from django.utils.safestring import mark_safe
|
||||
from django.core.exceptions import ValidationError
|
||||
from django.urls import reverse, resolve
|
||||
from django.utils import timezone
|
||||
from django.utils.text import smart_split
|
||||
|
||||
from archivebox.config import DATA_DIR
|
||||
from archivebox.config.common import SERVER_CONFIG
|
||||
@@ -16,11 +25,71 @@ from archivebox.misc.paginators import AccelleratedPaginator
|
||||
from archivebox.base_models.admin import BaseModelAdmin
|
||||
from archivebox.hooks import get_plugin_icon
|
||||
from archivebox.core.host_utils import build_snapshot_url
|
||||
from archivebox.core.widgets import InlineTagEditorWidget
|
||||
from archivebox.core.views import LIVE_PLUGIN_BASE_URL
|
||||
|
||||
|
||||
from archivebox.core.models import ArchiveResult, Snapshot
|
||||
|
||||
|
||||
def _stringify_env_value(value) -> str:
|
||||
if value is None:
|
||||
return ''
|
||||
if isinstance(value, str):
|
||||
return value
|
||||
return json.dumps(value, separators=(',', ':'))
|
||||
|
||||
|
||||
def _quote_shell_string(value: str) -> str:
|
||||
return "'" + str(value).replace("'", "'\"'\"'") + "'"
|
||||
|
||||
|
||||
def _get_replay_source_url(result: ArchiveResult) -> str:
|
||||
process_env = getattr(getattr(result, 'process', None), 'env', None) or {}
|
||||
return str(process_env.get('SOURCE_URL') or result.snapshot.url or '')
|
||||
|
||||
|
||||
def build_abx_dl_display_command(result: ArchiveResult) -> str:
|
||||
source_url = _get_replay_source_url(result)
|
||||
plugin_name = str(result.plugin or '').strip()
|
||||
if not plugin_name and not source_url:
|
||||
return 'abx-dl'
|
||||
if not source_url:
|
||||
return f'abx-dl --plugins={plugin_name}'
|
||||
return f'abx-dl --plugins={plugin_name} {_quote_shell_string(source_url)}'
|
||||
|
||||
|
||||
def build_abx_dl_replay_command(result: ArchiveResult) -> str:
|
||||
display_command = build_abx_dl_display_command(result)
|
||||
process = getattr(result, 'process', None)
|
||||
env = getattr(process, 'env', None) or {}
|
||||
env_items = ' '.join(
|
||||
f'{key}={shlex.quote(_stringify_env_value(value))}'
|
||||
for key, value in sorted(env.items())
|
||||
if value is not None
|
||||
)
|
||||
snapshot_dir = shlex.quote(str(result.snapshot_dir))
|
||||
if env_items:
|
||||
return f'cd {snapshot_dir}; env {env_items} {display_command}'
|
||||
return f'cd {snapshot_dir}; {display_command}'
|
||||
|
||||
|
||||
def get_plugin_admin_url(plugin_name: str) -> str:
|
||||
from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR, iter_plugin_dirs
|
||||
|
||||
plugin_dir = next((path.resolve() for path in iter_plugin_dirs() if path.name == plugin_name), None)
|
||||
if plugin_dir:
|
||||
builtin_root = BUILTIN_PLUGINS_DIR.resolve()
|
||||
if plugin_dir.is_relative_to(builtin_root):
|
||||
return f'{LIVE_PLUGIN_BASE_URL}builtin.{quote(plugin_name)}/'
|
||||
|
||||
user_root = USER_PLUGINS_DIR.resolve()
|
||||
if plugin_dir.is_relative_to(user_root):
|
||||
return f'{LIVE_PLUGIN_BASE_URL}user.{quote(plugin_name)}/'
|
||||
|
||||
return f'{LIVE_PLUGIN_BASE_URL}builtin.{quote(plugin_name)}/'
|
||||
|
||||
|
||||
def render_archiveresults_list(archiveresults_qs, limit=50):
|
||||
"""Render a nice inline list view of archive results with status, plugin, output, and actions."""
|
||||
|
||||
@@ -35,6 +104,9 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
|
||||
'failed': ('#991b1b', '#fee2e2'), # red
|
||||
'queued': ('#6b7280', '#f3f4f6'), # gray
|
||||
'started': ('#92400e', '#fef3c7'), # amber
|
||||
'backoff': ('#92400e', '#fef3c7'),
|
||||
'skipped': ('#475569', '#f1f5f9'),
|
||||
'noresults': ('#475569', '#f1f5f9'),
|
||||
}
|
||||
|
||||
rows = []
|
||||
@@ -54,8 +126,10 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
|
||||
if len(full_output) > 60:
|
||||
output_display += '...'
|
||||
|
||||
# Get full command as tooltip
|
||||
cmd_str = ' '.join(result.cmd) if isinstance(result.cmd, list) else str(result.cmd or '-')
|
||||
display_cmd = build_abx_dl_display_command(result)
|
||||
replay_cmd = build_abx_dl_replay_command(result)
|
||||
cmd_str_escaped = html.escape(display_cmd)
|
||||
cmd_attr = html.escape(replay_cmd, quote=True)
|
||||
|
||||
# Build output link - use embed_path() which checks output_files first
|
||||
embed_path = result.embed_path() if hasattr(result, 'embed_path') else None
|
||||
@@ -77,7 +151,7 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
|
||||
<a href="{reverse('admin:core_archiveresult_change', args=[result.id])}"
|
||||
style="color: #2563eb; text-decoration: none; font-family: ui-monospace, monospace; font-size: 11px;"
|
||||
title="View/edit archive result">
|
||||
<code>{str(result.id)[:8]}</code>
|
||||
<code>{str(result.id)[-8:]}</code>
|
||||
</a>
|
||||
</td>
|
||||
<td style="padding: 10px 12px; white-space: nowrap;">
|
||||
@@ -140,7 +214,15 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
|
||||
<div style="font-size: 11px; color: #64748b; margin-top: 8px;">
|
||||
<b>Command:</b>
|
||||
</div>
|
||||
<pre style="margin: 0; padding: 8px; background: #1e293b; border-radius: 4px; color: #e2e8f0; font-size: 11px; white-space: pre-wrap; word-break: break-all;">{cmd_str}</pre>
|
||||
<div style="position: relative; margin: 0; padding: 8px 56px 8px 8px; background: #1e293b; border-radius: 4px;">
|
||||
<button type="button"
|
||||
data-command="{cmd_attr}"
|
||||
onclick="(function(btn){{var text=btn.dataset.command||''; if(navigator.clipboard&&navigator.clipboard.writeText){{navigator.clipboard.writeText(text);}} else {{var ta=document.createElement('textarea'); ta.value=text; document.body.appendChild(ta); ta.select(); document.execCommand('copy'); document.body.removeChild(ta);}}}})(this); return false;"
|
||||
style="position: absolute; top: 6px; right: 6px; padding: 2px 8px; border: 0; border-radius: 4px; background: #334155; color: #e2e8f0; font-size: 11px; cursor: pointer;">
|
||||
Copy
|
||||
</button>
|
||||
<code title="{cmd_attr}" style="display: block; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; color: #e2e8f0; font-size: 11px;">{cmd_str_escaped}</code>
|
||||
</div>
|
||||
</div>
|
||||
</details>
|
||||
</td>
|
||||
@@ -165,7 +247,7 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
|
||||
<table style="width: 100%; border-collapse: collapse; font-size: 14px;">
|
||||
<thead>
|
||||
<tr style="background: #f8fafc; border-bottom: 2px solid #e2e8f0;">
|
||||
<th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">ID</th>
|
||||
<th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Details</th>
|
||||
<th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Status</th>
|
||||
<th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; width: 32px;"></th>
|
||||
<th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Plugin</th>
|
||||
@@ -193,7 +275,7 @@ class ArchiveResultInline(admin.TabularInline):
|
||||
extra = 0
|
||||
sort_fields = ('end_ts', 'plugin', 'output_str', 'status', 'cmd_version')
|
||||
readonly_fields = ('id', 'result_id', 'completed', 'command', 'version')
|
||||
fields = ('start_ts', 'end_ts', *readonly_fields, 'plugin', 'cmd', 'cmd_version', 'pwd', 'status', 'retry_at', 'output_str')
|
||||
fields = ('start_ts', 'end_ts', *readonly_fields, 'plugin', 'cmd', 'cmd_version', 'pwd', 'status', 'output_str')
|
||||
# exclude = ('id',)
|
||||
ordering = ('end_ts',)
|
||||
show_change_link = True
|
||||
@@ -259,10 +341,11 @@ class ArchiveResultInline(admin.TabularInline):
|
||||
|
||||
|
||||
class ArchiveResultAdmin(BaseModelAdmin):
|
||||
list_display = ('id', 'created_at', 'snapshot_info', 'tags_str', 'status', 'plugin_with_icon', 'cmd_str', 'output_str')
|
||||
list_display = ('details_link', 'created_at', 'snapshot_info', 'tags_inline', 'status_badge', 'plugin_with_icon', 'process_link', 'machine_link', 'cmd_str', 'output_str_display')
|
||||
list_display_links = None
|
||||
sort_fields = ('id', 'created_at', 'plugin', 'status')
|
||||
readonly_fields = ('cmd', 'cmd_version', 'pwd', 'cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'plugin_with_icon')
|
||||
search_fields = ('id', 'snapshot__url', 'plugin', 'output_str', 'cmd_version', 'cmd', 'snapshot__timestamp')
|
||||
readonly_fields = ('cmd', 'cmd_version', 'pwd', 'cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'plugin_with_icon', 'process_link')
|
||||
search_fields = ()
|
||||
autocomplete_fields = ['snapshot']
|
||||
|
||||
fieldsets = (
|
||||
@@ -271,7 +354,7 @@ class ArchiveResultAdmin(BaseModelAdmin):
|
||||
'classes': ('card', 'wide'),
|
||||
}),
|
||||
('Plugin', {
|
||||
'fields': ('plugin', 'plugin_with_icon', 'status', 'retry_at'),
|
||||
'fields': ('plugin_with_icon', 'process_link', 'status'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Timing', {
|
||||
@@ -305,8 +388,61 @@ class ArchiveResultAdmin(BaseModelAdmin):
|
||||
self.request = request
|
||||
return super().change_view(request, object_id, form_url, extra_context)
|
||||
|
||||
def get_queryset(self, request):
|
||||
return (
|
||||
super()
|
||||
.get_queryset(request)
|
||||
.select_related('snapshot', 'process')
|
||||
.prefetch_related('snapshot__tags')
|
||||
.annotate(snapshot_first_tag=Min('snapshot__tags__name'))
|
||||
)
|
||||
|
||||
def get_search_results(self, request, queryset, search_term):
|
||||
if not search_term:
|
||||
return queryset, False
|
||||
|
||||
queryset = queryset.annotate(
|
||||
snapshot_id_text=Cast('snapshot__id', output_field=TextField()),
|
||||
snapshot_crawl_id_text=Cast('snapshot__crawl_id', output_field=TextField()),
|
||||
output_json_text=Cast('output_json', output_field=TextField()),
|
||||
cmd_text=Cast('process__cmd', output_field=TextField()),
|
||||
)
|
||||
|
||||
search_bits = [
|
||||
bit[1:-1] if len(bit) >= 2 and bit[0] == bit[-1] and bit[0] in {'"', "'"} else bit
|
||||
for bit in smart_split(search_term)
|
||||
]
|
||||
search_bits = [bit.strip() for bit in search_bits if bit.strip()]
|
||||
if not search_bits:
|
||||
return queryset, False
|
||||
|
||||
filters = []
|
||||
for bit in search_bits:
|
||||
filters.append(
|
||||
Q(snapshot_id_text__icontains=bit)
|
||||
| Q(snapshot__url__icontains=bit)
|
||||
| Q(snapshot__tags__name__icontains=bit)
|
||||
| Q(snapshot_crawl_id_text__icontains=bit)
|
||||
| Q(plugin__icontains=bit)
|
||||
| Q(hook_name__icontains=bit)
|
||||
| Q(output_str__icontains=bit)
|
||||
| Q(output_json_text__icontains=bit)
|
||||
| Q(cmd_text__icontains=bit)
|
||||
)
|
||||
|
||||
return queryset.filter(reduce(and_, filters)).distinct(), True
|
||||
|
||||
@admin.display(description='Details', ordering='id')
|
||||
def details_link(self, result):
|
||||
return format_html(
|
||||
'<a href="{}"><code>{}</code></a>',
|
||||
reverse('admin:core_archiveresult_change', args=[result.id]),
|
||||
str(result.id)[-8:],
|
||||
)
|
||||
|
||||
@admin.display(
|
||||
description='Snapshot Info'
|
||||
description='Snapshot',
|
||||
ordering='snapshot__url',
|
||||
)
|
||||
def snapshot_info(self, result):
|
||||
snapshot_id = str(result.snapshot_id)
|
||||
@@ -325,20 +461,83 @@ class ArchiveResultAdmin(BaseModelAdmin):
|
||||
def tags_str(self, result):
|
||||
return result.snapshot.tags_str()
|
||||
|
||||
@admin.display(description='Tags', ordering='snapshot_first_tag')
|
||||
def tags_inline(self, result):
|
||||
widget = InlineTagEditorWidget(snapshot_id=str(result.snapshot_id), editable=False)
|
||||
tags_html = widget.render(
|
||||
name=f'tags_{result.snapshot_id}',
|
||||
value=result.snapshot.tags.all(),
|
||||
attrs={'id': f'tags_{result.snapshot_id}'},
|
||||
snapshot_id=str(result.snapshot_id),
|
||||
)
|
||||
return mark_safe(f'<span class="tags-inline-editor">{tags_html}</span>')
|
||||
|
||||
@admin.display(description='Status', ordering='status')
|
||||
def status_badge(self, result):
|
||||
status = result.status or ArchiveResult.StatusChoices.QUEUED
|
||||
return format_html(
|
||||
'<span class="status-badge {} status-{}">{}</span>',
|
||||
status,
|
||||
status,
|
||||
result.get_status_display() or status,
|
||||
)
|
||||
|
||||
@admin.display(description='Plugin', ordering='plugin')
|
||||
def plugin_with_icon(self, result):
|
||||
icon = get_plugin_icon(result.plugin)
|
||||
return format_html(
|
||||
'<span title="{}">{}</span> {}',
|
||||
'<a href="{}" title="{}">{}</a> <a href="{}"><code>{}</code></a>',
|
||||
get_plugin_admin_url(result.plugin),
|
||||
result.plugin,
|
||||
icon,
|
||||
get_plugin_admin_url(result.plugin),
|
||||
result.plugin,
|
||||
)
|
||||
|
||||
def cmd_str(self, result):
|
||||
@admin.display(description='Process', ordering='process__pid')
|
||||
def process_link(self, result):
|
||||
if not result.process_id:
|
||||
return '-'
|
||||
process_label = result.process.pid if result.process and result.process.pid else '-'
|
||||
return format_html(
|
||||
'<pre>{}</pre>',
|
||||
' '.join(result.cmd) if isinstance(result.cmd, list) else str(result.cmd),
|
||||
'<a href="{}"><code>{}</code></a>',
|
||||
reverse('admin:machine_process_change', args=[result.process_id]),
|
||||
process_label,
|
||||
)
|
||||
|
||||
@admin.display(description='Machine', ordering='process__machine__hostname')
|
||||
def machine_link(self, result):
|
||||
if not result.process_id or not result.process or not result.process.machine_id:
|
||||
return '-'
|
||||
machine = result.process.machine
|
||||
return format_html(
|
||||
'<a href="{}"><code>{}</code> {}</a>',
|
||||
reverse('admin:machine_machine_change', args=[machine.id]),
|
||||
str(machine.id)[:8],
|
||||
machine.hostname,
|
||||
)
|
||||
|
||||
@admin.display(description='Command')
|
||||
def cmd_str(self, result):
|
||||
display_cmd = build_abx_dl_display_command(result)
|
||||
replay_cmd = build_abx_dl_replay_command(result)
|
||||
return format_html(
|
||||
'''
|
||||
<div style="position: relative; width: 300px; min-width: 300px; max-width: 300px; overflow: hidden; box-sizing: border-box;">
|
||||
<button type="button"
|
||||
data-command="{}"
|
||||
onclick="(function(btn){{var text=btn.dataset.command||''; if(navigator.clipboard&&navigator.clipboard.writeText){{navigator.clipboard.writeText(text);}} else {{var ta=document.createElement('textarea'); ta.value=text; document.body.appendChild(ta); ta.select(); document.execCommand('copy'); document.body.removeChild(ta);}}}})(this); return false;"
|
||||
style="position: absolute; top: 6px; right: 6px; z-index: 1; padding: 2px 8px; border: 0; border-radius: 4px; background: #e2e8f0; color: #334155; font-size: 11px; cursor: pointer;">
|
||||
Copy
|
||||
</button>
|
||||
<code title="{}" style="display: block; width: 100%; max-width: 100%; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; padding: 8px 56px 8px 8px; background: #f8fafc; border: 1px solid #e2e8f0; border-radius: 6px; font-size: 11px; box-sizing: border-box;">
|
||||
{}
|
||||
</code>
|
||||
</div>
|
||||
''',
|
||||
replay_cmd,
|
||||
replay_cmd,
|
||||
display_cmd,
|
||||
)
|
||||
|
||||
def output_display(self, result):
|
||||
@@ -352,6 +551,27 @@ class ArchiveResultAdmin(BaseModelAdmin):
|
||||
result.output_str,
|
||||
)
|
||||
|
||||
@admin.display(description='Output', ordering='output_str')
|
||||
def output_str_display(self, result):
|
||||
output_text = str(result.output_str or '').strip()
|
||||
if not output_text:
|
||||
return '-'
|
||||
|
||||
live_path = result.embed_path() if hasattr(result, 'embed_path') else None
|
||||
if live_path:
|
||||
return format_html(
|
||||
'<a href="{}" title="{}"><code>{}</code></a>',
|
||||
build_snapshot_url(str(result.snapshot_id), live_path),
|
||||
output_text,
|
||||
output_text,
|
||||
)
|
||||
|
||||
return format_html(
|
||||
'<span title="{}">{}</span>',
|
||||
output_text,
|
||||
output_text,
|
||||
)
|
||||
|
||||
def output_summary(self, result):
|
||||
snapshot_dir = Path(DATA_DIR) / str(result.pwd).split('data/', 1)[-1]
|
||||
output_html = format_html(
|
||||
|
||||
@@ -61,12 +61,14 @@ def register_admin_site():
|
||||
from archivebox.crawls.admin import register_admin as register_crawls_admin
|
||||
from archivebox.api.admin import register_admin as register_api_admin
|
||||
from archivebox.machine.admin import register_admin as register_machine_admin
|
||||
from archivebox.personas.admin import register_admin as register_personas_admin
|
||||
from archivebox.workers.admin import register_admin as register_workers_admin
|
||||
|
||||
register_core_admin(archivebox_admin)
|
||||
register_crawls_admin(archivebox_admin)
|
||||
register_api_admin(archivebox_admin)
|
||||
register_machine_admin(archivebox_admin)
|
||||
register_personas_admin(archivebox_admin)
|
||||
register_workers_admin(archivebox_admin)
|
||||
|
||||
return archivebox_admin
|
||||
|
||||
@@ -6,6 +6,7 @@ from pathlib import Path
|
||||
|
||||
from django.contrib import admin, messages
|
||||
from django.urls import path
|
||||
from django.shortcuts import get_object_or_404, redirect
|
||||
from django.utils.html import format_html
|
||||
from django.utils.safestring import mark_safe
|
||||
from django.utils import timezone
|
||||
@@ -14,6 +15,7 @@ from django.db.models.functions import Coalesce
|
||||
from django import forms
|
||||
from django.template import Template, RequestContext
|
||||
from django.contrib.admin.helpers import ActionForm
|
||||
from django.middleware.csrf import get_token
|
||||
|
||||
from archivebox.config import DATA_DIR
|
||||
from archivebox.config.common import SERVER_CONFIG
|
||||
@@ -24,7 +26,7 @@ from archivebox.search.admin import SearchResultsAdminMixin
|
||||
from archivebox.core.host_utils import build_snapshot_url, build_web_url
|
||||
|
||||
from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin
|
||||
from archivebox.workers.tasks import bg_archive_snapshots, bg_add
|
||||
from archivebox.workers.tasks import bg_archive_snapshot, bg_archive_snapshots, bg_add
|
||||
|
||||
from archivebox.core.models import Tag, Snapshot, ArchiveResult
|
||||
from archivebox.core.admin_archiveresults import render_archiveresults_list
|
||||
@@ -215,10 +217,23 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
|
||||
def get_urls(self):
|
||||
urls = super().get_urls()
|
||||
custom_urls = [
|
||||
path('grid/', self.admin_site.admin_view(self.grid_view), name='grid')
|
||||
path('grid/', self.admin_site.admin_view(self.grid_view), name='grid'),
|
||||
path('<path:object_id>/redo-failed/', self.admin_site.admin_view(self.redo_failed_view), name='core_snapshot_redo_failed'),
|
||||
]
|
||||
return custom_urls + urls
|
||||
|
||||
def redo_failed_view(self, request, object_id):
|
||||
snapshot = get_object_or_404(Snapshot, pk=object_id)
|
||||
|
||||
if request.method == 'POST':
|
||||
queued = bg_archive_snapshot(snapshot, overwrite=False)
|
||||
messages.success(
|
||||
request,
|
||||
f"Queued {queued} snapshot for re-archiving. The background runner will process it.",
|
||||
)
|
||||
|
||||
return redirect(snapshot.admin_change_url)
|
||||
|
||||
# def get_queryset(self, request):
|
||||
# # tags_qs = SnapshotTag.objects.all().select_related('tag')
|
||||
# # prefetch = Prefetch('snapshottag_set', queryset=tags_qs)
|
||||
@@ -312,6 +327,8 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
|
||||
def admin_actions(self, obj):
|
||||
summary_url = build_web_url(f'/{obj.archive_path}')
|
||||
results_url = build_web_url(f'/{obj.archive_path}/index.html#all')
|
||||
redo_failed_url = f'/admin/core/snapshot/{obj.pk}/redo-failed/'
|
||||
csrf_token = get_token(self.request)
|
||||
return format_html(
|
||||
'''
|
||||
<div style="display: flex; flex-wrap: wrap; gap: 12px; align-items: center;">
|
||||
@@ -344,13 +361,15 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
|
||||
onmouseout="this.style.background='#eff6ff';">
|
||||
🆕 Archive Now
|
||||
</a>
|
||||
<a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #ecfdf5; border: 1px solid #a7f3d0; border-radius: 8px; color: #065f46; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
|
||||
href="/admin/core/snapshot/?id__exact={}"
|
||||
title="Redo failed extractors (missing outputs)"
|
||||
onmouseover="this.style.background='#d1fae5';"
|
||||
onmouseout="this.style.background='#ecfdf5';">
|
||||
🔁 Redo Failed
|
||||
</a>
|
||||
<form action="{}" method="post" style="display: inline-flex; margin: 0;">
|
||||
<input type="hidden" name="csrfmiddlewaretoken" value="{}">
|
||||
<button type="submit" class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #ecfdf5; border: 1px solid #a7f3d0; border-radius: 8px; color: #065f46; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s; cursor: pointer;"
|
||||
title="Redo failed extractors (missing outputs)"
|
||||
onmouseover="this.style.background='#d1fae5';"
|
||||
onmouseout="this.style.background='#ecfdf5';">
|
||||
🔁 Redo Failed
|
||||
</button>
|
||||
</form>
|
||||
<a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #fffbeb; border: 1px solid #fde68a; border-radius: 8px; color: #92400e; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
|
||||
href="/admin/core/snapshot/?id__exact={}"
|
||||
title="Re-run all extractors (overwrite existing)"
|
||||
@@ -367,14 +386,15 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
|
||||
</a>
|
||||
</div>
|
||||
<p style="margin-top: 12px; font-size: 12px; color: #64748b;">
|
||||
<b>Tip:</b> Action buttons link to the list view with this snapshot pre-selected. Select it and use the action dropdown to execute.
|
||||
<b>Tip:</b> Redo Failed runs immediately. The other action buttons link to the list view with this snapshot pre-selected.
|
||||
</p>
|
||||
''',
|
||||
summary_url,
|
||||
results_url,
|
||||
obj.url,
|
||||
obj.pk,
|
||||
obj.pk,
|
||||
redo_failed_url,
|
||||
csrf_token,
|
||||
obj.pk,
|
||||
obj.pk,
|
||||
)
|
||||
|
||||
@@ -1,63 +1,74 @@
|
||||
__package__ = 'archivebox.core'
|
||||
|
||||
from django.contrib import admin
|
||||
from urllib.parse import quote
|
||||
|
||||
from django import forms
|
||||
from django.contrib import admin, messages
|
||||
from django.contrib.admin.options import IS_POPUP_VAR
|
||||
from django.http import HttpRequest, HttpResponseRedirect
|
||||
from django.urls import reverse
|
||||
from django.utils.html import format_html
|
||||
from django.utils.safestring import mark_safe
|
||||
|
||||
from archivebox.misc.paginators import AccelleratedPaginator
|
||||
from archivebox.base_models.admin import BaseModelAdmin
|
||||
|
||||
from archivebox.core.models import SnapshotTag, Tag
|
||||
from archivebox.core.tag_utils import (
|
||||
TAG_HAS_SNAPSHOTS_CHOICES,
|
||||
TAG_SORT_CHOICES,
|
||||
build_tag_cards,
|
||||
get_tag_creator_choices,
|
||||
get_tag_year_choices,
|
||||
normalize_created_by_filter,
|
||||
normalize_created_year_filter,
|
||||
normalize_has_snapshots_filter,
|
||||
normalize_tag_sort,
|
||||
)
|
||||
from archivebox.core.host_utils import build_snapshot_url
|
||||
|
||||
|
||||
class TagInline(admin.TabularInline):
|
||||
model = SnapshotTag
|
||||
# fk_name = 'snapshot'
|
||||
fields = ('id', 'tag')
|
||||
extra = 1
|
||||
# min_num = 1
|
||||
max_num = 1000
|
||||
autocomplete_fields = (
|
||||
'tag',
|
||||
)
|
||||
|
||||
|
||||
# class AutocompleteTags:
|
||||
# model = Tag
|
||||
# search_fields = ['name']
|
||||
# name = 'name'
|
||||
# # source_field = 'name'
|
||||
# remote_field = Tag._meta.get_field('name')
|
||||
|
||||
# class AutocompleteTagsAdminStub:
|
||||
# name = 'admin'
|
||||
|
||||
|
||||
# class TaggedItemInline(admin.TabularInline):
|
||||
# readonly_fields = ('object_link',)
|
||||
# fields = ('id', 'tag', 'content_type', 'object_id', *readonly_fields)
|
||||
# model = TaggedItem
|
||||
# extra = 1
|
||||
# show_change_link = True
|
||||
|
||||
# @admin.display(description='object')
|
||||
# def object_link(self, obj):
|
||||
# obj = obj.content_type.get_object_for_this_type(pk=obj.object_id)
|
||||
# return format_html('<a href="/admin/{}/{}/{}/change"><b>[{}]</b></a>', obj._meta.app_label, obj._meta.model_name, obj.pk, str(obj))
|
||||
class TagAdminForm(forms.ModelForm):
|
||||
class Meta:
|
||||
model = Tag
|
||||
fields = '__all__'
|
||||
widgets = {
|
||||
'name': forms.TextInput(attrs={
|
||||
'placeholder': 'research, receipts, product-design...',
|
||||
'autocomplete': 'off',
|
||||
'spellcheck': 'false',
|
||||
'data-tag-name-input': '1',
|
||||
}),
|
||||
}
|
||||
|
||||
def clean_name(self):
|
||||
name = (self.cleaned_data.get('name') or '').strip()
|
||||
if not name:
|
||||
raise forms.ValidationError('Tag name is required.')
|
||||
return name
|
||||
|
||||
|
||||
|
||||
class TagAdmin(BaseModelAdmin):
|
||||
list_display = ('created_at', 'created_by', 'id', 'name', 'num_snapshots', 'snapshots')
|
||||
form = TagAdminForm
|
||||
change_list_template = 'admin/core/tag/change_list.html'
|
||||
change_form_template = 'admin/core/tag/change_form.html'
|
||||
list_display = ('name', 'num_snapshots', 'created_at', 'created_by')
|
||||
list_filter = ('created_at', 'created_by')
|
||||
sort_fields = ('name', 'slug', 'id', 'created_by', 'created_at')
|
||||
readonly_fields = ('slug', 'id', 'created_at', 'modified_at', 'snapshots')
|
||||
search_fields = ('id', 'name', 'slug')
|
||||
actions = ['delete_selected', 'merge_tags']
|
||||
ordering = ['-created_at']
|
||||
# inlines = [TaggedItemInline]
|
||||
readonly_fields = ('slug', 'id', 'created_at', 'modified_at', 'snapshots')
|
||||
actions = ['delete_selected']
|
||||
ordering = ['name', 'id']
|
||||
|
||||
fieldsets = (
|
||||
('Tag Info', {
|
||||
('Tag', {
|
||||
'fields': ('name', 'slug'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
@@ -65,112 +76,137 @@ class TagAdmin(BaseModelAdmin):
|
||||
'fields': ('id', 'created_by', 'created_at', 'modified_at'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Snapshots', {
|
||||
('Recent Snapshots', {
|
||||
'fields': ('snapshots',),
|
||||
'classes': ('card', 'wide'),
|
||||
}),
|
||||
)
|
||||
|
||||
paginator = AccelleratedPaginator
|
||||
add_fieldsets = (
|
||||
('Tag', {
|
||||
'fields': ('name',),
|
||||
'classes': ('card', 'wide'),
|
||||
}),
|
||||
('Metadata', {
|
||||
'fields': ('created_by',),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
)
|
||||
|
||||
def get_fieldsets(self, request: HttpRequest, obj: Tag | None = None):
|
||||
return self.fieldsets if obj else self.add_fieldsets
|
||||
|
||||
def num_snapshots(self, tag):
|
||||
def changelist_view(self, request: HttpRequest, extra_context=None):
|
||||
query = (request.GET.get('q') or '').strip()
|
||||
sort = normalize_tag_sort((request.GET.get('sort') or 'created_desc').strip())
|
||||
created_by = normalize_created_by_filter((request.GET.get('created_by') or '').strip())
|
||||
year = normalize_created_year_filter((request.GET.get('year') or '').strip())
|
||||
has_snapshots = normalize_has_snapshots_filter((request.GET.get('has_snapshots') or 'all').strip())
|
||||
extra_context = {
|
||||
**(extra_context or {}),
|
||||
'initial_query': query,
|
||||
'initial_sort': sort,
|
||||
'initial_created_by': created_by,
|
||||
'initial_year': year,
|
||||
'initial_has_snapshots': has_snapshots,
|
||||
'tag_sort_choices': TAG_SORT_CHOICES,
|
||||
'tag_has_snapshots_choices': TAG_HAS_SNAPSHOTS_CHOICES,
|
||||
'tag_created_by_choices': get_tag_creator_choices(),
|
||||
'tag_year_choices': get_tag_year_choices(),
|
||||
'initial_tag_cards': build_tag_cards(
|
||||
query=query,
|
||||
request=request,
|
||||
sort=sort,
|
||||
created_by=created_by,
|
||||
year=year,
|
||||
has_snapshots=has_snapshots,
|
||||
),
|
||||
'tag_search_api_url': reverse('api-1:search_tags'),
|
||||
'tag_create_api_url': reverse('api-1:tags_create'),
|
||||
}
|
||||
return super().changelist_view(request, extra_context=extra_context)
|
||||
|
||||
def render_change_form(self, request, context, add=False, change=False, form_url='', obj=None):
|
||||
current_name = (request.POST.get('name') or '').strip()
|
||||
if not current_name and obj:
|
||||
current_name = obj.name
|
||||
|
||||
similar_tag_cards = build_tag_cards(query=current_name, request=request, limit=12) if current_name else build_tag_cards(request=request, limit=12)
|
||||
if obj:
|
||||
similar_tag_cards = [card for card in similar_tag_cards if card['id'] != obj.pk]
|
||||
|
||||
context.update({
|
||||
'tag_search_api_url': reverse('api-1:search_tags'),
|
||||
'tag_similar_cards': similar_tag_cards,
|
||||
'tag_similar_query': current_name,
|
||||
})
|
||||
return super().render_change_form(request, context, add=add, change=change, form_url=form_url, obj=obj)
|
||||
|
||||
def response_add(self, request: HttpRequest, obj: Tag, post_url_continue=None):
|
||||
if IS_POPUP_VAR in request.POST or '_continue' in request.POST or '_addanother' in request.POST:
|
||||
return super().response_add(request, obj, post_url_continue=post_url_continue)
|
||||
|
||||
self.message_user(request, f'Tag "{obj.name}" saved.', level=messages.SUCCESS)
|
||||
return self._redirect_to_changelist(obj.name)
|
||||
|
||||
def response_change(self, request: HttpRequest, obj: Tag):
|
||||
if IS_POPUP_VAR in request.POST or '_continue' in request.POST or '_addanother' in request.POST or '_saveasnew' in request.POST:
|
||||
return super().response_change(request, obj)
|
||||
|
||||
self.message_user(request, f'Tag "{obj.name}" updated.', level=messages.SUCCESS)
|
||||
return self._redirect_to_changelist(obj.name)
|
||||
|
||||
def _redirect_to_changelist(self, query: str = '') -> HttpResponseRedirect:
|
||||
changelist_url = reverse('admin:core_tag_changelist')
|
||||
if query:
|
||||
changelist_url = f'{changelist_url}?q={quote(query)}'
|
||||
return HttpResponseRedirect(changelist_url)
|
||||
|
||||
@admin.display(description='Snapshots')
|
||||
def snapshots(self, tag: Tag):
|
||||
snapshots = tag.snapshot_set.select_related('crawl__created_by').order_by('-downloaded_at', '-created_at', '-pk')[:10]
|
||||
total_count = tag.snapshot_set.count()
|
||||
if not snapshots:
|
||||
return mark_safe(
|
||||
f'<p style="margin:0;color:#64748b;">No snapshots use this tag yet. '
|
||||
f'<a href="/admin/core/snapshot/?tags__id__exact={tag.id}">Open filtered snapshot list</a>.</p>'
|
||||
)
|
||||
|
||||
cards = []
|
||||
for snapshot in snapshots:
|
||||
title = (snapshot.title or '').strip() or snapshot.url
|
||||
cards.append(format_html(
|
||||
'''
|
||||
<a href="{}" style="display:flex;align-items:center;gap:10px;padding:10px 12px;border:1px solid #e2e8f0;border-radius:12px;background:#fff;text-decoration:none;color:#0f172a;">
|
||||
<img src="{}" alt="" style="width:18px;height:18px;border-radius:4px;flex:0 0 auto;" onerror="this.style.display='none'">
|
||||
<span style="min-width:0;">
|
||||
<strong style="display:block;white-space:nowrap;overflow:hidden;text-overflow:ellipsis;">{}</strong>
|
||||
<code style="display:block;color:#64748b;white-space:nowrap;overflow:hidden;text-overflow:ellipsis;">{}</code>
|
||||
</span>
|
||||
</a>
|
||||
''',
|
||||
reverse('admin:core_snapshot_change', args=[snapshot.pk]),
|
||||
build_snapshot_url(str(snapshot.pk), 'favicon.ico'),
|
||||
title[:120],
|
||||
snapshot.url[:120],
|
||||
))
|
||||
|
||||
cards.append(format_html(
|
||||
'<a href="/admin/core/snapshot/?tags__id__exact={}" style="display:inline-flex;margin-top:10px;font-weight:600;">View all {} tagged snapshots</a>',
|
||||
tag.id,
|
||||
total_count,
|
||||
))
|
||||
return mark_safe('<div style="display:grid;gap:10px;">' + ''.join(cards) + '</div>')
|
||||
|
||||
@admin.display(description='Snapshots', ordering='num_snapshots')
|
||||
def num_snapshots(self, tag: Tag):
|
||||
count = getattr(tag, 'num_snapshots', tag.snapshot_set.count())
|
||||
return format_html(
|
||||
'<a href="/admin/core/snapshot/?tags__id__exact={}">{} total</a>',
|
||||
tag.id,
|
||||
tag.snapshot_set.count(),
|
||||
count,
|
||||
)
|
||||
|
||||
def snapshots(self, tag):
|
||||
total_count = tag.snapshot_set.count()
|
||||
return mark_safe('<br/>'.join(
|
||||
format_html(
|
||||
'<code><a href="/admin/core/snapshot/{}/change"><b>[{}]</b></a></code> {}',
|
||||
snap.pk,
|
||||
snap.downloaded_at.strftime('%Y-%m-%d %H:%M') if snap.downloaded_at else 'pending...',
|
||||
snap.url[:64],
|
||||
)
|
||||
for snap in tag.snapshot_set.order_by('-downloaded_at')[:10]
|
||||
) + (f'<br/><a href="/admin/core/snapshot/?tags__id__exact={tag.id}">{total_count} total snapshots...<a>'))
|
||||
|
||||
# def get_urls(self):
|
||||
# urls = super().get_urls()
|
||||
# custom_urls = [
|
||||
# path(
|
||||
# "merge-tags/",
|
||||
# self.admin_site.admin_view(self.merge_tags_view),
|
||||
# name="taggit_tag_merge_tags",
|
||||
# ),
|
||||
# ]
|
||||
# return custom_urls + urls
|
||||
|
||||
# @admin.action(description="Merge selected tags")
|
||||
# def merge_tags(self, request, queryset):
|
||||
# selected = request.POST.getlist(admin.helpers.ACTION_CHECKBOX_NAME)
|
||||
# if not selected:
|
||||
# self.message_user(request, "Please select at least one tag.")
|
||||
# return redirect(request.get_full_path())
|
||||
|
||||
# selected_tag_ids = ",".join(selected)
|
||||
# redirect_url = f"{request.get_full_path()}merge-tags/"
|
||||
|
||||
# request.session["selected_tag_ids"] = selected_tag_ids
|
||||
|
||||
# return redirect(redirect_url)
|
||||
|
||||
# def merge_tags_view(self, request):
|
||||
# selected_tag_ids = request.session.get("selected_tag_ids", "").split(",")
|
||||
# if request.method == "POST":
|
||||
# form = MergeTagsForm(request.POST)
|
||||
# if form.is_valid():
|
||||
# new_tag_name = form.cleaned_data["new_tag_name"]
|
||||
# new_tag, created = Tag.objects.get_or_create(name=new_tag_name)
|
||||
# with transaction.atomic():
|
||||
# for tag_id in selected_tag_ids:
|
||||
# tag = Tag.objects.get(id=tag_id)
|
||||
# tagged_items = TaggedItem.objects.filter(tag=tag)
|
||||
# for tagged_item in tagged_items:
|
||||
# if TaggedItem.objects.filter(
|
||||
# tag=new_tag,
|
||||
# content_type=tagged_item.content_type,
|
||||
# object_id=tagged_item.object_id,
|
||||
# ).exists():
|
||||
# # we have the new tag as well, so we can just
|
||||
# # remove the tag association
|
||||
# tagged_item.delete()
|
||||
# else:
|
||||
# # point this taggedItem to the new one
|
||||
# tagged_item.tag = new_tag
|
||||
# tagged_item.save()
|
||||
|
||||
# # delete the old tag
|
||||
# if tag.id != new_tag.id:
|
||||
# tag.delete()
|
||||
|
||||
# self.message_user(request, "Tags have been merged", level="success")
|
||||
# # clear the selected_tag_ids from session after merge is complete
|
||||
# request.session.pop("selected_tag_ids", None)
|
||||
|
||||
# return redirect("..")
|
||||
# else:
|
||||
# self.message_user(request, "Form is invalid.", level="error")
|
||||
|
||||
# context = {
|
||||
# "form": MergeTagsForm(),
|
||||
# "selected_tag_ids": selected_tag_ids,
|
||||
# }
|
||||
# return render(request, "admin/taggit/merge_tags_form.html", context)
|
||||
|
||||
|
||||
# @admin.register(SnapshotTag, site=archivebox_admin)
|
||||
# class SnapshotTagAdmin(BaseModelAdmin):
|
||||
# list_display = ('id', 'snapshot', 'tag')
|
||||
# sort_fields = ('id', 'snapshot', 'tag')
|
||||
# search_fields = ('id', 'snapshot_id', 'tag_id')
|
||||
# fields = ('snapshot', 'id')
|
||||
# actions = ['delete_selected']
|
||||
# ordering = ['-id']
|
||||
|
||||
|
||||
def register_admin(admin_site):
|
||||
admin_site.register(Tag, TagAdmin)
|
||||
|
||||
@@ -1,12 +1,16 @@
|
||||
__package__ = 'archivebox.core'
|
||||
|
||||
from django import forms
|
||||
from django.utils.html import format_html
|
||||
|
||||
from archivebox.misc.util import URL_REGEX
|
||||
from archivebox.misc.util import URL_REGEX, find_all_urls
|
||||
from taggit.utils import edit_string_for_tags, parse_tags
|
||||
from archivebox.base_models.admin import KeyValueWidget
|
||||
from archivebox.crawls.schedule_utils import validate_schedule
|
||||
from archivebox.hooks import get_plugins
|
||||
from archivebox.config.common import SEARCH_BACKEND_CONFIG
|
||||
from archivebox.core.widgets import TagEditorWidget, URLFiltersWidget
|
||||
from archivebox.hooks import get_plugins, discover_plugin_configs, get_plugin_icon
|
||||
from archivebox.personas.models import Persona
|
||||
|
||||
DEPTH_CHOICES = (
|
||||
('0', 'depth = 0 (archive just these URLs)'),
|
||||
@@ -22,6 +26,22 @@ def get_plugin_choices():
|
||||
return [(name, name) for name in get_plugins()]
|
||||
|
||||
|
||||
def get_plugin_choice_label(plugin_name: str, plugin_configs: dict[str, dict]) -> str:
|
||||
schema = plugin_configs.get(plugin_name, {})
|
||||
description = str(schema.get('description') or '').strip()
|
||||
if not description:
|
||||
return plugin_name
|
||||
icon_html = get_plugin_icon(plugin_name)
|
||||
|
||||
return format_html(
|
||||
'<span class="plugin-choice-icon">{}</span><span class="plugin-choice-name">{}</span><a class="plugin-choice-description" href="https://archivebox.github.io/abx-plugins/#{}" target="_blank" rel="noopener noreferrer">{}</a>',
|
||||
icon_html,
|
||||
plugin_name,
|
||||
plugin_name,
|
||||
description,
|
||||
)
|
||||
|
||||
|
||||
def get_choice_field(form: forms.Form, name: str) -> forms.ChoiceField:
|
||||
field = form.fields[name]
|
||||
if not isinstance(field, forms.ChoiceField):
|
||||
@@ -31,22 +51,19 @@ def get_choice_field(form: forms.Form, name: str) -> forms.ChoiceField:
|
||||
|
||||
class AddLinkForm(forms.Form):
|
||||
# Basic fields
|
||||
url = forms.RegexField(
|
||||
label="URLs (one per line)",
|
||||
regex=URL_REGEX,
|
||||
min_length=6,
|
||||
url = forms.CharField(
|
||||
label="URLs",
|
||||
strip=True,
|
||||
widget=forms.Textarea,
|
||||
widget=forms.Textarea(attrs={
|
||||
'data-url-regex': URL_REGEX.pattern,
|
||||
}),
|
||||
required=True
|
||||
)
|
||||
tag = forms.CharField(
|
||||
label="Tags (comma separated tag1,tag2,tag3)",
|
||||
label="Tags",
|
||||
strip=True,
|
||||
required=False,
|
||||
widget=forms.TextInput(attrs={
|
||||
'list': 'tag-datalist',
|
||||
'autocomplete': 'off',
|
||||
})
|
||||
widget=TagEditorWidget(),
|
||||
)
|
||||
depth = forms.ChoiceField(
|
||||
label="Archive depth",
|
||||
@@ -58,11 +75,15 @@ class AddLinkForm(forms.Form):
|
||||
label="Notes",
|
||||
strip=True,
|
||||
required=False,
|
||||
widget=forms.Textarea(attrs={
|
||||
'rows': 3,
|
||||
'placeholder': 'Optional notes about this crawl (e.g., purpose, project name, context...)',
|
||||
widget=forms.TextInput(attrs={
|
||||
'placeholder': 'Optional notes about this crawl',
|
||||
})
|
||||
)
|
||||
url_filters = forms.Field(
|
||||
label="URL allowlist / denylist",
|
||||
required=False,
|
||||
widget=URLFiltersWidget(source_selector='textarea[name="url"]'),
|
||||
)
|
||||
|
||||
# Plugin groups
|
||||
chrome_plugins = forms.MultipleChoiceField(
|
||||
@@ -111,24 +132,15 @@ class AddLinkForm(forms.Form):
|
||||
'placeholder': 'e.g., daily, weekly, 0 */6 * * * (every 6 hours)',
|
||||
})
|
||||
)
|
||||
persona = forms.CharField(
|
||||
persona = forms.ModelChoiceField(
|
||||
label="Persona (authentication profile)",
|
||||
max_length=100,
|
||||
initial='Default',
|
||||
required=False,
|
||||
)
|
||||
overwrite = forms.BooleanField(
|
||||
label="Overwrite existing snapshots",
|
||||
initial=False,
|
||||
required=False,
|
||||
)
|
||||
update = forms.BooleanField(
|
||||
label="Update/retry previously failed URLs",
|
||||
initial=False,
|
||||
required=False,
|
||||
queryset=Persona.objects.none(),
|
||||
empty_label=None,
|
||||
to_field_name='name',
|
||||
)
|
||||
index_only = forms.BooleanField(
|
||||
label="Index only (don't archive yet)",
|
||||
label="Index only dry run (add crawl but don't archive yet)",
|
||||
initial=False,
|
||||
required=False,
|
||||
)
|
||||
@@ -142,11 +154,13 @@ class AddLinkForm(forms.Form):
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
# Import at runtime to avoid circular imports
|
||||
from archivebox.config.common import ARCHIVING_CONFIG
|
||||
default_persona = Persona.get_or_create_default()
|
||||
self.fields['persona'].queryset = Persona.objects.order_by('name')
|
||||
self.fields['persona'].initial = default_persona.name
|
||||
|
||||
# Get all plugins
|
||||
all_plugins = get_plugins()
|
||||
plugin_configs = discover_plugin_configs()
|
||||
|
||||
# Define plugin groups
|
||||
chrome_dependent = {
|
||||
@@ -170,26 +184,28 @@ class AddLinkForm(forms.Form):
|
||||
|
||||
# Populate plugin field choices
|
||||
get_choice_field(self, 'chrome_plugins').choices = [
|
||||
(p, p) for p in sorted(all_plugins) if p in chrome_dependent
|
||||
(p, get_plugin_choice_label(p, plugin_configs)) for p in sorted(all_plugins) if p in chrome_dependent
|
||||
]
|
||||
get_choice_field(self, 'archiving_plugins').choices = [
|
||||
(p, p) for p in sorted(all_plugins) if p in archiving
|
||||
(p, get_plugin_choice_label(p, plugin_configs)) for p in sorted(all_plugins) if p in archiving
|
||||
]
|
||||
get_choice_field(self, 'parsing_plugins').choices = [
|
||||
(p, p) for p in sorted(all_plugins) if p in parsing
|
||||
(p, get_plugin_choice_label(p, plugin_configs)) for p in sorted(all_plugins) if p in parsing
|
||||
]
|
||||
get_choice_field(self, 'search_plugins').choices = [
|
||||
(p, p) for p in sorted(all_plugins) if p in search
|
||||
(p, get_plugin_choice_label(p, plugin_configs)) for p in sorted(all_plugins) if p in search
|
||||
]
|
||||
get_choice_field(self, 'binary_plugins').choices = [
|
||||
(p, p) for p in sorted(all_plugins) if p in binary
|
||||
(p, get_plugin_choice_label(p, plugin_configs)) for p in sorted(all_plugins) if p in binary
|
||||
]
|
||||
get_choice_field(self, 'extension_plugins').choices = [
|
||||
(p, p) for p in sorted(all_plugins) if p in extensions
|
||||
(p, get_plugin_choice_label(p, plugin_configs)) for p in sorted(all_plugins) if p in extensions
|
||||
]
|
||||
|
||||
# Set update default from config
|
||||
self.fields['update'].initial = not ARCHIVING_CONFIG.ONLY_NEW
|
||||
required_search_plugin = f'search_backend_{SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE}'.strip()
|
||||
search_choices = [choice[0] for choice in get_choice_field(self, 'search_plugins').choices]
|
||||
if required_search_plugin in search_choices:
|
||||
get_choice_field(self, 'search_plugins').initial = [required_search_plugin]
|
||||
|
||||
def clean(self):
|
||||
cleaned_data = super().clean() or {}
|
||||
@@ -207,6 +223,23 @@ class AddLinkForm(forms.Form):
|
||||
|
||||
return cleaned_data
|
||||
|
||||
def clean_url(self):
|
||||
value = self.cleaned_data.get('url') or ''
|
||||
urls = '\n'.join(find_all_urls(value))
|
||||
if not urls:
|
||||
raise forms.ValidationError('Enter at least one valid URL.')
|
||||
return urls
|
||||
|
||||
def clean_url_filters(self):
|
||||
from archivebox.crawls.models import Crawl
|
||||
|
||||
value = self.cleaned_data.get('url_filters') or {}
|
||||
return {
|
||||
'allowlist': '\n'.join(Crawl.split_filter_patterns(value.get('allowlist', ''))),
|
||||
'denylist': '\n'.join(Crawl.split_filter_patterns(value.get('denylist', ''))),
|
||||
'same_domain_only': bool(value.get('same_domain_only')),
|
||||
}
|
||||
|
||||
def clean_schedule(self):
|
||||
schedule = (self.cleaned_data.get('schedule') or '').strip()
|
||||
if not schedule:
|
||||
|
||||
@@ -163,6 +163,10 @@ def get_api_base_url(request=None) -> str:
|
||||
return _build_base_url_for_host(get_api_host(), request=request)
|
||||
|
||||
|
||||
def get_public_base_url(request=None) -> str:
|
||||
return _build_base_url_for_host(get_public_host(), request=request)
|
||||
|
||||
|
||||
# Backwards-compat aliases (archive == web)
|
||||
def get_archive_base_url(request=None) -> str:
|
||||
return get_web_base_url(request=request)
|
||||
|
||||
@@ -0,0 +1,15 @@
|
||||
from django.db import migrations
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
("core", "0031_add_archiveresult_snapshot_status_index"),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.RemoveField(
|
||||
model_name="archiveresult",
|
||||
name="retry_at",
|
||||
),
|
||||
]
|
||||
@@ -36,7 +36,7 @@ from archivebox.base_models.models import (
|
||||
from archivebox.workers.models import ModelWithStateMachine, BaseStateMachine
|
||||
from archivebox.workers.tasks import bg_archive_snapshot
|
||||
from archivebox.crawls.models import Crawl
|
||||
from archivebox.machine.models import NetworkInterface, Binary
|
||||
from archivebox.machine.models import Binary
|
||||
|
||||
|
||||
|
||||
@@ -60,32 +60,41 @@ class Tag(ModelWithUUID):
|
||||
def __str__(self):
|
||||
return self.name
|
||||
|
||||
def _generate_unique_slug(self) -> str:
|
||||
base_slug = slugify(self.name) or 'tag'
|
||||
existing = Tag.objects.filter(slug__startswith=base_slug)
|
||||
if self.pk:
|
||||
existing = existing.exclude(pk=self.pk)
|
||||
existing_slugs = set(existing.values_list("slug", flat=True))
|
||||
|
||||
slug = base_slug
|
||||
i = 1
|
||||
while slug in existing_slugs:
|
||||
slug = f"{base_slug}_{i}"
|
||||
i += 1
|
||||
return slug
|
||||
|
||||
def save(self, *args, **kwargs):
|
||||
is_new = self._state.adding
|
||||
if is_new:
|
||||
self.slug = slugify(self.name)
|
||||
existing = set(Tag.objects.filter(slug__startswith=self.slug).values_list("slug", flat=True))
|
||||
i = None
|
||||
while True:
|
||||
slug = f"{slugify(self.name)}_{i}" if i else slugify(self.name)
|
||||
if slug not in existing:
|
||||
self.slug = slug
|
||||
break
|
||||
i = (i or 0) + 1
|
||||
existing_name = None
|
||||
if self.pk:
|
||||
existing_name = Tag.objects.filter(pk=self.pk).values_list('name', flat=True).first()
|
||||
|
||||
if not self.slug or existing_name != self.name:
|
||||
self.slug = self._generate_unique_slug()
|
||||
super().save(*args, **kwargs)
|
||||
|
||||
if is_new:
|
||||
from archivebox.misc.logging_util import log_worker_event
|
||||
log_worker_event(
|
||||
worker_type='DB',
|
||||
event='Created Tag',
|
||||
indent_level=0,
|
||||
metadata={
|
||||
'id': self.id,
|
||||
'name': self.name,
|
||||
'slug': self.slug,
|
||||
},
|
||||
)
|
||||
# if is_new:
|
||||
# from archivebox.misc.logging_util import log_worker_event
|
||||
# log_worker_event(
|
||||
# worker_type='DB',
|
||||
# event='Created Tag',
|
||||
# indent_level=0,
|
||||
# metadata={
|
||||
# 'id': self.id,
|
||||
# 'name': self.name,
|
||||
# 'slug': self.slug,
|
||||
# },
|
||||
# )
|
||||
|
||||
@property
|
||||
def api_url(self) -> str:
|
||||
@@ -364,7 +373,6 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
return Binary.objects.filter(process_set__archiveresult__snapshot_id=self.id).distinct()
|
||||
|
||||
def save(self, *args, **kwargs):
|
||||
is_new = self._state.adding
|
||||
if not self.bookmarked_at:
|
||||
self.bookmarked_at = self.created_at or timezone.now()
|
||||
if not self.timestamp:
|
||||
@@ -393,24 +401,25 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
|
||||
super().save(*args, **kwargs)
|
||||
self.ensure_legacy_archive_symlink()
|
||||
if self.url not in self.crawl.urls:
|
||||
existing_urls = {url for _raw_line, url in self.crawl._iter_url_lines() if url}
|
||||
if self.crawl.url_passes_filters(self.url, snapshot=self) and self.url not in existing_urls:
|
||||
self.crawl.urls += f'\n{self.url}'
|
||||
self.crawl.save()
|
||||
|
||||
if is_new:
|
||||
from archivebox.misc.logging_util import log_worker_event
|
||||
log_worker_event(
|
||||
worker_type='DB',
|
||||
event='Created Snapshot',
|
||||
indent_level=2,
|
||||
url=self.url,
|
||||
metadata={
|
||||
'id': str(self.id),
|
||||
'crawl_id': str(self.crawl_id),
|
||||
'depth': self.depth,
|
||||
'status': self.status,
|
||||
},
|
||||
)
|
||||
# if is_new:
|
||||
# from archivebox.misc.logging_util import log_worker_event
|
||||
# log_worker_event(
|
||||
# worker_type='DB',
|
||||
# event='Created Snapshot',
|
||||
# indent_level=2,
|
||||
# url=self.url,
|
||||
# metadata={
|
||||
# 'id': str(self.id),
|
||||
# 'crawl_id': str(self.crawl_id),
|
||||
# 'depth': self.depth,
|
||||
# 'status': self.status,
|
||||
# },
|
||||
# )
|
||||
|
||||
# =========================================================================
|
||||
# Filesystem Migration Methods
|
||||
@@ -1528,16 +1537,6 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
"""
|
||||
Execute snapshot by creating pending ArchiveResults for all enabled hooks.
|
||||
|
||||
Called by: SnapshotMachine.enter_started()
|
||||
|
||||
Hook Lifecycle:
|
||||
1. discover_hooks('Snapshot') → finds all plugin hooks
|
||||
2. For each hook:
|
||||
- Create ArchiveResult with status=QUEUED
|
||||
- Store hook_name (e.g., 'on_Snapshot__50_wget.py')
|
||||
3. ArchiveResults execute independently via ArchiveResultMachine
|
||||
4. Hook execution happens in ArchiveResult.run(), NOT here
|
||||
|
||||
Returns:
|
||||
list[ArchiveResult]: Newly created pending results
|
||||
"""
|
||||
@@ -1602,7 +1601,6 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
'url': self.url,
|
||||
'title': self.title,
|
||||
'tags': self.tags_str(),
|
||||
'tags_str': self.tags_str(),
|
||||
'bookmarked_at': self.bookmarked_at.isoformat() if self.bookmarked_at else None,
|
||||
'created_at': self.created_at.isoformat() if self.created_at else None,
|
||||
'timestamp': self.timestamp,
|
||||
@@ -1672,7 +1670,9 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
# ID not found, fall through to create-by-URL logic
|
||||
pass
|
||||
|
||||
url = record.get('url')
|
||||
from archivebox.misc.util import fix_url_from_markdown
|
||||
|
||||
url = fix_url_from_markdown(str(record.get('url') or '').strip())
|
||||
if not url:
|
||||
return None
|
||||
|
||||
@@ -1807,7 +1807,6 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
defaults={
|
||||
'plugin': plugin,
|
||||
'status': ArchiveResult.INITIAL_STATE,
|
||||
'retry_at': timezone.now(),
|
||||
},
|
||||
)
|
||||
if archiveresult.status == ArchiveResult.INITIAL_STATE:
|
||||
@@ -1853,11 +1852,12 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
failed = results.filter(status='failed').count()
|
||||
running = results.filter(status='started').count()
|
||||
skipped = results.filter(status='skipped').count()
|
||||
noresults = results.filter(status='noresults').count()
|
||||
total = results.count()
|
||||
pending = total - succeeded - failed - running - skipped
|
||||
pending = total - succeeded - failed - running - skipped - noresults
|
||||
|
||||
# Calculate percentage (succeeded + failed + skipped as completed)
|
||||
completed = succeeded + failed + skipped
|
||||
# Calculate percentage (succeeded + failed + skipped + noresults as completed)
|
||||
completed = succeeded + failed + skipped + noresults
|
||||
percent = int((completed / total * 100) if total > 0 else 0)
|
||||
|
||||
# Sum output sizes
|
||||
@@ -1875,47 +1875,38 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
'running': running,
|
||||
'pending': pending,
|
||||
'skipped': skipped,
|
||||
'noresults': noresults,
|
||||
'percent': percent,
|
||||
'output_size': output_size,
|
||||
'is_sealed': is_sealed,
|
||||
}
|
||||
|
||||
def retry_failed_archiveresults(self, retry_at: Optional[datetime] = None) -> int:
|
||||
def retry_failed_archiveresults(self) -> int:
|
||||
"""
|
||||
Reset failed/skipped ArchiveResults to queued for retry.
|
||||
|
||||
This enables seamless retry of the entire extraction pipeline:
|
||||
- Resets FAILED and SKIPPED results to QUEUED
|
||||
- Sets retry_at so workers pick them up
|
||||
- Plugins run in order (numeric prefix)
|
||||
- Each plugin checks its dependencies at runtime
|
||||
|
||||
Dependency handling (e.g., chrome → screenshot):
|
||||
- Plugins check if required outputs exist before running
|
||||
- If dependency output missing → plugin returns 'skipped'
|
||||
- On retry, if dependency now succeeds → dependent can run
|
||||
|
||||
Returns count of ArchiveResults reset.
|
||||
"""
|
||||
retry_at = retry_at or timezone.now()
|
||||
|
||||
count = self.archiveresult_set.filter(
|
||||
status__in=[
|
||||
ArchiveResult.StatusChoices.FAILED,
|
||||
ArchiveResult.StatusChoices.SKIPPED,
|
||||
ArchiveResult.StatusChoices.NORESULTS,
|
||||
]
|
||||
).update(
|
||||
status=ArchiveResult.StatusChoices.QUEUED,
|
||||
retry_at=retry_at,
|
||||
output=None,
|
||||
output_str='',
|
||||
output_json=None,
|
||||
output_files={},
|
||||
output_size=0,
|
||||
output_mimetypes='',
|
||||
start_ts=None,
|
||||
end_ts=None,
|
||||
)
|
||||
|
||||
# Also reset the snapshot and current_step so it gets re-checked from the beginning
|
||||
if count > 0:
|
||||
self.status = self.StatusChoices.STARTED
|
||||
self.retry_at = retry_at
|
||||
self.retry_at = timezone.now()
|
||||
self.current_step = 0 # Reset to step 0 for retry
|
||||
self.save(update_fields=['status', 'retry_at', 'current_step', 'modified_at'])
|
||||
|
||||
@@ -2228,6 +2219,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
best_result = outputs[0]
|
||||
context = {
|
||||
**self.to_dict(extended=True),
|
||||
'snapshot': self,
|
||||
'title': htmlencode(self.title or (self.base_url if self.is_archived else TITLE_LOADING_MSG)),
|
||||
'url_str': htmlencode(urldecode(self.base_url)),
|
||||
'archive_url': urlencode(f'warc/{self.timestamp}' or (self.domain if self.is_archived else '')) or 'about:blank',
|
||||
@@ -2275,8 +2267,8 @@ class SnapshotMachine(BaseStateMachine):
|
||||
│ • discover_hooks('Snapshot') → finds all plugin hooks │
|
||||
│ • create_pending_archiveresults() → creates ONE │
|
||||
│ ArchiveResult per hook (NO execution yet) │
|
||||
│ 2. ArchiveResults process independently with their own │
|
||||
│ state machines (see ArchiveResultMachine) │
|
||||
│ 2. The shared abx-dl runner executes hooks and the │
|
||||
│ projector updates ArchiveResult rows from events │
|
||||
│ 3. Advance through steps 0-9 as foreground hooks complete │
|
||||
└─────────────────────────────────────────────────────────────┘
|
||||
↓ tick() when is_finished()
|
||||
@@ -2358,7 +2350,7 @@ class SnapshotMachine(BaseStateMachine):
|
||||
cast(Any, crawl).sm.seal()
|
||||
|
||||
|
||||
class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithStateMachine):
|
||||
class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes):
|
||||
class StatusChoices(models.TextChoices):
|
||||
QUEUED = 'queued', 'Queued'
|
||||
STARTED = 'started', 'Started'
|
||||
@@ -2366,6 +2358,17 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
SUCCEEDED = 'succeeded', 'Succeeded'
|
||||
FAILED = 'failed', 'Failed'
|
||||
SKIPPED = 'skipped', 'Skipped'
|
||||
NORESULTS = 'noresults', 'No Results'
|
||||
|
||||
INITIAL_STATE = StatusChoices.QUEUED
|
||||
ACTIVE_STATE = StatusChoices.STARTED
|
||||
FINAL_STATES = (
|
||||
StatusChoices.SUCCEEDED,
|
||||
StatusChoices.FAILED,
|
||||
StatusChoices.SKIPPED,
|
||||
StatusChoices.NORESULTS,
|
||||
)
|
||||
FINAL_OR_ACTIVE_STATES = (*FINAL_STATES, ACTIVE_STATE)
|
||||
|
||||
@classmethod
|
||||
def get_plugin_choices(cls):
|
||||
@@ -2404,16 +2407,10 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
start_ts = models.DateTimeField(default=None, null=True, blank=True)
|
||||
end_ts = models.DateTimeField(default=None, null=True, blank=True)
|
||||
|
||||
status = ModelWithStateMachine.StatusField(choices=StatusChoices.choices, default=StatusChoices.QUEUED)
|
||||
retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
|
||||
status = models.CharField(max_length=16, choices=StatusChoices.choices, default=StatusChoices.QUEUED, db_index=True)
|
||||
notes = models.TextField(blank=True, null=False, default='')
|
||||
# output_dir is computed via @property from snapshot.output_dir / plugin
|
||||
|
||||
state_machine_name = 'archivebox.core.models.ArchiveResultMachine'
|
||||
retry_at_field_name = 'retry_at'
|
||||
state_field_name = 'status'
|
||||
active_state = StatusChoices.STARTED
|
||||
|
||||
snapshot_id: uuid.UUID
|
||||
process_id: uuid.UUID | None
|
||||
|
||||
@@ -2421,7 +2418,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
ModelWithOutputDir.Meta,
|
||||
ModelWithConfig.Meta,
|
||||
ModelWithNotes.Meta,
|
||||
ModelWithStateMachine.Meta,
|
||||
):
|
||||
app_label = 'core'
|
||||
verbose_name = 'Archive Result'
|
||||
@@ -2516,40 +2512,24 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
return None
|
||||
|
||||
def save(self, *args, **kwargs):
|
||||
is_new = self._state.adding
|
||||
|
||||
# Create Process record if this is a new ArchiveResult and no process exists yet
|
||||
if is_new and not self.process_id:
|
||||
from archivebox.machine.models import Process, Machine
|
||||
|
||||
process = Process.objects.create(
|
||||
machine=Machine.current(),
|
||||
pwd=str(Path(self.snapshot.output_dir) / self.plugin),
|
||||
cmd=[], # Will be set by run()
|
||||
status='queued',
|
||||
timeout=120,
|
||||
env={},
|
||||
)
|
||||
self.process = process
|
||||
|
||||
# Skip ModelWithOutputDir.save() to avoid creating index.json in plugin directories
|
||||
# Call the Django Model.save() directly instead
|
||||
models.Model.save(self, *args, **kwargs)
|
||||
|
||||
if is_new:
|
||||
from archivebox.misc.logging_util import log_worker_event
|
||||
log_worker_event(
|
||||
worker_type='DB',
|
||||
event='Created ArchiveResult',
|
||||
indent_level=3,
|
||||
plugin=self.plugin,
|
||||
metadata={
|
||||
'id': str(self.id),
|
||||
'snapshot_id': str(self.snapshot_id),
|
||||
'snapshot_url': str(self.snapshot.url)[:64],
|
||||
'status': self.status,
|
||||
},
|
||||
)
|
||||
# if is_new:
|
||||
# from archivebox.misc.logging_util import log_worker_event
|
||||
# log_worker_event(
|
||||
# worker_type='DB',
|
||||
# event='Created ArchiveResult',
|
||||
# indent_level=3,
|
||||
# plugin=self.plugin,
|
||||
# metadata={
|
||||
# 'id': str(self.id),
|
||||
# 'snapshot_id': str(self.snapshot_id),
|
||||
# 'snapshot_url': str(self.snapshot.url)[:64],
|
||||
# 'status': self.status,
|
||||
# },
|
||||
# )
|
||||
|
||||
@cached_property
|
||||
def snapshot_dir(self):
|
||||
@@ -2566,6 +2546,28 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
def get_absolute_url(self):
|
||||
return f'/{self.snapshot.archive_path}/{self.plugin}'
|
||||
|
||||
def reset_for_retry(self, *, save: bool = True) -> None:
|
||||
self.status = self.StatusChoices.QUEUED
|
||||
self.output_str = ''
|
||||
self.output_json = None
|
||||
self.output_files = {}
|
||||
self.output_size = 0
|
||||
self.output_mimetypes = ''
|
||||
self.start_ts = None
|
||||
self.end_ts = None
|
||||
if save:
|
||||
self.save(update_fields=[
|
||||
'status',
|
||||
'output_str',
|
||||
'output_json',
|
||||
'output_files',
|
||||
'output_size',
|
||||
'output_mimetypes',
|
||||
'start_ts',
|
||||
'end_ts',
|
||||
'modified_at',
|
||||
])
|
||||
|
||||
@property
|
||||
def plugin_module(self) -> Any | None:
|
||||
# Hook scripts are now used instead of Python plugin modules
|
||||
@@ -2723,11 +2725,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
|
||||
return None
|
||||
|
||||
def create_output_dir(self):
|
||||
output_dir = Path(self.snapshot_dir) / self.plugin
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
return output_dir
|
||||
|
||||
@property
|
||||
def output_dir_name(self) -> str:
|
||||
return self.plugin
|
||||
@@ -2782,134 +2779,17 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
def save_search_index(self):
|
||||
pass
|
||||
|
||||
def cascade_health_update(self, success: bool):
|
||||
"""Update health stats for parent Snapshot, Crawl, and execution infrastructure (Binary, Machine, NetworkInterface)."""
|
||||
# Update archival hierarchy
|
||||
self.snapshot.increment_health_stats(success)
|
||||
self.snapshot.crawl.increment_health_stats(success)
|
||||
|
||||
# Update execution infrastructure
|
||||
if self.binary:
|
||||
self.binary.increment_health_stats(success)
|
||||
if self.binary.machine:
|
||||
self.binary.machine.increment_health_stats(success)
|
||||
|
||||
if self.iface:
|
||||
self.iface.increment_health_stats(success)
|
||||
|
||||
def run(self):
|
||||
"""
|
||||
Execute this ArchiveResult's hook and update status.
|
||||
|
||||
If self.hook_name is set, runs only that specific hook.
|
||||
If self.hook_name is empty, discovers and runs all hooks for self.plugin (backwards compat).
|
||||
|
||||
Updates status/output fields, queues discovered URLs, and triggers indexing.
|
||||
"""
|
||||
from django.utils import timezone
|
||||
from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR, run_hook
|
||||
from archivebox.config.configset import get_config
|
||||
|
||||
# Get merged config with proper context
|
||||
config = get_config(
|
||||
crawl=self.snapshot.crawl,
|
||||
snapshot=self.snapshot,
|
||||
)
|
||||
|
||||
# Determine which hook(s) to run
|
||||
hooks = []
|
||||
|
||||
if self.hook_name:
|
||||
# SPECIFIC HOOK MODE: Find the specific hook by name
|
||||
for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
|
||||
if not base_dir.exists():
|
||||
continue
|
||||
plugin_dir = base_dir / self.plugin
|
||||
if plugin_dir.exists():
|
||||
hook_path = plugin_dir / self.hook_name
|
||||
if hook_path.exists():
|
||||
hooks.append(hook_path)
|
||||
break
|
||||
else:
|
||||
# LEGACY MODE: Discover all hooks for this plugin (backwards compatibility)
|
||||
for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
|
||||
if not base_dir.exists():
|
||||
continue
|
||||
plugin_dir = base_dir / self.plugin
|
||||
if plugin_dir.exists():
|
||||
matches = list(plugin_dir.glob('on_Snapshot__*.*'))
|
||||
if matches:
|
||||
hooks.extend(sorted(matches))
|
||||
|
||||
if not hooks:
|
||||
self.status = self.StatusChoices.FAILED
|
||||
if self.hook_name:
|
||||
self.output_str = f'Hook not found: {self.plugin}/{self.hook_name}'
|
||||
else:
|
||||
self.output_str = f'No hooks found for plugin: {self.plugin}'
|
||||
self.retry_at = None
|
||||
self.save()
|
||||
return
|
||||
|
||||
# Output directory is plugin_dir for the hook output
|
||||
plugin_dir = Path(self.snapshot.output_dir) / self.plugin
|
||||
|
||||
start_ts = timezone.now()
|
||||
process = None
|
||||
|
||||
for hook in hooks:
|
||||
# Run hook using Process.launch() - returns Process model
|
||||
process = run_hook(
|
||||
hook,
|
||||
output_dir=plugin_dir,
|
||||
config=config,
|
||||
url=self.snapshot.url,
|
||||
snapshot_id=str(self.snapshot.id),
|
||||
crawl_id=str(self.snapshot.crawl.id),
|
||||
depth=self.snapshot.depth,
|
||||
)
|
||||
|
||||
# Link ArchiveResult to Process
|
||||
self.process = process
|
||||
self.start_ts = start_ts
|
||||
self.save(update_fields=['process_id', 'start_ts', 'modified_at'])
|
||||
|
||||
if not process:
|
||||
# No hooks ran
|
||||
self.status = self.StatusChoices.FAILED
|
||||
self.output_str = 'No hooks executed'
|
||||
self.save()
|
||||
return
|
||||
|
||||
# Update status based on hook execution
|
||||
if process.status == process.StatusChoices.RUNNING:
|
||||
# BACKGROUND HOOK - still running, return immediately
|
||||
# Status is already STARTED from enter_started(), will be finalized by Snapshot.cleanup()
|
||||
return
|
||||
|
||||
# FOREGROUND HOOK - completed, update from filesystem
|
||||
self.update_from_output()
|
||||
|
||||
# Clean up empty output directory if no files were created
|
||||
if plugin_dir.exists() and not self.output_files:
|
||||
try:
|
||||
if not any(plugin_dir.iterdir()):
|
||||
plugin_dir.rmdir()
|
||||
except (OSError, RuntimeError):
|
||||
pass
|
||||
|
||||
def update_from_output(self):
|
||||
"""
|
||||
Update this ArchiveResult from filesystem logs and output files.
|
||||
|
||||
Used for:
|
||||
- Foreground hooks that completed (called from ArchiveResult.run())
|
||||
- Background hooks that completed (called from Snapshot.cleanup())
|
||||
Used for Snapshot cleanup / orphan recovery when a hook's output exists
|
||||
on disk but the projector did not finalize the row in the database.
|
||||
|
||||
Updates:
|
||||
- status, output_str, output_json from ArchiveResult JSONL record
|
||||
- output_files, output_size, output_mimetypes by walking filesystem
|
||||
- end_ts, retry_at, cmd, cmd_version, binary FK
|
||||
- end_ts, cmd, cmd_version, binary FK
|
||||
- Processes side-effect records (Snapshot, Tag, etc.) via process_hook_records()
|
||||
"""
|
||||
import mimetypes
|
||||
@@ -2924,7 +2804,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
self.status = self.StatusChoices.FAILED
|
||||
self.output_str = 'Output directory not found'
|
||||
self.end_ts = timezone.now()
|
||||
self.retry_at = None
|
||||
self.save()
|
||||
return
|
||||
|
||||
@@ -2948,6 +2827,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
'succeeded': self.StatusChoices.SUCCEEDED,
|
||||
'failed': self.StatusChoices.FAILED,
|
||||
'skipped': self.StatusChoices.SKIPPED,
|
||||
'noresults': self.StatusChoices.NORESULTS,
|
||||
}
|
||||
self.status = status_map.get(hook_data.get('status', 'failed'), self.StatusChoices.FAILED)
|
||||
|
||||
@@ -3011,7 +2891,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
|
||||
# Update timestamps
|
||||
self.end_ts = timezone.now()
|
||||
self.retry_at = None
|
||||
|
||||
self.save()
|
||||
|
||||
@@ -3095,340 +2974,13 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
|
||||
Uses proper config hierarchy: defaults -> file -> env -> machine -> user -> crawl -> snapshot
|
||||
"""
|
||||
import re
|
||||
from archivebox.config.configset import get_config
|
||||
|
||||
# Get merged config with proper hierarchy
|
||||
config = get_config(
|
||||
user=self.created_by,
|
||||
crawl=self.snapshot.crawl,
|
||||
snapshot=self.snapshot,
|
||||
)
|
||||
|
||||
# Get allowlist/denylist (can be string or list)
|
||||
allowlist_raw = config.get('URL_ALLOWLIST', '')
|
||||
denylist_raw = config.get('URL_DENYLIST', '')
|
||||
|
||||
# Normalize to list of patterns
|
||||
def to_pattern_list(value):
|
||||
if isinstance(value, list):
|
||||
return value
|
||||
if isinstance(value, str):
|
||||
return [p.strip() for p in value.split(',') if p.strip()]
|
||||
return []
|
||||
|
||||
allowlist = to_pattern_list(allowlist_raw)
|
||||
denylist = to_pattern_list(denylist_raw)
|
||||
|
||||
# Denylist takes precedence
|
||||
if denylist:
|
||||
for pattern in denylist:
|
||||
try:
|
||||
if re.search(pattern, url):
|
||||
return False
|
||||
except re.error:
|
||||
continue # Skip invalid regex patterns
|
||||
|
||||
# If allowlist exists, URL must match at least one pattern
|
||||
if allowlist:
|
||||
for pattern in allowlist:
|
||||
try:
|
||||
if re.search(pattern, url):
|
||||
return True
|
||||
except re.error:
|
||||
continue # Skip invalid regex patterns
|
||||
return False # No allowlist patterns matched
|
||||
|
||||
return True # No filters or passed filters
|
||||
return self.snapshot.crawl.url_passes_filters(url, snapshot=self.snapshot)
|
||||
|
||||
@property
|
||||
def output_dir(self) -> Path:
|
||||
"""Get the output directory for this plugin's results."""
|
||||
return Path(self.snapshot.output_dir) / self.plugin
|
||||
|
||||
def is_background_hook(self) -> bool:
|
||||
"""Check if this ArchiveResult is for a background hook."""
|
||||
plugin_dir = Path(self.pwd) if self.pwd else None
|
||||
if not plugin_dir:
|
||||
return False
|
||||
pid_file = plugin_dir / 'hook.pid'
|
||||
return pid_file.exists()
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# ArchiveResult State Machine
|
||||
# =============================================================================
|
||||
|
||||
class ArchiveResultMachine(BaseStateMachine):
|
||||
"""
|
||||
State machine for managing ArchiveResult (single plugin execution) lifecycle.
|
||||
|
||||
Hook Lifecycle:
|
||||
┌─────────────────────────────────────────────────────────────┐
|
||||
│ QUEUED State │
|
||||
│ • Waiting for its turn to run │
|
||||
└─────────────────────────────────────────────────────────────┘
|
||||
↓ tick() when can_start()
|
||||
┌─────────────────────────────────────────────────────────────┐
|
||||
│ STARTED State → enter_started() │
|
||||
│ 1. archiveresult.run() │
|
||||
│ • Find specific hook by hook_name │
|
||||
│ • run_hook(script, output_dir, ...) → subprocess │
|
||||
│ │
|
||||
│ 2a. FOREGROUND hook (returns HookResult): │
|
||||
│ • update_from_output() immediately │
|
||||
│ - Read stdout.log │
|
||||
│ - Parse JSONL records │
|
||||
│ - Extract 'ArchiveResult' record → update status │
|
||||
│ - Walk output_dir → populate output_files │
|
||||
│ - Call process_hook_records() for side effects │
|
||||
│ │
|
||||
│ 2b. BACKGROUND hook (returns None): │
|
||||
│ • Status stays STARTED │
|
||||
│ • Continues running in background │
|
||||
│ • Killed by Snapshot.cleanup() when sealed │
|
||||
└─────────────────────────────────────────────────────────────┘
|
||||
↓ tick() checks status
|
||||
┌─────────────────────────────────────────────────────────────┐
|
||||
│ SUCCEEDED / FAILED / SKIPPED / BACKOFF │
|
||||
│ • Set by hook's JSONL output during update_from_output() │
|
||||
│ • Health stats incremented (num_uses_succeeded/failed) │
|
||||
│ • Parent Snapshot health stats also updated │
|
||||
└─────────────────────────────────────────────────────────────┘
|
||||
|
||||
https://github.com/ArchiveBox/ArchiveBox/wiki/ArchiveBox-Architecture-Diagrams
|
||||
"""
|
||||
|
||||
model_attr_name = 'archiveresult'
|
||||
|
||||
# States
|
||||
queued = State(value=ArchiveResult.StatusChoices.QUEUED, initial=True)
|
||||
started = State(value=ArchiveResult.StatusChoices.STARTED)
|
||||
backoff = State(value=ArchiveResult.StatusChoices.BACKOFF)
|
||||
succeeded = State(value=ArchiveResult.StatusChoices.SUCCEEDED, final=True)
|
||||
failed = State(value=ArchiveResult.StatusChoices.FAILED, final=True)
|
||||
skipped = State(value=ArchiveResult.StatusChoices.SKIPPED, final=True)
|
||||
|
||||
# Tick Event - transitions based on conditions
|
||||
# Flow: queued → started → (succeeded|failed|skipped)
|
||||
# queued → skipped (if exceeded max attempts)
|
||||
# started → backoff → started (retry)
|
||||
tick = (
|
||||
queued.to(skipped, cond='is_exceeded_max_attempts') # Check skip first
|
||||
| queued.to.itself(unless='can_start')
|
||||
| queued.to(started, cond='can_start')
|
||||
| started.to(succeeded, cond='is_succeeded')
|
||||
| started.to(failed, cond='is_failed')
|
||||
| started.to(skipped, cond='is_skipped')
|
||||
| started.to(backoff, cond='is_backoff')
|
||||
| backoff.to(skipped, cond='is_exceeded_max_attempts') # Check skip from backoff too
|
||||
| backoff.to.itself(unless='can_start')
|
||||
| backoff.to(started, cond='can_start')
|
||||
# Removed redundant transitions: backoff.to(succeeded/failed/skipped)
|
||||
# Reason: backoff should always retry→started, then started→final states
|
||||
)
|
||||
|
||||
archiveresult: ArchiveResult
|
||||
|
||||
def can_start(self) -> bool:
|
||||
"""Pure function - check if AR can start (has valid URL)."""
|
||||
return bool(self.archiveresult.snapshot.url)
|
||||
|
||||
def is_exceeded_max_attempts(self) -> bool:
|
||||
"""Check if snapshot has exceeded MAX_URL_ATTEMPTS failed results."""
|
||||
from archivebox.config.configset import get_config
|
||||
|
||||
config = get_config(
|
||||
crawl=self.archiveresult.snapshot.crawl,
|
||||
snapshot=self.archiveresult.snapshot,
|
||||
)
|
||||
max_attempts = config.get('MAX_URL_ATTEMPTS', 50)
|
||||
|
||||
# Count failed ArchiveResults for this snapshot (any plugin type)
|
||||
failed_count = self.archiveresult.snapshot.archiveresult_set.filter(
|
||||
status=ArchiveResult.StatusChoices.FAILED
|
||||
).count()
|
||||
|
||||
return failed_count >= max_attempts
|
||||
|
||||
def is_succeeded(self) -> bool:
|
||||
"""Check if extractor plugin succeeded (status was set by run())."""
|
||||
return self.archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED
|
||||
|
||||
def is_failed(self) -> bool:
|
||||
"""Check if extractor plugin failed (status was set by run())."""
|
||||
return self.archiveresult.status == ArchiveResult.StatusChoices.FAILED
|
||||
|
||||
def is_skipped(self) -> bool:
|
||||
"""Check if extractor plugin was skipped (status was set by run())."""
|
||||
return self.archiveresult.status == ArchiveResult.StatusChoices.SKIPPED
|
||||
|
||||
def is_backoff(self) -> bool:
|
||||
"""Check if we should backoff and retry later."""
|
||||
# Backoff if status is still started (plugin didn't complete) and output_str is empty
|
||||
return (
|
||||
self.archiveresult.status == ArchiveResult.StatusChoices.STARTED
|
||||
and not self.archiveresult.output_str
|
||||
)
|
||||
|
||||
def is_finished(self) -> bool:
|
||||
"""
|
||||
Check if extraction has completed (success, failure, or skipped).
|
||||
|
||||
For background hooks in STARTED state, checks if their Process has finished and reaps them.
|
||||
"""
|
||||
# If already in final state, return True
|
||||
if self.archiveresult.status in (
|
||||
ArchiveResult.StatusChoices.SUCCEEDED,
|
||||
ArchiveResult.StatusChoices.FAILED,
|
||||
ArchiveResult.StatusChoices.SKIPPED,
|
||||
):
|
||||
return True
|
||||
|
||||
# If in STARTED state with a Process, check if Process has finished running
|
||||
if self.archiveresult.status == ArchiveResult.StatusChoices.STARTED:
|
||||
if self.archiveresult.process_id:
|
||||
process = self.archiveresult.process
|
||||
|
||||
# If process is NOT running anymore, reap the background hook
|
||||
if not process.is_running:
|
||||
self.archiveresult.update_from_output()
|
||||
# Check if now in final state after reaping
|
||||
return self.archiveresult.status in (
|
||||
ArchiveResult.StatusChoices.SUCCEEDED,
|
||||
ArchiveResult.StatusChoices.FAILED,
|
||||
ArchiveResult.StatusChoices.SKIPPED,
|
||||
)
|
||||
|
||||
return False
|
||||
|
||||
@queued.enter
|
||||
def enter_queued(self):
|
||||
self.archiveresult.update_and_requeue(
|
||||
retry_at=timezone.now(),
|
||||
status=ArchiveResult.StatusChoices.QUEUED,
|
||||
start_ts=None,
|
||||
) # bump the snapshot's retry_at so they pickup any new changes
|
||||
|
||||
@started.enter
|
||||
def enter_started(self):
|
||||
|
||||
# Update Process with network interface
|
||||
if self.archiveresult.process_id:
|
||||
self.archiveresult.process.iface = NetworkInterface.current()
|
||||
self.archiveresult.process.save()
|
||||
|
||||
# Lock the object and mark start time
|
||||
self.archiveresult.update_and_requeue(
|
||||
retry_at=timezone.now() + timedelta(seconds=120), # 2 min timeout for plugin
|
||||
status=ArchiveResult.StatusChoices.STARTED,
|
||||
start_ts=timezone.now(),
|
||||
)
|
||||
|
||||
# Run the plugin - this updates status, output, timestamps, etc.
|
||||
self.archiveresult.run()
|
||||
|
||||
# Save the updated result
|
||||
self.archiveresult.save()
|
||||
|
||||
|
||||
@backoff.enter
|
||||
def enter_backoff(self):
|
||||
self.archiveresult.update_and_requeue(
|
||||
retry_at=timezone.now() + timedelta(seconds=60),
|
||||
status=ArchiveResult.StatusChoices.BACKOFF,
|
||||
end_ts=None,
|
||||
)
|
||||
|
||||
def _check_and_seal_parent_snapshot(self):
|
||||
"""
|
||||
Check if this is the last ArchiveResult to finish - if so, seal the parent Snapshot.
|
||||
|
||||
Note: In the new architecture, the shared runner handles step advancement and sealing.
|
||||
This method is kept for direct model-driven edge cases.
|
||||
"""
|
||||
import sys
|
||||
|
||||
snapshot = self.archiveresult.snapshot
|
||||
|
||||
# Check if all archiveresults are finished (in final states)
|
||||
remaining_active = snapshot.archiveresult_set.exclude(
|
||||
status__in=[
|
||||
ArchiveResult.StatusChoices.SUCCEEDED,
|
||||
ArchiveResult.StatusChoices.FAILED,
|
||||
ArchiveResult.StatusChoices.SKIPPED,
|
||||
]
|
||||
).count()
|
||||
|
||||
if remaining_active == 0:
|
||||
print(f'[cyan] 🔒 All archiveresults finished for snapshot {snapshot.url}, sealing snapshot[/cyan]', file=sys.stderr)
|
||||
# Seal the parent snapshot
|
||||
cast(Any, snapshot).sm.seal()
|
||||
|
||||
@succeeded.enter
|
||||
def enter_succeeded(self):
|
||||
import sys
|
||||
|
||||
self.archiveresult.update_and_requeue(
|
||||
retry_at=None,
|
||||
status=ArchiveResult.StatusChoices.SUCCEEDED,
|
||||
end_ts=timezone.now(),
|
||||
)
|
||||
|
||||
# Update health stats for ArchiveResult, Snapshot, and Crawl cascade
|
||||
self.archiveresult.cascade_health_update(success=True)
|
||||
|
||||
print(f'[cyan] ✅ ArchiveResult succeeded: {self.archiveresult.plugin} for {self.archiveresult.snapshot.url}[/cyan]', file=sys.stderr)
|
||||
|
||||
# Check if this is the last AR to finish - seal parent snapshot if so
|
||||
self._check_and_seal_parent_snapshot()
|
||||
|
||||
@failed.enter
|
||||
def enter_failed(self):
|
||||
import sys
|
||||
|
||||
print(f'[red] ❌ ArchiveResult.enter_failed() called for {self.archiveresult.plugin}[/red]', file=sys.stderr)
|
||||
|
||||
self.archiveresult.update_and_requeue(
|
||||
retry_at=None,
|
||||
status=ArchiveResult.StatusChoices.FAILED,
|
||||
end_ts=timezone.now(),
|
||||
)
|
||||
|
||||
# Update health stats for ArchiveResult, Snapshot, and Crawl cascade
|
||||
self.archiveresult.cascade_health_update(success=False)
|
||||
|
||||
print(f'[red] ❌ ArchiveResult failed: {self.archiveresult.plugin} for {self.archiveresult.snapshot.url}[/red]', file=sys.stderr)
|
||||
|
||||
# Check if this is the last AR to finish - seal parent snapshot if so
|
||||
self._check_and_seal_parent_snapshot()
|
||||
|
||||
@skipped.enter
|
||||
def enter_skipped(self):
|
||||
import sys
|
||||
|
||||
# Set output_str if not already set (e.g., when skipped due to max attempts)
|
||||
if not self.archiveresult.output_str and self.is_exceeded_max_attempts():
|
||||
from archivebox.config.configset import get_config
|
||||
config = get_config(
|
||||
crawl=self.archiveresult.snapshot.crawl,
|
||||
snapshot=self.archiveresult.snapshot,
|
||||
)
|
||||
max_attempts = config.get('MAX_URL_ATTEMPTS', 50)
|
||||
self.archiveresult.output_str = f'Skipped: snapshot exceeded MAX_URL_ATTEMPTS ({max_attempts} failures)'
|
||||
|
||||
self.archiveresult.update_and_requeue(
|
||||
retry_at=None,
|
||||
status=ArchiveResult.StatusChoices.SKIPPED,
|
||||
end_ts=timezone.now(),
|
||||
)
|
||||
|
||||
print(f'[dim] ⏭️ ArchiveResult skipped: {self.archiveresult.plugin} for {self.archiveresult.snapshot.url}[/dim]', file=sys.stderr)
|
||||
|
||||
# Check if this is the last AR to finish - seal parent snapshot if so
|
||||
self._check_and_seal_parent_snapshot()
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# State Machine Registration
|
||||
# =============================================================================
|
||||
@@ -3436,4 +2988,3 @@ class ArchiveResultMachine(BaseStateMachine):
|
||||
# Manually register state machines with python-statemachine registry
|
||||
# (normally auto-discovered from statemachines.py, but we define them here for clarity)
|
||||
registry.register(SnapshotMachine)
|
||||
registry.register(ArchiveResultMachine)
|
||||
|
||||
@@ -232,11 +232,12 @@ SQLITE_CONNECTION_OPTIONS = {
|
||||
# https://gcollazo.com/optimal-sqlite-settings-for-django/
|
||||
# https://litestream.io/tips/#busy-timeout
|
||||
# https://docs.djangoproject.com/en/5.1/ref/databases/#setting-pragma-options
|
||||
"timeout": 10,
|
||||
"timeout": 30,
|
||||
"check_same_thread": False,
|
||||
"transaction_mode": "IMMEDIATE",
|
||||
"init_command": (
|
||||
"PRAGMA foreign_keys=ON;"
|
||||
"PRAGMA busy_timeout = 30000;"
|
||||
"PRAGMA journal_mode = WAL;"
|
||||
"PRAGMA synchronous = NORMAL;"
|
||||
"PRAGMA temp_store = MEMORY;"
|
||||
|
||||
271
archivebox/core/tag_utils.py
Normal file
271
archivebox/core/tag_utils.py
Normal file
@@ -0,0 +1,271 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from collections import defaultdict
|
||||
from typing import Any
|
||||
|
||||
from django.contrib.auth.models import User
|
||||
from django.db.models import Count, F, Q, QuerySet
|
||||
from django.db.models.functions import Lower
|
||||
from django.http import HttpRequest
|
||||
from django.urls import reverse
|
||||
|
||||
from archivebox.core.host_utils import build_snapshot_url, build_web_url
|
||||
from archivebox.core.models import Snapshot, SnapshotTag, Tag
|
||||
|
||||
|
||||
TAG_SNAPSHOT_PREVIEW_LIMIT = 10
|
||||
TAG_SORT_CHOICES = (
|
||||
('name_asc', 'Name A-Z'),
|
||||
('name_desc', 'Name Z-A'),
|
||||
('created_desc', 'Created newest'),
|
||||
('created_asc', 'Created oldest'),
|
||||
('snapshots_desc', 'Most snapshots'),
|
||||
('snapshots_asc', 'Fewest snapshots'),
|
||||
)
|
||||
TAG_HAS_SNAPSHOTS_CHOICES = (
|
||||
('all', 'All'),
|
||||
('yes', 'Has snapshots'),
|
||||
('no', 'No snapshots'),
|
||||
)
|
||||
|
||||
|
||||
def normalize_tag_name(name: str) -> str:
|
||||
return (name or '').strip()
|
||||
|
||||
|
||||
def normalize_tag_sort(sort: str = 'created_desc') -> str:
|
||||
valid_sorts = {key for key, _label in TAG_SORT_CHOICES}
|
||||
return sort if sort in valid_sorts else 'created_desc'
|
||||
|
||||
|
||||
def normalize_has_snapshots_filter(value: str = 'all') -> str:
|
||||
valid_filters = {key for key, _label in TAG_HAS_SNAPSHOTS_CHOICES}
|
||||
return value if value in valid_filters else 'all'
|
||||
|
||||
|
||||
def normalize_created_by_filter(created_by: str = '') -> str:
|
||||
return created_by if str(created_by).isdigit() else ''
|
||||
|
||||
|
||||
def normalize_created_year_filter(year: str = '') -> str:
|
||||
year = (year or '').strip()
|
||||
return year if len(year) == 4 and year.isdigit() else ''
|
||||
|
||||
|
||||
def get_matching_tags(
|
||||
query: str = '',
|
||||
sort: str = 'created_desc',
|
||||
created_by: str = '',
|
||||
year: str = '',
|
||||
has_snapshots: str = 'all',
|
||||
) -> QuerySet[Tag]:
|
||||
queryset = Tag.objects.select_related('created_by').annotate(
|
||||
num_snapshots=Count('snapshot_set', distinct=True),
|
||||
)
|
||||
|
||||
query = normalize_tag_name(query)
|
||||
if query:
|
||||
queryset = queryset.filter(
|
||||
Q(name__icontains=query) | Q(slug__icontains=query),
|
||||
)
|
||||
|
||||
created_by = normalize_created_by_filter(created_by)
|
||||
if created_by:
|
||||
queryset = queryset.filter(created_by_id=int(created_by))
|
||||
|
||||
year = normalize_created_year_filter(year)
|
||||
if year:
|
||||
queryset = queryset.filter(created_at__year=int(year))
|
||||
|
||||
has_snapshots = normalize_has_snapshots_filter(has_snapshots)
|
||||
if has_snapshots == 'yes':
|
||||
queryset = queryset.filter(num_snapshots__gt=0)
|
||||
elif has_snapshots == 'no':
|
||||
queryset = queryset.filter(num_snapshots=0)
|
||||
|
||||
sort = normalize_tag_sort(sort)
|
||||
if sort == 'name_asc':
|
||||
queryset = queryset.order_by(Lower('name'), 'id')
|
||||
elif sort == 'name_desc':
|
||||
queryset = queryset.order_by(Lower('name').desc(), '-id')
|
||||
elif sort == 'created_asc':
|
||||
queryset = queryset.order_by(F('created_at').asc(nulls_first=True), 'id', Lower('name'))
|
||||
elif sort == 'snapshots_desc':
|
||||
queryset = queryset.order_by(F('num_snapshots').desc(nulls_last=True), F('created_at').desc(nulls_last=True), '-id', Lower('name'))
|
||||
elif sort == 'snapshots_asc':
|
||||
queryset = queryset.order_by(F('num_snapshots').asc(nulls_first=True), Lower('name'), 'id')
|
||||
else:
|
||||
queryset = queryset.order_by(F('created_at').desc(nulls_last=True), '-id', Lower('name'))
|
||||
|
||||
return queryset
|
||||
|
||||
|
||||
def get_tag_creator_choices() -> list[tuple[str, str]]:
|
||||
rows = (
|
||||
Tag.objects
|
||||
.filter(created_by__isnull=False)
|
||||
.values_list('created_by_id', 'created_by__username')
|
||||
.order_by(Lower('created_by__username'), 'created_by_id')
|
||||
.distinct()
|
||||
)
|
||||
return [(str(user_id), username or f'User {user_id}') for user_id, username in rows]
|
||||
|
||||
|
||||
def get_tag_year_choices() -> list[str]:
|
||||
years = Tag.objects.exclude(created_at__isnull=True).dates('created_at', 'year', order='DESC')
|
||||
return [str(year.year) for year in years]
|
||||
|
||||
|
||||
def get_tag_by_ref(tag_ref: str | int) -> Tag:
|
||||
if isinstance(tag_ref, int):
|
||||
return Tag.objects.get(pk=tag_ref)
|
||||
|
||||
ref = str(tag_ref).strip()
|
||||
if ref.isdigit():
|
||||
return Tag.objects.get(pk=int(ref))
|
||||
|
||||
try:
|
||||
return Tag.objects.get(slug__iexact=ref)
|
||||
except Tag.DoesNotExist:
|
||||
return Tag.objects.get(slug__icontains=ref)
|
||||
|
||||
|
||||
def get_or_create_tag(name: str, created_by: User | None = None) -> tuple[Tag, bool]:
|
||||
normalized_name = normalize_tag_name(name)
|
||||
if not normalized_name:
|
||||
raise ValueError('Tag name is required')
|
||||
|
||||
existing = Tag.objects.filter(name__iexact=normalized_name).first()
|
||||
if existing:
|
||||
return existing, False
|
||||
|
||||
tag = Tag.objects.create(
|
||||
name=normalized_name,
|
||||
created_by=created_by,
|
||||
)
|
||||
return tag, True
|
||||
|
||||
|
||||
def rename_tag(tag: Tag, name: str) -> Tag:
|
||||
normalized_name = normalize_tag_name(name)
|
||||
if not normalized_name:
|
||||
raise ValueError('Tag name is required')
|
||||
|
||||
existing = Tag.objects.filter(name__iexact=normalized_name).exclude(pk=tag.pk).first()
|
||||
if existing:
|
||||
raise ValueError(f'Tag "{existing.name}" already exists')
|
||||
|
||||
if tag.name != normalized_name:
|
||||
tag.name = normalized_name
|
||||
tag.save()
|
||||
return tag
|
||||
|
||||
|
||||
def delete_tag(tag: Tag) -> tuple[int, dict[str, int]]:
|
||||
return tag.delete()
|
||||
|
||||
|
||||
def export_tag_urls(tag: Tag) -> str:
|
||||
urls = tag.snapshot_set.order_by('-downloaded_at', '-created_at', '-pk').values_list('url', flat=True)
|
||||
return '\n'.join(urls)
|
||||
|
||||
|
||||
def export_tag_snapshots_jsonl(tag: Tag) -> str:
|
||||
snapshots = tag.snapshot_set.order_by('-downloaded_at', '-created_at', '-pk').prefetch_related('tags')
|
||||
return '\n'.join(json.dumps(snapshot.to_json()) for snapshot in snapshots)
|
||||
|
||||
|
||||
def _display_snapshot_title(snapshot: Snapshot) -> str:
|
||||
title = (snapshot.title or '').strip()
|
||||
url = (snapshot.url or '').strip()
|
||||
if not title:
|
||||
return url
|
||||
|
||||
normalized_title = title.lower()
|
||||
if normalized_title == 'pending...' or normalized_title == url.lower():
|
||||
return url
|
||||
return title
|
||||
|
||||
|
||||
def _build_snapshot_preview(snapshot: Snapshot, request: HttpRequest | None = None) -> dict[str, Any]:
|
||||
return {
|
||||
'id': str(snapshot.pk),
|
||||
'title': _display_snapshot_title(snapshot),
|
||||
'url': snapshot.url,
|
||||
'favicon_url': build_snapshot_url(str(snapshot.pk), 'favicon.ico', request=request),
|
||||
'admin_url': reverse('admin:core_snapshot_change', args=[snapshot.pk]),
|
||||
'archive_url': build_web_url(f'/{snapshot.archive_path_from_db}/index.html', request=request),
|
||||
'downloaded_at': snapshot.downloaded_at.isoformat() if snapshot.downloaded_at else None,
|
||||
}
|
||||
|
||||
|
||||
def _build_snapshot_preview_map(tags: list[Tag], request: HttpRequest | None = None, preview_limit: int = TAG_SNAPSHOT_PREVIEW_LIMIT) -> dict[int, list[dict[str, Any]]]:
|
||||
tag_ids = [tag.pk for tag in tags]
|
||||
if not tag_ids:
|
||||
return {}
|
||||
|
||||
snapshot_tags = (
|
||||
SnapshotTag.objects
|
||||
.filter(tag_id__in=tag_ids)
|
||||
.select_related('snapshot__crawl__created_by')
|
||||
.order_by(
|
||||
'tag_id',
|
||||
F('snapshot__downloaded_at').desc(nulls_last=True),
|
||||
F('snapshot__created_at').desc(nulls_last=True),
|
||||
F('snapshot_id').desc(),
|
||||
)
|
||||
)
|
||||
|
||||
preview_map: dict[int, list[dict[str, Any]]] = defaultdict(list)
|
||||
for snapshot_tag in snapshot_tags:
|
||||
previews = preview_map[snapshot_tag.tag_id]
|
||||
if len(previews) >= preview_limit:
|
||||
continue
|
||||
previews.append(_build_snapshot_preview(snapshot_tag.snapshot, request=request))
|
||||
return preview_map
|
||||
|
||||
|
||||
def build_tag_card(tag: Tag, snapshot_previews: list[dict[str, Any]] | None = None) -> dict[str, Any]:
|
||||
count = getattr(tag, 'num_snapshots', tag.snapshot_set.count())
|
||||
return {
|
||||
'id': tag.pk,
|
||||
'name': tag.name,
|
||||
'slug': tag.slug,
|
||||
'num_snapshots': count,
|
||||
'filter_url': f"{reverse('admin:core_snapshot_changelist')}?tags__id__exact={tag.pk}",
|
||||
'edit_url': reverse('admin:core_tag_change', args=[tag.pk]),
|
||||
'export_urls_url': reverse('api-1:tag_urls_export', args=[tag.pk]),
|
||||
'export_jsonl_url': reverse('api-1:tag_snapshots_export', args=[tag.pk]),
|
||||
'rename_url': reverse('api-1:rename_tag', args=[tag.pk]),
|
||||
'delete_url': reverse('api-1:delete_tag', args=[tag.pk]),
|
||||
'snapshots': snapshot_previews or [],
|
||||
}
|
||||
|
||||
|
||||
def build_tag_cards(
|
||||
query: str = '',
|
||||
request: HttpRequest | None = None,
|
||||
limit: int | None = None,
|
||||
preview_limit: int = TAG_SNAPSHOT_PREVIEW_LIMIT,
|
||||
sort: str = 'created_desc',
|
||||
created_by: str = '',
|
||||
year: str = '',
|
||||
has_snapshots: str = 'all',
|
||||
) -> list[dict[str, Any]]:
|
||||
queryset = get_matching_tags(
|
||||
query=query,
|
||||
sort=sort,
|
||||
created_by=created_by,
|
||||
year=year,
|
||||
has_snapshots=has_snapshots,
|
||||
)
|
||||
if limit is not None:
|
||||
queryset = queryset[:limit]
|
||||
|
||||
tags = list(queryset)
|
||||
preview_map = _build_snapshot_preview_map(tags, request=request, preview_limit=preview_limit)
|
||||
return [
|
||||
build_tag_card(tag, snapshot_previews=preview_map.get(tag.pk, []))
|
||||
for tag in tags
|
||||
]
|
||||
@@ -11,6 +11,7 @@ from archivebox.hooks import (
|
||||
)
|
||||
from archivebox.core.host_utils import (
|
||||
get_admin_base_url,
|
||||
get_public_base_url,
|
||||
get_web_base_url,
|
||||
get_snapshot_base_url,
|
||||
build_snapshot_url,
|
||||
@@ -166,6 +167,11 @@ def web_base_url(context) -> str:
|
||||
return get_web_base_url(request=context.get('request'))
|
||||
|
||||
|
||||
@register.simple_tag(takes_context=True)
|
||||
def public_base_url(context) -> str:
|
||||
return get_public_base_url(request=context.get('request'))
|
||||
|
||||
|
||||
@register.simple_tag(takes_context=True)
|
||||
def snapshot_base_url(context, snapshot) -> str:
|
||||
snapshot_id = getattr(snapshot, 'id', snapshot)
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
__package__ = 'archivebox.core'
|
||||
|
||||
import json
|
||||
import os
|
||||
import posixpath
|
||||
from glob import glob, escape
|
||||
@@ -7,7 +8,7 @@ from django.utils import timezone
|
||||
import inspect
|
||||
from typing import Callable, cast, get_type_hints
|
||||
from pathlib import Path
|
||||
from urllib.parse import urlparse
|
||||
from urllib.parse import quote, urlparse
|
||||
|
||||
from django.shortcuts import render, redirect
|
||||
from django.http import JsonResponse, HttpRequest, HttpResponse, Http404, HttpResponseForbidden
|
||||
@@ -26,7 +27,7 @@ from admin_data_views.typing import TableContext, ItemContext, SectionData
|
||||
from admin_data_views.utils import render_with_table_view, render_with_item_view, ItemLink
|
||||
|
||||
from archivebox.config import CONSTANTS, CONSTANTS_CONFIG, DATA_DIR, VERSION
|
||||
from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG
|
||||
from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG, SEARCH_BACKEND_CONFIG
|
||||
from archivebox.config.configset import get_flat_config, get_config, get_all_configs
|
||||
from archivebox.misc.util import base_url, htmlencode, ts_to_date_str, urldecode
|
||||
from archivebox.misc.serve_static import serve_static_with_byterange_support
|
||||
@@ -37,7 +38,18 @@ from archivebox.core.models import Snapshot
|
||||
from archivebox.core.host_utils import build_snapshot_url
|
||||
from archivebox.core.forms import AddLinkForm
|
||||
from archivebox.crawls.models import Crawl
|
||||
from archivebox.hooks import get_enabled_plugins, get_plugin_name
|
||||
from archivebox.hooks import (
|
||||
BUILTIN_PLUGINS_DIR,
|
||||
USER_PLUGINS_DIR,
|
||||
discover_plugin_configs,
|
||||
get_enabled_plugins,
|
||||
get_plugin_name,
|
||||
iter_plugin_dirs,
|
||||
)
|
||||
|
||||
|
||||
ABX_PLUGINS_GITHUB_BASE_URL = 'https://github.com/ArchiveBox/abx-plugins/tree/main/abx_plugins/plugins/'
|
||||
LIVE_PLUGIN_BASE_URL = '/admin/environment/plugins/'
|
||||
|
||||
|
||||
def _files_index_target(snapshot: Snapshot, archivefile: str | None) -> str:
|
||||
@@ -699,6 +711,9 @@ def _serve_responses_path(request, responses_root: Path, rel_path: str, show_ind
|
||||
def _serve_snapshot_replay(request: HttpRequest, snapshot: Snapshot, path: str = ""):
|
||||
rel_path = path or ""
|
||||
show_indexes = bool(request.GET.get("files"))
|
||||
if not show_indexes and (not rel_path or rel_path == "index.html"):
|
||||
return SnapshotView.render_live_index(request, snapshot)
|
||||
|
||||
if not rel_path or rel_path.endswith("/"):
|
||||
if show_indexes:
|
||||
rel_path = rel_path.rstrip("/")
|
||||
@@ -784,7 +799,6 @@ class SnapshotHostView(View):
|
||||
raise Http404
|
||||
return _serve_snapshot_replay(request, snapshot, path)
|
||||
|
||||
|
||||
class SnapshotReplayView(View):
|
||||
"""Serve snapshot directory contents on a one-domain replay path."""
|
||||
|
||||
@@ -915,8 +929,17 @@ class AddView(UserPassesTestMixin, FormView):
|
||||
return custom_config
|
||||
|
||||
def get_context_data(self, **kwargs):
|
||||
from archivebox.core.models import Tag
|
||||
|
||||
required_search_plugin = f'search_backend_{SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE}'.strip()
|
||||
plugin_configs = discover_plugin_configs()
|
||||
plugin_dependency_map = {
|
||||
plugin_name: [
|
||||
str(required_plugin).strip()
|
||||
for required_plugin in (schema.get('required_plugins') or [])
|
||||
if str(required_plugin).strip()
|
||||
]
|
||||
for plugin_name, schema in plugin_configs.items()
|
||||
if isinstance(schema.get('required_plugins'), list) and schema.get('required_plugins')
|
||||
}
|
||||
return {
|
||||
**super().get_context_data(**kwargs),
|
||||
'title': "Create Crawl",
|
||||
@@ -924,8 +947,9 @@ class AddView(UserPassesTestMixin, FormView):
|
||||
'absolute_add_path': self.request.build_absolute_uri(self.request.path),
|
||||
'VERSION': VERSION,
|
||||
'FOOTER_INFO': SERVER_CONFIG.FOOTER_INFO,
|
||||
'required_search_plugin': required_search_plugin,
|
||||
'plugin_dependency_map_json': json.dumps(plugin_dependency_map, sort_keys=True),
|
||||
'stdout': '',
|
||||
'available_tags': list(Tag.objects.all().order_by('name').values_list('name', flat=True)),
|
||||
}
|
||||
|
||||
def _create_crawl_from_form(self, form, *, created_by_id=None) -> Crawl:
|
||||
@@ -937,11 +961,10 @@ class AddView(UserPassesTestMixin, FormView):
|
||||
depth = int(form.cleaned_data["depth"])
|
||||
plugins = ','.join(form.cleaned_data.get("plugins", []))
|
||||
schedule = form.cleaned_data.get("schedule", "").strip()
|
||||
persona = form.cleaned_data.get("persona", "Default")
|
||||
overwrite = form.cleaned_data.get("overwrite", False)
|
||||
update = form.cleaned_data.get("update", False)
|
||||
persona = form.cleaned_data.get("persona")
|
||||
index_only = form.cleaned_data.get("index_only", False)
|
||||
notes = form.cleaned_data.get("notes", "")
|
||||
url_filters = form.cleaned_data.get("url_filters") or {}
|
||||
custom_config = self._get_custom_config_overrides(form)
|
||||
|
||||
from archivebox.config.permissions import HOSTNAME
|
||||
@@ -957,6 +980,7 @@ class AddView(UserPassesTestMixin, FormView):
|
||||
|
||||
# 1. save the provided urls to sources/2024-11-05__23-59-59__web_ui_add_by_user_<user_pk>.txt
|
||||
sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__web_ui_add_by_user_{created_by_id}.txt'
|
||||
sources_file.parent.mkdir(parents=True, exist_ok=True)
|
||||
sources_file.write_text(urls if isinstance(urls, str) else '\n'.join(urls))
|
||||
|
||||
# 2. create a new Crawl with the URLs from the file
|
||||
@@ -964,16 +988,18 @@ class AddView(UserPassesTestMixin, FormView):
|
||||
urls_content = sources_file.read_text()
|
||||
# Build complete config
|
||||
config = {
|
||||
'ONLY_NEW': not update,
|
||||
'INDEX_ONLY': index_only,
|
||||
'OVERWRITE': overwrite,
|
||||
'DEPTH': depth,
|
||||
'PLUGINS': plugins or '',
|
||||
'DEFAULT_PERSONA': persona or 'Default',
|
||||
'DEFAULT_PERSONA': (persona.name if persona else 'Default'),
|
||||
}
|
||||
|
||||
# Merge custom config overrides
|
||||
config.update(custom_config)
|
||||
if url_filters.get('allowlist'):
|
||||
config['URL_ALLOWLIST'] = url_filters['allowlist']
|
||||
if url_filters.get('denylist'):
|
||||
config['URL_DENYLIST'] = url_filters['denylist']
|
||||
|
||||
crawl = Crawl.objects.create(
|
||||
urls=urls_content,
|
||||
@@ -999,6 +1025,8 @@ class AddView(UserPassesTestMixin, FormView):
|
||||
crawl.schedule = crawl_schedule
|
||||
crawl.save(update_fields=['schedule'])
|
||||
|
||||
crawl.create_snapshots_from_urls()
|
||||
|
||||
# 4. start the Orchestrator & wait until it completes
|
||||
# ... orchestrator will create the root Snapshot, which creates pending ArchiveResults, which gets run by the ArchiveResultActors ...
|
||||
# from archivebox.crawls.actors import CrawlActor
|
||||
@@ -1011,7 +1039,7 @@ class AddView(UserPassesTestMixin, FormView):
|
||||
|
||||
urls = form.cleaned_data["url"]
|
||||
schedule = form.cleaned_data.get("schedule", "").strip()
|
||||
rough_url_count = urls.count('://')
|
||||
rough_url_count = len([url for url in urls.splitlines() if url.strip()])
|
||||
|
||||
# Build success message with schedule link if created
|
||||
schedule_msg = ""
|
||||
@@ -1080,10 +1108,6 @@ class WebAddView(AddView):
|
||||
'persona': defaults_form.fields['persona'].initial or 'Default',
|
||||
'config': {},
|
||||
}
|
||||
if defaults_form.fields['update'].initial:
|
||||
form_data['update'] = 'on'
|
||||
if defaults_form.fields['overwrite'].initial:
|
||||
form_data['overwrite'] = 'on'
|
||||
if defaults_form.fields['index_only'].initial:
|
||||
form_data['index_only'] = 'on'
|
||||
|
||||
@@ -1118,6 +1142,41 @@ def live_progress_view(request):
|
||||
from archivebox.core.models import Snapshot, ArchiveResult
|
||||
from archivebox.machine.models import Process, Machine
|
||||
|
||||
def hook_details(hook_name: str, plugin: str = "setup") -> tuple[str, str, str, str]:
|
||||
normalized_hook_name = Path(hook_name).name if hook_name else ""
|
||||
if not normalized_hook_name:
|
||||
return (plugin, plugin, "unknown", "")
|
||||
|
||||
phase = "unknown"
|
||||
if normalized_hook_name.startswith("on_Crawl__"):
|
||||
phase = "crawl"
|
||||
elif normalized_hook_name.startswith("on_Snapshot__"):
|
||||
phase = "snapshot"
|
||||
elif normalized_hook_name.startswith("on_Binary__"):
|
||||
phase = "binary"
|
||||
|
||||
label = normalized_hook_name
|
||||
if "__" in normalized_hook_name:
|
||||
label = normalized_hook_name.split("__", 1)[1]
|
||||
label = label.rsplit(".", 1)[0]
|
||||
if len(label) > 3 and label[:2].isdigit() and label[2] == "_":
|
||||
label = label[3:]
|
||||
label = label.replace("_", " ").strip() or plugin
|
||||
|
||||
return (plugin, label, phase, normalized_hook_name)
|
||||
|
||||
def process_label(cmd: list[str] | None) -> tuple[str, str, str, str]:
|
||||
hook_path = ""
|
||||
if isinstance(cmd, list) and cmd:
|
||||
first = cmd[0]
|
||||
if isinstance(first, str):
|
||||
hook_path = first
|
||||
|
||||
if not hook_path:
|
||||
return ("", "setup", "unknown", "")
|
||||
|
||||
return hook_details(Path(hook_path).name, plugin=Path(hook_path).parent.name or "setup")
|
||||
|
||||
machine = Machine.current()
|
||||
orchestrator_proc = Process.objects.filter(
|
||||
machine=machine,
|
||||
@@ -1188,8 +1247,19 @@ def live_progress_view(request):
|
||||
Process.TypeChoices.BINARY,
|
||||
],
|
||||
)
|
||||
recent_processes = Process.objects.filter(
|
||||
machine=machine,
|
||||
process_type__in=[
|
||||
Process.TypeChoices.HOOK,
|
||||
Process.TypeChoices.BINARY,
|
||||
],
|
||||
modified_at__gte=timezone.now() - timedelta(minutes=10),
|
||||
).order_by("-modified_at")
|
||||
crawl_process_pids: dict[str, int] = {}
|
||||
snapshot_process_pids: dict[str, int] = {}
|
||||
process_records_by_crawl: dict[str, list[dict[str, object]]] = {}
|
||||
process_records_by_snapshot: dict[str, list[dict[str, object]]] = {}
|
||||
seen_process_records: set[str] = set()
|
||||
for proc in running_processes:
|
||||
env = proc.env or {}
|
||||
if not isinstance(env, dict):
|
||||
@@ -1197,11 +1267,48 @@ def live_progress_view(request):
|
||||
|
||||
crawl_id = env.get('CRAWL_ID')
|
||||
snapshot_id = env.get('SNAPSHOT_ID')
|
||||
_plugin, _label, phase, _hook_name = process_label(proc.cmd)
|
||||
if crawl_id and proc.pid:
|
||||
crawl_process_pids.setdefault(str(crawl_id), proc.pid)
|
||||
if snapshot_id and proc.pid:
|
||||
if phase == "snapshot" and snapshot_id and proc.pid:
|
||||
snapshot_process_pids.setdefault(str(snapshot_id), proc.pid)
|
||||
|
||||
for proc in recent_processes:
|
||||
env = proc.env or {}
|
||||
if not isinstance(env, dict):
|
||||
env = {}
|
||||
|
||||
crawl_id = env.get("CRAWL_ID")
|
||||
snapshot_id = env.get("SNAPSHOT_ID")
|
||||
if not crawl_id and not snapshot_id:
|
||||
continue
|
||||
|
||||
plugin, label, phase, hook_name = process_label(proc.cmd)
|
||||
|
||||
record_scope = str(snapshot_id) if phase == "snapshot" and snapshot_id else str(crawl_id)
|
||||
proc_key = f"{record_scope}:{plugin}:{label}:{proc.status}:{proc.exit_code}"
|
||||
if proc_key in seen_process_records:
|
||||
continue
|
||||
seen_process_records.add(proc_key)
|
||||
|
||||
status = "started" if proc.status == Process.StatusChoices.RUNNING else ("failed" if proc.exit_code not in (None, 0) else "succeeded")
|
||||
payload: dict[str, object] = {
|
||||
"id": str(proc.id),
|
||||
"plugin": plugin,
|
||||
"label": label,
|
||||
"hook_name": hook_name,
|
||||
"status": status,
|
||||
"phase": phase,
|
||||
"source": "process",
|
||||
"process_id": str(proc.id),
|
||||
}
|
||||
if status == "started" and proc.pid:
|
||||
payload["pid"] = proc.pid
|
||||
if phase == "snapshot" and snapshot_id:
|
||||
process_records_by_snapshot.setdefault(str(snapshot_id), []).append(payload)
|
||||
elif crawl_id:
|
||||
process_records_by_crawl.setdefault(str(crawl_id), []).append(payload)
|
||||
|
||||
active_crawls_qs = Crawl.objects.filter(
|
||||
status__in=[Crawl.StatusChoices.QUEUED, Crawl.StatusChoices.STARTED]
|
||||
).prefetch_related(
|
||||
@@ -1234,6 +1341,11 @@ def live_progress_view(request):
|
||||
|
||||
# Calculate crawl progress
|
||||
crawl_progress = int((completed_snapshots / total_snapshots) * 100) if total_snapshots > 0 else 0
|
||||
crawl_setup_plugins = list(process_records_by_crawl.get(str(crawl.id), []))
|
||||
crawl_setup_total = len(crawl_setup_plugins)
|
||||
crawl_setup_completed = sum(1 for item in crawl_setup_plugins if item.get("status") == "succeeded")
|
||||
crawl_setup_failed = sum(1 for item in crawl_setup_plugins if item.get("status") == "failed")
|
||||
crawl_setup_pending = sum(1 for item in crawl_setup_plugins if item.get("status") == "queued")
|
||||
|
||||
# Get active snapshots for this crawl (already prefetched)
|
||||
active_snapshots_for_crawl = []
|
||||
@@ -1241,28 +1353,21 @@ def live_progress_view(request):
|
||||
# Get archive results for this snapshot (already prefetched)
|
||||
snapshot_results = snapshot.archiveresult_set.all()
|
||||
|
||||
# Count in memory instead of DB queries
|
||||
total_plugins = len(snapshot_results)
|
||||
completed_plugins = sum(1 for ar in snapshot_results if ar.status == ArchiveResult.StatusChoices.SUCCEEDED)
|
||||
failed_plugins = sum(1 for ar in snapshot_results if ar.status == ArchiveResult.StatusChoices.FAILED)
|
||||
pending_plugins = sum(1 for ar in snapshot_results if ar.status == ArchiveResult.StatusChoices.QUEUED)
|
||||
|
||||
# Calculate snapshot progress using per-plugin progress
|
||||
now = timezone.now()
|
||||
plugin_progress_values: list[int] = []
|
||||
all_plugins: list[dict[str, object]] = []
|
||||
seen_plugin_keys: set[str] = set()
|
||||
|
||||
# Get all extractor plugins for this snapshot (already prefetched, sort in Python)
|
||||
# Order: started first, then queued, then completed
|
||||
def plugin_sort_key(ar):
|
||||
status_order = {
|
||||
ArchiveResult.StatusChoices.STARTED: 0,
|
||||
ArchiveResult.StatusChoices.QUEUED: 1,
|
||||
ArchiveResult.StatusChoices.SUCCEEDED: 2,
|
||||
ArchiveResult.StatusChoices.FAILED: 3,
|
||||
ArchiveResult.StatusChoices.NORESULTS: 3,
|
||||
ArchiveResult.StatusChoices.FAILED: 4,
|
||||
}
|
||||
return (status_order.get(ar.status, 4), ar.plugin)
|
||||
return (status_order.get(ar.status, 5), ar.plugin, ar.hook_name or "")
|
||||
|
||||
all_plugins = []
|
||||
for ar in sorted(snapshot_results, key=plugin_sort_key):
|
||||
status = ar.status
|
||||
progress_value = 0
|
||||
@@ -1270,6 +1375,7 @@ def live_progress_view(request):
|
||||
ArchiveResult.StatusChoices.SUCCEEDED,
|
||||
ArchiveResult.StatusChoices.FAILED,
|
||||
ArchiveResult.StatusChoices.SKIPPED,
|
||||
ArchiveResult.StatusChoices.NORESULTS,
|
||||
):
|
||||
progress_value = 100
|
||||
elif status == ArchiveResult.StatusChoices.STARTED:
|
||||
@@ -1284,20 +1390,49 @@ def live_progress_view(request):
|
||||
progress_value = 0
|
||||
|
||||
plugin_progress_values.append(progress_value)
|
||||
plugin, label, phase, hook_name = hook_details(ar.hook_name or ar.plugin, plugin=ar.plugin)
|
||||
|
||||
plugin_payload = {
|
||||
'id': str(ar.id),
|
||||
'plugin': ar.plugin,
|
||||
'label': label,
|
||||
'hook_name': hook_name,
|
||||
'phase': phase,
|
||||
'status': status,
|
||||
'process_id': str(ar.process_id) if ar.process_id else None,
|
||||
}
|
||||
if status == ArchiveResult.StatusChoices.STARTED and ar.process_id and ar.process:
|
||||
plugin_payload['pid'] = ar.process.pid
|
||||
if status == ArchiveResult.StatusChoices.STARTED:
|
||||
plugin_payload['progress'] = progress_value
|
||||
plugin_payload['timeout'] = ar.timeout or 120
|
||||
plugin_payload['source'] = 'archiveresult'
|
||||
all_plugins.append(plugin_payload)
|
||||
seen_plugin_keys.add(
|
||||
str(ar.process_id) if ar.process_id else f"{ar.plugin}:{hook_name}"
|
||||
)
|
||||
|
||||
snapshot_progress = int(sum(plugin_progress_values) / total_plugins) if total_plugins > 0 else 0
|
||||
for proc_payload in process_records_by_snapshot.get(str(snapshot.id), []):
|
||||
proc_key = str(proc_payload.get("process_id") or f"{proc_payload.get('plugin')}:{proc_payload.get('hook_name')}")
|
||||
if proc_key in seen_plugin_keys:
|
||||
continue
|
||||
seen_plugin_keys.add(proc_key)
|
||||
all_plugins.append(proc_payload)
|
||||
|
||||
proc_status = proc_payload.get("status")
|
||||
if proc_status in ("succeeded", "failed", "skipped"):
|
||||
plugin_progress_values.append(100)
|
||||
elif proc_status == "started":
|
||||
plugin_progress_values.append(1)
|
||||
else:
|
||||
plugin_progress_values.append(0)
|
||||
|
||||
total_plugins = len(all_plugins)
|
||||
completed_plugins = sum(1 for item in all_plugins if item.get("status") == "succeeded")
|
||||
failed_plugins = sum(1 for item in all_plugins if item.get("status") == "failed")
|
||||
pending_plugins = sum(1 for item in all_plugins if item.get("status") == "queued")
|
||||
|
||||
snapshot_progress = int(sum(plugin_progress_values) / len(plugin_progress_values)) if plugin_progress_values else 0
|
||||
|
||||
active_snapshots_for_crawl.append({
|
||||
'id': str(snapshot.id),
|
||||
@@ -1334,6 +1469,11 @@ def live_progress_view(request):
|
||||
'started_snapshots': started_snapshots,
|
||||
'failed_snapshots': 0,
|
||||
'pending_snapshots': pending_snapshots,
|
||||
'setup_plugins': crawl_setup_plugins,
|
||||
'setup_total_plugins': crawl_setup_total,
|
||||
'setup_completed_plugins': crawl_setup_completed,
|
||||
'setup_failed_plugins': crawl_setup_failed,
|
||||
'setup_pending_plugins': crawl_setup_pending,
|
||||
'active_snapshots': active_snapshots_for_crawl,
|
||||
'can_start': can_start,
|
||||
'urls_preview': urls_preview,
|
||||
@@ -1461,7 +1601,11 @@ def find_config_source(key: str, merged_config: dict) -> str:
|
||||
"""Determine where a config value comes from."""
|
||||
from archivebox.machine.models import Machine
|
||||
|
||||
# Check if it's from archivebox.machine.config
|
||||
# Environment variables override all persistent config sources.
|
||||
if key in os.environ:
|
||||
return 'Environment'
|
||||
|
||||
# Machine.config overrides ArchiveBox.conf.
|
||||
try:
|
||||
machine = Machine.current()
|
||||
if machine.config and key in machine.config:
|
||||
@@ -1469,10 +1613,6 @@ def find_config_source(key: str, merged_config: dict) -> str:
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Check if it's from environment variable
|
||||
if key in os.environ:
|
||||
return 'Environment'
|
||||
|
||||
# Check if it's from archivebox.config.file
|
||||
from archivebox.config.configset import BaseConfigSet
|
||||
file_config = BaseConfigSet.load_from_file(CONSTANTS.CONFIG_FILE)
|
||||
@@ -1483,6 +1623,43 @@ def find_config_source(key: str, merged_config: dict) -> str:
|
||||
return 'Default'
|
||||
|
||||
|
||||
def find_plugin_for_config_key(key: str) -> str | None:
|
||||
for plugin_name, schema in discover_plugin_configs().items():
|
||||
if key in (schema.get('properties') or {}):
|
||||
return plugin_name
|
||||
return None
|
||||
|
||||
|
||||
def get_config_definition_link(key: str) -> tuple[str, str]:
|
||||
plugin_name = find_plugin_for_config_key(key)
|
||||
if not plugin_name:
|
||||
return (
|
||||
f'https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig+{quote(key)}&type=code',
|
||||
'archivebox/config',
|
||||
)
|
||||
|
||||
plugin_dir = next((path.resolve() for path in iter_plugin_dirs() if path.name == plugin_name), None)
|
||||
if plugin_dir:
|
||||
builtin_root = BUILTIN_PLUGINS_DIR.resolve()
|
||||
if plugin_dir.is_relative_to(builtin_root):
|
||||
return (
|
||||
f'{ABX_PLUGINS_GITHUB_BASE_URL}{quote(plugin_name)}/config.json',
|
||||
f'abx_plugins/plugins/{plugin_name}/config.json',
|
||||
)
|
||||
|
||||
user_root = USER_PLUGINS_DIR.resolve()
|
||||
if plugin_dir.is_relative_to(user_root):
|
||||
return (
|
||||
f'{LIVE_PLUGIN_BASE_URL}user.{quote(plugin_name)}/',
|
||||
f'data/custom_plugins/{plugin_name}/config.json',
|
||||
)
|
||||
|
||||
return (
|
||||
f'{LIVE_PLUGIN_BASE_URL}builtin.{quote(plugin_name)}/',
|
||||
f'abx_plugins/plugins/{plugin_name}/config.json',
|
||||
)
|
||||
|
||||
|
||||
@render_with_table_view
|
||||
def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
|
||||
CONFIGS = get_all_configs()
|
||||
@@ -1566,17 +1743,6 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
|
||||
# Determine all sources for this config value
|
||||
sources_info = []
|
||||
|
||||
# Default value
|
||||
default_val = find_config_default(key)
|
||||
if default_val:
|
||||
sources_info.append(('Default', default_val, 'gray'))
|
||||
|
||||
# Config file value
|
||||
if CONSTANTS.CONFIG_FILE.exists():
|
||||
file_config = BaseConfigSet.load_from_file(CONSTANTS.CONFIG_FILE)
|
||||
if key in file_config:
|
||||
sources_info.append(('Config File', file_config[key], 'green'))
|
||||
|
||||
# Environment variable
|
||||
if key in os.environ:
|
||||
sources_info.append(('Environment', os.environ[key] if key_is_safe(key) else '********', 'blue'))
|
||||
@@ -1592,6 +1758,17 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Config file value
|
||||
if CONSTANTS.CONFIG_FILE.exists():
|
||||
file_config = BaseConfigSet.load_from_file(CONSTANTS.CONFIG_FILE)
|
||||
if key in file_config:
|
||||
sources_info.append(('Config File', file_config[key], 'green'))
|
||||
|
||||
# Default value
|
||||
default_val = find_config_default(key)
|
||||
if default_val:
|
||||
sources_info.append(('Default', default_val, 'gray'))
|
||||
|
||||
# Final computed value
|
||||
final_value = merged_config.get(key, FLAT_CONFIG.get(key, CONFIGS.get(key, None)))
|
||||
if not key_is_safe(key):
|
||||
@@ -1614,6 +1791,8 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
|
||||
section_header = mark_safe(f'[DYNAMIC CONFIG] <b><code style="color: lightgray">{key}</code></b> <small>(read-only, calculated at runtime)</small>')
|
||||
|
||||
|
||||
definition_url, definition_label = get_config_definition_link(key)
|
||||
|
||||
section_data = cast(SectionData, {
|
||||
"name": section_header,
|
||||
"description": None,
|
||||
@@ -1621,7 +1800,7 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
|
||||
'Key': key,
|
||||
'Type': find_config_type(key),
|
||||
'Value': final_value,
|
||||
'Source': find_config_source(key, merged_config),
|
||||
'Currently read from': find_config_source(key, merged_config),
|
||||
},
|
||||
"help_texts": {
|
||||
'Key': mark_safe(f'''
|
||||
@@ -1631,14 +1810,14 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
|
||||
</span>
|
||||
'''),
|
||||
'Type': mark_safe(f'''
|
||||
<a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig+{key}&type=code">
|
||||
See full definition in <code>archivebox/config</code>...
|
||||
<a href="{definition_url}" target="_blank" rel="noopener noreferrer">
|
||||
See full definition in <code>{definition_label}</code>...
|
||||
</a>
|
||||
'''),
|
||||
'Value': mark_safe(f'''
|
||||
{'<b style="color: red">Value is redacted for your security. (Passwords, secrets, API tokens, etc. cannot be viewed in the Web UI)</b><br/><br/>' if not key_is_safe(key) else ''}
|
||||
<br/><hr/><br/>
|
||||
<b>Configuration Sources (in priority order):</b><br/><br/>
|
||||
<b>Configuration Sources (highest priority first):</b><br/><br/>
|
||||
{sources_html}
|
||||
<br/><br/>
|
||||
<p style="display: {"block" if key in FLAT_CONFIG and key not in CONSTANTS_CONFIG else "none"}">
|
||||
@@ -1651,15 +1830,15 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
|
||||
}"</code>
|
||||
</p>
|
||||
'''),
|
||||
'Source': mark_safe(f'''
|
||||
'Currently read from': mark_safe(f'''
|
||||
The value shown in the "Value" field comes from the <b>{find_config_source(key, merged_config)}</b> source.
|
||||
<br/><br/>
|
||||
Priority order (highest to lowest):
|
||||
<ol>
|
||||
<li><b style="color: blue">Environment</b> - Environment variables</li>
|
||||
<li><b style="color: purple">Machine</b> - Machine-specific overrides (e.g., resolved binary paths)
|
||||
{f'<br/><a href="{machine_admin_url}">→ Edit <code>{key}</code> in Machine.config for this server</a>' if machine_admin_url else ''}
|
||||
</li>
|
||||
<li><b style="color: blue">Environment</b> - Environment variables</li>
|
||||
<li><b style="color: green">Config File</b> - data/ArchiveBox.conf</li>
|
||||
<li><b style="color: gray">Default</b> - Default value from code</li>
|
||||
</ol>
|
||||
|
||||
@@ -131,7 +131,46 @@ class TagEditorWidget(forms.Widget):
|
||||
}};
|
||||
|
||||
window.updateHiddenInput_{widget_id} = function() {{
|
||||
document.getElementById('{widget_id}').value = currentTags_{widget_id}.join(',');
|
||||
var hiddenInput = document.getElementById('{widget_id}');
|
||||
if (!hiddenInput) {{
|
||||
return;
|
||||
}}
|
||||
hiddenInput.value = currentTags_{widget_id}.join(',');
|
||||
hiddenInput.dispatchEvent(new Event('input', {{ bubbles: true }}));
|
||||
hiddenInput.dispatchEvent(new Event('change', {{ bubbles: true }}));
|
||||
}};
|
||||
|
||||
function normalizeTags_{widget_id}(value) {{
|
||||
var rawTags = Array.isArray(value) ? value : String(value || '').split(',');
|
||||
var seen = {{}};
|
||||
return rawTags
|
||||
.map(function(tag) {{ return String(tag || '').trim(); }})
|
||||
.filter(function(tag) {{
|
||||
if (!tag) return false;
|
||||
var normalized = tag.toLowerCase();
|
||||
if (seen[normalized]) return false;
|
||||
seen[normalized] = true;
|
||||
return true;
|
||||
}})
|
||||
.sort(function(a, b) {{
|
||||
return a.toLowerCase().localeCompare(b.toLowerCase());
|
||||
}});
|
||||
}}
|
||||
|
||||
window.setTags_{widget_id} = function(value, options) {{
|
||||
currentTags_{widget_id} = normalizeTags_{widget_id}(value);
|
||||
rebuildPills_{widget_id}();
|
||||
if (!(options && options.skipHiddenUpdate)) {{
|
||||
updateHiddenInput_{widget_id}();
|
||||
}}
|
||||
}};
|
||||
|
||||
window.syncTagEditorFromHidden_{widget_id} = function() {{
|
||||
var hiddenInput = document.getElementById('{widget_id}');
|
||||
if (!hiddenInput) {{
|
||||
return;
|
||||
}}
|
||||
setTags_{widget_id}(hiddenInput.value, {{ skipHiddenUpdate: true }});
|
||||
}};
|
||||
|
||||
function computeTagStyle_{widget_id}(tagName) {{
|
||||
@@ -190,9 +229,7 @@ class TagEditorWidget(forms.Widget):
|
||||
|
||||
// Add to current tags
|
||||
currentTags_{widget_id}.push(tagName);
|
||||
currentTags_{widget_id}.sort(function(a, b) {{
|
||||
return a.toLowerCase().localeCompare(b.toLowerCase());
|
||||
}});
|
||||
currentTags_{widget_id} = normalizeTags_{widget_id}(currentTags_{widget_id});
|
||||
|
||||
// Rebuild pills
|
||||
rebuildPills_{widget_id}();
|
||||
@@ -252,6 +289,14 @@ class TagEditorWidget(forms.Widget):
|
||||
}}
|
||||
}});
|
||||
|
||||
document.getElementById('{widget_id}').addEventListener('change', function() {{
|
||||
syncTagEditorFromHidden_{widget_id}();
|
||||
}});
|
||||
|
||||
document.getElementById('{widget_id}').addEventListener('archivebox:sync-tags', function() {{
|
||||
syncTagEditorFromHidden_{widget_id}();
|
||||
}});
|
||||
|
||||
window.handleTagKeydown_{widget_id} = function(event) {{
|
||||
var input = event.target;
|
||||
var value = input.value.trim();
|
||||
@@ -320,6 +365,8 @@ class TagEditorWidget(forms.Widget):
|
||||
var input = document.querySelector('input[name="csrfmiddlewaretoken"]');
|
||||
return input ? input.value : '';
|
||||
}}
|
||||
|
||||
syncTagEditorFromHidden_{widget_id}();
|
||||
}})();
|
||||
</script>
|
||||
'''
|
||||
@@ -327,15 +374,232 @@ class TagEditorWidget(forms.Widget):
|
||||
return mark_safe(html)
|
||||
|
||||
|
||||
class URLFiltersWidget(forms.Widget):
|
||||
"""Render URL allowlist / denylist controls with same-domain autofill."""
|
||||
|
||||
template_name = ""
|
||||
|
||||
def __init__(self, attrs=None, *, source_selector='textarea[name="url"]'):
|
||||
self.source_selector = source_selector
|
||||
super().__init__(attrs)
|
||||
|
||||
def render(self, name, value, attrs=None, renderer=None):
|
||||
value = value if isinstance(value, dict) else {}
|
||||
widget_id_raw = attrs.get('id', name) if attrs else name
|
||||
widget_id = re.sub(r'[^A-Za-z0-9_]', '_', str(widget_id_raw)) or name
|
||||
allowlist = escape(value.get('allowlist', '') or '')
|
||||
denylist = escape(value.get('denylist', '') or '')
|
||||
|
||||
return mark_safe(f'''
|
||||
<div id="{widget_id}_container" class="url-filters-widget">
|
||||
<input type="hidden" name="{name}" value="">
|
||||
<div class="url-filters-grid">
|
||||
<div class="url-filters-column">
|
||||
<div class="url-filter-label-row">
|
||||
<label for="{widget_id}_allowlist" class="url-filter-label"><span class="url-filter-label-main">🟢 URL_ALLOWLIST</span></label>
|
||||
<span class="url-filter-label-note">Regex patterns or domains to exclude, one pattern per line.</span>
|
||||
</div>
|
||||
<textarea id="{widget_id}_allowlist"
|
||||
name="{name}_allowlist"
|
||||
rows="2"
|
||||
placeholder="^https?://([^/]+\\.)?(example\\.com|example\\.org)([:/]|$)">{allowlist}</textarea>
|
||||
</div>
|
||||
<div class="url-filters-column">
|
||||
<div class="url-filter-label-row">
|
||||
<label for="{widget_id}_denylist" class="url-filter-label"><span class="url-filter-label-main">⛔ URL_DENYLIST</span></label>
|
||||
<span class="url-filter-label-note">Regex patterns or domains to exclude, one pattern per line.</span>
|
||||
</div>
|
||||
<textarea id="{widget_id}_denylist"
|
||||
name="{name}_denylist"
|
||||
rows="2"
|
||||
placeholder="^https?://([^/]+\\.)?(cdn\\.example\\.com|analytics\\.example\\.org)([:/]|$)">{denylist}</textarea>
|
||||
</div>
|
||||
</div>
|
||||
<label class="url-filters-toggle" for="{widget_id}_same_domain_only">
|
||||
<input type="checkbox" id="{widget_id}_same_domain_only" name="{name}_same_domain_only" value="1">
|
||||
<span>Same domain only</span>
|
||||
</label>
|
||||
<div class="help-text">These values can be one regex pattern or domain per line. URL_DENYLIST takes precedence over URL_ALLOWLIST.</div>
|
||||
<script>
|
||||
(function() {{
|
||||
var allowlistField = document.getElementById('{widget_id}_allowlist');
|
||||
var denylistField = document.getElementById('{widget_id}_denylist');
|
||||
var sameDomainOnly = document.getElementById('{widget_id}_same_domain_only');
|
||||
var sourceField = document.querySelector({json.dumps(self.source_selector)});
|
||||
var lastAutoGeneratedAllowlist = '';
|
||||
if (!allowlistField || !sameDomainOnly || !sourceField) {{
|
||||
return;
|
||||
}}
|
||||
|
||||
function extractUrl(line) {{
|
||||
var trimmed = String(line || '').trim();
|
||||
if (!trimmed || trimmed.charAt(0) === '#') {{
|
||||
return '';
|
||||
}}
|
||||
if (trimmed.charAt(0) === '{{') {{
|
||||
try {{
|
||||
var record = JSON.parse(trimmed);
|
||||
return String(record.url || '').trim();
|
||||
}} catch (error) {{
|
||||
return '';
|
||||
}}
|
||||
}}
|
||||
return trimmed;
|
||||
}}
|
||||
|
||||
function escapeRegex(text) {{
|
||||
return String(text || '').replace(/[.*+?^${{}}()|[\\]\\\\]/g, '\\\\$&');
|
||||
}}
|
||||
|
||||
function buildHostRegex(domains) {{
|
||||
if (!domains.length) {{
|
||||
return '';
|
||||
}}
|
||||
return '^https?://(' + domains.map(escapeRegex).join('|') + ')([:/]|$)';
|
||||
}}
|
||||
|
||||
function getConfigEditorRows() {{
|
||||
return document.getElementById('id_config_rows');
|
||||
}}
|
||||
|
||||
function getConfigUpdater() {{
|
||||
return window.updateHiddenField_id_config || null;
|
||||
}}
|
||||
|
||||
function findConfigRow(key) {{
|
||||
var rows = getConfigEditorRows();
|
||||
if (!rows) {{
|
||||
return null;
|
||||
}}
|
||||
var matches = Array.prototype.filter.call(rows.querySelectorAll('.key-value-row'), function(row) {{
|
||||
var keyInput = row.querySelector('.kv-key');
|
||||
return keyInput && keyInput.value.trim() === key;
|
||||
}});
|
||||
return matches.length ? matches[0] : null;
|
||||
}}
|
||||
|
||||
function addConfigRow() {{
|
||||
if (typeof window.addKeyValueRow_id_config === 'function') {{
|
||||
window.addKeyValueRow_id_config();
|
||||
var rows = getConfigEditorRows();
|
||||
return rows ? rows.lastElementChild : null;
|
||||
}}
|
||||
return null;
|
||||
}}
|
||||
|
||||
function setConfigRow(key, value) {{
|
||||
var rows = getConfigEditorRows();
|
||||
var updater = getConfigUpdater();
|
||||
if (!rows || !updater) {{
|
||||
return;
|
||||
}}
|
||||
|
||||
var row = findConfigRow(key);
|
||||
if (!value) {{
|
||||
if (row) {{
|
||||
row.remove();
|
||||
updater();
|
||||
}}
|
||||
return;
|
||||
}}
|
||||
|
||||
if (!row) {{
|
||||
row = addConfigRow();
|
||||
}}
|
||||
if (!row) {{
|
||||
return;
|
||||
}}
|
||||
|
||||
var keyInput = row.querySelector('.kv-key');
|
||||
var valueInput = row.querySelector('.kv-value');
|
||||
if (!keyInput || !valueInput) {{
|
||||
return;
|
||||
}}
|
||||
|
||||
keyInput.value = key;
|
||||
valueInput.value = value;
|
||||
keyInput.dispatchEvent(new Event('input', {{ bubbles: true }}));
|
||||
valueInput.dispatchEvent(new Event('input', {{ bubbles: true }}));
|
||||
updater();
|
||||
}}
|
||||
|
||||
function syncConfigEditor() {{
|
||||
setConfigRow('URL_ALLOWLIST', allowlistField.value.trim());
|
||||
setConfigRow('URL_DENYLIST', denylistField ? denylistField.value.trim() : '');
|
||||
}}
|
||||
|
||||
function syncAllowlistFromUrls() {{
|
||||
if (!sameDomainOnly.checked) {{
|
||||
if (allowlistField.value.trim() === lastAutoGeneratedAllowlist) {{
|
||||
allowlistField.value = '';
|
||||
syncConfigEditor();
|
||||
}}
|
||||
lastAutoGeneratedAllowlist = '';
|
||||
return;
|
||||
}}
|
||||
|
||||
var seen = Object.create(null);
|
||||
var domains = [];
|
||||
sourceField.value.split(/\\n+/).forEach(function(line) {{
|
||||
var url = extractUrl(line);
|
||||
if (!url) {{
|
||||
return;
|
||||
}}
|
||||
try {{
|
||||
var parsed = new URL(url);
|
||||
var domain = String(parsed.hostname || '').toLowerCase();
|
||||
if (!domain || seen[domain]) {{
|
||||
return;
|
||||
}}
|
||||
seen[domain] = true;
|
||||
domains.push(domain);
|
||||
}} catch (error) {{
|
||||
return;
|
||||
}}
|
||||
}});
|
||||
lastAutoGeneratedAllowlist = buildHostRegex(domains);
|
||||
allowlistField.value = lastAutoGeneratedAllowlist;
|
||||
syncConfigEditor();
|
||||
}}
|
||||
|
||||
sameDomainOnly.addEventListener('change', syncAllowlistFromUrls);
|
||||
sourceField.addEventListener('input', syncAllowlistFromUrls);
|
||||
sourceField.addEventListener('change', syncAllowlistFromUrls);
|
||||
allowlistField.addEventListener('input', syncConfigEditor);
|
||||
allowlistField.addEventListener('change', syncConfigEditor);
|
||||
if (denylistField) {{
|
||||
denylistField.addEventListener('input', syncConfigEditor);
|
||||
denylistField.addEventListener('change', syncConfigEditor);
|
||||
}}
|
||||
|
||||
if (document.readyState === 'loading') {{
|
||||
document.addEventListener('DOMContentLoaded', syncConfigEditor, {{ once: true }});
|
||||
}} else {{
|
||||
syncConfigEditor();
|
||||
}}
|
||||
}})();
|
||||
</script>
|
||||
</div>
|
||||
''')
|
||||
|
||||
def value_from_datadict(self, data, files, name):
|
||||
return {
|
||||
'allowlist': data.get(f'{name}_allowlist', ''),
|
||||
'denylist': data.get(f'{name}_denylist', ''),
|
||||
'same_domain_only': data.get(f'{name}_same_domain_only') in ('1', 'on', 'true'),
|
||||
}
|
||||
|
||||
|
||||
class InlineTagEditorWidget(TagEditorWidget):
|
||||
"""
|
||||
Inline version of TagEditorWidget for use in list views.
|
||||
Includes AJAX save functionality for immediate persistence.
|
||||
"""
|
||||
|
||||
def __init__(self, attrs=None, snapshot_id=None):
|
||||
def __init__(self, attrs=None, snapshot_id=None, editable=True):
|
||||
super().__init__(attrs, snapshot_id)
|
||||
self.snapshot_id = snapshot_id
|
||||
self.editable = editable
|
||||
|
||||
def render(self, name, value, attrs=None, renderer=None, snapshot_id=None):
|
||||
"""Render inline tag editor with AJAX save."""
|
||||
@@ -361,20 +625,24 @@ class InlineTagEditorWidget(TagEditorWidget):
|
||||
# Build pills HTML with filter links
|
||||
pills_html = ''
|
||||
for td in tag_data:
|
||||
remove_button = ''
|
||||
if self.editable:
|
||||
remove_button = (
|
||||
f'<button type="button" class="tag-remove-btn" '
|
||||
f'data-tag-id="{td["id"]}" data-tag-name="{self._escape(td["name"])}">×</button>'
|
||||
)
|
||||
pills_html += f'''
|
||||
<span class="tag-pill" data-tag="{self._escape(td['name'])}" data-tag-id="{td['id']}" style="{self._tag_style(td['name'])}">
|
||||
<a href="/admin/core/snapshot/?tags__id__exact={td['id']}" class="tag-link">{self._escape(td['name'])}</a>
|
||||
<button type="button" class="tag-remove-btn" data-tag-id="{td['id']}" data-tag-name="{self._escape(td['name'])}">×</button>
|
||||
{remove_button}
|
||||
</span>
|
||||
'''
|
||||
|
||||
tags_json = escape(json.dumps(tag_data))
|
||||
|
||||
html = f'''
|
||||
<span id="{widget_id}_container" class="tag-editor-inline" data-snapshot-id="{snapshot_id}" data-tags="{tags_json}">
|
||||
<span id="{widget_id}_pills" class="tag-pills-inline">
|
||||
{pills_html}
|
||||
</span>
|
||||
input_html = ''
|
||||
readonly_class = ' readonly' if not self.editable else ''
|
||||
if self.editable:
|
||||
input_html = f'''
|
||||
<input type="text"
|
||||
id="{widget_id}_input"
|
||||
class="tag-inline-input-sm"
|
||||
@@ -384,6 +652,14 @@ class InlineTagEditorWidget(TagEditorWidget):
|
||||
data-inline-tag-input="1"
|
||||
>
|
||||
<datalist id="{widget_id}_datalist"></datalist>
|
||||
'''
|
||||
|
||||
html = f'''
|
||||
<span id="{widget_id}_container" class="tag-editor-inline{readonly_class}" data-snapshot-id="{snapshot_id}" data-tags="{tags_json}" data-readonly="{int(not self.editable)}">
|
||||
<span id="{widget_id}_pills" class="tag-pills-inline">
|
||||
{pills_html}
|
||||
</span>
|
||||
{input_html}
|
||||
</span>
|
||||
'''
|
||||
|
||||
|
||||
@@ -1,8 +1,11 @@
|
||||
__package__ = 'archivebox.crawls'
|
||||
|
||||
|
||||
from django import forms
|
||||
from django.utils.html import format_html, format_html_join
|
||||
from django.http import JsonResponse, HttpRequest, HttpResponseNotAllowed
|
||||
from django.shortcuts import get_object_or_404, redirect
|
||||
from django.urls import path, reverse
|
||||
from django.utils.html import escape, format_html, format_html_join
|
||||
from django.utils import timezone
|
||||
from django.utils.safestring import mark_safe
|
||||
from django.contrib import admin, messages
|
||||
from django.db.models import Count, Q
|
||||
@@ -13,16 +16,19 @@ from django_object_actions import action
|
||||
from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin
|
||||
|
||||
from archivebox.core.models import Snapshot
|
||||
from archivebox.core.widgets import TagEditorWidget
|
||||
from archivebox.crawls.models import Crawl, CrawlSchedule
|
||||
|
||||
|
||||
def render_snapshots_list(snapshots_qs, limit=20):
|
||||
def render_snapshots_list(snapshots_qs, limit=20, crawl=None):
|
||||
"""Render a nice inline list view of snapshots with status, title, URL, and progress."""
|
||||
|
||||
snapshots = snapshots_qs.order_by('-created_at')[:limit].annotate(
|
||||
total_results=Count('archiveresult'),
|
||||
succeeded_results=Count('archiveresult', filter=Q(archiveresult__status='succeeded')),
|
||||
failed_results=Count('archiveresult', filter=Q(archiveresult__status='failed')),
|
||||
started_results=Count('archiveresult', filter=Q(archiveresult__status='started')),
|
||||
skipped_results=Count('archiveresult', filter=Q(archiveresult__status='skipped')),
|
||||
)
|
||||
|
||||
if not snapshots:
|
||||
@@ -43,17 +49,57 @@ def render_snapshots_list(snapshots_qs, limit=20):
|
||||
|
||||
# Calculate progress
|
||||
total = snapshot.total_results
|
||||
done = snapshot.succeeded_results + snapshot.failed_results
|
||||
succeeded = snapshot.succeeded_results
|
||||
failed = snapshot.failed_results
|
||||
running = snapshot.started_results
|
||||
skipped = snapshot.skipped_results
|
||||
done = succeeded + failed + skipped
|
||||
pending = max(total - done - running, 0)
|
||||
progress_pct = int((done / total) * 100) if total > 0 else 0
|
||||
progress_text = f'{done}/{total}' if total > 0 else '-'
|
||||
progress_title = (
|
||||
f'{succeeded} succeeded, {failed} failed, {running} running, '
|
||||
f'{pending} pending, {skipped} skipped'
|
||||
)
|
||||
progress_color = '#28a745'
|
||||
if failed:
|
||||
progress_color = '#dc3545'
|
||||
elif running:
|
||||
progress_color = '#17a2b8'
|
||||
elif pending:
|
||||
progress_color = '#ffc107'
|
||||
|
||||
# Truncate title and URL
|
||||
title = (snapshot.title or 'Untitled')[:60]
|
||||
if len(snapshot.title or '') > 60:
|
||||
snapshot_title = snapshot.title or 'Untitled'
|
||||
title = snapshot_title[:60]
|
||||
if len(snapshot_title) > 60:
|
||||
title += '...'
|
||||
url_display = snapshot.url[:50]
|
||||
if len(snapshot.url) > 50:
|
||||
url_display += '...'
|
||||
delete_button = ''
|
||||
exclude_button = ''
|
||||
if crawl is not None:
|
||||
delete_url = reverse('admin:crawls_crawl_snapshot_delete', args=[crawl.pk, snapshot.pk])
|
||||
exclude_url = reverse('admin:crawls_crawl_snapshot_exclude_domain', args=[crawl.pk, snapshot.pk])
|
||||
delete_button = f'''
|
||||
<button type="button"
|
||||
class="crawl-snapshots-action"
|
||||
data-post-url="{escape(delete_url)}"
|
||||
data-confirm="Delete this snapshot from the crawl?"
|
||||
title="Delete this snapshot from the crawl and remove its URL from the crawl queue."
|
||||
aria-label="Delete snapshot"
|
||||
style="border: 1px solid #ddd; background: #fff; color: #666; border-radius: 4px; width: 28px; height: 28px; cursor: pointer;">🗑</button>
|
||||
'''
|
||||
exclude_button = f'''
|
||||
<button type="button"
|
||||
class="crawl-snapshots-action"
|
||||
data-post-url="{escape(exclude_url)}"
|
||||
data-confirm="Exclude this domain from the crawl? This removes matching queued URLs, deletes pending matching snapshots, and blocks future matches."
|
||||
title="Exclude this domain from this crawl. This removes matching URLs from the crawl queue, deletes pending matching snapshots, and blocks future matches."
|
||||
aria-label="Exclude domain from crawl"
|
||||
style="border: 1px solid #ddd; background: #fff; color: #666; border-radius: 4px; width: 28px; height: 28px; cursor: pointer;">⊘</button>
|
||||
'''
|
||||
|
||||
# Format date
|
||||
date_str = snapshot.created_at.strftime('%Y-%m-%d %H:%M') if snapshot.created_at else '-'
|
||||
@@ -74,18 +120,18 @@ def render_snapshots_list(snapshots_qs, limit=20):
|
||||
</td>
|
||||
<td style="padding: 6px 8px; max-width: 300px;">
|
||||
<a href="{snapshot.admin_change_url}" style="color: #417690; text-decoration: none; font-weight: 500;"
|
||||
title="{snapshot.title or 'Untitled'}">{title}</a>
|
||||
title="{escape(snapshot_title)}">{escape(title)}</a>
|
||||
</td>
|
||||
<td style="padding: 6px 8px; max-width: 250px;">
|
||||
<a href="{snapshot.url}" target="_blank"
|
||||
<a href="{escape(snapshot.url)}" target="_blank"
|
||||
style="color: #666; text-decoration: none; font-family: monospace; font-size: 11px;"
|
||||
title="{snapshot.url}">{url_display}</a>
|
||||
title="{escape(snapshot.url)}">{escape(url_display)}</a>
|
||||
</td>
|
||||
<td style="padding: 6px 8px; white-space: nowrap; text-align: center;">
|
||||
<div style="display: inline-flex; align-items: center; gap: 6px;">
|
||||
<div style="display: inline-flex; align-items: center; gap: 6px;" title="{escape(progress_title)}">
|
||||
<div style="width: 60px; height: 6px; background: #eee; border-radius: 3px; overflow: hidden;">
|
||||
<div style="width: {progress_pct}%; height: 100%;
|
||||
background: {'#28a745' if snapshot.failed_results == 0 else '#ffc107' if snapshot.succeeded_results > 0 else '#dc3545'};
|
||||
background: {progress_color};
|
||||
transition: width 0.3s;"></div>
|
||||
</div>
|
||||
<a href="/admin/core/archiveresult/?snapshot__id__exact={snapshot.id}"
|
||||
@@ -96,6 +142,7 @@ def render_snapshots_list(snapshots_qs, limit=20):
|
||||
<td style="padding: 6px 8px; white-space: nowrap; color: #888; font-size: 11px;">
|
||||
{date_str}
|
||||
</td>
|
||||
{"<td style=\"padding: 6px 8px; white-space: nowrap; text-align: right;\"><div style=\"display: inline-flex; gap: 6px;\">%s%s</div></td>" % (exclude_button, delete_button) if crawl is not None else ""}
|
||||
</tr>
|
||||
''')
|
||||
|
||||
@@ -111,7 +158,7 @@ def render_snapshots_list(snapshots_qs, limit=20):
|
||||
'''
|
||||
|
||||
return mark_safe(f'''
|
||||
<div style="border: 1px solid #ddd; border-radius: 6px; overflow: hidden; max-width: 100%;">
|
||||
<div data-crawl-snapshots-list style="border: 1px solid #ddd; border-radius: 6px; overflow: hidden; max-width: 100%;">
|
||||
<table style="width: 100%; border-collapse: collapse; font-size: 13px;">
|
||||
<thead>
|
||||
<tr style="background: #f5f5f5; border-bottom: 2px solid #ddd;">
|
||||
@@ -121,6 +168,7 @@ def render_snapshots_list(snapshots_qs, limit=20):
|
||||
<th style="padding: 8px; text-align: left; font-weight: 600; color: #333;">URL</th>
|
||||
<th style="padding: 8px; text-align: center; font-weight: 600; color: #333;">Progress</th>
|
||||
<th style="padding: 8px; text-align: left; font-weight: 600; color: #333;">Created</th>
|
||||
{'<th style="padding: 8px; text-align: right; font-weight: 600; color: #333;">Actions</th>' if crawl is not None else ''}
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
@@ -129,11 +177,197 @@ def render_snapshots_list(snapshots_qs, limit=20):
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
{'''
|
||||
<script>
|
||||
(function() {
|
||||
if (window.__archiveboxCrawlSnapshotActionsBound) {
|
||||
return;
|
||||
}
|
||||
window.__archiveboxCrawlSnapshotActionsBound = true;
|
||||
|
||||
function getCookie(name) {
|
||||
var cookieValue = null;
|
||||
if (!document.cookie) {
|
||||
return cookieValue;
|
||||
}
|
||||
var cookies = document.cookie.split(';');
|
||||
for (var i = 0; i < cookies.length; i++) {
|
||||
var cookie = cookies[i].trim();
|
||||
if (cookie.substring(0, name.length + 1) === (name + '=')) {
|
||||
cookieValue = decodeURIComponent(cookie.substring(name.length + 1));
|
||||
break;
|
||||
}
|
||||
}
|
||||
return cookieValue;
|
||||
}
|
||||
|
||||
document.addEventListener('click', function(event) {
|
||||
var button = event.target.closest('.crawl-snapshots-action');
|
||||
if (!button) {
|
||||
return;
|
||||
}
|
||||
event.preventDefault();
|
||||
|
||||
var confirmMessage = button.getAttribute('data-confirm');
|
||||
if (confirmMessage && !window.confirm(confirmMessage)) {
|
||||
return;
|
||||
}
|
||||
|
||||
button.disabled = true;
|
||||
|
||||
fetch(button.getAttribute('data-post-url'), {
|
||||
method: 'POST',
|
||||
credentials: 'same-origin',
|
||||
headers: {
|
||||
'X-CSRFToken': getCookie('csrftoken') || '',
|
||||
'X-Requested-With': 'XMLHttpRequest'
|
||||
}
|
||||
}).then(function(response) {
|
||||
return response.json().then(function(data) {
|
||||
if (!response.ok) {
|
||||
throw new Error(data.error || 'Request failed');
|
||||
}
|
||||
return data;
|
||||
});
|
||||
}).then(function() {
|
||||
window.location.reload();
|
||||
}).catch(function(error) {
|
||||
button.disabled = false;
|
||||
window.alert(error.message || 'Request failed');
|
||||
});
|
||||
});
|
||||
})();
|
||||
</script>
|
||||
''' if crawl is not None else ''}
|
||||
''')
|
||||
|
||||
|
||||
class URLFiltersWidget(forms.Widget):
|
||||
def render(self, name, value, attrs=None, renderer=None):
|
||||
value = value if isinstance(value, dict) else {}
|
||||
widget_id = (attrs or {}).get('id', name)
|
||||
allowlist = escape(value.get('allowlist', '') or '')
|
||||
denylist = escape(value.get('denylist', '') or '')
|
||||
|
||||
return mark_safe(f'''
|
||||
<div id="{widget_id}_container" style="min-width: 420px;">
|
||||
<input type="hidden" name="{name}" value="">
|
||||
<div style="display: grid; grid-template-columns: 1fr 1fr; gap: 10px;">
|
||||
<div>
|
||||
<label for="{widget_id}_allowlist" style="display: block; font-weight: 600; margin-bottom: 4px;">Allowlist</label>
|
||||
<textarea id="{widget_id}_allowlist" name="{name}_allowlist" rows="3"
|
||||
style="width: 100%; font-family: monospace; font-size: 12px;"
|
||||
placeholder="example.com *.example.com">{allowlist}</textarea>
|
||||
</div>
|
||||
<div>
|
||||
<label for="{widget_id}_denylist" style="display: block; font-weight: 600; margin-bottom: 4px;">Denylist</label>
|
||||
<textarea id="{widget_id}_denylist" name="{name}_denylist" rows="3"
|
||||
style="width: 100%; font-family: monospace; font-size: 12px;"
|
||||
placeholder="static.example.com">{denylist}</textarea>
|
||||
</div>
|
||||
</div>
|
||||
<label style="display: inline-flex; align-items: center; gap: 6px; margin-top: 8px; font-weight: 500;">
|
||||
<input type="checkbox" id="{widget_id}_same_domain_only" name="{name}_same_domain_only" value="1">
|
||||
Same domain only
|
||||
</label>
|
||||
<p style="color: #666; font-size: 11px; margin: 6px 0 0 0;">
|
||||
Enter domains, wildcards, or regex patterns. Denylist takes precedence over allowlist.
|
||||
</p>
|
||||
<script>
|
||||
(function() {{
|
||||
if (window.__archiveboxUrlFilterEditors && window.__archiveboxUrlFilterEditors['{widget_id}']) {{
|
||||
return;
|
||||
}}
|
||||
window.__archiveboxUrlFilterEditors = window.__archiveboxUrlFilterEditors || {{}};
|
||||
window.__archiveboxUrlFilterEditors['{widget_id}'] = true;
|
||||
|
||||
var urlsField = document.getElementById('id_urls');
|
||||
var allowlistField = document.getElementById('{widget_id}_allowlist');
|
||||
var sameDomainOnly = document.getElementById('{widget_id}_same_domain_only');
|
||||
|
||||
function extractUrl(line) {{
|
||||
var trimmed = (line || '').trim();
|
||||
if (!trimmed || trimmed.charAt(0) === '#') {{
|
||||
return '';
|
||||
}}
|
||||
if (trimmed.charAt(0) === '{{') {{
|
||||
try {{
|
||||
var record = JSON.parse(trimmed);
|
||||
return String(record.url || '').trim();
|
||||
}} catch (error) {{
|
||||
return '';
|
||||
}}
|
||||
}}
|
||||
return trimmed;
|
||||
}}
|
||||
|
||||
function syncAllowlistFromUrls() {{
|
||||
if (!urlsField || !allowlistField || !sameDomainOnly || !sameDomainOnly.checked) {{
|
||||
return;
|
||||
}}
|
||||
var domains = [];
|
||||
var seen = Object.create(null);
|
||||
urlsField.value.split(/\\n+/).forEach(function(line) {{
|
||||
var url = extractUrl(line);
|
||||
if (!url) {{
|
||||
return;
|
||||
}}
|
||||
try {{
|
||||
var parsed = new URL(url);
|
||||
var domain = (parsed.hostname || '').toLowerCase();
|
||||
if (domain && !seen[domain]) {{
|
||||
seen[domain] = true;
|
||||
domains.push(domain);
|
||||
}}
|
||||
}} catch (error) {{
|
||||
return;
|
||||
}}
|
||||
}});
|
||||
allowlistField.value = domains.join('\\n');
|
||||
}}
|
||||
|
||||
if (sameDomainOnly) {{
|
||||
sameDomainOnly.addEventListener('change', syncAllowlistFromUrls);
|
||||
}}
|
||||
if (urlsField) {{
|
||||
urlsField.addEventListener('input', syncAllowlistFromUrls);
|
||||
urlsField.addEventListener('change', syncAllowlistFromUrls);
|
||||
}}
|
||||
}})();
|
||||
</script>
|
||||
</div>
|
||||
''')
|
||||
|
||||
def value_from_datadict(self, data, files, name):
|
||||
return {
|
||||
'allowlist': data.get(f'{name}_allowlist', ''),
|
||||
'denylist': data.get(f'{name}_denylist', ''),
|
||||
'same_domain_only': data.get(f'{name}_same_domain_only') in ('1', 'on', 'true'),
|
||||
}
|
||||
|
||||
|
||||
class URLFiltersField(forms.Field):
|
||||
widget = URLFiltersWidget
|
||||
|
||||
def to_python(self, value):
|
||||
if isinstance(value, dict):
|
||||
return value
|
||||
return {'allowlist': '', 'denylist': '', 'same_domain_only': False}
|
||||
|
||||
|
||||
class CrawlAdminForm(forms.ModelForm):
|
||||
"""Custom form for Crawl admin to render urls field as textarea."""
|
||||
tags_editor = forms.CharField(
|
||||
label='Tags',
|
||||
required=False,
|
||||
widget=TagEditorWidget(),
|
||||
help_text='Type tag names and press Enter or Space to add. Click × to remove.',
|
||||
)
|
||||
url_filters = URLFiltersField(
|
||||
label='URL Filters',
|
||||
required=False,
|
||||
help_text='Set URL_ALLOWLIST / URL_DENYLIST for this crawl.',
|
||||
)
|
||||
|
||||
class Meta:
|
||||
model = Crawl
|
||||
@@ -144,8 +378,62 @@ class CrawlAdminForm(forms.ModelForm):
|
||||
'style': 'width: 100%; font-family: monospace; font-size: 13px;',
|
||||
'placeholder': 'https://example.com\nhttps://example2.com\n# Comments start with #',
|
||||
}),
|
||||
'notes': forms.Textarea(attrs={
|
||||
'rows': 1,
|
||||
'style': 'width: 100%; min-height: 0; resize: vertical;',
|
||||
}),
|
||||
}
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
config = dict(self.instance.config or {}) if self.instance and self.instance.pk else {}
|
||||
if self.instance and self.instance.pk:
|
||||
self.initial['tags_editor'] = self.instance.tags_str
|
||||
self.initial['url_filters'] = {
|
||||
'allowlist': config.get('URL_ALLOWLIST', ''),
|
||||
'denylist': config.get('URL_DENYLIST', ''),
|
||||
'same_domain_only': False,
|
||||
}
|
||||
|
||||
def clean_tags_editor(self):
|
||||
tags_str = self.cleaned_data.get('tags_editor', '')
|
||||
tag_names = []
|
||||
seen = set()
|
||||
for raw_name in tags_str.split(','):
|
||||
name = raw_name.strip()
|
||||
if not name:
|
||||
continue
|
||||
lowered = name.lower()
|
||||
if lowered in seen:
|
||||
continue
|
||||
seen.add(lowered)
|
||||
tag_names.append(name)
|
||||
return ','.join(tag_names)
|
||||
|
||||
def clean_url_filters(self):
|
||||
value = self.cleaned_data.get('url_filters') or {}
|
||||
return {
|
||||
'allowlist': '\n'.join(Crawl.split_filter_patterns(value.get('allowlist', ''))),
|
||||
'denylist': '\n'.join(Crawl.split_filter_patterns(value.get('denylist', ''))),
|
||||
'same_domain_only': bool(value.get('same_domain_only')),
|
||||
}
|
||||
|
||||
def save(self, commit=True):
|
||||
instance = super().save(commit=False)
|
||||
instance.tags_str = self.cleaned_data.get('tags_editor', '')
|
||||
url_filters = self.cleaned_data.get('url_filters') or {}
|
||||
instance.set_url_filters(
|
||||
url_filters.get('allowlist', ''),
|
||||
url_filters.get('denylist', ''),
|
||||
)
|
||||
if commit:
|
||||
instance.save()
|
||||
instance.apply_crawl_config_filters()
|
||||
save_m2m = getattr(self, '_save_m2m', None)
|
||||
if callable(save_m2m):
|
||||
save_m2m()
|
||||
return instance
|
||||
|
||||
|
||||
class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
|
||||
form = CrawlAdminForm
|
||||
@@ -161,11 +449,11 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
|
||||
'classes': ('card', 'wide'),
|
||||
}),
|
||||
('Info', {
|
||||
'fields': ('label', 'notes', 'tags_str'),
|
||||
'fields': ('label', 'notes', 'tags_editor'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Settings', {
|
||||
'fields': ('max_depth', 'config'),
|
||||
'fields': (('max_depth', 'url_filters'), 'config'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Status', {
|
||||
@@ -185,6 +473,28 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
|
||||
'classes': ('card', 'wide'),
|
||||
}),
|
||||
)
|
||||
add_fieldsets = (
|
||||
('URLs', {
|
||||
'fields': ('urls',),
|
||||
'classes': ('card', 'wide'),
|
||||
}),
|
||||
('Info', {
|
||||
'fields': ('label', 'notes', 'tags_editor'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Settings', {
|
||||
'fields': (('max_depth', 'url_filters'), 'config'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Status', {
|
||||
'fields': ('status', 'retry_at'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Relations', {
|
||||
'fields': ('schedule', 'created_by'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
)
|
||||
|
||||
list_filter = ('max_depth', 'schedule', 'created_by', 'status', 'retry_at')
|
||||
ordering = ['-created_at', '-retry_at']
|
||||
@@ -199,6 +509,25 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
|
||||
num_snapshots_cached=Count('snapshot_set')
|
||||
)
|
||||
|
||||
def get_fieldsets(self, request, obj=None):
|
||||
return self.fieldsets if obj else self.add_fieldsets
|
||||
|
||||
def get_urls(self):
|
||||
urls = super().get_urls()
|
||||
custom_urls = [
|
||||
path(
|
||||
'<path:object_id>/snapshot/<path:snapshot_id>/delete/',
|
||||
self.admin_site.admin_view(self.delete_snapshot_view),
|
||||
name='crawls_crawl_snapshot_delete',
|
||||
),
|
||||
path(
|
||||
'<path:object_id>/snapshot/<path:snapshot_id>/exclude-domain/',
|
||||
self.admin_site.admin_view(self.exclude_domain_view),
|
||||
name='crawls_crawl_snapshot_exclude_domain',
|
||||
),
|
||||
]
|
||||
return custom_urls + urls
|
||||
|
||||
@admin.action(description='Delete selected crawls')
|
||||
def delete_selected_batched(self, request, queryset):
|
||||
"""Delete crawls in a single transaction to avoid SQLite concurrency issues."""
|
||||
@@ -218,8 +547,6 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
|
||||
@action(label='Recrawl', description='Create a new crawl with the same settings')
|
||||
def recrawl(self, request, obj):
|
||||
"""Duplicate this crawl as a new crawl with the same URLs and settings."""
|
||||
from django.utils import timezone
|
||||
from django.shortcuts import redirect
|
||||
|
||||
# Validate URLs (required for crawl to start)
|
||||
if not obj.urls:
|
||||
@@ -252,7 +579,37 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
|
||||
return getattr(obj, 'num_snapshots_cached', obj.snapshot_set.count())
|
||||
|
||||
def snapshots(self, obj):
|
||||
return render_snapshots_list(obj.snapshot_set.all())
|
||||
return render_snapshots_list(obj.snapshot_set.all(), crawl=obj)
|
||||
|
||||
def delete_snapshot_view(self, request: HttpRequest, object_id: str, snapshot_id: str):
|
||||
if request.method != 'POST':
|
||||
return HttpResponseNotAllowed(['POST'])
|
||||
|
||||
crawl = get_object_or_404(Crawl, pk=object_id)
|
||||
snapshot = get_object_or_404(Snapshot, pk=snapshot_id, crawl=crawl)
|
||||
|
||||
if snapshot.status == Snapshot.StatusChoices.STARTED:
|
||||
snapshot.cancel_running_hooks()
|
||||
|
||||
removed_urls = crawl.prune_url(snapshot.url)
|
||||
snapshot.delete()
|
||||
return JsonResponse({
|
||||
'ok': True,
|
||||
'snapshot_id': str(snapshot.id),
|
||||
'removed_urls': removed_urls,
|
||||
})
|
||||
|
||||
def exclude_domain_view(self, request: HttpRequest, object_id: str, snapshot_id: str):
|
||||
if request.method != 'POST':
|
||||
return HttpResponseNotAllowed(['POST'])
|
||||
|
||||
crawl = get_object_or_404(Crawl, pk=object_id)
|
||||
snapshot = get_object_or_404(Snapshot, pk=snapshot_id, crawl=crawl)
|
||||
result = crawl.exclude_domain(snapshot.url)
|
||||
return JsonResponse({
|
||||
'ok': True,
|
||||
**result,
|
||||
})
|
||||
|
||||
@admin.display(description='Schedule', ordering='schedule')
|
||||
def schedule_str(self, obj):
|
||||
|
||||
@@ -2,9 +2,12 @@ __package__ = 'archivebox.crawls'
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
import uuid
|
||||
import json
|
||||
import re
|
||||
from datetime import timedelta
|
||||
from archivebox.uuid_compat import uuid7
|
||||
from pathlib import Path
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from django.db import models
|
||||
from django.core.validators import MaxValueValidator, MinValueValidator
|
||||
@@ -141,22 +144,21 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
return f'[...{short_id}] {first_url[:120]}'
|
||||
|
||||
def save(self, *args, **kwargs):
|
||||
is_new = self._state.adding
|
||||
super().save(*args, **kwargs)
|
||||
if is_new:
|
||||
from archivebox.misc.logging_util import log_worker_event
|
||||
first_url = self.get_urls_list()[0] if self.get_urls_list() else ''
|
||||
log_worker_event(
|
||||
worker_type='DB',
|
||||
event='Created Crawl',
|
||||
indent_level=1,
|
||||
metadata={
|
||||
'id': str(self.id),
|
||||
'first_url': first_url[:64],
|
||||
'max_depth': self.max_depth,
|
||||
'status': self.status,
|
||||
},
|
||||
)
|
||||
# if is_new:
|
||||
# from archivebox.misc.logging_util import log_worker_event
|
||||
# first_url = self.get_urls_list()[0] if self.get_urls_list() else ''
|
||||
# log_worker_event(
|
||||
# worker_type='DB',
|
||||
# event='Created Crawl',
|
||||
# indent_level=1,
|
||||
# metadata={
|
||||
# 'id': str(self.id),
|
||||
# 'first_url': first_url[:64],
|
||||
# 'max_depth': self.max_depth,
|
||||
# 'status': self.status,
|
||||
# },
|
||||
# )
|
||||
|
||||
@property
|
||||
def api_url(self) -> str:
|
||||
@@ -248,6 +250,222 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
if url.strip() and not url.strip().startswith('#')
|
||||
]
|
||||
|
||||
@staticmethod
|
||||
def normalize_domain(value: str) -> str:
|
||||
candidate = (value or '').strip().lower()
|
||||
if not candidate:
|
||||
return ''
|
||||
if '://' not in candidate and '/' not in candidate:
|
||||
candidate = f'https://{candidate.lstrip(".")}'
|
||||
try:
|
||||
parsed = urlparse(candidate)
|
||||
hostname = parsed.hostname or ''
|
||||
if not hostname:
|
||||
return ''
|
||||
if parsed.port:
|
||||
return f'{hostname}_{parsed.port}'
|
||||
return hostname
|
||||
except Exception:
|
||||
return ''
|
||||
|
||||
@staticmethod
|
||||
def split_filter_patterns(value) -> list[str]:
|
||||
patterns = []
|
||||
seen = set()
|
||||
if isinstance(value, list):
|
||||
raw_values = value
|
||||
elif isinstance(value, str):
|
||||
raw_values = value.splitlines()
|
||||
else:
|
||||
raw_values = []
|
||||
|
||||
for raw_value in raw_values:
|
||||
pattern = str(raw_value or '').strip()
|
||||
if not pattern or pattern in seen:
|
||||
continue
|
||||
seen.add(pattern)
|
||||
patterns.append(pattern)
|
||||
return patterns
|
||||
|
||||
@classmethod
|
||||
def _pattern_matches_url(cls, url: str, pattern: str) -> bool:
|
||||
normalized_pattern = str(pattern or '').strip()
|
||||
if not normalized_pattern:
|
||||
return False
|
||||
|
||||
if re.fullmatch(r'[\w.*:-]+', normalized_pattern):
|
||||
wildcard_only_subdomains = normalized_pattern.startswith('*.')
|
||||
normalized_domain = cls.normalize_domain(
|
||||
normalized_pattern[2:] if wildcard_only_subdomains else normalized_pattern
|
||||
)
|
||||
normalized_url_domain = cls.normalize_domain(url)
|
||||
if not normalized_domain or not normalized_url_domain:
|
||||
return False
|
||||
|
||||
pattern_host = normalized_domain.split('_', 1)[0]
|
||||
url_host = normalized_url_domain.split('_', 1)[0]
|
||||
|
||||
if wildcard_only_subdomains:
|
||||
return url_host.endswith(f'.{pattern_host}')
|
||||
|
||||
if normalized_url_domain == normalized_domain:
|
||||
return True
|
||||
return url_host == pattern_host or url_host.endswith(f'.{pattern_host}')
|
||||
|
||||
try:
|
||||
return bool(re.search(normalized_pattern, url))
|
||||
except re.error:
|
||||
return False
|
||||
|
||||
def get_url_allowlist(self, *, use_effective_config: bool = False, snapshot=None) -> list[str]:
|
||||
if use_effective_config:
|
||||
from archivebox.config.configset import get_config
|
||||
|
||||
config = get_config(crawl=self, snapshot=snapshot)
|
||||
else:
|
||||
config = self.config or {}
|
||||
return self.split_filter_patterns(config.get('URL_ALLOWLIST', ''))
|
||||
|
||||
def get_url_denylist(self, *, use_effective_config: bool = False, snapshot=None) -> list[str]:
|
||||
if use_effective_config:
|
||||
from archivebox.config.configset import get_config
|
||||
|
||||
config = get_config(crawl=self, snapshot=snapshot)
|
||||
else:
|
||||
config = self.config or {}
|
||||
return self.split_filter_patterns(config.get('URL_DENYLIST', ''))
|
||||
|
||||
def url_passes_filters(self, url: str, *, snapshot=None, use_effective_config: bool = True) -> bool:
|
||||
denylist = self.get_url_denylist(use_effective_config=use_effective_config, snapshot=snapshot)
|
||||
allowlist = self.get_url_allowlist(use_effective_config=use_effective_config, snapshot=snapshot)
|
||||
|
||||
for pattern in denylist:
|
||||
if self._pattern_matches_url(url, pattern):
|
||||
return False
|
||||
|
||||
if allowlist:
|
||||
return any(self._pattern_matches_url(url, pattern) for pattern in allowlist)
|
||||
|
||||
return True
|
||||
|
||||
def set_url_filters(self, allowlist, denylist) -> None:
    """Store normalized allow/deny patterns into this crawl's config dict.

    An empty pattern list removes the corresponding config key entirely
    so the crawl falls back to inherited settings.  The model is NOT
    saved here; callers are responsible for persisting self.config.
    """
    updated = dict(self.config or {})
    for config_key, raw_value in (('URL_ALLOWLIST', allowlist), ('URL_DENYLIST', denylist)):
        patterns = self.split_filter_patterns(raw_value)
        if patterns:
            updated[config_key] = '\n'.join(patterns)
        else:
            updated.pop(config_key, None)
    self.config = updated
|
||||
|
||||
def apply_crawl_config_filters(self) -> dict[str, int]:
    """Re-apply this crawl's own URL filters to its existing state.

    Prunes now-disallowed URLs from the crawl's seed list, cancels
    running hooks on STARTED snapshots that no longer pass, and deletes
    the offending QUEUED/STARTED snapshots.

    Returns:
        Summary dict with 'removed_urls' and 'deleted_snapshots' counts.
    """
    from archivebox.core.models import Snapshot

    def _is_blocked(url, snapshot=None):
        # use_effective_config=False: only this crawl's explicit config counts here
        return not self.url_passes_filters(url, snapshot=snapshot, use_effective_config=False)

    removed_urls = self.prune_urls(_is_blocked)

    pending = self.snapshot_set.filter(
        status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED],
    ).only('pk', 'url', 'status')
    blocked_snapshots = [snap for snap in pending if _is_blocked(snap.url, snapshot=snap)]

    deleted_count = 0
    if blocked_snapshots:
        # Stop in-flight work before deleting the rows out from under it.
        for snap in blocked_snapshots:
            if snap.status == Snapshot.StatusChoices.STARTED:
                snap.cancel_running_hooks()

        blocked_pks = [snap.pk for snap in blocked_snapshots]
        deleted_count, _ = self.snapshot_set.filter(pk__in=blocked_pks).delete()

    return {
        'removed_urls': len(removed_urls),
        'deleted_snapshots': deleted_count,
    }
|
||||
|
||||
def _iter_url_lines(self) -> list[tuple[str, str]]:
    """Parse self.urls into (raw_line, url) pairs.

    Each non-blank line is either a '#' comment (url = ''), a JSONL
    object whose 'url' field is extracted, or a bare URL string.

    Returns:
        List of (original line with trailing whitespace stripped, url)
        tuples; comment lines carry an empty url so callers can preserve
        them verbatim when rewriting self.urls.
    """
    entries: list[tuple[str, str]] = []
    for raw_line in (self.urls or '').splitlines():
        stripped = raw_line.strip()
        if not stripped:
            continue
        line = raw_line.rstrip()
        if stripped.startswith('#'):
            # Keep comments in place but never treat them as URLs
            entries.append((line, ''))
            continue
        try:
            entry = json.loads(stripped)
        except json.JSONDecodeError:
            # Not JSON at all: treat the whole line as a bare URL
            entries.append((line, stripped))
            continue
        if isinstance(entry, dict):
            entries.append((line, str(entry.get('url', '') or '').strip()))
        else:
            # BUGFIX: scalar JSON lines (e.g. "123", "null", "true") parse
            # successfully but previously crashed on entry.get() with an
            # AttributeError that the JSONDecodeError handler did not catch.
            # Fall back to treating such lines as plain text.
            entries.append((line, stripped))
    return entries
|
||||
|
||||
def prune_urls(self, predicate) -> list[str]:
    """Remove seed URLs matching *predicate* from self.urls.

    Lines with no parsed URL (comments) are always kept.  The model is
    saved only when the urls text actually changed.

    Returns:
        The list of URL strings that were removed.
    """
    surviving: list[str] = []
    dropped: list[str] = []

    for raw_line, url in self._iter_url_lines():
        if url and predicate(url):
            dropped.append(url)
        else:
            surviving.append(raw_line)

    rebuilt = '\n'.join(surviving)
    if rebuilt != (self.urls or ''):
        self.urls = rebuilt
        self.save(update_fields=['urls', 'modified_at'])
    return dropped
|
||||
|
||||
def prune_url(self, url: str) -> int:
    """Remove a single exact URL from the crawl's seed list.

    Returns:
        Number of matching entries removed (0 if the URL was not present).
    """
    wanted = (url or '').strip()
    return len(self.prune_urls(lambda candidate: candidate == wanted))
|
||||
|
||||
def exclude_domain(self, domain: str) -> dict[str, int | str | bool]:
    """Add *domain* to this crawl's URL denylist and purge matching work.

    Normalizes the domain, appends it to the crawl-local denylist if not
    already present, then re-applies the crawl filters (pruning seed
    URLs and deleting queued/started snapshots that now fail).

    Returns:
        Dict with the normalized 'domain', whether a new entry was
        'created', and 'removed_urls' / 'deleted_snapshots' counts.
    """
    normalized = self.normalize_domain(domain)
    if not normalized:
        # Nothing usable to exclude: report a no-op result
        return {
            'domain': '',
            'created': False,
            'removed_urls': 0,
            'deleted_snapshots': 0,
        }

    denylist = self.get_url_denylist(use_effective_config=False)
    is_new = normalized not in denylist
    if is_new:
        denylist.append(normalized)
        allowlist = self.get_url_allowlist(use_effective_config=False)
        self.set_url_filters(allowlist, denylist)
        self.save(update_fields=['config', 'modified_at'])

    applied = self.apply_crawl_config_filters()

    return {
        'domain': normalized,
        'created': is_new,
        'removed_urls': applied['removed_urls'],
        'deleted_snapshots': applied['deleted_snapshots'],
    }
|
||||
|
||||
def get_system_task(self) -> str | None:
|
||||
urls = self.get_urls_list()
|
||||
if len(urls) != 1:
|
||||
@@ -284,11 +502,13 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
Returns:
|
||||
True if URL was added, False if skipped (duplicate or depth exceeded)
|
||||
"""
|
||||
import json
|
||||
from archivebox.misc.util import fix_url_from_markdown
|
||||
|
||||
url = entry.get('url', '')
|
||||
url = fix_url_from_markdown(str(entry.get('url', '') or '').strip())
|
||||
if not url:
|
||||
return False
|
||||
if not self.url_passes_filters(url):
|
||||
return False
|
||||
|
||||
depth = entry.get('depth', 1)
|
||||
|
||||
@@ -301,20 +521,13 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
return False
|
||||
|
||||
# Check if already in urls (parse existing JSONL entries)
|
||||
existing_urls = set()
|
||||
for line in self.urls.splitlines():
|
||||
if not line.strip():
|
||||
continue
|
||||
try:
|
||||
existing_entry = json.loads(line)
|
||||
existing_urls.add(existing_entry.get('url', ''))
|
||||
except json.JSONDecodeError:
|
||||
existing_urls.add(line.strip())
|
||||
existing_urls = {url for _raw_line, url in self._iter_url_lines() if url}
|
||||
|
||||
if url in existing_urls:
|
||||
return False
|
||||
|
||||
# Append as JSONL
|
||||
entry = {**entry, 'url': url}
|
||||
jsonl_entry = json.dumps(entry)
|
||||
self.urls = (self.urls.rstrip() + '\n' + jsonl_entry).lstrip('\n')
|
||||
self.save(update_fields=['urls', 'modified_at'])
|
||||
@@ -327,15 +540,11 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
Returns:
|
||||
List of newly created Snapshot objects
|
||||
"""
|
||||
import sys
|
||||
import json
|
||||
from archivebox.core.models import Snapshot
|
||||
from archivebox.misc.util import fix_url_from_markdown
|
||||
|
||||
created_snapshots = []
|
||||
|
||||
print(f'[cyan]DEBUG create_snapshots_from_urls: self.urls={repr(self.urls)}[/cyan]', file=sys.stderr)
|
||||
print(f'[cyan]DEBUG create_snapshots_from_urls: lines={self.urls.splitlines()}[/cyan]', file=sys.stderr)
|
||||
|
||||
for line in self.urls.splitlines():
|
||||
if not line.strip():
|
||||
continue
|
||||
@@ -343,13 +552,13 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
# Parse JSONL or plain URL
|
||||
try:
|
||||
entry = json.loads(line)
|
||||
url = entry.get('url', '')
|
||||
url = fix_url_from_markdown(str(entry.get('url', '') or '').strip())
|
||||
depth = entry.get('depth', 0)
|
||||
title = entry.get('title')
|
||||
timestamp = entry.get('timestamp')
|
||||
tags = entry.get('tags', '')
|
||||
except json.JSONDecodeError:
|
||||
url = line.strip()
|
||||
url = fix_url_from_markdown(line.strip())
|
||||
depth = 0
|
||||
title = None
|
||||
timestamp = None
|
||||
@@ -357,6 +566,8 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
|
||||
if not url:
|
||||
continue
|
||||
if not self.url_passes_filters(url):
|
||||
continue
|
||||
|
||||
# Skip if depth exceeds max_depth
|
||||
if depth > self.max_depth:
|
||||
|
||||
@@ -64,6 +64,7 @@ from abx_plugins import get_plugins_dir
|
||||
from django.conf import settings
|
||||
from django.utils.safestring import mark_safe
|
||||
from archivebox.config.constants import CONSTANTS
|
||||
from archivebox.misc.util import fix_url_from_markdown
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from archivebox.machine.models import Process
|
||||
@@ -266,7 +267,7 @@ def run_hook(
|
||||
if process.status == 'exited':
|
||||
records = process.get_records() # Get parsed JSONL output
|
||||
"""
|
||||
from archivebox.machine.models import Process, Machine
|
||||
from archivebox.machine.models import Process, Machine, NetworkInterface
|
||||
from archivebox.config.constants import CONSTANTS
|
||||
import sys
|
||||
|
||||
@@ -280,6 +281,8 @@ def run_hook(
|
||||
|
||||
# Get current machine
|
||||
machine = Machine.current()
|
||||
iface = NetworkInterface.current(refresh=True)
|
||||
machine = iface.machine
|
||||
|
||||
# Auto-detect parent process if not explicitly provided
|
||||
# This enables automatic hierarchy tracking: Worker -> Hook
|
||||
@@ -294,6 +297,7 @@ def run_hook(
|
||||
# Create a failed Process record for hooks that don't exist
|
||||
process = Process.objects.create(
|
||||
machine=machine,
|
||||
iface=iface,
|
||||
parent=parent,
|
||||
process_type=Process.TypeChoices.HOOK,
|
||||
pwd=str(output_dir),
|
||||
@@ -449,6 +453,7 @@ def run_hook(
|
||||
# Create Process record
|
||||
process = Process.objects.create(
|
||||
machine=machine,
|
||||
iface=iface,
|
||||
parent=parent,
|
||||
process_type=Process.TypeChoices.HOOK,
|
||||
pwd=str(output_dir),
|
||||
@@ -458,6 +463,7 @@ def run_hook(
|
||||
|
||||
# Copy the env dict we already built (includes os.environ + all customizations)
|
||||
process.env = env.copy()
|
||||
process.hydrate_binary_from_context(plugin_name=script.parent.name, hook_path=str(script))
|
||||
|
||||
# Save env before launching
|
||||
process.save()
|
||||
@@ -472,6 +478,7 @@ def run_hook(
|
||||
# Create a failed Process record for exceptions
|
||||
process = Process.objects.create(
|
||||
machine=machine,
|
||||
iface=iface,
|
||||
process_type=Process.TypeChoices.HOOK,
|
||||
pwd=str(output_dir),
|
||||
cmd=cmd,
|
||||
@@ -544,6 +551,9 @@ def collect_urls_from_plugins(snapshot_dir: Path) -> List[Dict[str, Any]]:
|
||||
text = urls_file.read_text()
|
||||
for entry in Process.parse_records_from_text(text):
|
||||
if entry.get('url'):
|
||||
entry['url'] = fix_url_from_markdown(str(entry['url']).strip())
|
||||
if not entry['url']:
|
||||
continue
|
||||
# Track which parser plugin found this URL
|
||||
entry['plugin'] = subdir.name
|
||||
urls.append(entry)
|
||||
@@ -615,11 +625,30 @@ def get_enabled_plugins(config: Optional[Dict[str, Any]] = None) -> List[str]:
|
||||
from archivebox.config.configset import get_config
|
||||
config = get_config()
|
||||
|
||||
def normalize_enabled_plugins(value: Any) -> List[str]:
    """Coerce an ENABLED_PLUGINS-style config value into a clean name list.

    Accepts None, a comma-separated string, a JSON-array string, any
    list/tuple/set, or a scalar; returns stripped, non-empty names.
    """
    if value is None:
        return []

    if isinstance(value, str):
        raw = value.strip()
        if not raw:
            return []
        if raw.startswith('['):
            # Looks like a JSON array; fall through to comma-split on failure
            try:
                parsed = json.loads(raw)
            except json.JSONDecodeError:
                parsed = None
            if isinstance(parsed, list):
                return [str(name).strip() for name in parsed if str(name).strip()]
        return [chunk.strip() for chunk in raw.split(',') if chunk.strip()]

    if isinstance(value, (list, tuple, set)):
        return [str(name).strip() for name in value if str(name).strip()]

    single = str(value).strip()
    return [single] if single else []
|
||||
|
||||
# Support explicit ENABLED_PLUGINS override (legacy)
|
||||
if 'ENABLED_PLUGINS' in config:
|
||||
return config['ENABLED_PLUGINS']
|
||||
return normalize_enabled_plugins(config['ENABLED_PLUGINS'])
|
||||
if 'ENABLED_EXTRACTORS' in config:
|
||||
return config['ENABLED_EXTRACTORS']
|
||||
return normalize_enabled_plugins(config['ENABLED_EXTRACTORS'])
|
||||
|
||||
# Filter all plugins by enabled status
|
||||
all_plugins = get_plugins()
|
||||
@@ -1042,6 +1071,14 @@ def process_hook_records(records: List[Dict[str, Any]], overrides: Dict[str, Any
|
||||
if record_type == 'Snapshot':
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
if record.get('url'):
|
||||
record = {
|
||||
**record,
|
||||
'url': fix_url_from_markdown(str(record['url']).strip()),
|
||||
}
|
||||
if not record['url']:
|
||||
continue
|
||||
|
||||
# Check if discovered snapshot exceeds crawl max_depth
|
||||
snapshot_depth = record.get('depth', 0)
|
||||
crawl = overrides.get('crawl')
|
||||
|
||||
@@ -113,7 +113,7 @@ class BinaryAdmin(BaseModelAdmin):
|
||||
sort_fields = ('id', 'created_at', 'machine_info', 'name', 'binprovider', 'version', 'abspath', 'sha256', 'status')
|
||||
search_fields = ('id', 'machine__id', 'name', 'binprovider', 'version', 'abspath', 'sha256')
|
||||
|
||||
readonly_fields = ('created_at', 'modified_at')
|
||||
readonly_fields = ('created_at', 'modified_at', 'output_dir')
|
||||
|
||||
fieldsets = (
|
||||
('Binary Info', {
|
||||
@@ -166,7 +166,7 @@ class ProcessAdmin(BaseModelAdmin):
|
||||
sort_fields = ('id', 'created_at', 'status', 'exit_code', 'pid')
|
||||
search_fields = ('id', 'machine__id', 'binary__name', 'cmd', 'pwd', 'stdout', 'stderr')
|
||||
|
||||
readonly_fields = ('created_at', 'modified_at', 'machine', 'binary', 'iface', 'archiveresult_link')
|
||||
readonly_fields = ('created_at', 'modified_at', 'machine', 'binary_link', 'iface_link', 'archiveresult_link')
|
||||
|
||||
fieldsets = (
|
||||
('Process Info', {
|
||||
@@ -178,7 +178,7 @@ class ProcessAdmin(BaseModelAdmin):
|
||||
'classes': ('card', 'wide'),
|
||||
}),
|
||||
('Execution', {
|
||||
'fields': ('binary', 'iface', 'pid', 'exit_code', 'url'),
|
||||
'fields': ('binary_link', 'iface_link', 'pid', 'exit_code', 'url'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Timing', {
|
||||
@@ -216,6 +216,21 @@ class ProcessAdmin(BaseModelAdmin):
|
||||
process.binary.id, process.binary.name, process.binary.version,
|
||||
)
|
||||
|
||||
@admin.display(description='Binary', ordering='binary__name')
def binary_link(self, process):
    """Alias column: render the same link markup as binary_info.

    Exists so the 'binary_link' readonly field name used in fieldsets
    resolves to the existing binary_info renderer.
    """
    return self.binary_info(process)
|
||||
|
||||
@admin.display(description='Network Interface', ordering='iface__id')
def iface_link(self, process):
    """Link to the NetworkInterface admin page, or '-' when none is recorded."""
    iface = process.iface
    if not iface:
        return '-'
    # Prefer the interface name, then public IP, then local IP as the label
    label = iface.iface or iface.ip_public or iface.ip_local
    return format_html(
        '<a href="/admin/machine/networkinterface/{}/change"><code>{}</code> {}</a>',
        iface.id,
        str(iface.id)[:8],
        label,
    )
|
||||
|
||||
@admin.display(description='ArchiveResult')
|
||||
def archiveresult_link(self, process):
|
||||
if not hasattr(process, 'archiveresult'):
|
||||
|
||||
@@ -49,6 +49,89 @@ BINARY_RECHECK_INTERVAL = 1 * 30 * 60
|
||||
PROCESS_RECHECK_INTERVAL = 60 # Re-validate every 60 seconds
|
||||
PID_REUSE_WINDOW = timedelta(hours=24) # Max age for considering a PID match valid
|
||||
START_TIME_TOLERANCE = 5.0 # Seconds tolerance for start time matching
|
||||
LEGACY_MACHINE_CONFIG_KEYS = frozenset({"CHROMIUM_VERSION"})
|
||||
|
||||
|
||||
def _find_existing_binary_for_reference(machine: 'Machine', reference: str) -> 'Binary | None':
    """Resolve a binary reference (abspath or name) to an existing Binary row.

    Lookup order: exact abspath match, then basename match, then raw
    name match — always scoped to *machine*, preferring the most
    recently modified row.  Returns None for blank references or when
    nothing matches.
    """
    reference = str(reference or '').strip()
    if not reference:
        return None

    candidates = Binary.objects.filter(machine=machine)

    def newest(queryset):
        # Most-recently-modified row wins when several match
        return queryset.order_by('-modified_at').first()

    by_abspath = newest(candidates.filter(abspath=reference))
    if by_abspath:
        return by_abspath

    basename = Path(reference).name
    if basename:
        by_basename = newest(candidates.filter(name=basename))
        if by_basename:
            return by_basename

    return newest(candidates.filter(name=reference))
|
||||
|
||||
|
||||
def _get_process_binary_env_keys(plugin_name: str, hook_path: str, env: dict[str, Any] | None) -> list[str]:
    """Collect env-var names that may point at the binary a hook will run.

    Keys are returned in priority order, and only when *env* sets a
    non-empty value for them:
      1. <PLUGIN>_BINARY for the plugin itself
      2. *_BINARY keys declared in the plugin's config schema
         (plugin-specific key first, NODE/CHROME variants last)
      3. <BACKEND>_BINARY when the plugin is the active search backend
      4. node binaries for .js hooks
    """
    env = env or {}
    plugin_name = str(plugin_name or '').strip()
    hook_path = str(hook_path or '').strip()
    plugin_key = plugin_name.upper().replace('-', '_')

    ordered_keys: list[str] = []
    already_added: set[str] = set()

    def register(candidate: str) -> None:
        # Record only new keys that actually have a truthy value in env
        if candidate and candidate not in already_added and env.get(candidate):
            already_added.add(candidate)
            ordered_keys.append(candidate)

    if plugin_key:
        register(f'{plugin_key}_BINARY')

    try:
        from archivebox.hooks import discover_plugin_configs

        schema_props = discover_plugin_configs().get(plugin_name, {}).get('properties') or {}
        schema_keys = [prop for prop in schema_props if prop.endswith('_BINARY')]
    except Exception:
        # Schema discovery is best-effort; fall back to the heuristics below
        schema_keys = []

    # Plugin's own *_BINARY first; generic NODE/CHROME variants last
    schema_keys.sort(key=lambda prop: (
        prop != f'{plugin_key}_BINARY',
        prop.endswith('_NODE_BINARY'),
        prop.endswith('_CHROME_BINARY'),
        prop,
    ))
    for prop in schema_keys:
        register(prop)

    if plugin_name.startswith('search_backend_'):
        backend = plugin_name.removeprefix('search_backend_').upper().replace('-', '_')
        active_engine = str(env.get('SEARCH_BACKEND_ENGINE') or '').strip().upper().replace('-', '_')
        # Only the currently configured search engine's binary is relevant
        if backend and backend == active_engine:
            register(f'{backend}_BINARY')

    if Path(hook_path).suffix.lower() == '.js':
        # JS hooks need a node interpreter; prefer a plugin-specific override
        if plugin_key:
            register(f'{plugin_key}_NODE_BINARY')
        register('NODE_BINARY')

    return ordered_keys
|
||||
|
||||
|
||||
def _sanitize_machine_config(config: dict[str, Any] | None) -> dict[str, Any]:
    """Return a copy of *config* with legacy machine config keys stripped.

    Non-dict inputs (None, malformed payloads) collapse to an empty dict.
    """
    if not isinstance(config, dict):
        return {}
    return {
        key: value
        for key, value in config.items()
        if key not in LEGACY_MACHINE_CONFIG_KEYS
    }
|
||||
|
||||
|
||||
class MachineManager(models.Manager):
|
||||
@@ -89,13 +172,13 @@ class Machine(ModelWithHealthStats):
|
||||
global _CURRENT_MACHINE
|
||||
if _CURRENT_MACHINE:
|
||||
if timezone.now() < _CURRENT_MACHINE.modified_at + timedelta(seconds=MACHINE_RECHECK_INTERVAL):
|
||||
return cls._hydrate_config_from_sibling(_CURRENT_MACHINE)
|
||||
return cls._sanitize_config(cls._hydrate_config_from_sibling(_CURRENT_MACHINE))
|
||||
_CURRENT_MACHINE = None
|
||||
_CURRENT_MACHINE, _ = cls.objects.update_or_create(
|
||||
guid=get_host_guid(),
|
||||
defaults={'hostname': socket.gethostname(), **get_os_info(), **get_vm_info(), 'stats': get_host_stats()},
|
||||
)
|
||||
return cls._hydrate_config_from_sibling(_CURRENT_MACHINE)
|
||||
return cls._sanitize_config(cls._hydrate_config_from_sibling(_CURRENT_MACHINE))
|
||||
|
||||
@classmethod
|
||||
def _hydrate_config_from_sibling(cls, machine: 'Machine') -> 'Machine':
|
||||
@@ -115,6 +198,15 @@ class Machine(ModelWithHealthStats):
|
||||
machine.save(update_fields=['config', 'modified_at'])
|
||||
return machine
|
||||
|
||||
@classmethod
def _sanitize_config(cls, machine: 'Machine') -> 'Machine':
    """Strip legacy keys from machine.config, persisting only when it changed."""
    cleaned = _sanitize_machine_config(machine.config)
    if cleaned != (machine.config or {}):
        machine.config = cleaned
        machine.save(update_fields=['config', 'modified_at'])
    return machine
|
||||
|
||||
def to_json(self) -> dict:
|
||||
"""
|
||||
Convert Machine model instance to a JSON-serializable dict.
|
||||
@@ -152,11 +244,10 @@ class Machine(ModelWithHealthStats):
|
||||
Returns:
|
||||
Machine instance or None
|
||||
"""
|
||||
config_patch = record.get('config')
|
||||
if isinstance(config_patch, dict) and config_patch:
|
||||
config_patch = _sanitize_machine_config(record.get('config'))
|
||||
if config_patch:
|
||||
machine = Machine.current()
|
||||
if not machine.config:
|
||||
machine.config = {}
|
||||
machine.config = _sanitize_machine_config(machine.config)
|
||||
machine.config.update(config_patch)
|
||||
machine.save(update_fields=['config'])
|
||||
return machine
|
||||
@@ -194,13 +285,17 @@ class NetworkInterface(ModelWithHealthStats):
|
||||
unique_together = (('machine', 'ip_public', 'ip_local', 'mac_address', 'dns_server'),)
|
||||
|
||||
@classmethod
|
||||
def current(cls) -> 'NetworkInterface':
|
||||
def current(cls, refresh: bool = False) -> 'NetworkInterface':
|
||||
global _CURRENT_INTERFACE
|
||||
machine = Machine.current()
|
||||
if _CURRENT_INTERFACE:
|
||||
if timezone.now() < _CURRENT_INTERFACE.modified_at + timedelta(seconds=NETWORK_INTERFACE_RECHECK_INTERVAL):
|
||||
if (
|
||||
not refresh
|
||||
and _CURRENT_INTERFACE.machine_id == machine.id
|
||||
and timezone.now() < _CURRENT_INTERFACE.modified_at + timedelta(seconds=NETWORK_INTERFACE_RECHECK_INTERVAL)
|
||||
):
|
||||
return _CURRENT_INTERFACE
|
||||
_CURRENT_INTERFACE = None
|
||||
machine = Machine.current()
|
||||
net_info = get_host_network()
|
||||
_CURRENT_INTERFACE, _ = cls.objects.update_or_create(
|
||||
machine=machine, ip_public=net_info.pop('ip_public'), ip_local=net_info.pop('ip_local'),
|
||||
@@ -747,14 +842,17 @@ class ProcessManager(models.Manager):
|
||||
|
||||
Called during migration and when creating new ArchiveResults.
|
||||
"""
|
||||
iface = kwargs.get('iface') or NetworkInterface.current()
|
||||
|
||||
# Defaults from ArchiveResult if not provided
|
||||
defaults = {
|
||||
'machine': Machine.current(),
|
||||
'machine': iface.machine,
|
||||
'pwd': kwargs.get('pwd') or str(archiveresult.snapshot.output_dir / archiveresult.plugin),
|
||||
'cmd': kwargs.get('cmd') or [],
|
||||
'status': 'queued',
|
||||
'timeout': kwargs.get('timeout', 120),
|
||||
'env': kwargs.get('env', {}),
|
||||
'iface': iface,
|
||||
}
|
||||
defaults.update(kwargs)
|
||||
|
||||
@@ -971,6 +1069,28 @@ class Process(models.Model):
|
||||
record['timeout'] = self.timeout
|
||||
return record
|
||||
|
||||
def hydrate_binary_from_context(self, *, plugin_name: str = '', hook_path: str = '') -> 'Binary | None':
    """Best-effort link this Process to an existing Binary row.

    Candidate references are gathered in priority order from the
    plugin's *_BINARY env vars, then from cmd[0].  The first reference
    that resolves to a known Binary on this machine is assigned to
    self.binary.  Does not save; returns the matched Binary or None.
    """
    machine = self.machine if self.machine_id else Machine.current()

    candidates: list[str] = []

    def _remember(raw) -> None:
        # Normalize and de-duplicate while preserving priority order
        cleaned = str(raw or '').strip()
        if cleaned and cleaned not in candidates:
            candidates.append(cleaned)

    for env_key in _get_process_binary_env_keys(plugin_name, hook_path, self.env):
        _remember(self.env.get(env_key))

    if self.cmd:
        _remember(self.cmd[0])

    for candidate in candidates:
        matched = _find_existing_binary_for_reference(machine, candidate)
        if matched:
            self.binary = matched
            return matched
    return None
|
||||
|
||||
@classmethod
|
||||
def parse_records_from_text(cls, text: str) -> list[dict]:
|
||||
"""Parse JSONL records from raw text using the shared JSONL parser."""
|
||||
@@ -1044,6 +1164,7 @@ class Process(models.Model):
|
||||
|
||||
current_pid = os.getpid()
|
||||
machine = Machine.current()
|
||||
iface = NetworkInterface.current()
|
||||
|
||||
# Check cache validity
|
||||
if _CURRENT_PROCESS:
|
||||
@@ -1053,6 +1174,9 @@ class Process(models.Model):
|
||||
and _CURRENT_PROCESS.machine_id == machine.id
|
||||
and timezone.now() < _CURRENT_PROCESS.modified_at + timedelta(seconds=PROCESS_RECHECK_INTERVAL)
|
||||
):
|
||||
if _CURRENT_PROCESS.iface_id != iface.id:
|
||||
_CURRENT_PROCESS.iface = iface
|
||||
_CURRENT_PROCESS.save(update_fields=['iface', 'modified_at'])
|
||||
_CURRENT_PROCESS.ensure_log_files()
|
||||
return _CURRENT_PROCESS
|
||||
_CURRENT_PROCESS = None
|
||||
@@ -1080,6 +1204,9 @@ class Process(models.Model):
|
||||
db_start_time = existing.started_at.timestamp()
|
||||
if abs(db_start_time - os_start_time) < START_TIME_TOLERANCE:
|
||||
_CURRENT_PROCESS = existing
|
||||
if existing.iface_id != iface.id:
|
||||
existing.iface = iface
|
||||
existing.save(update_fields=['iface', 'modified_at'])
|
||||
_CURRENT_PROCESS.ensure_log_files()
|
||||
return existing
|
||||
|
||||
@@ -1112,6 +1239,7 @@ class Process(models.Model):
|
||||
pid=current_pid,
|
||||
started_at=started_at,
|
||||
status=cls.StatusChoices.RUNNING,
|
||||
iface=iface,
|
||||
)
|
||||
_CURRENT_PROCESS.ensure_log_files()
|
||||
return _CURRENT_PROCESS
|
||||
@@ -1176,7 +1304,9 @@ class Process(models.Model):
|
||||
|
||||
if 'supervisord' in argv_str:
|
||||
return cls.TypeChoices.SUPERVISORD
|
||||
elif 'archivebox run' in argv_str or 'runner_watch' in argv_str:
|
||||
elif 'runner_watch' in argv_str:
|
||||
return cls.TypeChoices.WORKER
|
||||
elif 'archivebox run' in argv_str:
|
||||
return cls.TypeChoices.ORCHESTRATOR
|
||||
elif 'archivebox' in argv_str:
|
||||
return cls.TypeChoices.CLI
|
||||
@@ -1321,14 +1451,17 @@ class Process(models.Model):
|
||||
if self.cmd:
|
||||
try:
|
||||
os_cmdline = os_proc.cmdline()
|
||||
# Check if first arg (binary) matches
|
||||
if os_cmdline and self.cmd:
|
||||
os_binary = os_cmdline[0] if os_cmdline else ''
|
||||
db_binary = self.cmd[0] if self.cmd else ''
|
||||
# Match by basename (handles /usr/bin/python3 vs python3)
|
||||
if os_binary and db_binary:
|
||||
if Path(os_binary).name != Path(db_binary).name:
|
||||
return None # Different binary, PID reused
|
||||
if db_binary:
|
||||
db_binary_name = Path(db_binary).name
|
||||
cmd_matches = any(
|
||||
arg == db_binary or Path(arg).name == db_binary_name
|
||||
for arg in os_cmdline
|
||||
if arg
|
||||
)
|
||||
if not cmd_matches:
|
||||
return None # Different command, PID reused
|
||||
except (psutil.AccessDenied, psutil.ZombieProcess):
|
||||
pass # Can't check cmdline, trust start time match
|
||||
|
||||
|
||||
@@ -4,6 +4,7 @@ import re
|
||||
import requests
|
||||
import json as pyjson
|
||||
import http.cookiejar
|
||||
from dateparser import parse as dateparser
|
||||
|
||||
from typing import List, Optional, Any, Callable
|
||||
from pathlib import Path
|
||||
@@ -13,7 +14,6 @@ from hashlib import sha256
|
||||
from urllib.parse import urlparse, quote, unquote
|
||||
from html import escape, unescape
|
||||
from datetime import datetime, timezone
|
||||
from dateparser import parse as dateparser
|
||||
from requests.exceptions import RequestException, ReadTimeout
|
||||
|
||||
from base32_crockford import encode as base32_encode
|
||||
@@ -122,9 +122,35 @@ def fix_url_from_markdown(url_str: str) -> str:
|
||||
|
||||
return url_str
|
||||
|
||||
def split_comma_separated_urls(url: str):
|
||||
offset = 0
|
||||
while True:
|
||||
http_index = url.find('http://', 1)
|
||||
https_index = url.find('https://', 1)
|
||||
next_indices = [idx for idx in (http_index, https_index) if idx != -1]
|
||||
if not next_indices:
|
||||
yield offset, url
|
||||
return
|
||||
|
||||
next_index = min(next_indices)
|
||||
if url[next_index - 1] != ',':
|
||||
yield offset, url
|
||||
return
|
||||
|
||||
yield offset, url[:next_index - 1]
|
||||
offset += next_index
|
||||
url = url[next_index:]
|
||||
|
||||
def find_all_urls(urls_str: str):
|
||||
for url in re.findall(URL_REGEX, urls_str):
|
||||
yield fix_url_from_markdown(url)
|
||||
skipped_starts = set()
|
||||
for match in re.finditer(URL_REGEX, urls_str):
|
||||
if match.start() in skipped_starts:
|
||||
continue
|
||||
|
||||
for offset, url in split_comma_separated_urls(fix_url_from_markdown(match.group(1))):
|
||||
if offset:
|
||||
skipped_starts.add(match.start() + offset)
|
||||
yield url
|
||||
|
||||
|
||||
def is_static_file(url: str):
|
||||
@@ -214,7 +240,25 @@ def parse_date(date: Any) -> datetime | None:
|
||||
date = str(date)
|
||||
|
||||
if isinstance(date, str):
|
||||
parsed_date = dateparser(date, settings={'TIMEZONE': 'UTC'})
|
||||
normalized = date.strip()
|
||||
if not normalized:
|
||||
raise ValueError(f'Tried to parse invalid date string! {date}')
|
||||
|
||||
try:
|
||||
return datetime.fromtimestamp(float(normalized), tz=timezone.utc)
|
||||
except (TypeError, ValueError, OSError):
|
||||
pass
|
||||
|
||||
try:
|
||||
iso_date = normalized.replace('Z', '+00:00')
|
||||
parsed_date = datetime.fromisoformat(iso_date)
|
||||
if parsed_date.tzinfo is None:
|
||||
return parsed_date.replace(tzinfo=timezone.utc)
|
||||
return parsed_date.astimezone(timezone.utc)
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
parsed_date = dateparser(normalized, settings={'TIMEZONE': 'UTC'})
|
||||
if parsed_date is None:
|
||||
raise ValueError(f'Tried to parse invalid date string! {date}')
|
||||
return parsed_date.astimezone(timezone.utc)
|
||||
@@ -408,6 +452,7 @@ assert fix_url_from_markdown('https://wikipedia.org/en/some_article_(Disambiguat
|
||||
|
||||
URL_REGEX_TESTS = [
|
||||
('https://example.com', ['https://example.com']),
|
||||
('https://sweeting.me,https://google.com', ['https://sweeting.me', 'https://google.com']),
|
||||
('http://abc-file234example.com/abc?def=abc&23423=sdfsdf#abc=234&234=a234', ['http://abc-file234example.com/abc?def=abc&23423=sdfsdf#abc=234&234=a234']),
|
||||
|
||||
('https://twitter.com/share?url=https://akaao.success-corp.co.jp&text=ア@サ!ト&hashtags=ア%オ,元+ア.ア-オ_イ*シ$ロ abc', ['https://twitter.com/share?url=https://akaao.success-corp.co.jp&text=ア@サ!ト&hashtags=ア%オ,元+ア.ア-オ_イ*シ$ロ', 'https://akaao.success-corp.co.jp&text=ア@サ!ト&hashtags=ア%オ,元+ア.ア-オ_イ*シ$ロ']),
|
||||
|
||||
@@ -1,2 +1,169 @@
|
||||
__package__ = "archivebox.personas"
|
||||
|
||||
# Register your models here.
|
||||
import shutil
|
||||
|
||||
from django.contrib import admin, messages
|
||||
from django.utils.html import format_html, format_html_join
|
||||
|
||||
from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin
|
||||
from archivebox.personas.forms import PersonaAdminForm
|
||||
from archivebox.personas.importers import discover_local_browser_profiles
|
||||
from archivebox.personas.models import Persona
|
||||
|
||||
|
||||
class PersonaAdmin(ConfigEditorMixin, BaseModelAdmin):
|
||||
form = PersonaAdminForm
|
||||
change_form_template = "admin/personas/persona/change_form.html"
|
||||
|
||||
list_display = ("name", "created_by", "created_at", "chrome_profile_state", "cookies_state", "auth_state")
|
||||
search_fields = ("name", "created_by__username")
|
||||
list_filter = ("created_by",)
|
||||
ordering = ["name"]
|
||||
list_per_page = 100
|
||||
readonly_fields = ("id", "created_at", "persona_paths", "import_artifact_status")
|
||||
|
||||
add_fieldsets = (
|
||||
("Persona", {
|
||||
"fields": ("name", "created_by"),
|
||||
"classes": ("card",),
|
||||
}),
|
||||
("Browser Import", {
|
||||
"fields": (
|
||||
"import_mode",
|
||||
"import_discovered_profile",
|
||||
"import_source",
|
||||
"import_profile_name",
|
||||
"import_copy_profile",
|
||||
"import_extract_cookies",
|
||||
"import_capture_storage",
|
||||
),
|
||||
"classes": ("card", "wide"),
|
||||
}),
|
||||
("Advanced", {
|
||||
"fields": ("config",),
|
||||
"classes": ("card", "wide"),
|
||||
}),
|
||||
)
|
||||
|
||||
change_fieldsets = add_fieldsets + (
|
||||
("Artifacts", {
|
||||
"fields": ("persona_paths", "import_artifact_status"),
|
||||
"classes": ("card", "wide"),
|
||||
}),
|
||||
("Timestamps", {
|
||||
"fields": ("id", "created_at"),
|
||||
"classes": ("card",),
|
||||
}),
|
||||
)
|
||||
|
||||
@admin.display(description="Chrome Profile")
|
||||
def chrome_profile_state(self, obj: Persona) -> str:
|
||||
return "yes" if (obj.path / "chrome_user_data").exists() else "no"
|
||||
|
||||
@admin.display(description="cookies.txt")
|
||||
def cookies_state(self, obj: Persona) -> str:
|
||||
return "yes" if obj.COOKIES_FILE else "no"
|
||||
|
||||
@admin.display(description="auth.json")
|
||||
def auth_state(self, obj: Persona) -> str:
|
||||
return "yes" if obj.AUTH_STORAGE_FILE else "no"
|
||||
|
||||
@admin.display(description="Persona Paths")
|
||||
def persona_paths(self, obj: Persona) -> str:
|
||||
return format_html(
|
||||
"<div class='abx-persona-path-list'>"
|
||||
"<div><strong>Persona root</strong><code>{}</code></div>"
|
||||
"<div><strong>chrome_user_data</strong><code>{}</code></div>"
|
||||
"<div><strong>chrome_extensions</strong><code>{}</code></div>"
|
||||
"<div><strong>chrome_downloads</strong><code>{}</code></div>"
|
||||
"<div><strong>cookies.txt</strong><code>{}</code></div>"
|
||||
"<div><strong>auth.json</strong><code>{}</code></div>"
|
||||
"</div>",
|
||||
obj.path,
|
||||
obj.CHROME_USER_DATA_DIR,
|
||||
obj.CHROME_EXTENSIONS_DIR,
|
||||
obj.CHROME_DOWNLOADS_DIR,
|
||||
obj.COOKIES_FILE or (obj.path / "cookies.txt"),
|
||||
obj.AUTH_STORAGE_FILE or (obj.path / "auth.json"),
|
||||
)
|
||||
|
||||
@admin.display(description="Import Artifacts")
def import_artifact_status(self, obj: Persona) -> str:
    """Render a present/missing status row for each importable artifact."""
    # Each entry is (display label, is-present flag, path shown to the user);
    # for cookies/auth the conventional in-persona path is shown even when
    # the file does not exist yet.
    entries = [
        ("Browser profile", (obj.path / "chrome_user_data").exists(), obj.CHROME_USER_DATA_DIR),
        ("cookies.txt", bool(obj.COOKIES_FILE), obj.COOKIES_FILE or (obj.path / "cookies.txt")),
        ("auth.json", bool(obj.AUTH_STORAGE_FILE), obj.AUTH_STORAGE_FILE or (obj.path / "auth.json")),
    ]
    return format_html(
        "<div class='abx-persona-artifacts'>{}</div>",
        format_html_join(
            "",
            "<div class='abx-persona-artifact'><strong>{}</strong><span class='{}'>{}</span><code>{}</code></div>",
            (
                (
                    label,
                    # CSS modifier class + text both derive from the same flag.
                    "abx-artifact-state abx-artifact-state--yes" if enabled else "abx-artifact-state abx-artifact-state--no",
                    "present" if enabled else "missing",
                    path,
                )
                for label, enabled, path in entries
            ),
        ),
    )
|
||||
|
||||
def get_fieldsets(self, request, obj=None):
    """Use the richer change-view fieldsets when editing an existing row."""
    if obj:
        return self.change_fieldsets
    return self.add_fieldsets
|
||||
|
||||
def render_change_form(self, request, context, add=False, change=False, form_url="", obj=None):
    """Expose the count of auto-discovered local browser profiles to the template."""
    context["detected_profile_count"] = len(discover_local_browser_profiles())
    return super().render_change_form(request, context, add=add, change=change, form_url=form_url, obj=obj)
|
||||
|
||||
def save_model(self, request, obj, form, change):
    """Save the Persona, rename its directory on a name change, then run any
    browser-state import the form requested and surface results as messages.
    """
    # Detect a rename by comparing against the previously-saved row; the
    # filesystem move is deferred until after the DB save succeeds.
    old_path = None
    new_path = None
    if change:
        previous = Persona.objects.get(pk=obj.pk)
        if previous.name != obj.name:
            old_path = previous.path
            new_path = obj.path

    super().save_model(request, obj, form, change)

    # Move the on-disk persona dir to match the new name; refuse to clobber
    # an existing destination.
    if old_path and new_path and old_path != new_path and old_path.exists():
        if new_path.exists():
            raise FileExistsError(f"Cannot rename Persona directory because the destination already exists: {new_path}")
        shutil.move(str(old_path), str(new_path))

    obj.ensure_dirs()

    # apply_import() returns None when the form requested no import at all.
    import_result = form.apply_import(obj)
    if import_result is None:
        return

    # Summarize which artifacts were actually produced for the success banner.
    completed_actions = []
    if import_result.profile_copied:
        completed_actions.append("profile copied")
    if import_result.cookies_imported:
        completed_actions.append("cookies.txt generated")
    if import_result.storage_captured:
        completed_actions.append("auth.json captured")
    if import_result.user_agent_imported:
        completed_actions.append("USER_AGENT copied")

    if completed_actions:
        messages.success(
            request,
            f'Imported {", ".join(completed_actions)} from {import_result.source.display_label}.',
        )
    else:
        messages.warning(
            request,
            f"Persona saved, but no browser artifacts were imported from {import_result.source.display_label}.",
        )

    # Non-fatal issues collected during the import are shown individually.
    for warning in import_result.warnings:
        messages.warning(request, warning)
|
||||
|
||||
|
||||
def register_admin(admin_site: admin.AdminSite) -> None:
    """Register the Persona model admin with the given admin site."""
    admin_site.register(Persona, PersonaAdmin)
|
||||
|
||||
210
archivebox/personas/export_browser_state.js
Normal file
210
archivebox/personas/export_browser_state.js
Normal file
@@ -0,0 +1,210 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* Export cookies and open-tab storage from a Chromium profile or live CDP URL.
|
||||
*
|
||||
* Environment variables:
|
||||
* ARCHIVEBOX_ABX_PLUGINS_DIR Absolute path to abx_plugins/plugins
|
||||
* CHROME_USER_DATA_DIR Local Chromium user-data directory to launch
|
||||
* CHROME_CDP_URL Existing browser CDP URL to attach to
|
||||
* COOKIES_OUTPUT_FILE Optional output path for Netscape cookies.txt
|
||||
* AUTH_STORAGE_OUTPUT_FILE Optional output path for auth.json
|
||||
* CHROME_BINARY Optional browser binary override
|
||||
* NODE_MODULES_DIR Optional node_modules path for puppeteer-core
|
||||
*/
|
||||
|
||||
const fs = require('fs');
const os = require('os');
const path = require('path');

// abx plugin helpers live outside node_modules, so their location must be
// injected via environment variable.
const pluginsDir = process.env.ARCHIVEBOX_ABX_PLUGINS_DIR || process.env.ABX_PLUGINS_DIR;
if (!pluginsDir) {
    console.error('ARCHIVEBOX_ABX_PLUGINS_DIR is required');
    process.exit(1);
}

const baseUtils = require(path.join(pluginsDir, 'base', 'utils.js'));
// Must run before requiring puppeteer-core so module resolution can find it
// (optionally via NODE_MODULES_DIR, per the header comment).
baseUtils.ensureNodeModuleResolution(module);

const chromeUtils = require(path.join(pluginsDir, 'chrome', 'chrome_utils.js'));
const puppeteer = require('puppeteer-core');
|
||||
|
||||
/**
 * Serialize one cookie object into a single Netscape cookies.txt line.
 *
 * NOTE(review): CDP `Storage.getCookies` results presumably do not carry a
 * `hostOnly` field (that is a browser-extension API property), so the
 * `!cookie.hostOnly` branch likely always fires and every bare domain gets
 * a leading dot — confirm whether host-only cookies should stay dot-free.
 */
function cookieToNetscape(cookie) {
    const domain = (!cookie.domain.startsWith('.') && !cookie.hostOnly)
        ? '.' + cookie.domain
        : cookie.domain;

    const fields = [
        domain,
        domain.startsWith('.') ? 'TRUE' : 'FALSE',   // includeSubdomains flag
        cookie.path || '/',
        cookie.secure ? 'TRUE' : 'FALSE',
        cookie.expires && cookie.expires > 0 ? Math.floor(cookie.expires).toString() : '0',
        cookie.name,
        cookie.value,
    ];
    return fields.join('\t');
}
|
||||
|
||||
/**
 * Write the given cookies to `outputPath` in Netscape cookies.txt format,
 * creating parent directories as needed.
 */
function writeCookiesFile(cookies, outputPath) {
    // Standard header expected by curl/wget cookie-jar parsers.
    const lines = [
        '# Netscape HTTP Cookie File',
        '# https://curl.se/docs/http-cookies.html',
        '# This file was generated by ArchiveBox persona cookie extraction',
        '#',
        '# Format: domain\\tincludeSubdomains\\tpath\\tsecure\\texpiry\\tname\\tvalue',
        '',
    ];

    for (const cookie of cookies) {
        lines.push(cookieToNetscape(cookie));
    }

    fs.mkdirSync(path.dirname(outputPath), { recursive: true });
    fs.writeFileSync(outputPath, lines.join('\n') + '\n');
}
|
||||
|
||||
/**
 * Collect localStorage/sessionStorage from every inspectable open tab,
 * keyed by origin. Origins with no entries are omitted entirely.
 */
async function collectStorage(browser) {
    const localStorage = {};
    const sessionStorage = {};
    const pages = await browser.pages();

    for (const page of pages) {
        try {
            const url = page.url();
            if (!url || url === 'about:blank') continue;
            // Skip internal browser pages, which carry no user-relevant storage.
            if (url.startsWith('chrome:') || url.startsWith('edge:') || url.startsWith('devtools:')) continue;

            // Read both storages inside the page context in one round-trip.
            const payload = await page.evaluate(() => ({
                origin: window.location.origin,
                localStorage: Object.fromEntries(Object.entries(window.localStorage)),
                sessionStorage: Object.fromEntries(Object.entries(window.sessionStorage)),
            }));

            // Opaque origins are reported as the string 'null' — skip them.
            if (!payload.origin || payload.origin === 'null') continue;
            if (Object.keys(payload.localStorage || {}).length > 0) {
                localStorage[payload.origin] = payload.localStorage;
            }
            if (Object.keys(payload.sessionStorage || {}).length > 0) {
                sessionStorage[payload.origin] = payload.sessionStorage;
            }
        } catch (error) {
            // Ignore pages that cannot be inspected via evaluate().
        }
    }

    return { localStorage, sessionStorage };
}
|
||||
|
||||
/**
 * Open a browser connection from either a live CDP URL (CHROME_CDP_URL) or
 * a local profile dir (CHROME_USER_DATA_DIR, launched headless).
 *
 * Returns { browser, cleanup, sourceDescription }; cleanup() disconnects and,
 * for the launched case, also kills the browser and removes its temp dir.
 */
async function openBrowser() {
    // Preferred path: attach to an already-running browser via CDP.
    const cdpUrl = process.env.CHROME_CDP_URL || '';
    if (cdpUrl) {
        const browser = await chromeUtils.connectToBrowserEndpoint(puppeteer, cdpUrl, { defaultViewport: null });
        return {
            browser,
            async cleanup() {
                // Only disconnect — we do not own the remote browser's lifetime.
                try {
                    await browser.disconnect();
                } catch (error) {}
            },
            sourceDescription: cdpUrl,
        };
    }

    const userDataDir = process.env.CHROME_USER_DATA_DIR;
    if (!userDataDir) {
        throw new Error('Either CHROME_USER_DATA_DIR or CHROME_CDP_URL is required');
    }
    if (!fs.existsSync(userDataDir)) {
        throw new Error(`User data directory does not exist: ${userDataDir}`);
    }

    // Launch our own headless browser against the given profile; scratch
    // output goes into a temp dir that cleanup() removes.
    const outputDir = fs.mkdtempSync(path.join(os.tmpdir(), 'abx-browser-state-'));
    const binary = process.env.CHROME_BINARY || chromeUtils.findAnyChromiumBinary();
    if (!binary) {
        throw new Error('Could not find a Chromium binary for browser state export');
    }

    const launched = await chromeUtils.launchChromium({
        binary,
        outputDir,
        userDataDir,
        headless: true,
        killZombies: false,
    });

    if (!launched.success) {
        throw new Error(launched.error || 'Chrome launch failed');
    }

    const browser = await chromeUtils.connectToBrowserEndpoint(puppeteer, launched.cdpUrl, { defaultViewport: null });

    return {
        browser,
        async cleanup() {
            // Best-effort teardown in order: disconnect, kill process, rm temp dir.
            // Each step is wrapped so a failure cannot skip the later ones.
            try {
                await browser.disconnect();
            } catch (error) {}
            try {
                await chromeUtils.killChrome(launched.pid, outputDir);
            } catch (error) {}
            try {
                fs.rmSync(outputDir, { recursive: true, force: true });
            } catch (error) {}
        },
        sourceDescription: userDataDir,
    };
}
|
||||
|
||||
/**
 * Entry point: export cookies (COOKIES_OUTPUT_FILE) and/or an auth.json
 * snapshot (AUTH_STORAGE_OUTPUT_FILE) from the browser opened by openBrowser().
 */
async function main() {
    const cookiesOutput = process.env.COOKIES_OUTPUT_FILE || '';
    const authOutput = process.env.AUTH_STORAGE_OUTPUT_FILE || '';
    if (!cookiesOutput && !authOutput) {
        throw new Error('COOKIES_OUTPUT_FILE or AUTH_STORAGE_OUTPUT_FILE is required');
    }

    const { browser, cleanup, sourceDescription } = await openBrowser();

    try {
        // Storage.getCookies over a raw CDP session returns cookies for all
        // origins, not just the current page.
        const session = await browser.target().createCDPSession();
        const browserVersion = await session.send('Browser.getVersion');
        const cookieResult = await session.send('Storage.getCookies');
        const cookies = cookieResult?.cookies || [];
        const { localStorage, sessionStorage } = await collectStorage(browser);
        const userAgent = browserVersion?.userAgent || '';

        if (cookiesOutput) {
            writeCookiesFile(cookies, cookiesOutput);
        }

        if (authOutput) {
            fs.mkdirSync(path.dirname(authOutput), { recursive: true });
            // auth.json bundles cookies + per-origin storage + the UA string
            // so a persona can later replay the same identity.
            fs.writeFileSync(
                authOutput,
                JSON.stringify(
                    {
                        TYPE: 'auth',
                        SOURCE: sourceDescription,
                        captured_at: new Date().toISOString(),
                        user_agent: userAgent,
                        cookies,
                        localStorage,
                        sessionStorage,
                    },
                    null,
                    2,
                ) + '\n',
            );
        }

        // Progress goes to stderr so stdout stays clean for callers.
        console.error(
            `[+] Exported ${cookies.length} cookies` +
            `${authOutput ? ` and ${Object.keys(localStorage).length + Object.keys(sessionStorage).length} storage origins` : ''}` +
            `${userAgent ? ' with browser USER_AGENT' : ''}` +
            ` from ${sourceDescription}`,
        );
    } finally {
        await cleanup();
    }
}
|
||||
|
||||
// Top-level invocation: surface any failure on stderr and exit non-zero.
main().catch((error) => {
    console.error(`ERROR: ${error.message}`);
    process.exit(1);
});
|
||||
176
archivebox/personas/forms.py
Normal file
176
archivebox/personas/forms.py
Normal file
@@ -0,0 +1,176 @@
|
||||
__package__ = "archivebox.personas"
|
||||
|
||||
from typing import Any
|
||||
|
||||
from django import forms
|
||||
from django.utils.safestring import mark_safe
|
||||
|
||||
from archivebox.personas.importers import (
|
||||
PersonaImportResult,
|
||||
PersonaImportSource,
|
||||
discover_local_browser_profiles,
|
||||
import_persona_from_source,
|
||||
resolve_custom_import_source,
|
||||
validate_persona_name,
|
||||
)
|
||||
from archivebox.personas.models import Persona
|
||||
|
||||
|
||||
def _mode_label(title: str, description: str) -> str:
    """Render an import-mode radio option label as trusted HTML.

    Both arguments are hard-coded literals at the call sites, so marking the
    result safe does not expose user-controlled markup.
    """
    html = (
        '<span class="abx-import-mode-option">'
        f'<strong>{title}</strong>'
        f'<span>{description}</span>'
        '</span>'
    )
    return mark_safe(html)
|
||||
|
||||
|
||||
class PersonaAdminForm(forms.ModelForm):
    """Admin form for Persona that can optionally bootstrap browser state.

    Besides the model fields, it offers three import modes ("none",
    "discovered", "custom") whose resolved source is validated in ``clean()``
    and executed later via ``apply_import()`` (called by the admin's
    ``save_model`` after the row is saved).
    """

    import_mode = forms.ChoiceField(
        required=False,
        initial="none",
        label="Bootstrap this persona",
        widget=forms.RadioSelect,
        choices=(
            ("none", _mode_label("Blank Persona", "Create the persona without importing browser state yet.")),
            ("discovered", _mode_label("Use a detected profile", "Pick from Chromium profiles auto-discovered on this host.")),
            ("custom", _mode_label("Use a custom path or CDP URL", "Paste an absolute Chromium path or attach to a live browser debugging endpoint.")),
        ),
        help_text="These options run after the Persona row is saved, using the same backend import helpers as the CLI.",
    )
    # Choices are populated per-request in __init__ from discover_local_browser_profiles().
    import_discovered_profile = forms.ChoiceField(
        required=False,
        label="Autodiscovered profiles",
        widget=forms.RadioSelect,
        choices=(),
        help_text="Detected from local Chrome, Chromium, Brave, and Edge profile roots.",
    )
    import_source = forms.CharField(
        required=False,
        label="Absolute path or CDP URL",
        widget=forms.TextInput(
            attrs={
                "placeholder": "/Users/alice/Library/Application Support/Google/Chrome or ws://127.0.0.1:9222/devtools/browser/...",
                "style": "width: 100%; font-family: monospace;",
            }
        ),
        help_text="Accepts an absolute Chromium user-data dir, an exact profile dir, or a live HTTP/WS CDP endpoint.",
    )
    import_profile_name = forms.CharField(
        required=False,
        label="Profile directory name",
        widget=forms.TextInput(
            attrs={
                "placeholder": "Default or Profile 1",
                "style": "width: 100%; font-family: monospace;",
            }
        ),
        help_text="Only used when the custom path points at a browser root containing multiple profiles.",
    )
    import_copy_profile = forms.BooleanField(
        required=False,
        initial=True,
        label="Copy browser profile into this persona",
        help_text="Copies the chosen Chromium user-data tree into `chrome_user_data` for future archiving runs.",
    )
    import_extract_cookies = forms.BooleanField(
        required=False,
        initial=True,
        label="Generate `cookies.txt`",
        help_text="Extracts cookies through Chrome DevTools Protocol and writes a Netscape cookie jar for wget/curl-based plugins.",
    )
    import_capture_storage = forms.BooleanField(
        required=False,
        initial=True,
        label="Capture open-tab storage into `auth.json`",
        help_text="Snapshots currently open tab `localStorage` / `sessionStorage` values by origin. This is most useful for live CDP imports.",
    )

    class Meta:
        model = Persona
        fields = ("name", "created_by", "config")

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        """Discover local profiles and wire them into the choice field."""
        super().__init__(*args, **kwargs)
        self.discovered_profiles = discover_local_browser_profiles()
        # Set by clean(); consumed by apply_import() after save.
        self._resolved_import_source: PersonaImportSource | None = None

        self.fields["import_mode"].widget.attrs["class"] = "abx-import-mode"
        self.fields["import_discovered_profile"].widget.attrs["class"] = "abx-profile-picker"

        if self.discovered_profiles:
            self.fields["import_discovered_profile"].choices = [
                (profile.choice_value, profile.as_choice_label()) for profile in self.discovered_profiles
            ]
        else:
            self.fields["import_discovered_profile"].choices = []
            self.fields["import_discovered_profile"].help_text = (
                "No local Chromium profiles were detected on this host right now. "
                "Use the custom path/CDP option if the browser data lives elsewhere."
            )

    def clean_name(self) -> str:
        """Reject names that could escape the personas directory."""
        name = str(self.cleaned_data.get("name") or "").strip()
        is_valid, error_message = validate_persona_name(name)
        if not is_valid:
            raise forms.ValidationError(error_message)
        return name

    def clean(self) -> dict[str, Any]:
        """Resolve and validate the requested import source (if any).

        Leaves the result in ``self._resolved_import_source``; field-level
        errors are attached to the offending field and abort resolution early.
        """
        cleaned_data = super().clean()
        self._resolved_import_source = None

        import_mode = str(cleaned_data.get("import_mode") or "none").strip() or "none"
        if import_mode == "none":
            return cleaned_data

        if import_mode == "discovered":
            selection = str(cleaned_data.get("import_discovered_profile") or "").strip()
            if not selection:
                self.add_error("import_discovered_profile", "Choose one of the discovered profiles to import.")
                return cleaned_data
            try:
                self._resolved_import_source = PersonaImportSource.from_choice_value(selection)
            except ValueError as err:
                self.add_error("import_discovered_profile", str(err))
                return cleaned_data
        elif import_mode == "custom":
            raw_value = str(cleaned_data.get("import_source") or "").strip()
            if not raw_value:
                self.add_error("import_source", "Provide an absolute Chromium profile path or a CDP URL.")
                return cleaned_data
            try:
                self._resolved_import_source = resolve_custom_import_source(
                    raw_value,
                    profile_dir=str(cleaned_data.get("import_profile_name") or "").strip() or None,
                )
            except ValueError as err:
                self.add_error("import_source", str(err))
                return cleaned_data
        else:
            self.add_error("import_mode", "Choose how this Persona should be bootstrapped.")
            return cleaned_data

        copy_profile = bool(cleaned_data.get("import_copy_profile"))
        import_cookies = bool(cleaned_data.get("import_extract_cookies"))
        capture_storage = bool(cleaned_data.get("import_capture_storage"))

        # A remote CDP endpoint has no on-disk profile to copy, so at least
        # one of the other two actions must be selected.
        if self._resolved_import_source.kind == "cdp":
            if not (import_cookies or capture_storage):
                self.add_error(
                    "import_extract_cookies",
                    "CDP imports can only capture cookies and/or open-tab storage. Profile copying is not available for a remote browser endpoint.",
                )
        elif not (copy_profile or import_cookies or capture_storage):
            raise forms.ValidationError("Select at least one import action.")

        return cleaned_data

    def apply_import(self, persona: Persona) -> PersonaImportResult | None:
        """Run the validated import against *persona*; None if none requested."""
        if not self._resolved_import_source:
            return None

        return import_persona_from_source(
            persona,
            self._resolved_import_source,
            copy_profile=bool(self.cleaned_data.get("import_copy_profile")),
            import_cookies=bool(self.cleaned_data.get("import_extract_cookies")),
            capture_storage=bool(self.cleaned_data.get("import_capture_storage")),
        )
|
||||
845
archivebox/personas/importers.py
Normal file
845
archivebox/personas/importers.py
Normal file
@@ -0,0 +1,845 @@
|
||||
"""
|
||||
Shared persona browser discovery/import helpers.
|
||||
|
||||
These helpers are used by both the CLI and the Django admin so Persona import
|
||||
behavior stays consistent regardless of where it is triggered from.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import platform
|
||||
import shutil
|
||||
import subprocess
|
||||
import tempfile
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Optional
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from django.utils.html import format_html
|
||||
from django.utils.safestring import SafeString
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from archivebox.personas.models import Persona
|
||||
|
||||
|
||||
# Human-readable labels for each supported browser / source kind.
BROWSER_LABELS = {
    "chrome": "Google Chrome",
    "chromium": "Chromium",
    "brave": "Brave",
    "edge": "Microsoft Edge",
    "custom": "Custom Path",
    "persona": "Persona Template",
}

# Directory names that identify Chromium profile dirs inside a user-data root.
# NOTE: "Profile " (trailing space) is presumably matched as a prefix by
# _list_profile_names — confirm against that helper.
BROWSER_PROFILE_DIR_NAMES = (
    "Default",
    "Profile ",
    "Guest Profile",
)

# Cache/lock/metrics entries excluded when copying a profile: they are
# volatile or tied to a specific running browser instance.
VOLATILE_PROFILE_COPY_PATTERNS = (
    "Cache",
    "Code Cache",
    "GPUCache",
    "ShaderCache",
    "Service Worker",
    "GCM Store",
    "*.log",
    "Crashpad",
    "BrowserMetrics",
    "BrowserMetrics-spare.pma",
    "SingletonLock",
    "SingletonSocket",
    "SingletonCookie",
)

# Subdirectory names inside an existing persona dir that may hold a Chromium
# user-data tree usable as an import template.
PERSONA_PROFILE_DIR_CANDIDATES = (
    "chrome_profile",
    "chrome_user_data",
)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class PersonaImportSource:
    """Immutable description of where persona browser state comes from.

    ``kind`` is "browser-profile" for an on-disk Chromium profile or "cdp"
    for a live DevTools endpoint (the two values checked in this module).
    Instances round-trip through JSON via ``choice_value`` /
    ``from_choice_value`` so they can be used as Django form choice keys.
    """

    kind: str
    browser: str = "custom"
    source_name: str | None = None
    user_data_dir: Path | None = None
    profile_dir: str | None = None
    browser_binary: str | None = None
    cdp_url: str | None = None

    @property
    def browser_label(self) -> str:
        """Human-readable browser name, falling back to a title-cased key."""
        return BROWSER_LABELS.get(self.browser, self.browser.title())

    @property
    def profile_path(self) -> Path | None:
        """Full path to the profile dir, or None when either part is unset."""
        if not self.user_data_dir or not self.profile_dir:
            return None
        return self.user_data_dir / self.profile_dir

    @property
    def display_label(self) -> str:
        """Short label for messages: the CDP URL, or browser/source/profile."""
        if self.kind == "cdp":
            return self.cdp_url or "CDP URL"
        profile_suffix = f" / {self.profile_dir}" if self.profile_dir else ""
        source_prefix = f": {self.source_name}" if self.source_name else ""
        return f"{self.browser_label}{source_prefix}{profile_suffix}"

    @property
    def choice_value(self) -> str:
        """Stable JSON encoding used as the form-choice key.

        sort_keys keeps the string deterministic so a round-tripped selection
        compares equal to the originally rendered choice.
        """
        return json.dumps(
            {
                "kind": self.kind,
                "browser": self.browser,
                "source_name": self.source_name or "",
                "user_data_dir": str(self.user_data_dir) if self.user_data_dir else "",
                "profile_dir": self.profile_dir or "",
                "browser_binary": self.browser_binary or "",
                "cdp_url": self.cdp_url or "",
            },
            sort_keys=True,
        )

    def as_choice_label(self) -> SafeString:
        """Render the HTML label shown next to this choice in the admin form."""
        path_str = str(self.profile_path or self.user_data_dir or self.cdp_url or "")
        binary_suffix = f"Using {self.browser_binary}" if self.browser_binary else "Will auto-detect a Chromium binary"
        return format_html(
            '<span class="abx-profile-option">'
            '<strong>{}</strong>'
            '<span class="abx-profile-option__meta">{}</span>'
            '<code>{}</code>'
            "</span>",
            self.display_label,
            binary_suffix,
            path_str,
        )

    @classmethod
    def from_choice_value(cls, value: str) -> "PersonaImportSource":
        """Rebuild a source from a submitted choice key.

        Re-resolves through resolve_browser_profile_source so stale selections
        (e.g. a profile deleted since the form rendered) fail with ValueError.
        """
        try:
            payload = json.loads(value)
        except json.JSONDecodeError as err:
            raise ValueError("Invalid discovered profile selection.") from err

        # Only on-disk profiles are offered as discovered choices.
        if payload.get("kind") != "browser-profile":
            raise ValueError("Invalid discovered profile selection.")

        user_data_dir = Path(str(payload.get("user_data_dir") or "")).expanduser()
        profile_dir = str(payload.get("profile_dir") or "").strip()
        browser = str(payload.get("browser") or "custom").strip().lower() or "custom"
        source_name = str(payload.get("source_name") or "").strip() or None
        browser_binary = str(payload.get("browser_binary") or "").strip() or None

        return resolve_browser_profile_source(
            browser=browser,
            source_name=source_name,
            user_data_dir=user_data_dir,
            profile_dir=profile_dir,
            browser_binary=browser_binary,
        )
|
||||
|
||||
|
||||
@dataclass
class PersonaImportResult:
    """Mutable summary of what a persona import run actually accomplished."""

    source: PersonaImportSource
    profile_copied: bool = False
    cookies_imported: bool = False
    storage_captured: bool = False
    user_agent_imported: bool = False
    warnings: list[str] = field(default_factory=list)

    @property
    def did_work(self) -> bool:
        """True when at least one artifact was actually imported."""
        outcomes = (
            self.profile_copied,
            self.cookies_imported,
            self.storage_captured,
            self.user_agent_imported,
        )
        return any(outcomes)
|
||||
|
||||
|
||||
def get_chrome_user_data_dir() -> Optional[Path]:
    """Locate the default Chrome/Chromium user-data dir for this platform.

    Returns the first candidate that exists and contains at least one
    recognizable profile, or None when nothing usable is found.
    """
    home = Path.home()
    system = platform.system()

    if system == "Darwin":
        candidates = [
            home / "Library" / "Application Support" / "Google" / "Chrome",
            home / "Library" / "Application Support" / "Chromium",
        ]
    elif system == "Linux":
        candidates = [
            home / ".config" / "google-chrome",
            home / ".config" / "chromium",
            home / ".config" / "chrome",
            home / "snap" / "chromium" / "common" / "chromium",
        ]
    elif system == "Windows":
        local_app_data = Path(os.environ.get("LOCALAPPDATA", home / "AppData" / "Local"))
        candidates = [
            local_app_data / "Google" / "Chrome" / "User Data",
            local_app_data / "Chromium" / "User Data",
        ]
    else:
        candidates = []

    return next(
        (root for root in candidates if root.exists() and _list_profile_names(root)),
        None,
    )
|
||||
|
||||
|
||||
def get_brave_user_data_dir() -> Optional[Path]:
    """Locate the default Brave user-data dir for this platform.

    Returns the first candidate that exists and contains at least one
    recognizable profile, or None when nothing usable is found.
    """
    home = Path.home()
    system = platform.system()

    if system == "Darwin":
        candidates = [home / "Library" / "Application Support" / "BraveSoftware" / "Brave-Browser"]
    elif system == "Linux":
        candidates = [home / ".config" / "BraveSoftware" / "Brave-Browser"]
    elif system == "Windows":
        local_app_data = Path(os.environ.get("LOCALAPPDATA", home / "AppData" / "Local"))
        candidates = [local_app_data / "BraveSoftware" / "Brave-Browser" / "User Data"]
    else:
        candidates = []

    return next(
        (root for root in candidates if root.exists() and _list_profile_names(root)),
        None,
    )
|
||||
|
||||
|
||||
def get_edge_user_data_dir() -> Optional[Path]:
    """Locate the default Microsoft Edge user-data dir for this platform.

    Returns the first candidate that exists and contains at least one
    recognizable profile, or None when nothing usable is found.
    """
    home = Path.home()
    system = platform.system()

    if system == "Darwin":
        candidates = [home / "Library" / "Application Support" / "Microsoft Edge"]
    elif system == "Linux":
        candidates = [
            home / ".config" / "microsoft-edge",
            home / ".config" / "microsoft-edge-beta",
            home / ".config" / "microsoft-edge-dev",
        ]
    elif system == "Windows":
        local_app_data = Path(os.environ.get("LOCALAPPDATA", home / "AppData" / "Local"))
        candidates = [local_app_data / "Microsoft" / "Edge" / "User Data"]
    else:
        candidates = []

    return next(
        (root for root in candidates if root.exists() and _list_profile_names(root)),
        None,
    )
|
||||
|
||||
|
||||
def get_browser_binary(browser: str) -> Optional[str]:
    """Find an executable for *browser* ("chrome", "chromium", "brave", "edge").

    Checks well-known install locations for the current platform first, then
    falls back to searching PATH for a binary with a matching basename (covers
    package-manager installs in non-standard prefixes, e.g. Homebrew/Nix).
    Returns an absolute path string, or None for unknown browsers/platforms
    or when nothing is installed.
    """
    system = platform.system()
    home = Path.home()
    browser = browser.lower()

    if system == "Darwin":
        candidates = {
            "chrome": ["/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"],
            "chromium": ["/Applications/Chromium.app/Contents/MacOS/Chromium"],
            "brave": ["/Applications/Brave Browser.app/Contents/MacOS/Brave Browser"],
            "edge": ["/Applications/Microsoft Edge.app/Contents/MacOS/Microsoft Edge"],
        }.get(browser, [])
    elif system == "Linux":
        candidates = {
            "chrome": ["/usr/bin/google-chrome", "/usr/bin/google-chrome-stable", "/usr/bin/google-chrome-beta", "/usr/bin/google-chrome-unstable"],
            "chromium": ["/usr/bin/chromium", "/usr/bin/chromium-browser"],
            "brave": ["/usr/bin/brave-browser", "/usr/bin/brave-browser-beta", "/usr/bin/brave-browser-nightly"],
            "edge": ["/usr/bin/microsoft-edge", "/usr/bin/microsoft-edge-stable", "/usr/bin/microsoft-edge-beta", "/usr/bin/microsoft-edge-dev"],
        }.get(browser, [])
    elif system == "Windows":
        local_app_data = Path(os.environ.get("LOCALAPPDATA", home / "AppData" / "Local"))
        candidates = {
            "chrome": [
                str(local_app_data / "Google" / "Chrome" / "Application" / "chrome.exe"),
                "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe",
                "C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe",
            ],
            "chromium": [str(local_app_data / "Chromium" / "Application" / "chrome.exe")],
            "brave": [
                str(local_app_data / "BraveSoftware" / "Brave-Browser" / "Application" / "brave.exe"),
                "C:\\Program Files\\BraveSoftware\\Brave-Browser\\Application\\brave.exe",
                "C:\\Program Files (x86)\\BraveSoftware\\Brave-Browser\\Application\\brave.exe",
            ],
            "edge": [
                str(local_app_data / "Microsoft" / "Edge" / "Application" / "msedge.exe"),
                "C:\\Program Files\\Microsoft\\Edge\\Application\\msedge.exe",
                "C:\\Program Files (x86)\\Microsoft\\Edge\\Application\\msedge.exe",
            ],
        }.get(browser, [])
    else:
        candidates = []

    for candidate in candidates:
        if candidate and Path(candidate).exists():
            return candidate

    # Backward-compatible fallback: previously we returned None here; now try
    # PATH for each candidate's basename before giving up, so installs outside
    # the default prefixes are still discovered.
    for candidate in candidates:
        found = shutil.which(Path(candidate).name)
        if found:
            return found

    return None
|
||||
|
||||
|
||||
# Maps each supported browser key to the finder that locates its default
# user-data root; chrome and chromium share one finder, which checks both
# install locations itself.
BROWSER_PROFILE_FINDERS = {
    "chrome": get_chrome_user_data_dir,
    "chromium": get_chrome_user_data_dir,
    "brave": get_brave_user_data_dir,
    "edge": get_edge_user_data_dir,
}

# All supported Chromium-family browser keys, in finder-registration order.
CHROMIUM_BROWSERS = tuple(BROWSER_PROFILE_FINDERS.keys())


# Header lines for generated Netscape cookies.txt files; matches the header
# written by export_browser_state.js so both code paths emit identical jars.
NETSCAPE_COOKIE_HEADER = [
    "# Netscape HTTP Cookie File",
    "# https://curl.se/docs/http-cookies.html",
    "# This file was generated by ArchiveBox persona cookie extraction",
    "#",
    "# Format: domain\\tincludeSubdomains\\tpath\\tsecure\\texpiry\\tname\\tvalue",
    "",
]
|
||||
|
||||
|
||||
def validate_persona_name(name: str) -> tuple[bool, str]:
    """Validate a persona name so it is safe to use as a directory name.

    Returns ``(is_valid, error_message)`` where the message is "" when valid;
    the first failing rule (in order) determines the message.
    """
    rules = (
        (not name or not name.strip(), "Persona name cannot be empty"),
        ("/" in name or "\\" in name, "Persona name cannot contain path separators (/ or \\)"),
        (".." in name, "Persona name cannot contain parent directory references (..)"),
        (name.startswith("."), "Persona name cannot start with a dot (.)"),
        ("\x00" in name or "\n" in name or "\r" in name, "Persona name contains invalid characters"),
    )
    for failed, message in rules:
        if failed:
            return False, message
    return True, ""
|
||||
|
||||
|
||||
def discover_local_browser_profiles() -> list[PersonaImportSource]:
    """Enumerate importable Chromium-family profiles found on this host.

    Scans every registered browser's default user-data root, then appends any
    persona-template profiles; candidates that fail to resolve are skipped.
    """
    sources: list[PersonaImportSource] = []

    for browser, finder in BROWSER_PROFILE_FINDERS.items():
        root = finder()
        if root is None:
            continue

        binary = get_browser_binary(browser)
        for profile_name in _list_profile_names(root):
            try:
                source = resolve_browser_profile_source(
                    browser=browser,
                    user_data_dir=root,
                    profile_dir=profile_name,
                    browser_binary=binary,
                )
            except ValueError:
                continue
            sources.append(source)

    sources.extend(discover_persona_template_profiles())

    return sources
|
||||
|
||||
|
||||
def discover_persona_template_profiles(personas_dir: Path | None = None) -> list[PersonaImportSource]:
    """Find Chromium profiles stored inside existing persona directories.

    When *personas_dir* is given only that root is scanned; otherwise the
    collection's PERSONAS_DIR and the user-level ~/.config/abx/personas are
    both checked (deduplicated after resolving symlinks).
    """
    # Imported lazily to avoid a module-load dependency on collection config.
    from archivebox.config.constants import CONSTANTS

    templates: list[PersonaImportSource] = []
    candidate_roots: list[Path] = []

    if personas_dir is not None:
        candidate_roots.append(personas_dir.expanduser())
    else:
        candidate_roots.extend(
            [
                CONSTANTS.PERSONAS_DIR.expanduser(),
                Path.home() / ".config" / "abx" / "personas",
            ]
        )

    # Skip roots that resolve to the same real directory.
    seen_roots: set[Path] = set()
    for personas_root in candidate_roots:
        resolved_root = personas_root.resolve()
        if resolved_root in seen_roots:
            continue
        seen_roots.add(resolved_root)

        if not resolved_root.exists() or not resolved_root.is_dir():
            continue

        # One pass per persona dir, sorted case-insensitively for stable output.
        for persona_dir in sorted((path for path in resolved_root.iterdir() if path.is_dir()), key=lambda path: path.name.lower()):
            for candidate_dir_name in PERSONA_PROFILE_DIR_CANDIDATES:
                user_data_dir = persona_dir / candidate_dir_name
                if not user_data_dir.exists() or not user_data_dir.is_dir():
                    continue

                for profile_dir in _list_profile_names(user_data_dir):
                    try:
                        templates.append(
                            resolve_browser_profile_source(
                                browser="persona",
                                source_name=persona_dir.name,
                                user_data_dir=user_data_dir,
                                profile_dir=profile_dir,
                                # Templates are launched with a Chrome binary.
                                browser_binary=get_browser_binary("chrome"),
                            )
                        )
                    except ValueError:
                        # Unresolvable template profiles are silently skipped.
                        continue

    return templates
|
||||
|
||||
|
||||
def resolve_browser_import_source(browser: str, profile_dir: str | None = None) -> PersonaImportSource:
    """Resolve a known browser name (e.g. 'chrome') into an importable profile source.

    Raises:
        ValueError: if the browser is unsupported, its user-data dir cannot be
            located, or no profile can be chosen inside it.
    """
    normalized = browser.strip().lower()
    if normalized not in BROWSER_PROFILE_FINDERS:
        supported = ", ".join(BROWSER_PROFILE_FINDERS)
        raise ValueError(f"Unknown browser: {normalized}. Supported browsers: {supported}")

    profile_root = BROWSER_PROFILE_FINDERS[normalized]()
    if not profile_root:
        raise ValueError(f"Could not find {normalized} profile directory")

    selected = profile_dir or pick_default_profile_dir(profile_root)
    if not selected:
        raise ValueError(f"Could not find a profile in {profile_root}")

    return resolve_browser_profile_source(
        browser=normalized,
        user_data_dir=profile_root,
        profile_dir=selected,
        browser_binary=get_browser_binary(normalized),
    )
def resolve_browser_profile_source(
    browser: str,
    user_data_dir: Path,
    profile_dir: str,
    source_name: str | None = None,
    browser_binary: str | None = None,
) -> PersonaImportSource:
    """Validate a (user_data_dir, profile_dir) pair and wrap it as a PersonaImportSource.

    Raises:
        ValueError: when the root directory is missing, the profile name is
            blank, or the profile does not look like a real Chromium profile.
    """
    root = user_data_dir.expanduser()
    if not root.is_absolute():
        root = root.resolve()

    if not root.exists():
        raise ValueError(f"Profile root does not exist: {root}")
    if not profile_dir.strip():
        raise ValueError("Profile directory name cannot be empty.")

    candidate = root / profile_dir
    if not _looks_like_profile_dir(candidate):
        raise ValueError(f"Profile directory does not look valid: {candidate}")

    return PersonaImportSource(
        kind="browser-profile",
        browser=browser,
        source_name=source_name,
        user_data_dir=root,
        profile_dir=profile_dir,
        browser_binary=browser_binary,
    )
def resolve_custom_import_source(raw_value: str, profile_dir: str | None = None) -> PersonaImportSource:
    """Resolve a user-supplied value (absolute profile path or CDP URL) into a source.

    Accepts a ws(s)/http(s) CDP endpoint, a direct profile directory, or a
    user-data root plus an optional profile name.

    Raises:
        ValueError: on empty input, relative paths, missing paths, profile-name
            mismatches, or when no profile can be found inside a root dir.
    """
    value = raw_value.strip()
    if not value:
        raise ValueError("Provide an absolute browser profile path or a CDP URL.")

    if _looks_like_cdp_url(value):
        return PersonaImportSource(kind="cdp", cdp_url=value)

    candidate = Path(value).expanduser()
    if not candidate.is_absolute():
        raise ValueError("Custom browser path must be an absolute path.")
    if not candidate.exists():
        raise ValueError(f"Custom browser path does not exist: {candidate}")

    requested_profile = profile_dir.strip() if profile_dir else ""

    # Case 1: the given path is itself a profile directory.
    if _looks_like_profile_dir(candidate):
        if requested_profile and requested_profile != candidate.name:
            raise ValueError("Profile name does not match the provided profile directory path.")
        return resolve_browser_profile_source(
            browser="custom",
            user_data_dir=candidate.parent.resolve(),
            profile_dir=candidate.name,
        )

    # Case 2: the given path is a user-data root; select a profile inside it.
    selected = requested_profile or pick_default_profile_dir(candidate)
    if not selected:
        raise ValueError(
            "Could not find a Chromium profile in that directory. "
            "Provide an exact profile directory path or fill in the profile name field."
        )

    return resolve_browser_profile_source(
        browser="custom",
        user_data_dir=candidate.resolve(),
        profile_dir=selected,
    )
def pick_default_profile_dir(user_data_dir: Path) -> str | None:
    """Choose a profile from *user_data_dir*: 'Default' if present, else the
    first discovered profile, else None when the dir holds no profiles."""
    names = _list_profile_names(user_data_dir)
    if "Default" in names:
        return "Default"
    return names[0] if names else None
def import_persona_from_source(
    persona: "Persona",
    source: PersonaImportSource,
    *,
    copy_profile: bool = True,
    import_cookies: bool = True,
    capture_storage: bool = False,
) -> PersonaImportResult:
    """Import a browser profile / CDP session's state into *persona*.

    Optionally copies the whole Chromium user-data dir, then exports cookies
    and storage (via export_browser_state) into the persona's cookies.txt /
    auth.json and applies the imported user agent. Failures are recorded as
    warnings on the returned PersonaImportResult rather than raised.
    """
    persona.ensure_dirs()
    outcome = PersonaImportResult(source=source)

    persona_chrome_root = Path(persona.CHROME_USER_DATA_DIR)
    cookies_file = persona.path / "cookies.txt"
    auth_file = persona.path / "auth.json"

    launch_root: Path | None = None

    if source.kind == "browser-profile":
        if copy_profile and source.user_data_dir:
            src_root = source.user_data_dir.resolve()
            dst_root = persona_chrome_root.resolve()
            if src_root == dst_root:
                # Copying onto itself would destroy the source — skip it.
                outcome.warnings.append("Skipped profile copy because the selected source is already this persona's chrome_user_data directory.")
            else:
                copy_browser_user_data_dir(src_root, dst_root)
                persona.cleanup_chrome_profile(dst_root)
                outcome.profile_copied = True
            launch_root = dst_root
        else:
            launch_root = source.user_data_dir
    elif copy_profile:
        outcome.warnings.append("Profile copying is only available for local Chromium profile paths. CDP imports can only pull cookies and open-tab storage.")

    # Both branches share the same output-file targets.
    output_targets = {
        "cookies_output_file": cookies_file if import_cookies else None,
        "auth_output_file": auth_file if capture_storage else None,
    }
    if source.kind == "cdp":
        export_ok, auth_payload, export_message = export_browser_state(
            cdp_url=source.cdp_url,
            **output_targets,
        )
    else:
        export_ok, auth_payload, export_message = export_browser_state(
            user_data_dir=launch_root,
            profile_dir=source.profile_dir,
            chrome_binary=source.browser_binary,
            **output_targets,
        )

    if not export_ok:
        outcome.warnings.append(export_message or "Browser import failed.")
        return outcome

    if import_cookies and cookies_file.exists():
        outcome.cookies_imported = True
    if capture_storage and auth_file.exists():
        outcome.storage_captured = True
    if _apply_imported_user_agent(persona, auth_payload):
        outcome.user_agent_imported = True

    return outcome
def copy_browser_user_data_dir(source_dir: Path, destination_dir: Path) -> None:
    """Replace *destination_dir* with a copy of *source_dir*.

    Symlinks are preserved as links; lock/cache files matched by
    VOLATILE_PROFILE_COPY_PATTERNS are excluded from the copy.
    """
    destination_dir.parent.mkdir(parents=True, exist_ok=True)
    # Wipe any stale copy first so the destination is an exact mirror.
    shutil.rmtree(destination_dir, ignore_errors=True)
    skip_volatile = shutil.ignore_patterns(*VOLATILE_PROFILE_COPY_PATTERNS)
    shutil.copytree(source_dir, destination_dir, symlinks=True, ignore=skip_volatile)
def export_browser_state(
    *,
    user_data_dir: Path | None = None,
    cdp_url: str | None = None,
    profile_dir: str | None = None,
    chrome_binary: str | None = None,
    cookies_output_file: Path | None = None,
    auth_output_file: Path | None = None,
) -> tuple[bool, dict | None, str]:
    """Run the Node export_browser_state.js helper against a local profile or CDP URL.

    Returns (success, auth_payload_or_None, message). Cookie/auth results are
    merged into the given output files when those files already exist (written
    to a temp dir first), otherwise written directly.
    """
    if not user_data_dir and not cdp_url:
        return False, None, "Missing browser source."

    from abx_plugins import get_plugins_dir
    from archivebox.config.common import STORAGE_CONFIG

    state_script = Path(__file__).with_name("export_browser_state.js")
    if not state_script.exists():
        return False, None, f"Browser state export script not found at {state_script}"

    node_modules_dir = STORAGE_CONFIG.LIB_DIR / "npm" / "node_modules"
    chrome_plugin_dir = Path(get_plugins_dir()).resolve()

    env = os.environ.copy()
    env["NODE_MODULES_DIR"] = str(node_modules_dir)
    env["ARCHIVEBOX_ABX_PLUGINS_DIR"] = str(chrome_plugin_dir)

    if user_data_dir:
        env["CHROME_USER_DATA_DIR"] = str(user_data_dir)
    if cdp_url:
        env["CHROME_CDP_URL"] = cdp_url
        env["CHROME_IS_LOCAL"] = "false"
    if chrome_binary:
        env["CHROME_BINARY"] = str(chrome_binary)
    if profile_dir:
        # Append --profile-directory to any pre-existing CHROME_ARGS_EXTRA,
        # accepting either a JSON list or a comma-separated string.
        extra_arg = f"--profile-directory={profile_dir}"
        existing_extra = env.get("CHROME_ARGS_EXTRA", "").strip()
        args_list: list[str] = []
        if existing_extra:
            if existing_extra.startswith("["):
                try:
                    parsed = json.loads(existing_extra)
                    if isinstance(parsed, list):
                        args_list.extend(str(x) for x in parsed)
                except Exception:
                    args_list.extend([s.strip() for s in existing_extra.split(",") if s.strip()])
            else:
                args_list.extend([s.strip() for s in existing_extra.split(",") if s.strip()])
        args_list.append(extra_arg)
        env["CHROME_ARGS_EXTRA"] = json.dumps(args_list)

    temp_dir: Path | None = None
    tmp_cookies_file: Path | None = None
    tmp_auth_file: Path | None = None

    # When an output file already exists, export to a temp file first and merge.
    if cookies_output_file and cookies_output_file.exists():
        temp_dir = Path(tempfile.mkdtemp(prefix="ab_browser_state_"))
        tmp_cookies_file = temp_dir / "cookies.txt"
        env["COOKIES_OUTPUT_FILE"] = str(tmp_cookies_file)
    elif cookies_output_file:
        env["COOKIES_OUTPUT_FILE"] = str(cookies_output_file)

    if auth_output_file and auth_output_file.exists():
        temp_dir = temp_dir or Path(tempfile.mkdtemp(prefix="ab_browser_state_"))
        tmp_auth_file = temp_dir / "auth.json"
        env["AUTH_STORAGE_OUTPUT_FILE"] = str(tmp_auth_file)
    elif auth_output_file:
        env["AUTH_STORAGE_OUTPUT_FILE"] = str(auth_output_file)
    else:
        # Always capture auth storage somewhere so the user agent can be read.
        temp_dir = temp_dir or Path(tempfile.mkdtemp(prefix="ab_browser_state_"))
        tmp_auth_file = temp_dir / "auth.json"
        env["AUTH_STORAGE_OUTPUT_FILE"] = str(tmp_auth_file)

    # BUGFIX: the temp dir used to leak on every early-return path (timeout,
    # missing node, nonzero exit, merge errors); try/finally guarantees cleanup.
    try:
        try:
            result = subprocess.run(
                ["node", str(state_script)],
                env=env,
                capture_output=True,
                text=True,
                timeout=120,
            )
        except subprocess.TimeoutExpired:
            return False, None, "Browser state export timed out."
        except FileNotFoundError:
            return False, None, "Node.js was not found, so ArchiveBox could not extract browser state."
        except Exception as err:
            return False, None, f"Browser state export failed: {err}"

        if result.returncode != 0:
            message = (result.stderr or result.stdout or "").strip() or "Browser state export failed."
            return False, None, message

        auth_payload: dict | None = None
        if cookies_output_file and tmp_cookies_file and tmp_cookies_file.exists():
            _merge_netscape_cookies(cookies_output_file, tmp_cookies_file)
        if auth_output_file and tmp_auth_file and tmp_auth_file.exists():
            _merge_auth_storage(auth_output_file, tmp_auth_file)
            auth_payload = _load_auth_storage(tmp_auth_file)
        elif auth_output_file and auth_output_file.exists():
            auth_payload = _load_auth_storage(auth_output_file)
        elif tmp_auth_file and tmp_auth_file.exists():
            auth_payload = _load_auth_storage(tmp_auth_file)

        return True, auth_payload, (result.stderr or result.stdout or "").strip()
    finally:
        if temp_dir and temp_dir.exists():
            shutil.rmtree(temp_dir, ignore_errors=True)
def _list_profile_names(user_data_dir: Path) -> list[str]:
    """List profile sub-directory names inside a Chromium user-data dir.

    Results are sorted case-insensitively; 'System Profile' is always excluded.
    Returns [] when the directory is missing or is not a directory.
    """
    if not user_data_dir.exists() or not user_data_dir.is_dir():
        return []

    # The original special-cased "Default"/"Profile N"/"Guest Profile" names but
    # then applied the exact same _looks_like_profile_dir() check as the general
    # path, so the duplicated branch is collapsed here (behavior unchanged).
    profiles: list[str] = []
    for child in sorted(user_data_dir.iterdir(), key=lambda path: path.name.lower()):
        if not child.is_dir() or child.name == "System Profile":
            continue
        if _looks_like_profile_dir(child):
            profiles.append(child.name)
    return profiles
def _looks_like_profile_dir(path: Path) -> bool:
    """Heuristic: does *path* look like a Chromium browser profile directory?

    True when it contains any of the usual profile marker files, or when its
    name starts with one of the well-known profile directory names.
    """
    if not path.exists() or not path.is_dir():
        return False

    # Files that Chromium writes inside every real profile directory.
    for marker_name in ("Preferences", "History", "Cookies", "Local Storage", "Session Storage"):
        if (path / marker_name).exists():
            return True
    if (path / "Network" / "Cookies").exists():
        return True

    # Fall back to matching by conventional directory name.
    for prefix in BROWSER_PROFILE_DIR_NAMES:
        if path.name.startswith(prefix):
            return True
    return False
def _looks_like_cdp_url(value: str) -> bool:
|
||||
parsed = urlparse(value)
|
||||
return parsed.scheme in {"ws", "wss", "http", "https"} and bool(parsed.netloc)
|
||||
|
||||
|
||||
def _parse_netscape_cookies(path: Path) -> dict[tuple[str, str, str], tuple[str, str, str, str, str, str, str]]:
|
||||
cookies: dict[tuple[str, str, str], tuple[str, str, str, str, str, str, str]] = {}
|
||||
if not path.exists():
|
||||
return cookies
|
||||
|
||||
for line in path.read_text().splitlines():
|
||||
if not line or line.startswith("#"):
|
||||
continue
|
||||
parts = line.split("\t")
|
||||
if len(parts) < 7:
|
||||
continue
|
||||
domain, include_subdomains, cookie_path, secure, expiry, name, value = parts[:7]
|
||||
cookies[(domain, cookie_path, name)] = (domain, include_subdomains, cookie_path, secure, expiry, name, value)
|
||||
return cookies
|
||||
|
||||
|
||||
def _write_netscape_cookies(
    path: Path,
    cookies: dict[tuple[str, str, str], tuple[str, str, str, str, str, str, str]],
) -> None:
    """Serialize parsed cookies back into Netscape cookies.txt format at *path*."""
    rows = list(NETSCAPE_COOKIE_HEADER)
    rows.extend("\t".join(fields) for fields in cookies.values())
    path.write_text("\n".join(rows) + "\n")
def _merge_netscape_cookies(existing_file: Path, new_file: Path) -> None:
    """Merge cookies from *new_file* into *existing_file* in place.

    Entries from the new file win on (domain, path, name) conflicts.
    """
    merged = _parse_netscape_cookies(existing_file)
    for key, cookie in _parse_netscape_cookies(new_file).items():
        merged[key] = cookie
    _write_netscape_cookies(existing_file, merged)
def _merge_auth_storage(existing_file: Path, new_file: Path) -> None:
    """Merge the auth.json payload from *new_file* into *existing_file* in place.

    localStorage/sessionStorage are merged per-origin and cookies per
    (domain, path, name); newer entries win. The user agent keeps the newest
    non-empty value. The result is written back pretty-printed and key-sorted.
    """
    old_payload = _load_auth_storage(existing_file)
    fresh_payload = _load_auth_storage(new_file)

    local_store = old_payload.setdefault("localStorage", {})
    session_store = old_payload.setdefault("sessionStorage", {})
    local_store.update(fresh_payload.get("localStorage") or {})
    session_store.update(fresh_payload.get("sessionStorage") or {})

    combined_cookies = _merge_cookie_dicts(
        old_payload.get("cookies") or [],
        fresh_payload.get("cookies") or [],
    )

    merged = dict(old_payload)
    merged.update(fresh_payload)
    merged["cookies"] = combined_cookies
    merged["localStorage"] = local_store
    merged["sessionStorage"] = session_store
    merged["user_agent"] = fresh_payload.get("user_agent") or old_payload.get("user_agent") or ""

    existing_file.write_text(json.dumps(merged, indent=2, sort_keys=True) + "\n")
def _load_auth_storage(path: Path) -> dict:
|
||||
if not path.exists():
|
||||
return {
|
||||
"TYPE": "auth",
|
||||
"cookies": [],
|
||||
"localStorage": {},
|
||||
"sessionStorage": {},
|
||||
}
|
||||
try:
|
||||
payload = json.loads(path.read_text())
|
||||
except json.JSONDecodeError:
|
||||
return {
|
||||
"TYPE": "auth",
|
||||
"cookies": [],
|
||||
"localStorage": {},
|
||||
"sessionStorage": {},
|
||||
}
|
||||
if not isinstance(payload, dict):
|
||||
return {
|
||||
"TYPE": "auth",
|
||||
"cookies": [],
|
||||
"localStorage": {},
|
||||
"sessionStorage": {},
|
||||
}
|
||||
return payload
|
||||
|
||||
|
||||
def _merge_cookie_dicts(existing: list[dict], new: list[dict]) -> list[dict]:
|
||||
merged: dict[tuple[str, str, str], dict] = {}
|
||||
for cookie in existing:
|
||||
key = (str(cookie.get("domain") or ""), str(cookie.get("path") or "/"), str(cookie.get("name") or ""))
|
||||
merged[key] = cookie
|
||||
for cookie in new:
|
||||
key = (str(cookie.get("domain") or ""), str(cookie.get("path") or "/"), str(cookie.get("name") or ""))
|
||||
merged[key] = cookie
|
||||
return list(merged.values())
|
||||
|
||||
|
||||
def _apply_imported_user_agent(persona: "Persona", auth_payload: dict | None) -> bool:
|
||||
if not auth_payload:
|
||||
return False
|
||||
|
||||
user_agent = str(auth_payload.get("user_agent") or "").strip()
|
||||
if not user_agent:
|
||||
return False
|
||||
|
||||
config = dict(persona.config or {})
|
||||
if config.get("USER_AGENT") == user_agent:
|
||||
return False
|
||||
|
||||
config["USER_AGENT"] = user_agent
|
||||
persona.config = config
|
||||
persona.save(update_fields=["config"])
|
||||
return True
|
||||
@@ -117,6 +117,12 @@ class Persona(ModelWithConfig):
|
||||
cookies_path = self.path / 'cookies.txt'
|
||||
return str(cookies_path) if cookies_path.exists() else ''
|
||||
|
||||
@property
|
||||
def AUTH_STORAGE_FILE(self) -> str:
|
||||
"""Derived path to auth.json for this persona (if it exists)."""
|
||||
auth_path = self.path / 'auth.json'
|
||||
return str(auth_path) if auth_path.exists() else ''
|
||||
|
||||
def get_derived_config(self) -> dict:
|
||||
"""
|
||||
Get config dict with derived paths filled in.
|
||||
@@ -127,6 +133,7 @@ class Persona(ModelWithConfig):
|
||||
- CHROME_EXTENSIONS_DIR (derived from persona path)
|
||||
- CHROME_DOWNLOADS_DIR (derived from persona path)
|
||||
- COOKIES_FILE (derived from persona path, if file exists)
|
||||
- AUTH_STORAGE_FILE (derived from persona path, if file exists)
|
||||
- ACTIVE_PERSONA (set to this persona's name)
|
||||
"""
|
||||
derived = dict(self.config or {})
|
||||
@@ -140,6 +147,8 @@ class Persona(ModelWithConfig):
|
||||
derived['CHROME_DOWNLOADS_DIR'] = self.CHROME_DOWNLOADS_DIR
|
||||
if 'COOKIES_FILE' not in derived and self.COOKIES_FILE:
|
||||
derived['COOKIES_FILE'] = self.COOKIES_FILE
|
||||
if 'AUTH_STORAGE_FILE' not in derived and self.AUTH_STORAGE_FILE:
|
||||
derived['AUTH_STORAGE_FILE'] = self.AUTH_STORAGE_FILE
|
||||
|
||||
# Always set ACTIVE_PERSONA to this persona's name
|
||||
derived['ACTIVE_PERSONA'] = self.name
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import mimetypes
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
@@ -7,9 +8,10 @@ from pathlib import Path
|
||||
from asgiref.sync import sync_to_async
|
||||
from django.utils import timezone
|
||||
|
||||
from abx_dl.events import ArchiveResultEvent
|
||||
from abx_dl.events import ArchiveResultEvent, ProcessCompletedEvent
|
||||
from abx_dl.services.base import BaseService
|
||||
|
||||
from .db import run_db_op
|
||||
from .process_service import ProcessService, parse_event_datetime
|
||||
|
||||
|
||||
@@ -48,22 +50,93 @@ def _collect_output_metadata(plugin_dir: Path) -> tuple[dict[str, dict], int, st
|
||||
|
||||
def _normalize_status(status: str) -> str:
|
||||
if status == "noresult":
|
||||
return "skipped"
|
||||
return "noresults"
|
||||
return status or "failed"
|
||||
|
||||
|
||||
def _has_content_files(output_files: list[str]) -> bool:
|
||||
return any(Path(path).suffix not in {".log", ".pid", ".sh"} for path in output_files)
|
||||
|
||||
|
||||
def _iter_archiveresult_records(stdout: str) -> list[dict]:
|
||||
records: list[dict] = []
|
||||
for raw_line in stdout.splitlines():
|
||||
line = raw_line.strip()
|
||||
if not line.startswith("{"):
|
||||
continue
|
||||
try:
|
||||
record = json.loads(line)
|
||||
except json.JSONDecodeError:
|
||||
continue
|
||||
if record.get("type") == "ArchiveResult":
|
||||
records.append(record)
|
||||
return records
|
||||
|
||||
|
||||
class ArchiveResultService(BaseService):
|
||||
LISTENS_TO = [ArchiveResultEvent]
|
||||
LISTENS_TO = [ArchiveResultEvent, ProcessCompletedEvent]
|
||||
EMITS = []
|
||||
|
||||
def __init__(self, bus, *, process_service: ProcessService):
|
||||
self.process_service = process_service
|
||||
super().__init__(bus)
|
||||
|
||||
async def on_ArchiveResultEvent(self, event: ArchiveResultEvent) -> None:
|
||||
await sync_to_async(self._project, thread_sensitive=True)(event)
|
||||
async def on_ArchiveResultEvent__Outer(self, event: ArchiveResultEvent) -> None:
|
||||
snapshot_output_dir = await run_db_op(self._get_snapshot_output_dir, event.snapshot_id)
|
||||
if snapshot_output_dir is None:
|
||||
return
|
||||
plugin_dir = Path(snapshot_output_dir) / event.plugin
|
||||
output_files, output_size, output_mimetypes = await sync_to_async(_collect_output_metadata)(plugin_dir)
|
||||
await run_db_op(self._project, event, output_files, output_size, output_mimetypes)
|
||||
|
||||
def _project(self, event: ArchiveResultEvent) -> None:
|
||||
async def on_ProcessCompletedEvent__Outer(self, event: ProcessCompletedEvent) -> None:
|
||||
if not event.snapshot_id or not event.hook_name.startswith("on_Snapshot"):
|
||||
return
|
||||
|
||||
plugin_dir = Path(event.output_dir)
|
||||
output_files, output_size, output_mimetypes = await sync_to_async(_collect_output_metadata)(plugin_dir)
|
||||
records = _iter_archiveresult_records(event.stdout)
|
||||
if records:
|
||||
for record in records:
|
||||
await run_db_op(
|
||||
self._project_from_process_completed,
|
||||
event,
|
||||
record,
|
||||
output_files,
|
||||
output_size,
|
||||
output_mimetypes,
|
||||
)
|
||||
return
|
||||
|
||||
synthetic_record = {
|
||||
"plugin": event.plugin_name,
|
||||
"hook_name": event.hook_name,
|
||||
"status": "failed" if event.exit_code != 0 else ("succeeded" if _has_content_files(event.output_files) else "skipped"),
|
||||
"output_str": event.stderr if event.exit_code != 0 else "",
|
||||
"error": event.stderr if event.exit_code != 0 else "",
|
||||
}
|
||||
await run_db_op(
|
||||
self._project_from_process_completed,
|
||||
event,
|
||||
synthetic_record,
|
||||
output_files,
|
||||
output_size,
|
||||
output_mimetypes,
|
||||
)
|
||||
|
||||
def _get_snapshot_output_dir(self, snapshot_id: str) -> str | None:
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
snapshot = Snapshot.objects.filter(id=snapshot_id).only("output_dir").first()
|
||||
return str(snapshot.output_dir) if snapshot is not None else None
|
||||
|
||||
def _project(
|
||||
self,
|
||||
event: ArchiveResultEvent,
|
||||
output_files: dict[str, dict],
|
||||
output_size: int,
|
||||
output_mimetypes: str,
|
||||
) -> None:
|
||||
from archivebox.core.models import ArchiveResult, Snapshot
|
||||
from archivebox.machine.models import Process
|
||||
|
||||
@@ -86,8 +159,6 @@ class ArchiveResultService(BaseService):
|
||||
},
|
||||
)
|
||||
|
||||
plugin_dir = Path(snapshot.output_dir) / event.plugin
|
||||
output_files, output_size, output_mimetypes = _collect_output_metadata(plugin_dir)
|
||||
result.process = process or result.process
|
||||
result.status = _normalize_status(event.status)
|
||||
result.output_str = event.output_str
|
||||
@@ -97,7 +168,28 @@ class ArchiveResultService(BaseService):
|
||||
result.output_mimetypes = output_mimetypes
|
||||
result.start_ts = parse_event_datetime(event.start_ts) or result.start_ts or timezone.now()
|
||||
result.end_ts = parse_event_datetime(event.end_ts) or timezone.now()
|
||||
result.retry_at = None
|
||||
if event.error:
|
||||
result.notes = event.error
|
||||
result.save()
|
||||
|
||||
def _project_from_process_completed(
|
||||
self,
|
||||
event: ProcessCompletedEvent,
|
||||
record: dict,
|
||||
output_files: dict[str, dict],
|
||||
output_size: int,
|
||||
output_mimetypes: str,
|
||||
) -> None:
|
||||
archive_result_event = ArchiveResultEvent(
|
||||
snapshot_id=record.get("snapshot_id") or event.snapshot_id,
|
||||
plugin=record.get("plugin") or event.plugin_name,
|
||||
hook_name=record.get("hook_name") or event.hook_name,
|
||||
status=record.get("status") or "",
|
||||
process_id=event.process_id,
|
||||
output_str=record.get("output_str") or "",
|
||||
output_json=record.get("output_json") if isinstance(record.get("output_json"), dict) else None,
|
||||
start_ts=event.start_ts,
|
||||
end_ts=event.end_ts,
|
||||
error=record.get("error") or (event.stderr if event.exit_code != 0 else ""),
|
||||
)
|
||||
self._project(archive_result_event, output_files, output_size, output_mimetypes)
|
||||
|
||||
@@ -1,19 +1,23 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from asgiref.sync import sync_to_async
|
||||
import asyncio
|
||||
|
||||
from abx_dl.events import BinaryEvent, BinaryInstalledEvent
|
||||
from abx_dl.services.base import BaseService
|
||||
|
||||
from .db import run_db_op
|
||||
|
||||
|
||||
class BinaryService(BaseService):
|
||||
LISTENS_TO = [BinaryEvent, BinaryInstalledEvent]
|
||||
EMITS = []
|
||||
|
||||
async def on_BinaryEvent(self, event: BinaryEvent) -> None:
|
||||
await sync_to_async(self._project_binary, thread_sensitive=True)(event)
|
||||
async def on_BinaryEvent__Outer(self, event: BinaryEvent) -> None:
|
||||
await run_db_op(self._project_binary, event)
|
||||
|
||||
async def on_BinaryInstalledEvent(self, event: BinaryInstalledEvent) -> None:
|
||||
await sync_to_async(self._project_installed_binary, thread_sensitive=True)(event)
|
||||
async def on_BinaryInstalledEvent__Outer(self, event: BinaryInstalledEvent) -> None:
|
||||
resolved = await asyncio.to_thread(self._resolve_installed_binary_metadata, event)
|
||||
await run_db_op(self._project_installed_binary, event, resolved)
|
||||
|
||||
def _project_binary(self, event: BinaryEvent) -> None:
|
||||
from archivebox.machine.models import Binary, Machine
|
||||
@@ -44,7 +48,39 @@ class BinaryService(BaseService):
|
||||
},
|
||||
)
|
||||
|
||||
def _project_installed_binary(self, event: BinaryInstalledEvent) -> None:
|
||||
def _resolve_installed_binary_metadata(self, event: BinaryInstalledEvent) -> dict[str, str]:
|
||||
resolved = {
|
||||
"abspath": event.abspath or "",
|
||||
"version": event.version or "",
|
||||
"sha256": event.sha256 or "",
|
||||
"binproviders": event.binproviders or "",
|
||||
"binprovider": event.binprovider or "",
|
||||
}
|
||||
if resolved["abspath"] and resolved["version"] and resolved["binprovider"]:
|
||||
return resolved
|
||||
|
||||
try:
|
||||
from abx_dl.dependencies import load_binary
|
||||
|
||||
allowed_providers = resolved["binproviders"] or resolved["binprovider"] or "env,pip,npm,brew,apt"
|
||||
spec = {
|
||||
"name": event.name,
|
||||
"binproviders": allowed_providers,
|
||||
"overrides": event.overrides or {},
|
||||
}
|
||||
binary = load_binary(spec)
|
||||
resolved["abspath"] = str(getattr(binary, "abspath", None) or resolved["abspath"] or "")
|
||||
resolved["version"] = str(getattr(binary, "version", None) or resolved["version"] or "")
|
||||
resolved["sha256"] = str(getattr(binary, "sha256", None) or resolved["sha256"] or "")
|
||||
provider_name = getattr(getattr(binary, "loaded_binprovider", None), "name", None)
|
||||
if provider_name:
|
||||
resolved["binprovider"] = str(provider_name)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return resolved
|
||||
|
||||
def _project_installed_binary(self, event: BinaryInstalledEvent, resolved: dict[str, str]) -> None:
|
||||
from archivebox.machine.models import Binary, Machine
|
||||
|
||||
machine = Machine.current()
|
||||
@@ -55,10 +91,14 @@ class BinaryService(BaseService):
|
||||
"status": Binary.StatusChoices.QUEUED,
|
||||
},
|
||||
)
|
||||
binary.abspath = event.abspath or binary.abspath
|
||||
binary.version = event.version or binary.version
|
||||
binary.sha256 = event.sha256 or binary.sha256
|
||||
binary.binprovider = event.binprovider or binary.binprovider
|
||||
binary.abspath = resolved["abspath"] or binary.abspath
|
||||
binary.version = resolved["version"] or binary.version
|
||||
binary.sha256 = resolved["sha256"] or binary.sha256
|
||||
if resolved["binproviders"]:
|
||||
binary.binproviders = resolved["binproviders"]
|
||||
binary.binprovider = resolved["binprovider"] or binary.binprovider
|
||||
if event.overrides and binary.overrides != event.overrides:
|
||||
binary.overrides = event.overrides
|
||||
binary.status = Binary.StatusChoices.INSTALLED
|
||||
binary.retry_at = None
|
||||
binary.save(update_fields=["abspath", "version", "sha256", "binprovider", "status", "retry_at", "modified_at"])
|
||||
binary.save(update_fields=["abspath", "version", "sha256", "binproviders", "binprovider", "overrides", "status", "retry_at", "modified_at"])
|
||||
|
||||
@@ -1,11 +1,10 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from asgiref.sync import sync_to_async
|
||||
from django.utils import timezone
|
||||
|
||||
from abx_dl.events import CrawlCleanupEvent, CrawlCompletedEvent, CrawlSetupEvent, CrawlStartEvent
|
||||
from abx_dl.services.base import BaseService
|
||||
|
||||
from .db import run_db_op
|
||||
|
||||
|
||||
class CrawlService(BaseService):
|
||||
LISTENS_TO = [CrawlSetupEvent, CrawlStartEvent, CrawlCleanupEvent, CrawlCompletedEvent]
|
||||
@@ -15,17 +14,17 @@ class CrawlService(BaseService):
|
||||
self.crawl_id = crawl_id
|
||||
super().__init__(bus)
|
||||
|
||||
async def on_CrawlSetupEvent(self, event: CrawlSetupEvent) -> None:
|
||||
await sync_to_async(self._mark_started, thread_sensitive=True)()
|
||||
async def on_CrawlSetupEvent__Outer(self, event: CrawlSetupEvent) -> None:
|
||||
await run_db_op(self._mark_started)
|
||||
|
||||
async def on_CrawlStartEvent(self, event: CrawlStartEvent) -> None:
|
||||
await sync_to_async(self._mark_started, thread_sensitive=True)()
|
||||
async def on_CrawlStartEvent__Outer(self, event: CrawlStartEvent) -> None:
|
||||
await run_db_op(self._mark_started)
|
||||
|
||||
async def on_CrawlCleanupEvent(self, event: CrawlCleanupEvent) -> None:
|
||||
await sync_to_async(self._mark_started, thread_sensitive=True)()
|
||||
async def on_CrawlCleanupEvent__Outer(self, event: CrawlCleanupEvent) -> None:
|
||||
await run_db_op(self._mark_started)
|
||||
|
||||
async def on_CrawlCompletedEvent(self, event: CrawlCompletedEvent) -> None:
|
||||
await sync_to_async(self._mark_completed, thread_sensitive=True)()
|
||||
async def on_CrawlCompletedEvent__Outer(self, event: CrawlCompletedEvent) -> None:
|
||||
await run_db_op(self._mark_completed)
|
||||
|
||||
def _mark_started(self) -> None:
|
||||
from archivebox.crawls.models import Crawl
|
||||
|
||||
16
archivebox/services/db.py
Normal file
16
archivebox/services/db.py
Normal file
@@ -0,0 +1,16 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from asgiref.sync import sync_to_async
|
||||
from django.db import close_old_connections
|
||||
|
||||
|
||||
def _run_db_op(func, *args, **kwargs):
|
||||
close_old_connections()
|
||||
try:
|
||||
return func(*args, **kwargs)
|
||||
finally:
|
||||
close_old_connections()
|
||||
|
||||
|
||||
async def run_db_op(func, *args, **kwargs):
|
||||
return await sync_to_async(_run_db_op, thread_sensitive=True)(func, *args, **kwargs)
|
||||
1
archivebox/services/live_ui.py
Normal file
1
archivebox/services/live_ui.py
Normal file
@@ -0,0 +1 @@
|
||||
from abx_dl.cli import LiveBusUI
|
||||
@@ -1,16 +1,17 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from asgiref.sync import sync_to_async
|
||||
from abx_dl.events import MachineEvent
|
||||
from abx_dl.services.base import BaseService
|
||||
|
||||
from .db import run_db_op
|
||||
|
||||
|
||||
class MachineService(BaseService):
|
||||
LISTENS_TO = [MachineEvent]
|
||||
EMITS = []
|
||||
|
||||
async def on_MachineEvent(self, event: MachineEvent) -> None:
|
||||
await sync_to_async(self._project, thread_sensitive=True)(event)
|
||||
async def on_MachineEvent__Outer(self, event: MachineEvent) -> None:
|
||||
await run_db_op(self._project, event)
|
||||
|
||||
def _project(self, event: MachineEvent) -> None:
|
||||
from archivebox.machine.models import Machine
|
||||
|
||||
@@ -3,12 +3,13 @@ from __future__ import annotations
|
||||
from datetime import datetime
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from asgiref.sync import sync_to_async
|
||||
from django.utils import timezone
|
||||
|
||||
from abx_dl.events import ProcessCompletedEvent, ProcessStartedEvent
|
||||
from abx_dl.services.base import BaseService
|
||||
|
||||
from .db import run_db_op
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from archivebox.machine.models import Process
|
||||
|
||||
@@ -33,27 +34,33 @@ class ProcessService(BaseService):
|
||||
self.process_ids: dict[str, str] = {}
|
||||
super().__init__(bus)
|
||||
|
||||
async def on_ProcessStartedEvent(self, event: ProcessStartedEvent) -> None:
|
||||
await sync_to_async(self._project_started, thread_sensitive=True)(event)
|
||||
async def on_ProcessStartedEvent__Outer(self, event: ProcessStartedEvent) -> None:
|
||||
await run_db_op(self._project_started, event)
|
||||
|
||||
async def on_ProcessCompletedEvent(self, event: ProcessCompletedEvent) -> None:
|
||||
await sync_to_async(self._project_completed, thread_sensitive=True)(event)
|
||||
async def on_ProcessCompletedEvent__Outer(self, event: ProcessCompletedEvent) -> None:
|
||||
await run_db_op(self._project_completed, event)
|
||||
|
||||
def get_db_process_id(self, process_id: str) -> str | None:
|
||||
return self.process_ids.get(process_id)
|
||||
|
||||
def _get_or_create_process(self, event: ProcessStartedEvent | ProcessCompletedEvent) -> "Process":
|
||||
from archivebox.machine.models import Machine, Process
|
||||
from archivebox.machine.models import NetworkInterface, Process
|
||||
|
||||
db_process_id = self.process_ids.get(event.process_id)
|
||||
iface = NetworkInterface.current(refresh=True)
|
||||
if db_process_id:
|
||||
process = Process.objects.filter(id=db_process_id).first()
|
||||
if process is not None:
|
||||
if process.iface_id != iface.id or process.machine_id != iface.machine_id:
|
||||
process.iface = iface
|
||||
process.machine = iface.machine
|
||||
process.save(update_fields=["iface", "machine", "modified_at"])
|
||||
return process
|
||||
|
||||
process_type = Process.TypeChoices.BINARY if event.hook_name.startswith("on_Binary") else Process.TypeChoices.HOOK
|
||||
process = Process.objects.create(
|
||||
machine=Machine.current(),
|
||||
machine=iface.machine,
|
||||
iface=iface,
|
||||
process_type=process_type,
|
||||
pwd=event.output_dir,
|
||||
cmd=[event.hook_path, *event.hook_args],
|
||||
@@ -77,12 +84,14 @@ class ProcessService(BaseService):
|
||||
process.started_at = parse_event_datetime(event.start_ts) or process.started_at or timezone.now()
|
||||
process.status = process.StatusChoices.RUNNING
|
||||
process.retry_at = None
|
||||
process.hydrate_binary_from_context(plugin_name=event.plugin_name, hook_path=event.hook_path)
|
||||
process.save()
|
||||
|
||||
def _project_completed(self, event: ProcessCompletedEvent) -> None:
|
||||
process = self._get_or_create_process(event)
|
||||
process.pwd = event.output_dir
|
||||
process.cmd = [event.hook_path, *event.hook_args]
|
||||
if not process.cmd:
|
||||
process.cmd = [event.hook_path, *event.hook_args]
|
||||
process.env = event.env
|
||||
process.pid = event.pid or process.pid
|
||||
process.started_at = parse_event_datetime(event.start_ts) or process.started_at
|
||||
@@ -92,4 +101,5 @@ class ProcessService(BaseService):
|
||||
process.exit_code = event.exit_code
|
||||
process.status = process.StatusChoices.EXITED
|
||||
process.retry_at = None
|
||||
process.hydrate_binary_from_context(plugin_name=event.plugin_name, hook_path=event.hook_path)
|
||||
process.save()
|
||||
|
||||
@@ -3,16 +3,21 @@ from __future__ import annotations
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
from contextlib import nullcontext
|
||||
from pathlib import Path
|
||||
from tempfile import TemporaryDirectory
|
||||
from typing import Any
|
||||
|
||||
from django.utils import timezone
|
||||
from rich.console import Console
|
||||
|
||||
from abx_dl.events import BinaryEvent
|
||||
from abx_dl.models import INSTALL_URL, Snapshot as AbxSnapshot, discover_plugins
|
||||
from abx_dl.orchestrator import create_bus, download, install_plugins as abx_install_plugins, setup_services as setup_abx_services
|
||||
from abx_dl.models import INSTALL_URL, Plugin, Snapshot as AbxSnapshot, discover_plugins, filter_plugins
|
||||
from abx_dl.orchestrator import create_bus, download, install_plugins as abx_install_plugins, prepare_install_plugins, setup_services as setup_abx_services
|
||||
|
||||
from .archive_result_service import ArchiveResultService
|
||||
from .binary_service import BinaryService
|
||||
@@ -21,6 +26,7 @@ from .machine_service import MachineService
|
||||
from .process_service import ProcessService
|
||||
from .snapshot_service import SnapshotService
|
||||
from .tag_service import TagService
|
||||
from .live_ui import LiveBusUI
|
||||
|
||||
|
||||
def _bus_name(prefix: str, identifier: str) -> str:
|
||||
@@ -35,6 +41,19 @@ def _selected_plugins_from_config(config: dict[str, Any]) -> list[str] | None:
|
||||
return [name.strip() for name in raw.split(",") if name.strip()]
|
||||
|
||||
|
||||
def _count_selected_hooks(plugins: dict[str, Plugin], selected_plugins: list[str] | None) -> int:
|
||||
selected = filter_plugins(plugins, selected_plugins) if selected_plugins else plugins
|
||||
total = 0
|
||||
for plugin in selected.values():
|
||||
total += len(list(plugin.get_crawl_hooks()))
|
||||
total += len(list(plugin.get_snapshot_hooks()))
|
||||
return total
|
||||
|
||||
|
||||
def _runner_debug(message: str) -> None:
|
||||
print(f"[runner] {message}", file=sys.stderr, flush=True)
|
||||
|
||||
|
||||
def _attach_bus_trace(bus) -> None:
|
||||
trace_target = (os.environ.get("ARCHIVEBOX_BUS_TRACE") or "").strip()
|
||||
if not trace_target:
|
||||
@@ -78,10 +97,51 @@ async def _stop_bus_trace(bus) -> None:
|
||||
bus._archivebox_trace_task = None
|
||||
|
||||
|
||||
def ensure_background_runner(*, allow_under_pytest: bool = False) -> bool:
|
||||
if os.environ.get("PYTEST_CURRENT_TEST") and not allow_under_pytest:
|
||||
return False
|
||||
|
||||
from archivebox.config import CONSTANTS
|
||||
from archivebox.machine.models import Machine, Process
|
||||
|
||||
Process.cleanup_stale_running()
|
||||
machine = Machine.current()
|
||||
if Process.objects.filter(
|
||||
machine=machine,
|
||||
status=Process.StatusChoices.RUNNING,
|
||||
process_type=Process.TypeChoices.ORCHESTRATOR,
|
||||
).exists():
|
||||
return False
|
||||
|
||||
log_path = CONSTANTS.LOGS_DIR / "errors.log"
|
||||
log_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
env = os.environ.copy()
|
||||
env.setdefault("DATA_DIR", str(CONSTANTS.DATA_DIR))
|
||||
|
||||
with log_path.open("a", encoding="utf-8") as log_handle:
|
||||
subprocess.Popen(
|
||||
[sys.executable, "-m", "archivebox", "run", "--daemon"],
|
||||
cwd=str(CONSTANTS.DATA_DIR),
|
||||
env=env,
|
||||
stdin=subprocess.DEVNULL,
|
||||
stdout=log_handle,
|
||||
stderr=log_handle,
|
||||
start_new_session=True,
|
||||
)
|
||||
return True
|
||||
|
||||
|
||||
class CrawlRunner:
|
||||
MAX_CONCURRENT_SNAPSHOTS = 8
|
||||
|
||||
def __init__(self, crawl, *, snapshot_ids: list[str] | None = None, selected_plugins: list[str] | None = None):
|
||||
def __init__(
|
||||
self,
|
||||
crawl,
|
||||
*,
|
||||
snapshot_ids: list[str] | None = None,
|
||||
selected_plugins: list[str] | None = None,
|
||||
process_discovered_snapshots_inline: bool = True,
|
||||
):
|
||||
self.crawl = crawl
|
||||
self.bus = create_bus(name=_bus_name("ArchiveBox", str(crawl.id)), total_timeout=3600.0)
|
||||
self.plugins = discover_plugins()
|
||||
@@ -90,7 +150,12 @@ class CrawlRunner:
|
||||
self.binary_service = BinaryService(self.bus)
|
||||
self.tag_service = TagService(self.bus)
|
||||
self.crawl_service = CrawlService(self.bus, crawl_id=str(crawl.id))
|
||||
self.snapshot_service = SnapshotService(self.bus, crawl_id=str(crawl.id), schedule_snapshot=self.enqueue_snapshot)
|
||||
self.process_discovered_snapshots_inline = process_discovered_snapshots_inline
|
||||
self.snapshot_service = SnapshotService(
|
||||
self.bus,
|
||||
crawl_id=str(crawl.id),
|
||||
schedule_snapshot=self.enqueue_snapshot if process_discovered_snapshots_inline else self.leave_snapshot_queued,
|
||||
)
|
||||
self.archive_result_service = ArchiveResultService(self.bus, process_service=self.process_service)
|
||||
self.selected_plugins = selected_plugins
|
||||
self.initial_snapshot_ids = snapshot_ids
|
||||
@@ -100,6 +165,29 @@ class CrawlRunner:
|
||||
self.persona = None
|
||||
self.base_config: dict[str, Any] = {}
|
||||
self.primary_url = ""
|
||||
self._live_stream = None
|
||||
|
||||
def _create_projector_bus(self, *, identifier: str, config_overrides: dict[str, Any]):
|
||||
bus = create_bus(name=_bus_name("ArchiveBox", identifier), total_timeout=3600.0)
|
||||
process_service = ProcessService(bus)
|
||||
MachineService(bus)
|
||||
BinaryService(bus)
|
||||
TagService(bus)
|
||||
CrawlService(bus, crawl_id=str(self.crawl.id))
|
||||
SnapshotService(
|
||||
bus,
|
||||
crawl_id=str(self.crawl.id),
|
||||
schedule_snapshot=self.enqueue_snapshot if self.process_discovered_snapshots_inline else self.leave_snapshot_queued,
|
||||
)
|
||||
ArchiveResultService(bus, process_service=process_service)
|
||||
abx_services = setup_abx_services(
|
||||
bus,
|
||||
plugins=self.plugins,
|
||||
config_overrides=config_overrides,
|
||||
auto_install=True,
|
||||
emit_jsonl=False,
|
||||
)
|
||||
return bus, abx_services
|
||||
|
||||
async def run(self) -> None:
|
||||
from asgiref.sync import sync_to_async
|
||||
@@ -107,35 +195,63 @@ class CrawlRunner:
|
||||
|
||||
try:
|
||||
await sync_to_async(self._prepare, thread_sensitive=True)()
|
||||
_attach_bus_trace(self.bus)
|
||||
self.abx_services = setup_abx_services(
|
||||
self.bus,
|
||||
plugins=self.plugins,
|
||||
config_overrides=self.base_config,
|
||||
auto_install=True,
|
||||
emit_jsonl=False,
|
||||
)
|
||||
if self.crawl.get_system_task() == INSTALL_URL:
|
||||
await self._run_install_crawl()
|
||||
else:
|
||||
snapshot_ids = await sync_to_async(self._initial_snapshot_ids, thread_sensitive=True)()
|
||||
if snapshot_ids:
|
||||
root_snapshot_id = snapshot_ids[0]
|
||||
await self._run_crawl_setup(root_snapshot_id)
|
||||
for snapshot_id in snapshot_ids:
|
||||
await self.enqueue_snapshot(snapshot_id)
|
||||
await self._wait_for_snapshot_tasks()
|
||||
await self._run_crawl_cleanup(root_snapshot_id)
|
||||
if self.abx_services is not None:
|
||||
await self.abx_services.process.wait_for_background_monitors()
|
||||
live_ui = self._create_live_ui()
|
||||
with live_ui if live_ui is not None else nullcontext():
|
||||
_attach_bus_trace(self.bus)
|
||||
self.abx_services = setup_abx_services(
|
||||
self.bus,
|
||||
plugins=self.plugins,
|
||||
config_overrides=self.base_config,
|
||||
auto_install=True,
|
||||
emit_jsonl=False,
|
||||
)
|
||||
if self.crawl.get_system_task() == INSTALL_URL:
|
||||
await self._run_install_crawl()
|
||||
else:
|
||||
snapshot_ids = await sync_to_async(self._initial_snapshot_ids, thread_sensitive=True)()
|
||||
if snapshot_ids:
|
||||
root_snapshot_id = snapshot_ids[0]
|
||||
_runner_debug(f"crawl {self.crawl.id} starting crawl setup root_snapshot={root_snapshot_id}")
|
||||
await self._run_crawl_setup(root_snapshot_id)
|
||||
_runner_debug(f"crawl {self.crawl.id} finished crawl setup root_snapshot={root_snapshot_id}")
|
||||
for snapshot_id in snapshot_ids:
|
||||
await self.enqueue_snapshot(snapshot_id)
|
||||
_runner_debug(f"crawl {self.crawl.id} waiting for snapshot tasks count={len(self.snapshot_tasks)}")
|
||||
await self._wait_for_snapshot_tasks()
|
||||
_runner_debug(f"crawl {self.crawl.id} finished waiting for snapshot tasks")
|
||||
_runner_debug(f"crawl {self.crawl.id} starting django crawl.cleanup()")
|
||||
await sync_to_async(self.crawl.cleanup, thread_sensitive=True)()
|
||||
_runner_debug(f"crawl {self.crawl.id} finished django crawl.cleanup()")
|
||||
_runner_debug(f"crawl {self.crawl.id} starting abx crawl cleanup root_snapshot={root_snapshot_id}")
|
||||
await self._run_crawl_cleanup(root_snapshot_id)
|
||||
_runner_debug(f"crawl {self.crawl.id} finished abx crawl cleanup root_snapshot={root_snapshot_id}")
|
||||
if self.abx_services is not None:
|
||||
_runner_debug(f"crawl {self.crawl.id} waiting for main bus background monitors")
|
||||
await self.abx_services.process.wait_for_background_monitors()
|
||||
_runner_debug(f"crawl {self.crawl.id} finished waiting for main bus background monitors")
|
||||
finally:
|
||||
await _stop_bus_trace(self.bus)
|
||||
await self.bus.stop()
|
||||
if self._live_stream is not None:
|
||||
try:
|
||||
self._live_stream.close()
|
||||
except Exception:
|
||||
pass
|
||||
self._live_stream = None
|
||||
await sync_to_async(self._cleanup_persona, thread_sensitive=True)()
|
||||
crawl = await sync_to_async(Crawl.objects.get, thread_sensitive=True)(id=self.crawl.id)
|
||||
if crawl.status != Crawl.StatusChoices.SEALED:
|
||||
crawl.status = Crawl.StatusChoices.SEALED
|
||||
crawl.retry_at = None
|
||||
crawl_is_finished = await sync_to_async(crawl.is_finished, thread_sensitive=True)()
|
||||
if crawl_is_finished:
|
||||
if crawl.status != Crawl.StatusChoices.SEALED:
|
||||
crawl.status = Crawl.StatusChoices.SEALED
|
||||
crawl.retry_at = None
|
||||
await sync_to_async(crawl.save, thread_sensitive=True)(update_fields=["status", "retry_at", "modified_at"])
|
||||
else:
|
||||
if crawl.status == Crawl.StatusChoices.SEALED:
|
||||
crawl.status = Crawl.StatusChoices.QUEUED
|
||||
elif crawl.status != Crawl.StatusChoices.STARTED:
|
||||
crawl.status = Crawl.StatusChoices.STARTED
|
||||
crawl.retry_at = crawl.retry_at or timezone.now()
|
||||
await sync_to_async(crawl.save, thread_sensitive=True)(update_fields=["status", "retry_at", "modified_at"])
|
||||
|
||||
async def enqueue_snapshot(self, snapshot_id: str) -> None:
|
||||
@@ -145,17 +261,36 @@ class CrawlRunner:
|
||||
task = asyncio.create_task(self._run_snapshot(snapshot_id))
|
||||
self.snapshot_tasks[snapshot_id] = task
|
||||
|
||||
async def leave_snapshot_queued(self, snapshot_id: str) -> None:
|
||||
return None
|
||||
|
||||
async def _wait_for_snapshot_tasks(self) -> None:
|
||||
while True:
|
||||
active = [task for task in self.snapshot_tasks.values() if not task.done()]
|
||||
if not active:
|
||||
pending_tasks: list[asyncio.Task[None]] = []
|
||||
for snapshot_id, task in list(self.snapshot_tasks.items()):
|
||||
if task.done():
|
||||
if self.snapshot_tasks.get(snapshot_id) is task:
|
||||
self.snapshot_tasks.pop(snapshot_id, None)
|
||||
task.result()
|
||||
continue
|
||||
pending_tasks.append(task)
|
||||
if not pending_tasks:
|
||||
return
|
||||
await asyncio.gather(*active)
|
||||
done, _pending = await asyncio.wait(pending_tasks, return_when=asyncio.FIRST_COMPLETED)
|
||||
for task in done:
|
||||
task.result()
|
||||
|
||||
def _prepare(self) -> None:
|
||||
from archivebox.config.configset import get_config
|
||||
from archivebox.machine.models import NetworkInterface, Process
|
||||
|
||||
self.primary_url = self.crawl.get_urls_list()[0] if self.crawl.get_urls_list() else ""
|
||||
current_iface = NetworkInterface.current(refresh=True)
|
||||
current_process = Process.current()
|
||||
if current_process.iface_id != current_iface.id or current_process.machine_id != current_iface.machine_id:
|
||||
current_process.iface = current_iface
|
||||
current_process.machine = current_iface.machine
|
||||
current_process.save(update_fields=["iface", "machine", "modified_at"])
|
||||
self.persona = self.crawl.resolve_persona()
|
||||
self.base_config = get_config(crawl=self.crawl)
|
||||
if self.selected_plugins is None:
|
||||
@@ -168,6 +303,52 @@ class CrawlRunner:
|
||||
if self.persona:
|
||||
self.persona.cleanup_runtime_for_crawl(self.crawl)
|
||||
|
||||
def _create_live_ui(self) -> LiveBusUI | None:
|
||||
stdout_is_tty = sys.stdout.isatty()
|
||||
stderr_is_tty = sys.stderr.isatty()
|
||||
interactive_tty = stdout_is_tty or stderr_is_tty
|
||||
if not interactive_tty:
|
||||
return None
|
||||
stream = sys.stderr if stderr_is_tty else sys.stdout
|
||||
if os.path.exists("/dev/tty"):
|
||||
try:
|
||||
self._live_stream = open("/dev/tty", "w", buffering=1, encoding=getattr(stream, "encoding", None) or "utf-8")
|
||||
stream = self._live_stream
|
||||
except OSError:
|
||||
self._live_stream = None
|
||||
try:
|
||||
terminal_size = os.get_terminal_size(stream.fileno())
|
||||
terminal_width = terminal_size.columns
|
||||
terminal_height = terminal_size.lines
|
||||
except (AttributeError, OSError, ValueError):
|
||||
terminal_size = shutil.get_terminal_size(fallback=(160, 40))
|
||||
terminal_width = terminal_size.columns
|
||||
terminal_height = terminal_size.lines
|
||||
ui_console = Console(
|
||||
file=stream,
|
||||
force_terminal=True,
|
||||
width=terminal_width,
|
||||
height=terminal_height,
|
||||
_environ={
|
||||
"COLUMNS": str(terminal_width),
|
||||
"LINES": str(terminal_height),
|
||||
},
|
||||
)
|
||||
plugins_label = ", ".join(self.selected_plugins) if self.selected_plugins else f"all ({len(self.plugins)} available)"
|
||||
live_ui = LiveBusUI(
|
||||
self.bus,
|
||||
total_hooks=_count_selected_hooks(self.plugins, self.selected_plugins),
|
||||
timeout_seconds=int(self.base_config.get("TIMEOUT") or 60),
|
||||
ui_console=ui_console,
|
||||
interactive_tty=True,
|
||||
)
|
||||
live_ui.print_intro(
|
||||
url=self.primary_url or INSTALL_URL,
|
||||
output_dir=Path(self.crawl.output_dir),
|
||||
plugins_label=plugins_label,
|
||||
)
|
||||
return live_ui
|
||||
|
||||
def _create_root_snapshots(self) -> list[str]:
|
||||
created = self.crawl.create_snapshots_from_urls()
|
||||
snapshots = created or list(self.crawl.snapshot_set.filter(depth=0).order_by("created_at"))
|
||||
@@ -290,18 +471,34 @@ class CrawlRunner:
|
||||
parent_snapshot_id=snapshot["parent_snapshot_id"],
|
||||
crawl_id=str(self.crawl.id),
|
||||
)
|
||||
await download(
|
||||
url=snapshot["url"],
|
||||
plugins=self.plugins,
|
||||
output_dir=Path(snapshot["output_dir"]),
|
||||
selected_plugins=self.selected_plugins,
|
||||
snapshot_bus, snapshot_services = self._create_projector_bus(
|
||||
identifier=f"{self.crawl.id}_{snapshot['id']}",
|
||||
config_overrides=snapshot["config"],
|
||||
bus=self.bus,
|
||||
emit_jsonl=False,
|
||||
snapshot=abx_snapshot,
|
||||
skip_crawl_setup=True,
|
||||
skip_crawl_cleanup=True,
|
||||
)
|
||||
try:
|
||||
_attach_bus_trace(snapshot_bus)
|
||||
_runner_debug(f"snapshot {snapshot_id} starting download()")
|
||||
await download(
|
||||
url=snapshot["url"],
|
||||
plugins=self.plugins,
|
||||
output_dir=Path(snapshot["output_dir"]),
|
||||
selected_plugins=self.selected_plugins,
|
||||
config_overrides=snapshot["config"],
|
||||
bus=snapshot_bus,
|
||||
emit_jsonl=False,
|
||||
snapshot=abx_snapshot,
|
||||
skip_crawl_setup=True,
|
||||
skip_crawl_cleanup=True,
|
||||
)
|
||||
_runner_debug(f"snapshot {snapshot_id} finished download(), waiting for background monitors")
|
||||
await snapshot_services.process.wait_for_background_monitors()
|
||||
_runner_debug(f"snapshot {snapshot_id} finished waiting for background monitors")
|
||||
finally:
|
||||
current_task = asyncio.current_task()
|
||||
if current_task is not None and self.snapshot_tasks.get(snapshot_id) is current_task:
|
||||
self.snapshot_tasks.pop(snapshot_id, None)
|
||||
await _stop_bus_trace(snapshot_bus)
|
||||
await snapshot_bus.stop()
|
||||
|
||||
def _load_snapshot_run_data(self, snapshot_id: str):
|
||||
from archivebox.core.models import Snapshot
|
||||
@@ -322,11 +519,24 @@ class CrawlRunner:
|
||||
}
|
||||
|
||||
|
||||
def run_crawl(crawl_id: str, *, snapshot_ids: list[str] | None = None, selected_plugins: list[str] | None = None) -> None:
|
||||
def run_crawl(
|
||||
crawl_id: str,
|
||||
*,
|
||||
snapshot_ids: list[str] | None = None,
|
||||
selected_plugins: list[str] | None = None,
|
||||
process_discovered_snapshots_inline: bool = True,
|
||||
) -> None:
|
||||
from archivebox.crawls.models import Crawl
|
||||
|
||||
crawl = Crawl.objects.get(id=crawl_id)
|
||||
asyncio.run(CrawlRunner(crawl, snapshot_ids=snapshot_ids, selected_plugins=selected_plugins).run())
|
||||
asyncio.run(
|
||||
CrawlRunner(
|
||||
crawl,
|
||||
snapshot_ids=snapshot_ids,
|
||||
selected_plugins=selected_plugins,
|
||||
process_discovered_snapshots_inline=process_discovered_snapshots_inline,
|
||||
).run()
|
||||
)
|
||||
|
||||
|
||||
async def _run_binary(binary_id: str) -> None:
|
||||
@@ -397,28 +607,203 @@ async def _run_install(plugin_names: list[str] | None = None) -> None:
|
||||
BinaryService(bus)
|
||||
TagService(bus)
|
||||
ArchiveResultService(bus, process_service=process_service)
|
||||
live_stream = None
|
||||
|
||||
try:
|
||||
_attach_bus_trace(bus)
|
||||
await abx_install_plugins(
|
||||
plugin_names=plugin_names,
|
||||
plugins=plugins,
|
||||
config_overrides=config,
|
||||
emit_jsonl=False,
|
||||
bus=bus,
|
||||
)
|
||||
await abx_services.process.wait_for_background_monitors()
|
||||
selected_plugins = prepare_install_plugins(plugins, plugin_names=plugin_names)
|
||||
plugins_label = ", ".join(plugin_names) if plugin_names else f"all ({len(plugins)} available)"
|
||||
timeout_seconds = int(config.get("TIMEOUT") or 60)
|
||||
stdout_is_tty = sys.stdout.isatty()
|
||||
stderr_is_tty = sys.stderr.isatty()
|
||||
interactive_tty = stdout_is_tty or stderr_is_tty
|
||||
ui_console = None
|
||||
live_ui = None
|
||||
|
||||
if interactive_tty:
|
||||
stream = sys.stderr if stderr_is_tty else sys.stdout
|
||||
if os.path.exists("/dev/tty"):
|
||||
try:
|
||||
live_stream = open("/dev/tty", "w", buffering=1, encoding=getattr(stream, "encoding", None) or "utf-8")
|
||||
stream = live_stream
|
||||
except OSError:
|
||||
live_stream = None
|
||||
try:
|
||||
terminal_size = os.get_terminal_size(stream.fileno())
|
||||
terminal_width = terminal_size.columns
|
||||
terminal_height = terminal_size.lines
|
||||
except (AttributeError, OSError, ValueError):
|
||||
terminal_size = shutil.get_terminal_size(fallback=(160, 40))
|
||||
terminal_width = terminal_size.columns
|
||||
terminal_height = terminal_size.lines
|
||||
ui_console = Console(
|
||||
file=stream,
|
||||
force_terminal=True,
|
||||
width=terminal_width,
|
||||
height=terminal_height,
|
||||
_environ={
|
||||
"COLUMNS": str(terminal_width),
|
||||
"LINES": str(terminal_height),
|
||||
},
|
||||
)
|
||||
|
||||
with TemporaryDirectory(prefix="archivebox-install-") as temp_dir:
|
||||
output_dir = Path(temp_dir)
|
||||
if ui_console is not None:
|
||||
live_ui = LiveBusUI(
|
||||
bus,
|
||||
total_hooks=_count_selected_hooks(selected_plugins, None),
|
||||
timeout_seconds=timeout_seconds,
|
||||
ui_console=ui_console,
|
||||
interactive_tty=interactive_tty,
|
||||
)
|
||||
live_ui.print_intro(
|
||||
url=INSTALL_URL,
|
||||
output_dir=output_dir,
|
||||
plugins_label=plugins_label,
|
||||
)
|
||||
with live_ui if live_ui is not None else nullcontext():
|
||||
_attach_bus_trace(bus)
|
||||
results = await abx_install_plugins(
|
||||
plugin_names=plugin_names,
|
||||
plugins=plugins,
|
||||
output_dir=output_dir,
|
||||
config_overrides=config,
|
||||
emit_jsonl=False,
|
||||
bus=bus,
|
||||
)
|
||||
await abx_services.process.wait_for_background_monitors()
|
||||
if live_ui is not None:
|
||||
live_ui.print_summary(results, output_dir=output_dir)
|
||||
finally:
|
||||
await _stop_bus_trace(bus)
|
||||
await bus.stop()
|
||||
try:
|
||||
if live_stream is not None:
|
||||
live_stream.close()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
def run_install(*, plugin_names: list[str] | None = None) -> None:
|
||||
asyncio.run(_run_install(plugin_names=plugin_names))
|
||||
|
||||
|
||||
def recover_orphaned_crawls() -> int:
|
||||
from archivebox.crawls.models import Crawl
|
||||
from archivebox.core.models import Snapshot
|
||||
from archivebox.machine.models import Process
|
||||
|
||||
active_crawl_ids: set[str] = set()
|
||||
running_processes = Process.objects.filter(
|
||||
status=Process.StatusChoices.RUNNING,
|
||||
process_type__in=[
|
||||
Process.TypeChoices.WORKER,
|
||||
Process.TypeChoices.HOOK,
|
||||
Process.TypeChoices.BINARY,
|
||||
],
|
||||
).only("env")
|
||||
|
||||
for proc in running_processes:
|
||||
env = proc.env or {}
|
||||
if not isinstance(env, dict):
|
||||
continue
|
||||
crawl_id = env.get("CRAWL_ID")
|
||||
if crawl_id:
|
||||
active_crawl_ids.add(str(crawl_id))
|
||||
|
||||
recovered = 0
|
||||
now = timezone.now()
|
||||
orphaned_crawls = Crawl.objects.filter(
|
||||
status=Crawl.StatusChoices.STARTED,
|
||||
retry_at__isnull=True,
|
||||
).prefetch_related("snapshot_set")
|
||||
|
||||
for crawl in orphaned_crawls:
|
||||
if str(crawl.id) in active_crawl_ids:
|
||||
continue
|
||||
|
||||
snapshots = list(crawl.snapshot_set.all())
|
||||
if not snapshots or all(snapshot.status == Snapshot.StatusChoices.SEALED for snapshot in snapshots):
|
||||
crawl.status = Crawl.StatusChoices.SEALED
|
||||
crawl.retry_at = None
|
||||
crawl.save(update_fields=["status", "retry_at", "modified_at"])
|
||||
recovered += 1
|
||||
continue
|
||||
|
||||
crawl.retry_at = now
|
||||
crawl.save(update_fields=["retry_at", "modified_at"])
|
||||
recovered += 1
|
||||
|
||||
return recovered
|
||||
|
||||
|
||||
def recover_orphaned_snapshots() -> int:
|
||||
from archivebox.crawls.models import Crawl
|
||||
from archivebox.core.models import ArchiveResult, Snapshot
|
||||
from archivebox.machine.models import Process
|
||||
|
||||
active_snapshot_ids: set[str] = set()
|
||||
running_processes = Process.objects.filter(
|
||||
status=Process.StatusChoices.RUNNING,
|
||||
process_type__in=[
|
||||
Process.TypeChoices.WORKER,
|
||||
Process.TypeChoices.HOOK,
|
||||
Process.TypeChoices.BINARY,
|
||||
],
|
||||
).only("env")
|
||||
|
||||
for proc in running_processes:
|
||||
env = proc.env or {}
|
||||
if not isinstance(env, dict):
|
||||
continue
|
||||
snapshot_id = env.get("SNAPSHOT_ID")
|
||||
if snapshot_id:
|
||||
active_snapshot_ids.add(str(snapshot_id))
|
||||
|
||||
recovered = 0
|
||||
now = timezone.now()
|
||||
orphaned_snapshots = (
|
||||
Snapshot.objects
|
||||
.filter(status=Snapshot.StatusChoices.STARTED, retry_at__isnull=True)
|
||||
.select_related("crawl")
|
||||
.prefetch_related("archiveresult_set")
|
||||
)
|
||||
|
||||
for snapshot in orphaned_snapshots:
|
||||
if str(snapshot.id) in active_snapshot_ids:
|
||||
continue
|
||||
|
||||
results = list(snapshot.archiveresult_set.all())
|
||||
if results and all(result.status in ArchiveResult.FINAL_STATES for result in results):
|
||||
snapshot.status = Snapshot.StatusChoices.SEALED
|
||||
snapshot.retry_at = None
|
||||
snapshot.downloaded_at = snapshot.downloaded_at or now
|
||||
snapshot.save(update_fields=["status", "retry_at", "downloaded_at", "modified_at"])
|
||||
|
||||
crawl = snapshot.crawl
|
||||
if crawl.is_finished() and crawl.status != Crawl.StatusChoices.SEALED:
|
||||
crawl.status = Crawl.StatusChoices.SEALED
|
||||
crawl.retry_at = None
|
||||
crawl.save(update_fields=["status", "retry_at", "modified_at"])
|
||||
recovered += 1
|
||||
continue
|
||||
|
||||
snapshot.status = Snapshot.StatusChoices.QUEUED
|
||||
snapshot.retry_at = now
|
||||
snapshot.save(update_fields=["status", "retry_at", "modified_at"])
|
||||
|
||||
crawl = snapshot.crawl
|
||||
crawl.status = Crawl.StatusChoices.QUEUED
|
||||
crawl.retry_at = now
|
||||
crawl.save(update_fields=["status", "retry_at", "modified_at"])
|
||||
recovered += 1
|
||||
|
||||
return recovered
|
||||
|
||||
|
||||
def run_pending_crawls(*, daemon: bool = False, crawl_id: str | None = None) -> int:
|
||||
from archivebox.crawls.models import Crawl, CrawlSchedule
|
||||
from archivebox.core.models import Snapshot
|
||||
from archivebox.machine.models import Binary
|
||||
|
||||
while True:
|
||||
@@ -436,10 +821,48 @@ def run_pending_crawls(*, daemon: bool = False, crawl_id: str | None = None) ->
|
||||
.first()
|
||||
)
|
||||
if binary is not None:
|
||||
if not binary.claim_processing_lock(lock_seconds=60):
|
||||
continue
|
||||
run_binary(str(binary.id))
|
||||
continue
|
||||
|
||||
pending = Crawl.objects.filter(retry_at__lte=timezone.now()).exclude(status=Crawl.StatusChoices.SEALED)
|
||||
queued_crawls = Crawl.objects.filter(
|
||||
retry_at__lte=timezone.now(),
|
||||
status=Crawl.StatusChoices.QUEUED,
|
||||
)
|
||||
if crawl_id:
|
||||
queued_crawls = queued_crawls.filter(id=crawl_id)
|
||||
queued_crawls = queued_crawls.order_by("retry_at", "created_at")
|
||||
|
||||
queued_crawl = queued_crawls.first()
|
||||
if queued_crawl is not None:
|
||||
if not queued_crawl.claim_processing_lock(lock_seconds=60):
|
||||
continue
|
||||
run_crawl(str(queued_crawl.id), process_discovered_snapshots_inline=False)
|
||||
continue
|
||||
|
||||
if crawl_id is None:
|
||||
snapshot = (
|
||||
Snapshot.objects.filter(retry_at__lte=timezone.now())
|
||||
.exclude(status=Snapshot.StatusChoices.SEALED)
|
||||
.select_related("crawl")
|
||||
.order_by("retry_at", "created_at")
|
||||
.first()
|
||||
)
|
||||
if snapshot is not None:
|
||||
if not snapshot.claim_processing_lock(lock_seconds=60):
|
||||
continue
|
||||
run_crawl(
|
||||
str(snapshot.crawl_id),
|
||||
snapshot_ids=[str(snapshot.id)],
|
||||
process_discovered_snapshots_inline=False,
|
||||
)
|
||||
continue
|
||||
|
||||
pending = Crawl.objects.filter(
|
||||
retry_at__lte=timezone.now(),
|
||||
status=Crawl.StatusChoices.STARTED,
|
||||
)
|
||||
if crawl_id:
|
||||
pending = pending.filter(id=crawl_id)
|
||||
pending = pending.order_by("retry_at", "created_at")
|
||||
@@ -451,4 +874,7 @@ def run_pending_crawls(*, daemon: bool = False, crawl_id: str | None = None) ->
|
||||
continue
|
||||
return 0
|
||||
|
||||
run_crawl(str(crawl.id))
|
||||
if not crawl.claim_processing_lock(lock_seconds=60):
|
||||
continue
|
||||
|
||||
run_crawl(str(crawl.id), process_discovered_snapshots_inline=False)
|
||||
|
||||
@@ -1,13 +1,13 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
|
||||
from asgiref.sync import sync_to_async
|
||||
from django.utils import timezone
|
||||
|
||||
from abx_dl.events import SnapshotCompletedEvent, SnapshotEvent
|
||||
from abx_dl.services.base import BaseService
|
||||
|
||||
from .db import run_db_op
|
||||
|
||||
|
||||
class SnapshotService(BaseService):
|
||||
LISTENS_TO = [SnapshotEvent, SnapshotCompletedEvent]
|
||||
@@ -18,13 +18,17 @@ class SnapshotService(BaseService):
|
||||
self.schedule_snapshot = schedule_snapshot
|
||||
super().__init__(bus)
|
||||
|
||||
async def on_SnapshotEvent(self, event: SnapshotEvent) -> None:
|
||||
snapshot_id = await sync_to_async(self._project_snapshot, thread_sensitive=True)(event)
|
||||
async def on_SnapshotEvent__Outer(self, event: SnapshotEvent) -> None:
|
||||
snapshot_id = await run_db_op(self._project_snapshot, event)
|
||||
if snapshot_id:
|
||||
await sync_to_async(self._ensure_crawl_symlink)(snapshot_id)
|
||||
if snapshot_id and event.depth > 0:
|
||||
await self.schedule_snapshot(snapshot_id)
|
||||
|
||||
async def on_SnapshotCompletedEvent(self, event: SnapshotCompletedEvent) -> None:
|
||||
await sync_to_async(self._seal_snapshot, thread_sensitive=True)(event.snapshot_id)
|
||||
async def on_SnapshotCompletedEvent__Outer(self, event: SnapshotCompletedEvent) -> None:
|
||||
snapshot_id = await run_db_op(self._seal_snapshot, event.snapshot_id)
|
||||
if snapshot_id:
|
||||
await sync_to_async(self._write_snapshot_details)(snapshot_id)
|
||||
|
||||
def _project_snapshot(self, event: SnapshotEvent) -> str | None:
|
||||
from archivebox.core.models import Snapshot
|
||||
@@ -39,7 +43,6 @@ class SnapshotService(BaseService):
|
||||
snapshot.status = Snapshot.StatusChoices.STARTED
|
||||
snapshot.retry_at = None
|
||||
snapshot.save(update_fields=["status", "retry_at", "modified_at"])
|
||||
snapshot.ensure_crawl_symlink()
|
||||
return str(snapshot.id)
|
||||
|
||||
if event.depth > crawl.max_depth:
|
||||
@@ -73,56 +76,36 @@ class SnapshotService(BaseService):
|
||||
if snapshot.status != Snapshot.StatusChoices.SEALED:
|
||||
snapshot.status = Snapshot.StatusChoices.QUEUED
|
||||
snapshot.save(update_fields=["status", "retry_at", "modified_at"])
|
||||
snapshot.ensure_crawl_symlink()
|
||||
return str(snapshot.id)
|
||||
|
||||
def _url_passes_filters(self, crawl, parent_snapshot, url: str) -> bool:
|
||||
from archivebox.config.configset import get_config
|
||||
return crawl.url_passes_filters(url, snapshot=parent_snapshot)
|
||||
|
||||
config = get_config(
|
||||
user=getattr(crawl, "created_by", None),
|
||||
crawl=crawl,
|
||||
snapshot=parent_snapshot,
|
||||
)
|
||||
|
||||
def to_pattern_list(value):
|
||||
if isinstance(value, list):
|
||||
return value
|
||||
if isinstance(value, str):
|
||||
return [pattern.strip() for pattern in value.split(",") if pattern.strip()]
|
||||
return []
|
||||
|
||||
allowlist = to_pattern_list(config.get("URL_ALLOWLIST", ""))
|
||||
denylist = to_pattern_list(config.get("URL_DENYLIST", ""))
|
||||
|
||||
for pattern in denylist:
|
||||
try:
|
||||
if re.search(pattern, url):
|
||||
return False
|
||||
except re.error:
|
||||
continue
|
||||
|
||||
if allowlist:
|
||||
for pattern in allowlist:
|
||||
try:
|
||||
if re.search(pattern, url):
|
||||
return True
|
||||
except re.error:
|
||||
continue
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
def _seal_snapshot(self, snapshot_id: str) -> None:
|
||||
def _seal_snapshot(self, snapshot_id: str) -> str | None:
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
snapshot = Snapshot.objects.filter(id=snapshot_id).first()
|
||||
if snapshot is None:
|
||||
return
|
||||
return None
|
||||
snapshot.status = Snapshot.StatusChoices.SEALED
|
||||
snapshot.retry_at = None
|
||||
snapshot.downloaded_at = snapshot.downloaded_at or timezone.now()
|
||||
snapshot.save(update_fields=["status", "retry_at", "downloaded_at", "modified_at"])
|
||||
return str(snapshot.id)
|
||||
|
||||
def _ensure_crawl_symlink(self, snapshot_id: str) -> None:
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
snapshot = Snapshot.objects.filter(id=snapshot_id).select_related("crawl", "crawl__created_by").first()
|
||||
if snapshot is not None:
|
||||
snapshot.ensure_crawl_symlink()
|
||||
|
||||
def _write_snapshot_details(self, snapshot_id: str) -> None:
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
snapshot = Snapshot.objects.filter(id=snapshot_id).select_related("crawl", "crawl__created_by").first()
|
||||
if snapshot is None:
|
||||
return
|
||||
snapshot.write_index_jsonl()
|
||||
snapshot.write_json_details()
|
||||
snapshot.write_html_details()
|
||||
|
||||
@@ -1,16 +1,17 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from asgiref.sync import sync_to_async
|
||||
from abx_dl.events import TagEvent
|
||||
from abx_dl.services.base import BaseService
|
||||
|
||||
from .db import run_db_op
|
||||
|
||||
|
||||
class TagService(BaseService):
|
||||
LISTENS_TO = [TagEvent]
|
||||
EMITS = []
|
||||
|
||||
async def on_TagEvent(self, event: TagEvent) -> None:
|
||||
await sync_to_async(self._project, thread_sensitive=True)(event)
|
||||
async def on_TagEvent__Outer(self, event: TagEvent) -> None:
|
||||
await run_db_op(self._project, event)
|
||||
|
||||
def _project(self, event: TagEvent) -> None:
|
||||
from archivebox.core.models import Snapshot, Tag
|
||||
|
||||
@@ -1083,8 +1083,11 @@
|
||||
width: 100% !important;
|
||||
}
|
||||
|
||||
body.filters-collapsed.change-list #changelist .changelist-form-container > div {
|
||||
body.filters-collapsed.change-list #changelist .changelist-form-container > div,
|
||||
body.filters-collapsed.change-list #changelist .changelist-form-container > form {
|
||||
max-width: 100% !important;
|
||||
width: 100% !important;
|
||||
flex: 1 1 100% !important;
|
||||
}
|
||||
|
||||
/* Actions bar */
|
||||
@@ -1372,7 +1375,8 @@
|
||||
order: 2;
|
||||
align-self: flex-start;
|
||||
}
|
||||
body.change-list #changelist .changelist-form-container > div {
|
||||
body.change-list #changelist .changelist-form-container > div,
|
||||
body.change-list #changelist .changelist-form-container > form {
|
||||
flex: 1 1 auto;
|
||||
min-width: 0;
|
||||
order: 1;
|
||||
|
||||
268
archivebox/templates/admin/core/tag/change_form.html
Normal file
268
archivebox/templates/admin/core/tag/change_form.html
Normal file
@@ -0,0 +1,268 @@
|
||||
{% extends "admin/change_form.html" %}
|
||||
|
||||
{% block bodyclass %}{{ block.super }} app-core model-tag tag-form-page{% endblock %}
|
||||
|
||||
{% block extrastyle %}
|
||||
{{ block.super }}
|
||||
<style>
|
||||
.tag-form-hero {
|
||||
margin: 0 0 20px;
|
||||
padding: 22px 24px;
|
||||
border-radius: 20px;
|
||||
border: 1px solid #dbe4ee;
|
||||
background:
|
||||
radial-gradient(circle at top right, rgba(245, 158, 11, 0.12), transparent 30%),
|
||||
linear-gradient(135deg, #fff7ed 0%, #ffffff 48%, #eff6ff 100%);
|
||||
box-shadow: 0 12px 30px rgba(15, 23, 42, 0.06);
|
||||
display: grid;
|
||||
gap: 16px;
|
||||
grid-template-columns: minmax(0, 1.7fr) minmax(260px, 1fr);
|
||||
}
|
||||
|
||||
.tag-form-hero h2 {
|
||||
margin: 0 0 8px;
|
||||
font-size: 28px;
|
||||
line-height: 1.05;
|
||||
color: #111827;
|
||||
}
|
||||
|
||||
.tag-form-hero p {
|
||||
margin: 0;
|
||||
color: #475569;
|
||||
font-size: 14px;
|
||||
max-width: 70ch;
|
||||
}
|
||||
|
||||
.tag-form-hero__meta {
|
||||
display: grid;
|
||||
gap: 10px;
|
||||
}
|
||||
|
||||
.tag-form-hero__meta div {
|
||||
padding: 14px 16px;
|
||||
border-radius: 14px;
|
||||
border: 1px solid rgba(203, 213, 225, 0.85);
|
||||
background: rgba(255, 255, 255, 0.88);
|
||||
}
|
||||
|
||||
.tag-form-hero__meta span {
|
||||
display: block;
|
||||
margin-bottom: 8px;
|
||||
font-size: 11px;
|
||||
font-weight: 700;
|
||||
text-transform: uppercase;
|
||||
letter-spacing: 0.08em;
|
||||
color: #64748b;
|
||||
}
|
||||
|
||||
.tag-similar-panel {
|
||||
margin-top: 18px;
|
||||
padding: 18px;
|
||||
border-radius: 18px;
|
||||
border: 1px solid #dbe4ee;
|
||||
background: #fff;
|
||||
box-shadow: 0 10px 24px rgba(15, 23, 42, 0.05);
|
||||
}
|
||||
|
||||
.tag-similar-panel h3 {
|
||||
margin: 0 0 6px;
|
||||
font-size: 16px;
|
||||
color: #111827;
|
||||
}
|
||||
|
||||
.tag-similar-panel p {
|
||||
margin: 0 0 14px;
|
||||
font-size: 13px;
|
||||
color: #64748b;
|
||||
}
|
||||
|
||||
.tag-similar-list {
|
||||
display: grid;
|
||||
gap: 10px;
|
||||
grid-template-columns: repeat(auto-fit, minmax(240px, 1fr));
|
||||
}
|
||||
|
||||
.tag-similar-card {
|
||||
display: grid;
|
||||
gap: 8px;
|
||||
padding: 14px 16px;
|
||||
border-radius: 16px;
|
||||
border: 1px solid #dbe4ee;
|
||||
background: #f8fafc;
|
||||
text-decoration: none;
|
||||
color: #0f172a;
|
||||
}
|
||||
|
||||
.tag-similar-card strong {
|
||||
font-size: 15px;
|
||||
line-height: 1.1;
|
||||
}
|
||||
|
||||
.tag-similar-card span {
|
||||
font-size: 12px;
|
||||
color: #64748b;
|
||||
}
|
||||
|
||||
.tag-similar-card__snapshots {
|
||||
display: flex;
|
||||
flex-wrap: wrap;
|
||||
gap: 8px;
|
||||
}
|
||||
|
||||
.tag-similar-snapshot {
|
||||
display: inline-flex;
|
||||
align-items: center;
|
||||
gap: 6px;
|
||||
min-width: 0;
|
||||
max-width: 100%;
|
||||
padding: 6px 8px;
|
||||
border-radius: 999px;
|
||||
background: #fff;
|
||||
border: 1px solid #dbe4ee;
|
||||
font-size: 11px;
|
||||
color: #334155;
|
||||
}
|
||||
|
||||
.tag-similar-snapshot img {
|
||||
width: 14px;
|
||||
height: 14px;
|
||||
border-radius: 4px;
|
||||
flex: 0 0 auto;
|
||||
}
|
||||
|
||||
.tag-similar-empty {
|
||||
padding: 16px;
|
||||
border-radius: 16px;
|
||||
border: 1px dashed #cbd5e1;
|
||||
background: #f8fafc;
|
||||
color: #64748b;
|
||||
font-size: 13px;
|
||||
}
|
||||
|
||||
@media (max-width: 920px) {
|
||||
.tag-form-hero {
|
||||
grid-template-columns: 1fr;
|
||||
}
|
||||
}
|
||||
</style>
|
||||
{% endblock %}
|
||||
|
||||
{% block form_top %}
|
||||
<section class="tag-form-hero">
|
||||
<div>
|
||||
<h2>{% if add %}New Tag{% else %}Edit Tag{% endif %}</h2>
|
||||
<p>Similar tags are shown below while typing.</p>
|
||||
</div>
|
||||
<div class="tag-form-hero__meta">
|
||||
<div>
|
||||
<span>Matches</span>
|
||||
<strong>Current tags</strong>
|
||||
</div>
|
||||
<div>
|
||||
<span>Links</span>
|
||||
<strong>Open filtered snapshots</strong>
|
||||
</div>
|
||||
</div>
|
||||
</section>
|
||||
{{ block.super }}
|
||||
{% endblock %}
|
||||
|
||||
{% block after_field_sets %}
|
||||
{{ block.super }}
|
||||
<section
|
||||
id="tag-similar-panel"
|
||||
class="tag-similar-panel"
|
||||
data-search-url="{{ tag_search_api_url }}"
|
||||
>
|
||||
<h3>Similar Tags</h3>
|
||||
<p>Updates while typing.</p>
|
||||
<div id="tag-similar-list" class="tag-similar-list"></div>
|
||||
</section>
|
||||
|
||||
{{ tag_similar_cards|json_script:"abx-tag-similar-data" }}
|
||||
|
||||
<script>
|
||||
document.addEventListener('DOMContentLoaded', function () {
|
||||
const panel = document.getElementById('tag-similar-panel');
|
||||
const list = document.getElementById('tag-similar-list');
|
||||
const nameInput = document.querySelector('input[data-tag-name-input="1"]');
|
||||
if (!panel || !list || !nameInput) return;
|
||||
|
||||
const searchUrl = panel.dataset.searchUrl;
|
||||
let similarCards = JSON.parse(document.getElementById('abx-tag-similar-data').textContent || '[]');
|
||||
let timeoutId = null;
|
||||
|
||||
function escapeHtml(value) {
|
||||
const div = document.createElement('div');
|
||||
div.textContent = value == null ? '' : String(value);
|
||||
return div.innerHTML;
|
||||
}
|
||||
|
||||
function getApiKey() {
|
||||
return (window.ARCHIVEBOX_API_KEY || '').trim();
|
||||
}
|
||||
|
||||
function withApiKey(url) {
|
||||
const apiKey = getApiKey();
|
||||
if (!apiKey) return url;
|
||||
const separator = url.includes('?') ? '&' : '?';
|
||||
return url + separator + 'api_key=' + encodeURIComponent(apiKey);
|
||||
}
|
||||
|
||||
function buildHeaders() {
|
||||
const headers = {};
|
||||
const apiKey = getApiKey();
|
||||
if (apiKey) headers['X-ArchiveBox-API-Key'] = apiKey;
|
||||
return headers;
|
||||
}
|
||||
|
||||
function render(cards) {
|
||||
const filtered = (cards || []).filter(function (card) {
|
||||
return (card.name || '').toLowerCase() !== (nameInput.value || '').trim().toLowerCase();
|
||||
});
|
||||
|
||||
if (!filtered.length) {
|
||||
list.innerHTML = '<div class="tag-similar-empty">No similar tags.</div>';
|
||||
return;
|
||||
}
|
||||
|
||||
list.innerHTML = filtered.map(function (card) {
|
||||
const snapshots = (card.snapshots || []).slice(0, 3).map(function (snapshot) {
|
||||
return '' +
|
||||
'<span class="tag-similar-snapshot">' +
|
||||
'<img src="' + escapeHtml(snapshot.favicon_url) + '" alt="" onerror="this.style.display=\\'none\\'">' +
|
||||
'<span>' + escapeHtml(snapshot.title) + '</span>' +
|
||||
'</span>';
|
||||
}).join('');
|
||||
|
||||
return '' +
|
||||
'<a class="tag-similar-card" href="' + escapeHtml(card.filter_url) + '">' +
|
||||
'<strong>' + escapeHtml(card.name) + '</strong>' +
|
||||
'<span>' + escapeHtml(card.num_snapshots) + ' snapshots · slug: ' + escapeHtml(card.slug) + '</span>' +
|
||||
'<div class="tag-similar-card__snapshots">' + (snapshots || '<span class="tag-similar-snapshot">No snapshots</span>') + '</div>' +
|
||||
'</a>';
|
||||
}).join('');
|
||||
}
|
||||
|
||||
async function fetchSimilar(query) {
|
||||
const response = await fetch(withApiKey(searchUrl + '?q=' + encodeURIComponent(query || '')), {
|
||||
headers: buildHeaders(),
|
||||
credentials: 'same-origin',
|
||||
});
|
||||
if (!response.ok) return [];
|
||||
const payload = await response.json();
|
||||
return payload.tags || [];
|
||||
}
|
||||
|
||||
nameInput.addEventListener('input', function () {
|
||||
window.clearTimeout(timeoutId);
|
||||
timeoutId = window.setTimeout(async function () {
|
||||
similarCards = await fetchSimilar((nameInput.value || '').trim());
|
||||
render(similarCards);
|
||||
}, 140);
|
||||
});
|
||||
|
||||
render(similarCards);
|
||||
});
|
||||
</script>
|
||||
{% endblock %}
|
||||
997
archivebox/templates/admin/core/tag/change_list.html
Normal file
997
archivebox/templates/admin/core/tag/change_list.html
Normal file
@@ -0,0 +1,997 @@
|
||||
{% extends "admin/change_list.html" %}
|
||||
|
||||
{% block bodyclass %}{{ block.super }} app-core model-tag change-list tag-admin-page{% endblock %}
|
||||
|
||||
{% block object-tools %}{% endblock %}
|
||||
|
||||
{% block extrastyle %}
|
||||
{{ block.super }}
|
||||
<style>
|
||||
.tag-admin-shell {
|
||||
display: grid;
|
||||
gap: 12px;
|
||||
}
|
||||
|
||||
.tag-admin-toolbar {
|
||||
display: flex;
|
||||
flex-wrap: wrap;
|
||||
gap: 12px;
|
||||
align-items: start;
|
||||
}
|
||||
|
||||
.tag-admin-panel {
|
||||
flex: 1 1 320px;
|
||||
padding: 12px;
|
||||
border-radius: 16px;
|
||||
border: 1px solid #dbe4ee;
|
||||
background: #fff;
|
||||
box-shadow: 0 8px 18px rgba(15, 23, 42, 0.05);
|
||||
}
|
||||
|
||||
.tag-admin-panel--search {
|
||||
flex: 3 1 360px;
|
||||
}
|
||||
|
||||
.tag-admin-panel--filters {
|
||||
flex: 3 1 440px;
|
||||
}
|
||||
|
||||
.tag-admin-panel--create {
|
||||
flex: 1 1 280px;
|
||||
}
|
||||
|
||||
.tag-admin-panel h2 {
|
||||
margin: 0 0 12px;
|
||||
font-size: 16px;
|
||||
color: #0f172a;
|
||||
}
|
||||
|
||||
.tag-create-form,
|
||||
.tag-search-form {
|
||||
display: grid;
|
||||
gap: 10px;
|
||||
}
|
||||
|
||||
.tag-input-row {
|
||||
display: flex;
|
||||
gap: 10px;
|
||||
align-items: center;
|
||||
}
|
||||
|
||||
.tag-create-form .tag-input-row {
|
||||
display: grid;
|
||||
grid-template-columns: minmax(0, 1fr) auto;
|
||||
align-items: center;
|
||||
}
|
||||
|
||||
.tag-input-row input {
|
||||
flex: 1 1 auto;
|
||||
min-width: 0;
|
||||
height: 40px;
|
||||
box-sizing: border-box;
|
||||
padding: 0 12px;
|
||||
line-height: 1.2;
|
||||
border-radius: 10px;
|
||||
border: 1px solid #cbd5e1;
|
||||
background: #f8fafc;
|
||||
font-size: 13px;
|
||||
color: #0f172a;
|
||||
}
|
||||
|
||||
.tag-input-row input:focus {
|
||||
outline: none;
|
||||
border-color: #0ea5e9;
|
||||
box-shadow: 0 0 0 4px rgba(14, 165, 233, 0.14);
|
||||
background: #fff;
|
||||
}
|
||||
|
||||
.tag-button,
|
||||
.tag-chip-button {
|
||||
border: 0;
|
||||
border-radius: 10px;
|
||||
cursor: pointer;
|
||||
font-weight: 700;
|
||||
transition: transform 0.12s ease, box-shadow 0.12s ease, opacity 0.12s ease;
|
||||
}
|
||||
|
||||
.tag-button:hover,
|
||||
.tag-chip-button:hover {
|
||||
transform: translateY(-1px);
|
||||
box-shadow: 0 8px 20px rgba(15, 23, 42, 0.08);
|
||||
}
|
||||
|
||||
.tag-button:disabled,
|
||||
.tag-chip-button:disabled {
|
||||
cursor: wait;
|
||||
opacity: 0.6;
|
||||
transform: none;
|
||||
box-shadow: none;
|
||||
}
|
||||
|
||||
.tag-button {
|
||||
flex: 0 0 auto;
|
||||
height: 40px;
|
||||
padding: 0 12px;
|
||||
background: linear-gradient(135deg, #0f766e 0%, #0ea5e9 100%);
|
||||
color: #fff;
|
||||
white-space: nowrap;
|
||||
font-size: 12px;
|
||||
}
|
||||
|
||||
.tag-toolbar-meta {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: space-between;
|
||||
gap: 10px;
|
||||
flex-wrap: wrap;
|
||||
font-size: 12px;
|
||||
color: #64748b;
|
||||
}
|
||||
|
||||
.tag-toolbar-meta strong {
|
||||
color: #0f172a;
|
||||
}
|
||||
|
||||
.tag-help {
|
||||
margin: 0;
|
||||
font-size: 12px;
|
||||
color: #64748b;
|
||||
}
|
||||
|
||||
.tag-filter-grid {
|
||||
display: grid;
|
||||
gap: 10px;
|
||||
grid-template-columns: repeat(3, minmax(0, 1fr));
|
||||
}
|
||||
|
||||
.tag-select-field {
|
||||
display: grid;
|
||||
gap: 4px;
|
||||
min-width: 0;
|
||||
font-size: 11px;
|
||||
font-weight: 700;
|
||||
color: #475569;
|
||||
}
|
||||
|
||||
.tag-select-field select {
|
||||
width: 100%;
|
||||
min-width: 0;
|
||||
height: 40px;
|
||||
box-sizing: border-box;
|
||||
padding: 0 10px;
|
||||
line-height: 1.2;
|
||||
border-radius: 10px;
|
||||
border: 1px solid #cbd5e1;
|
||||
background: #f8fafc;
|
||||
color: #0f172a;
|
||||
font-size: 12px;
|
||||
vertical-align: middle;
|
||||
}
|
||||
|
||||
.tag-select-field select:focus {
|
||||
outline: none;
|
||||
border-color: #0ea5e9;
|
||||
box-shadow: 0 0 0 4px rgba(14, 165, 233, 0.14);
|
||||
background: #fff;
|
||||
}
|
||||
|
||||
.tag-grid {
|
||||
display: grid;
|
||||
gap: 12px;
|
||||
grid-template-columns: repeat(auto-fit, minmax(240px, 1fr));
|
||||
}
|
||||
|
||||
.tag-card {
|
||||
position: relative;
|
||||
display: grid;
|
||||
gap: 10px;
|
||||
padding: 10px;
|
||||
border-radius: 16px;
|
||||
border: 1px solid #dbe4ee;
|
||||
background:
|
||||
linear-gradient(180deg, rgba(255, 255, 255, 0.96) 0%, rgba(248, 250, 252, 0.94) 100%);
|
||||
box-shadow: 0 8px 18px rgba(15, 23, 42, 0.05);
|
||||
transition: transform 0.14s ease, border-color 0.14s ease, box-shadow 0.14s ease;
|
||||
cursor: pointer;
|
||||
}
|
||||
|
||||
.tag-card:hover {
|
||||
transform: translateY(-2px);
|
||||
border-color: #93c5fd;
|
||||
box-shadow: 0 14px 26px rgba(15, 23, 42, 0.08);
|
||||
}
|
||||
|
||||
.tag-card__header {
|
||||
display: flex;
|
||||
justify-content: space-between;
|
||||
gap: 10px;
|
||||
align-items: flex-start;
|
||||
}
|
||||
|
||||
.tag-card__title {
|
||||
flex: 1 1 auto;
|
||||
min-width: 0;
|
||||
display: grid;
|
||||
gap: 4px;
|
||||
}
|
||||
|
||||
.tag-card__title strong,
|
||||
.tag-card__rename strong {
|
||||
display: block;
|
||||
font-size: 17px;
|
||||
line-height: 1.1;
|
||||
color: #111827;
|
||||
word-break: break-word;
|
||||
}
|
||||
|
||||
.tag-card__count {
|
||||
display: inline-flex;
|
||||
align-items: center;
|
||||
white-space: nowrap;
|
||||
padding: 3px 8px;
|
||||
border-radius: 999px;
|
||||
background: #e0f2fe;
|
||||
color: #075985;
|
||||
font-size: 11px;
|
||||
font-weight: 700;
|
||||
}
|
||||
|
||||
.tag-card__actions {
|
||||
flex: 0 0 auto;
|
||||
display: flex;
|
||||
flex-wrap: wrap;
|
||||
justify-content: flex-end;
|
||||
align-items: center;
|
||||
gap: 6px;
|
||||
}
|
||||
|
||||
.tag-chip-button {
|
||||
height: 30px;
|
||||
padding: 0 8px;
|
||||
background: #fff;
|
||||
border: 1px solid #dbe4ee;
|
||||
color: #334155;
|
||||
font-size: 11px;
|
||||
}
|
||||
|
||||
.tag-chip-button.is-danger {
|
||||
background: #fff1f2;
|
||||
border-color: #fecdd3;
|
||||
color: #be123c;
|
||||
}
|
||||
|
||||
.tag-card__rename {
|
||||
display: none;
|
||||
gap: 6px;
|
||||
align-items: center;
|
||||
flex-wrap: wrap;
|
||||
margin-top: 2px;
|
||||
}
|
||||
|
||||
.tag-card.is-editing .tag-card__display {
|
||||
display: none;
|
||||
}
|
||||
|
||||
.tag-card.is-editing .tag-card__rename {
|
||||
display: flex;
|
||||
}
|
||||
|
||||
.tag-card.is-editing .tag-card__header {
|
||||
display: grid;
|
||||
grid-template-columns: minmax(0, 1fr);
|
||||
}
|
||||
|
||||
.tag-card.is-editing .tag-card__actions {
|
||||
justify-content: flex-start;
|
||||
}
|
||||
|
||||
.tag-card__rename input {
|
||||
flex: 1 1 220px;
|
||||
min-width: 0;
|
||||
height: 34px;
|
||||
padding: 0 10px;
|
||||
border-radius: 10px;
|
||||
border: 1px solid #cbd5e1;
|
||||
background: #fff;
|
||||
font-size: 12px;
|
||||
}
|
||||
|
||||
.tag-card__snapshots {
|
||||
display: grid;
|
||||
gap: 8px;
|
||||
grid-template-columns: repeat(auto-fit, minmax(130px, 1fr));
|
||||
}
|
||||
|
||||
.tag-snapshot-badge {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 8px;
|
||||
min-width: 0;
|
||||
padding: 6px 8px;
|
||||
border-radius: 12px;
|
||||
border: 1px solid #dbe4ee;
|
||||
background: rgba(255, 255, 255, 0.86);
|
||||
text-decoration: none;
|
||||
color: #0f172a;
|
||||
}
|
||||
|
||||
.tag-snapshot-badge img {
|
||||
width: 16px;
|
||||
height: 16px;
|
||||
border-radius: 4px;
|
||||
flex: 0 0 auto;
|
||||
background: #f8fafc;
|
||||
}
|
||||
|
||||
.tag-snapshot-badge span {
|
||||
min-width: 0;
|
||||
white-space: nowrap;
|
||||
overflow: hidden;
|
||||
text-overflow: ellipsis;
|
||||
font-size: 11px;
|
||||
font-weight: 600;
|
||||
}
|
||||
|
||||
.tag-card__empty {
|
||||
padding: 14px;
|
||||
border-radius: 14px;
|
||||
border: 1px dashed #cbd5e1;
|
||||
background: #f8fafc;
|
||||
color: #64748b;
|
||||
font-size: 13px;
|
||||
}
|
||||
|
||||
.tag-toast {
|
||||
position: sticky;
|
||||
top: 12px;
|
||||
z-index: 30;
|
||||
display: none;
|
||||
width: fit-content;
|
||||
max-width: min(100%, 420px);
|
||||
padding: 12px 14px;
|
||||
border-radius: 14px;
|
||||
font-size: 13px;
|
||||
font-weight: 700;
|
||||
box-shadow: 0 14px 30px rgba(15, 23, 42, 0.12);
|
||||
}
|
||||
|
||||
.tag-toast.is-visible {
|
||||
display: block;
|
||||
}
|
||||
|
||||
.tag-toast.is-success {
|
||||
background: #dcfce7;
|
||||
color: #166534;
|
||||
}
|
||||
|
||||
.tag-toast.is-error {
|
||||
background: #fee2e2;
|
||||
color: #991b1b;
|
||||
}
|
||||
|
||||
.tag-empty-state {
|
||||
padding: 24px 18px;
|
||||
border-radius: 16px;
|
||||
border: 1px dashed #cbd5e1;
|
||||
background: #fff;
|
||||
text-align: center;
|
||||
color: #64748b;
|
||||
font-size: 13px;
|
||||
}
|
||||
|
||||
</style>
|
||||
{% endblock %}
|
||||
|
||||
{% block content %}
|
||||
<div id="content-main">
|
||||
<div
|
||||
id="abx-tag-admin"
|
||||
class="tag-admin-shell"
|
||||
data-search-url="{{ tag_search_api_url }}"
|
||||
data-create-url="{{ tag_create_api_url }}"
|
||||
>
|
||||
<section class="tag-admin-toolbar">
|
||||
<div class="tag-admin-panel tag-admin-panel--search">
|
||||
<div class="tag-search-form">
|
||||
<div class="tag-input-row">
|
||||
<input
|
||||
id="tag-live-search"
|
||||
type="search"
|
||||
placeholder="Search by tag name"
|
||||
value="{{ initial_query }}"
|
||||
autocomplete="off"
|
||||
>
|
||||
</div>
|
||||
<div class="tag-toolbar-meta">
|
||||
<span id="tag-query-label">{% if initial_query %}“{{ initial_query }}”{% else %}All tags{% endif %}</span>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="tag-admin-panel tag-admin-panel--filters">
|
||||
<div class="tag-filter-grid">
|
||||
<label class="tag-select-field" for="tag-sort-select">
|
||||
<span>Sort</span>
|
||||
<select id="tag-sort-select">
|
||||
{% for value, label in tag_sort_choices %}
|
||||
<option value="{{ value }}"{% if value == initial_sort %} selected{% endif %}>{{ label }}</option>
|
||||
{% endfor %}
|
||||
</select>
|
||||
</label>
|
||||
<label class="tag-select-field" for="tag-created-by-select">
|
||||
<span>Created By</span>
|
||||
<select id="tag-created-by-select">
|
||||
<option value="">All users</option>
|
||||
{% for value, label in tag_created_by_choices %}
|
||||
<option value="{{ value }}"{% if value == initial_created_by %} selected{% endif %}>{{ label }}</option>
|
||||
{% endfor %}
|
||||
</select>
|
||||
</label>
|
||||
<label class="tag-select-field" for="tag-year-select">
|
||||
<span>Year</span>
|
||||
<select id="tag-year-select">
|
||||
<option value="">All years</option>
|
||||
{% for value in tag_year_choices %}
|
||||
<option value="{{ value }}"{% if value == initial_year %} selected{% endif %}>{{ value }}</option>
|
||||
{% endfor %}
|
||||
</select>
|
||||
</label>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="tag-admin-panel tag-admin-panel--create">
|
||||
<form id="tag-create-form" class="tag-create-form">
|
||||
{% csrf_token %}
|
||||
<div class="tag-input-row">
|
||||
<input
|
||||
id="tag-create-name"
|
||||
type="text"
|
||||
name="name"
|
||||
placeholder="New tag name"
|
||||
autocomplete="off"
|
||||
value=""
|
||||
>
|
||||
<button class="tag-button" type="submit">Create</button>
|
||||
</div>
|
||||
</form>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<div id="tag-toast" class="tag-toast" aria-live="polite"></div>
|
||||
<div id="tag-card-grid" class="tag-grid">
|
||||
{% if initial_tag_cards %}
|
||||
{% for card in initial_tag_cards %}
|
||||
<article
|
||||
class="tag-card"
|
||||
data-id="{{ card.id }}"
|
||||
data-filter-url="{{ card.filter_url }}"
|
||||
data-rename-url="{{ card.rename_url }}"
|
||||
data-delete-url="{{ card.delete_url }}"
|
||||
data-export-urls-url="{{ card.export_urls_url }}"
|
||||
data-export-jsonl-url="{{ card.export_jsonl_url }}"
|
||||
>
|
||||
<div class="tag-card__header">
|
||||
<div class="tag-card__title">
|
||||
<div class="tag-card__display">
|
||||
<strong><a href="{{ card.filter_url }}" style="color:inherit;text-decoration:none;">{{ card.name }}</a></strong>
|
||||
</div>
|
||||
<div class="tag-card__rename">
|
||||
<input type="text" value="{{ card.name }}" aria-label="Rename tag {{ card.name }}">
|
||||
<button type="button" class="tag-chip-button" data-action="save-edit">Save</button>
|
||||
<button type="button" class="tag-chip-button" data-action="cancel-edit">Cancel</button>
|
||||
</div>
|
||||
</div>
|
||||
<div class="tag-card__actions">
|
||||
<button type="button" class="tag-chip-button" data-action="edit" aria-label="Rename tag" title="Rename tag">✎</button>
|
||||
<button type="button" class="tag-chip-button" data-action="copy-urls">Copy URLs</button>
|
||||
<button type="button" class="tag-chip-button" data-action="download-jsonl">JSONL</button>
|
||||
<button type="button" class="tag-chip-button is-danger" data-action="delete">Delete</button>
|
||||
<span class="tag-card__count">{{ card.num_snapshots }}</span>
|
||||
</div>
|
||||
</div>
|
||||
<div class="tag-card__snapshots">
|
||||
{% if card.snapshots %}
|
||||
{% for snapshot in card.snapshots %}
|
||||
<a class="tag-snapshot-badge" href="{{ snapshot.admin_url }}" title="{{ snapshot.url }}">
|
||||
<img src="{{ snapshot.favicon_url }}" alt="" onerror="this.style.display='none'">
|
||||
<span>{{ snapshot.title }}</span>
|
||||
</a>
|
||||
{% endfor %}
|
||||
{% else %}
|
||||
<div class="tag-card__empty">No snapshots attached yet.</div>
|
||||
{% endif %}
|
||||
</div>
|
||||
</article>
|
||||
{% endfor %}
|
||||
{% else %}
|
||||
<div class="tag-empty-state">No tags.</div>
|
||||
{% endif %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{{ initial_tag_cards|json_script:"abx-tag-cards-data" }}
|
||||
|
||||
<script>
|
||||
document.addEventListener('DOMContentLoaded', function () {
|
||||
const shell = document.getElementById('abx-tag-admin');
|
||||
if (!shell) return;
|
||||
|
||||
const initialCards = JSON.parse(document.getElementById('abx-tag-cards-data').textContent || '[]');
|
||||
const searchUrl = shell.dataset.searchUrl;
|
||||
const createUrl = shell.dataset.createUrl;
|
||||
const searchInput = document.getElementById('tag-live-search');
|
||||
const sortSelect = document.getElementById('tag-sort-select');
|
||||
const createdBySelect = document.getElementById('tag-created-by-select');
|
||||
const yearSelect = document.getElementById('tag-year-select');
|
||||
const createForm = document.getElementById('tag-create-form');
|
||||
const createInput = document.getElementById('tag-create-name');
|
||||
const grid = document.getElementById('tag-card-grid');
|
||||
const queryLabel = document.getElementById('tag-query-label');
|
||||
const toast = document.getElementById('tag-toast');
|
||||
let cards = initialCards;
|
||||
let searchTimeout = null;
|
||||
let activeQuery = (searchInput?.value || '').trim();
|
||||
|
||||
function escapeHtml(value) {
|
||||
const div = document.createElement('div');
|
||||
div.textContent = value == null ? '' : String(value);
|
||||
return div.innerHTML;
|
||||
}
|
||||
|
||||
function slugify(value) {
|
||||
return String(value || '')
|
||||
.toLowerCase()
|
||||
.trim()
|
||||
.replace(/[^a-z0-9]+/g, '-')
|
||||
.replace(/^-+|-+$/g, '') || 'tag';
|
||||
}
|
||||
|
||||
function getCSRFToken() {
|
||||
const input = document.querySelector('input[name="csrfmiddlewaretoken"]');
|
||||
if (input) return input.value;
|
||||
const cookies = document.cookie.split(';');
|
||||
for (const cookieRaw of cookies) {
|
||||
const cookie = cookieRaw.trim();
|
||||
if (cookie.startsWith('csrftoken=')) return cookie.slice('csrftoken='.length);
|
||||
}
|
||||
return '';
|
||||
}
|
||||
|
||||
function getApiKey() {
|
||||
return (window.ARCHIVEBOX_API_KEY || '').trim();
|
||||
}
|
||||
|
||||
function withApiKey(url) {
|
||||
const apiKey = getApiKey();
|
||||
if (!apiKey) return url;
|
||||
const separator = url.includes('?') ? '&' : '?';
|
||||
return url + separator + 'api_key=' + encodeURIComponent(apiKey);
|
||||
}
|
||||
|
||||
function buildHeaders(isJsonBody) {
|
||||
const headers = {};
|
||||
if (isJsonBody) headers['Content-Type'] = 'application/json';
|
||||
const csrfToken = getCSRFToken();
|
||||
if (csrfToken) headers['X-CSRFToken'] = csrfToken;
|
||||
const apiKey = getApiKey();
|
||||
if (apiKey) headers['X-ArchiveBox-API-Key'] = apiKey;
|
||||
return headers;
|
||||
}
|
||||
|
||||
function setToast(message, tone) {
|
||||
toast.textContent = message;
|
||||
toast.className = 'tag-toast is-visible ' + (tone === 'error' ? 'is-error' : 'is-success');
|
||||
window.clearTimeout(setToast._timer);
|
||||
setToast._timer = window.setTimeout(function () {
|
||||
toast.className = 'tag-toast';
|
||||
toast.textContent = '';
|
||||
}, 2600);
|
||||
}
|
||||
|
||||
function getCurrentState(overrides) {
|
||||
const next = overrides || {};
|
||||
return {
|
||||
query: typeof next.query === 'string' ? next.query.trim() : (searchInput?.value || '').trim(),
|
||||
sort: typeof next.sort === 'string' ? next.sort : (sortSelect?.value || 'created_desc'),
|
||||
created_by: typeof next.created_by === 'string' ? next.created_by : (createdBySelect?.value || ''),
|
||||
year: typeof next.year === 'string' ? next.year : (yearSelect?.value || ''),
|
||||
};
|
||||
}
|
||||
|
||||
function syncSearchState(state) {
|
||||
if (searchInput) searchInput.value = state.query;
|
||||
if (sortSelect) sortSelect.value = state.sort;
|
||||
if (createdBySelect) createdBySelect.value = state.created_by;
|
||||
if (yearSelect) yearSelect.value = state.year;
|
||||
}
|
||||
|
||||
function syncLocation(state) {
|
||||
const url = new URL(window.location.href);
|
||||
if (state.query) {
|
||||
url.searchParams.set('q', state.query);
|
||||
} else {
|
||||
url.searchParams.delete('q');
|
||||
}
|
||||
|
||||
if (state.sort && state.sort !== 'created_desc') {
|
||||
url.searchParams.set('sort', state.sort);
|
||||
} else {
|
||||
url.searchParams.delete('sort');
|
||||
}
|
||||
|
||||
if (state.created_by) {
|
||||
url.searchParams.set('created_by', state.created_by);
|
||||
} else {
|
||||
url.searchParams.delete('created_by');
|
||||
}
|
||||
|
||||
if (state.year) {
|
||||
url.searchParams.set('year', state.year);
|
||||
} else {
|
||||
url.searchParams.delete('year');
|
||||
}
|
||||
|
||||
window.history.replaceState({}, '', url.toString());
|
||||
}
|
||||
|
||||
function setMeta(state, count) {
|
||||
const baseLabel = state.query ? '"' + state.query + '"' : 'All tags';
|
||||
queryLabel.textContent = baseLabel + ' · ' + count + ' shown';
|
||||
activeQuery = state.query;
|
||||
}
|
||||
|
||||
function renderCards(nextCards, state) {
|
||||
cards = Array.isArray(nextCards) ? nextCards : [];
|
||||
setMeta(state || getCurrentState(), cards.length);
|
||||
|
||||
if (!cards.length) {
|
||||
grid.innerHTML = '<div class="tag-empty-state">No tags.</div>';
|
||||
return;
|
||||
}
|
||||
|
||||
grid.innerHTML = cards.map(function (card) {
|
||||
const snapshotHtml = (card.snapshots || []).length
|
||||
? card.snapshots.map(function (snapshot) {
|
||||
return '' +
|
||||
'<a class="tag-snapshot-badge" href="' + escapeHtml(snapshot.admin_url) + '" title="' + escapeHtml(snapshot.url) + '">' +
|
||||
'<img src="' + escapeHtml(snapshot.favicon_url) + '" alt="" onerror="this.hidden=true">' +
|
||||
'<span>' + escapeHtml(snapshot.title) + '</span>' +
|
||||
'</a>';
|
||||
}).join('')
|
||||
: '<div class="tag-card__empty">No snapshots attached yet.</div>';
|
||||
|
||||
return '' +
|
||||
'<article class="tag-card" data-id="' + escapeHtml(card.id) + '" data-filter-url="' + escapeHtml(card.filter_url) + '" data-rename-url="' + escapeHtml(card.rename_url) + '" data-delete-url="' + escapeHtml(card.delete_url) + '" data-export-urls-url="' + escapeHtml(card.export_urls_url) + '" data-export-jsonl-url="' + escapeHtml(card.export_jsonl_url) + '">' +
|
||||
'<div class="tag-card__header">' +
|
||||
'<div class="tag-card__title">' +
|
||||
'<div class="tag-card__display">' +
|
||||
'<strong>' + escapeHtml(card.name) + '</strong>' +
|
||||
'</div>' +
|
||||
'<div class="tag-card__rename">' +
|
||||
'<input type="text" value="' + escapeHtml(card.name) + '" aria-label="Rename tag ' + escapeHtml(card.name) + '">' +
|
||||
'<button type="button" class="tag-chip-button" data-action="save-edit">Save</button>' +
|
||||
'<button type="button" class="tag-chip-button" data-action="cancel-edit">Cancel</button>' +
|
||||
'</div>' +
|
||||
'</div>' +
|
||||
'<div class="tag-card__actions">' +
|
||||
'<button type="button" class="tag-chip-button" data-action="edit" aria-label="Rename tag" title="Rename tag">✎</button>' +
|
||||
'<button type="button" class="tag-chip-button" data-action="copy-urls">Copy URLs</button>' +
|
||||
'<button type="button" class="tag-chip-button" data-action="download-jsonl">JSONL</button>' +
|
||||
'<button type="button" class="tag-chip-button is-danger" data-action="delete">Delete</button>' +
|
||||
'<span class="tag-card__count">' + escapeHtml(card.num_snapshots) + '</span>' +
|
||||
'</div>' +
|
||||
'</div>' +
|
||||
'<div class="tag-card__snapshots">' + snapshotHtml + '</div>' +
|
||||
'</article>';
|
||||
}).join('');
|
||||
}
|
||||
|
||||
async function fetchCards(state) {
|
||||
const params = new URLSearchParams();
|
||||
if (state.query) params.set('q', state.query);
|
||||
if (state.sort) params.set('sort', state.sort);
|
||||
if (state.created_by) params.set('created_by', state.created_by);
|
||||
if (state.year) params.set('year', state.year);
|
||||
const url = withApiKey(searchUrl + '?' + params.toString());
|
||||
const response = await fetch(url, {
|
||||
headers: buildHeaders(false),
|
||||
credentials: 'same-origin',
|
||||
});
|
||||
if (!response.ok) {
|
||||
const message = await response.text();
|
||||
throw new Error(message || 'Failed to load matching tags');
|
||||
}
|
||||
const payload = await response.json();
|
||||
return {
|
||||
tags: payload.tags || [],
|
||||
state: {
|
||||
query: state.query,
|
||||
sort: payload.sort || state.sort,
|
||||
created_by: payload.created_by || '',
|
||||
year: payload.year || '',
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
async function refreshCards(overrides) {
|
||||
const requestedState = getCurrentState(overrides);
|
||||
const result = await fetchCards(requestedState);
|
||||
syncSearchState(result.state);
|
||||
renderCards(result.tags, result.state);
|
||||
syncLocation(result.state);
|
||||
return result.tags;
|
||||
}
|
||||
|
||||
async function submitJson(url, method, payload) {
|
||||
const response = await fetch(withApiKey(url), {
|
||||
method: method,
|
||||
headers: buildHeaders(true),
|
||||
credentials: 'same-origin',
|
||||
body: JSON.stringify(payload || {}),
|
||||
});
|
||||
if (!response.ok) {
|
||||
let message = 'Request failed';
|
||||
try {
|
||||
const data = await response.json();
|
||||
message = data.detail || data.message || message;
|
||||
} catch (_err) {
|
||||
message = await response.text() || message;
|
||||
}
|
||||
throw new Error(message);
|
||||
}
|
||||
if (response.status === 204) return {};
|
||||
return response.json();
|
||||
}
|
||||
|
||||
async function copyTextFromUrl(url) {
|
||||
const response = await fetch(withApiKey(url), {
|
||||
headers: buildHeaders(false),
|
||||
credentials: 'same-origin',
|
||||
});
|
||||
if (!response.ok) throw new Error('Failed to export URLs');
|
||||
const text = await response.text();
|
||||
await copyTextToClipboard(text);
|
||||
return text;
|
||||
}
|
||||
|
||||
async function copyTextToClipboard(text) {
|
||||
if (navigator.clipboard && window.isSecureContext) {
|
||||
try {
|
||||
await navigator.clipboard.writeText(text);
|
||||
return;
|
||||
} catch (_error) {
|
||||
}
|
||||
}
|
||||
|
||||
const textarea = document.createElement('textarea');
|
||||
textarea.value = text;
|
||||
textarea.setAttribute('readonly', '');
|
||||
textarea.style.position = 'fixed';
|
||||
textarea.style.top = '-9999px';
|
||||
textarea.style.left = '-9999px';
|
||||
document.body.appendChild(textarea);
|
||||
textarea.focus();
|
||||
textarea.select();
|
||||
|
||||
const copied = document.execCommand('copy');
|
||||
document.body.removeChild(textarea);
|
||||
if (!copied) {
|
||||
throw new Error('Clipboard write failed');
|
||||
}
|
||||
}
|
||||
|
||||
function getDownloadFilename(response, fallbackFilename) {
|
||||
const disposition = response.headers.get('Content-Disposition') || '';
|
||||
const utf8Match = disposition.match(/filename\\*=UTF-8''([^;]+)/i);
|
||||
if (utf8Match && utf8Match[1]) {
|
||||
return decodeURIComponent(utf8Match[1]);
|
||||
}
|
||||
|
||||
const filenameMatch = disposition.match(/filename="?([^";]+)"?/i);
|
||||
if (filenameMatch && filenameMatch[1]) {
|
||||
return filenameMatch[1];
|
||||
}
|
||||
|
||||
return fallbackFilename;
|
||||
}
|
||||
|
||||
async function downloadFileFromUrl(url, fallbackFilename) {
|
||||
const response = await fetch(withApiKey(url), {
|
||||
headers: buildHeaders(false),
|
||||
credentials: 'same-origin',
|
||||
});
|
||||
if (!response.ok) {
|
||||
let message = 'Download failed';
|
||||
try {
|
||||
const data = await response.json();
|
||||
message = data.detail || data.message || message;
|
||||
} catch (_err) {
|
||||
message = await response.text() || message;
|
||||
}
|
||||
throw new Error(message);
|
||||
}
|
||||
|
||||
const blob = await response.blob();
|
||||
const downloadUrl = URL.createObjectURL(blob);
|
||||
const link = document.createElement('a');
|
||||
link.href = downloadUrl;
|
||||
link.download = getDownloadFilename(response, fallbackFilename);
|
||||
document.body.appendChild(link);
|
||||
link.click();
|
||||
link.remove();
|
||||
window.setTimeout(function () {
|
||||
URL.revokeObjectURL(downloadUrl);
|
||||
}, 1000);
|
||||
}
|
||||
|
||||
createForm?.addEventListener('submit', async function (event) {
|
||||
event.preventDefault();
|
||||
const name = (createInput.value || '').trim();
|
||||
if (!name) {
|
||||
setToast('Enter a tag name first.', 'error');
|
||||
createInput.focus();
|
||||
return;
|
||||
}
|
||||
|
||||
const button = createForm.querySelector('button[type="submit"]');
|
||||
button.disabled = true;
|
||||
try {
|
||||
const result = await submitJson(createUrl, 'POST', { name: name });
|
||||
createInput.value = '';
|
||||
await refreshCards({ query: result.tag_name || name });
|
||||
setToast(result.created ? 'Tag created.' : 'Existing tag loaded.', 'success');
|
||||
} catch (error) {
|
||||
setToast(error.message || 'Failed to create tag.', 'error');
|
||||
} finally {
|
||||
button.disabled = false;
|
||||
}
|
||||
});
|
||||
|
||||
searchInput?.addEventListener('input', function () {
|
||||
window.clearTimeout(searchTimeout);
|
||||
searchTimeout = window.setTimeout(async function () {
|
||||
try {
|
||||
await refreshCards();
|
||||
} catch (error) {
|
||||
setToast(error.message || 'Failed to search tags.', 'error');
|
||||
}
|
||||
}, 150);
|
||||
});
|
||||
|
||||
[sortSelect, createdBySelect, yearSelect].forEach(function (field) {
|
||||
field?.addEventListener('change', async function () {
|
||||
try {
|
||||
await refreshCards();
|
||||
} catch (error) {
|
||||
setToast(error.message || 'Failed to update tag filters.', 'error');
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
grid.addEventListener('click', async function (event) {
|
||||
const actionButton = event.target.closest('[data-action]');
|
||||
const snapshotLink = event.target.closest('.tag-snapshot-badge');
|
||||
if (snapshotLink) return;
|
||||
|
||||
const cardEl = event.target.closest('.tag-card');
|
||||
if (!cardEl) return;
|
||||
|
||||
if (!actionButton) {
|
||||
window.location.href = cardEl.dataset.filterUrl;
|
||||
return;
|
||||
}
|
||||
|
||||
event.preventDefault();
|
||||
event.stopPropagation();
|
||||
|
||||
const action = actionButton.dataset.action;
|
||||
if (action === 'edit') {
|
||||
cardEl.classList.add('is-editing');
|
||||
const input = cardEl.querySelector('.tag-card__rename input');
|
||||
if (input) {
|
||||
input.focus();
|
||||
input.select();
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (action === 'cancel-edit') {
|
||||
cardEl.classList.remove('is-editing');
|
||||
return;
|
||||
}
|
||||
|
||||
if (action === 'save-edit') {
|
||||
const input = cardEl.querySelector('.tag-card__rename input');
|
||||
const nextName = (input?.value || '').trim();
|
||||
if (!nextName) {
|
||||
setToast('Tag name is required.', 'error');
|
||||
input?.focus();
|
||||
return;
|
||||
}
|
||||
|
||||
actionButton.disabled = true;
|
||||
try {
|
||||
await submitJson(cardEl.dataset.renameUrl, 'POST', { name: nextName });
|
||||
await refreshCards();
|
||||
setToast('Tag renamed.', 'success');
|
||||
} catch (error) {
|
||||
setToast(error.message || 'Rename failed.', 'error');
|
||||
} finally {
|
||||
actionButton.disabled = false;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (action === 'delete') {
|
||||
const tagName = cardEl.querySelector('.tag-card__display strong')?.textContent || 'this tag';
|
||||
if (!window.confirm('Delete "' + tagName + '"? This only removes the tag and its tag links.')) return;
|
||||
|
||||
actionButton.disabled = true;
|
||||
try {
|
||||
await fetch(withApiKey(cardEl.dataset.deleteUrl), {
|
||||
method: 'DELETE',
|
||||
headers: buildHeaders(false),
|
||||
credentials: 'same-origin',
|
||||
}).then(async function (response) {
|
||||
if (!response.ok) {
|
||||
let message = 'Delete failed';
|
||||
try {
|
||||
const payload = await response.json();
|
||||
message = payload.detail || message;
|
||||
} catch (_err) {
|
||||
message = await response.text() || message;
|
||||
}
|
||||
throw new Error(message);
|
||||
}
|
||||
});
|
||||
await refreshCards();
|
||||
setToast('Tag deleted.', 'success');
|
||||
} catch (error) {
|
||||
setToast(error.message || 'Delete failed.', 'error');
|
||||
} finally {
|
||||
actionButton.disabled = false;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (action === 'copy-urls') {
|
||||
actionButton.disabled = true;
|
||||
try {
|
||||
await copyTextFromUrl(cardEl.dataset.exportUrlsUrl);
|
||||
} catch (error) {
|
||||
setToast(error.message || 'Failed to copy URLs.', 'error');
|
||||
} finally {
|
||||
actionButton.disabled = false;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (action === 'download-jsonl') {
|
||||
actionButton.disabled = true;
|
||||
try {
|
||||
const tagName = cardEl.querySelector('.tag-card__display strong')?.textContent || 'tag';
|
||||
await downloadFileFromUrl(cardEl.dataset.exportJsonlUrl, 'tag-' + slugify(tagName) + '-snapshots.jsonl');
|
||||
} catch (error) {
|
||||
setToast(error.message || 'Failed to download JSONL.', 'error');
|
||||
} finally {
|
||||
actionButton.disabled = false;
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
grid.addEventListener('keydown', function (event) {
|
||||
if (event.key !== 'Enter') return;
|
||||
const input = event.target.closest('.tag-card__rename input');
|
||||
if (!input) return;
|
||||
event.preventDefault();
|
||||
const saveButton = input.closest('.tag-card__rename')?.querySelector('[data-action="save-edit"]');
|
||||
saveButton?.click();
|
||||
});
|
||||
|
||||
const initialState = getCurrentState();
|
||||
renderCards(cards, initialState);
|
||||
syncLocation(initialState);
|
||||
});
|
||||
</script>
|
||||
{% endblock %}
|
||||
249
archivebox/templates/admin/personas/persona/change_form.html
Normal file
249
archivebox/templates/admin/personas/persona/change_form.html
Normal file
@@ -0,0 +1,249 @@
|
||||
{% extends "admin/change_form.html" %}
|
||||
|
||||
{% block bodyclass %}{{ block.super }} app-personas model-persona{% endblock %}
|
||||
|
||||
{% block extrastyle %}
|
||||
{{ block.super }}
|
||||
<style>
|
||||
.persona-import-hero {
|
||||
margin: 0 0 22px;
|
||||
padding: 22px 24px;
|
||||
border-radius: 18px;
|
||||
border: 1px solid #d8dee9;
|
||||
background:
|
||||
radial-gradient(circle at top right, rgba(67, 97, 238, 0.10), transparent 32%),
|
||||
linear-gradient(135deg, #fff7ed 0%, #ffffff 45%, #ecfeff 100%);
|
||||
box-shadow: 0 10px 30px rgba(15, 23, 42, 0.06);
|
||||
display: grid;
|
||||
gap: 18px;
|
||||
grid-template-columns: minmax(0, 1.8fr) minmax(280px, 1fr);
|
||||
align-items: start;
|
||||
}
|
||||
|
||||
.persona-import-hero h2 {
|
||||
margin: 0 0 8px;
|
||||
font-size: 28px;
|
||||
line-height: 1.1;
|
||||
color: #111827;
|
||||
}
|
||||
|
||||
.persona-import-hero p {
|
||||
margin: 0;
|
||||
color: #475569;
|
||||
max-width: 70ch;
|
||||
font-size: 14px;
|
||||
}
|
||||
|
||||
.persona-import-hero__meta {
|
||||
display: grid;
|
||||
gap: 10px;
|
||||
grid-template-columns: repeat(auto-fit, minmax(160px, 1fr));
|
||||
}
|
||||
|
||||
.persona-import-hero__stat {
|
||||
padding: 14px 16px;
|
||||
border-radius: 14px;
|
||||
background: rgba(255, 255, 255, 0.86);
|
||||
border: 1px solid rgba(203, 213, 225, 0.85);
|
||||
}
|
||||
|
||||
.persona-import-hero__stat span {
|
||||
display: block;
|
||||
font-size: 11px;
|
||||
text-transform: uppercase;
|
||||
letter-spacing: 0.08em;
|
||||
font-weight: 700;
|
||||
color: #64748b;
|
||||
margin-bottom: 8px;
|
||||
}
|
||||
|
||||
.persona-import-hero__stat strong,
|
||||
.persona-import-hero__stat code {
|
||||
font-size: 18px;
|
||||
color: #0f172a;
|
||||
}
|
||||
|
||||
.field-import_mode ul,
|
||||
.field-import_discovered_profile ul {
|
||||
margin: 0;
|
||||
padding: 0;
|
||||
list-style: none;
|
||||
display: grid;
|
||||
gap: 12px;
|
||||
}
|
||||
|
||||
.field-import_mode ul {
|
||||
grid-template-columns: repeat(auto-fit, minmax(220px, 1fr));
|
||||
}
|
||||
|
||||
.field-import_discovered_profile ul {
|
||||
grid-template-columns: repeat(auto-fit, minmax(260px, 1fr));
|
||||
max-height: 460px;
|
||||
overflow: auto;
|
||||
padding-right: 4px;
|
||||
}
|
||||
|
||||
.field-import_mode li,
|
||||
.field-import_discovered_profile li {
|
||||
margin: 0;
|
||||
}
|
||||
|
||||
.field-import_mode label,
|
||||
.field-import_discovered_profile label {
|
||||
display: flex;
|
||||
gap: 12px;
|
||||
align-items: flex-start;
|
||||
min-height: 100%;
|
||||
padding: 14px 16px;
|
||||
border-radius: 14px;
|
||||
border: 1px solid #dbe4ee;
|
||||
background: #fff;
|
||||
box-shadow: 0 1px 2px rgba(15, 23, 42, 0.04);
|
||||
cursor: pointer;
|
||||
transition: transform 0.15s ease, border-color 0.15s ease, box-shadow 0.15s ease;
|
||||
}
|
||||
|
||||
.field-import_mode label:hover,
|
||||
.field-import_discovered_profile label:hover {
|
||||
transform: translateY(-1px);
|
||||
border-color: #7c3aed;
|
||||
box-shadow: 0 8px 20px rgba(124, 58, 237, 0.10);
|
||||
}
|
||||
|
||||
.field-import_mode input[type="radio"],
|
||||
.field-import_discovered_profile input[type="radio"] {
|
||||
margin-top: 3px;
|
||||
flex: 0 0 auto;
|
||||
}
|
||||
|
||||
.abx-import-mode-option,
|
||||
.abx-profile-option {
|
||||
display: grid;
|
||||
gap: 6px;
|
||||
}
|
||||
|
||||
.abx-import-mode-option strong,
|
||||
.abx-profile-option strong {
|
||||
color: #0f172a;
|
||||
font-size: 15px;
|
||||
}
|
||||
|
||||
.abx-import-mode-option span:last-child,
|
||||
.abx-profile-option__meta {
|
||||
color: #64748b;
|
||||
font-size: 12px;
|
||||
line-height: 1.5;
|
||||
}
|
||||
|
||||
.abx-profile-option code {
|
||||
font-size: 11px;
|
||||
line-height: 1.5;
|
||||
white-space: normal;
|
||||
overflow-wrap: anywhere;
|
||||
color: #334155;
|
||||
background: #f8fafc;
|
||||
border: 1px solid #e2e8f0;
|
||||
border-radius: 10px;
|
||||
padding: 8px 10px;
|
||||
}
|
||||
|
||||
.abx-persona-path-list,
|
||||
.abx-persona-artifacts {
|
||||
display: grid;
|
||||
gap: 10px;
|
||||
}
|
||||
|
||||
.abx-persona-path-list div,
|
||||
.abx-persona-artifact {
|
||||
display: grid;
|
||||
gap: 6px;
|
||||
padding: 12px 14px;
|
||||
border-radius: 12px;
|
||||
border: 1px solid #e2e8f0;
|
||||
background: #f8fafc;
|
||||
}
|
||||
|
||||
.abx-persona-path-list code,
|
||||
.abx-persona-artifact code {
|
||||
white-space: normal;
|
||||
overflow-wrap: anywhere;
|
||||
font-size: 12px;
|
||||
}
|
||||
|
||||
.abx-artifact-state {
|
||||
display: inline-flex;
|
||||
width: fit-content;
|
||||
align-items: center;
|
||||
border-radius: 999px;
|
||||
padding: 2px 10px;
|
||||
font-size: 11px;
|
||||
font-weight: 700;
|
||||
text-transform: uppercase;
|
||||
letter-spacing: 0.04em;
|
||||
}
|
||||
|
||||
.abx-artifact-state--yes {
|
||||
background: #dcfce7;
|
||||
color: #166534;
|
||||
}
|
||||
|
||||
.abx-artifact-state--no {
|
||||
background: #fee2e2;
|
||||
color: #991b1b;
|
||||
}
|
||||
|
||||
@media (max-width: 960px) {
|
||||
.persona-import-hero {
|
||||
grid-template-columns: 1fr;
|
||||
}
|
||||
}
|
||||
</style>
|
||||
{% endblock %}
|
||||
|
||||
{% block extrahead %}
|
||||
{{ block.super }}
|
||||
<script>
|
||||
document.addEventListener('DOMContentLoaded', function () {
|
||||
const modeInputs = Array.from(document.querySelectorAll('input[name="import_mode"]'));
|
||||
const discoveredRow = document.querySelector('.form-row.field-import_discovered_profile');
|
||||
const sourceRow = document.querySelector('.form-row.field-import_source');
|
||||
const profileRow = document.querySelector('.form-row.field-import_profile_name');
|
||||
|
||||
const updateVisibility = () => {
|
||||
const selected = modeInputs.find((input) => input.checked)?.value || 'none';
|
||||
if (discoveredRow) discoveredRow.style.display = selected === 'discovered' ? '' : 'none';
|
||||
if (sourceRow) sourceRow.style.display = selected === 'custom' ? '' : 'none';
|
||||
if (profileRow) profileRow.style.display = selected === 'custom' ? '' : 'none';
|
||||
};
|
||||
|
||||
modeInputs.forEach((input) => input.addEventListener('change', updateVisibility));
|
||||
updateVisibility();
|
||||
});
|
||||
</script>
|
||||
{% endblock %}
|
||||
|
||||
{% block form_top %}
|
||||
<section class="persona-import-hero">
|
||||
<div>
|
||||
<h2>Bootstrap a persona from a real browser session</h2>
|
||||
<p>
|
||||
Pick a local Chromium profile, paste an absolute profile path, or attach to a live CDP endpoint.
|
||||
The form saves the Persona normally, then imports profile files, cookies, and optional tab storage into
|
||||
the Persona's own directories.
|
||||
</p>
|
||||
</div>
|
||||
<div class="persona-import-hero__meta">
|
||||
<div class="persona-import-hero__stat">
|
||||
<span>Detected profiles</span>
|
||||
<strong>{{ detected_profile_count }}</strong>
|
||||
</div>
|
||||
<div class="persona-import-hero__stat">
|
||||
<span>Persona artifacts</span>
|
||||
<code>chrome_user_data</code>
|
||||
<code>cookies.txt</code>
|
||||
<code>auth.json</code>
|
||||
</div>
|
||||
</div>
|
||||
</section>
|
||||
{{ block.super }}
|
||||
{% endblock %}
|
||||
@@ -706,14 +706,14 @@
|
||||
? Math.max(0, Math.min(100, extractor.progress))
|
||||
: null;
|
||||
const progressStyle = progress !== null ? ` style="width: ${progress}%;"` : '';
|
||||
const pidHtml = extractor.pid ? `<span class="pid-label compact">pid ${extractor.pid}</span>` : '';
|
||||
const pidHtml = extractor.status === 'started' && extractor.pid ? `<span class="pid-label compact">pid ${extractor.pid}</span>` : '';
|
||||
|
||||
return `
|
||||
<span class="extractor-badge ${extractor.status || 'queued'}">
|
||||
<span class="progress-fill"${progressStyle}></span>
|
||||
<span class="badge-content">
|
||||
<span class="badge-icon">${icon}</span>
|
||||
<span>${extractor.plugin || 'unknown'}</span>
|
||||
<span>${extractor.label || extractor.plugin || 'unknown'}</span>
|
||||
${pidHtml}
|
||||
</span>
|
||||
</span>
|
||||
@@ -742,6 +742,23 @@
|
||||
`;
|
||||
}
|
||||
|
||||
const hasProcessEntries = (snapshot.all_plugins || []).some(extractor => extractor.source === 'process');
|
||||
const hasArchiveResults = (snapshot.all_plugins || []).some(extractor => extractor.source === 'archiveresult');
|
||||
const processOnly = hasProcessEntries && !hasArchiveResults;
|
||||
const runningProcessCount = (snapshot.all_plugins || []).filter(extractor => extractor.source === 'process' && extractor.status === 'started').length;
|
||||
const failedProcessCount = (snapshot.all_plugins || []).filter(extractor => extractor.source === 'process' && extractor.status === 'failed').length;
|
||||
const snapshotMeta = (snapshot.total_plugins || 0) > 0
|
||||
? processOnly
|
||||
? runningProcessCount > 0
|
||||
? `Running ${runningProcessCount}/${snapshot.total_plugins || 0} setup hooks`
|
||||
: failedProcessCount > 0
|
||||
? `${failedProcessCount} setup hook${failedProcessCount === 1 ? '' : 's'} failed`
|
||||
: `${snapshot.completed_plugins || 0}/${snapshot.total_plugins || 0} setup hooks`
|
||||
: hasProcessEntries
|
||||
? `${snapshot.completed_plugins || 0}/${snapshot.total_plugins || 0} tasks${(snapshot.failed_plugins || 0) > 0 ? ` <span style="color:#f85149">(${snapshot.failed_plugins} failed)</span>` : ''}${runningProcessCount > 0 ? ` <span style="color:#d29922">(${runningProcessCount} hooks running)</span>` : ''}`
|
||||
: `${snapshot.completed_plugins || 0}/${snapshot.total_plugins || 0} extractors${(snapshot.failed_plugins || 0) > 0 ? ` <span style="color:#f85149">(${snapshot.failed_plugins} failed)</span>` : ''}`
|
||||
: 'Waiting for extractors...';
|
||||
|
||||
return `
|
||||
<div class="snapshot-item">
|
||||
<div class="snapshot-header">
|
||||
@@ -750,9 +767,7 @@
|
||||
<div class="snapshot-info">
|
||||
<div class="snapshot-url">${formatUrl(snapshot.url)}</div>
|
||||
<div class="snapshot-meta">
|
||||
${(snapshot.total_plugins || 0) > 0
|
||||
? `${snapshot.completed_plugins || 0}/${snapshot.total_plugins || 0} extractors${(snapshot.failed_plugins || 0) > 0 ? ` <span style="color:#f85149">(${snapshot.failed_plugins} failed)</span>` : ''}`
|
||||
: 'Waiting for extractors...'}
|
||||
${snapshotMeta}
|
||||
</div>
|
||||
</div>
|
||||
${snapshotPidHtml}
|
||||
@@ -762,7 +777,7 @@
|
||||
</div>
|
||||
<div class="snapshot-progress">
|
||||
<div class="progress-bar-container">
|
||||
<div class="progress-bar snapshot ${snapshot.status === 'started' && (snapshot.progress || 0) === 0 ? 'indeterminate' : ''}"
|
||||
<div class="progress-bar snapshot ${((processOnly && runningProcessCount > 0) || (snapshot.status === 'started' && (snapshot.progress || 0) === 0)) ? 'indeterminate' : ''}"
|
||||
style="width: ${snapshot.progress || 0}%"></div>
|
||||
</div>
|
||||
</div>
|
||||
@@ -784,6 +799,29 @@
|
||||
if (crawl.active_snapshots && crawl.active_snapshots.length > 0) {
|
||||
snapshotsHtml = crawl.active_snapshots.map(s => renderSnapshot(s, crawl.id)).join('');
|
||||
}
|
||||
let setupHtml = '';
|
||||
if (crawl.setup_plugins && crawl.setup_plugins.length > 0) {
|
||||
const setupSummary = `${crawl.setup_completed_plugins || 0}/${crawl.setup_total_plugins || 0} setup tasks${(crawl.setup_failed_plugins || 0) > 0 ? ` <span style="color:#f85149">(${crawl.setup_failed_plugins} failed)</span>` : ''}`;
|
||||
const sortedSetup = [...crawl.setup_plugins].sort((a, b) =>
|
||||
(a.plugin || '').localeCompare(b.plugin || '')
|
||||
);
|
||||
setupHtml = `
|
||||
<div class="snapshot-item">
|
||||
<div class="snapshot-header">
|
||||
<div class="snapshot-header-link">
|
||||
<span class="snapshot-icon">⚙</span>
|
||||
<div class="snapshot-info">
|
||||
<div class="snapshot-url">Crawl Setup</div>
|
||||
<div class="snapshot-meta">${setupSummary}</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="extractor-list">
|
||||
${sortedSetup.map(e => renderExtractor(e)).join('')}
|
||||
</div>
|
||||
</div>
|
||||
`;
|
||||
}
|
||||
|
||||
// Show warning if crawl is stuck (queued but can't start)
|
||||
let warningHtml = '';
|
||||
@@ -847,6 +885,7 @@
|
||||
${warningHtml}
|
||||
<div class="crawl-body">
|
||||
<div class="snapshot-list">
|
||||
${setupHtml}
|
||||
${snapshotsHtml}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,4 +1,4 @@
|
||||
{% load static tz admin_urls %}
|
||||
{% load static tz admin_urls core_tags %}
|
||||
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
@@ -9,6 +9,10 @@
|
||||
<link rel="stylesheet" href="{% static 'admin/css/base.css' %}">
|
||||
<link rel="stylesheet" href="{% static 'admin.css' %}">
|
||||
<link rel="stylesheet" href="{% static 'bootstrap.min.css' %}">
|
||||
{% api_token as api_token %}
|
||||
<script>
|
||||
window.ARCHIVEBOX_API_KEY = "{{ api_token|escapejs }}";
|
||||
</script>
|
||||
|
||||
<script src="{% static 'jquery.min.js' %}"></script>
|
||||
{% block extra_head %}
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
<a href="/admin/core/tag/">Tags</a> |
|
||||
<a href="/admin/core/archiveresult/?o=-1">Log</a>
|
||||
<a href="{% url 'Docs' %}" target="_blank" rel="noopener noreferrer">Docs</a> |
|
||||
<a href="/api">API</a> |
|
||||
<a href="/api/v1/docs">API</a> |
|
||||
<a href="{% url 'public-index' %}">Public</a> |
|
||||
<a href="/admin/">Admin</a>
|
||||
|
||||
|
||||
@@ -456,6 +456,9 @@
|
||||
text-overflow: ellipsis;
|
||||
white-space: nowrap;
|
||||
}
|
||||
.thumb-card:has([data-compact]) .card-text {
|
||||
display: none;
|
||||
}
|
||||
.thumb-card:has([data-compact]) .thumbnail-text-header,
|
||||
.thumb-card:has([data-compact]) .thumbnail-compact-icon,
|
||||
.thumb-card:has([data-compact]) .thumbnail-compact-label {
|
||||
@@ -620,8 +623,9 @@
|
||||
<div class="header-top container-fluid">
|
||||
<div class="row nav">
|
||||
<div class="col-lg-2" style="line-height: 50px; vertical-align: middle">
|
||||
<a href="../../index.html" class="header-archivebox" title="Go to Main Index...">
|
||||
<img src="/static/archive.png" alt="Archive Icon">
|
||||
{% public_base_url as public_base %}
|
||||
<a href="{% if public_base %}{{ public_base }}/public/{% else %}/{% endif %}" class="header-archivebox" title="Go to Public Index...">
|
||||
<img src="{% if public_base %}{{ public_base }}/static/archive.png{% else %}/static/archive.png{% endif %}" alt="Archive Icon">
|
||||
ArchiveBox
|
||||
</a>
|
||||
</div>
|
||||
@@ -683,12 +687,10 @@
|
||||
<div class="info-chunk">
|
||||
<h5>🗃 Snapshot: <a href="{% admin_base_url %}/admin/core/snapshot/{{snapshot_id|default:id}}/change/"><code style="color: rgba(255,255,255,0.6); font-weight: 200; font-size: 12px; background-color: #1a1a1a"><b>[{{timestamp}}]</b> <small>{{snapshot_id|default:id|truncatechars:24}}</small></code></a></h5>
|
||||
<a href="{% snapshot_url snapshot 'index.json' %}" title="JSON summary of archived link.">JSON</a> |
|
||||
<a href="{% snapshot_url snapshot 'warc/' %}" title="Any WARC archives for the page">WARC</a> |
|
||||
<a href="{% snapshot_url snapshot 'media/' %}" title="Audio, Video, and Subtitle files.">Media</a> |
|
||||
<a href="{% snapshot_url snapshot 'git/' %}" title="Any git repos at the url">Git</a> |
|
||||
<a href="{% snapshot_base_url snapshot %}/?files=1" title="Browse the full SNAP_DIR for this snapshot">See all files...</a> |
|
||||
<a href="{% admin_base_url %}/admin/core/snapshot/?q={{snapshot_id|default:id}}" title="Go to the Snapshot admin to update, overwrite, or delete this Snapshot">Actions</a> |
|
||||
<a href="{% admin_base_url %}/admin/core/snapshot/{{snapshot_id|default:id}}/change/" title="Edit this snapshot in the Admin UI">Admin</a> |
|
||||
<a href="{% snapshot_base_url snapshot %}/?files=1" title="Webserver-provided index of files directory.">See all files...</a><br/>
|
||||
<a href="https://web.archive.org/web/{{url}}" title="Search for a copy of the URL saved in Archive.org" target="_blank" rel="noreferrer">Archive.org</a><br/>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
@@ -713,12 +715,12 @@
|
||||
<a href="{{display_url}}" data-no-preview="1" title="Download output file" download>⬇️</a>
|
||||
{% endif %}
|
||||
</div>
|
||||
<a href="{{ display_url }}" target="preview">
|
||||
<h4 class="card-title">{% plugin_icon result_info.name %} {{ result_info.name|plugin_name|truncatechars:20 }}</h4>
|
||||
</a>
|
||||
<a href="{{ display_url }}" title="Open in new tab..." target="_blank" rel="noopener">
|
||||
<p class="card-text"><code>{{ result_info.path }}</code></p>
|
||||
</a>
|
||||
<a href="{{ display_url }}" target="preview">
|
||||
<h4 class="card-title">{{ result_info.name|title }}</h4>
|
||||
</a>
|
||||
{% if result_info.result %}
|
||||
{% with plugin_base=result_info.name|plugin_name %}
|
||||
{% if plugin_base == 'ytdlp' or plugin_base == 'yt-dlp' or plugin_base == 'youtube-dl' %}
|
||||
|
||||
@@ -902,9 +902,9 @@
|
||||
<div class="header-top">
|
||||
<div class="header-nav">
|
||||
<div class="header-col header-left" style="line-height: 58px; vertical-align: middle">
|
||||
<a href="/" class="header-archivebox" title="Go to Main Index...">
|
||||
{% web_base_url as web_base %}
|
||||
<img src="{% if web_base %}//{{ web_base|cut:'http://'|cut:'https://' }}/static/archive.png{% else %}{% static 'archive.png' %}{% endif %}" alt="Archive Icon">
|
||||
{% public_base_url as public_base %}
|
||||
<a href="{% if public_base %}{{ public_base }}/public/{% else %}/{% endif %}" class="header-archivebox" title="Go to Public Index...">
|
||||
<img src="{% if public_base %}{{ public_base }}/static/archive.png{% else %}{% static 'archive.png' %}{% endif %}" alt="Archive Icon">
|
||||
ArchiveBox
|
||||
</a>
|
||||
</div>
|
||||
@@ -996,8 +996,7 @@
|
||||
<br/>
|
||||
<div class="external-links">
|
||||
📁
|
||||
<a href="{% snapshot_base_url snapshot %}/?files=1" title="Browse files for this snapshot" target="_blank">FILES</a> | 🗃️
|
||||
<a href="{% snapshot_url snapshot warc_path %}" title="Download the ArchiveBox-generated WARC file" target="_blank">WARC</a> |
|
||||
<a href="{% snapshot_base_url snapshot %}/?files=1" title="Browse the full SNAP_DIR for this snapshot" target="_blank">See all files...</a> |
|
||||
<a href="https://web.archive.org/web/{{url}}" title="Search for a copy of the URL saved in Archive.org" target="_blank" rel="noreferrer">🏛️ Archive.org</a>
|
||||
<!--<a href="https://archive.md/{{url}}" title="Search for a copy of the URL saved in Archive.today" target="_blank" rel="noreferrer">Archive.today</a> | -->
|
||||
<!--<a href="https://ghostarchive.org/search?term={{url}}" title="Search for a copy of the URL saved in GhostArchive.org" target="_blank" rel="noreferrer">More...</a>-->
|
||||
@@ -1010,7 +1009,7 @@
|
||||
|
||||
|
||||
{% for result in archiveresults %}
|
||||
{% with display_path=result.path|default:result.result.embed_path display_url='' %}
|
||||
{% with display_path=result.path display_url='' %}
|
||||
{% if display_path %}{% snapshot_url snapshot display_path as display_url %}{% endif %}
|
||||
<div class="thumb-card{% if forloop.first %} selected-card{% endif %}"{% if display_url %} data-preview-url="{{display_url}}"{% endif %}>
|
||||
<div class="thumb-body">
|
||||
|
||||
@@ -78,6 +78,7 @@ textarea, select, input[type="text"] {
|
||||
box-shadow: 4px 4px 4px rgba(0,0,0,0.02);
|
||||
width: 100%;
|
||||
padding: 8px 12px;
|
||||
font-family: inherit;
|
||||
font-size: 14px;
|
||||
}
|
||||
|
||||
@@ -85,6 +86,10 @@ textarea {
|
||||
min-height: 300px;
|
||||
}
|
||||
|
||||
input[type="text"] {
|
||||
min-height: 42px;
|
||||
}
|
||||
|
||||
textarea[rows="3"] {
|
||||
min-height: 80px;
|
||||
}
|
||||
@@ -153,6 +158,13 @@ select {
|
||||
margin-bottom: 20px;
|
||||
}
|
||||
|
||||
.settings-row {
|
||||
display: grid;
|
||||
grid-template-columns: minmax(260px, 340px) minmax(420px, 1fr);
|
||||
gap: 18px;
|
||||
align-items: start;
|
||||
}
|
||||
|
||||
.form-field label {
|
||||
display: block;
|
||||
font-size: 16px;
|
||||
@@ -160,6 +172,234 @@ select {
|
||||
margin-bottom: 8px;
|
||||
}
|
||||
|
||||
.field-header {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 10px;
|
||||
margin-bottom: 8px;
|
||||
}
|
||||
|
||||
.field-header label {
|
||||
margin-bottom: 0;
|
||||
}
|
||||
|
||||
.url-workbench {
|
||||
display: grid;
|
||||
grid-template-columns: minmax(0, 1fr) minmax(280px, 360px);
|
||||
gap: 18px;
|
||||
align-items: start;
|
||||
}
|
||||
|
||||
.url-editor-column {
|
||||
min-width: 0;
|
||||
}
|
||||
|
||||
.url-editor-shell {
|
||||
position: relative;
|
||||
}
|
||||
|
||||
.url-editor-shell textarea[name="url"] {
|
||||
position: relative;
|
||||
z-index: 2;
|
||||
background: transparent;
|
||||
color: #1f2937;
|
||||
-webkit-text-fill-color: #1f2937;
|
||||
caret-color: #1f2937;
|
||||
min-height: 240px;
|
||||
height: 240px;
|
||||
line-height: 1.5;
|
||||
resize: vertical;
|
||||
}
|
||||
|
||||
.url-editor-shell textarea[name="url"]::selection {
|
||||
background: rgba(0, 72, 130, 0.18);
|
||||
}
|
||||
|
||||
.url-highlight-layer {
|
||||
position: absolute;
|
||||
inset: 2px;
|
||||
z-index: 1;
|
||||
margin: 0;
|
||||
padding: 8px 12px;
|
||||
overflow: auto;
|
||||
pointer-events: none;
|
||||
white-space: pre-wrap;
|
||||
overflow-wrap: anywhere;
|
||||
word-break: break-word;
|
||||
font-family: inherit;
|
||||
font-size: 14px;
|
||||
line-height: 1.5;
|
||||
color: transparent;
|
||||
background: transparent;
|
||||
border-radius: 2px;
|
||||
scrollbar-width: none;
|
||||
}
|
||||
|
||||
.url-highlight-layer::-webkit-scrollbar {
|
||||
display: none;
|
||||
}
|
||||
|
||||
.url-highlight-segment {
|
||||
border-radius: 3px;
|
||||
}
|
||||
|
||||
.detected-urls-panel {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
min-height: 240px;
|
||||
padding: 12px 14px;
|
||||
background: linear-gradient(180deg, #fff 0%, #f6f8fb 100%);
|
||||
border: 1px solid #d7e2eb;
|
||||
border-radius: 8px;
|
||||
overflow: hidden;
|
||||
}
|
||||
|
||||
.detected-urls-header {
|
||||
display: flex;
|
||||
align-items: baseline;
|
||||
justify-content: space-between;
|
||||
gap: 12px;
|
||||
margin-bottom: 10px;
|
||||
}
|
||||
|
||||
.detected-urls-summary {
|
||||
font-size: 12px;
|
||||
color: #5f6c78;
|
||||
}
|
||||
|
||||
.detected-urls-list {
|
||||
flex: 1;
|
||||
min-height: 0;
|
||||
display: grid;
|
||||
align-content: start;
|
||||
gap: 8px;
|
||||
overflow: auto;
|
||||
padding-right: 4px;
|
||||
}
|
||||
|
||||
.detected-urls-empty {
|
||||
padding: 8px 0;
|
||||
color: #6b7280;
|
||||
font-size: 13px;
|
||||
line-height: 1.5;
|
||||
}
|
||||
|
||||
.detected-url-item {
|
||||
display: grid;
|
||||
gap: 8px;
|
||||
padding: 10px 12px;
|
||||
border-left: 4px solid var(--detected-url-border, #d0d7de);
|
||||
border-radius: 6px;
|
||||
background: linear-gradient(90deg, var(--detected-url-bg, rgba(0, 0, 0, 0.03)), rgba(255, 255, 255, 0.96) 28%);
|
||||
}
|
||||
|
||||
.detected-url-topline {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: space-between;
|
||||
gap: 8px;
|
||||
}
|
||||
|
||||
.detected-url-controls {
|
||||
display: flex;
|
||||
flex-wrap: nowrap;
|
||||
gap: 6px;
|
||||
min-width: 0;
|
||||
}
|
||||
|
||||
.detected-url-number {
|
||||
width: 20px;
|
||||
height: 20px;
|
||||
display: inline-flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
border-radius: 999px;
|
||||
background: rgba(15, 23, 42, 0.08);
|
||||
color: #24303b;
|
||||
font-size: 10px;
|
||||
font-weight: 700;
|
||||
}
|
||||
|
||||
.detected-url-body {
|
||||
min-width: 0;
|
||||
}
|
||||
|
||||
.detected-url-value {
|
||||
display: block;
|
||||
font-size: 12px;
|
||||
line-height: 1.45;
|
||||
color: #1f2937;
|
||||
overflow-wrap: anywhere;
|
||||
}
|
||||
|
||||
.detected-url-toggle-btn {
|
||||
flex: 0 0 auto;
|
||||
display: inline-flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
padding: 4px 8px;
|
||||
min-height: 24px;
|
||||
border: 1px solid rgba(148, 163, 184, 0.4);
|
||||
border-radius: 999px;
|
||||
background: rgba(148, 163, 184, 0.12);
|
||||
color: #64748b;
|
||||
font-size: 11px;
|
||||
font-weight: 700;
|
||||
line-height: 1;
|
||||
white-space: nowrap;
|
||||
transition: background-color 120ms ease, border-color 120ms ease, color 120ms ease;
|
||||
cursor: pointer;
|
||||
}
|
||||
|
||||
.detected-url-toggle-btn:hover {
|
||||
background: rgba(15, 23, 42, 0.08);
|
||||
}
|
||||
|
||||
.detected-url-toggle-btn-inactive:hover {
|
||||
border-color: rgba(180, 35, 24, 0.28);
|
||||
background: rgba(180, 35, 24, 0.10);
|
||||
color: #b42318;
|
||||
}
|
||||
|
||||
.detected-url-toggle-btn-active:hover {
|
||||
border-color: rgba(22, 101, 52, 0.28);
|
||||
background: rgba(22, 101, 52, 0.10);
|
||||
color: #166534;
|
||||
}
|
||||
|
||||
.detected-url-toggle-btn-disabled,
|
||||
.detected-url-toggle-btn-disabled:hover {
|
||||
border-color: rgba(203, 213, 225, 0.55);
|
||||
background: rgba(226, 232, 240, 0.45);
|
||||
color: #94a3b8;
|
||||
cursor: not-allowed;
|
||||
}
|
||||
|
||||
.detected-url-message {
|
||||
margin-top: 4px;
|
||||
font-size: 11px;
|
||||
color: #617080;
|
||||
line-height: 1.45;
|
||||
}
|
||||
|
||||
.detected-url-allowlisted .detected-url-value {
|
||||
color: #166534;
|
||||
}
|
||||
|
||||
.detected-url-denied .detected-url-value {
|
||||
color: #b42318;
|
||||
text-decoration: line-through;
|
||||
text-decoration-thickness: 1.5px;
|
||||
}
|
||||
|
||||
.detected-url-denied .detected-url-message {
|
||||
color: #b42318;
|
||||
}
|
||||
|
||||
.detected-url-filtered .detected-url-value {
|
||||
color: #6b7280;
|
||||
}
|
||||
|
||||
.form-field .help-text {
|
||||
font-size: 12px;
|
||||
color: #666;
|
||||
@@ -173,7 +413,137 @@ select {
|
||||
margin-top: 4px;
|
||||
}
|
||||
|
||||
/* Checkbox fields (for overwrite, update, index_only) */
|
||||
.tag-editor-container {
|
||||
display: flex;
|
||||
flex-wrap: wrap;
|
||||
align-items: center;
|
||||
gap: 6px;
|
||||
padding: 8px 12px;
|
||||
min-height: 44px;
|
||||
background: #fff;
|
||||
border: 2px solid #004882;
|
||||
border-radius: 4px;
|
||||
box-shadow: 4px 4px 4px rgba(0,0,0,0.02);
|
||||
cursor: text;
|
||||
}
|
||||
|
||||
.tag-editor-container:focus-within {
|
||||
border-color: #2c7ec1;
|
||||
}
|
||||
|
||||
.tag-pills {
|
||||
display: flex;
|
||||
flex-wrap: wrap;
|
||||
gap: 6px;
|
||||
align-items: center;
|
||||
}
|
||||
|
||||
.tag-pill {
|
||||
display: inline-flex;
|
||||
align-items: center;
|
||||
gap: 4px;
|
||||
padding: 4px 8px 4px 10px;
|
||||
background: var(--tag-bg, #e2e8f0);
|
||||
color: var(--tag-fg, #1e293b);
|
||||
border-radius: 16px;
|
||||
border: 1px solid var(--tag-border, #cbd5e1);
|
||||
font-size: 13px;
|
||||
font-weight: 500;
|
||||
}
|
||||
|
||||
.tag-remove-btn {
|
||||
display: inline-flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
width: 16px;
|
||||
height: 16px;
|
||||
padding: 0;
|
||||
margin: 0;
|
||||
border: 1px solid rgba(15, 23, 42, 0.12);
|
||||
border-radius: 50%;
|
||||
background: rgba(15, 23, 42, 0.08);
|
||||
color: inherit;
|
||||
font-size: 14px;
|
||||
line-height: 1;
|
||||
cursor: pointer;
|
||||
}
|
||||
|
||||
.tag-inline-input {
|
||||
flex: 1;
|
||||
min-width: 120px;
|
||||
padding: 4px 0;
|
||||
border: none !important;
|
||||
box-shadow: none !important;
|
||||
outline: none;
|
||||
background: transparent;
|
||||
}
|
||||
|
||||
.tag-inline-input::placeholder {
|
||||
color: #7c8b98;
|
||||
}
|
||||
|
||||
.url-filters-widget textarea {
|
||||
min-height: 58px;
|
||||
font-family: monospace;
|
||||
font-size: 13px;
|
||||
}
|
||||
|
||||
.url-filters-field > label {
|
||||
display: none;
|
||||
}
|
||||
|
||||
.url-filters-grid {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(2, minmax(0, 1fr));
|
||||
gap: 12px;
|
||||
}
|
||||
|
||||
.url-filter-label-row {
|
||||
display: flex;
|
||||
align-items: baseline;
|
||||
flex-wrap: nowrap;
|
||||
gap: 10px;
|
||||
width: 100%;
|
||||
margin-bottom: 6px;
|
||||
}
|
||||
|
||||
.url-filters-column .url-filter-label {
|
||||
display: block;
|
||||
font-size: 14px;
|
||||
margin-bottom: 0;
|
||||
}
|
||||
|
||||
.url-filter-label-main {
|
||||
font-weight: 600;
|
||||
white-space: nowrap;
|
||||
}
|
||||
|
||||
.url-filter-label-note {
|
||||
display: inline-block;
|
||||
flex: 0 0 auto;
|
||||
margin-left: auto;
|
||||
font-size: 12px;
|
||||
color: #7a7a7a;
|
||||
font-weight: 400;
|
||||
font-style: italic;
|
||||
text-align: right;
|
||||
white-space: nowrap;
|
||||
}
|
||||
|
||||
.url-filters-toggle {
|
||||
display: inline-flex !important;
|
||||
align-items: center;
|
||||
gap: 8px;
|
||||
margin-top: 10px;
|
||||
font-size: 14px !important;
|
||||
font-weight: 600;
|
||||
}
|
||||
|
||||
.url-filters-toggle input[type="checkbox"] {
|
||||
width: auto;
|
||||
margin: 0;
|
||||
}
|
||||
|
||||
.checkbox-field {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
@@ -193,7 +563,6 @@ select {
|
||||
/* URL Counter */
|
||||
.url-counter {
|
||||
display: inline-block;
|
||||
margin-top: 8px;
|
||||
padding: 4px 10px;
|
||||
font-size: 13px;
|
||||
font-weight: 600;
|
||||
@@ -209,13 +578,27 @@ select {
|
||||
border-color: #c3e6cb;
|
||||
}
|
||||
|
||||
@media (max-width: 1020px) {
|
||||
.settings-row {
|
||||
grid-template-columns: 1fr;
|
||||
}
|
||||
|
||||
.url-workbench {
|
||||
grid-template-columns: 1fr;
|
||||
}
|
||||
|
||||
.url-filters-grid {
|
||||
grid-template-columns: 1fr;
|
||||
}
|
||||
}
|
||||
|
||||
/* Plugin Presets */
|
||||
.plugin-presets {
|
||||
display: flex;
|
||||
flex-wrap: wrap;
|
||||
align-items: center;
|
||||
gap: 8px;
|
||||
margin-bottom: 20px;
|
||||
margin-bottom: 18px;
|
||||
padding: 15px;
|
||||
background-color: #f8f9fa;
|
||||
border: 1px solid #dee2e6;
|
||||
@@ -254,11 +637,18 @@ select {
|
||||
|
||||
/* Plugin groups */
|
||||
.plugin-group {
|
||||
margin-bottom: 20px;
|
||||
padding: 15px;
|
||||
padding: 14px 16px;
|
||||
background-color: white;
|
||||
border: 1px solid #ddd;
|
||||
border-radius: 6px;
|
||||
min-width: 0;
|
||||
}
|
||||
|
||||
.plugin-groups-grid {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(2, minmax(280px, 1fr));
|
||||
gap: 16px;
|
||||
align-items: start;
|
||||
}
|
||||
|
||||
.plugin-group-header {
|
||||
@@ -268,6 +658,7 @@ select {
|
||||
margin-bottom: 12px;
|
||||
padding-bottom: 8px;
|
||||
border-bottom: 2px solid #004882;
|
||||
gap: 12px;
|
||||
}
|
||||
|
||||
.plugin-group-header label {
|
||||
@@ -277,6 +668,12 @@ select {
|
||||
margin: 0;
|
||||
}
|
||||
|
||||
.plugin-group-note {
|
||||
font-size: 12px;
|
||||
color: #7a7a7a;
|
||||
white-space: nowrap;
|
||||
}
|
||||
|
||||
.select-all-btn {
|
||||
padding: 4px 12px;
|
||||
font-size: 12px;
|
||||
@@ -293,42 +690,105 @@ select {
|
||||
|
||||
.plugin-checkboxes {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(auto-fill, minmax(200px, 1fr));
|
||||
gap: 8px;
|
||||
grid-template-columns: 1fr;
|
||||
gap: 6px;
|
||||
}
|
||||
|
||||
.plugin-checkboxes ul {
|
||||
list-style-type: none;
|
||||
padding: 0;
|
||||
margin: 0;
|
||||
display: contents;
|
||||
.plugin-checkboxes > div {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(2, minmax(0, 1fr));
|
||||
gap: 6px 10px;
|
||||
}
|
||||
|
||||
.plugin-checkboxes li {
|
||||
.plugin-checkboxes > div > div {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 8px;
|
||||
padding: 6px;
|
||||
padding: 6px 8px;
|
||||
border: 1px solid #e3e8ef;
|
||||
background-color: #fff;
|
||||
border-radius: 4px;
|
||||
transition: background-color 0.2s;
|
||||
}
|
||||
|
||||
.plugin-checkboxes li:hover {
|
||||
.plugin-checkboxes > div > div:hover {
|
||||
background-color: #f5f5f5;
|
||||
}
|
||||
|
||||
.plugin-checkboxes input[type="checkbox"] {
|
||||
grid-column: 2;
|
||||
grid-row: 1 / span 2;
|
||||
margin: 0;
|
||||
margin-top: 2px;
|
||||
width: auto;
|
||||
flex: 0 0 auto;
|
||||
}
|
||||
|
||||
.plugin-checkboxes label {
|
||||
#add-form .plugin-checkboxes label {
|
||||
display: grid !important;
|
||||
grid-template-columns: 18px 16px minmax(0, 1fr);
|
||||
column-gap: 8px;
|
||||
row-gap: 3px;
|
||||
align-items: start;
|
||||
width: 100%;
|
||||
margin: 0;
|
||||
font-size: 14px;
|
||||
font-weight: normal;
|
||||
cursor: pointer;
|
||||
}
|
||||
|
||||
.plugin-choice-name {
|
||||
grid-column: 3;
|
||||
grid-row: 1;
|
||||
font-weight: 500;
|
||||
color: #1f2937;
|
||||
}
|
||||
|
||||
#add-form .plugin-choice-icon {
|
||||
grid-column: 1;
|
||||
grid-row: 1 / span 2;
|
||||
display: inline-flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
color: #7a7a7a;
|
||||
flex: 0 0 auto;
|
||||
}
|
||||
|
||||
#add-form .plugin-choice-icon .abx-output-icon {
|
||||
display: inline-flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
}
|
||||
|
||||
#add-form .plugin-choice-icon svg {
|
||||
width: 18px;
|
||||
height: 18px;
|
||||
}
|
||||
|
||||
#add-form .plugin-choice-description {
|
||||
grid-column: 3;
|
||||
grid-row: 2;
|
||||
margin-left: 0;
|
||||
display: inline-block;
|
||||
font-size: 12px;
|
||||
color: #7a7a7a !important;
|
||||
text-decoration: none !important;
|
||||
text-align: left;
|
||||
}
|
||||
|
||||
#add-form .plugin-checkboxes label a.plugin-choice-description:link,
|
||||
#add-form .plugin-checkboxes label a.plugin-choice-description:visited,
|
||||
#add-form .plugin-checkboxes label a.plugin-choice-description:active {
|
||||
color: #7a7a7a !important;
|
||||
text-decoration: none !important;
|
||||
}
|
||||
|
||||
#add-form .plugin-checkboxes label a.plugin-choice-description:hover,
|
||||
#add-form .plugin-checkboxes label a.plugin-choice-description:focus {
|
||||
color: #4b5563 !important;
|
||||
text-decoration: underline !important;
|
||||
}
|
||||
|
||||
/* Advanced section (collapsible) */
|
||||
.advanced-section {
|
||||
background-color: white;
|
||||
@@ -388,6 +848,14 @@ input:focus, select:focus, textarea:focus, button:focus {
|
||||
grid-template-columns: 1fr;
|
||||
}
|
||||
|
||||
.plugin-groups-grid {
|
||||
grid-template-columns: 1fr;
|
||||
}
|
||||
|
||||
.plugin-checkboxes > div {
|
||||
grid-template-columns: 1fr;
|
||||
}
|
||||
|
||||
.plugin-group-header {
|
||||
flex-direction: column;
|
||||
align-items: flex-start;
|
||||
|
||||
@@ -477,6 +477,10 @@ body.model-snapshot.change-list #content .object-tools {
|
||||
max-width: 220px;
|
||||
}
|
||||
|
||||
#content td.field-tags_inline .tag-editor-inline.readonly {
|
||||
padding-right: 0;
|
||||
}
|
||||
|
||||
#content th.field-tags_inline,
|
||||
#content td.field-tags_inline {
|
||||
max-width: 220px;
|
||||
@@ -610,6 +614,56 @@ body.model-snapshot.change-list #content .object-tools {
|
||||
border-radius: 4px;
|
||||
}
|
||||
|
||||
body.model-archiveresult.change-list #result_list td.field-cmd_str {
|
||||
width: 300px !important;
|
||||
max-width: 300px !important;
|
||||
min-width: 300px !important;
|
||||
}
|
||||
|
||||
body.model-archiveresult.change-list #result_list td.field-cmd_str > div,
|
||||
body.model-archiveresult.change-list #result_list td.field-cmd_str code {
|
||||
max-width: 300px !important;
|
||||
}
|
||||
|
||||
body.model-archiveresult.change-list #result_list {
|
||||
table-layout: fixed;
|
||||
width: 100%;
|
||||
}
|
||||
|
||||
body.model-archiveresult.change-list #result_list th.column-cmd_str,
|
||||
body.model-archiveresult.change-list #result_list td.field-cmd_str {
|
||||
width: 300px !important;
|
||||
max-width: 300px !important;
|
||||
min-width: 300px !important;
|
||||
overflow: hidden !important;
|
||||
box-sizing: border-box;
|
||||
}
|
||||
|
||||
body.model-archiveresult.change-list #result_list th.column-process_link,
|
||||
body.model-archiveresult.change-list #result_list td.field-process_link {
|
||||
width: 72px;
|
||||
white-space: nowrap;
|
||||
}
|
||||
|
||||
body.model-archiveresult.change-list #result_list th.column-machine_link,
|
||||
body.model-archiveresult.change-list #result_list td.field-machine_link {
|
||||
width: 180px;
|
||||
}
|
||||
|
||||
body.model-archiveresult.change-list #result_list td.field-snapshot_info a {
|
||||
display: block;
|
||||
overflow: hidden;
|
||||
text-overflow: ellipsis;
|
||||
}
|
||||
|
||||
body.model-archiveresult.change-list #result_list td.field-cmd_str > div,
|
||||
body.model-archiveresult.change-list #result_list td.field-cmd_str code {
|
||||
width: 300px !important;
|
||||
min-width: 300px !important;
|
||||
max-width: 300px !important;
|
||||
box-sizing: border-box;
|
||||
}
|
||||
|
||||
body.filters-collapsed #content #changelist-filter {
|
||||
display: none !important;
|
||||
}
|
||||
@@ -637,10 +691,49 @@ body.filters-collapsed .filtered div.xfull {
|
||||
font-variant: small-caps;
|
||||
}
|
||||
|
||||
#result_list tbody td.field-status {
|
||||
#result_list tbody td.field-status,
|
||||
#result_list tbody td.field-status_badge {
|
||||
font-variant: small-caps;
|
||||
}
|
||||
|
||||
body.model-archiveresult.filters-collapsed.change-list #changelist .changelist-form-container {
|
||||
gap: 0 !important;
|
||||
}
|
||||
|
||||
body.model-archiveresult.filters-collapsed.change-list #changelist .changelist-form-container > div,
|
||||
body.model-archiveresult.filters-collapsed.change-list #changelist .results,
|
||||
body.model-archiveresult.filters-collapsed.change-list #changelist .paginator,
|
||||
body.model-archiveresult.filters-collapsed.change-list #changelist #toolbar,
|
||||
body.model-archiveresult.filters-collapsed.change-list #changelist #changelist-form,
|
||||
body.model-archiveresult.filters-collapsed.change-list #changelist #result_list {
|
||||
width: 100% !important;
|
||||
max-width: 100% !important;
|
||||
margin-right: 0 !important;
|
||||
}
|
||||
|
||||
body.model-archiveresult.change-list #result_list tbody tr {
|
||||
transition: background-color 0.15s ease, opacity 0.15s ease;
|
||||
}
|
||||
|
||||
body.model-archiveresult.change-list #result_list tbody tr:has(td.field-status_badge .status-badge.started),
|
||||
body.model-archiveresult.change-list #result_list tbody tr:has(td.field-status_badge .status-badge.backoff) {
|
||||
background: rgba(251, 191, 36, 0.14);
|
||||
}
|
||||
|
||||
body.model-archiveresult.change-list #result_list tbody tr:has(td.field-status_badge .status-badge.failed) {
|
||||
background: rgba(239, 68, 68, 0.12);
|
||||
}
|
||||
|
||||
body.model-archiveresult.change-list #result_list tbody tr:has(td.field-status_badge .status-badge.succeeded) {
|
||||
background: rgba(34, 197, 94, 0.11);
|
||||
}
|
||||
|
||||
body.model-archiveresult.change-list #result_list tbody tr:has(td.field-status_badge .status-badge.skipped),
|
||||
body.model-archiveresult.change-list #result_list tbody tr:has(td.field-status_badge .status-badge.noresults) {
|
||||
background: rgba(148, 163, 184, 0.10);
|
||||
opacity: 0.82;
|
||||
}
|
||||
|
||||
.inline-group .tabular td.original p {
|
||||
margin-top: -28px;
|
||||
}
|
||||
@@ -697,6 +790,7 @@ tbody .output-link:hover {opacity: 1;}
|
||||
.status-badge.failed { background: #fee2e2; color: #ef4444; }
|
||||
.status-badge.backoff { background: #fef3c7; color: #f59e0b; }
|
||||
.status-badge.skipped { background: #f3f4f6; color: #6b7280; }
|
||||
.status-badge.noresults { background: #f1f5f9; color: #64748b; }
|
||||
|
||||
/* Progress Bar */
|
||||
.snapshot-progress-bar {
|
||||
|
||||
195
archivebox/tests/test_add_view.py
Normal file
195
archivebox/tests/test_add_view.py
Normal file
@@ -0,0 +1,195 @@
|
||||
import re
|
||||
|
||||
import pytest
|
||||
from django.contrib.auth import get_user_model
|
||||
from django.urls import reverse
|
||||
|
||||
from archivebox.config.common import SERVER_CONFIG, SEARCH_BACKEND_CONFIG
|
||||
from archivebox.core.models import Tag
|
||||
from archivebox.crawls.models import Crawl
|
||||
|
||||
|
||||
pytestmark = pytest.mark.django_db
|
||||
|
||||
User = get_user_model()
|
||||
WEB_HOST = 'web.archivebox.localhost:8000'
|
||||
ADMIN_HOST = 'admin.archivebox.localhost:8000'
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def admin_user(db):
|
||||
return User.objects.create_superuser(
|
||||
username='addviewadmin',
|
||||
email='addviewadmin@test.com',
|
||||
password='testpassword',
|
||||
)
|
||||
|
||||
|
||||
def test_add_view_renders_tag_editor_and_url_filter_fields(client, admin_user, monkeypatch):
|
||||
monkeypatch.setattr(SERVER_CONFIG, 'PUBLIC_ADD_VIEW', True)
|
||||
|
||||
response = client.get(reverse('add'), HTTP_HOST=WEB_HOST)
|
||||
body = response.content.decode()
|
||||
|
||||
assert response.status_code == 200
|
||||
assert 'tag-editor-container' in body
|
||||
assert 'name="url_filters_allowlist"' in body
|
||||
assert 'name="url_filters_denylist"' in body
|
||||
assert 'Same domain only' in body
|
||||
assert 'name="persona"' in body
|
||||
assert 'Overwrite existing snapshots' not in body
|
||||
assert 'Update/retry previously failed URLs' not in body
|
||||
assert 'Index only dry run (add crawl but don't archive yet)' in body
|
||||
assert 'name="notes"' in body
|
||||
assert '<input type="text" name="notes"' in body
|
||||
assert body.index('name="persona"') < body.index('<h3>Crawl Plugins</h3>')
|
||||
assert 'data-url-regex=' in body
|
||||
assert 'id="url-highlight-layer"' in body
|
||||
assert 'id="detected-urls-list"' in body
|
||||
assert 'detected-url-toggle-btn' in body
|
||||
|
||||
|
||||
def test_add_view_checks_configured_search_backend_by_default(client, monkeypatch):
|
||||
monkeypatch.setattr(SERVER_CONFIG, 'PUBLIC_ADD_VIEW', True)
|
||||
monkeypatch.setattr(SEARCH_BACKEND_CONFIG, 'SEARCH_BACKEND_ENGINE', 'sqlite')
|
||||
|
||||
response = client.get(reverse('add'), HTTP_HOST=WEB_HOST)
|
||||
body = response.content.decode()
|
||||
|
||||
assert response.status_code == 200
|
||||
assert re.search(
|
||||
r'<input type="checkbox" name="search_plugins" value="search_backend_sqlite"[^>]* checked\b',
|
||||
body,
|
||||
)
|
||||
assert "const requiredSearchPlugin = 'search_backend_sqlite';" in body
|
||||
|
||||
|
||||
def test_add_view_creates_crawl_with_tag_and_url_filter_overrides(client, admin_user, monkeypatch):
|
||||
monkeypatch.setattr(SERVER_CONFIG, 'PUBLIC_ADD_VIEW', True)
|
||||
client.force_login(admin_user)
|
||||
|
||||
response = client.post(
|
||||
reverse('add'),
|
||||
data={
|
||||
'url': 'https://example.com\nhttps://cdn.example.com/asset.js',
|
||||
'tag': 'alpha,beta',
|
||||
'depth': '1',
|
||||
'url_filters_allowlist': 'example.com\n*.example.com',
|
||||
'url_filters_denylist': 'cdn.example.com',
|
||||
'notes': 'Created from /add/',
|
||||
'schedule': '',
|
||||
'persona': 'Default',
|
||||
'index_only': '',
|
||||
'config': '{}',
|
||||
},
|
||||
HTTP_HOST=WEB_HOST,
|
||||
)
|
||||
|
||||
assert response.status_code == 302
|
||||
|
||||
crawl = Crawl.objects.order_by('-created_at').first()
|
||||
assert crawl is not None
|
||||
assert crawl.tags_str == 'alpha,beta'
|
||||
assert crawl.notes == 'Created from /add/'
|
||||
assert crawl.config.get('DEFAULT_PERSONA') == 'Default'
|
||||
assert crawl.config['URL_ALLOWLIST'] == 'example.com\n*.example.com'
|
||||
assert crawl.config['URL_DENYLIST'] == 'cdn.example.com'
|
||||
assert 'OVERWRITE' not in crawl.config
|
||||
assert 'ONLY_NEW' not in crawl.config
|
||||
|
||||
|
||||
def test_add_view_extracts_urls_from_mixed_text_input(client, admin_user, monkeypatch):
|
||||
monkeypatch.setattr(SERVER_CONFIG, 'PUBLIC_ADD_VIEW', True)
|
||||
client.force_login(admin_user)
|
||||
|
||||
response = client.post(
|
||||
reverse('add'),
|
||||
data={
|
||||
'url': '\n'.join([
|
||||
'https://sweeting.me,https://google.com',
|
||||
'Notes: [ArchiveBox](https://github.com/ArchiveBox/ArchiveBox), https://news.ycombinator.com',
|
||||
'[Wiki](https://en.wikipedia.org/wiki/Classification_(machine_learning))',
|
||||
'{"items":["https://example.com/three"]}',
|
||||
'csv,https://example.com/four',
|
||||
]),
|
||||
'tag': '',
|
||||
'depth': '0',
|
||||
'url_filters_allowlist': '',
|
||||
'url_filters_denylist': '',
|
||||
'notes': '',
|
||||
'schedule': '',
|
||||
'persona': 'Default',
|
||||
'index_only': '',
|
||||
'config': '{}',
|
||||
},
|
||||
HTTP_HOST=WEB_HOST,
|
||||
)
|
||||
|
||||
assert response.status_code == 302
|
||||
|
||||
crawl = Crawl.objects.order_by('-created_at').first()
|
||||
assert crawl is not None
|
||||
assert crawl.urls == '\n'.join([
|
||||
'https://sweeting.me',
|
||||
'https://google.com',
|
||||
'https://github.com/ArchiveBox/ArchiveBox',
|
||||
'https://news.ycombinator.com',
|
||||
'https://en.wikipedia.org/wiki/Classification_(machine_learning)',
|
||||
'https://example.com/three',
|
||||
'https://example.com/four',
|
||||
])
|
||||
|
||||
|
||||
def test_add_view_exposes_api_token_for_tag_widget_autocomplete(client, admin_user, monkeypatch):
|
||||
monkeypatch.setattr(SERVER_CONFIG, 'PUBLIC_ADD_VIEW', True)
|
||||
client.force_login(admin_user)
|
||||
|
||||
response = client.get(reverse('add'), HTTP_HOST=WEB_HOST)
|
||||
|
||||
assert response.status_code == 200
|
||||
assert b'window.ARCHIVEBOX_API_KEY' in response.content
|
||||
|
||||
|
||||
def test_tags_autocomplete_requires_auth_when_public_snapshots_list_disabled(client, settings):
|
||||
settings.PUBLIC_SNAPSHOTS_LIST = False
|
||||
settings.PUBLIC_INDEX = False
|
||||
Tag.objects.create(name='archive')
|
||||
|
||||
response = client.get(
|
||||
reverse('api-1:tags_autocomplete'),
|
||||
{'q': 'a'},
|
||||
HTTP_HOST=ADMIN_HOST,
|
||||
)
|
||||
|
||||
assert response.status_code == 401
|
||||
|
||||
|
||||
def test_tags_autocomplete_allows_public_access_when_public_snapshots_list_enabled(client, settings):
|
||||
settings.PUBLIC_SNAPSHOTS_LIST = True
|
||||
settings.PUBLIC_INDEX = False
|
||||
Tag.objects.create(name='archive')
|
||||
|
||||
response = client.get(
|
||||
reverse('api-1:tags_autocomplete'),
|
||||
{'q': 'a'},
|
||||
HTTP_HOST=ADMIN_HOST,
|
||||
)
|
||||
|
||||
assert response.status_code == 200
|
||||
assert response.json()['tags'][0]['name'] == 'archive'
|
||||
|
||||
|
||||
def test_tags_autocomplete_allows_authenticated_user_when_public_snapshots_list_disabled(client, admin_user, settings):
|
||||
settings.PUBLIC_SNAPSHOTS_LIST = False
|
||||
settings.PUBLIC_INDEX = False
|
||||
Tag.objects.create(name='archive')
|
||||
client.force_login(admin_user)
|
||||
|
||||
response = client.get(
|
||||
reverse('api-1:tags_autocomplete'),
|
||||
{'q': 'a'},
|
||||
HTTP_HOST=ADMIN_HOST,
|
||||
)
|
||||
|
||||
assert response.status_code == 200
|
||||
assert response.json()['tags'][0]['name'] == 'archive'
|
||||
151
archivebox/tests/test_admin_config_widget.py
Normal file
151
archivebox/tests/test_admin_config_widget.py
Normal file
@@ -0,0 +1,151 @@
|
||||
from archivebox.base_models.admin import KeyValueWidget
|
||||
|
||||
|
||||
def test_key_value_widget_renders_enum_autocomplete_metadata(monkeypatch):
|
||||
monkeypatch.setattr(
|
||||
KeyValueWidget,
|
||||
'_get_config_options',
|
||||
lambda self: {
|
||||
'CHROME_WAIT_FOR': {
|
||||
'plugin': 'chrome',
|
||||
'type': 'string',
|
||||
'default': 'networkidle2',
|
||||
'description': 'Page load completion condition',
|
||||
'enum': ['domcontentloaded', 'load', 'networkidle0', 'networkidle2'],
|
||||
},
|
||||
},
|
||||
)
|
||||
|
||||
html = str(
|
||||
KeyValueWidget().render(
|
||||
'config',
|
||||
{'CHROME_WAIT_FOR': 'load'},
|
||||
attrs={'id': 'id_config'},
|
||||
)
|
||||
)
|
||||
|
||||
assert '"enum": ["domcontentloaded", "load", "networkidle0", "networkidle2"]' in html
|
||||
assert 'class="kv-value-options"' in html
|
||||
assert 'class="kv-help"' in html
|
||||
assert 'configureValueInput_id_config' in html
|
||||
assert 'describeMeta_id_config' in html
|
||||
assert 'validateValueAgainstMeta_id_config' in html
|
||||
|
||||
|
||||
def test_key_value_widget_renders_numeric_and_pattern_constraints(monkeypatch):
|
||||
monkeypatch.setattr(
|
||||
KeyValueWidget,
|
||||
'_get_config_options',
|
||||
lambda self: {
|
||||
'TIMEOUT': {
|
||||
'plugin': 'base',
|
||||
'type': 'integer',
|
||||
'default': 60,
|
||||
'description': 'Timeout in seconds',
|
||||
'minimum': 5,
|
||||
'maximum': 120,
|
||||
},
|
||||
'CHROME_RESOLUTION': {
|
||||
'plugin': 'chrome',
|
||||
'type': 'string',
|
||||
'default': '1440,2000',
|
||||
'description': 'Viewport resolution',
|
||||
'pattern': '^\\d+,\\d+$',
|
||||
},
|
||||
},
|
||||
)
|
||||
|
||||
html = str(KeyValueWidget().render('config', {}, attrs={'id': 'id_config'}))
|
||||
|
||||
assert '"minimum": 5' in html
|
||||
assert '"maximum": 120' in html
|
||||
assert '"pattern": "^\\\\d+,\\\\d+$"' in html
|
||||
assert 'Expected: ' in html
|
||||
assert 'Example: ' in html
|
||||
assert 'setValueValidationState_id_config' in html
|
||||
assert 'coerceValueForStorage_id_config' in html
|
||||
|
||||
|
||||
def test_key_value_widget_accepts_common_boolean_spellings(monkeypatch):
|
||||
monkeypatch.setattr(
|
||||
KeyValueWidget,
|
||||
'_get_config_options',
|
||||
lambda self: {
|
||||
'DEBUG': {
|
||||
'plugin': 'base',
|
||||
'type': 'boolean',
|
||||
'default': False,
|
||||
'description': 'Enable debug mode',
|
||||
},
|
||||
},
|
||||
)
|
||||
|
||||
html = str(KeyValueWidget().render('config', {'DEBUG': 'True'}, attrs={'id': 'id_config'}))
|
||||
|
||||
assert "enumValues = ['True', 'False']" in html
|
||||
assert "raw.toLowerCase()" in html
|
||||
assert "lowered === 'true' || raw === '1'" in html
|
||||
assert "lowered === 'false' || raw === '0'" in html
|
||||
|
||||
|
||||
def test_key_value_widget_shows_array_and_object_examples_and_binary_rules(monkeypatch):
|
||||
monkeypatch.setattr(
|
||||
KeyValueWidget,
|
||||
'_get_config_options',
|
||||
lambda self: {
|
||||
'WGET_ARGS_EXTRA': {
|
||||
'plugin': 'wget',
|
||||
'type': 'array',
|
||||
'default': [],
|
||||
'description': 'Extra arguments to append to wget command',
|
||||
},
|
||||
'SAVE_ALLOWLIST': {
|
||||
'plugin': 'base',
|
||||
'type': 'object',
|
||||
'default': {},
|
||||
'description': 'Regex allowlist mapped to enabled methods',
|
||||
},
|
||||
'WGET_BINARY': {
|
||||
'plugin': 'wget',
|
||||
'type': 'string',
|
||||
'default': 'wget',
|
||||
'description': 'Path to wget binary',
|
||||
},
|
||||
},
|
||||
)
|
||||
|
||||
html = str(KeyValueWidget().render('config', {}, attrs={'id': 'id_config'}))
|
||||
|
||||
assert 'Example: ["--extra-arg"]' in html
|
||||
assert 'Example: {"^https://example\\\\.com": ["wget"]}' in html
|
||||
assert 'Example: wget or /usr/bin/wget' in html
|
||||
assert 'validateBinaryValue_id_config' in html
|
||||
assert "meta.key.endsWith('_BINARY')" in html
|
||||
assert "Binary paths cannot contain quotes" in html
|
||||
|
||||
|
||||
def test_key_value_widget_falls_back_to_binary_validation_for_unknown_binary_keys(monkeypatch):
|
||||
monkeypatch.setattr(
|
||||
KeyValueWidget,
|
||||
'_get_config_options',
|
||||
lambda self: {
|
||||
'CHROME_BINARY': {
|
||||
'plugin': 'base',
|
||||
'type': 'string',
|
||||
'default': '',
|
||||
'description': 'Resolved Chromium/Chrome binary path shared across plugins',
|
||||
},
|
||||
},
|
||||
)
|
||||
|
||||
html = str(
|
||||
KeyValueWidget().render(
|
||||
'config',
|
||||
{'NODE_BINARY': '/opt/homebrew/bin/node'},
|
||||
attrs={'id': 'id_config'},
|
||||
)
|
||||
)
|
||||
|
||||
assert 'function getMetaForKey_id_config' in html
|
||||
assert "if (key.endsWith('_BINARY'))" in html
|
||||
assert 'Path to binary executable' in html
|
||||
127
archivebox/tests/test_admin_links.py
Normal file
127
archivebox/tests/test_admin_links.py
Normal file
@@ -0,0 +1,127 @@
|
||||
import pytest
|
||||
from django.contrib.admin.sites import AdminSite
|
||||
from uuid import uuid4
|
||||
|
||||
|
||||
pytestmark = pytest.mark.django_db
|
||||
|
||||
|
||||
def _create_snapshot():
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
from archivebox.crawls.models import Crawl
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
crawl = Crawl.objects.create(
|
||||
urls="https://example.com",
|
||||
created_by_id=get_or_create_system_user_pk(),
|
||||
)
|
||||
return Snapshot.objects.create(
|
||||
url="https://example.com",
|
||||
crawl=crawl,
|
||||
status=Snapshot.StatusChoices.STARTED,
|
||||
)
|
||||
|
||||
|
||||
def _create_machine():
|
||||
from archivebox.machine.models import Machine
|
||||
|
||||
return Machine.objects.create(
|
||||
guid=f'test-guid-{uuid4()}',
|
||||
hostname='test-host',
|
||||
hw_in_docker=False,
|
||||
hw_in_vm=False,
|
||||
hw_manufacturer='Test',
|
||||
hw_product='Test Product',
|
||||
hw_uuid=f'test-hw-{uuid4()}',
|
||||
os_arch='arm64',
|
||||
os_family='darwin',
|
||||
os_platform='macOS',
|
||||
os_release='14.0',
|
||||
os_kernel='Darwin',
|
||||
stats={},
|
||||
config={},
|
||||
)
|
||||
|
||||
|
||||
def _create_iface(machine):
|
||||
from archivebox.machine.models import NetworkInterface
|
||||
|
||||
return NetworkInterface.objects.create(
|
||||
machine=machine,
|
||||
mac_address='00:11:22:33:44:66',
|
||||
ip_public='203.0.113.11',
|
||||
ip_local='10.0.0.11',
|
||||
dns_server='1.1.1.1',
|
||||
hostname='test-host',
|
||||
iface='en0',
|
||||
isp='Test ISP',
|
||||
city='Test City',
|
||||
region='Test Region',
|
||||
country='Test Country',
|
||||
)
|
||||
|
||||
|
||||
def test_archiveresult_admin_links_plugin_and_process():
|
||||
from archivebox.core.admin_archiveresults import ArchiveResultAdmin
|
||||
from archivebox.core.models import ArchiveResult
|
||||
from archivebox.machine.models import Process
|
||||
|
||||
snapshot = _create_snapshot()
|
||||
iface = _create_iface(_create_machine())
|
||||
process = Process.objects.create(
|
||||
machine=iface.machine,
|
||||
iface=iface,
|
||||
process_type=Process.TypeChoices.HOOK,
|
||||
pwd=str(snapshot.output_dir / 'wget'),
|
||||
cmd=['/tmp/on_Snapshot__06_wget.finite.bg.py', '--url=https://example.com'],
|
||||
status=Process.StatusChoices.EXITED,
|
||||
)
|
||||
result = ArchiveResult.objects.create(
|
||||
snapshot=snapshot,
|
||||
plugin='wget',
|
||||
hook_name='on_Snapshot__06_wget.finite.bg.py',
|
||||
process=process,
|
||||
status=ArchiveResult.StatusChoices.SUCCEEDED,
|
||||
)
|
||||
|
||||
admin = ArchiveResultAdmin(ArchiveResult, AdminSite())
|
||||
|
||||
plugin_html = str(admin.plugin_with_icon(result))
|
||||
process_html = str(admin.process_link(result))
|
||||
|
||||
assert '/admin/environment/plugins/builtin.wget/' in plugin_html
|
||||
assert f'/admin/machine/process/{process.id}/change' in process_html
|
||||
|
||||
|
||||
def test_process_admin_links_binary_and_iface():
|
||||
from archivebox.machine.admin import ProcessAdmin
|
||||
from archivebox.machine.models import Binary, Process
|
||||
|
||||
machine = _create_machine()
|
||||
iface = _create_iface(machine)
|
||||
binary = Binary.objects.create(
|
||||
machine=machine,
|
||||
name='wget',
|
||||
abspath='/usr/local/bin/wget',
|
||||
version='1.21.2',
|
||||
binprovider='env',
|
||||
binproviders='env',
|
||||
status=Binary.StatusChoices.INSTALLED,
|
||||
)
|
||||
process = Process.objects.create(
|
||||
machine=machine,
|
||||
iface=iface,
|
||||
binary=binary,
|
||||
process_type=Process.TypeChoices.HOOK,
|
||||
pwd='/tmp/wget',
|
||||
cmd=['/tmp/on_Snapshot__06_wget.finite.bg.py', '--url=https://example.com'],
|
||||
status=Process.StatusChoices.EXITED,
|
||||
)
|
||||
|
||||
admin = ProcessAdmin(Process, AdminSite())
|
||||
|
||||
binary_html = str(admin.binary_link(process))
|
||||
iface_html = str(admin.iface_link(process))
|
||||
|
||||
assert f'/admin/machine/binary/{binary.id}/change' in binary_html
|
||||
assert f'/admin/machine/networkinterface/{iface.id}/change' in iface_html
|
||||
@@ -9,11 +9,13 @@ Tests cover:
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import uuid
|
||||
from typing import cast
|
||||
from django.test import override_settings
|
||||
from django.urls import reverse
|
||||
from django.contrib.auth import get_user_model
|
||||
from django.contrib.auth.models import UserManager
|
||||
from django.utils import timezone
|
||||
|
||||
pytestmark = pytest.mark.django_db
|
||||
|
||||
@@ -195,6 +197,232 @@ class TestAdminSnapshotListView:
|
||||
assert b'snapshot-view-list' in response.content
|
||||
assert b'snapshot-view-grid' in response.content
|
||||
|
||||
def test_binary_change_view_renders(self, client, admin_user, db):
|
||||
"""Binary admin change form should load without FieldError."""
|
||||
from archivebox.machine.models import Machine, Binary
|
||||
|
||||
machine = Machine.objects.create(
|
||||
guid=f'test-guid-{uuid.uuid4()}',
|
||||
hostname='test-host',
|
||||
hw_in_docker=False,
|
||||
hw_in_vm=False,
|
||||
hw_manufacturer='Test',
|
||||
hw_product='Test Product',
|
||||
hw_uuid=f'test-hw-{uuid.uuid4()}',
|
||||
os_arch='x86_64',
|
||||
os_family='darwin',
|
||||
os_platform='darwin',
|
||||
os_release='test',
|
||||
os_kernel='test-kernel',
|
||||
stats={},
|
||||
)
|
||||
binary = Binary.objects.create(
|
||||
machine=machine,
|
||||
name='gallery-dl',
|
||||
binproviders='env',
|
||||
binprovider='env',
|
||||
abspath='/opt/homebrew/bin/gallery-dl',
|
||||
version='1.26.9',
|
||||
sha256='abc123',
|
||||
status=Binary.StatusChoices.INSTALLED,
|
||||
)
|
||||
|
||||
client.login(username='testadmin', password='testpassword')
|
||||
url = f'/admin/machine/binary/{binary.pk}/change/'
|
||||
response = client.get(url, HTTP_HOST=ADMIN_HOST)
|
||||
|
||||
assert response.status_code == 200
|
||||
assert b'gallery-dl' in response.content
|
||||
|
||||
def test_change_view_renders_real_redo_failed_action(self, client, admin_user, snapshot):
|
||||
client.login(username='testadmin', password='testpassword')
|
||||
url = reverse('admin:core_snapshot_change', args=[snapshot.pk])
|
||||
response = client.get(url, HTTP_HOST=ADMIN_HOST)
|
||||
|
||||
assert response.status_code == 200
|
||||
assert f'/admin/core/snapshot/{snapshot.pk}/redo-failed/'.encode() in response.content
|
||||
|
||||
def test_redo_failed_action_requeues_snapshot(self, client, admin_user, snapshot, monkeypatch):
|
||||
import archivebox.core.admin_snapshots as admin_snapshots
|
||||
|
||||
queued = []
|
||||
|
||||
def fake_bg_archive_snapshot(obj, overwrite=False, methods=None):
|
||||
queued.append((str(obj.pk), overwrite, methods))
|
||||
return 1
|
||||
|
||||
monkeypatch.setattr(admin_snapshots, 'bg_archive_snapshot', fake_bg_archive_snapshot)
|
||||
|
||||
client.login(username='testadmin', password='testpassword')
|
||||
url = reverse('admin:core_snapshot_redo_failed', args=[snapshot.pk])
|
||||
response = client.post(url, HTTP_HOST=ADMIN_HOST)
|
||||
|
||||
assert response.status_code == 302
|
||||
assert queued == [(str(snapshot.pk), False, None)]
|
||||
assert response['Location'].endswith(f'/admin/core/snapshot/{snapshot.pk}/change/')
|
||||
|
||||
|
||||
class TestArchiveResultAdminListView:
|
||||
def test_list_view_renders_readonly_tags_and_noresults_status(self, client, admin_user, snapshot):
|
||||
from archivebox.core.models import ArchiveResult, Tag
|
||||
|
||||
tag = Tag.objects.create(name='Alpha Research')
|
||||
snapshot.tags.add(tag)
|
||||
ArchiveResult.objects.create(
|
||||
snapshot=snapshot,
|
||||
plugin='title',
|
||||
status=ArchiveResult.StatusChoices.NORESULTS,
|
||||
output_str='No title found',
|
||||
)
|
||||
|
||||
client.login(username='testadmin', password='testpassword')
|
||||
response = client.get(reverse('admin:core_archiveresult_changelist'), HTTP_HOST=ADMIN_HOST)
|
||||
|
||||
assert response.status_code == 200
|
||||
assert b'Alpha Research' in response.content
|
||||
assert b'tag-editor-inline readonly' in response.content
|
||||
assert b'No Results' in response.content
|
||||
|
||||
def test_archiveresult_model_has_no_retry_at_field(self):
|
||||
from archivebox.core.models import ArchiveResult
|
||||
|
||||
assert 'retry_at' not in {field.name for field in ArchiveResult._meta.fields}
|
||||
|
||||
|
||||
class TestLiveProgressView:
|
||||
def test_live_progress_routes_crawl_process_rows_to_crawl_setup(self, client, admin_user, snapshot, db):
|
||||
import archivebox.machine.models as machine_models
|
||||
from archivebox.machine.models import Machine, Process
|
||||
|
||||
machine_models._CURRENT_MACHINE = None
|
||||
machine = Machine.current()
|
||||
Process.objects.create(
|
||||
machine=machine,
|
||||
process_type=Process.TypeChoices.HOOK,
|
||||
status=Process.StatusChoices.RUNNING,
|
||||
pid=43210,
|
||||
cmd=['/plugins/chrome/on_Crawl__91_chrome_wait.js', '--url=https://example.com'],
|
||||
env={
|
||||
'CRAWL_ID': str(snapshot.crawl_id),
|
||||
'SNAPSHOT_ID': str(snapshot.id),
|
||||
},
|
||||
started_at=timezone.now(),
|
||||
)
|
||||
|
||||
client.login(username='testadmin', password='testpassword')
|
||||
response = client.get(reverse('live_progress'), HTTP_HOST=ADMIN_HOST)
|
||||
|
||||
assert response.status_code == 200
|
||||
payload = response.json()
|
||||
active_crawl = next(crawl for crawl in payload['active_crawls'] if crawl['id'] == str(snapshot.crawl_id))
|
||||
setup_entry = next(item for item in active_crawl['setup_plugins'] if item['source'] == 'process')
|
||||
active_snapshot = next(item for item in active_crawl['active_snapshots'] if item['id'] == str(snapshot.id))
|
||||
assert setup_entry['label'] == 'chrome wait'
|
||||
assert setup_entry['status'] == 'started'
|
||||
assert active_crawl['worker_pid'] == 43210
|
||||
assert active_snapshot['all_plugins'] == []
|
||||
|
||||
def test_live_progress_uses_snapshot_process_rows_before_archiveresults(self, client, admin_user, snapshot, db):
|
||||
import archivebox.machine.models as machine_models
|
||||
from archivebox.machine.models import Machine, Process
|
||||
|
||||
machine_models._CURRENT_MACHINE = None
|
||||
machine = Machine.current()
|
||||
Process.objects.create(
|
||||
machine=machine,
|
||||
process_type=Process.TypeChoices.HOOK,
|
||||
status=Process.StatusChoices.RUNNING,
|
||||
pid=43211,
|
||||
cmd=['/plugins/title/on_Snapshot__10_title.py', '--url=https://example.com'],
|
||||
env={
|
||||
'CRAWL_ID': str(snapshot.crawl_id),
|
||||
'SNAPSHOT_ID': str(snapshot.id),
|
||||
},
|
||||
started_at=timezone.now(),
|
||||
)
|
||||
|
||||
client.login(username='testadmin', password='testpassword')
|
||||
response = client.get(reverse('live_progress'), HTTP_HOST=ADMIN_HOST)
|
||||
|
||||
assert response.status_code == 200
|
||||
payload = response.json()
|
||||
active_crawl = next(crawl for crawl in payload['active_crawls'] if crawl['id'] == str(snapshot.crawl_id))
|
||||
active_snapshot = next(item for item in active_crawl['active_snapshots'] if item['id'] == str(snapshot.id))
|
||||
assert active_snapshot['all_plugins'][0]['source'] == 'process'
|
||||
assert active_snapshot['all_plugins'][0]['label'] == 'title'
|
||||
assert active_snapshot['all_plugins'][0]['status'] == 'started'
|
||||
assert active_snapshot['worker_pid'] == 43211
|
||||
|
||||
def test_live_progress_merges_process_rows_with_archiveresults_when_present(self, client, admin_user, snapshot, db):
|
||||
import archivebox.machine.models as machine_models
|
||||
from archivebox.core.models import ArchiveResult
|
||||
from archivebox.machine.models import Machine, Process
|
||||
|
||||
machine_models._CURRENT_MACHINE = None
|
||||
machine = Machine.current()
|
||||
Process.objects.create(
|
||||
machine=machine,
|
||||
process_type=Process.TypeChoices.HOOK,
|
||||
status=Process.StatusChoices.RUNNING,
|
||||
pid=54321,
|
||||
cmd=['/plugins/chrome/on_Snapshot__11_chrome_wait.js', '--url=https://example.com'],
|
||||
env={
|
||||
'CRAWL_ID': str(snapshot.crawl_id),
|
||||
'SNAPSHOT_ID': str(snapshot.id),
|
||||
},
|
||||
started_at=timezone.now(),
|
||||
)
|
||||
ArchiveResult.objects.create(
|
||||
snapshot=snapshot,
|
||||
plugin='title',
|
||||
status=ArchiveResult.StatusChoices.STARTED,
|
||||
)
|
||||
|
||||
client.login(username='testadmin', password='testpassword')
|
||||
response = client.get(reverse('live_progress'), HTTP_HOST=ADMIN_HOST)
|
||||
|
||||
assert response.status_code == 200
|
||||
payload = response.json()
|
||||
active_crawl = next(crawl for crawl in payload['active_crawls'] if crawl['id'] == str(snapshot.crawl_id))
|
||||
active_snapshot = next(item for item in active_crawl['active_snapshots'] if item['id'] == str(snapshot.id))
|
||||
sources = {item['source'] for item in active_snapshot['all_plugins']}
|
||||
plugins = {item['plugin'] for item in active_snapshot['all_plugins']}
|
||||
assert sources == {'archiveresult', 'process'}
|
||||
assert 'title' in plugins
|
||||
assert 'chrome' in plugins
|
||||
|
||||
def test_live_progress_omits_pid_for_exited_process_rows(self, client, admin_user, snapshot, db):
|
||||
import archivebox.machine.models as machine_models
|
||||
from archivebox.machine.models import Machine, Process
|
||||
|
||||
machine_models._CURRENT_MACHINE = None
|
||||
machine = Machine.current()
|
||||
Process.objects.create(
|
||||
machine=machine,
|
||||
process_type=Process.TypeChoices.HOOK,
|
||||
status=Process.StatusChoices.EXITED,
|
||||
exit_code=0,
|
||||
pid=99999,
|
||||
cmd=['/plugins/title/on_Snapshot__10_title.py', '--url=https://example.com'],
|
||||
env={
|
||||
'CRAWL_ID': str(snapshot.crawl_id),
|
||||
'SNAPSHOT_ID': str(snapshot.id),
|
||||
},
|
||||
started_at=timezone.now(),
|
||||
ended_at=timezone.now(),
|
||||
)
|
||||
|
||||
client.login(username='testadmin', password='testpassword')
|
||||
response = client.get(reverse('live_progress'), HTTP_HOST=ADMIN_HOST)
|
||||
|
||||
assert response.status_code == 200
|
||||
payload = response.json()
|
||||
active_crawl = next(crawl for crawl in payload['active_crawls'] if crawl['id'] == str(snapshot.crawl_id))
|
||||
active_snapshot = next(item for item in active_crawl['active_snapshots'] if item['id'] == str(snapshot.id))
|
||||
process_entry = next(item for item in active_snapshot['all_plugins'] if item['source'] == 'process')
|
||||
assert process_entry['status'] == 'succeeded'
|
||||
assert 'pid' not in process_entry
|
||||
|
||||
|
||||
class TestAdminSnapshotSearch:
|
||||
"""Tests for admin snapshot search functionality."""
|
||||
|
||||
305
archivebox/tests/test_archive_result_service.py
Normal file
305
archivebox/tests/test_archive_result_service.py
Normal file
@@ -0,0 +1,305 @@
|
||||
from pathlib import Path
|
||||
from uuid import uuid4
|
||||
|
||||
import pytest
|
||||
from django.db import connection
|
||||
|
||||
from abx_dl.events import ProcessCompletedEvent, ProcessStartedEvent
|
||||
from abx_dl.orchestrator import create_bus
|
||||
|
||||
|
||||
pytestmark = pytest.mark.django_db
|
||||
|
||||
|
||||
def _cleanup_machine_process_rows() -> None:
|
||||
with connection.cursor() as cursor:
|
||||
cursor.execute("DELETE FROM machine_process")
|
||||
|
||||
|
||||
def _create_snapshot():
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
from archivebox.crawls.models import Crawl
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
crawl = Crawl.objects.create(
|
||||
urls="https://example.com",
|
||||
created_by_id=get_or_create_system_user_pk(),
|
||||
)
|
||||
return Snapshot.objects.create(
|
||||
url="https://example.com",
|
||||
crawl=crawl,
|
||||
status=Snapshot.StatusChoices.STARTED,
|
||||
)
|
||||
|
||||
|
||||
def _create_machine():
|
||||
from archivebox.machine.models import Machine
|
||||
|
||||
return Machine.objects.create(
|
||||
guid=f'test-guid-{uuid4()}',
|
||||
hostname='test-host',
|
||||
hw_in_docker=False,
|
||||
hw_in_vm=False,
|
||||
hw_manufacturer='Test',
|
||||
hw_product='Test Product',
|
||||
hw_uuid=f'test-hw-{uuid4()}',
|
||||
os_arch='arm64',
|
||||
os_family='darwin',
|
||||
os_platform='macOS',
|
||||
os_release='14.0',
|
||||
os_kernel='Darwin',
|
||||
stats={},
|
||||
config={},
|
||||
)
|
||||
|
||||
|
||||
def _create_iface(machine):
|
||||
from archivebox.machine.models import NetworkInterface
|
||||
|
||||
return NetworkInterface.objects.create(
|
||||
machine=machine,
|
||||
mac_address='00:11:22:33:44:55',
|
||||
ip_public='203.0.113.10',
|
||||
ip_local='10.0.0.10',
|
||||
dns_server='1.1.1.1',
|
||||
hostname='test-host',
|
||||
iface='en0',
|
||||
isp='Test ISP',
|
||||
city='Test City',
|
||||
region='Test Region',
|
||||
country='Test Country',
|
||||
)
|
||||
|
||||
|
||||
def test_process_completed_projects_inline_archiveresult():
|
||||
from archivebox.core.models import ArchiveResult
|
||||
from archivebox.services.archive_result_service import ArchiveResultService, _collect_output_metadata
|
||||
from archivebox.services.process_service import ProcessService
|
||||
|
||||
snapshot = _create_snapshot()
|
||||
plugin_dir = Path(snapshot.output_dir) / "wget"
|
||||
plugin_dir.mkdir(parents=True, exist_ok=True)
|
||||
(plugin_dir / "index.html").write_text("<html>ok</html>")
|
||||
|
||||
bus = create_bus(name="test_inline_archiveresult")
|
||||
process_service = ProcessService(bus)
|
||||
service = ArchiveResultService(bus, process_service=process_service)
|
||||
|
||||
event = ProcessCompletedEvent(
|
||||
plugin_name="wget",
|
||||
hook_name="on_Snapshot__06_wget.finite.bg",
|
||||
stdout='{"snapshot_id":"%s","type":"ArchiveResult","status":"succeeded","output_str":"wget/index.html"}\n' % snapshot.id,
|
||||
stderr="",
|
||||
exit_code=0,
|
||||
output_dir=str(plugin_dir),
|
||||
output_files=["index.html"],
|
||||
process_id="proc-inline",
|
||||
snapshot_id=str(snapshot.id),
|
||||
start_ts="2026-03-22T12:00:00+00:00",
|
||||
end_ts="2026-03-22T12:00:01+00:00",
|
||||
)
|
||||
|
||||
output_files, output_size, output_mimetypes = _collect_output_metadata(plugin_dir)
|
||||
service._project_from_process_completed(
|
||||
event,
|
||||
{
|
||||
"snapshot_id": str(snapshot.id),
|
||||
"plugin": "wget",
|
||||
"hook_name": "on_Snapshot__06_wget.finite.bg",
|
||||
"status": "succeeded",
|
||||
"output_str": "wget/index.html",
|
||||
},
|
||||
output_files,
|
||||
output_size,
|
||||
output_mimetypes,
|
||||
)
|
||||
|
||||
result = ArchiveResult.objects.get(snapshot=snapshot, plugin="wget", hook_name="on_Snapshot__06_wget.finite.bg")
|
||||
assert result.status == ArchiveResult.StatusChoices.SUCCEEDED
|
||||
assert result.output_str == "wget/index.html"
|
||||
assert "index.html" in result.output_files
|
||||
_cleanup_machine_process_rows()
|
||||
|
||||
|
||||
def test_process_completed_projects_synthetic_failed_archiveresult():
|
||||
from archivebox.core.models import ArchiveResult
|
||||
from archivebox.services.archive_result_service import ArchiveResultService, _collect_output_metadata
|
||||
from archivebox.services.process_service import ProcessService
|
||||
|
||||
snapshot = _create_snapshot()
|
||||
plugin_dir = Path(snapshot.output_dir) / "chrome"
|
||||
plugin_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
bus = create_bus(name="test_synthetic_archiveresult")
|
||||
process_service = ProcessService(bus)
|
||||
service = ArchiveResultService(bus, process_service=process_service)
|
||||
|
||||
event = ProcessCompletedEvent(
|
||||
plugin_name="chrome",
|
||||
hook_name="on_Snapshot__11_chrome_wait",
|
||||
stdout="",
|
||||
stderr="Hook timed out after 60 seconds",
|
||||
exit_code=-1,
|
||||
output_dir=str(plugin_dir),
|
||||
output_files=[],
|
||||
process_id="proc-failed",
|
||||
snapshot_id=str(snapshot.id),
|
||||
start_ts="2026-03-22T12:00:00+00:00",
|
||||
end_ts="2026-03-22T12:01:00+00:00",
|
||||
)
|
||||
|
||||
output_files, output_size, output_mimetypes = _collect_output_metadata(plugin_dir)
|
||||
service._project_from_process_completed(
|
||||
event,
|
||||
{
|
||||
"plugin": "chrome",
|
||||
"hook_name": "on_Snapshot__11_chrome_wait",
|
||||
"status": "failed",
|
||||
"output_str": "Hook timed out after 60 seconds",
|
||||
"error": "Hook timed out after 60 seconds",
|
||||
},
|
||||
output_files,
|
||||
output_size,
|
||||
output_mimetypes,
|
||||
)
|
||||
|
||||
result = ArchiveResult.objects.get(snapshot=snapshot, plugin="chrome", hook_name="on_Snapshot__11_chrome_wait")
|
||||
assert result.status == ArchiveResult.StatusChoices.FAILED
|
||||
assert result.output_str == "Hook timed out after 60 seconds"
|
||||
assert "Hook timed out" in result.notes
|
||||
_cleanup_machine_process_rows()
|
||||
|
||||
|
||||
def test_process_completed_projects_noresults_archiveresult():
|
||||
from archivebox.core.models import ArchiveResult
|
||||
from archivebox.services.archive_result_service import ArchiveResultService, _collect_output_metadata
|
||||
from archivebox.services.process_service import ProcessService
|
||||
|
||||
snapshot = _create_snapshot()
|
||||
plugin_dir = Path(snapshot.output_dir) / "title"
|
||||
plugin_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
bus = create_bus(name="test_noresults_archiveresult")
|
||||
process_service = ProcessService(bus)
|
||||
service = ArchiveResultService(bus, process_service=process_service)
|
||||
|
||||
event = ProcessCompletedEvent(
|
||||
plugin_name="title",
|
||||
hook_name="on_Snapshot__54_title.js",
|
||||
stdout='{"snapshot_id":"%s","type":"ArchiveResult","status":"noresults","output_str":"No title found"}\n' % snapshot.id,
|
||||
stderr="",
|
||||
exit_code=0,
|
||||
output_dir=str(plugin_dir),
|
||||
output_files=[],
|
||||
process_id="proc-noresults",
|
||||
snapshot_id=str(snapshot.id),
|
||||
start_ts="2026-03-22T12:00:00+00:00",
|
||||
end_ts="2026-03-22T12:00:01+00:00",
|
||||
)
|
||||
|
||||
output_files, output_size, output_mimetypes = _collect_output_metadata(plugin_dir)
|
||||
service._project_from_process_completed(
|
||||
event,
|
||||
{
|
||||
"snapshot_id": str(snapshot.id),
|
||||
"plugin": "title",
|
||||
"hook_name": "on_Snapshot__54_title.js",
|
||||
"status": "noresults",
|
||||
"output_str": "No title found",
|
||||
},
|
||||
output_files,
|
||||
output_size,
|
||||
output_mimetypes,
|
||||
)
|
||||
|
||||
result = ArchiveResult.objects.get(snapshot=snapshot, plugin="title", hook_name="on_Snapshot__54_title.js")
|
||||
assert result.status == ArchiveResult.StatusChoices.NORESULTS
|
||||
assert result.output_str == "No title found"
|
||||
_cleanup_machine_process_rows()
|
||||
|
||||
|
||||
def test_process_started_hydrates_binary_and_iface_from_existing_binary_records(monkeypatch):
|
||||
from archivebox.machine.models import Binary, NetworkInterface
|
||||
from archivebox.services.process_service import ProcessService
|
||||
|
||||
machine = _create_machine()
|
||||
iface = _create_iface(machine)
|
||||
monkeypatch.setattr(NetworkInterface, 'current', classmethod(lambda cls, refresh=False: iface))
|
||||
|
||||
binary = Binary.objects.create(
|
||||
machine=machine,
|
||||
name='postlight-parser',
|
||||
abspath='/tmp/postlight-parser',
|
||||
version='2.2.3',
|
||||
binprovider='npm',
|
||||
binproviders='npm',
|
||||
status=Binary.StatusChoices.INSTALLED,
|
||||
)
|
||||
|
||||
bus = create_bus(name="test_process_started_binary_hydration")
|
||||
service = ProcessService(bus)
|
||||
event = ProcessStartedEvent(
|
||||
plugin_name="mercury",
|
||||
hook_name="on_Snapshot__57_mercury.py",
|
||||
hook_path="/plugins/mercury/on_Snapshot__57_mercury.py",
|
||||
hook_args=["--url=https://example.com"],
|
||||
output_dir="/tmp/mercury",
|
||||
env={
|
||||
"MERCURY_BINARY": binary.abspath,
|
||||
"NODE_BINARY": "/tmp/node",
|
||||
},
|
||||
timeout=60,
|
||||
pid=4321,
|
||||
process_id="proc-mercury",
|
||||
snapshot_id="",
|
||||
start_ts="2026-03-22T12:00:00+00:00",
|
||||
)
|
||||
|
||||
service._project_started(event)
|
||||
|
||||
process = service._get_or_create_process(event)
|
||||
assert process.binary_id == binary.id
|
||||
assert process.iface_id == iface.id
|
||||
|
||||
|
||||
def test_process_started_uses_node_binary_for_js_hooks_without_plugin_binary(monkeypatch):
|
||||
from archivebox.machine.models import Binary, NetworkInterface
|
||||
from archivebox.services.process_service import ProcessService
|
||||
|
||||
machine = _create_machine()
|
||||
iface = _create_iface(machine)
|
||||
monkeypatch.setattr(NetworkInterface, 'current', classmethod(lambda cls, refresh=False: iface))
|
||||
|
||||
node = Binary.objects.create(
|
||||
machine=machine,
|
||||
name='node',
|
||||
abspath='/tmp/node',
|
||||
version='22.0.0',
|
||||
binprovider='env',
|
||||
binproviders='env',
|
||||
status=Binary.StatusChoices.INSTALLED,
|
||||
)
|
||||
|
||||
bus = create_bus(name="test_process_started_node_fallback")
|
||||
service = ProcessService(bus)
|
||||
event = ProcessStartedEvent(
|
||||
plugin_name="parse_dom_outlinks",
|
||||
hook_name="on_Snapshot__75_parse_dom_outlinks.js",
|
||||
hook_path="/plugins/parse_dom_outlinks/on_Snapshot__75_parse_dom_outlinks.js",
|
||||
hook_args=["--url=https://example.com"],
|
||||
output_dir="/tmp/parse-dom-outlinks",
|
||||
env={
|
||||
"NODE_BINARY": node.abspath,
|
||||
},
|
||||
timeout=60,
|
||||
pid=9876,
|
||||
process_id="proc-parse-dom-outlinks",
|
||||
snapshot_id="",
|
||||
start_ts="2026-03-22T12:00:00+00:00",
|
||||
)
|
||||
|
||||
service._project_started(event)
|
||||
|
||||
process = service._get_or_create_process(event)
|
||||
assert process.binary_id == node.id
|
||||
assert process.iface_id == iface.id
|
||||
@@ -44,6 +44,27 @@ def test_add_single_url_creates_snapshot_in_db(tmp_path, process, disable_extrac
|
||||
assert snapshots[0][0] == 'https://example.com'
|
||||
|
||||
|
||||
def test_add_bg_creates_root_snapshot_rows_immediately(tmp_path, process, disable_extractors_dict):
|
||||
"""Background add should create root snapshots immediately so the queue is visible in the DB."""
|
||||
os.chdir(tmp_path)
|
||||
result = subprocess.run(
|
||||
['archivebox', 'add', '--bg', '--depth=0', 'https://example.com'],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
|
||||
assert result.returncode == 0
|
||||
|
||||
conn = sqlite3.connect("index.sqlite3")
|
||||
c = conn.cursor()
|
||||
snapshots = c.execute("SELECT url, status FROM core_snapshot").fetchall()
|
||||
conn.close()
|
||||
|
||||
assert len(snapshots) == 1
|
||||
assert snapshots[0][0] == 'https://example.com'
|
||||
assert snapshots[0][1] == 'queued'
|
||||
|
||||
|
||||
def test_add_creates_crawl_record(tmp_path, process, disable_extractors_dict):
|
||||
"""Test that add command creates a Crawl record in the database."""
|
||||
os.chdir(tmp_path)
|
||||
@@ -217,6 +238,32 @@ def test_add_records_selected_persona_on_crawl(tmp_path, process, disable_extrac
|
||||
assert persona_id
|
||||
assert default_persona == 'Default'
|
||||
assert (tmp_path / "personas" / "Default" / "chrome_user_data").is_dir()
|
||||
|
||||
|
||||
def test_add_records_url_filter_overrides_on_crawl(tmp_path, process, disable_extractors_dict):
|
||||
os.chdir(tmp_path)
|
||||
result = subprocess.run(
|
||||
[
|
||||
'archivebox', 'add', '--index-only', '--depth=0',
|
||||
'--domain-allowlist=example.com,*.example.com',
|
||||
'--domain-denylist=static.example.com',
|
||||
'https://example.com',
|
||||
],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
|
||||
assert result.returncode == 0
|
||||
|
||||
conn = sqlite3.connect("index.sqlite3")
|
||||
c = conn.cursor()
|
||||
allowlist, denylist = c.execute(
|
||||
"SELECT json_extract(config, '$.URL_ALLOWLIST'), json_extract(config, '$.URL_DENYLIST') FROM crawls_crawl LIMIT 1"
|
||||
).fetchone()
|
||||
conn.close()
|
||||
|
||||
assert allowlist == 'example.com,*.example.com'
|
||||
assert denylist == 'static.example.com'
|
||||
assert (tmp_path / "personas" / "Default" / "chrome_extensions").is_dir()
|
||||
|
||||
|
||||
|
||||
@@ -16,6 +16,13 @@ from archivebox.tests.conftest import (
|
||||
create_test_url,
|
||||
)
|
||||
|
||||
PROJECTOR_TEST_ENV = {
|
||||
'PLUGINS': 'favicon',
|
||||
'SAVE_FAVICON': 'True',
|
||||
'USE_COLOR': 'False',
|
||||
'SHOW_PROGRESS': 'False',
|
||||
}
|
||||
|
||||
|
||||
class TestArchiveResultCreate:
|
||||
"""Tests for `archivebox archiveresult create`."""
|
||||
@@ -38,13 +45,14 @@ class TestArchiveResultCreate:
|
||||
assert code == 0, f"Command failed: {stderr}"
|
||||
|
||||
records = parse_jsonl_output(stdout2)
|
||||
# Should have the Snapshot passed through and ArchiveResult created
|
||||
# Should have the Snapshot passed through and an ArchiveResult request emitted
|
||||
types = [r.get('type') for r in records]
|
||||
assert 'Snapshot' in types
|
||||
assert 'ArchiveResult' in types
|
||||
|
||||
ar = next(r for r in records if r['type'] == 'ArchiveResult')
|
||||
assert ar['plugin'] == 'title'
|
||||
assert 'id' not in ar
|
||||
|
||||
def test_create_with_specific_plugin(self, initialized_archive):
|
||||
"""Create archive result for specific plugin."""
|
||||
@@ -122,15 +130,33 @@ class TestArchiveResultList:
|
||||
|
||||
def test_list_filter_by_status(self, initialized_archive):
|
||||
"""Filter archive results by status."""
|
||||
# Create snapshot and archive result
|
||||
# Create snapshot and materialize an archive result via the runner
|
||||
url = create_test_url()
|
||||
stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], data_dir=initialized_archive)
|
||||
snapshot = parse_jsonl_output(stdout1)[0]
|
||||
run_archivebox_cmd(
|
||||
['archiveresult', 'create', '--plugin=title'],
|
||||
stdout2, _, _ = run_archivebox_cmd(
|
||||
['archiveresult', 'create', '--plugin=favicon'],
|
||||
stdin=json.dumps(snapshot),
|
||||
data_dir=initialized_archive,
|
||||
)
|
||||
run_archivebox_cmd(
|
||||
['run'],
|
||||
stdin=stdout2,
|
||||
data_dir=initialized_archive,
|
||||
timeout=120,
|
||||
env=PROJECTOR_TEST_ENV,
|
||||
)
|
||||
created = parse_jsonl_output(
|
||||
run_archivebox_cmd(
|
||||
['archiveresult', 'list', '--plugin=favicon'],
|
||||
data_dir=initialized_archive,
|
||||
)[0]
|
||||
)[0]
|
||||
run_archivebox_cmd(
|
||||
['archiveresult', 'update', '--status=queued'],
|
||||
stdin=json.dumps(created),
|
||||
data_dir=initialized_archive,
|
||||
)
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['archiveresult', 'list', '--status=queued'],
|
||||
@@ -147,21 +173,28 @@ class TestArchiveResultList:
|
||||
url = create_test_url()
|
||||
stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], data_dir=initialized_archive)
|
||||
snapshot = parse_jsonl_output(stdout1)[0]
|
||||
run_archivebox_cmd(
|
||||
['archiveresult', 'create', '--plugin=title'],
|
||||
stdout2, _, _ = run_archivebox_cmd(
|
||||
['archiveresult', 'create', '--plugin=favicon'],
|
||||
stdin=json.dumps(snapshot),
|
||||
data_dir=initialized_archive,
|
||||
)
|
||||
run_archivebox_cmd(
|
||||
['run'],
|
||||
stdin=stdout2,
|
||||
data_dir=initialized_archive,
|
||||
timeout=120,
|
||||
env=PROJECTOR_TEST_ENV,
|
||||
)
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['archiveresult', 'list', '--plugin=title'],
|
||||
['archiveresult', 'list', '--plugin=favicon'],
|
||||
data_dir=initialized_archive,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
records = parse_jsonl_output(stdout)
|
||||
for r in records:
|
||||
assert r['plugin'] == 'title'
|
||||
assert r['plugin'] == 'favicon'
|
||||
|
||||
def test_list_with_limit(self, initialized_archive):
|
||||
"""Limit number of results."""
|
||||
@@ -170,11 +203,18 @@ class TestArchiveResultList:
|
||||
url = create_test_url()
|
||||
stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], data_dir=initialized_archive)
|
||||
snapshot = parse_jsonl_output(stdout1)[0]
|
||||
run_archivebox_cmd(
|
||||
['archiveresult', 'create', '--plugin=title'],
|
||||
stdout2, _, _ = run_archivebox_cmd(
|
||||
['archiveresult', 'create', '--plugin=favicon'],
|
||||
stdin=json.dumps(snapshot),
|
||||
data_dir=initialized_archive,
|
||||
)
|
||||
run_archivebox_cmd(
|
||||
['run'],
|
||||
stdin=stdout2,
|
||||
data_dir=initialized_archive,
|
||||
timeout=120,
|
||||
env=PROJECTOR_TEST_ENV,
|
||||
)
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['archiveresult', 'list', '--limit=2'],
|
||||
@@ -196,11 +236,22 @@ class TestArchiveResultUpdate:
|
||||
snapshot = parse_jsonl_output(stdout1)[0]
|
||||
|
||||
stdout2, _, _ = run_archivebox_cmd(
|
||||
['archiveresult', 'create', '--plugin=title'],
|
||||
['archiveresult', 'create', '--plugin=favicon'],
|
||||
stdin=json.dumps(snapshot),
|
||||
data_dir=initialized_archive,
|
||||
)
|
||||
ar = next(r for r in parse_jsonl_output(stdout2) if r.get('type') == 'ArchiveResult')
|
||||
stdout_run, _, _ = run_archivebox_cmd(
|
||||
['run'],
|
||||
stdin=stdout2,
|
||||
data_dir=initialized_archive,
|
||||
timeout=120,
|
||||
env=PROJECTOR_TEST_ENV,
|
||||
)
|
||||
stdout_list, _, _ = run_archivebox_cmd(
|
||||
['archiveresult', 'list', '--plugin=favicon'],
|
||||
data_dir=initialized_archive,
|
||||
)
|
||||
ar = parse_jsonl_output(stdout_list)[0]
|
||||
|
||||
stdout3, stderr, code = run_archivebox_cmd(
|
||||
['archiveresult', 'update', '--status=failed'],
|
||||
@@ -225,11 +276,22 @@ class TestArchiveResultDelete:
|
||||
snapshot = parse_jsonl_output(stdout1)[0]
|
||||
|
||||
stdout2, _, _ = run_archivebox_cmd(
|
||||
['archiveresult', 'create', '--plugin=title'],
|
||||
['archiveresult', 'create', '--plugin=favicon'],
|
||||
stdin=json.dumps(snapshot),
|
||||
data_dir=initialized_archive,
|
||||
)
|
||||
ar = next(r for r in parse_jsonl_output(stdout2) if r.get('type') == 'ArchiveResult')
|
||||
stdout_run, _, _ = run_archivebox_cmd(
|
||||
['run'],
|
||||
stdin=stdout2,
|
||||
data_dir=initialized_archive,
|
||||
timeout=120,
|
||||
env=PROJECTOR_TEST_ENV,
|
||||
)
|
||||
stdout_list, _, _ = run_archivebox_cmd(
|
||||
['archiveresult', 'list', '--plugin=favicon'],
|
||||
data_dir=initialized_archive,
|
||||
)
|
||||
ar = parse_jsonl_output(stdout_list)[0]
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['archiveresult', 'delete'],
|
||||
@@ -247,11 +309,22 @@ class TestArchiveResultDelete:
|
||||
snapshot = parse_jsonl_output(stdout1)[0]
|
||||
|
||||
stdout2, _, _ = run_archivebox_cmd(
|
||||
['archiveresult', 'create', '--plugin=title'],
|
||||
['archiveresult', 'create', '--plugin=favicon'],
|
||||
stdin=json.dumps(snapshot),
|
||||
data_dir=initialized_archive,
|
||||
)
|
||||
ar = next(r for r in parse_jsonl_output(stdout2) if r.get('type') == 'ArchiveResult')
|
||||
stdout_run, _, _ = run_archivebox_cmd(
|
||||
['run'],
|
||||
stdin=stdout2,
|
||||
data_dir=initialized_archive,
|
||||
timeout=120,
|
||||
env=PROJECTOR_TEST_ENV,
|
||||
)
|
||||
stdout_list, _, _ = run_archivebox_cmd(
|
||||
['archiveresult', 'list', '--plugin=favicon'],
|
||||
data_dir=initialized_archive,
|
||||
)
|
||||
ar = parse_jsonl_output(stdout_list)[0]
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['archiveresult', 'delete', '--yes'],
|
||||
|
||||
@@ -83,7 +83,7 @@ class TestCrawlCreate:
|
||||
|
||||
assert code == 0
|
||||
records = parse_jsonl_output(stdout)
|
||||
assert 'test-tag' in records[0].get('tags_str', '')
|
||||
assert 'test-tag' in records[0].get('tags', '')
|
||||
|
||||
def test_create_pass_through_other_types(self, initialized_archive):
|
||||
"""Pass-through records of other types unchanged."""
|
||||
|
||||
@@ -173,6 +173,20 @@ def test_collect_urls_from_plugins_reads_only_parser_outputs(tmp_path):
|
||||
assert collect_urls_from_plugins(tmp_path / "nonexistent") == []
|
||||
|
||||
|
||||
def test_collect_urls_from_plugins_trims_markdown_suffixes(tmp_path):
|
||||
from archivebox.hooks import collect_urls_from_plugins
|
||||
|
||||
(tmp_path / "parse_html_urls").mkdir()
|
||||
(tmp_path / "parse_html_urls" / "urls.jsonl").write_text(
|
||||
'{"url":"https://docs.sweeting.me/s/youtube-favorites)**"}\n',
|
||||
encoding="utf-8",
|
||||
)
|
||||
|
||||
urls = collect_urls_from_plugins(tmp_path)
|
||||
assert len(urls) == 1
|
||||
assert urls[0]["url"] == "https://docs.sweeting.me/s/youtube-favorites"
|
||||
|
||||
|
||||
def test_crawl_create_stdout_pipes_into_run(initialized_archive):
|
||||
"""`archivebox crawl create | archivebox run` should queue and materialize snapshots."""
|
||||
url = create_test_url()
|
||||
@@ -269,8 +283,13 @@ def test_archiveresult_list_stdout_pipes_into_run(initialized_archive):
|
||||
)
|
||||
assert ar_create_code == 0, ar_create_stderr
|
||||
|
||||
created_records = parse_jsonl_output(ar_create_stdout)
|
||||
archiveresult = next(record for record in created_records if record.get("type") == "ArchiveResult")
|
||||
run_archivebox_cmd(
|
||||
["run"],
|
||||
stdin=ar_create_stdout,
|
||||
data_dir=initialized_archive,
|
||||
timeout=120,
|
||||
env=PIPE_TEST_ENV,
|
||||
)
|
||||
|
||||
list_stdout, list_stderr, list_code = run_archivebox_cmd(
|
||||
["archiveresult", "list", "--plugin=favicon"],
|
||||
@@ -278,6 +297,8 @@ def test_archiveresult_list_stdout_pipes_into_run(initialized_archive):
|
||||
)
|
||||
assert list_code == 0, list_stderr
|
||||
_assert_stdout_is_jsonl_only(list_stdout)
|
||||
listed_records = parse_jsonl_output(list_stdout)
|
||||
archiveresult = next(record for record in listed_records if record.get("type") == "ArchiveResult")
|
||||
|
||||
run_stdout, run_stderr, run_code = run_archivebox_cmd(
|
||||
["run"],
|
||||
|
||||
@@ -8,6 +8,9 @@ Tests cover:
|
||||
"""
|
||||
|
||||
import json
|
||||
import sys
|
||||
|
||||
import pytest
|
||||
|
||||
from archivebox.tests.conftest import (
|
||||
run_archivebox_cmd,
|
||||
@@ -266,3 +269,182 @@ class TestRunEmpty:
|
||||
|
||||
assert code == 0
|
||||
assert 'No records to process' in stderr
|
||||
|
||||
|
||||
class TestRunDaemonMode:
|
||||
def test_run_daemon_processes_stdin_before_runner(self, monkeypatch):
|
||||
from archivebox.cli import archivebox_run
|
||||
|
||||
class FakeStdin:
|
||||
def isatty(self):
|
||||
return False
|
||||
|
||||
monkeypatch.setattr(sys, "stdin", FakeStdin())
|
||||
calls = []
|
||||
monkeypatch.setattr(
|
||||
archivebox_run,
|
||||
"process_stdin_records",
|
||||
lambda: calls.append("stdin") or 0,
|
||||
)
|
||||
monkeypatch.setattr(
|
||||
archivebox_run,
|
||||
"run_runner",
|
||||
lambda daemon=False: calls.append(f"runner:{daemon}") or 0,
|
||||
)
|
||||
|
||||
with pytest.raises(SystemExit) as exit_info:
|
||||
archivebox_run.main.callback(daemon=True, crawl_id=None, snapshot_id=None, binary_id=None)
|
||||
|
||||
assert exit_info.value.code == 0
|
||||
assert calls == ["stdin", "runner:True"]
|
||||
|
||||
def test_run_daemon_skips_runner_if_stdin_processing_fails(self, monkeypatch):
|
||||
from archivebox.cli import archivebox_run
|
||||
|
||||
class FakeStdin:
|
||||
def isatty(self):
|
||||
return False
|
||||
|
||||
monkeypatch.setattr(sys, "stdin", FakeStdin())
|
||||
monkeypatch.setattr(archivebox_run, "process_stdin_records", lambda: 1)
|
||||
monkeypatch.setattr(
|
||||
archivebox_run,
|
||||
"run_runner",
|
||||
lambda daemon=False: (_ for _ in ()).throw(AssertionError("runner should not start after stdin failure")),
|
||||
)
|
||||
|
||||
with pytest.raises(SystemExit) as exit_info:
|
||||
archivebox_run.main.callback(daemon=True, crawl_id=None, snapshot_id=None, binary_id=None)
|
||||
|
||||
assert exit_info.value.code == 1
|
||||
|
||||
|
||||
@pytest.mark.django_db
|
||||
class TestRecoverOrphanedCrawls:
|
||||
def test_recover_orphaned_crawl_requeues_started_crawl_without_active_processes(self):
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
from archivebox.crawls.models import Crawl
|
||||
from archivebox.core.models import Snapshot
|
||||
from archivebox.services.runner import recover_orphaned_crawls
|
||||
|
||||
crawl = Crawl.objects.create(
|
||||
urls='https://example.com',
|
||||
created_by_id=get_or_create_system_user_pk(),
|
||||
status=Crawl.StatusChoices.STARTED,
|
||||
retry_at=None,
|
||||
)
|
||||
Snapshot.objects.create(
|
||||
url='https://example.com',
|
||||
crawl=crawl,
|
||||
status=Snapshot.StatusChoices.QUEUED,
|
||||
retry_at=None,
|
||||
)
|
||||
|
||||
recovered = recover_orphaned_crawls()
|
||||
|
||||
crawl.refresh_from_db()
|
||||
assert recovered == 1
|
||||
assert crawl.status == Crawl.StatusChoices.STARTED
|
||||
assert crawl.retry_at is not None
|
||||
|
||||
def test_recover_orphaned_crawl_skips_active_child_processes(self):
|
||||
import archivebox.machine.models as machine_models
|
||||
from django.utils import timezone
|
||||
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
from archivebox.crawls.models import Crawl
|
||||
from archivebox.core.models import Snapshot
|
||||
from archivebox.machine.models import Machine, Process
|
||||
from archivebox.services.runner import recover_orphaned_crawls
|
||||
|
||||
crawl = Crawl.objects.create(
|
||||
urls='https://example.com',
|
||||
created_by_id=get_or_create_system_user_pk(),
|
||||
status=Crawl.StatusChoices.STARTED,
|
||||
retry_at=None,
|
||||
)
|
||||
snapshot = Snapshot.objects.create(
|
||||
url='https://example.com',
|
||||
crawl=crawl,
|
||||
status=Snapshot.StatusChoices.QUEUED,
|
||||
retry_at=None,
|
||||
)
|
||||
|
||||
machine_models._CURRENT_MACHINE = None
|
||||
machine = Machine.current()
|
||||
Process.objects.create(
|
||||
machine=machine,
|
||||
process_type=Process.TypeChoices.HOOK,
|
||||
status=Process.StatusChoices.RUNNING,
|
||||
cmd=['/plugins/chrome/on_Crawl__91_chrome_wait.js'],
|
||||
env={
|
||||
'CRAWL_ID': str(crawl.id),
|
||||
'SNAPSHOT_ID': str(snapshot.id),
|
||||
},
|
||||
started_at=timezone.now(),
|
||||
)
|
||||
|
||||
recovered = recover_orphaned_crawls()
|
||||
|
||||
crawl.refresh_from_db()
|
||||
assert recovered == 0
|
||||
assert crawl.retry_at is None
|
||||
|
||||
def test_recover_orphaned_crawl_seals_when_all_snapshots_are_already_sealed(self):
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
from archivebox.crawls.models import Crawl
|
||||
from archivebox.core.models import Snapshot
|
||||
from archivebox.services.runner import recover_orphaned_crawls
|
||||
|
||||
crawl = Crawl.objects.create(
|
||||
urls='https://example.com',
|
||||
created_by_id=get_or_create_system_user_pk(),
|
||||
status=Crawl.StatusChoices.STARTED,
|
||||
retry_at=None,
|
||||
)
|
||||
Snapshot.objects.create(
|
||||
url='https://example.com',
|
||||
crawl=crawl,
|
||||
status=Snapshot.StatusChoices.SEALED,
|
||||
retry_at=None,
|
||||
)
|
||||
|
||||
recovered = recover_orphaned_crawls()
|
||||
|
||||
crawl.refresh_from_db()
|
||||
assert recovered == 1
|
||||
assert crawl.status == Crawl.StatusChoices.SEALED
|
||||
assert crawl.retry_at is None
|
||||
|
||||
|
||||
@pytest.mark.django_db
|
||||
class TestRecoverOrphanedSnapshots:
|
||||
def test_recover_orphaned_snapshot_requeues_started_snapshot_without_active_processes(self):
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
from archivebox.crawls.models import Crawl
|
||||
from archivebox.core.models import Snapshot
|
||||
from archivebox.services.runner import recover_orphaned_snapshots
|
||||
|
||||
crawl = Crawl.objects.create(
|
||||
urls='https://example.com',
|
||||
created_by_id=get_or_create_system_user_pk(),
|
||||
status=Crawl.StatusChoices.SEALED,
|
||||
retry_at=None,
|
||||
)
|
||||
snapshot = Snapshot.objects.create(
|
||||
url='https://example.com',
|
||||
crawl=crawl,
|
||||
status=Snapshot.StatusChoices.STARTED,
|
||||
retry_at=None,
|
||||
)
|
||||
|
||||
recovered = recover_orphaned_snapshots()
|
||||
|
||||
snapshot.refresh_from_db()
|
||||
crawl.refresh_from_db()
|
||||
|
||||
assert recovered == 1
|
||||
assert snapshot.status == Snapshot.StatusChoices.QUEUED
|
||||
assert snapshot.retry_at is not None
|
||||
assert crawl.status == Crawl.StatusChoices.QUEUED
|
||||
assert crawl.retry_at is not None
|
||||
|
||||
@@ -6,6 +6,15 @@ Verify server can start (basic smoke tests only, no full server testing).
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
from unittest.mock import Mock
|
||||
|
||||
|
||||
def test_sqlite_connections_use_explicit_30_second_busy_timeout():
|
||||
from archivebox.core.settings import SQLITE_CONNECTION_OPTIONS
|
||||
|
||||
assert SQLITE_CONNECTION_OPTIONS["OPTIONS"]["timeout"] == 30
|
||||
assert "PRAGMA busy_timeout = 30000;" in SQLITE_CONNECTION_OPTIONS["OPTIONS"]["init_command"]
|
||||
|
||||
|
||||
def test_server_shows_usage_info(tmp_path, process):
|
||||
@@ -39,3 +48,64 @@ def test_server_init_flag(tmp_path, process):
|
||||
|
||||
assert result.returncode == 0
|
||||
assert '--init' in result.stdout or 'init' in result.stdout.lower()
|
||||
|
||||
|
||||
def test_runner_worker_uses_current_interpreter():
|
||||
"""The supervised runner should use the active Python environment, not PATH."""
|
||||
from archivebox.workers.supervisord_util import RUNNER_WORKER
|
||||
|
||||
assert RUNNER_WORKER["command"] == f"{sys.executable} -m archivebox run --daemon"
|
||||
|
||||
|
||||
def test_reload_workers_use_current_interpreter_and_supervisord_managed_runner():
|
||||
from archivebox.workers.supervisord_util import RUNNER_WATCH_WORKER, RUNSERVER_WORKER
|
||||
|
||||
runserver = RUNSERVER_WORKER("127.0.0.1", "8000", reload=True, pidfile="/tmp/runserver.pid")
|
||||
watcher = RUNNER_WATCH_WORKER("/tmp/runserver.pid")
|
||||
|
||||
assert runserver["name"] == "worker_runserver"
|
||||
assert runserver["command"] == f"{sys.executable} -m archivebox manage runserver 127.0.0.1:8000"
|
||||
assert 'ARCHIVEBOX_RUNSERVER="1"' in runserver["environment"]
|
||||
assert 'ARCHIVEBOX_AUTORELOAD="1"' in runserver["environment"]
|
||||
assert 'ARCHIVEBOX_RUNSERVER_PIDFILE="/tmp/runserver.pid"' in runserver["environment"]
|
||||
|
||||
assert watcher["name"] == "worker_runner_watch"
|
||||
assert watcher["command"] == f"{sys.executable} -m archivebox manage runner_watch --pidfile=/tmp/runserver.pid"
|
||||
|
||||
|
||||
def test_stop_existing_background_runner_cleans_up_and_stops_orchestrators():
|
||||
from archivebox.cli.archivebox_server import stop_existing_background_runner
|
||||
|
||||
runner_a = Mock()
|
||||
runner_a.kill_tree = Mock()
|
||||
runner_a.terminate = Mock()
|
||||
runner_b = Mock()
|
||||
runner_b.kill_tree = Mock(side_effect=RuntimeError("boom"))
|
||||
runner_b.terminate = Mock()
|
||||
|
||||
process_model = Mock()
|
||||
process_model.StatusChoices.RUNNING = "running"
|
||||
process_model.TypeChoices.ORCHESTRATOR = "orchestrator"
|
||||
queryset = Mock()
|
||||
queryset.order_by.return_value = [runner_a, runner_b]
|
||||
process_model.objects.filter.return_value = queryset
|
||||
|
||||
supervisor = Mock()
|
||||
stop_worker = Mock()
|
||||
log = Mock()
|
||||
|
||||
stopped = stop_existing_background_runner(
|
||||
machine=Mock(),
|
||||
process_model=process_model,
|
||||
supervisor=supervisor,
|
||||
stop_worker_fn=stop_worker,
|
||||
log=log,
|
||||
)
|
||||
|
||||
assert stopped == 2
|
||||
assert process_model.cleanup_stale_running.call_count == 2
|
||||
stop_worker.assert_any_call(supervisor, "worker_runner")
|
||||
stop_worker.assert_any_call(supervisor, "worker_runner_watch")
|
||||
runner_a.kill_tree.assert_called_once_with(graceful_timeout=2.0)
|
||||
runner_b.terminate.assert_called_once_with(graceful_timeout=2.0)
|
||||
log.assert_called_once()
|
||||
|
||||
@@ -74,7 +74,7 @@ class TestSnapshotCreate:
|
||||
|
||||
assert code == 0
|
||||
records = parse_jsonl_output(stdout)
|
||||
assert 'test-tag' in records[0].get('tags_str', '')
|
||||
assert 'test-tag' in records[0].get('tags', '')
|
||||
|
||||
def test_create_pass_through_other_types(self, initialized_archive):
|
||||
"""Pass-through records of other types unchanged."""
|
||||
|
||||
326
archivebox/tests/test_config_views.py
Normal file
326
archivebox/tests/test_config_views.py
Normal file
@@ -0,0 +1,326 @@
|
||||
from datetime import timedelta
|
||||
from types import SimpleNamespace
|
||||
|
||||
import pytest
|
||||
from django.test import RequestFactory
|
||||
from django.utils import timezone
|
||||
|
||||
from archivebox.config import views as config_views
|
||||
from archivebox.core import views as core_views
|
||||
from archivebox.machine.models import Binary
|
||||
|
||||
|
||||
pytestmark = pytest.mark.django_db
|
||||
|
||||
|
||||
def test_get_db_binaries_by_name_collapses_youtube_dl_aliases(monkeypatch):
|
||||
now = timezone.now()
|
||||
records = [
|
||||
SimpleNamespace(
|
||||
name='youtube-dl',
|
||||
version='',
|
||||
binprovider='',
|
||||
abspath='/usr/bin/youtube-dl',
|
||||
status=Binary.StatusChoices.INSTALLED,
|
||||
modified_at=now,
|
||||
),
|
||||
SimpleNamespace(
|
||||
name='yt-dlp',
|
||||
version='2026.03.01',
|
||||
binprovider='pip',
|
||||
abspath='/usr/bin/yt-dlp',
|
||||
status=Binary.StatusChoices.INSTALLED,
|
||||
modified_at=now + timedelta(seconds=1),
|
||||
),
|
||||
]
|
||||
|
||||
monkeypatch.setattr(config_views.Binary, 'objects', SimpleNamespace(all=lambda: records))
|
||||
|
||||
binaries = config_views.get_db_binaries_by_name()
|
||||
|
||||
assert 'yt-dlp' in binaries
|
||||
assert 'youtube-dl' not in binaries
|
||||
assert binaries['yt-dlp'].version == '2026.03.01'
|
||||
|
||||
|
||||
def test_binaries_list_view_uses_db_version_and_hides_youtube_dl_alias(monkeypatch):
|
||||
request = RequestFactory().get('/admin/environment/binaries/')
|
||||
request.user = SimpleNamespace(is_superuser=True)
|
||||
|
||||
db_binary = SimpleNamespace(
|
||||
name='youtube-dl',
|
||||
version='2026.03.01',
|
||||
binprovider='pip',
|
||||
abspath='/usr/bin/yt-dlp',
|
||||
status=Binary.StatusChoices.INSTALLED,
|
||||
sha256='',
|
||||
modified_at=timezone.now(),
|
||||
)
|
||||
|
||||
monkeypatch.setattr(config_views, 'get_db_binaries_by_name', lambda: {'yt-dlp': db_binary})
|
||||
|
||||
context = config_views.binaries_list_view.__wrapped__(request)
|
||||
|
||||
assert len(context['table']['Binary Name']) == 1
|
||||
assert str(context['table']['Binary Name'][0].link_item) == 'yt-dlp'
|
||||
assert context['table']['Found Version'][0] == '✅ 2026.03.01'
|
||||
assert context['table']['Provided By'][0] == 'pip'
|
||||
assert context['table']['Found Abspath'][0] == '/usr/bin/yt-dlp'
|
||||
|
||||
|
||||
def test_binaries_list_view_only_shows_persisted_records(monkeypatch):
|
||||
request = RequestFactory().get('/admin/environment/binaries/')
|
||||
request.user = SimpleNamespace(is_superuser=True)
|
||||
|
||||
monkeypatch.setattr(config_views, 'get_db_binaries_by_name', lambda: {})
|
||||
|
||||
context = config_views.binaries_list_view.__wrapped__(request)
|
||||
|
||||
assert context['table']['Binary Name'] == []
|
||||
assert context['table']['Found Version'] == []
|
||||
assert context['table']['Provided By'] == []
|
||||
assert context['table']['Found Abspath'] == []
|
||||
|
||||
|
||||
def test_binary_detail_view_uses_canonical_db_record(monkeypatch):
|
||||
request = RequestFactory().get('/admin/environment/binaries/youtube-dl/')
|
||||
request.user = SimpleNamespace(is_superuser=True)
|
||||
|
||||
db_binary = SimpleNamespace(
|
||||
id='019d14cc-6c40-7793-8ff1-0f8bb050e8a3',
|
||||
name='yt-dlp',
|
||||
version='2026.03.01',
|
||||
binprovider='pip',
|
||||
abspath='/usr/bin/yt-dlp',
|
||||
sha256='abc123',
|
||||
status=Binary.StatusChoices.INSTALLED,
|
||||
modified_at=timezone.now(),
|
||||
)
|
||||
|
||||
monkeypatch.setattr(config_views, 'get_db_binaries_by_name', lambda: {'yt-dlp': db_binary})
|
||||
|
||||
context = config_views.binary_detail_view.__wrapped__(request, key='youtube-dl')
|
||||
section = context['data'][0]
|
||||
|
||||
assert context['title'] == 'yt-dlp'
|
||||
assert section['fields']['name'] == 'yt-dlp'
|
||||
assert section['fields']['version'] == '2026.03.01'
|
||||
assert section['fields']['binprovider'] == 'pip'
|
||||
assert section['fields']['abspath'] == '/usr/bin/yt-dlp'
|
||||
assert '/admin/machine/binary/019d14cc-6c40-7793-8ff1-0f8bb050e8a3/change/?_changelist_filters=q%3Dyt-dlp' in section['description']
|
||||
|
||||
|
||||
def test_binary_detail_view_marks_unrecorded_binary(monkeypatch):
|
||||
request = RequestFactory().get('/admin/environment/binaries/wget/')
|
||||
request.user = SimpleNamespace(is_superuser=True)
|
||||
|
||||
monkeypatch.setattr(config_views, 'get_db_binaries_by_name', lambda: {})
|
||||
|
||||
context = config_views.binary_detail_view.__wrapped__(request, key='wget')
|
||||
section = context['data'][0]
|
||||
|
||||
assert section['description'] == 'No persisted Binary record found'
|
||||
assert section['fields']['status'] == 'unrecorded'
|
||||
assert section['fields']['binprovider'] == 'not recorded'
|
||||
|
||||
|
||||
def test_plugin_detail_view_renders_config_in_dedicated_sections(monkeypatch):
|
||||
request = RequestFactory().get('/admin/environment/plugins/builtin.example/')
|
||||
request.user = SimpleNamespace(is_superuser=True)
|
||||
|
||||
plugin_config = {
|
||||
'title': 'Example Plugin',
|
||||
'description': 'Example config used to verify plugin metadata rendering.',
|
||||
'type': 'object',
|
||||
'required_plugins': ['chrome'],
|
||||
'required_binaries': ['example-cli'],
|
||||
'output_mimetypes': ['text/plain', 'application/json'],
|
||||
'properties': {
|
||||
'EXAMPLE_ENABLED': {
|
||||
'type': 'boolean',
|
||||
'description': 'Enable the example plugin.',
|
||||
'x-fallback': 'CHECK_SSL_VALIDITY',
|
||||
},
|
||||
'EXAMPLE_BINARY': {
|
||||
'type': 'string',
|
||||
'default': 'gallery-dl',
|
||||
'description': 'Filesystem path for example output.',
|
||||
'x-aliases': ['USE_EXAMPLE_BINARY'],
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
monkeypatch.setattr(config_views, 'get_filesystem_plugins', lambda: {
|
||||
'builtin.example': {
|
||||
'id': 'builtin.example',
|
||||
'name': 'example',
|
||||
'source': 'builtin',
|
||||
'path': '/plugins/example',
|
||||
'hooks': ['on_Snapshot__01_example.py'],
|
||||
'config': plugin_config,
|
||||
}
|
||||
})
|
||||
monkeypatch.setattr(config_views, 'get_machine_admin_url', lambda: '/admin/machine/machine/test-machine/change/')
|
||||
|
||||
context = config_views.plugin_detail_view.__wrapped__(request, key='builtin.example')
|
||||
|
||||
assert context['title'] == 'example'
|
||||
assert len(context['data']) == 5
|
||||
|
||||
summary_section, hooks_section, metadata_section, config_section, properties_section = context['data']
|
||||
|
||||
assert summary_section['fields'] == {
|
||||
'id': 'builtin.example',
|
||||
'name': 'example',
|
||||
'source': 'builtin',
|
||||
}
|
||||
assert '/plugins/example' in summary_section['description']
|
||||
assert 'https://archivebox.github.io/abx-plugins/#example' in summary_section['description']
|
||||
|
||||
assert hooks_section['name'] == 'Hooks'
|
||||
assert hooks_section['fields'] == {}
|
||||
assert 'https://github.com/ArchiveBox/abx-plugins/tree/main/abx_plugins/plugins/example/on_Snapshot__01_example.py' in hooks_section['description']
|
||||
assert 'on_Snapshot__01_example.py' in hooks_section['description']
|
||||
|
||||
assert metadata_section['name'] == 'Plugin Metadata'
|
||||
assert metadata_section['fields'] == {}
|
||||
assert 'Example Plugin' in metadata_section['description']
|
||||
assert 'Example config used to verify plugin metadata rendering.' in metadata_section['description']
|
||||
assert 'https://archivebox.github.io/abx-plugins/#chrome' in metadata_section['description']
|
||||
assert '/admin/environment/binaries/example-cli/' in metadata_section['description']
|
||||
assert 'text/plain' in metadata_section['description']
|
||||
assert 'application/json' in metadata_section['description']
|
||||
|
||||
assert config_section['name'] == 'config.json'
|
||||
assert config_section['fields'] == {}
|
||||
assert '<pre style=' in config_section['description']
|
||||
assert 'EXAMPLE_ENABLED' in config_section['description']
|
||||
assert '<span style="color: #0550ae;">"properties"</span>' in config_section['description']
|
||||
|
||||
assert properties_section['name'] == 'Config Properties'
|
||||
assert properties_section['fields'] == {}
|
||||
assert '/admin/machine/machine/test-machine/change/' in properties_section['description']
|
||||
assert '/admin/machine/binary/' in properties_section['description']
|
||||
assert '/admin/environment/binaries/' in properties_section['description']
|
||||
assert 'EXAMPLE_ENABLED' in properties_section['description']
|
||||
assert 'boolean' in properties_section['description']
|
||||
assert 'Enable the example plugin.' in properties_section['description']
|
||||
assert '/admin/environment/config/EXAMPLE_ENABLED/' in properties_section['description']
|
||||
assert '/admin/environment/config/CHECK_SSL_VALIDITY/' in properties_section['description']
|
||||
assert '/admin/environment/config/USE_EXAMPLE_BINARY/' in properties_section['description']
|
||||
assert '/admin/environment/binaries/gallery-dl/' in properties_section['description']
|
||||
assert 'EXAMPLE_BINARY' in properties_section['description']
|
||||
|
||||
|
||||
def test_get_config_definition_link_keeps_core_config_search_link(monkeypatch):
|
||||
monkeypatch.setattr(core_views, 'find_plugin_for_config_key', lambda key: None)
|
||||
|
||||
url, label = core_views.get_config_definition_link('CHECK_SSL_VALIDITY')
|
||||
|
||||
assert 'github.com/search' in url
|
||||
assert 'CHECK_SSL_VALIDITY' in url
|
||||
assert label == 'archivebox/config'
|
||||
|
||||
|
||||
def test_get_config_definition_link_uses_plugin_config_json_for_plugin_options(monkeypatch):
|
||||
plugin_dir = core_views.BUILTIN_PLUGINS_DIR / 'parse_dom_outlinks'
|
||||
|
||||
monkeypatch.setattr(core_views, 'find_plugin_for_config_key', lambda key: 'parse_dom_outlinks')
|
||||
monkeypatch.setattr(core_views, 'iter_plugin_dirs', lambda: [plugin_dir])
|
||||
|
||||
url, label = core_views.get_config_definition_link('PARSE_DOM_OUTLINKS_ENABLED')
|
||||
|
||||
assert url == 'https://github.com/ArchiveBox/abx-plugins/tree/main/abx_plugins/plugins/parse_dom_outlinks/config.json'
|
||||
assert label == 'abx_plugins/plugins/parse_dom_outlinks/config.json'
|
||||
|
||||
|
||||
def test_live_config_value_view_renames_source_field_and_uses_plugin_definition_link(monkeypatch):
|
||||
request = RequestFactory().get('/admin/environment/config/PARSE_DOM_OUTLINKS_ENABLED/')
|
||||
request.user = SimpleNamespace(is_superuser=True)
|
||||
|
||||
monkeypatch.setattr(core_views, 'get_all_configs', lambda: {})
|
||||
monkeypatch.setattr(core_views, 'get_flat_config', lambda: {})
|
||||
monkeypatch.setattr(core_views, 'get_config', lambda: {'PARSE_DOM_OUTLINKS_ENABLED': True})
|
||||
monkeypatch.setattr(core_views, 'find_config_default', lambda key: 'True')
|
||||
monkeypatch.setattr(core_views, 'find_config_type', lambda key: 'bool')
|
||||
monkeypatch.setattr(core_views, 'find_config_source', lambda key, merged: 'Default')
|
||||
monkeypatch.setattr(core_views, 'key_is_safe', lambda key: True)
|
||||
monkeypatch.setattr(core_views.CONSTANTS, 'CONFIG_FILE', SimpleNamespace(exists=lambda: False))
|
||||
|
||||
from archivebox.machine.models import Machine
|
||||
from archivebox.config.configset import BaseConfigSet
|
||||
|
||||
monkeypatch.setattr(Machine, 'current', classmethod(lambda cls: SimpleNamespace(id='machine-id', config={})))
|
||||
monkeypatch.setattr(BaseConfigSet, 'load_from_file', classmethod(lambda cls, path: {}))
|
||||
monkeypatch.setattr(
|
||||
core_views,
|
||||
'get_config_definition_link',
|
||||
lambda key: (
|
||||
'https://github.com/ArchiveBox/abx-plugins/tree/main/abx_plugins/plugins/parse_dom_outlinks/config.json',
|
||||
'abx_plugins/plugins/parse_dom_outlinks/config.json',
|
||||
),
|
||||
)
|
||||
|
||||
context = core_views.live_config_value_view.__wrapped__(request, key='PARSE_DOM_OUTLINKS_ENABLED')
|
||||
section = context['data'][0]
|
||||
|
||||
assert 'Currently read from' in section['fields']
|
||||
assert 'Source' not in section['fields']
|
||||
assert section['fields']['Currently read from'] == 'Default'
|
||||
assert 'abx_plugins/plugins/parse_dom_outlinks/config.json' in section['help_texts']['Type']
|
||||
|
||||
|
||||
def test_find_config_source_prefers_environment_over_machine_and_file(monkeypatch):
|
||||
monkeypatch.setenv('CHECK_SSL_VALIDITY', 'false')
|
||||
|
||||
from archivebox.machine.models import Machine
|
||||
from archivebox.config.configset import BaseConfigSet
|
||||
|
||||
monkeypatch.setattr(
|
||||
Machine,
|
||||
'current',
|
||||
classmethod(lambda cls: SimpleNamespace(id='machine-id', config={'CHECK_SSL_VALIDITY': 'true'})),
|
||||
)
|
||||
monkeypatch.setattr(
|
||||
BaseConfigSet,
|
||||
'load_from_file',
|
||||
classmethod(lambda cls, path: {'CHECK_SSL_VALIDITY': 'true'}),
|
||||
)
|
||||
|
||||
assert core_views.find_config_source('CHECK_SSL_VALIDITY', {'CHECK_SSL_VALIDITY': False}) == 'Environment'
|
||||
|
||||
|
||||
def test_live_config_value_view_priority_text_matches_runtime_precedence(monkeypatch):
|
||||
request = RequestFactory().get('/admin/environment/config/CHECK_SSL_VALIDITY/')
|
||||
request.user = SimpleNamespace(is_superuser=True)
|
||||
|
||||
monkeypatch.setattr(core_views, 'get_all_configs', lambda: {})
|
||||
monkeypatch.setattr(core_views, 'get_flat_config', lambda: {'CHECK_SSL_VALIDITY': True})
|
||||
monkeypatch.setattr(core_views, 'get_config', lambda: {'CHECK_SSL_VALIDITY': False})
|
||||
monkeypatch.setattr(core_views, 'find_config_default', lambda key: 'True')
|
||||
monkeypatch.setattr(core_views, 'find_config_type', lambda key: 'bool')
|
||||
monkeypatch.setattr(core_views, 'key_is_safe', lambda key: True)
|
||||
|
||||
from archivebox.machine.models import Machine
|
||||
from archivebox.config.configset import BaseConfigSet
|
||||
|
||||
monkeypatch.setattr(
|
||||
Machine,
|
||||
'current',
|
||||
classmethod(lambda cls: SimpleNamespace(id='machine-id', config={'CHECK_SSL_VALIDITY': 'true'})),
|
||||
)
|
||||
monkeypatch.setattr(
|
||||
BaseConfigSet,
|
||||
'load_from_file',
|
||||
classmethod(lambda cls, path: {'CHECK_SSL_VALIDITY': 'true'}),
|
||||
)
|
||||
monkeypatch.setattr(core_views.CONSTANTS, 'CONFIG_FILE', SimpleNamespace(exists=lambda: True))
|
||||
monkeypatch.setenv('CHECK_SSL_VALIDITY', 'false')
|
||||
|
||||
context = core_views.live_config_value_view.__wrapped__(request, key='CHECK_SSL_VALIDITY')
|
||||
section = context['data'][0]
|
||||
|
||||
assert section['fields']['Currently read from'] == 'Environment'
|
||||
help_text = section['help_texts']['Currently read from']
|
||||
assert help_text.index('Environment') < help_text.index('Machine') < help_text.index('Config File') < help_text.index('Default')
|
||||
assert 'Configuration Sources (highest priority first):' in section['help_texts']['Value']
|
||||
220
archivebox/tests/test_crawl_admin.py
Normal file
220
archivebox/tests/test_crawl_admin.py
Normal file
@@ -0,0 +1,220 @@
|
||||
from typing import cast
|
||||
|
||||
import pytest
|
||||
from django.contrib.auth import get_user_model
|
||||
from django.contrib.auth.models import UserManager
|
||||
from django.urls import reverse
|
||||
|
||||
from archivebox.crawls.admin import CrawlAdminForm
|
||||
from archivebox.crawls.models import Crawl
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
|
||||
pytestmark = pytest.mark.django_db
|
||||
|
||||
|
||||
User = get_user_model()
|
||||
ADMIN_HOST = 'admin.archivebox.localhost:8000'
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def admin_user(db):
|
||||
return cast(UserManager, User.objects).create_superuser(
|
||||
username='crawladmin',
|
||||
email='crawladmin@test.com',
|
||||
password='testpassword',
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def crawl(admin_user):
|
||||
return Crawl.objects.create(
|
||||
urls='https://example.com\nhttps://example.org',
|
||||
tags_str='alpha,beta',
|
||||
created_by=admin_user,
|
||||
)
|
||||
|
||||
|
||||
def test_crawl_admin_change_view_renders_tag_editor_widget(client, admin_user, crawl):
|
||||
client.login(username='crawladmin', password='testpassword')
|
||||
|
||||
response = client.get(
|
||||
reverse('admin:crawls_crawl_change', args=[crawl.pk]),
|
||||
HTTP_HOST=ADMIN_HOST,
|
||||
)
|
||||
|
||||
assert response.status_code == 200
|
||||
assert b'name="tags_editor"' in response.content
|
||||
assert b'tag-editor-container' in response.content
|
||||
assert b'alpha' in response.content
|
||||
assert b'beta' in response.content
|
||||
|
||||
|
||||
def test_crawl_admin_add_view_renders_url_filter_alias_fields(client, admin_user):
|
||||
client.login(username='crawladmin', password='testpassword')
|
||||
|
||||
response = client.get(
|
||||
reverse('admin:crawls_crawl_add'),
|
||||
HTTP_HOST=ADMIN_HOST,
|
||||
)
|
||||
|
||||
assert response.status_code == 200
|
||||
assert b'name="url_filters_allowlist"' in response.content
|
||||
assert b'name="url_filters_denylist"' in response.content
|
||||
assert b'Same domain only' in response.content
|
||||
|
||||
|
||||
def test_crawl_admin_form_saves_tags_editor_to_tags_str(crawl, admin_user):
|
||||
form = CrawlAdminForm(
|
||||
data={
|
||||
'created_at': crawl.created_at.strftime('%Y-%m-%d %H:%M:%S'),
|
||||
'urls': crawl.urls,
|
||||
'config': '{}',
|
||||
'max_depth': '0',
|
||||
'tags_editor': 'alpha, beta, Alpha, gamma',
|
||||
'url_filters_allowlist': 'example.com\n*.example.com',
|
||||
'url_filters_denylist': 'static.example.com',
|
||||
'persona_id': '',
|
||||
'label': '',
|
||||
'notes': '',
|
||||
'schedule': '',
|
||||
'status': crawl.status,
|
||||
'retry_at': crawl.retry_at.strftime('%Y-%m-%d %H:%M:%S'),
|
||||
'created_by': str(admin_user.pk),
|
||||
'num_uses_failed': '0',
|
||||
'num_uses_succeeded': '0',
|
||||
},
|
||||
instance=crawl,
|
||||
)
|
||||
|
||||
assert form.is_valid(), form.errors
|
||||
|
||||
updated = form.save()
|
||||
updated.refresh_from_db()
|
||||
assert updated.tags_str == 'alpha,beta,gamma'
|
||||
assert updated.config['URL_ALLOWLIST'] == 'example.com\n*.example.com'
|
||||
assert updated.config['URL_DENYLIST'] == 'static.example.com'
|
||||
|
||||
|
||||
def test_crawl_admin_delete_snapshot_action_removes_snapshot_and_url(client, admin_user):
|
||||
crawl = Crawl.objects.create(
|
||||
urls='https://example.com/remove-me',
|
||||
created_by=admin_user,
|
||||
)
|
||||
snapshot = Snapshot.objects.create(
|
||||
crawl=crawl,
|
||||
url='https://example.com/remove-me',
|
||||
)
|
||||
|
||||
client.login(username='crawladmin', password='testpassword')
|
||||
response = client.post(
|
||||
reverse('admin:crawls_crawl_snapshot_delete', args=[crawl.pk, snapshot.pk]),
|
||||
HTTP_HOST=ADMIN_HOST,
|
||||
)
|
||||
|
||||
assert response.status_code == 200
|
||||
assert response.json()['ok'] is True
|
||||
assert not Snapshot.objects.filter(pk=snapshot.pk).exists()
|
||||
|
||||
crawl.refresh_from_db()
|
||||
assert 'https://example.com/remove-me' not in crawl.urls
|
||||
|
||||
|
||||
def test_crawl_admin_exclude_domain_action_prunes_urls_and_pending_snapshots(client, admin_user):
|
||||
crawl = Crawl.objects.create(
|
||||
urls='\n'.join([
|
||||
'https://cdn.example.com/asset.js',
|
||||
'https://cdn.example.com/second.js',
|
||||
'https://example.com/root',
|
||||
]),
|
||||
created_by=admin_user,
|
||||
)
|
||||
queued_snapshot = Snapshot.objects.create(
|
||||
crawl=crawl,
|
||||
url='https://cdn.example.com/asset.js',
|
||||
status=Snapshot.StatusChoices.QUEUED,
|
||||
)
|
||||
preserved_snapshot = Snapshot.objects.create(
|
||||
crawl=crawl,
|
||||
url='https://example.com/root',
|
||||
status=Snapshot.StatusChoices.SEALED,
|
||||
)
|
||||
|
||||
client.login(username='crawladmin', password='testpassword')
|
||||
response = client.post(
|
||||
reverse('admin:crawls_crawl_snapshot_exclude_domain', args=[crawl.pk, queued_snapshot.pk]),
|
||||
HTTP_HOST=ADMIN_HOST,
|
||||
)
|
||||
|
||||
assert response.status_code == 200
|
||||
payload = response.json()
|
||||
assert payload['ok'] is True
|
||||
assert payload['domain'] == 'cdn.example.com'
|
||||
|
||||
crawl.refresh_from_db()
|
||||
assert crawl.get_url_denylist(use_effective_config=False) == ['cdn.example.com']
|
||||
assert 'https://cdn.example.com/asset.js' not in crawl.urls
|
||||
assert 'https://cdn.example.com/second.js' not in crawl.urls
|
||||
assert 'https://example.com/root' in crawl.urls
|
||||
assert not Snapshot.objects.filter(pk=queued_snapshot.pk).exists()
|
||||
assert Snapshot.objects.filter(pk=preserved_snapshot.pk).exists()
|
||||
|
||||
|
||||
def test_snapshot_from_json_trims_markdown_suffixes_on_discovered_urls(crawl):
|
||||
snapshot = Snapshot.from_json(
|
||||
{'url': 'https://docs.sweeting.me/s/youtube-favorites)**'},
|
||||
overrides={'crawl': crawl},
|
||||
queue_for_extraction=False,
|
||||
)
|
||||
|
||||
assert snapshot is not None
|
||||
assert snapshot.url == 'https://docs.sweeting.me/s/youtube-favorites'
|
||||
|
||||
|
||||
def test_create_snapshots_from_urls_respects_url_allowlist_and_denylist(admin_user):
|
||||
crawl = Crawl.objects.create(
|
||||
urls='\n'.join([
|
||||
'https://example.com/root',
|
||||
'https://static.example.com/app.js',
|
||||
'https://other.test/page',
|
||||
]),
|
||||
created_by=admin_user,
|
||||
config={
|
||||
'URL_ALLOWLIST': 'example.com',
|
||||
'URL_DENYLIST': 'static.example.com',
|
||||
},
|
||||
)
|
||||
|
||||
created = crawl.create_snapshots_from_urls()
|
||||
|
||||
assert [snapshot.url for snapshot in created] == ['https://example.com/root']
|
||||
|
||||
|
||||
def test_url_filter_regex_lists_preserve_commas_and_split_on_newlines_only(admin_user):
|
||||
crawl = Crawl.objects.create(
|
||||
urls='\n'.join([
|
||||
'https://example.com/root',
|
||||
'https://example.com/path,with,commas',
|
||||
'https://other.test/page',
|
||||
]),
|
||||
created_by=admin_user,
|
||||
config={
|
||||
'URL_ALLOWLIST': r'^https://example\.com/(root|path,with,commas)$' + '\n' + r'^https://other\.test/page$',
|
||||
'URL_DENYLIST': r'^https://example\.com/path,with,commas$',
|
||||
},
|
||||
)
|
||||
|
||||
assert crawl.get_url_allowlist(use_effective_config=False) == [
|
||||
r'^https://example\.com/(root|path,with,commas)$',
|
||||
r'^https://other\.test/page$',
|
||||
]
|
||||
assert crawl.get_url_denylist(use_effective_config=False) == [
|
||||
r'^https://example\.com/path,with,commas$',
|
||||
]
|
||||
|
||||
created = crawl.create_snapshots_from_urls()
|
||||
|
||||
assert [snapshot.url for snapshot in created] == [
|
||||
'https://example.com/root',
|
||||
'https://other.test/page',
|
||||
]
|
||||
@@ -14,7 +14,7 @@ Tests cover:
|
||||
import os
|
||||
from datetime import timedelta
|
||||
from typing import cast
|
||||
from unittest.mock import patch
|
||||
from unittest.mock import Mock, patch
|
||||
|
||||
import pytest
|
||||
from django.test import TestCase
|
||||
@@ -89,11 +89,45 @@ class TestMachineModel(TestCase):
|
||||
assert result is not None
|
||||
self.assertEqual(result.config.get('WGET_BINARY'), '/usr/bin/wget')
|
||||
|
||||
def test_machine_from_jsonl_strips_legacy_chromium_version(self):
|
||||
"""Machine.from_json() should ignore legacy browser version keys."""
|
||||
Machine.current() # Ensure machine exists
|
||||
record = {
|
||||
'config': {
|
||||
'WGET_BINARY': '/usr/bin/wget',
|
||||
'CHROMIUM_VERSION': '123.4.5',
|
||||
},
|
||||
}
|
||||
|
||||
result = Machine.from_json(record)
|
||||
|
||||
self.assertIsNotNone(result)
|
||||
assert result is not None
|
||||
self.assertEqual(result.config.get('WGET_BINARY'), '/usr/bin/wget')
|
||||
self.assertNotIn('CHROMIUM_VERSION', result.config)
|
||||
|
||||
def test_machine_from_jsonl_invalid(self):
|
||||
"""Machine.from_json() should return None for invalid records."""
|
||||
result = Machine.from_json({'invalid': 'record'})
|
||||
self.assertIsNone(result)
|
||||
|
||||
def test_machine_current_strips_legacy_chromium_version(self):
|
||||
"""Machine.current() should clean legacy browser version keys from persisted config."""
|
||||
import archivebox.machine.models as models
|
||||
|
||||
machine = Machine.current()
|
||||
machine.config = {
|
||||
'CHROME_BINARY': '/tmp/chromium',
|
||||
'CHROMIUM_VERSION': '123.4.5',
|
||||
}
|
||||
machine.save(update_fields=['config'])
|
||||
models._CURRENT_MACHINE = machine
|
||||
|
||||
refreshed = Machine.current()
|
||||
|
||||
self.assertEqual(refreshed.config.get('CHROME_BINARY'), '/tmp/chromium')
|
||||
self.assertNotIn('CHROMIUM_VERSION', refreshed.config)
|
||||
|
||||
def test_machine_manager_current(self):
|
||||
"""Machine.objects.current() should return current machine."""
|
||||
machine = Machine.current()
|
||||
@@ -131,6 +165,36 @@ class TestNetworkInterfaceModel(TestCase):
|
||||
interface = NetworkInterface.current()
|
||||
self.assertIsNotNone(interface)
|
||||
|
||||
def test_networkinterface_current_refresh_creates_new_interface_when_properties_change(self):
|
||||
"""Refreshing should persist a new NetworkInterface row when the host network fingerprint changes."""
|
||||
import archivebox.machine.models as models
|
||||
|
||||
first = {
|
||||
'mac_address': 'aa:bb:cc:dd:ee:01',
|
||||
'ip_public': '1.1.1.1',
|
||||
'ip_local': '192.168.1.10',
|
||||
'dns_server': '8.8.8.8',
|
||||
'hostname': 'host-a',
|
||||
'iface': 'en0',
|
||||
'isp': 'ISP A',
|
||||
'city': 'City',
|
||||
'region': 'Region',
|
||||
'country': 'Country',
|
||||
}
|
||||
second = {
|
||||
**first,
|
||||
'ip_public': '2.2.2.2',
|
||||
'ip_local': '10.0.0.5',
|
||||
}
|
||||
|
||||
with patch.object(models, 'get_host_network', side_effect=[first, second]):
|
||||
interface1 = NetworkInterface.current(refresh=True)
|
||||
interface2 = NetworkInterface.current(refresh=True)
|
||||
|
||||
self.assertNotEqual(interface1.id, interface2.id)
|
||||
self.assertEqual(interface1.machine_id, interface2.machine_id)
|
||||
self.assertEqual(NetworkInterface.objects.filter(machine=interface1.machine).count(), 2)
|
||||
|
||||
|
||||
class TestBinaryModel(TestCase):
|
||||
"""Test the Binary model."""
|
||||
@@ -360,6 +424,8 @@ class TestProcessCurrent(TestCase):
|
||||
self.assertEqual(proc.pid, os.getpid())
|
||||
self.assertEqual(proc.status, Process.StatusChoices.RUNNING)
|
||||
self.assertIsNotNone(proc.machine)
|
||||
self.assertIsNotNone(proc.iface)
|
||||
self.assertEqual(proc.iface.machine_id, proc.machine_id)
|
||||
self.assertIsNotNone(proc.started_at)
|
||||
|
||||
def test_process_current_caches(self):
|
||||
@@ -375,6 +441,12 @@ class TestProcessCurrent(TestCase):
|
||||
result = Process._detect_process_type()
|
||||
self.assertEqual(result, Process.TypeChoices.ORCHESTRATOR)
|
||||
|
||||
def test_process_detect_type_runner_watch(self):
|
||||
"""runner_watch should be classified as a worker, not the orchestrator itself."""
|
||||
with patch('sys.argv', ['archivebox', 'manage', 'runner_watch', '--pidfile=/tmp/runserver.pid']):
|
||||
result = Process._detect_process_type()
|
||||
self.assertEqual(result, Process.TypeChoices.WORKER)
|
||||
|
||||
def test_process_detect_type_cli(self):
|
||||
"""_detect_process_type should detect CLI commands."""
|
||||
with patch('sys.argv', ['archivebox', 'add', 'http://example.com']):
|
||||
@@ -387,6 +459,27 @@ class TestProcessCurrent(TestCase):
|
||||
result = Process._detect_process_type()
|
||||
self.assertEqual(result, Process.TypeChoices.BINARY)
|
||||
|
||||
def test_process_proc_allows_interpreter_wrapped_script(self):
|
||||
"""Process.proc should accept a script recorded in DB when wrapped by an interpreter in psutil."""
|
||||
proc = Process.objects.create(
|
||||
machine=Machine.current(),
|
||||
cmd=['/tmp/on_Crawl__90_chrome_launch.daemon.bg.js', '--url=https://example.com/'],
|
||||
pid=12345,
|
||||
status=Process.StatusChoices.RUNNING,
|
||||
started_at=timezone.now(),
|
||||
)
|
||||
|
||||
os_proc = Mock()
|
||||
os_proc.create_time.return_value = proc.started_at.timestamp()
|
||||
os_proc.cmdline.return_value = [
|
||||
'node',
|
||||
'/tmp/on_Crawl__90_chrome_launch.daemon.bg.js',
|
||||
'--url=https://example.com/',
|
||||
]
|
||||
|
||||
with patch('archivebox.machine.models.psutil.Process', return_value=os_proc):
|
||||
self.assertIs(proc.proc, os_proc)
|
||||
|
||||
|
||||
class TestProcessHierarchy(TestCase):
|
||||
"""Test Process parent/child relationships."""
|
||||
|
||||
191
archivebox/tests/test_persona_admin.py
Normal file
191
archivebox/tests/test_persona_admin.py
Normal file
@@ -0,0 +1,191 @@
|
||||
import pytest
|
||||
from typing import cast
|
||||
|
||||
from django.contrib.auth import get_user_model
|
||||
from django.contrib.auth.models import UserManager
|
||||
from django.urls import reverse
|
||||
|
||||
from archivebox.personas.importers import (
|
||||
PersonaImportResult,
|
||||
discover_persona_template_profiles,
|
||||
import_persona_from_source,
|
||||
resolve_browser_profile_source,
|
||||
resolve_custom_import_source,
|
||||
)
|
||||
|
||||
|
||||
pytestmark = pytest.mark.django_db
|
||||
|
||||
User = get_user_model()
|
||||
ADMIN_HOST = "admin.archivebox.localhost:8000"
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def admin_user(db):
|
||||
return cast(UserManager, User.objects).create_superuser(
|
||||
username="personaadmin",
|
||||
email="personaadmin@test.com",
|
||||
password="testpassword",
|
||||
)
|
||||
|
||||
|
||||
def _make_profile_source(tmp_path):
|
||||
user_data_dir = tmp_path / "Chrome User Data"
|
||||
profile_dir = user_data_dir / "Default"
|
||||
profile_dir.mkdir(parents=True)
|
||||
(profile_dir / "Preferences").write_text("{}")
|
||||
return resolve_browser_profile_source(
|
||||
browser="chrome",
|
||||
user_data_dir=user_data_dir,
|
||||
profile_dir="Default",
|
||||
browser_binary="/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
|
||||
)
|
||||
|
||||
|
||||
def test_resolve_custom_import_source_accepts_exact_profile_dir(tmp_path):
|
||||
user_data_dir = tmp_path / "Brave User Data"
|
||||
profile_dir = user_data_dir / "Profile 2"
|
||||
profile_dir.mkdir(parents=True)
|
||||
(profile_dir / "Preferences").write_text("{}")
|
||||
|
||||
source = resolve_custom_import_source(str(profile_dir))
|
||||
|
||||
assert source.kind == "browser-profile"
|
||||
assert source.user_data_dir == user_data_dir.resolve()
|
||||
assert source.profile_dir == "Profile 2"
|
||||
|
||||
|
||||
def test_resolve_custom_import_source_accepts_cdp_url():
|
||||
source = resolve_custom_import_source("ws://127.0.0.1:9222/devtools/browser/test-session")
|
||||
|
||||
assert source.kind == "cdp"
|
||||
assert source.cdp_url == "ws://127.0.0.1:9222/devtools/browser/test-session"
|
||||
|
||||
|
||||
def test_discover_persona_template_profiles_finds_chrome_profile_dirs(tmp_path):
|
||||
personas_dir = tmp_path / "personas"
|
||||
chrome_profile = personas_dir / "ExistingPersona" / "chrome_profile"
|
||||
default_profile = chrome_profile / "Default"
|
||||
default_profile.mkdir(parents=True)
|
||||
(default_profile / "Preferences").write_text("{}")
|
||||
|
||||
discovered = discover_persona_template_profiles(personas_dir=personas_dir)
|
||||
|
||||
assert len(discovered) == 1
|
||||
assert discovered[0].browser == "persona"
|
||||
assert discovered[0].source_name == "ExistingPersona"
|
||||
assert discovered[0].profile_dir == "Default"
|
||||
assert discovered[0].user_data_dir == chrome_profile.resolve()
|
||||
|
||||
|
||||
def test_discover_persona_template_profiles_finds_home_abx_personas(monkeypatch, tmp_path):
|
||||
from archivebox.config.constants import CONSTANTS
|
||||
|
||||
monkeypatch.setattr(CONSTANTS, "PERSONAS_DIR", tmp_path / "missing-data-personas")
|
||||
monkeypatch.setattr("archivebox.personas.importers.Path.home", lambda: tmp_path)
|
||||
|
||||
chrome_profile = tmp_path / ".config" / "abx" / "personas" / "HomePersona" / "chrome_profile"
|
||||
default_profile = chrome_profile / "Default"
|
||||
default_profile.mkdir(parents=True)
|
||||
(default_profile / "Preferences").write_text("{}")
|
||||
|
||||
discovered = discover_persona_template_profiles()
|
||||
|
||||
assert len(discovered) == 1
|
||||
assert discovered[0].browser == "persona"
|
||||
assert discovered[0].source_name == "HomePersona"
|
||||
assert discovered[0].profile_dir == "Default"
|
||||
assert discovered[0].user_data_dir == chrome_profile.resolve()
|
||||
|
||||
|
||||
def test_persona_admin_add_view_renders_import_ui(client, admin_user, monkeypatch, tmp_path):
|
||||
source = _make_profile_source(tmp_path)
|
||||
monkeypatch.setattr("archivebox.personas.forms.discover_local_browser_profiles", lambda: [source])
|
||||
monkeypatch.setattr("archivebox.personas.admin.discover_local_browser_profiles", lambda: [source])
|
||||
|
||||
client.login(username="personaadmin", password="testpassword")
|
||||
response = client.get(reverse("admin:personas_persona_add"), HTTP_HOST=ADMIN_HOST)
|
||||
|
||||
assert response.status_code == 200
|
||||
assert b"Bootstrap a persona from a real browser session" in response.content
|
||||
assert b"Google Chrome / Default" in response.content
|
||||
assert b"auth.json" in response.content
|
||||
|
||||
|
||||
def test_import_persona_from_source_copies_user_agent_to_persona_config(admin_user, monkeypatch, tmp_path):
|
||||
from archivebox.personas.models import Persona
|
||||
|
||||
source = _make_profile_source(tmp_path)
|
||||
persona = Persona.objects.create(name="AgentPersona", created_by=admin_user)
|
||||
|
||||
def fake_export_browser_state(**kwargs):
|
||||
return True, {"user_agent": "Mozilla/5.0 Test Imported UA"}, "ok"
|
||||
|
||||
monkeypatch.setattr("archivebox.personas.importers.export_browser_state", fake_export_browser_state)
|
||||
|
||||
result = import_persona_from_source(
|
||||
persona,
|
||||
source,
|
||||
copy_profile=False,
|
||||
import_cookies=False,
|
||||
capture_storage=False,
|
||||
)
|
||||
|
||||
persona.refresh_from_db()
|
||||
assert result.user_agent_imported is True
|
||||
assert persona.config["USER_AGENT"] == "Mozilla/5.0 Test Imported UA"
|
||||
|
||||
|
||||
def test_persona_admin_add_post_runs_shared_importer(client, admin_user, monkeypatch, tmp_path):
|
||||
from archivebox.personas.models import Persona
|
||||
|
||||
source = _make_profile_source(tmp_path)
|
||||
monkeypatch.setattr("archivebox.personas.forms.discover_local_browser_profiles", lambda: [source])
|
||||
monkeypatch.setattr("archivebox.personas.admin.discover_local_browser_profiles", lambda: [source])
|
||||
|
||||
calls = {}
|
||||
|
||||
def fake_import(persona, selected_source, **kwargs):
|
||||
calls["persona_name"] = persona.name
|
||||
calls["source"] = selected_source
|
||||
calls["kwargs"] = kwargs
|
||||
(persona.path / "cookies.txt").parent.mkdir(parents=True, exist_ok=True)
|
||||
(persona.path / "cookies.txt").write_text("# Netscape HTTP Cookie File\n")
|
||||
(persona.path / "auth.json").write_text('{"TYPE":"auth","cookies":[],"localStorage":{},"sessionStorage":{}}\n')
|
||||
return PersonaImportResult(
|
||||
source=selected_source,
|
||||
profile_copied=True,
|
||||
cookies_imported=True,
|
||||
storage_captured=True,
|
||||
)
|
||||
|
||||
monkeypatch.setattr("archivebox.personas.forms.import_persona_from_source", fake_import)
|
||||
|
||||
client.login(username="personaadmin", password="testpassword")
|
||||
response = client.post(
|
||||
reverse("admin:personas_persona_add"),
|
||||
{
|
||||
"name": "ImportedPersona",
|
||||
"created_by": str(admin_user.pk),
|
||||
"config": "{}",
|
||||
"import_mode": "discovered",
|
||||
"import_discovered_profile": source.choice_value,
|
||||
"import_copy_profile": "on",
|
||||
"import_extract_cookies": "on",
|
||||
"import_capture_storage": "on",
|
||||
"_save": "Save",
|
||||
},
|
||||
HTTP_HOST=ADMIN_HOST,
|
||||
)
|
||||
|
||||
assert response.status_code == 302
|
||||
persona = Persona.objects.get(name="ImportedPersona")
|
||||
assert calls["persona_name"] == "ImportedPersona"
|
||||
assert calls["source"].profile_dir == "Default"
|
||||
assert calls["kwargs"] == {
|
||||
"copy_profile": True,
|
||||
"import_cookies": True,
|
||||
"capture_storage": True,
|
||||
}
|
||||
assert persona.COOKIES_FILE.endswith("cookies.txt")
|
||||
assert persona.AUTH_STORAGE_FILE.endswith("auth.json")
|
||||
640
archivebox/tests/test_runner.py
Normal file
640
archivebox/tests/test_runner.py
Normal file
@@ -0,0 +1,640 @@
|
||||
import asyncio
|
||||
import subprocess
|
||||
from types import SimpleNamespace
|
||||
|
||||
import pytest
|
||||
from django.test import RequestFactory
|
||||
|
||||
|
||||
pytestmark = pytest.mark.django_db
|
||||
|
||||
|
||||
class _DummyBus:
|
||||
def __init__(self, name: str):
|
||||
self.name = name
|
||||
|
||||
async def stop(self):
|
||||
return None
|
||||
|
||||
|
||||
class _DummyService:
|
||||
def __init__(self, *args, **kwargs):
|
||||
pass
|
||||
|
||||
|
||||
class _DummyAbxServices:
|
||||
def __init__(self):
|
||||
self.process = SimpleNamespace(wait_for_background_monitors=self._wait)
|
||||
|
||||
async def _wait(self):
|
||||
return None
|
||||
|
||||
|
||||
async def _call_sync(func, *args, **kwargs):
|
||||
return func(*args, **kwargs)
|
||||
|
||||
|
||||
def test_run_snapshot_uses_isolated_bus_per_snapshot(monkeypatch):
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
from archivebox.crawls.models import Crawl
|
||||
from archivebox.core.models import Snapshot
|
||||
from archivebox.services import runner as runner_module
|
||||
|
||||
crawl = Crawl.objects.create(
|
||||
urls='https://blog.sweeting.me\nhttps://sweeting.me',
|
||||
created_by_id=get_or_create_system_user_pk(),
|
||||
)
|
||||
snapshot_a = Snapshot.objects.create(
|
||||
url='https://blog.sweeting.me',
|
||||
crawl=crawl,
|
||||
status=Snapshot.StatusChoices.QUEUED,
|
||||
)
|
||||
snapshot_b = Snapshot.objects.create(
|
||||
url='https://sweeting.me',
|
||||
crawl=crawl,
|
||||
status=Snapshot.StatusChoices.QUEUED,
|
||||
)
|
||||
|
||||
created_buses: list[_DummyBus] = []
|
||||
|
||||
def fake_create_bus(*, name, total_timeout=3600.0, **kwargs):
|
||||
bus = _DummyBus(name)
|
||||
created_buses.append(bus)
|
||||
return bus
|
||||
|
||||
monkeypatch.setattr(runner_module, 'create_bus', fake_create_bus)
|
||||
monkeypatch.setattr(runner_module, 'discover_plugins', lambda: {})
|
||||
monkeypatch.setattr(runner_module, 'ProcessService', _DummyService)
|
||||
monkeypatch.setattr(runner_module, 'MachineService', _DummyService)
|
||||
monkeypatch.setattr(runner_module, 'BinaryService', _DummyService)
|
||||
monkeypatch.setattr(runner_module, 'TagService', _DummyService)
|
||||
monkeypatch.setattr(runner_module, 'CrawlService', _DummyService)
|
||||
monkeypatch.setattr(runner_module, 'SnapshotService', _DummyService)
|
||||
monkeypatch.setattr(runner_module, 'ArchiveResultService', _DummyService)
|
||||
monkeypatch.setattr(runner_module, 'setup_abx_services', lambda *args, **kwargs: _DummyAbxServices())
|
||||
|
||||
download_calls = []
|
||||
|
||||
async def fake_download(*, url, bus, config_overrides, snapshot, **kwargs):
|
||||
download_calls.append(
|
||||
{
|
||||
'url': url,
|
||||
'bus': bus,
|
||||
'snapshot_id': config_overrides['SNAPSHOT_ID'],
|
||||
'source_url': config_overrides['SOURCE_URL'],
|
||||
'abx_snapshot_id': snapshot.id,
|
||||
}
|
||||
)
|
||||
await asyncio.sleep(0)
|
||||
return []
|
||||
|
||||
monkeypatch.setattr(runner_module, 'download', fake_download)
|
||||
|
||||
crawl_runner = runner_module.CrawlRunner(crawl)
|
||||
snapshot_data = {
|
||||
str(snapshot_a.id): {
|
||||
'id': str(snapshot_a.id),
|
||||
'url': snapshot_a.url,
|
||||
'title': snapshot_a.title,
|
||||
'timestamp': snapshot_a.timestamp,
|
||||
'bookmarked_at': snapshot_a.bookmarked_at.isoformat() if snapshot_a.bookmarked_at else "",
|
||||
'created_at': snapshot_a.created_at.isoformat() if snapshot_a.created_at else "",
|
||||
'tags': snapshot_a.tags_str(),
|
||||
'depth': snapshot_a.depth,
|
||||
'parent_snapshot_id': str(snapshot_a.parent_snapshot_id) if snapshot_a.parent_snapshot_id else None,
|
||||
'output_dir': str(snapshot_a.output_dir),
|
||||
'config': crawl_runner._snapshot_config(snapshot_a),
|
||||
},
|
||||
str(snapshot_b.id): {
|
||||
'id': str(snapshot_b.id),
|
||||
'url': snapshot_b.url,
|
||||
'title': snapshot_b.title,
|
||||
'timestamp': snapshot_b.timestamp,
|
||||
'bookmarked_at': snapshot_b.bookmarked_at.isoformat() if snapshot_b.bookmarked_at else "",
|
||||
'created_at': snapshot_b.created_at.isoformat() if snapshot_b.created_at else "",
|
||||
'tags': snapshot_b.tags_str(),
|
||||
'depth': snapshot_b.depth,
|
||||
'parent_snapshot_id': str(snapshot_b.parent_snapshot_id) if snapshot_b.parent_snapshot_id else None,
|
||||
'output_dir': str(snapshot_b.output_dir),
|
||||
'config': crawl_runner._snapshot_config(snapshot_b),
|
||||
},
|
||||
}
|
||||
monkeypatch.setattr(crawl_runner, '_load_snapshot_run_data', lambda snapshot_id: snapshot_data[snapshot_id])
|
||||
|
||||
async def run_both():
|
||||
await asyncio.gather(
|
||||
crawl_runner._run_snapshot(str(snapshot_a.id)),
|
||||
crawl_runner._run_snapshot(str(snapshot_b.id)),
|
||||
)
|
||||
|
||||
asyncio.run(run_both())
|
||||
|
||||
assert len(download_calls) == 2
|
||||
assert {call['snapshot_id'] for call in download_calls} == {str(snapshot_a.id), str(snapshot_b.id)}
|
||||
assert {call['source_url'] for call in download_calls} == {snapshot_a.url, snapshot_b.url}
|
||||
assert len({id(call['bus']) for call in download_calls}) == 2
|
||||
assert len(created_buses) == 3 # 1 crawl bus + 2 isolated snapshot buses
|
||||
|
||||
|
||||
def test_ensure_background_runner_starts_when_none_running(monkeypatch):
|
||||
import archivebox.machine.models as machine_models
|
||||
from archivebox.services import runner as runner_module
|
||||
|
||||
popen_calls = []
|
||||
|
||||
class DummyPopen:
|
||||
def __init__(self, args, **kwargs):
|
||||
popen_calls.append((args, kwargs))
|
||||
|
||||
monkeypatch.setattr(machine_models.Process, 'cleanup_stale_running', classmethod(lambda cls, machine=None: 0))
|
||||
monkeypatch.setattr(machine_models.Machine, 'current', classmethod(lambda cls: SimpleNamespace(id='machine-1')))
|
||||
monkeypatch.setattr(
|
||||
machine_models.Process.objects,
|
||||
'filter',
|
||||
lambda **kwargs: SimpleNamespace(exists=lambda: False),
|
||||
)
|
||||
monkeypatch.setattr(runner_module.subprocess, 'Popen', DummyPopen)
|
||||
|
||||
started = runner_module.ensure_background_runner(allow_under_pytest=True)
|
||||
|
||||
assert started is True
|
||||
assert len(popen_calls) == 1
|
||||
assert popen_calls[0][0] == [runner_module.sys.executable, '-m', 'archivebox', 'run', '--daemon']
|
||||
assert popen_calls[0][1]['stdin'] is subprocess.DEVNULL
|
||||
|
||||
|
||||
def test_ensure_background_runner_skips_when_orchestrator_running(monkeypatch):
|
||||
import archivebox.machine.models as machine_models
|
||||
from archivebox.services import runner as runner_module
|
||||
|
||||
monkeypatch.setattr(machine_models.Process, 'cleanup_stale_running', classmethod(lambda cls, machine=None: 0))
|
||||
monkeypatch.setattr(machine_models.Machine, 'current', classmethod(lambda cls: SimpleNamespace(id='machine-1')))
|
||||
monkeypatch.setattr(
|
||||
machine_models.Process.objects,
|
||||
'filter',
|
||||
lambda **kwargs: SimpleNamespace(exists=lambda: True),
|
||||
)
|
||||
monkeypatch.setattr(
|
||||
runner_module.subprocess,
|
||||
'Popen',
|
||||
lambda *args, **kwargs: (_ for _ in ()).throw(AssertionError('runner should not be spawned')),
|
||||
)
|
||||
|
||||
started = runner_module.ensure_background_runner(allow_under_pytest=True)
|
||||
|
||||
assert started is False
|
||||
|
||||
|
||||
def test_runner_prepare_refreshes_network_interface_and_attaches_current_process(monkeypatch):
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
from archivebox.crawls.models import Crawl
|
||||
from archivebox.services import runner as runner_module
|
||||
|
||||
crawl = Crawl.objects.create(
|
||||
urls='https://example.com',
|
||||
created_by_id=get_or_create_system_user_pk(),
|
||||
)
|
||||
|
||||
class _Iface:
|
||||
id = 'iface-1'
|
||||
machine = SimpleNamespace(id='machine-1')
|
||||
machine_id = 'machine-1'
|
||||
|
||||
saved_updates = []
|
||||
|
||||
class _Proc:
|
||||
iface_id = None
|
||||
machine_id = 'machine-1'
|
||||
iface = None
|
||||
machine = None
|
||||
|
||||
def save(self, *, update_fields):
|
||||
saved_updates.append(tuple(update_fields))
|
||||
|
||||
proc = _Proc()
|
||||
|
||||
monkeypatch.setattr(runner_module, 'discover_plugins', lambda: {})
|
||||
monkeypatch.setattr(runner_module, 'create_bus', lambda **kwargs: _DummyBus(kwargs['name']))
|
||||
monkeypatch.setattr(runner_module, 'ProcessService', _DummyService)
|
||||
monkeypatch.setattr(runner_module, 'MachineService', _DummyService)
|
||||
monkeypatch.setattr(runner_module, 'BinaryService', _DummyService)
|
||||
monkeypatch.setattr(runner_module, 'TagService', _DummyService)
|
||||
monkeypatch.setattr(runner_module, 'CrawlService', _DummyService)
|
||||
monkeypatch.setattr(runner_module, 'SnapshotService', _DummyService)
|
||||
monkeypatch.setattr(runner_module, 'ArchiveResultService', _DummyService)
|
||||
|
||||
from archivebox.machine.models import NetworkInterface, Process
|
||||
from archivebox.config import configset as configset_module
|
||||
|
||||
refresh_calls = []
|
||||
monkeypatch.setattr(NetworkInterface, 'current', classmethod(lambda cls, refresh=False: refresh_calls.append(refresh) or _Iface()))
|
||||
monkeypatch.setattr(Process, 'current', classmethod(lambda cls: proc))
|
||||
monkeypatch.setattr(configset_module, 'get_config', lambda **kwargs: {})
|
||||
|
||||
crawl_runner = runner_module.CrawlRunner(crawl)
|
||||
crawl_runner._prepare()
|
||||
|
||||
assert refresh_calls == [True]
|
||||
assert proc.iface is not None
|
||||
assert proc.machine == proc.iface.machine
|
||||
assert saved_updates == [('iface', 'machine', 'modified_at')]
|
||||
|
||||
|
||||
def test_create_crawl_api_queues_crawl_without_spawning_runner(monkeypatch):
|
||||
from django.contrib.auth import get_user_model
|
||||
from archivebox.api.v1_crawls import CrawlCreateSchema, create_crawl
|
||||
|
||||
user = get_user_model().objects.create_superuser(
|
||||
username='runner-api-admin',
|
||||
email='runner-api-admin@example.com',
|
||||
password='testpassword',
|
||||
)
|
||||
request = RequestFactory().post('/api/v1/crawls')
|
||||
request.user = user
|
||||
|
||||
crawl = create_crawl(
|
||||
request,
|
||||
CrawlCreateSchema(
|
||||
urls=['https://example.com'],
|
||||
max_depth=0,
|
||||
tags=[],
|
||||
tags_str='',
|
||||
label='',
|
||||
notes='',
|
||||
config={},
|
||||
),
|
||||
)
|
||||
|
||||
assert str(crawl.id)
|
||||
assert crawl.status == 'queued'
|
||||
assert crawl.retry_at is not None
|
||||
|
||||
|
||||
def test_crawl_runner_does_not_seal_unfinished_crawl(monkeypatch):
|
||||
import asgiref.sync
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
from archivebox.crawls.models import Crawl
|
||||
from archivebox.core.models import Snapshot
|
||||
from archivebox.services import runner as runner_module
|
||||
|
||||
crawl = Crawl.objects.create(
|
||||
urls='https://example.com',
|
||||
created_by_id=get_or_create_system_user_pk(),
|
||||
status=Crawl.StatusChoices.STARTED,
|
||||
)
|
||||
snapshot = Snapshot.objects.create(
|
||||
url='https://example.com',
|
||||
crawl=crawl,
|
||||
status=Snapshot.StatusChoices.STARTED,
|
||||
)
|
||||
|
||||
monkeypatch.setattr(runner_module, '_attach_bus_trace', lambda bus: None)
|
||||
monkeypatch.setattr(runner_module, '_stop_bus_trace', lambda bus: asyncio.sleep(0))
|
||||
monkeypatch.setattr(runner_module, 'setup_abx_services', lambda *args, **kwargs: _DummyAbxServices())
|
||||
monkeypatch.setenv("DJANGO_ALLOW_ASYNC_UNSAFE", "true")
|
||||
monkeypatch.setattr(
|
||||
asgiref.sync,
|
||||
'sync_to_async',
|
||||
lambda func, thread_sensitive=True: (lambda *args, **kwargs: _call_sync(func, *args, **kwargs)),
|
||||
)
|
||||
monkeypatch.setattr(Crawl.objects, 'get', lambda id: crawl)
|
||||
monkeypatch.setattr(crawl, 'is_finished', lambda: False)
|
||||
monkeypatch.setattr(crawl, 'save', lambda *args, **kwargs: None)
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, '_prepare', lambda self: None)
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, '_create_live_ui', lambda self: None)
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, '_initial_snapshot_ids', lambda self: [str(snapshot.id)])
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, '_run_crawl_setup', lambda self, snapshot_id: asyncio.sleep(0))
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, 'enqueue_snapshot', lambda self, snapshot_id: asyncio.sleep(0))
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, '_wait_for_snapshot_tasks', lambda self: asyncio.sleep(0))
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, '_run_crawl_cleanup', lambda self, snapshot_id: asyncio.sleep(0))
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, '_cleanup_persona', lambda self: None)
|
||||
|
||||
asyncio.run(runner_module.CrawlRunner(crawl, snapshot_ids=[str(snapshot.id)]).run())
|
||||
|
||||
assert crawl.status != Crawl.StatusChoices.SEALED
|
||||
assert crawl.retry_at is not None
|
||||
|
||||
|
||||
def test_crawl_runner_finalizes_with_sync_to_async_for_is_finished(monkeypatch):
|
||||
import asgiref.sync
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
from archivebox.crawls.models import Crawl
|
||||
from archivebox.core.models import Snapshot
|
||||
from archivebox.services import runner as runner_module
|
||||
|
||||
crawl = Crawl.objects.create(
|
||||
urls='https://example.com',
|
||||
created_by_id=get_or_create_system_user_pk(),
|
||||
status=Crawl.StatusChoices.STARTED,
|
||||
)
|
||||
snapshot = Snapshot.objects.create(
|
||||
url='https://example.com',
|
||||
crawl=crawl,
|
||||
status=Snapshot.StatusChoices.STARTED,
|
||||
)
|
||||
|
||||
monkeypatch.setattr(runner_module, 'create_bus', lambda *args, **kwargs: _DummyBus('runner'))
|
||||
monkeypatch.setattr(runner_module, 'discover_plugins', lambda: {})
|
||||
monkeypatch.setattr(runner_module, 'ProcessService', _DummyService)
|
||||
monkeypatch.setattr(runner_module, 'MachineService', _DummyService)
|
||||
monkeypatch.setattr(runner_module, 'BinaryService', _DummyService)
|
||||
monkeypatch.setattr(runner_module, 'TagService', _DummyService)
|
||||
monkeypatch.setattr(runner_module, 'CrawlService', _DummyService)
|
||||
monkeypatch.setattr(runner_module, 'SnapshotService', _DummyService)
|
||||
monkeypatch.setattr(runner_module, 'ArchiveResultService', _DummyService)
|
||||
monkeypatch.setattr(runner_module, '_attach_bus_trace', lambda bus: None)
|
||||
monkeypatch.setattr(runner_module, '_stop_bus_trace', lambda bus: asyncio.sleep(0))
|
||||
monkeypatch.setattr(runner_module, 'setup_abx_services', lambda *args, **kwargs: _DummyAbxServices())
|
||||
monkeypatch.setattr(Crawl.objects, 'get', lambda id: crawl)
|
||||
monkeypatch.setattr(crawl, 'save', lambda *args, **kwargs: None)
|
||||
monkeypatch.setattr(crawl, 'cleanup', lambda: None)
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, '_prepare', lambda self: None)
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, '_create_live_ui', lambda self: None)
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, '_initial_snapshot_ids', lambda self: [str(snapshot.id)])
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, '_run_crawl_setup', lambda self, snapshot_id: asyncio.sleep(0))
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, 'enqueue_snapshot', lambda self, snapshot_id: asyncio.sleep(0))
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, '_wait_for_snapshot_tasks', lambda self: asyncio.sleep(0))
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, '_run_crawl_cleanup', lambda self, snapshot_id: asyncio.sleep(0))
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, '_cleanup_persona', lambda self: None)
|
||||
|
||||
sync_to_async_wrapped: list[str] = []
|
||||
sync_to_async_active = False
|
||||
|
||||
def fake_sync_to_async(func, thread_sensitive=True):
|
||||
async def wrapper(*args, **kwargs):
|
||||
nonlocal sync_to_async_active
|
||||
sync_to_async_wrapped.append(getattr(func, '__name__', repr(func)))
|
||||
previous = sync_to_async_active
|
||||
sync_to_async_active = True
|
||||
try:
|
||||
return func(*args, **kwargs)
|
||||
finally:
|
||||
sync_to_async_active = previous
|
||||
return wrapper
|
||||
|
||||
def guarded_is_finished():
|
||||
assert sync_to_async_active is True
|
||||
return False
|
||||
|
||||
monkeypatch.setattr(asgiref.sync, 'sync_to_async', fake_sync_to_async)
|
||||
monkeypatch.setattr(crawl, 'is_finished', guarded_is_finished)
|
||||
|
||||
asyncio.run(runner_module.CrawlRunner(crawl, snapshot_ids=[str(snapshot.id)]).run())
|
||||
|
||||
crawl.refresh_from_db()
|
||||
assert crawl.status == Crawl.StatusChoices.STARTED
|
||||
assert crawl.retry_at is not None
|
||||
assert 'guarded_is_finished' in sync_to_async_wrapped
|
||||
|
||||
|
||||
def test_wait_for_snapshot_tasks_surfaces_already_failed_task():
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
from archivebox.crawls.models import Crawl
|
||||
from archivebox.services import runner as runner_module
|
||||
|
||||
crawl = Crawl.objects.create(
|
||||
urls='https://example.com',
|
||||
created_by_id=get_or_create_system_user_pk(),
|
||||
)
|
||||
crawl_runner = runner_module.CrawlRunner(crawl)
|
||||
|
||||
async def run_test():
|
||||
task = asyncio.get_running_loop().create_future()
|
||||
task.set_exception(RuntimeError('snapshot failed'))
|
||||
crawl_runner.snapshot_tasks['snap-1'] = task
|
||||
with pytest.raises(RuntimeError, match='snapshot failed'):
|
||||
await crawl_runner._wait_for_snapshot_tasks()
|
||||
|
||||
asyncio.run(run_test())
|
||||
|
||||
|
||||
def test_wait_for_snapshot_tasks_returns_after_completed_tasks_are_pruned():
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
from archivebox.crawls.models import Crawl
|
||||
from archivebox.services import runner as runner_module
|
||||
|
||||
crawl = Crawl.objects.create(
|
||||
urls='https://example.com',
|
||||
created_by_id=get_or_create_system_user_pk(),
|
||||
)
|
||||
crawl_runner = runner_module.CrawlRunner(crawl)
|
||||
|
||||
async def finish_snapshot() -> None:
|
||||
await asyncio.sleep(0)
|
||||
|
||||
async def run_test():
|
||||
task = asyncio.create_task(finish_snapshot())
|
||||
crawl_runner.snapshot_tasks['snap-1'] = task
|
||||
await asyncio.wait_for(crawl_runner._wait_for_snapshot_tasks(), timeout=0.5)
|
||||
assert crawl_runner.snapshot_tasks == {}
|
||||
|
||||
asyncio.run(run_test())
|
||||
|
||||
|
||||
def test_crawl_runner_calls_crawl_cleanup_after_snapshot_phase(monkeypatch):
|
||||
import asgiref.sync
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
from archivebox.crawls.models import Crawl
|
||||
from archivebox.core.models import Snapshot
|
||||
from archivebox.services import runner as runner_module
|
||||
|
||||
crawl = Crawl.objects.create(
|
||||
urls='https://example.com',
|
||||
created_by_id=get_or_create_system_user_pk(),
|
||||
status=Crawl.StatusChoices.STARTED,
|
||||
)
|
||||
snapshot = Snapshot.objects.create(
|
||||
url='https://example.com',
|
||||
crawl=crawl,
|
||||
status=Snapshot.StatusChoices.STARTED,
|
||||
)
|
||||
|
||||
monkeypatch.setattr(runner_module, '_attach_bus_trace', lambda bus: None)
|
||||
monkeypatch.setattr(runner_module, '_stop_bus_trace', lambda bus: asyncio.sleep(0))
|
||||
monkeypatch.setattr(runner_module, 'setup_abx_services', lambda *args, **kwargs: _DummyAbxServices())
|
||||
monkeypatch.setenv("DJANGO_ALLOW_ASYNC_UNSAFE", "true")
|
||||
monkeypatch.setattr(
|
||||
asgiref.sync,
|
||||
'sync_to_async',
|
||||
lambda func, thread_sensitive=True: (lambda *args, **kwargs: _call_sync(func, *args, **kwargs)),
|
||||
)
|
||||
monkeypatch.setattr(Crawl.objects, 'get', lambda id: crawl)
|
||||
monkeypatch.setattr(crawl, 'is_finished', lambda: False)
|
||||
monkeypatch.setattr(crawl, 'save', lambda *args, **kwargs: None)
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, '_prepare', lambda self: None)
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, '_create_live_ui', lambda self: None)
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, '_initial_snapshot_ids', lambda self: [str(snapshot.id)])
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, '_run_crawl_setup', lambda self, snapshot_id: asyncio.sleep(0))
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, 'enqueue_snapshot', lambda self, snapshot_id: asyncio.sleep(0))
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, '_wait_for_snapshot_tasks', lambda self: asyncio.sleep(0))
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, '_cleanup_persona', lambda self: None)
|
||||
|
||||
cleanup_calls = []
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, '_run_crawl_cleanup', lambda self, snapshot_id: cleanup_calls.append('abx_cleanup') or asyncio.sleep(0))
|
||||
monkeypatch.setattr(crawl, 'cleanup', lambda: cleanup_calls.append('crawl_cleanup'))
|
||||
|
||||
asyncio.run(runner_module.CrawlRunner(crawl, snapshot_ids=[str(snapshot.id)]).run())
|
||||
|
||||
assert cleanup_calls == ['crawl_cleanup', 'abx_cleanup']
|
||||
|
||||
|
||||
def test_abx_process_service_background_monitor_finishes_after_process_exit(monkeypatch, tmp_path):
|
||||
from abx_dl.models import Process as AbxProcess, now_iso
|
||||
from abx_dl.services.process_service import ProcessService
|
||||
from abx_dl.events import ProcessCompletedEvent
|
||||
|
||||
service = object.__new__(ProcessService)
|
||||
service.emit_jsonl = False
|
||||
emitted_events = []
|
||||
|
||||
async def fake_emit_event(event, *, detach_from_parent):
|
||||
emitted_events.append((event, detach_from_parent))
|
||||
|
||||
async def fake_stream_stdout(**kwargs):
|
||||
try:
|
||||
await asyncio.Event().wait()
|
||||
except asyncio.CancelledError:
|
||||
return ["daemon output\n"]
|
||||
|
||||
service._emit_event = fake_emit_event
|
||||
monkeypatch.setattr(service, '_stream_stdout', fake_stream_stdout)
|
||||
|
||||
class FakeAsyncProcess:
|
||||
def __init__(self):
|
||||
self.pid = 42424
|
||||
self.returncode = None
|
||||
|
||||
async def wait(self):
|
||||
await asyncio.sleep(0)
|
||||
self.returncode = 0
|
||||
return 0
|
||||
|
||||
plugin_output_dir = tmp_path / 'chrome'
|
||||
plugin_output_dir.mkdir()
|
||||
stdout_file = plugin_output_dir / 'on_Crawl__90_chrome_launch.daemon.bg.stdout.log'
|
||||
stderr_file = plugin_output_dir / 'on_Crawl__90_chrome_launch.daemon.bg.stderr.log'
|
||||
stderr_file.write_text('')
|
||||
pid_file = plugin_output_dir / 'on_Crawl__90_chrome_launch.daemon.bg.pid'
|
||||
pid_file.write_text('12345')
|
||||
|
||||
proc = AbxProcess(
|
||||
cmd=['hook'],
|
||||
pwd=str(plugin_output_dir),
|
||||
timeout=60,
|
||||
started_at=now_iso(),
|
||||
plugin='chrome',
|
||||
hook_name='on_Crawl__90_chrome_launch.daemon.bg',
|
||||
)
|
||||
process = FakeAsyncProcess()
|
||||
event = SimpleNamespace(
|
||||
plugin_name='chrome',
|
||||
hook_name='on_Crawl__90_chrome_launch.daemon.bg',
|
||||
hook_path='hook',
|
||||
hook_args=['--url=https://example.org/'],
|
||||
env={},
|
||||
output_dir=str(plugin_output_dir),
|
||||
timeout=60,
|
||||
snapshot_id='snap-1',
|
||||
is_background=True,
|
||||
)
|
||||
|
||||
async def run_test():
|
||||
await asyncio.wait_for(
|
||||
service._monitor_background_process(
|
||||
event=event,
|
||||
proc=proc,
|
||||
process=process,
|
||||
plugin_output_dir=plugin_output_dir,
|
||||
stdout_file=stdout_file,
|
||||
stderr_file=stderr_file,
|
||||
pid_file=pid_file,
|
||||
files_before=set(),
|
||||
),
|
||||
timeout=0.5,
|
||||
)
|
||||
|
||||
asyncio.run(run_test())
|
||||
|
||||
assert pid_file.exists() is False
|
||||
assert any(isinstance(event, ProcessCompletedEvent) for event, _ in emitted_events)
|
||||
|
||||
|
||||
def test_run_pending_crawls_runs_due_snapshot_in_place(monkeypatch):
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
from archivebox.crawls.models import Crawl
|
||||
from archivebox.core.models import Snapshot
|
||||
from archivebox.services import runner as runner_module
|
||||
|
||||
crawl = Crawl.objects.create(
|
||||
urls='https://example.com',
|
||||
created_by_id=get_or_create_system_user_pk(),
|
||||
status=Crawl.StatusChoices.SEALED,
|
||||
)
|
||||
snapshot = Snapshot.objects.create(
|
||||
url='https://example.com',
|
||||
crawl=crawl,
|
||||
status=Snapshot.StatusChoices.QUEUED,
|
||||
retry_at=runner_module.timezone.now(),
|
||||
)
|
||||
|
||||
monkeypatch.setattr(type(snapshot), 'claim_processing_lock', lambda self, lock_seconds=60: True)
|
||||
monkeypatch.setattr(type(crawl), 'claim_processing_lock', lambda self, lock_seconds=60: True)
|
||||
|
||||
run_calls: list[tuple[str, list[str] | None, bool]] = []
|
||||
def fake_run_crawl(crawl_id, snapshot_ids=None, selected_plugins=None, process_discovered_snapshots_inline=True):
|
||||
run_calls.append((crawl_id, snapshot_ids, process_discovered_snapshots_inline))
|
||||
snapshot.status = Snapshot.StatusChoices.SEALED
|
||||
snapshot.retry_at = None
|
||||
snapshot.save(update_fields=['status', 'retry_at', 'modified_at'])
|
||||
|
||||
monkeypatch.setattr(runner_module, 'run_crawl', fake_run_crawl)
|
||||
|
||||
result = runner_module.run_pending_crawls(daemon=False)
|
||||
|
||||
assert result == 0
|
||||
assert run_calls == [(str(crawl.id), [str(snapshot.id)], False)]
|
||||
|
||||
|
||||
def test_run_pending_crawls_prioritizes_new_queued_crawl_before_snapshot_backlog(monkeypatch):
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
from archivebox.crawls.models import Crawl
|
||||
from archivebox.core.models import Snapshot
|
||||
from archivebox.services import runner as runner_module
|
||||
|
||||
older_crawl = Crawl.objects.create(
|
||||
urls='https://older.example.com',
|
||||
created_by_id=get_or_create_system_user_pk(),
|
||||
status=Crawl.StatusChoices.STARTED,
|
||||
)
|
||||
older_snapshot = Snapshot.objects.create(
|
||||
url='https://older.example.com',
|
||||
crawl=older_crawl,
|
||||
status=Snapshot.StatusChoices.QUEUED,
|
||||
retry_at=runner_module.timezone.now(),
|
||||
)
|
||||
newer_crawl = Crawl.objects.create(
|
||||
urls='https://newer.example.com',
|
||||
created_by_id=get_or_create_system_user_pk(),
|
||||
status=Crawl.StatusChoices.QUEUED,
|
||||
retry_at=runner_module.timezone.now(),
|
||||
)
|
||||
|
||||
monkeypatch.setattr(type(older_snapshot), 'claim_processing_lock', lambda self, lock_seconds=60: True)
|
||||
monkeypatch.setattr(type(older_crawl), 'claim_processing_lock', lambda self, lock_seconds=60: True)
|
||||
monkeypatch.setattr(type(newer_crawl), 'claim_processing_lock', lambda self, lock_seconds=60: True)
|
||||
|
||||
run_calls: list[tuple[str, list[str] | None, bool]] = []
|
||||
|
||||
class _StopScheduling(Exception):
|
||||
pass
|
||||
|
||||
def fake_run_crawl(crawl_id, snapshot_ids=None, selected_plugins=None, process_discovered_snapshots_inline=True):
|
||||
run_calls.append((crawl_id, snapshot_ids, process_discovered_snapshots_inline))
|
||||
raise _StopScheduling
|
||||
|
||||
monkeypatch.setattr(runner_module, 'run_crawl', fake_run_crawl)
|
||||
|
||||
with pytest.raises(_StopScheduling):
|
||||
runner_module.run_pending_crawls(daemon=False)
|
||||
|
||||
assert run_calls == [(str(newer_crawl.id), None, False)]
|
||||
205
archivebox/tests/test_tag_admin.py
Normal file
205
archivebox/tests/test_tag_admin.py
Normal file
@@ -0,0 +1,205 @@
|
||||
import json
|
||||
from datetime import datetime
|
||||
from typing import cast
|
||||
|
||||
import pytest
|
||||
from django.contrib.auth import get_user_model
|
||||
from django.contrib.auth.models import UserManager
|
||||
from django.urls import reverse
|
||||
from django.utils import timezone
|
||||
|
||||
|
||||
pytestmark = pytest.mark.django_db
|
||||
|
||||
|
||||
User = get_user_model()
|
||||
ADMIN_HOST = 'admin.archivebox.localhost:8000'
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def admin_user(db):
|
||||
return cast(UserManager, User.objects).create_superuser(
|
||||
username='tagadmin',
|
||||
email='tagadmin@test.com',
|
||||
password='testpassword',
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def api_token(admin_user):
|
||||
from archivebox.api.auth import get_or_create_api_token
|
||||
|
||||
token = get_or_create_api_token(admin_user)
|
||||
assert token is not None
|
||||
return token.token
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def crawl(admin_user):
|
||||
from archivebox.crawls.models import Crawl
|
||||
|
||||
return Crawl.objects.create(
|
||||
urls='https://example.com',
|
||||
created_by=admin_user,
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def tagged_data(crawl, admin_user):
|
||||
from archivebox.core.models import Snapshot, Tag
|
||||
|
||||
tag = Tag.objects.create(name='Alpha Research', created_by=admin_user)
|
||||
first = Snapshot.objects.create(
|
||||
url='https://example.com/one',
|
||||
title='Example One',
|
||||
crawl=crawl,
|
||||
)
|
||||
second = Snapshot.objects.create(
|
||||
url='https://example.com/two',
|
||||
title='Example Two',
|
||||
crawl=crawl,
|
||||
)
|
||||
first.tags.add(tag)
|
||||
second.tags.add(tag)
|
||||
return tag, [first, second]
|
||||
|
||||
|
||||
def test_tag_admin_changelist_renders_custom_ui(client, admin_user, tagged_data):
|
||||
client.login(username='tagadmin', password='testpassword')
|
||||
|
||||
response = client.get(reverse('admin:core_tag_changelist'), HTTP_HOST=ADMIN_HOST)
|
||||
|
||||
assert response.status_code == 200
|
||||
assert b'id="tag-live-search"' in response.content
|
||||
assert b'id="tag-sort-select"' in response.content
|
||||
assert b'id="tag-created-by-select"' in response.content
|
||||
assert b'id="tag-year-select"' in response.content
|
||||
assert b'id="tag-has-snapshots-select"' in response.content
|
||||
assert b'Alpha Research' in response.content
|
||||
assert b'class="tag-card"' in response.content
|
||||
|
||||
|
||||
def test_tag_admin_add_view_renders_similar_tag_reference(client, admin_user):
|
||||
client.login(username='tagadmin', password='testpassword')
|
||||
|
||||
response = client.get(reverse('admin:core_tag_add'), HTTP_HOST=ADMIN_HOST)
|
||||
|
||||
assert response.status_code == 200
|
||||
assert b'Similar Tags' in response.content
|
||||
assert b'data-tag-name-input="1"' in response.content
|
||||
|
||||
|
||||
def test_tag_search_api_returns_card_payload(client, api_token, tagged_data):
|
||||
tag, snapshots = tagged_data
|
||||
|
||||
response = client.get(
|
||||
reverse('api-1:search_tags'),
|
||||
{'q': 'Alpha', 'api_key': api_token},
|
||||
HTTP_HOST=ADMIN_HOST,
|
||||
)
|
||||
|
||||
assert response.status_code == 200
|
||||
payload = response.json()
|
||||
assert payload['sort'] == 'created_desc'
|
||||
assert payload['created_by'] == ''
|
||||
assert payload['year'] == ''
|
||||
assert payload['has_snapshots'] == 'all'
|
||||
assert payload['tags'][0]['id'] == tag.id
|
||||
assert payload['tags'][0]['name'] == 'Alpha Research'
|
||||
assert payload['tags'][0]['num_snapshots'] == 2
|
||||
assert payload['tags'][0]['snapshots'][0]['title'] in {'Example One', 'Example Two'}
|
||||
assert payload['tags'][0]['export_jsonl_url'].endswith(f'/api/v1/core/tag/{tag.id}/snapshots.jsonl')
|
||||
assert payload['tags'][0]['filter_url'].endswith(f'/admin/core/snapshot/?tags__id__exact={tag.id}')
|
||||
assert {snapshot['url'] for snapshot in payload['tags'][0]['snapshots']} == {snap.url for snap in snapshots}
|
||||
|
||||
|
||||
def test_tag_search_api_respects_sort_and_filters(client, api_token, admin_user, crawl, tagged_data):
|
||||
from archivebox.core.models import Snapshot, Tag
|
||||
|
||||
other_user = cast(UserManager, User.objects).create_user(
|
||||
username='tagother',
|
||||
email='tagother@test.com',
|
||||
password='unused',
|
||||
)
|
||||
tag_with_snapshots = tagged_data[0]
|
||||
empty_tag = Tag.objects.create(name='Zulu Empty', created_by=other_user)
|
||||
alpha_tag = Tag.objects.create(name='Alpha Empty', created_by=other_user)
|
||||
Snapshot.objects.create(
|
||||
url='https://example.com/three',
|
||||
title='Example Three',
|
||||
crawl=crawl,
|
||||
).tags.add(alpha_tag)
|
||||
|
||||
Tag.objects.filter(pk=empty_tag.pk).update(created_at=timezone.make_aware(datetime(2024, 1, 1, 12, 0, 0)))
|
||||
Tag.objects.filter(pk=alpha_tag.pk).update(created_at=timezone.make_aware(datetime(2025, 1, 1, 12, 0, 0)))
|
||||
Tag.objects.filter(pk=tag_with_snapshots.pk).update(created_at=timezone.make_aware(datetime(2026, 1, 1, 12, 0, 0)))
|
||||
|
||||
response = client.get(
|
||||
reverse('api-1:search_tags'),
|
||||
{
|
||||
'sort': 'name_desc',
|
||||
'created_by': str(other_user.pk),
|
||||
'year': '2024',
|
||||
'has_snapshots': 'no',
|
||||
'api_key': api_token,
|
||||
},
|
||||
HTTP_HOST=ADMIN_HOST,
|
||||
)
|
||||
|
||||
assert response.status_code == 200
|
||||
payload = response.json()
|
||||
assert payload['sort'] == 'name_desc'
|
||||
assert payload['created_by'] == str(other_user.pk)
|
||||
assert payload['year'] == '2024'
|
||||
assert payload['has_snapshots'] == 'no'
|
||||
assert [tag['name'] for tag in payload['tags']] == ['Zulu Empty']
|
||||
|
||||
|
||||
def test_tag_rename_api_updates_slug(client, api_token, tagged_data):
|
||||
tag, _ = tagged_data
|
||||
|
||||
response = client.post(
|
||||
f"{reverse('api-1:rename_tag', args=[tag.id])}?api_key={api_token}",
|
||||
data=json.dumps({'name': 'Alpha Archive'}),
|
||||
content_type='application/json',
|
||||
HTTP_HOST=ADMIN_HOST,
|
||||
)
|
||||
|
||||
assert response.status_code == 200
|
||||
|
||||
tag.refresh_from_db()
|
||||
assert tag.name == 'Alpha Archive'
|
||||
assert tag.slug == 'alpha-archive'
|
||||
|
||||
|
||||
def test_tag_snapshots_export_returns_jsonl(client, api_token, tagged_data):
|
||||
tag, _ = tagged_data
|
||||
|
||||
response = client.get(
|
||||
reverse('api-1:tag_snapshots_export', args=[tag.id]),
|
||||
{'api_key': api_token},
|
||||
HTTP_HOST=ADMIN_HOST,
|
||||
)
|
||||
|
||||
assert response.status_code == 200
|
||||
assert response['Content-Type'].startswith('application/x-ndjson')
|
||||
assert f'tag-{tag.slug}-snapshots.jsonl' in response['Content-Disposition']
|
||||
body = response.content.decode()
|
||||
assert '"type": "Snapshot"' in body
|
||||
assert '"tags": "Alpha Research"' in body
|
||||
|
||||
|
||||
def test_tag_urls_export_returns_plain_text_urls(client, api_token, tagged_data):
|
||||
tag, snapshots = tagged_data
|
||||
|
||||
response = client.get(
|
||||
reverse('api-1:tag_urls_export', args=[tag.id]),
|
||||
{'api_key': api_token},
|
||||
HTTP_HOST=ADMIN_HOST,
|
||||
)
|
||||
|
||||
assert response.status_code == 200
|
||||
assert response['Content-Type'].startswith('text/plain')
|
||||
assert f'tag-{tag.slug}-urls.txt' in response['Content-Disposition']
|
||||
exported_urls = set(filter(None, response.content.decode().splitlines()))
|
||||
assert exported_urls == {snapshot.url for snapshot in snapshots}
|
||||
@@ -55,6 +55,7 @@ def _build_script(body: str) -> str:
|
||||
get_admin_host,
|
||||
get_api_host,
|
||||
get_web_host,
|
||||
get_public_host,
|
||||
get_snapshot_host,
|
||||
get_original_host,
|
||||
get_listen_subdomain,
|
||||
@@ -198,6 +199,7 @@ class TestUrlRouting:
|
||||
web_host = get_web_host()
|
||||
admin_host = get_admin_host()
|
||||
api_host = get_api_host()
|
||||
public_host = get_public_host()
|
||||
snapshot_host = get_snapshot_host(snapshot_id)
|
||||
original_host = get_original_host(domain)
|
||||
base_host = SERVER_CONFIG.LISTEN_HOST
|
||||
@@ -208,6 +210,7 @@ class TestUrlRouting:
|
||||
assert web_host == "web.archivebox.localhost:8000"
|
||||
assert admin_host == "admin.archivebox.localhost:8000"
|
||||
assert api_host == "api.archivebox.localhost:8000"
|
||||
assert public_host == "public.archivebox.localhost:8000"
|
||||
assert snapshot_host == f"{snapshot_id}.archivebox.localhost:8000"
|
||||
assert original_host == f"{domain}.archivebox.localhost:8000"
|
||||
assert get_listen_subdomain(web_host) == "web"
|
||||
@@ -302,6 +305,20 @@ class TestUrlRouting:
|
||||
assert resp.status_code == 200
|
||||
assert response_body(resp) == response_file.read_bytes()
|
||||
|
||||
resp = client.get("/index.html", HTTP_HOST=snapshot_host)
|
||||
assert resp.status_code == 200
|
||||
snapshot_html = response_body(resp).decode("utf-8", "ignore")
|
||||
assert f"http://{snapshot_host}/" in snapshot_html
|
||||
assert "See all files..." in snapshot_html
|
||||
assert ">WARC<" not in snapshot_html
|
||||
assert ">Media<" not in snapshot_html
|
||||
assert ">Git<" not in snapshot_html
|
||||
|
||||
resp = client.get("/?files=1", HTTP_HOST=snapshot_host)
|
||||
assert resp.status_code == 200
|
||||
files_html = response_body(resp).decode("utf-8", "ignore")
|
||||
assert output_rel.split("/", 1)[0] in files_html
|
||||
|
||||
print("OK")
|
||||
"""
|
||||
)
|
||||
@@ -479,6 +496,7 @@ class TestUrlRouting:
|
||||
snapshot_host = get_snapshot_host(snapshot_id)
|
||||
admin_host = get_admin_host()
|
||||
web_host = get_web_host()
|
||||
public_host = get_public_host()
|
||||
|
||||
client = Client()
|
||||
|
||||
@@ -491,10 +509,17 @@ class TestUrlRouting:
|
||||
assert resp.status_code == 200
|
||||
live_html = response_body(resp).decode("utf-8", "ignore")
|
||||
assert f"http://{snapshot_host}/" in live_html
|
||||
assert "http://web.archivebox.localhost:8000" in live_html
|
||||
assert f"http://{public_host}/static/archive.png" in live_html
|
||||
assert ">WARC<" not in live_html
|
||||
assert ">Media<" not in live_html
|
||||
assert ">Git<" not in live_html
|
||||
|
||||
static_html = Path(snapshot.output_dir, "index.html").read_text(encoding="utf-8", errors="ignore")
|
||||
assert f"http://{snapshot_host}/" in static_html
|
||||
assert f"http://{public_host}/static/archive.png" in static_html
|
||||
assert ">WARC<" not in static_html
|
||||
assert ">Media<" not in static_html
|
||||
assert ">Git<" not in static_html
|
||||
|
||||
client.login(username="testadmin", password="testpassword")
|
||||
resp = client.get(f"/admin/core/snapshot/{snapshot_id}/change/", HTTP_HOST=admin_host)
|
||||
|
||||
@@ -19,12 +19,19 @@ class Command(BaseCommand):
|
||||
|
||||
def handle(self, *args, **kwargs):
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
|
||||
import psutil
|
||||
|
||||
from archivebox.config.common import STORAGE_CONFIG
|
||||
from archivebox.machine.models import Machine, Process
|
||||
from archivebox.workers.supervisord_util import (
|
||||
RUNNER_WORKER,
|
||||
get_existing_supervisord_process,
|
||||
get_worker,
|
||||
start_worker,
|
||||
stop_worker,
|
||||
)
|
||||
|
||||
pidfile = kwargs.get("pidfile") or os.environ.get("ARCHIVEBOX_RUNSERVER_PIDFILE")
|
||||
if not pidfile:
|
||||
@@ -32,11 +39,38 @@ class Command(BaseCommand):
|
||||
|
||||
interval = max(0.2, float(kwargs.get("interval", 1.0)))
|
||||
last_pid = None
|
||||
runner_proc: subprocess.Popen[bytes] | None = None
|
||||
|
||||
def stop_duplicate_watchers() -> None:
|
||||
current_pid = os.getpid()
|
||||
for proc in psutil.process_iter(["pid", "cmdline"]):
|
||||
if proc.info["pid"] == current_pid:
|
||||
continue
|
||||
cmdline = proc.info.get("cmdline") or []
|
||||
if not cmdline:
|
||||
continue
|
||||
if "runner_watch" not in " ".join(cmdline):
|
||||
continue
|
||||
if not any(str(arg) == f"--pidfile={pidfile}" or str(arg) == pidfile for arg in cmdline):
|
||||
continue
|
||||
try:
|
||||
proc.terminate()
|
||||
proc.wait(timeout=2.0)
|
||||
except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.TimeoutExpired):
|
||||
try:
|
||||
proc.kill()
|
||||
except (psutil.NoSuchProcess, psutil.AccessDenied):
|
||||
pass
|
||||
|
||||
def get_supervisor():
|
||||
supervisor = get_existing_supervisord_process()
|
||||
if supervisor is None:
|
||||
raise RuntimeError("runner_watch requires a running supervisord process")
|
||||
return supervisor
|
||||
|
||||
stop_duplicate_watchers()
|
||||
start_worker(get_supervisor(), RUNNER_WORKER, lazy=True)
|
||||
|
||||
def restart_runner() -> None:
|
||||
nonlocal runner_proc
|
||||
|
||||
Process.cleanup_stale_running()
|
||||
machine = Machine.current()
|
||||
|
||||
@@ -55,29 +89,18 @@ class Command(BaseCommand):
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
if runner_proc and runner_proc.poll() is None:
|
||||
try:
|
||||
runner_proc.terminate()
|
||||
runner_proc.wait(timeout=2.0)
|
||||
except Exception:
|
||||
try:
|
||||
runner_proc.kill()
|
||||
except Exception:
|
||||
pass
|
||||
supervisor = get_supervisor()
|
||||
|
||||
runner_proc = subprocess.Popen(
|
||||
[sys.executable, '-m', 'archivebox', 'run', '--daemon'],
|
||||
stdout=subprocess.DEVNULL,
|
||||
stderr=subprocess.DEVNULL,
|
||||
start_new_session=True,
|
||||
)
|
||||
try:
|
||||
stop_worker(supervisor, RUNNER_WORKER["name"])
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
start_worker(supervisor, RUNNER_WORKER)
|
||||
|
||||
def runner_running() -> bool:
|
||||
return Process.objects.filter(
|
||||
machine=Machine.current(),
|
||||
status=Process.StatusChoices.RUNNING,
|
||||
process_type=Process.TypeChoices.ORCHESTRATOR,
|
||||
).exists()
|
||||
proc = get_worker(get_supervisor(), RUNNER_WORKER["name"])
|
||||
return bool(proc and proc.get("statename") == "RUNNING")
|
||||
|
||||
while True:
|
||||
try:
|
||||
|
||||
@@ -6,6 +6,7 @@ import socket
|
||||
import psutil
|
||||
import shutil
|
||||
import subprocess
|
||||
import shlex
|
||||
|
||||
from typing import Dict, cast, Iterator
|
||||
from pathlib import Path
|
||||
@@ -29,24 +30,63 @@ WORKERS_DIR_NAME = "workers"
|
||||
# Global reference to supervisord process for cleanup
|
||||
_supervisord_proc = None
|
||||
|
||||
|
||||
def _shell_join(args: list[str]) -> str:
|
||||
return shlex.join(args)
|
||||
|
||||
RUNNER_WORKER = {
|
||||
"name": "worker_runner",
|
||||
"command": "archivebox run --daemon",
|
||||
"autostart": "true",
|
||||
"command": _shell_join([sys.executable, "-m", "archivebox", "run", "--daemon"]),
|
||||
"autostart": "false",
|
||||
"autorestart": "true",
|
||||
"stdout_logfile": "logs/worker_runner.log",
|
||||
"redirect_stderr": "true",
|
||||
}
|
||||
|
||||
RUNNER_WATCH_WORKER = lambda pidfile: {
|
||||
"name": "worker_runner_watch",
|
||||
"command": _shell_join([sys.executable, "-m", "archivebox", "manage", "runner_watch", f"--pidfile={pidfile}"]),
|
||||
"autostart": "false",
|
||||
"autorestart": "true",
|
||||
"stdout_logfile": "logs/worker_runner_watch.log",
|
||||
"redirect_stderr": "true",
|
||||
}
|
||||
|
||||
SERVER_WORKER = lambda host, port: {
|
||||
"name": "worker_daphne",
|
||||
"command": f"{sys.executable} -m daphne --bind={host} --port={port} --application-close-timeout=600 archivebox.core.asgi:application",
|
||||
"command": _shell_join([sys.executable, "-m", "daphne", f"--bind={host}", f"--port={port}", "--application-close-timeout=600", "archivebox.core.asgi:application"]),
|
||||
"autostart": "false",
|
||||
"autorestart": "true",
|
||||
"stdout_logfile": "logs/worker_daphne.log",
|
||||
"redirect_stderr": "true",
|
||||
}
|
||||
|
||||
|
||||
def RUNSERVER_WORKER(host: str, port: str, *, reload: bool, pidfile: str | None = None, nothreading: bool = False):
|
||||
command = [sys.executable, "-m", "archivebox", "manage", "runserver", f"{host}:{port}"]
|
||||
if not reload:
|
||||
command.append("--noreload")
|
||||
if nothreading:
|
||||
command.append("--nothreading")
|
||||
|
||||
environment = ['ARCHIVEBOX_RUNSERVER="1"']
|
||||
if reload:
|
||||
assert pidfile, "RUNSERVER_WORKER requires a pidfile when reload=True"
|
||||
environment.extend([
|
||||
'ARCHIVEBOX_AUTORELOAD="1"',
|
||||
f'ARCHIVEBOX_RUNSERVER_PIDFILE="{pidfile}"',
|
||||
])
|
||||
|
||||
return {
|
||||
"name": "worker_runserver",
|
||||
"command": _shell_join(command),
|
||||
"environment": ",".join(environment),
|
||||
"autostart": "false",
|
||||
"autorestart": "true",
|
||||
"stdout_logfile": "logs/worker_runserver.log",
|
||||
"redirect_stderr": "true",
|
||||
}
|
||||
|
||||
def is_port_in_use(host: str, port: int) -> bool:
|
||||
"""Check if a port is already in use."""
|
||||
try:
|
||||
@@ -511,16 +551,30 @@ def watch_worker(supervisor, daemon_name, interval=5):
|
||||
|
||||
|
||||
|
||||
def start_server_workers(host='0.0.0.0', port='8000', daemonize=False):
|
||||
def start_server_workers(host='0.0.0.0', port='8000', daemonize=False, debug=False, reload=False, nothreading=False):
|
||||
from archivebox.config.common import STORAGE_CONFIG
|
||||
|
||||
supervisor = get_or_create_supervisord_process(daemonize=daemonize)
|
||||
|
||||
bg_workers = [RUNNER_WORKER]
|
||||
if debug:
|
||||
pidfile = str(STORAGE_CONFIG.TMP_DIR / 'runserver.pid') if reload else None
|
||||
server_worker = RUNSERVER_WORKER(host=host, port=port, reload=reload, pidfile=pidfile, nothreading=nothreading)
|
||||
bg_workers: list[tuple[dict[str, str], bool]] = (
|
||||
[(RUNNER_WORKER, True), (RUNNER_WATCH_WORKER(pidfile), False)] if reload else [(RUNNER_WORKER, False)]
|
||||
)
|
||||
log_files = ['logs/worker_runserver.log', 'logs/worker_runner.log']
|
||||
if reload:
|
||||
log_files.insert(1, 'logs/worker_runner_watch.log')
|
||||
else:
|
||||
server_worker = SERVER_WORKER(host=host, port=port)
|
||||
bg_workers = [(RUNNER_WORKER, False)]
|
||||
log_files = ['logs/worker_daphne.log', 'logs/worker_runner.log']
|
||||
|
||||
print()
|
||||
start_worker(supervisor, SERVER_WORKER(host=host, port=port))
|
||||
start_worker(supervisor, server_worker)
|
||||
print()
|
||||
for worker in bg_workers:
|
||||
start_worker(supervisor, worker)
|
||||
for worker, lazy in bg_workers:
|
||||
start_worker(supervisor, worker, lazy=lazy)
|
||||
print()
|
||||
|
||||
if not daemonize:
|
||||
@@ -529,7 +583,7 @@ def start_server_workers(host='0.0.0.0', port='8000', daemonize=False):
|
||||
sys.stdout.write('Tailing worker logs (Ctrl+C to stop)...\n\n')
|
||||
sys.stdout.flush()
|
||||
tail_multiple_worker_logs(
|
||||
log_files=['logs/worker_daphne.log', 'logs/worker_runner.log'],
|
||||
log_files=log_files,
|
||||
follow=True,
|
||||
proc=_supervisord_proc, # Stop tailing when supervisord exits
|
||||
)
|
||||
|
||||
@@ -50,10 +50,11 @@ def bg_archive_snapshots(snapshots, kwargs: dict | None = None) -> int:
|
||||
Snapshot.objects.filter(id=snapshot.id).update(
|
||||
status=Snapshot.StatusChoices.QUEUED,
|
||||
retry_at=timezone.now(),
|
||||
downloaded_at=None,
|
||||
)
|
||||
crawl_id = getattr(snapshot, 'crawl_id', None)
|
||||
if crawl_id:
|
||||
Crawl.objects.filter(id=crawl_id).exclude(status=Crawl.StatusChoices.SEALED).update(
|
||||
Crawl.objects.filter(id=crawl_id).update(
|
||||
status=Crawl.StatusChoices.QUEUED,
|
||||
retry_at=timezone.now(),
|
||||
)
|
||||
@@ -75,10 +76,11 @@ def bg_archive_snapshot(snapshot, overwrite: bool = False, methods: list | None
|
||||
Snapshot.objects.filter(id=snapshot.id).update(
|
||||
status=Snapshot.StatusChoices.QUEUED,
|
||||
retry_at=timezone.now(),
|
||||
downloaded_at=None,
|
||||
)
|
||||
crawl_id = getattr(snapshot, 'crawl_id', None)
|
||||
if crawl_id:
|
||||
Crawl.objects.filter(id=crawl_id).exclude(status=Crawl.StatusChoices.SEALED).update(
|
||||
Crawl.objects.filter(id=crawl_id).update(
|
||||
status=Crawl.StatusChoices.QUEUED,
|
||||
retry_at=timezone.now(),
|
||||
)
|
||||
|
||||
393
bin/release.sh
393
bin/release.sh
@@ -1,36 +1,373 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
### Bash Environment Setup
|
||||
# http://redsymbol.net/articles/unofficial-bash-strict-mode/
|
||||
# https://www.gnu.org/software/bash/manual/html_node/The-Set-Builtin.html
|
||||
# set -o xtrace
|
||||
set -o errexit
|
||||
set -o errtrace
|
||||
set -o nounset
|
||||
set -o pipefail
|
||||
IFS=$'\n'
|
||||
set -Eeuo pipefail
|
||||
IFS=$'\n\t'
|
||||
|
||||
REPO_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. && pwd )"
|
||||
cd "$REPO_DIR"
|
||||
REPO_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
|
||||
WORKSPACE_DIR="$(cd "${REPO_DIR}/.." && pwd)"
|
||||
cd "${REPO_DIR}"
|
||||
|
||||
TAG_PREFIX="v"
|
||||
PYPI_PACKAGE="archivebox"
|
||||
|
||||
# Run the linters and tests
|
||||
# ./bin/lint.sh
|
||||
# ./bin/test.sh
|
||||
source_optional_env() {
|
||||
if [[ -f "${REPO_DIR}/.env" ]]; then
|
||||
set -a
|
||||
# shellcheck disable=SC1091
|
||||
source "${REPO_DIR}/.env"
|
||||
set +a
|
||||
fi
|
||||
}
|
||||
|
||||
# # Run all the build scripts
|
||||
# ./bin/build_git.sh
|
||||
# ./bin/build_docs.sh
|
||||
# ./bin/build_pip.sh
|
||||
# ./bin/build_docker.sh
|
||||
repo_slug() {
|
||||
python3 - <<'PY'
|
||||
import re
|
||||
import subprocess
|
||||
|
||||
# Push relase to public repositories
|
||||
# ./bin/release_docs.sh
|
||||
./bin/release_git.sh "$@"
|
||||
./bin/release_pip.sh "$@"
|
||||
./bin/release_deb.sh "$@"
|
||||
./bin/release_brew.sh "$@"
|
||||
./bin/release_docker.sh "$@"
|
||||
remote = subprocess.check_output(
|
||||
['git', 'remote', 'get-url', 'origin'],
|
||||
text=True,
|
||||
).strip()
|
||||
|
||||
VERSION="$(grep '^version = ' "${REPO_DIR}/pyproject.toml" | awk -F'"' '{print $2}')"
|
||||
echo "[√] Done. Published version v$VERSION"
|
||||
patterns = [
|
||||
r'github\.com[:/](?P<slug>[^/]+/[^/.]+)(?:\.git)?$',
|
||||
r'github\.com/(?P<slug>[^/]+/[^/.]+)(?:\.git)?$',
|
||||
]
|
||||
|
||||
for pattern in patterns:
|
||||
match = re.search(pattern, remote)
|
||||
if match:
|
||||
print(match.group('slug'))
|
||||
raise SystemExit(0)
|
||||
|
||||
raise SystemExit(f'Unable to parse GitHub repo slug from remote: {remote}')
|
||||
PY
|
||||
}
|
||||
|
||||
default_branch() {
|
||||
if [[ -n "${DEFAULT_BRANCH:-}" ]]; then
|
||||
echo "${DEFAULT_BRANCH}"
|
||||
return 0
|
||||
fi
|
||||
if git symbolic-ref refs/remotes/origin/HEAD >/dev/null 2>&1; then
|
||||
git symbolic-ref refs/remotes/origin/HEAD | sed 's#^refs/remotes/origin/##'
|
||||
return 0
|
||||
fi
|
||||
git remote show origin | sed -n '/HEAD branch/s/.*: //p' | head -n 1
|
||||
}
|
||||
|
||||
current_version() {
|
||||
python3 - <<'PY'
|
||||
from pathlib import Path
|
||||
import json
|
||||
import re
|
||||
|
||||
versions = []
|
||||
pyproject_text = Path('pyproject.toml').read_text()
|
||||
pyproject_match = re.search(r'^version = "([^"]+)"$', pyproject_text, re.MULTILINE)
|
||||
if pyproject_match:
|
||||
versions.append(pyproject_match.group(1))
|
||||
|
||||
package_json = json.loads(Path('etc/package.json').read_text())
|
||||
if 'version' in package_json:
|
||||
versions.append(package_json['version'])
|
||||
|
||||
def parse(version: str) -> tuple[int, int, int, int, int]:
|
||||
match = re.fullmatch(r'(\d+)\.(\d+)\.(\d+)(?:-?rc(\d*))?$', version)
|
||||
if not match:
|
||||
raise SystemExit(f'Unsupported version format: {version}')
|
||||
major, minor, patch, rc = match.groups()
|
||||
rc_value = int(rc) if rc else (0 if 'rc' in version else 10_000)
|
||||
return (int(major), int(minor), int(patch), 0 if 'rc' in version else 1, rc_value)
|
||||
|
||||
print(max(versions, key=parse))
|
||||
PY
|
||||
}
|
||||
|
||||
bump_version() {
|
||||
python3 - <<'PY'
|
||||
from pathlib import Path
|
||||
import json
|
||||
import re
|
||||
|
||||
def parse(version: str) -> tuple[int, int, int, int, int]:
|
||||
match = re.fullmatch(r'(\d+)\.(\d+)\.(\d+)(?:-?rc(\d*))?$', version)
|
||||
if not match:
|
||||
raise SystemExit(f'Unsupported version format: {version}')
|
||||
major, minor, patch, rc = match.groups()
|
||||
rc_value = int(rc) if rc else (0 if 'rc' in version else 10_000)
|
||||
return (int(major), int(minor), int(patch), 0 if 'rc' in version else 1, rc_value)
|
||||
|
||||
pyproject_path = Path('pyproject.toml')
|
||||
pyproject_text = pyproject_path.read_text()
|
||||
pyproject_match = re.search(r'^version = "([^"]+)"$', pyproject_text, re.MULTILINE)
|
||||
if not pyproject_match:
|
||||
raise SystemExit('Failed to find version in pyproject.toml')
|
||||
|
||||
package_path = Path('etc/package.json')
|
||||
package_json = json.loads(package_path.read_text())
|
||||
if 'version' not in package_json:
|
||||
raise SystemExit('Failed to find version in etc/package.json')
|
||||
|
||||
current_version = max([pyproject_match.group(1), package_json['version']], key=parse)
|
||||
match = re.fullmatch(r'(\d+)\.(\d+)\.(\d+)(?:-?rc(\d*))?$', current_version)
|
||||
major, minor, patch, rc = match.groups()
|
||||
if 'rc' in current_version:
|
||||
rc_number = int(rc or '0') + 1
|
||||
next_version = f'{major}.{minor}.{patch}rc{rc_number}'
|
||||
else:
|
||||
next_version = f'{major}.{minor}.{int(patch) + 1}'
|
||||
|
||||
pyproject_path.write_text(
|
||||
re.sub(r'^version = "[^"]+"$', f'version = "{next_version}"', pyproject_text, count=1, flags=re.MULTILINE)
|
||||
)
|
||||
package_json['version'] = next_version
|
||||
package_path.write_text(json.dumps(package_json, indent=2) + '\n')
|
||||
print(next_version)
|
||||
PY
|
||||
}
|
||||
|
||||
read_repo_version() {
|
||||
local repo_dir="$1"
|
||||
if [[ ! -f "${repo_dir}/pyproject.toml" ]]; then
|
||||
return 1
|
||||
fi
|
||||
|
||||
python3 - "${repo_dir}/pyproject.toml" <<'PY'
|
||||
from pathlib import Path
|
||||
import re
|
||||
import sys
|
||||
|
||||
text = Path(sys.argv[1]).read_text()
|
||||
match = re.search(r'^version = "([^"]+)"$', text, re.MULTILINE)
|
||||
if not match:
|
||||
raise SystemExit('Failed to find version')
|
||||
print(match.group(1))
|
||||
PY
|
||||
}
|
||||
|
||||
update_internal_dependencies() {
|
||||
local abxbus_version abx_pkg_version abx_plugins_version abx_dl_version
|
||||
|
||||
abxbus_version="$(read_repo_version "${WORKSPACE_DIR}/abxbus" || true)"
|
||||
abx_pkg_version="$(read_repo_version "${WORKSPACE_DIR}/abx-pkg" || true)"
|
||||
abx_plugins_version="$(read_repo_version "${WORKSPACE_DIR}/abx-plugins" || true)"
|
||||
abx_dl_version="$(read_repo_version "${WORKSPACE_DIR}/abx-dl" || true)"
|
||||
|
||||
python3 - "${abxbus_version}" "${abx_pkg_version}" "${abx_plugins_version}" "${abx_dl_version}" <<'PY'
|
||||
from pathlib import Path
|
||||
import re
|
||||
import sys
|
||||
|
||||
path = Path('pyproject.toml')
|
||||
text = path.read_text()
|
||||
for name, version in (
|
||||
('abxbus', sys.argv[1]),
|
||||
('abx-pkg', sys.argv[2]),
|
||||
('abx-plugins', sys.argv[3]),
|
||||
('abx-dl', sys.argv[4]),
|
||||
):
|
||||
if version:
|
||||
text = re.sub(rf'("{re.escape(name)}>=)[^"]+(")', rf'\g<1>{version}\2', text)
|
||||
path.write_text(text)
|
||||
PY
|
||||
}
|
||||
|
||||
compare_versions() {
|
||||
python3 - "$1" "$2" <<'PY'
|
||||
import re
|
||||
import sys
|
||||
|
||||
def parse(version: str) -> tuple[int, int, int, int, int]:
|
||||
match = re.fullmatch(r'(\d+)\.(\d+)\.(\d+)(?:-?rc(\d*))?$', version)
|
||||
if not match:
|
||||
raise SystemExit(f'Unsupported version format: {version}')
|
||||
major, minor, patch, rc = match.groups()
|
||||
return (int(major), int(minor), int(patch), 0 if 'rc' in version else 1, int(rc or '0'))
|
||||
|
||||
left, right = sys.argv[1], sys.argv[2]
|
||||
if parse(left) > parse(right):
|
||||
print('gt')
|
||||
elif parse(left) == parse(right):
|
||||
print('eq')
|
||||
else:
|
||||
print('lt')
|
||||
PY
|
||||
}
|
||||
|
||||
latest_release_version() {
|
||||
local slug="$1"
|
||||
local raw_tags
|
||||
raw_tags="$(gh api "repos/${slug}/releases?per_page=100" --jq '.[].tag_name' || true)"
|
||||
RELEASE_TAGS="${raw_tags}" TAG_PREFIX_VALUE="${TAG_PREFIX}" python3 - <<'PY'
|
||||
import os
|
||||
import re
|
||||
|
||||
def parse(version: str) -> tuple[int, int, int, int, int]:
|
||||
match = re.fullmatch(r'(\d+)\.(\d+)\.(\d+)(?:-?rc(\d*))?$', version)
|
||||
if not match:
|
||||
return (-1, -1, -1, -1, -1)
|
||||
major, minor, patch, rc = match.groups()
|
||||
return (int(major), int(minor), int(patch), 0 if 'rc' in version else 1, int(rc or '0'))
|
||||
|
||||
prefix = os.environ.get('TAG_PREFIX_VALUE', '')
|
||||
versions = [line.strip() for line in os.environ.get('RELEASE_TAGS', '').splitlines() if line.strip()]
|
||||
if prefix:
|
||||
versions = [version[len(prefix):] if version.startswith(prefix) else version for version in versions]
|
||||
if not versions:
|
||||
print('')
|
||||
else:
|
||||
print(max(versions, key=parse))
|
||||
PY
|
||||
}
|
||||
|
||||
wait_for_runs() {
|
||||
local slug="$1"
|
||||
local event="$2"
|
||||
local sha="$3"
|
||||
local label="$4"
|
||||
local runs_json
|
||||
local attempts=0
|
||||
|
||||
while :; do
|
||||
runs_json="$(GH_FORCE_TTY=0 GH_PAGER=cat gh run list --repo "${slug}" --event "${event}" --commit "${sha}" --limit 20 --json databaseId,status,conclusion,workflowName)"
|
||||
if [[ "$(jq 'length' <<<"${runs_json}")" -gt 0 ]]; then
|
||||
break
|
||||
fi
|
||||
attempts=$((attempts + 1))
|
||||
if [[ "${attempts}" -ge 30 ]]; then
|
||||
echo "Timed out waiting for ${label} workflows to start" >&2
|
||||
return 1
|
||||
fi
|
||||
sleep 10
|
||||
done
|
||||
|
||||
while read -r run_id; do
|
||||
gh run watch "${run_id}" --repo "${slug}" --exit-status
|
||||
done < <(jq -r '.[].databaseId' <<<"${runs_json}")
|
||||
}
|
||||
|
||||
wait_for_pypi() {
|
||||
local package_name="$1"
|
||||
local expected_version="$2"
|
||||
local attempts=0
|
||||
local published_version
|
||||
|
||||
while :; do
|
||||
published_version="$(curl -fsSL "https://pypi.org/pypi/${package_name}/json" | jq -r '.info.version')"
|
||||
if [[ "${published_version}" == "${expected_version}" ]]; then
|
||||
return 0
|
||||
fi
|
||||
attempts=$((attempts + 1))
|
||||
if [[ "${attempts}" -ge 30 ]]; then
|
||||
echo "Timed out waiting for ${package_name}==${expected_version} on PyPI" >&2
|
||||
return 1
|
||||
fi
|
||||
sleep 10
|
||||
done
|
||||
}
|
||||
|
||||
run_checks() {
|
||||
uv sync --all-extras --all-groups --no-cache --upgrade
|
||||
uv build --all
|
||||
}
|
||||
|
||||
validate_release_state() {
|
||||
local slug="$1"
|
||||
local branch="$2"
|
||||
local current latest relation
|
||||
|
||||
if [[ "$(git branch --show-current)" != "${branch}" ]]; then
|
||||
echo "Skipping release-state validation on non-default branch $(git branch --show-current)"
|
||||
return 0
|
||||
fi
|
||||
|
||||
current="$(current_version)"
|
||||
latest="$(latest_release_version "${slug}")"
|
||||
if [[ -z "${latest}" ]]; then
|
||||
echo "No published releases found for ${slug}; release state is valid"
|
||||
return 0
|
||||
fi
|
||||
|
||||
relation="$(compare_versions "${current}" "${latest}")"
|
||||
if [[ "${relation}" == "lt" ]]; then
|
||||
echo "Current version ${current} is behind latest published version ${latest}" >&2
|
||||
return 1
|
||||
fi
|
||||
|
||||
echo "Release state is valid: local=${current} latest=${latest}"
|
||||
}
|
||||
|
||||
create_release() {
|
||||
local slug="$1"
|
||||
local version="$2"
|
||||
local prerelease_args=()
|
||||
if [[ "${version}" == *rc* ]]; then
|
||||
prerelease_args+=(--prerelease)
|
||||
fi
|
||||
|
||||
gh release create "${TAG_PREFIX}${version}" \
|
||||
--repo "${slug}" \
|
||||
--target "$(git rev-parse HEAD)" \
|
||||
--title "${TAG_PREFIX}${version}" \
|
||||
--generate-notes \
|
||||
"${prerelease_args[@]}"
|
||||
}
|
||||
|
||||
publish_artifacts() {
|
||||
local version="$1"
|
||||
local pypi_token="${UV_PUBLISH_TOKEN:-${PYPI_TOKEN:-${PYPI_PAT_SECRET:-}}}"
|
||||
|
||||
if [[ -n "${pypi_token}" ]]; then
|
||||
UV_PUBLISH_TOKEN="${pypi_token}" uv publish --username=__token__ dist/*
|
||||
elif [[ -n "${GITHUB_ACTIONS:-}" ]]; then
|
||||
uv publish --trusted-publishing always dist/*
|
||||
else
|
||||
echo "Missing PyPI credentials: set UV_PUBLISH_TOKEN or PYPI_TOKEN" >&2
|
||||
return 1
|
||||
fi
|
||||
|
||||
wait_for_pypi "${PYPI_PACKAGE}" "${version}"
|
||||
}
|
||||
|
||||
main() {
|
||||
local slug branch version latest relation
|
||||
|
||||
source_optional_env
|
||||
slug="$(repo_slug)"
|
||||
branch="$(default_branch)"
|
||||
|
||||
if [[ "${GITHUB_EVENT_NAME:-}" == "push" ]]; then
|
||||
validate_release_state "${slug}" "${branch}"
|
||||
return 0
|
||||
fi
|
||||
|
||||
if [[ "$(git branch --show-current)" != "${branch}" ]]; then
|
||||
echo "Release must run from ${branch}, found $(git branch --show-current)" >&2
|
||||
return 1
|
||||
fi
|
||||
|
||||
update_internal_dependencies
|
||||
version="$(bump_version)"
|
||||
run_checks
|
||||
|
||||
git add -A
|
||||
git commit -m "release: ${TAG_PREFIX}${version}"
|
||||
git push origin "${branch}"
|
||||
|
||||
wait_for_runs "${slug}" push "$(git rev-parse HEAD)" "push"
|
||||
|
||||
publish_artifacts "${version}"
|
||||
create_release "${slug}" "${version}"
|
||||
|
||||
latest="$(latest_release_version "${slug}")"
|
||||
relation="$(compare_versions "${latest}" "${version}")"
|
||||
if [[ "${relation}" != "eq" ]]; then
|
||||
echo "GitHub release version mismatch: expected ${version}, got ${latest}" >&2
|
||||
return 1
|
||||
fi
|
||||
|
||||
echo "Released ${PYPI_PACKAGE} ${version}"
|
||||
}
|
||||
|
||||
main "$@"
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[project]
|
||||
name = "archivebox"
|
||||
version = "0.9.10rc1"
|
||||
version = "0.9.10rc2"
|
||||
requires-python = ">=3.13"
|
||||
description = "Self-hosted internet archiving solution."
|
||||
authors = [{name = "Nick Sweeting", email = "pyproject.toml@archivebox.io"}]
|
||||
@@ -78,9 +78,10 @@ dependencies = [
|
||||
"w3lib>=2.2.1", # used for parsing content-type encoding from http response headers & html tags
|
||||
### Extractor dependencies (optional binaries detected at runtime via shutil.which)
|
||||
### Binary/Package Management
|
||||
"abx-pkg>=1.9.14", # for: detecting, versioning, and installing binaries via apt/brew/pip/npm
|
||||
"abx-plugins>=1.9.18", # shared ArchiveBox plugin package with install_args-only overrides
|
||||
"abx-dl>=1.10.13", # shared ArchiveBox downloader package with install_args-only overrides
|
||||
"abxbus>=2.4.2", # explicit direct dep so local dev env resolves sibling abxbus repo, matching abx-dl EventBus API
|
||||
"abx-pkg>=1.9.18", # for: detecting, versioning, and installing binaries via apt/brew/pip/npm
|
||||
"abx-plugins>=1.10.14", # shared ArchiveBox plugin package with install_args-only overrides
|
||||
"abx-dl>=1.10.14", # shared ArchiveBox downloader package with install_args-only overrides
|
||||
### UUID7 backport for Python <3.14
|
||||
"uuid7>=0.1.0; python_version < '3.14'", # provides the uuid_extensions module on Python 3.13
|
||||
]
|
||||
@@ -156,9 +157,11 @@ environments = ["sys_platform == 'darwin'", "sys_platform == 'linux'"]
|
||||
package = true
|
||||
# compile-bytecode = true
|
||||
|
||||
[tool.uv.pip]
|
||||
python-version = "3.13"
|
||||
# compile-bytecode = true
|
||||
[tool.uv.sources]
|
||||
abxbus = { path = "../abxbus", editable = true }
|
||||
abx-pkg = { path = "../abx-pkg", editable = true }
|
||||
abx-plugins = { path = "../abx-plugins", editable = true }
|
||||
abx-dl = { path = "../abx-dl", editable = true }
|
||||
|
||||
[build-system]
|
||||
requires = ["pdm-backend"]
|
||||
|
||||
154
uv.lock
generated
154
uv.lock
generated
@@ -14,8 +14,8 @@ supported-markers = [
|
||||
|
||||
[[package]]
|
||||
name = "abx-dl"
|
||||
version = "1.10.13"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
version = "1.10.14"
|
||||
source = { editable = "../abx-dl" }
|
||||
dependencies = [
|
||||
{ name = "abx-pkg", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
{ name = "abx-plugins", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
@@ -27,44 +27,110 @@ dependencies = [
|
||||
{ name = "rich", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
{ name = "rich-click", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/10/de/d9d5a398f053f899fc62d45b9d21eb85412c6ca7d32099c25b9b43f84e32/abx_dl-1.10.13.tar.gz", hash = "sha256:f9fef6119691e07e1792593ed5bcd8de2f84df9d01e77966006d743593c611aa", size = 58200, upload-time = "2026-03-21T18:47:20.901Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/e7/11/670fbdc0afe2274893b63774643f6bb44f09d4975d3968cf394384af1306/abx_dl-1.10.13-py3-none-any.whl", hash = "sha256:cd4aab469563b1c7d9f9202161d94ba7de62cf31fbe924f6fe6f51ad051f4d70", size = 62597, upload-time = "2026-03-21T18:47:19.573Z" },
|
||||
|
||||
[package.metadata]
|
||||
requires-dist = [
|
||||
{ name = "abx-pkg", editable = "../abx-pkg" },
|
||||
{ name = "abx-plugins", editable = "../abx-plugins" },
|
||||
{ name = "abxbus", editable = "../abxbus" },
|
||||
{ name = "flake8", marker = "extra == 'dev'", specifier = ">=7.1.1" },
|
||||
{ name = "flask", marker = "extra == 'dev'", specifier = ">=3.0" },
|
||||
{ name = "mypy", marker = "extra == 'dev'", specifier = ">=1.11.2" },
|
||||
{ name = "platformdirs", specifier = ">=4.0.0" },
|
||||
{ name = "psutil", specifier = ">=7.2.1" },
|
||||
{ name = "pydantic", specifier = ">=2.0.0" },
|
||||
{ name = "pytest", marker = "extra == 'dev'", specifier = ">=8.0.0" },
|
||||
{ name = "pytest-xdist", marker = "extra == 'dev'", specifier = ">=3.5.0" },
|
||||
{ name = "requests", specifier = ">=2.28.0" },
|
||||
{ name = "rich", specifier = ">=13.0.0" },
|
||||
{ name = "rich-click", specifier = ">=1.8.0" },
|
||||
{ name = "ruff", marker = "extra == 'dev'", specifier = ">=0.6.6" },
|
||||
]
|
||||
provides-extras = ["dev"]
|
||||
|
||||
[package.metadata.requires-dev]
|
||||
dev = [
|
||||
{ name = "prek", specifier = ">=0.3.6" },
|
||||
{ name = "pyright", specifier = ">=1.1.408" },
|
||||
{ name = "ruff", specifier = ">=0.15.7" },
|
||||
{ name = "ty", specifier = ">=0.0.24" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "abx-pkg"
|
||||
version = "1.9.14"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
version = "1.9.18"
|
||||
source = { editable = "../abx-pkg" }
|
||||
dependencies = [
|
||||
{ name = "pip", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
{ name = "platformdirs", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
{ name = "pydantic", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
{ name = "typing-extensions", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/f9/6e/4465d44686b40ab0361d153160e2bd0167f588756518084308a8e8d08d8c/abx_pkg-1.9.14.tar.gz", hash = "sha256:b94d42cdbc6dde88635903cf14977b34e552d807a72c03d60f27f075deb59952", size = 146811, upload-time = "2026-03-21T07:44:12.158Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/56/af/8e65a23d44e5ccc069c344a7a626f575498b3c1a3ccacb17e941b36ecd35/abx_pkg-1.9.14-py3-none-any.whl", hash = "sha256:cf89dc4c5737e2078cb05fa7e33683718d540391a018445b6e54aa22666f25e0", size = 63511, upload-time = "2026-03-21T07:44:11.038Z" },
|
||||
|
||||
[package.metadata]
|
||||
requires-dist = [
|
||||
{ name = "abx-pkg", extras = ["rich", "pyinfra", "ansible"], marker = "extra == 'all'" },
|
||||
{ name = "ansible", marker = "extra == 'ansible'", specifier = ">=12.3.0" },
|
||||
{ name = "ansible-core", marker = "extra == 'ansible'", specifier = ">=2.0.0" },
|
||||
{ name = "ansible-runner", marker = "extra == 'ansible'", specifier = ">=2.4.2" },
|
||||
{ name = "pip", specifier = ">=26.0.1" },
|
||||
{ name = "platformdirs", specifier = ">=4.9.2" },
|
||||
{ name = "pydantic", specifier = ">=2.12.5" },
|
||||
{ name = "pyinfra", marker = "extra == 'pyinfra'", specifier = ">=3.6.1" },
|
||||
{ name = "rich", marker = "extra == 'rich'", specifier = ">=14.0.0" },
|
||||
{ name = "typing-extensions", specifier = ">=4.15.0" },
|
||||
]
|
||||
provides-extras = ["rich", "pyinfra", "ansible", "all"]
|
||||
|
||||
[package.metadata.requires-dev]
|
||||
dev = [
|
||||
{ name = "django", specifier = ">=4.0" },
|
||||
{ name = "django-admin-data-views", specifier = ">=0.3.1" },
|
||||
{ name = "django-jsonform", specifier = ">=2.22.0" },
|
||||
{ name = "django-pydantic-field", specifier = ">=0.3.9" },
|
||||
{ name = "django-stubs", specifier = ">=5.0.0" },
|
||||
{ name = "mypy", specifier = ">=1.19.1" },
|
||||
{ name = "prek", specifier = ">=0.3.6" },
|
||||
{ name = "pyright" },
|
||||
{ name = "pytest", specifier = ">=9.0.2" },
|
||||
{ name = "rich", specifier = ">=14.0.0" },
|
||||
{ name = "ruff", specifier = ">=0.15.7" },
|
||||
{ name = "ty", specifier = ">=0.0.24" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "abx-plugins"
|
||||
version = "1.10.13"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
version = "1.10.14"
|
||||
source = { editable = "../abx-plugins" }
|
||||
dependencies = [
|
||||
{ name = "abx-pkg", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
{ name = "pydantic-settings", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
{ name = "rich-click", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/2b/ea/7e70fa30a1e52039decd8b755b22549b8c51fb9d97cf54751b6fd1af7f2d/abx_plugins-1.10.13.tar.gz", hash = "sha256:945623afc6436894d26e8e27ce6101032b0c42655d5cbfaeeaa8a57913d0d46a", size = 525322, upload-time = "2026-03-21T17:39:10.142Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/29/25/d5338a5a7a1958916e7104727046ec01744da3fb28b1e30934480ab57f65/abx_plugins-1.10.13-py3-none-any.whl", hash = "sha256:79353763baf685871d52ea7e5fa8d0249937ec9edb2f63c7768b0c0a98d5518e", size = 731961, upload-time = "2026-03-21T17:39:11.713Z" },
|
||||
|
||||
[package.metadata]
|
||||
requires-dist = [
|
||||
{ name = "abx-pkg", editable = "../abx-pkg" },
|
||||
{ name = "feedparser", marker = "extra == 'dev'", specifier = ">=6.0.0" },
|
||||
{ name = "jinja2", marker = "extra == 'dev'", specifier = ">=3.1.0" },
|
||||
{ name = "pydantic-settings", specifier = ">=2.0.0" },
|
||||
{ name = "pyright", marker = "extra == 'dev'", specifier = ">=1.1.408" },
|
||||
{ name = "pytest", marker = "extra == 'dev'", specifier = ">=9.0.2" },
|
||||
{ name = "pytest-httpserver", marker = "extra == 'dev'", specifier = ">=1.1.0" },
|
||||
{ name = "requests", marker = "extra == 'dev'", specifier = ">=2.28.0" },
|
||||
{ name = "rich-click", specifier = ">=1.9.7" },
|
||||
{ name = "ruff", marker = "extra == 'dev'", specifier = ">=0.15.2" },
|
||||
{ name = "ty", marker = "extra == 'dev'", specifier = ">=0.0.18" },
|
||||
]
|
||||
provides-extras = ["dev"]
|
||||
|
||||
[package.metadata.requires-dev]
|
||||
dev = [{ name = "prek", specifier = ">=0.3.6" }]
|
||||
|
||||
[[package]]
|
||||
name = "abxbus"
|
||||
version = "2.4.2"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
version = "2.4.7"
|
||||
source = { editable = "../abxbus" }
|
||||
dependencies = [
|
||||
{ name = "aiofiles", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
{ name = "anyio", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
@@ -73,9 +139,41 @@ dependencies = [
|
||||
{ name = "typing-extensions", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
{ name = "uuid7", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/14/e5/ddf5dab0db243ddd9b193a4461a2d07f3d554b595c77e58af0beceb60eb2/abxbus-2.4.2.tar.gz", hash = "sha256:1c8056655decc81d28a8622f313109df9da36bde77175b0388a0ab9300b878a8", size = 114123, upload-time = "2026-03-20T21:09:35.643Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/86/c8/7815696415e66a7753112062a1357457f1cdd52d623964942f9086872dcb/abxbus-2.4.2-py3-none-any.whl", hash = "sha256:bd2058280fea91a021b604fdc32c4e4e690dfdee848fa50ea746cd786581f923", size = 110208, upload-time = "2026-03-20T21:09:33.942Z" },
|
||||
|
||||
[package.metadata]
|
||||
requires-dist = [
|
||||
{ name = "aiofiles", specifier = ">=24.1.0" },
|
||||
{ name = "anyio", specifier = ">=4.9.0" },
|
||||
{ name = "asyncpg", marker = "extra == 'bridges'", specifier = ">=0.31.0" },
|
||||
{ name = "asyncpg", marker = "extra == 'postgres'", specifier = ">=0.31.0" },
|
||||
{ name = "nats-py", marker = "extra == 'bridges'", specifier = ">=2.13.1" },
|
||||
{ name = "nats-py", marker = "extra == 'nats'", specifier = ">=2.13.1" },
|
||||
{ name = "portalocker", specifier = ">=2.7.0" },
|
||||
{ name = "pydantic", specifier = ">=2.11.5" },
|
||||
{ name = "redis", marker = "extra == 'bridges'", specifier = ">=7.1.1" },
|
||||
{ name = "redis", marker = "extra == 'redis'", specifier = ">=7.1.1" },
|
||||
{ name = "typing-extensions", specifier = ">=4.12.2" },
|
||||
{ name = "uuid7", specifier = ">=0.1.0" },
|
||||
]
|
||||
provides-extras = ["postgres", "nats", "redis", "bridges"]
|
||||
|
||||
[package.metadata.requires-dev]
|
||||
dev = [
|
||||
{ name = "build", specifier = ">=1.2.2" },
|
||||
{ name = "codespell", specifier = ">=2.4.1" },
|
||||
{ name = "fastapi", specifier = ">=0.118.0" },
|
||||
{ name = "ipdb", specifier = ">=0.13.13" },
|
||||
{ name = "prek", specifier = ">=0.3.3" },
|
||||
{ name = "psutil", specifier = ">=7.0.0" },
|
||||
{ name = "pyright", specifier = ">=1.1.404" },
|
||||
{ name = "pytest", specifier = ">=8.3.5" },
|
||||
{ name = "pytest-asyncio", specifier = ">=1.1.0" },
|
||||
{ name = "pytest-cov", specifier = ">=6.2.1" },
|
||||
{ name = "pytest-httpserver", specifier = ">=1.0.8" },
|
||||
{ name = "pytest-timeout", specifier = ">=2.4.0" },
|
||||
{ name = "pytest-xdist", specifier = ">=3.7.0" },
|
||||
{ name = "ruff", specifier = ">=0.15.1" },
|
||||
{ name = "ty", specifier = ">=0.0.1a19" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -119,12 +217,13 @@ wheels = [
|
||||
|
||||
[[package]]
|
||||
name = "archivebox"
|
||||
version = "0.9.10rc1"
|
||||
version = "0.9.10rc2"
|
||||
source = { editable = "." }
|
||||
dependencies = [
|
||||
{ name = "abx-dl", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
{ name = "abx-pkg", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
{ name = "abx-plugins", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
{ name = "abxbus", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
{ name = "atomicwrites", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
{ name = "base32-crockford", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
{ name = "click", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
@@ -213,9 +312,10 @@ dev = [
|
||||
|
||||
[package.metadata]
|
||||
requires-dist = [
|
||||
{ name = "abx-dl", specifier = ">=1.10.13" },
|
||||
{ name = "abx-pkg", specifier = ">=1.9.14" },
|
||||
{ name = "abx-plugins", specifier = ">=1.9.18" },
|
||||
{ name = "abx-dl", editable = "../abx-dl" },
|
||||
{ name = "abx-pkg", editable = "../abx-pkg" },
|
||||
{ name = "abx-plugins", editable = "../abx-plugins" },
|
||||
{ name = "abxbus", editable = "../abxbus" },
|
||||
{ name = "archivebox", extras = ["sonic", "ldap", "debug"], marker = "extra == 'all'" },
|
||||
{ name = "atomicwrites", specifier = "==1.4.1" },
|
||||
{ name = "base32-crockford", specifier = ">=0.3.0" },
|
||||
@@ -1856,16 +1956,16 @@ wheels = [
|
||||
|
||||
[[package]]
|
||||
name = "pytest-cov"
|
||||
version = "7.0.0"
|
||||
version = "7.1.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "coverage", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
{ name = "pluggy", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
{ name = "pytest", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/5e/f7/c933acc76f5208b3b00089573cf6a2bc26dc80a8aece8f52bb7d6b1855ca/pytest_cov-7.0.0.tar.gz", hash = "sha256:33c97eda2e049a0c5298e91f519302a1334c26ac65c1a483d6206fd458361af1", size = 54328, upload-time = "2025-09-09T10:57:02.113Z" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/b1/51/a849f96e117386044471c8ec2bd6cfebacda285da9525c9106aeb28da671/pytest_cov-7.1.0.tar.gz", hash = "sha256:30674f2b5f6351aa09702a9c8c364f6a01c27aae0c1366ae8016160d1efc56b2", size = 55592, upload-time = "2026-03-21T20:11:16.284Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/ee/49/1377b49de7d0c1ce41292161ea0f721913fa8722c19fb9c1e3aa0367eecb/pytest_cov-7.0.0-py3-none-any.whl", hash = "sha256:3b8e9558b16cc1479da72058bdecf8073661c7f57f7d3c5f22a1c23507f2d861", size = 22424, upload-time = "2025-09-09T10:57:00.695Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/9d/7a/d968e294073affff457b041c2be9868a40c1c71f4a35fcc1e45e5493067b/pytest_cov-7.1.0-py3-none-any.whl", hash = "sha256:a0461110b7865f9a271aa1b51e516c9a95de9d696734a2f71e3e78f46e1d4678", size = 22876, upload-time = "2026-03-21T20:11:14.438Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
||||
Reference in New Issue
Block a user