mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
wip
This commit is contained in:
@@ -9,7 +9,7 @@
|
||||
# in a universe that seems indifferent to us."
|
||||
# --Norbert Wiener
|
||||
|
||||
__package__ = 'archivebox'
|
||||
__package__ = "archivebox"
|
||||
|
||||
import os
|
||||
import sys
|
||||
@@ -22,11 +22,12 @@ from abx_plugins import get_plugins_dir
|
||||
class _ReconfigurableStream(Protocol):
|
||||
def reconfigure(self, *, line_buffering: bool) -> object: ...
|
||||
|
||||
|
||||
# Force unbuffered output for real-time logs
|
||||
if hasattr(sys.stdout, 'reconfigure'):
|
||||
if hasattr(sys.stdout, "reconfigure"):
|
||||
cast(_ReconfigurableStream, sys.stdout).reconfigure(line_buffering=True)
|
||||
cast(_ReconfigurableStream, sys.stderr).reconfigure(line_buffering=True)
|
||||
os.environ['PYTHONUNBUFFERED'] = '1'
|
||||
os.environ["PYTHONUNBUFFERED"] = "1"
|
||||
|
||||
ASCII_LOGO = """
|
||||
█████╗ ██████╗ ██████╗██╗ ██╗██╗██╗ ██╗███████╗ ██████╗ ██████╗ ██╗ ██╗
|
||||
@@ -44,48 +45,51 @@ PACKAGE_DIR = Path(__file__).resolve().parent
|
||||
# if str(PACKAGE_DIR) not in sys.path:
|
||||
# sys.path.append(str(PACKAGE_DIR))
|
||||
|
||||
os.environ['DJANGO_SETTINGS_MODULE'] = 'archivebox.core.settings'
|
||||
os.environ['TZ'] = 'UTC'
|
||||
os.environ["DJANGO_SETTINGS_MODULE"] = "archivebox.core.settings"
|
||||
os.environ["TZ"] = "UTC"
|
||||
|
||||
# detect ArchiveBox user's UID/GID based on data dir ownership
|
||||
from .config.permissions import drop_privileges # noqa
|
||||
from .config.permissions import drop_privileges # noqa
|
||||
|
||||
drop_privileges()
|
||||
|
||||
from .misc.checks import check_not_root, check_not_inside_source_dir, check_io_encoding # noqa
|
||||
from .misc.checks import check_not_root, check_not_inside_source_dir, check_io_encoding # noqa
|
||||
|
||||
check_not_root()
|
||||
check_not_inside_source_dir()
|
||||
check_io_encoding()
|
||||
|
||||
# Install monkey patches for third-party libraries
|
||||
from .misc.monkey_patches import * # noqa
|
||||
from .misc.monkey_patches import * # noqa
|
||||
|
||||
# Plugin directories
|
||||
BUILTIN_PLUGINS_DIR = Path(get_plugins_dir()).resolve()
|
||||
USER_PLUGINS_DIR = Path(
|
||||
os.environ.get('ARCHIVEBOX_USER_PLUGINS_DIR')
|
||||
or os.environ.get('USER_PLUGINS_DIR')
|
||||
or os.environ.get('DATA_DIR', os.getcwd())
|
||||
) / 'custom_plugins'
|
||||
USER_PLUGINS_DIR = (
|
||||
Path(
|
||||
os.environ.get("ARCHIVEBOX_USER_PLUGINS_DIR") or os.environ.get("USER_PLUGINS_DIR") or os.environ.get("DATA_DIR", os.getcwd()),
|
||||
)
|
||||
/ "custom_plugins"
|
||||
)
|
||||
|
||||
# These are kept for backwards compatibility with existing code
|
||||
# that checks for plugins. The new hook system uses discover_hooks()
|
||||
ALL_PLUGINS = {
|
||||
'builtin': BUILTIN_PLUGINS_DIR,
|
||||
'user': USER_PLUGINS_DIR,
|
||||
"builtin": BUILTIN_PLUGINS_DIR,
|
||||
"user": USER_PLUGINS_DIR,
|
||||
}
|
||||
LOADED_PLUGINS = ALL_PLUGINS
|
||||
|
||||
# Setup basic config, constants, paths, and version
|
||||
from .config.constants import CONSTANTS # noqa
|
||||
from .config.paths import PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR # noqa
|
||||
from .config.version import VERSION # noqa
|
||||
from .config.constants import CONSTANTS # noqa
|
||||
from .config.paths import PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR # noqa
|
||||
from .config.version import VERSION # noqa
|
||||
|
||||
# Set MACHINE_ID env var so hook scripts can use it
|
||||
os.environ.setdefault('MACHINE_ID', CONSTANTS.MACHINE_ID)
|
||||
os.environ.setdefault("MACHINE_ID", CONSTANTS.MACHINE_ID)
|
||||
|
||||
__version__ = VERSION
|
||||
__author__ = 'ArchiveBox'
|
||||
__license__ = 'MIT'
|
||||
__author__ = "ArchiveBox"
|
||||
__license__ = "MIT"
|
||||
|
||||
ASCII_ICON = """
|
||||
██████████████████████████████████████████████████████████████████████████████████████████████████
|
||||
|
||||
@@ -1,8 +1,9 @@
|
||||
#!/usr/bin/env python3
|
||||
"""This is the entrypoint for python -m archivebox ..."""
|
||||
__package__ = 'archivebox'
|
||||
|
||||
import archivebox # noqa # make sure monkey patches are applied before anything else
|
||||
__package__ = "archivebox"
|
||||
|
||||
import archivebox # noqa # make sure monkey patches are applied before anything else
|
||||
import sys
|
||||
|
||||
from .cli import main
|
||||
@@ -15,5 +16,5 @@ ASCII_LOGO_MINI = r"""
|
||||
/_/ \_\_| \___|_| |_|_| \_/ \___|____/ \___/_/\_\
|
||||
"""
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main(args=sys.argv[1:], stdin=sys.stdin)
|
||||
|
||||
@@ -1 +1 @@
|
||||
__package__ = 'archivebox.api'
|
||||
__package__ = "archivebox.api"
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
__package__ = 'archivebox.api'
|
||||
__package__ = "archivebox.api"
|
||||
|
||||
from django.contrib import admin
|
||||
from django.http import HttpRequest
|
||||
@@ -11,57 +11,81 @@ from archivebox.api.models import APIToken
|
||||
|
||||
|
||||
class APITokenAdmin(BaseModelAdmin):
|
||||
list_display = ('created_at', 'id', 'created_by', 'token_redacted', 'expires')
|
||||
sort_fields = ('id', 'created_at', 'created_by', 'expires')
|
||||
readonly_fields = ('created_at', 'modified_at')
|
||||
search_fields = ('id', 'created_by__username', 'token')
|
||||
list_display = ("created_at", "id", "created_by", "token_redacted", "expires")
|
||||
sort_fields = ("id", "created_at", "created_by", "expires")
|
||||
readonly_fields = ("created_at", "modified_at")
|
||||
search_fields = ("id", "created_by__username", "token")
|
||||
|
||||
fieldsets = (
|
||||
('Token', {
|
||||
'fields': ('token', 'expires'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Owner', {
|
||||
'fields': ('created_by',),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Timestamps', {
|
||||
'fields': ('created_at', 'modified_at'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
(
|
||||
"Token",
|
||||
{
|
||||
"fields": ("token", "expires"),
|
||||
"classes": ("card",),
|
||||
},
|
||||
),
|
||||
(
|
||||
"Owner",
|
||||
{
|
||||
"fields": ("created_by",),
|
||||
"classes": ("card",),
|
||||
},
|
||||
),
|
||||
(
|
||||
"Timestamps",
|
||||
{
|
||||
"fields": ("created_at", "modified_at"),
|
||||
"classes": ("card",),
|
||||
},
|
||||
),
|
||||
)
|
||||
|
||||
list_filter = ('created_by',)
|
||||
ordering = ['-created_at']
|
||||
list_filter = ("created_by",)
|
||||
ordering = ["-created_at"]
|
||||
list_per_page = 100
|
||||
|
||||
|
||||
class CustomWebhookAdmin(WebhookAdmin, BaseModelAdmin):
|
||||
list_display = ('created_at', 'created_by', 'id', *WebhookAdmin.list_display)
|
||||
sort_fields = ('created_at', 'created_by', 'id', 'referenced_model', 'endpoint', 'last_success', 'last_error')
|
||||
readonly_fields = ('created_at', 'modified_at', *WebhookAdmin.readonly_fields)
|
||||
list_display = ("created_at", "created_by", "id", *WebhookAdmin.list_display)
|
||||
sort_fields = ("created_at", "created_by", "id", "referenced_model", "endpoint", "last_success", "last_error")
|
||||
readonly_fields = ("created_at", "modified_at", *WebhookAdmin.readonly_fields)
|
||||
|
||||
fieldsets = (
|
||||
('Webhook', {
|
||||
'fields': ('name', 'signal', 'referenced_model', 'endpoint'),
|
||||
'classes': ('card', 'wide'),
|
||||
}),
|
||||
('Authentication', {
|
||||
'fields': ('auth_token',),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Status', {
|
||||
'fields': ('enabled', 'last_success', 'last_error'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Owner', {
|
||||
'fields': ('created_by',),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Timestamps', {
|
||||
'fields': ('created_at', 'modified_at'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
(
|
||||
"Webhook",
|
||||
{
|
||||
"fields": ("name", "signal", "referenced_model", "endpoint"),
|
||||
"classes": ("card", "wide"),
|
||||
},
|
||||
),
|
||||
(
|
||||
"Authentication",
|
||||
{
|
||||
"fields": ("auth_token",),
|
||||
"classes": ("card",),
|
||||
},
|
||||
),
|
||||
(
|
||||
"Status",
|
||||
{
|
||||
"fields": ("enabled", "last_success", "last_error"),
|
||||
"classes": ("card",),
|
||||
},
|
||||
),
|
||||
(
|
||||
"Owner",
|
||||
{
|
||||
"fields": ("created_by",),
|
||||
"classes": ("card",),
|
||||
},
|
||||
),
|
||||
(
|
||||
"Timestamps",
|
||||
{
|
||||
"fields": ("created_at", "modified_at"),
|
||||
"classes": ("card",),
|
||||
},
|
||||
),
|
||||
)
|
||||
|
||||
def lookup_allowed(self, lookup: str, value: str, request: HttpRequest | None = None) -> bool:
|
||||
|
||||
@@ -1,13 +1,14 @@
|
||||
__package__ = 'archivebox.api'
|
||||
__package__ = "archivebox.api"
|
||||
|
||||
from django.apps import AppConfig
|
||||
|
||||
|
||||
class APIConfig(AppConfig):
|
||||
name = 'archivebox.api'
|
||||
label = 'api'
|
||||
name = "archivebox.api"
|
||||
label = "api"
|
||||
|
||||
|
||||
def register_admin(admin_site):
|
||||
from archivebox.api.admin import register_admin
|
||||
|
||||
register_admin(admin_site)
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
__package__ = 'archivebox.api'
|
||||
__package__ = "archivebox.api"
|
||||
|
||||
from typing import Optional
|
||||
from datetime import timedelta
|
||||
|
||||
from django.utils import timezone
|
||||
@@ -14,7 +13,7 @@ from ninja.errors import HttpError
|
||||
|
||||
def get_or_create_api_token(user: User | None):
|
||||
from archivebox.api.models import APIToken
|
||||
|
||||
|
||||
if user and user.is_superuser:
|
||||
api_tokens = APIToken.objects.filter(created_by_id=user.pk, expires__gt=timezone.now())
|
||||
if api_tokens.exists():
|
||||
@@ -34,18 +33,18 @@ def get_or_create_api_token(user: User | None):
|
||||
|
||||
def auth_using_token(token: str | None, request: HttpRequest | None = None) -> User | None:
|
||||
"""Given an API token string, check if a corresponding non-expired APIToken exists, and return its user"""
|
||||
from archivebox.api.models import APIToken # lazy import model to avoid loading it at urls.py import time
|
||||
|
||||
from archivebox.api.models import APIToken # lazy import model to avoid loading it at urls.py import time
|
||||
|
||||
user: User | None = None
|
||||
|
||||
submitted_empty_form = str(token).strip() in ('string', '', 'None', 'null')
|
||||
submitted_empty_form = str(token).strip() in ("string", "", "None", "null")
|
||||
if not submitted_empty_form:
|
||||
try:
|
||||
api_token = APIToken.objects.get(token=token)
|
||||
if api_token.is_valid() and isinstance(api_token.created_by, User):
|
||||
user = api_token.created_by
|
||||
if request is not None:
|
||||
setattr(request, '_api_token', api_token)
|
||||
setattr(request, "_api_token", api_token)
|
||||
except APIToken.DoesNotExist:
|
||||
pass
|
||||
|
||||
@@ -55,8 +54,8 @@ def auth_using_token(token: str | None, request: HttpRequest | None = None) -> U
|
||||
def auth_using_password(username: str | None, password: str | None, request: HttpRequest | None = None) -> User | None:
|
||||
"""Given a username and password, check if they are valid and return the corresponding user"""
|
||||
user: User | None = None
|
||||
|
||||
submitted_empty_form = (username, password) in (('string', 'string'), ('', ''), (None, None))
|
||||
|
||||
submitted_empty_form = (username, password) in (("string", "string"), ("", ""), (None, None))
|
||||
if not submitted_empty_form:
|
||||
authenticated_user = authenticate(
|
||||
username=username,
|
||||
@@ -73,34 +72,40 @@ def auth_using_password(username: str | None, password: str | None, request: Htt
|
||||
def _require_superuser(user: User | None, request: HttpRequest, auth_method: str) -> User | None:
|
||||
if user and user.pk:
|
||||
request.user = user
|
||||
setattr(request, '_api_auth_method', auth_method)
|
||||
setattr(request, "_api_auth_method", auth_method)
|
||||
if not user.is_superuser:
|
||||
raise HttpError(403, 'Valid credentials but User does not have permission (make sure user.is_superuser=True)')
|
||||
raise HttpError(403, "Valid credentials but User does not have permission (make sure user.is_superuser=True)")
|
||||
return user
|
||||
|
||||
|
||||
### Django-Ninja-Provided Auth Methods
|
||||
|
||||
|
||||
class HeaderTokenAuth(APIKeyHeader):
|
||||
"""Allow authenticating by passing X-API-Key=xyz as a request header"""
|
||||
|
||||
param_name = "X-ArchiveBox-API-Key"
|
||||
|
||||
def authenticate(self, request: HttpRequest, key: Optional[str]) -> User | None:
|
||||
def authenticate(self, request: HttpRequest, key: str | None) -> User | None:
|
||||
return _require_superuser(auth_using_token(token=key, request=request), request, self.__class__.__name__)
|
||||
|
||||
|
||||
class BearerTokenAuth(HttpBearer):
|
||||
"""Allow authenticating by passing Bearer=xyz as a request header"""
|
||||
|
||||
def authenticate(self, request: HttpRequest, token: str) -> User | None:
|
||||
return _require_superuser(auth_using_token(token=token, request=request), request, self.__class__.__name__)
|
||||
|
||||
|
||||
class QueryParamTokenAuth(APIKeyQuery):
|
||||
"""Allow authenticating by passing api_key=xyz as a GET/POST query parameter"""
|
||||
|
||||
param_name = "api_key"
|
||||
|
||||
def authenticate(self, request: HttpRequest, key: Optional[str]) -> User | None:
|
||||
def authenticate(self, request: HttpRequest, key: str | None) -> User | None:
|
||||
return _require_superuser(auth_using_token(token=key, request=request), request, self.__class__.__name__)
|
||||
|
||||
|
||||
class UsernameAndPasswordAuth(HttpBasicAuth):
|
||||
"""Allow authenticating by passing username & password via HTTP Basic Authentication (not recommended)"""
|
||||
|
||||
@@ -111,25 +116,28 @@ class UsernameAndPasswordAuth(HttpBasicAuth):
|
||||
self.__class__.__name__,
|
||||
)
|
||||
|
||||
|
||||
class DjangoSessionAuth:
|
||||
"""Allow authenticating with existing Django session cookies (same-origin only)."""
|
||||
|
||||
def __call__(self, request: HttpRequest) -> User | None:
|
||||
return self.authenticate(request)
|
||||
|
||||
def authenticate(self, request: HttpRequest, **kwargs) -> User | None:
|
||||
user = getattr(request, 'user', None)
|
||||
user = getattr(request, "user", None)
|
||||
if isinstance(user, User) and user.is_authenticated:
|
||||
setattr(request, '_api_auth_method', self.__class__.__name__)
|
||||
setattr(request, "_api_auth_method", self.__class__.__name__)
|
||||
if not user.is_superuser:
|
||||
raise HttpError(403, 'Valid session but User does not have permission (make sure user.is_superuser=True)')
|
||||
raise HttpError(403, "Valid session but User does not have permission (make sure user.is_superuser=True)")
|
||||
return user
|
||||
return None
|
||||
|
||||
|
||||
### Enabled Auth Methods
|
||||
|
||||
API_AUTH_METHODS = [
|
||||
HeaderTokenAuth(),
|
||||
BearerTokenAuth(),
|
||||
QueryParamTokenAuth(),
|
||||
QueryParamTokenAuth(),
|
||||
# django_auth_superuser, # django admin cookie auth, not secure to use with csrf=False
|
||||
]
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
__package__ = 'archivebox.api'
|
||||
__package__ = "archivebox.api"
|
||||
|
||||
from django.http import HttpResponse
|
||||
|
||||
@@ -10,8 +10,8 @@ class ApiCorsMiddleware:
|
||||
self.get_response = get_response
|
||||
|
||||
def __call__(self, request):
|
||||
if request.path.startswith('/api/'):
|
||||
if request.method == 'OPTIONS' and request.META.get('HTTP_ACCESS_CONTROL_REQUEST_METHOD'):
|
||||
if request.path.startswith("/api/"):
|
||||
if request.method == "OPTIONS" and request.META.get("HTTP_ACCESS_CONTROL_REQUEST_METHOD"):
|
||||
response = HttpResponse(status=204)
|
||||
return self._add_cors_headers(request, response)
|
||||
|
||||
@@ -21,14 +21,12 @@ class ApiCorsMiddleware:
|
||||
return self.get_response(request)
|
||||
|
||||
def _add_cors_headers(self, request, response):
|
||||
origin = request.META.get('HTTP_ORIGIN')
|
||||
origin = request.META.get("HTTP_ORIGIN")
|
||||
if not origin:
|
||||
return response
|
||||
|
||||
response['Access-Control-Allow-Origin'] = '*'
|
||||
response['Access-Control-Allow-Methods'] = 'GET, POST, PUT, PATCH, DELETE, OPTIONS'
|
||||
response['Access-Control-Allow-Headers'] = (
|
||||
'Authorization, X-ArchiveBox-API-Key, Content-Type, X-CSRFToken'
|
||||
)
|
||||
response['Access-Control-Max-Age'] = '600'
|
||||
response["Access-Control-Allow-Origin"] = "*"
|
||||
response["Access-Control-Allow-Methods"] = "GET, POST, PUT, PATCH, DELETE, OPTIONS"
|
||||
response["Access-Control-Allow-Headers"] = "Authorization, X-ArchiveBox-API-Key, Content-Type, X-CSRFToken"
|
||||
response["Access-Control-Max-Age"] = "600"
|
||||
return response
|
||||
|
||||
@@ -13,11 +13,10 @@ import signal_webhooks.utils
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
initial = True
|
||||
|
||||
dependencies = [
|
||||
('auth', '0012_alter_user_first_name_max_length'),
|
||||
("auth", "0012_alter_user_first_name_max_length"),
|
||||
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
|
||||
]
|
||||
|
||||
@@ -75,55 +74,165 @@ class Migration(migrations.Migration):
|
||||
reverse_sql="""
|
||||
DROP TABLE IF EXISTS api_outboundwebhook;
|
||||
DROP TABLE IF EXISTS api_apitoken;
|
||||
"""
|
||||
""",
|
||||
),
|
||||
],
|
||||
state_operations=[
|
||||
migrations.CreateModel(
|
||||
name='APIToken',
|
||||
name="APIToken",
|
||||
fields=[
|
||||
('id', models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)),
|
||||
('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
|
||||
('modified_at', models.DateTimeField(auto_now=True)),
|
||||
('token', models.CharField(default=archivebox.api.models.generate_secret_token, max_length=32, unique=True)),
|
||||
('expires', models.DateTimeField(blank=True, null=True)),
|
||||
('created_by', models.ForeignKey(default=get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)),
|
||||
("id", models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)),
|
||||
("created_at", models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
|
||||
("modified_at", models.DateTimeField(auto_now=True)),
|
||||
("token", models.CharField(default=archivebox.api.models.generate_secret_token, max_length=32, unique=True)),
|
||||
("expires", models.DateTimeField(blank=True, null=True)),
|
||||
(
|
||||
"created_by",
|
||||
models.ForeignKey(
|
||||
default=get_or_create_system_user_pk,
|
||||
on_delete=django.db.models.deletion.CASCADE,
|
||||
to=settings.AUTH_USER_MODEL,
|
||||
),
|
||||
),
|
||||
],
|
||||
options={
|
||||
'verbose_name': 'API Key',
|
||||
'verbose_name_plural': 'API Keys',
|
||||
'app_label': 'api',
|
||||
"verbose_name": "API Key",
|
||||
"verbose_name_plural": "API Keys",
|
||||
"app_label": "api",
|
||||
},
|
||||
),
|
||||
migrations.CreateModel(
|
||||
name='OutboundWebhook',
|
||||
name="OutboundWebhook",
|
||||
fields=[
|
||||
('id', models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)),
|
||||
('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
|
||||
('modified_at', models.DateTimeField(auto_now=True)),
|
||||
('name', models.CharField(db_index=True, help_text='Webhook name.', max_length=255, unique=True, verbose_name='name')),
|
||||
('signal', models.CharField(choices=[('CREATE', 'Create'), ('UPDATE', 'Update'), ('DELETE', 'Delete'), ('M2M', 'M2M changed'), ('CREATE_OR_UPDATE', 'Create or Update'), ('CREATE_OR_DELETE', 'Create or Delete'), ('CREATE_OR_M2M', 'Create or M2M changed'), ('UPDATE_OR_DELETE', 'Update or Delete'), ('UPDATE_OR_M2M', 'Update or M2M changed'), ('DELETE_OR_M2M', 'Delete or M2M changed'), ('CREATE_UPDATE_OR_DELETE', 'Create, Update or Delete'), ('CREATE_UPDATE_OR_M2M', 'Create, Update or M2M changed'), ('CREATE_DELETE_OR_M2M', 'Create, Delete or M2M changed'), ('UPDATE_DELETE_OR_M2M', 'Update, Delete or M2M changed'), ('CREATE_UPDATE_DELETE_OR_M2M', 'Create, Update or Delete, or M2M changed')], help_text='Signal the webhook fires to.', max_length=255, verbose_name='signal')),
|
||||
('ref', models.CharField(db_index=True, help_text='Dot import notation to the model the webhook is for.', max_length=1023, validators=[signal_webhooks.utils.model_from_reference], verbose_name='referenced model')),
|
||||
('endpoint', models.URLField(help_text='Target endpoint for this webhook.', max_length=2047, verbose_name='endpoint')),
|
||||
('headers', models.JSONField(blank=True, default=dict, help_text='Headers to send with the webhook request.', validators=[signal_webhooks.utils.is_dict], verbose_name='headers')),
|
||||
('auth_token', signal_webhooks.fields.TokenField(blank=True, default='', help_text='Authentication token to use in an Authorization header.', max_length=8000, validators=[signal_webhooks.utils.decode_cipher_key], verbose_name='authentication token')),
|
||||
('enabled', models.BooleanField(default=True, help_text='Is this webhook enabled?', verbose_name='enabled')),
|
||||
('keep_last_response', models.BooleanField(default=False, help_text='Should the webhook keep a log of the latest response it got?', verbose_name='keep last response')),
|
||||
('created', models.DateTimeField(auto_now_add=True, help_text='When the webhook was created.', verbose_name='created')),
|
||||
('updated', models.DateTimeField(auto_now=True, help_text='When the webhook was last updated.', verbose_name='updated')),
|
||||
('last_response', models.CharField(blank=True, default='', help_text='Latest response to this webhook.', max_length=8000, verbose_name='last response')),
|
||||
('last_success', models.DateTimeField(default=None, help_text='When the webhook last succeeded.', null=True, verbose_name='last success')),
|
||||
('last_failure', models.DateTimeField(default=None, help_text='When the webhook last failed.', null=True, verbose_name='last failure')),
|
||||
('created_by', models.ForeignKey(default=get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)),
|
||||
("id", models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)),
|
||||
("created_at", models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
|
||||
("modified_at", models.DateTimeField(auto_now=True)),
|
||||
(
|
||||
"name",
|
||||
models.CharField(db_index=True, help_text="Webhook name.", max_length=255, unique=True, verbose_name="name"),
|
||||
),
|
||||
(
|
||||
"signal",
|
||||
models.CharField(
|
||||
choices=[
|
||||
("CREATE", "Create"),
|
||||
("UPDATE", "Update"),
|
||||
("DELETE", "Delete"),
|
||||
("M2M", "M2M changed"),
|
||||
("CREATE_OR_UPDATE", "Create or Update"),
|
||||
("CREATE_OR_DELETE", "Create or Delete"),
|
||||
("CREATE_OR_M2M", "Create or M2M changed"),
|
||||
("UPDATE_OR_DELETE", "Update or Delete"),
|
||||
("UPDATE_OR_M2M", "Update or M2M changed"),
|
||||
("DELETE_OR_M2M", "Delete or M2M changed"),
|
||||
("CREATE_UPDATE_OR_DELETE", "Create, Update or Delete"),
|
||||
("CREATE_UPDATE_OR_M2M", "Create, Update or M2M changed"),
|
||||
("CREATE_DELETE_OR_M2M", "Create, Delete or M2M changed"),
|
||||
("UPDATE_DELETE_OR_M2M", "Update, Delete or M2M changed"),
|
||||
("CREATE_UPDATE_DELETE_OR_M2M", "Create, Update or Delete, or M2M changed"),
|
||||
],
|
||||
help_text="Signal the webhook fires to.",
|
||||
max_length=255,
|
||||
verbose_name="signal",
|
||||
),
|
||||
),
|
||||
(
|
||||
"ref",
|
||||
models.CharField(
|
||||
db_index=True,
|
||||
help_text="Dot import notation to the model the webhook is for.",
|
||||
max_length=1023,
|
||||
validators=[signal_webhooks.utils.model_from_reference],
|
||||
verbose_name="referenced model",
|
||||
),
|
||||
),
|
||||
(
|
||||
"endpoint",
|
||||
models.URLField(help_text="Target endpoint for this webhook.", max_length=2047, verbose_name="endpoint"),
|
||||
),
|
||||
(
|
||||
"headers",
|
||||
models.JSONField(
|
||||
blank=True,
|
||||
default=dict,
|
||||
help_text="Headers to send with the webhook request.",
|
||||
validators=[signal_webhooks.utils.is_dict],
|
||||
verbose_name="headers",
|
||||
),
|
||||
),
|
||||
(
|
||||
"auth_token",
|
||||
signal_webhooks.fields.TokenField(
|
||||
blank=True,
|
||||
default="",
|
||||
help_text="Authentication token to use in an Authorization header.",
|
||||
max_length=8000,
|
||||
validators=[signal_webhooks.utils.decode_cipher_key],
|
||||
verbose_name="authentication token",
|
||||
),
|
||||
),
|
||||
("enabled", models.BooleanField(default=True, help_text="Is this webhook enabled?", verbose_name="enabled")),
|
||||
(
|
||||
"keep_last_response",
|
||||
models.BooleanField(
|
||||
default=False,
|
||||
help_text="Should the webhook keep a log of the latest response it got?",
|
||||
verbose_name="keep last response",
|
||||
),
|
||||
),
|
||||
(
|
||||
"created",
|
||||
models.DateTimeField(auto_now_add=True, help_text="When the webhook was created.", verbose_name="created"),
|
||||
),
|
||||
(
|
||||
"updated",
|
||||
models.DateTimeField(auto_now=True, help_text="When the webhook was last updated.", verbose_name="updated"),
|
||||
),
|
||||
(
|
||||
"last_response",
|
||||
models.CharField(
|
||||
blank=True,
|
||||
default="",
|
||||
help_text="Latest response to this webhook.",
|
||||
max_length=8000,
|
||||
verbose_name="last response",
|
||||
),
|
||||
),
|
||||
(
|
||||
"last_success",
|
||||
models.DateTimeField(
|
||||
default=None,
|
||||
help_text="When the webhook last succeeded.",
|
||||
null=True,
|
||||
verbose_name="last success",
|
||||
),
|
||||
),
|
||||
(
|
||||
"last_failure",
|
||||
models.DateTimeField(
|
||||
default=None,
|
||||
help_text="When the webhook last failed.",
|
||||
null=True,
|
||||
verbose_name="last failure",
|
||||
),
|
||||
),
|
||||
(
|
||||
"created_by",
|
||||
models.ForeignKey(
|
||||
default=get_or_create_system_user_pk,
|
||||
on_delete=django.db.models.deletion.CASCADE,
|
||||
to=settings.AUTH_USER_MODEL,
|
||||
),
|
||||
),
|
||||
],
|
||||
options={
|
||||
'verbose_name': 'API Outbound Webhook',
|
||||
'app_label': 'api',
|
||||
"verbose_name": "API Outbound Webhook",
|
||||
"app_label": "api",
|
||||
},
|
||||
),
|
||||
migrations.AddConstraint(
|
||||
model_name='outboundwebhook',
|
||||
constraint=models.UniqueConstraint(fields=['ref', 'endpoint'], name='prevent_duplicate_hooks_api_outboundwebhook'),
|
||||
model_name="outboundwebhook",
|
||||
constraint=models.UniqueConstraint(fields=["ref", "endpoint"], name="prevent_duplicate_hooks_api_outboundwebhook"),
|
||||
),
|
||||
],
|
||||
),
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
__package__ = 'archivebox.api'
|
||||
__package__ = "archivebox.api"
|
||||
|
||||
import secrets
|
||||
from archivebox.uuid_compat import uuid7
|
||||
@@ -25,7 +25,7 @@ class APIToken(models.Model):
|
||||
expires = models.DateTimeField(null=True, blank=True)
|
||||
|
||||
class Meta(TypedModelMeta):
|
||||
app_label = 'api'
|
||||
app_label = "api"
|
||||
verbose_name = "API Key"
|
||||
verbose_name_plural = "API Keys"
|
||||
|
||||
@@ -34,7 +34,7 @@ class APIToken(models.Model):
|
||||
|
||||
@property
|
||||
def token_redacted(self):
|
||||
return f'************{self.token[-4:]}'
|
||||
return f"************{self.token[-4:]}"
|
||||
|
||||
def is_valid(self, for_date=None):
|
||||
return not self.expires or self.expires >= (for_date or timezone.now())
|
||||
@@ -47,8 +47,8 @@ class OutboundWebhook(WebhookBase):
|
||||
modified_at = models.DateTimeField(auto_now=True)
|
||||
|
||||
class Meta(WebhookBase.Meta):
|
||||
app_label = 'api'
|
||||
verbose_name = 'API Outbound Webhook'
|
||||
app_label = "api"
|
||||
verbose_name = "API Outbound Webhook"
|
||||
|
||||
def __str__(self) -> str:
|
||||
return f'[{self.id}] {self.ref} -> {self.endpoint}'
|
||||
return f"[{self.id}] {self.ref} -> {self.endpoint}"
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
__package__ = 'archivebox.api'
|
||||
__package__ = "archivebox.api"
|
||||
|
||||
from django.urls import path
|
||||
from django.views.generic.base import RedirectView
|
||||
@@ -6,12 +6,10 @@ from django.views.generic.base import RedirectView
|
||||
from .v1_api import urls as v1_api_urls
|
||||
|
||||
urlpatterns = [
|
||||
path("", RedirectView.as_view(url='/api/v1/docs')),
|
||||
|
||||
path("v1/", RedirectView.as_view(url='/api/v1/docs')),
|
||||
path("v1/", v1_api_urls),
|
||||
path("v1", RedirectView.as_view(url='/api/v1/docs')),
|
||||
|
||||
path("", RedirectView.as_view(url="/api/v1/docs")),
|
||||
path("v1/", RedirectView.as_view(url="/api/v1/docs")),
|
||||
path("v1/", v1_api_urls),
|
||||
path("v1", RedirectView.as_view(url="/api/v1/docs")),
|
||||
# ... v2 can be added here ...
|
||||
# path("v2/", v2_api_urls),
|
||||
# path("v2", RedirectView.as_view(url='/api/v2/docs')),
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
__package__ = 'archivebox.api'
|
||||
__package__ = "archivebox.api"
|
||||
|
||||
|
||||
from io import StringIO
|
||||
@@ -20,9 +20,9 @@ from archivebox.api.auth import API_AUTH_METHODS
|
||||
from archivebox.api.models import APIToken
|
||||
|
||||
|
||||
COMMIT_HASH = get_COMMIT_HASH() or 'unknown'
|
||||
COMMIT_HASH = get_COMMIT_HASH() or "unknown"
|
||||
|
||||
html_description=f'''
|
||||
html_description = f"""
|
||||
<h3>Welcome to your ArchiveBox server's REST API <code>[v1 ALPHA]</code> homepage!</h3>
|
||||
<br/>
|
||||
<i><b>WARNING: This API is still in an early development stage and may change!</b></i>
|
||||
@@ -35,47 +35,47 @@ html_description=f'''
|
||||
<li>📜 See the API source code: <a href="https://github.com/ArchiveBox/ArchiveBox/blob/dev/archivebox/api"><code>archivebox/api/</code></a></li>
|
||||
</ul>
|
||||
<small>Served by ArchiveBox v{VERSION} (<a href="https://github.com/ArchiveBox/ArchiveBox/commit/{COMMIT_HASH}"><code>{COMMIT_HASH[:8]}</code></a>), API powered by <a href="https://django-ninja.dev/"><code>django-ninja</code></a>.</small>
|
||||
'''
|
||||
"""
|
||||
|
||||
|
||||
def register_urls(api: NinjaAPI) -> NinjaAPI:
|
||||
api.add_router('/auth/', 'archivebox.api.v1_auth.router')
|
||||
api.add_router('/core/', 'archivebox.api.v1_core.router')
|
||||
api.add_router('/crawls/', 'archivebox.api.v1_crawls.router')
|
||||
api.add_router('/cli/', 'archivebox.api.v1_cli.router')
|
||||
api.add_router('/machine/', 'archivebox.api.v1_machine.router')
|
||||
api.add_router("/auth/", "archivebox.api.v1_auth.router")
|
||||
api.add_router("/core/", "archivebox.api.v1_core.router")
|
||||
api.add_router("/crawls/", "archivebox.api.v1_crawls.router")
|
||||
api.add_router("/cli/", "archivebox.api.v1_cli.router")
|
||||
api.add_router("/machine/", "archivebox.api.v1_machine.router")
|
||||
return api
|
||||
|
||||
|
||||
class NinjaAPIWithIOCapture(NinjaAPI):
|
||||
class NinjaAPIWithIOCapture(NinjaAPI):
|
||||
def create_temporal_response(self, request: HttpRequest) -> HttpResponse:
|
||||
stdout, stderr = StringIO(), StringIO()
|
||||
|
||||
with redirect_stderr(stderr):
|
||||
with redirect_stdout(stdout):
|
||||
setattr(request, 'stdout', stdout)
|
||||
setattr(request, 'stderr', stderr)
|
||||
setattr(request, "stdout", stdout)
|
||||
setattr(request, "stderr", stderr)
|
||||
|
||||
response = super().create_temporal_response(request)
|
||||
|
||||
# Diable caching of API responses entirely
|
||||
response['Cache-Control'] = 'no-store'
|
||||
# Disable caching of API responses entirely
|
||||
response["Cache-Control"] = "no-store"
|
||||
|
||||
# Add debug stdout and stderr headers to response
|
||||
response['X-ArchiveBox-Stdout'] = stdout.getvalue().replace('\n', '\\n')[:200]
|
||||
response['X-ArchiveBox-Stderr'] = stderr.getvalue().replace('\n', '\\n')[:200]
|
||||
response["X-ArchiveBox-Stdout"] = stdout.getvalue().replace("\n", "\\n")[:200]
|
||||
response["X-ArchiveBox-Stderr"] = stderr.getvalue().replace("\n", "\\n")[:200]
|
||||
# response['X-ArchiveBox-View'] = self.get_openapi_operation_id(request) or 'Unknown'
|
||||
|
||||
# Add Auth Headers to response
|
||||
api_token_attr = getattr(request, '_api_token', None)
|
||||
api_token_attr = getattr(request, "_api_token", None)
|
||||
api_token = api_token_attr if isinstance(api_token_attr, APIToken) else None
|
||||
token_expiry = api_token.expires.isoformat() if api_token and api_token.expires else 'Never'
|
||||
token_expiry = api_token.expires.isoformat() if api_token and api_token.expires else "Never"
|
||||
|
||||
response['X-ArchiveBox-Auth-Method'] = str(getattr(request, '_api_auth_method', 'None'))
|
||||
response['X-ArchiveBox-Auth-Expires'] = token_expiry
|
||||
response['X-ArchiveBox-Auth-Token-Id'] = str(api_token.id) if api_token else 'None'
|
||||
response['X-ArchiveBox-Auth-User-Id'] = str(request.user.pk) if getattr(request.user, 'pk', None) else 'None'
|
||||
response['X-ArchiveBox-Auth-User-Username'] = request.user.username if isinstance(request.user, User) else 'None'
|
||||
response["X-ArchiveBox-Auth-Method"] = str(getattr(request, "_api_auth_method", "None"))
|
||||
response["X-ArchiveBox-Auth-Expires"] = token_expiry
|
||||
response["X-ArchiveBox-Auth-Token-Id"] = str(api_token.id) if api_token else "None"
|
||||
response["X-ArchiveBox-Auth-User-Id"] = str(request.user.pk) if getattr(request.user, "pk", None) else "None"
|
||||
response["X-ArchiveBox-Auth-User-Username"] = request.user.username if isinstance(request.user, User) else "None"
|
||||
|
||||
# import ipdb; ipdb.set_trace()
|
||||
# print('RESPONDING NOW', response)
|
||||
@@ -84,7 +84,7 @@ class NinjaAPIWithIOCapture(NinjaAPI):
|
||||
|
||||
|
||||
api = NinjaAPIWithIOCapture(
|
||||
title='ArchiveBox API',
|
||||
title="ArchiveBox API",
|
||||
description=html_description,
|
||||
version=VERSION,
|
||||
auth=API_AUTH_METHODS,
|
||||
@@ -103,15 +103,15 @@ def generic_exception_handler(request, err):
|
||||
if isinstance(err, (ObjectDoesNotExist, EmptyResultSet, PermissionDenied)):
|
||||
status = 404
|
||||
|
||||
print(''.join(format_exception(err)))
|
||||
print("".join(format_exception(err)))
|
||||
|
||||
return api.create_response(
|
||||
request,
|
||||
{
|
||||
"succeeded": False,
|
||||
"message": f'{err.__class__.__name__}: {err}',
|
||||
"message": f"{err.__class__.__name__}: {err}",
|
||||
"errors": [
|
||||
''.join(format_exception(err)),
|
||||
"".join(format_exception(err)),
|
||||
# or send simpler parent-only traceback:
|
||||
# *([str(err.__context__)] if getattr(err, '__context__', None) else []),
|
||||
],
|
||||
@@ -120,7 +120,6 @@ def generic_exception_handler(request, err):
|
||||
)
|
||||
|
||||
|
||||
|
||||
# import orjson
|
||||
# from ninja.renderers import BaseRenderer
|
||||
# class ORJSONRenderer(BaseRenderer):
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
__package__ = 'archivebox.api'
|
||||
__package__ = "archivebox.api"
|
||||
|
||||
from typing import Optional
|
||||
from django.http import HttpRequest
|
||||
|
||||
from ninja import Router, Schema
|
||||
@@ -8,16 +7,21 @@ from ninja import Router, Schema
|
||||
from archivebox.api.auth import auth_using_token, auth_using_password, get_or_create_api_token
|
||||
|
||||
|
||||
router = Router(tags=['Authentication'], auth=None)
|
||||
router = Router(tags=["Authentication"], auth=None)
|
||||
|
||||
|
||||
class PasswordAuthSchema(Schema):
|
||||
"""Schema for a /get_api_token request"""
|
||||
username: Optional[str] = None
|
||||
password: Optional[str] = None
|
||||
|
||||
username: str | None = None
|
||||
password: str | None = None
|
||||
|
||||
|
||||
@router.post("/get_api_token", auth=None, summary='Generate an API token for a given username & password (or currently logged-in user)') # auth=None because they are not authed yet
|
||||
@router.post(
|
||||
"/get_api_token",
|
||||
auth=None,
|
||||
summary="Generate an API token for a given username & password (or currently logged-in user)",
|
||||
) # auth=None because they are not authed yet
|
||||
def get_api_token(request: HttpRequest, auth_data: PasswordAuthSchema):
|
||||
user = auth_using_password(
|
||||
username=auth_data.username,
|
||||
@@ -35,17 +39,21 @@ def get_api_token(request: HttpRequest, auth_data: PasswordAuthSchema):
|
||||
"token": api_token.token,
|
||||
"expires": api_token.expires.isoformat() if api_token.expires else None,
|
||||
}
|
||||
|
||||
return {"success": False, "errors": ["Invalid credentials"]}
|
||||
|
||||
return {"success": False, "errors": ["Invalid credentials"]}
|
||||
|
||||
|
||||
class TokenAuthSchema(Schema):
|
||||
"""Schema for a /check_api_token request"""
|
||||
|
||||
token: str
|
||||
|
||||
|
||||
@router.post("/check_api_token", auth=None, summary='Validate an API token to make sure its valid and non-expired') # auth=None because they are not authed yet
|
||||
@router.post(
|
||||
"/check_api_token",
|
||||
auth=None,
|
||||
summary="Validate an API token to make sure its valid and non-expired",
|
||||
) # auth=None because they are not authed yet
|
||||
def check_api_token(request: HttpRequest, token_data: TokenAuthSchema):
|
||||
user = auth_using_token(
|
||||
token=token_data.token,
|
||||
@@ -53,5 +61,5 @@ def check_api_token(request: HttpRequest, token_data: TokenAuthSchema):
|
||||
)
|
||||
if user:
|
||||
return {"success": True, "user_id": str(user.pk)}
|
||||
|
||||
|
||||
return {"success": False, "user_id": None}
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
__package__ = 'archivebox.api'
|
||||
__package__ = "archivebox.api"
|
||||
|
||||
import json
|
||||
from io import StringIO
|
||||
from typing import List, Dict, Any, Optional
|
||||
from typing import Any
|
||||
from enum import Enum
|
||||
|
||||
from django.http import HttpRequest
|
||||
@@ -16,44 +16,47 @@ from archivebox.config.common import ARCHIVING_CONFIG
|
||||
# from .auth import API_AUTH_METHODS
|
||||
|
||||
# router for API that exposes archivebox cli subcommands as REST endpoints
|
||||
router = Router(tags=['ArchiveBox CLI Sub-Commands'])
|
||||
router = Router(tags=["ArchiveBox CLI Sub-Commands"])
|
||||
|
||||
|
||||
# Schemas
|
||||
|
||||
JSONType = List[Any] | Dict[str, Any] | bool | int | str | None
|
||||
JSONType = list[Any] | dict[str, Any] | bool | int | str | None
|
||||
|
||||
|
||||
class CLICommandResponseSchema(Schema):
|
||||
success: bool
|
||||
errors: List[str]
|
||||
errors: list[str]
|
||||
result: JSONType
|
||||
result_format: str = 'str'
|
||||
result_format: str = "str"
|
||||
stdout: str
|
||||
stderr: str
|
||||
|
||||
|
||||
class FilterTypeChoices(str, Enum):
|
||||
exact = 'exact'
|
||||
substring = 'substring'
|
||||
regex = 'regex'
|
||||
domain = 'domain'
|
||||
tag = 'tag'
|
||||
timestamp = 'timestamp'
|
||||
exact = "exact"
|
||||
substring = "substring"
|
||||
regex = "regex"
|
||||
domain = "domain"
|
||||
tag = "tag"
|
||||
timestamp = "timestamp"
|
||||
|
||||
|
||||
class StatusChoices(str, Enum):
|
||||
indexed = 'indexed'
|
||||
archived = 'archived'
|
||||
unarchived = 'unarchived'
|
||||
present = 'present'
|
||||
valid = 'valid'
|
||||
invalid = 'invalid'
|
||||
duplicate = 'duplicate'
|
||||
orphaned = 'orphaned'
|
||||
corrupted = 'corrupted'
|
||||
unrecognized = 'unrecognized'
|
||||
indexed = "indexed"
|
||||
archived = "archived"
|
||||
unarchived = "unarchived"
|
||||
present = "present"
|
||||
valid = "valid"
|
||||
invalid = "invalid"
|
||||
duplicate = "duplicate"
|
||||
orphaned = "orphaned"
|
||||
corrupted = "corrupted"
|
||||
unrecognized = "unrecognized"
|
||||
|
||||
|
||||
class AddCommandSchema(Schema):
|
||||
urls: List[str]
|
||||
urls: list[str]
|
||||
tag: str = ""
|
||||
depth: int = 0
|
||||
parser: str = "auto"
|
||||
@@ -62,53 +65,54 @@ class AddCommandSchema(Schema):
|
||||
overwrite: bool = False
|
||||
index_only: bool = False
|
||||
|
||||
|
||||
class UpdateCommandSchema(Schema):
|
||||
resume: Optional[str] = None
|
||||
after: Optional[float] = 0
|
||||
before: Optional[float] = 999999999999999
|
||||
filter_type: Optional[str] = FilterTypeChoices.substring
|
||||
filter_patterns: Optional[List[str]] = ['https://example.com']
|
||||
resume: str | None = None
|
||||
after: float | None = 0
|
||||
before: float | None = 999999999999999
|
||||
filter_type: str | None = FilterTypeChoices.substring
|
||||
filter_patterns: list[str] | None = ["https://example.com"]
|
||||
batch_size: int = 100
|
||||
continuous: bool = False
|
||||
|
||||
|
||||
class ScheduleCommandSchema(Schema):
|
||||
import_path: Optional[str] = None
|
||||
import_path: str | None = None
|
||||
add: bool = False
|
||||
show: bool = False
|
||||
foreground: bool = False
|
||||
run_all: bool = False
|
||||
quiet: bool = False
|
||||
every: Optional[str] = None
|
||||
tag: str = ''
|
||||
every: str | None = None
|
||||
tag: str = ""
|
||||
depth: int = 0
|
||||
overwrite: bool = False
|
||||
update: bool = not ARCHIVING_CONFIG.ONLY_NEW
|
||||
clear: bool = False
|
||||
|
||||
|
||||
class ListCommandSchema(Schema):
|
||||
filter_patterns: Optional[List[str]] = ['https://example.com']
|
||||
filter_patterns: list[str] | None = ["https://example.com"]
|
||||
filter_type: str = FilterTypeChoices.substring
|
||||
status: StatusChoices = StatusChoices.indexed
|
||||
after: Optional[float] = 0
|
||||
before: Optional[float] = 999999999999999
|
||||
sort: str = 'bookmarked_at'
|
||||
after: float | None = 0
|
||||
before: float | None = 999999999999999
|
||||
sort: str = "bookmarked_at"
|
||||
as_json: bool = True
|
||||
as_html: bool = False
|
||||
as_csv: str | None = 'timestamp,url'
|
||||
as_csv: str | None = "timestamp,url"
|
||||
with_headers: bool = False
|
||||
|
||||
|
||||
class RemoveCommandSchema(Schema):
|
||||
delete: bool = True
|
||||
after: Optional[float] = 0
|
||||
before: Optional[float] = 999999999999999
|
||||
after: float | None = 0
|
||||
before: float | None = 999999999999999
|
||||
filter_type: str = FilterTypeChoices.exact
|
||||
filter_patterns: Optional[List[str]] = ['https://example.com']
|
||||
filter_patterns: list[str] | None = ["https://example.com"]
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@router.post("/add", response=CLICommandResponseSchema, summary='archivebox add [args] [urls]')
|
||||
@router.post("/add", response=CLICommandResponseSchema, summary="archivebox add [args] [urls]")
|
||||
def cli_add(request: HttpRequest, args: AddCommandSchema):
|
||||
from archivebox.cli.archivebox_add import add
|
||||
|
||||
@@ -125,30 +129,30 @@ def cli_add(request: HttpRequest, args: AddCommandSchema):
|
||||
created_by_id=request.user.pk,
|
||||
)
|
||||
|
||||
snapshot_ids = [str(snapshot_id) for snapshot_id in snapshots.values_list('id', flat=True)]
|
||||
snapshot_ids = [str(snapshot_id) for snapshot_id in snapshots.values_list("id", flat=True)]
|
||||
result_payload = {
|
||||
"crawl_id": str(crawl.id),
|
||||
"num_snapshots": len(snapshot_ids),
|
||||
"snapshot_ids": snapshot_ids,
|
||||
"queued_urls": args.urls,
|
||||
}
|
||||
stdout = getattr(request, 'stdout', None)
|
||||
stderr = getattr(request, 'stderr', None)
|
||||
stdout = getattr(request, "stdout", None)
|
||||
stderr = getattr(request, "stderr", None)
|
||||
|
||||
return {
|
||||
"success": True,
|
||||
"errors": [],
|
||||
"result": result_payload,
|
||||
"result_format": "json",
|
||||
"stdout": ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else '',
|
||||
"stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else '',
|
||||
"stdout": ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else "",
|
||||
"stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else "",
|
||||
}
|
||||
|
||||
|
||||
@router.post("/update", response=CLICommandResponseSchema, summary='archivebox update [args] [filter_patterns]')
|
||||
@router.post("/update", response=CLICommandResponseSchema, summary="archivebox update [args] [filter_patterns]")
|
||||
def cli_update(request: HttpRequest, args: UpdateCommandSchema):
|
||||
from archivebox.cli.archivebox_update import update
|
||||
|
||||
|
||||
result = update(
|
||||
filter_patterns=args.filter_patterns or [],
|
||||
filter_type=args.filter_type or FilterTypeChoices.substring,
|
||||
@@ -158,21 +162,21 @@ def cli_update(request: HttpRequest, args: UpdateCommandSchema):
|
||||
batch_size=args.batch_size,
|
||||
continuous=args.continuous,
|
||||
)
|
||||
stdout = getattr(request, 'stdout', None)
|
||||
stderr = getattr(request, 'stderr', None)
|
||||
stdout = getattr(request, "stdout", None)
|
||||
stderr = getattr(request, "stderr", None)
|
||||
return {
|
||||
"success": True,
|
||||
"errors": [],
|
||||
"result": result,
|
||||
"stdout": ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else '',
|
||||
"stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else '',
|
||||
"stdout": ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else "",
|
||||
"stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else "",
|
||||
}
|
||||
|
||||
|
||||
@router.post("/schedule", response=CLICommandResponseSchema, summary='archivebox schedule [args] [import_path]')
|
||||
@router.post("/schedule", response=CLICommandResponseSchema, summary="archivebox schedule [args] [import_path]")
|
||||
def cli_schedule(request: HttpRequest, args: ScheduleCommandSchema):
|
||||
from archivebox.cli.archivebox_schedule import schedule
|
||||
|
||||
|
||||
result = schedule(
|
||||
import_path=args.import_path,
|
||||
add=args.add,
|
||||
@@ -188,23 +192,22 @@ def cli_schedule(request: HttpRequest, args: ScheduleCommandSchema):
|
||||
update=args.update,
|
||||
)
|
||||
|
||||
stdout = getattr(request, 'stdout', None)
|
||||
stderr = getattr(request, 'stderr', None)
|
||||
stdout = getattr(request, "stdout", None)
|
||||
stderr = getattr(request, "stderr", None)
|
||||
return {
|
||||
"success": True,
|
||||
"errors": [],
|
||||
"result": result,
|
||||
"result_format": "json",
|
||||
"stdout": ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else '',
|
||||
"stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else '',
|
||||
"stdout": ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else "",
|
||||
"stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else "",
|
||||
}
|
||||
|
||||
|
||||
|
||||
@router.post("/search", response=CLICommandResponseSchema, summary='archivebox search [args] [filter_patterns]')
|
||||
@router.post("/search", response=CLICommandResponseSchema, summary="archivebox search [args] [filter_patterns]")
|
||||
def cli_search(request: HttpRequest, args: ListCommandSchema):
|
||||
from archivebox.cli.archivebox_search import search
|
||||
|
||||
|
||||
result = search(
|
||||
filter_patterns=args.filter_patterns,
|
||||
filter_type=args.filter_type,
|
||||
@@ -218,7 +221,7 @@ def cli_search(request: HttpRequest, args: ListCommandSchema):
|
||||
with_headers=args.with_headers,
|
||||
)
|
||||
|
||||
result_format = 'txt'
|
||||
result_format = "txt"
|
||||
if args.as_json:
|
||||
result_format = "json"
|
||||
result = json.loads(result)
|
||||
@@ -227,20 +230,19 @@ def cli_search(request: HttpRequest, args: ListCommandSchema):
|
||||
elif args.as_csv:
|
||||
result_format = "csv"
|
||||
|
||||
stdout = getattr(request, 'stdout', None)
|
||||
stderr = getattr(request, 'stderr', None)
|
||||
stdout = getattr(request, "stdout", None)
|
||||
stderr = getattr(request, "stderr", None)
|
||||
return {
|
||||
"success": True,
|
||||
"errors": [],
|
||||
"result": result,
|
||||
"result_format": result_format,
|
||||
"stdout": ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else '',
|
||||
"stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else '',
|
||||
"stdout": ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else "",
|
||||
"stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else "",
|
||||
}
|
||||
|
||||
|
||||
|
||||
@router.post("/remove", response=CLICommandResponseSchema, summary='archivebox remove [args] [filter_patterns]')
|
||||
@router.post("/remove", response=CLICommandResponseSchema, summary="archivebox remove [args] [filter_patterns]")
|
||||
def cli_remove(request: HttpRequest, args: RemoveCommandSchema):
|
||||
from archivebox.cli.archivebox_remove import remove
|
||||
from archivebox.cli.archivebox_search import get_snapshots
|
||||
@@ -253,10 +255,10 @@ def cli_remove(request: HttpRequest, args: RemoveCommandSchema):
|
||||
after=args.after,
|
||||
before=args.before,
|
||||
)
|
||||
removed_snapshot_ids = [str(snapshot_id) for snapshot_id in snapshots_to_remove.values_list('id', flat=True)]
|
||||
|
||||
removed_snapshot_ids = [str(snapshot_id) for snapshot_id in snapshots_to_remove.values_list("id", flat=True)]
|
||||
|
||||
remove(
|
||||
yes=True, # no way to interactively ask for confirmation via API, so we force yes
|
||||
yes=True, # no way to interactively ask for confirmation via API, so we force yes
|
||||
delete=args.delete,
|
||||
snapshots=snapshots_to_remove,
|
||||
before=args.before,
|
||||
@@ -270,14 +272,13 @@ def cli_remove(request: HttpRequest, args: RemoveCommandSchema):
|
||||
"removed_snapshot_ids": removed_snapshot_ids,
|
||||
"remaining_snapshots": Snapshot.objects.count(),
|
||||
}
|
||||
stdout = getattr(request, 'stdout', None)
|
||||
stderr = getattr(request, 'stderr', None)
|
||||
stdout = getattr(request, "stdout", None)
|
||||
stderr = getattr(request, "stderr", None)
|
||||
return {
|
||||
"success": True,
|
||||
"errors": [],
|
||||
"result": result,
|
||||
"result_format": "json",
|
||||
"stdout": ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else '',
|
||||
"stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else '',
|
||||
"stdout": ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else "",
|
||||
"stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else "",
|
||||
}
|
||||
|
||||
|
||||
@@ -1,11 +1,13 @@
|
||||
__package__ = 'archivebox.api'
|
||||
__package__ = "archivebox.api"
|
||||
|
||||
import math
|
||||
from collections import defaultdict
|
||||
from uuid import UUID
|
||||
from typing import List, Optional, Union, Any, Annotated
|
||||
from typing import Union, Any, Annotated
|
||||
from datetime import datetime
|
||||
|
||||
from django.db.models import Model, Q
|
||||
from django.db.models import Model, Q, Sum
|
||||
from django.db.models.functions import Coalesce
|
||||
from django.conf import settings
|
||||
from django.http import HttpRequest, HttpResponse
|
||||
from django.core.exceptions import ValidationError
|
||||
@@ -39,7 +41,7 @@ from archivebox.crawls.models import Crawl
|
||||
from archivebox.api.v1_crawls import CrawlSchema
|
||||
|
||||
|
||||
router = Router(tags=['Core Models'])
|
||||
router = Router(tags=["Core Models"])
|
||||
|
||||
|
||||
class CustomPagination(PaginationBase):
|
||||
@@ -49,13 +51,14 @@ class CustomPagination(PaginationBase):
|
||||
page: int = 0
|
||||
|
||||
class Output(PaginationBase.Output):
|
||||
count: int
|
||||
total_items: int
|
||||
total_pages: int
|
||||
page: int
|
||||
limit: int
|
||||
offset: int
|
||||
num_items: int
|
||||
items: List[Any]
|
||||
items: list[Any]
|
||||
|
||||
def paginate_queryset(self, queryset, pagination: Input, request: HttpRequest, **params):
|
||||
limit = min(pagination.limit, 500)
|
||||
@@ -65,27 +68,29 @@ class CustomPagination(PaginationBase):
|
||||
current_page = math.ceil(offset / (limit + 1))
|
||||
items = queryset[offset : offset + limit]
|
||||
return {
|
||||
'total_items': total,
|
||||
'total_pages': total_pages,
|
||||
'page': current_page,
|
||||
'limit': limit,
|
||||
'offset': offset,
|
||||
'num_items': len(items),
|
||||
'items': items,
|
||||
"count": total,
|
||||
"total_items": total,
|
||||
"total_pages": total_pages,
|
||||
"page": current_page,
|
||||
"limit": limit,
|
||||
"offset": offset,
|
||||
"num_items": len(items),
|
||||
"items": items,
|
||||
}
|
||||
|
||||
|
||||
### ArchiveResult #########################################################################
|
||||
|
||||
|
||||
class MinimalArchiveResultSchema(Schema):
|
||||
TYPE: str = 'core.models.ArchiveResult'
|
||||
TYPE: str = "core.models.ArchiveResult"
|
||||
id: UUID
|
||||
created_at: datetime | None
|
||||
modified_at: datetime | None
|
||||
created_by_id: str
|
||||
created_by_username: str
|
||||
status: str
|
||||
retry_at: datetime | None
|
||||
retry_at: datetime | None = None
|
||||
plugin: str
|
||||
hook_name: str
|
||||
process_id: UUID | None
|
||||
@@ -93,8 +98,8 @@ class MinimalArchiveResultSchema(Schema):
|
||||
cmd: list[str] | None
|
||||
pwd: str | None
|
||||
output_str: str
|
||||
output_json: dict | None
|
||||
output_files: dict | None
|
||||
output_json: dict[str, Any] | None
|
||||
output_files: dict[str, dict[str, Any]] | None
|
||||
output_size: int
|
||||
output_mimetypes: str
|
||||
start_ts: datetime | None
|
||||
@@ -108,13 +113,34 @@ class MinimalArchiveResultSchema(Schema):
|
||||
def resolve_created_by_username(obj) -> str:
|
||||
return obj.created_by.username
|
||||
|
||||
@staticmethod
|
||||
def resolve_output_files(obj):
|
||||
return obj.output_file_map()
|
||||
|
||||
@staticmethod
|
||||
def resolve_output_mimetypes(obj) -> str:
|
||||
mime_sizes: dict[str, int] = defaultdict(int)
|
||||
for metadata in obj.output_file_map().values():
|
||||
if not isinstance(metadata, dict):
|
||||
continue
|
||||
mimetype = str(metadata.get("mimetype") or "").strip()
|
||||
try:
|
||||
size = max(int(metadata.get("size") or 0), 0)
|
||||
except (TypeError, ValueError):
|
||||
size = 0
|
||||
if mimetype and size:
|
||||
mime_sizes[mimetype] += size
|
||||
if mime_sizes:
|
||||
return ",".join(mime for mime, _size in sorted(mime_sizes.items(), key=lambda item: item[1], reverse=True))
|
||||
return obj.output_mimetypes or ""
|
||||
|
||||
|
||||
class ArchiveResultSchema(MinimalArchiveResultSchema):
|
||||
TYPE: str = 'core.models.ArchiveResult'
|
||||
TYPE: str = "core.models.ArchiveResult"
|
||||
snapshot_id: UUID
|
||||
snapshot_timestamp: str
|
||||
snapshot_url: str
|
||||
snapshot_tags: List[str]
|
||||
snapshot_tags: list[str]
|
||||
|
||||
@staticmethod
|
||||
def resolve_snapshot_timestamp(obj):
|
||||
@@ -134,25 +160,39 @@ class ArchiveResultSchema(MinimalArchiveResultSchema):
|
||||
|
||||
|
||||
class ArchiveResultFilterSchema(FilterSchema):
|
||||
id: Annotated[Optional[str], FilterLookup(['id__startswith', 'snapshot__id__startswith', 'snapshot__timestamp__startswith'])] = None
|
||||
search: Annotated[Optional[str], FilterLookup(['snapshot__url__icontains', 'snapshot__title__icontains', 'snapshot__tags__name__icontains', 'plugin', 'output_str__icontains', 'id__startswith', 'snapshot__id__startswith', 'snapshot__timestamp__startswith'])] = None
|
||||
snapshot_id: Annotated[Optional[str], FilterLookup(['snapshot__id__startswith', 'snapshot__timestamp__startswith'])] = None
|
||||
snapshot_url: Annotated[Optional[str], FilterLookup('snapshot__url__icontains')] = None
|
||||
snapshot_tag: Annotated[Optional[str], FilterLookup('snapshot__tags__name__icontains')] = None
|
||||
status: Annotated[Optional[str], FilterLookup('status')] = None
|
||||
output_str: Annotated[Optional[str], FilterLookup('output_str__icontains')] = None
|
||||
plugin: Annotated[Optional[str], FilterLookup('plugin__icontains')] = None
|
||||
hook_name: Annotated[Optional[str], FilterLookup('hook_name__icontains')] = None
|
||||
process_id: Annotated[Optional[str], FilterLookup('process__id__startswith')] = None
|
||||
cmd: Annotated[Optional[str], FilterLookup('cmd__0__icontains')] = None
|
||||
pwd: Annotated[Optional[str], FilterLookup('pwd__icontains')] = None
|
||||
cmd_version: Annotated[Optional[str], FilterLookup('cmd_version')] = None
|
||||
created_at: Annotated[Optional[datetime], FilterLookup('created_at')] = None
|
||||
created_at__gte: Annotated[Optional[datetime], FilterLookup('created_at__gte')] = None
|
||||
created_at__lt: Annotated[Optional[datetime], FilterLookup('created_at__lt')] = None
|
||||
id: Annotated[str | None, FilterLookup(["id__startswith", "snapshot__id__startswith", "snapshot__timestamp__startswith"])] = None
|
||||
search: Annotated[
|
||||
str | None,
|
||||
FilterLookup(
|
||||
[
|
||||
"snapshot__url__icontains",
|
||||
"snapshot__title__icontains",
|
||||
"snapshot__tags__name__icontains",
|
||||
"plugin",
|
||||
"output_str__icontains",
|
||||
"id__startswith",
|
||||
"snapshot__id__startswith",
|
||||
"snapshot__timestamp__startswith",
|
||||
],
|
||||
),
|
||||
] = None
|
||||
snapshot_id: Annotated[str | None, FilterLookup(["snapshot__id__startswith", "snapshot__timestamp__startswith"])] = None
|
||||
snapshot_url: Annotated[str | None, FilterLookup("snapshot__url__icontains")] = None
|
||||
snapshot_tag: Annotated[str | None, FilterLookup("snapshot__tags__name__icontains")] = None
|
||||
status: Annotated[str | None, FilterLookup("status")] = None
|
||||
output_str: Annotated[str | None, FilterLookup("output_str__icontains")] = None
|
||||
plugin: Annotated[str | None, FilterLookup("plugin__icontains")] = None
|
||||
hook_name: Annotated[str | None, FilterLookup("hook_name__icontains")] = None
|
||||
process_id: Annotated[str | None, FilterLookup("process__id__startswith")] = None
|
||||
cmd: Annotated[str | None, FilterLookup("cmd__0__icontains")] = None
|
||||
pwd: Annotated[str | None, FilterLookup("pwd__icontains")] = None
|
||||
cmd_version: Annotated[str | None, FilterLookup("cmd_version")] = None
|
||||
created_at: Annotated[datetime | None, FilterLookup("created_at")] = None
|
||||
created_at__gte: Annotated[datetime | None, FilterLookup("created_at__gte")] = None
|
||||
created_at__lt: Annotated[datetime | None, FilterLookup("created_at__lt")] = None
|
||||
|
||||
|
||||
@router.get("/archiveresults", response=List[ArchiveResultSchema], url_name="get_archiveresult")
|
||||
@router.get("/archiveresults", response=list[ArchiveResultSchema], url_name="get_archiveresult")
|
||||
@paginate(CustomPagination)
|
||||
def get_archiveresults(request: HttpRequest, filters: Query[ArchiveResultFilterSchema]):
|
||||
"""List all ArchiveResult entries matching these filters."""
|
||||
@@ -167,8 +207,9 @@ def get_archiveresult(request: HttpRequest, archiveresult_id: str):
|
||||
|
||||
### Snapshot #########################################################################
|
||||
|
||||
|
||||
class SnapshotSchema(Schema):
|
||||
TYPE: str = 'core.models.Snapshot'
|
||||
TYPE: str = "core.models.Snapshot"
|
||||
id: UUID
|
||||
created_by_id: str
|
||||
created_by_username: str
|
||||
@@ -177,14 +218,16 @@ class SnapshotSchema(Schema):
|
||||
status: str
|
||||
retry_at: datetime | None
|
||||
bookmarked_at: datetime
|
||||
downloaded_at: Optional[datetime]
|
||||
downloaded_at: datetime | None
|
||||
url: str
|
||||
tags: List[str]
|
||||
title: Optional[str]
|
||||
tags: list[str]
|
||||
title: str | None
|
||||
timestamp: str
|
||||
archive_path: str
|
||||
archive_size: int
|
||||
output_size: int
|
||||
num_archiveresults: int
|
||||
archiveresults: List[MinimalArchiveResultSchema]
|
||||
archiveresults: list[MinimalArchiveResultSchema]
|
||||
|
||||
@staticmethod
|
||||
def resolve_created_by_id(obj):
|
||||
@@ -198,13 +241,21 @@ class SnapshotSchema(Schema):
|
||||
def resolve_tags(obj):
|
||||
return sorted(tag.name for tag in obj.tags.all())
|
||||
|
||||
@staticmethod
|
||||
def resolve_archive_size(obj):
|
||||
return int(getattr(obj, "output_size_sum", obj.archive_size) or 0)
|
||||
|
||||
@staticmethod
|
||||
def resolve_output_size(obj):
|
||||
return SnapshotSchema.resolve_archive_size(obj)
|
||||
|
||||
@staticmethod
|
||||
def resolve_num_archiveresults(obj, context):
|
||||
return obj.archiveresult_set.all().distinct().count()
|
||||
|
||||
@staticmethod
|
||||
def resolve_archiveresults(obj, context):
|
||||
if bool(getattr(context['request'], 'with_archiveresults', False)):
|
||||
if bool(getattr(context["request"], "with_archiveresults", False)):
|
||||
return obj.archiveresult_set.all().distinct()
|
||||
return ArchiveResult.objects.none()
|
||||
|
||||
@@ -212,16 +263,16 @@ class SnapshotSchema(Schema):
|
||||
class SnapshotUpdateSchema(Schema):
|
||||
status: str | None = None
|
||||
retry_at: datetime | None = None
|
||||
tags: Optional[List[str]] = None
|
||||
tags: list[str] | None = None
|
||||
|
||||
|
||||
class SnapshotCreateSchema(Schema):
|
||||
url: str
|
||||
crawl_id: Optional[str] = None
|
||||
crawl_id: str | None = None
|
||||
depth: int = 0
|
||||
title: Optional[str] = None
|
||||
tags: Optional[List[str]] = None
|
||||
status: Optional[str] = None
|
||||
title: str | None = None
|
||||
tags: list[str] | None = None
|
||||
status: str | None = None
|
||||
|
||||
|
||||
class SnapshotDeleteResponseSchema(Schema):
|
||||
@@ -231,77 +282,82 @@ class SnapshotDeleteResponseSchema(Schema):
|
||||
deleted_count: int
|
||||
|
||||
|
||||
def normalize_tag_list(tags: Optional[List[str]] = None) -> List[str]:
|
||||
def normalize_tag_list(tags: list[str] | None = None) -> list[str]:
|
||||
return [tag.strip() for tag in (tags or []) if tag and tag.strip()]
|
||||
|
||||
|
||||
class SnapshotFilterSchema(FilterSchema):
|
||||
id: Annotated[Optional[str], FilterLookup(['id__icontains', 'timestamp__startswith'])] = None
|
||||
created_by_id: Annotated[Optional[str], FilterLookup('crawl__created_by_id')] = None
|
||||
created_by_username: Annotated[Optional[str], FilterLookup('crawl__created_by__username__icontains')] = None
|
||||
created_at__gte: Annotated[Optional[datetime], FilterLookup('created_at__gte')] = None
|
||||
created_at__lt: Annotated[Optional[datetime], FilterLookup('created_at__lt')] = None
|
||||
created_at: Annotated[Optional[datetime], FilterLookup('created_at')] = None
|
||||
modified_at: Annotated[Optional[datetime], FilterLookup('modified_at')] = None
|
||||
modified_at__gte: Annotated[Optional[datetime], FilterLookup('modified_at__gte')] = None
|
||||
modified_at__lt: Annotated[Optional[datetime], FilterLookup('modified_at__lt')] = None
|
||||
search: Annotated[Optional[str], FilterLookup(['url__icontains', 'title__icontains', 'tags__name__icontains', 'id__icontains', 'timestamp__startswith'])] = None
|
||||
url: Annotated[Optional[str], FilterLookup('url')] = None
|
||||
tag: Annotated[Optional[str], FilterLookup('tags__name')] = None
|
||||
title: Annotated[Optional[str], FilterLookup('title__icontains')] = None
|
||||
timestamp: Annotated[Optional[str], FilterLookup('timestamp__startswith')] = None
|
||||
bookmarked_at__gte: Annotated[Optional[datetime], FilterLookup('bookmarked_at__gte')] = None
|
||||
bookmarked_at__lt: Annotated[Optional[datetime], FilterLookup('bookmarked_at__lt')] = None
|
||||
id: Annotated[str | None, FilterLookup(["id__icontains", "timestamp__startswith"])] = None
|
||||
created_by_id: Annotated[str | None, FilterLookup("crawl__created_by_id")] = None
|
||||
created_by_username: Annotated[str | None, FilterLookup("crawl__created_by__username__icontains")] = None
|
||||
created_at__gte: Annotated[datetime | None, FilterLookup("created_at__gte")] = None
|
||||
created_at__lt: Annotated[datetime | None, FilterLookup("created_at__lt")] = None
|
||||
created_at: Annotated[datetime | None, FilterLookup("created_at")] = None
|
||||
modified_at: Annotated[datetime | None, FilterLookup("modified_at")] = None
|
||||
modified_at__gte: Annotated[datetime | None, FilterLookup("modified_at__gte")] = None
|
||||
modified_at__lt: Annotated[datetime | None, FilterLookup("modified_at__lt")] = None
|
||||
search: Annotated[
|
||||
str | None,
|
||||
FilterLookup(["url__icontains", "title__icontains", "tags__name__icontains", "id__icontains", "timestamp__startswith"]),
|
||||
] = None
|
||||
url: Annotated[str | None, FilterLookup("url")] = None
|
||||
tag: Annotated[str | None, FilterLookup("tags__name")] = None
|
||||
title: Annotated[str | None, FilterLookup("title__icontains")] = None
|
||||
timestamp: Annotated[str | None, FilterLookup("timestamp__startswith")] = None
|
||||
bookmarked_at__gte: Annotated[datetime | None, FilterLookup("bookmarked_at__gte")] = None
|
||||
bookmarked_at__lt: Annotated[datetime | None, FilterLookup("bookmarked_at__lt")] = None
|
||||
|
||||
|
||||
@router.get("/snapshots", response=List[SnapshotSchema], url_name="get_snapshots")
|
||||
@router.get("/snapshots", response=list[SnapshotSchema], url_name="get_snapshots")
|
||||
@paginate(CustomPagination)
|
||||
def get_snapshots(request: HttpRequest, filters: Query[SnapshotFilterSchema], with_archiveresults: bool = False):
|
||||
"""List all Snapshot entries matching these filters."""
|
||||
setattr(request, 'with_archiveresults', with_archiveresults)
|
||||
return filters.filter(Snapshot.objects.all()).distinct()
|
||||
setattr(request, "with_archiveresults", with_archiveresults)
|
||||
queryset = Snapshot.objects.annotate(output_size_sum=Coalesce(Sum("archiveresult__output_size"), 0))
|
||||
return filters.filter(queryset).distinct()
|
||||
|
||||
|
||||
@router.get("/snapshot/{snapshot_id}", response=SnapshotSchema, url_name="get_snapshot")
|
||||
def get_snapshot(request: HttpRequest, snapshot_id: str, with_archiveresults: bool = True):
|
||||
"""Get a specific Snapshot by id."""
|
||||
setattr(request, 'with_archiveresults', with_archiveresults)
|
||||
setattr(request, "with_archiveresults", with_archiveresults)
|
||||
queryset = Snapshot.objects.annotate(output_size_sum=Coalesce(Sum("archiveresult__output_size"), 0))
|
||||
try:
|
||||
return Snapshot.objects.get(Q(id__startswith=snapshot_id) | Q(timestamp__startswith=snapshot_id))
|
||||
return queryset.get(Q(id__startswith=snapshot_id) | Q(timestamp__startswith=snapshot_id))
|
||||
except Snapshot.DoesNotExist:
|
||||
return Snapshot.objects.get(Q(id__icontains=snapshot_id))
|
||||
return queryset.get(Q(id__icontains=snapshot_id))
|
||||
|
||||
|
||||
@router.post("/snapshots", response=SnapshotSchema, url_name="create_snapshot")
|
||||
def create_snapshot(request: HttpRequest, data: SnapshotCreateSchema):
|
||||
tags = normalize_tag_list(data.tags)
|
||||
if data.status is not None and data.status not in Snapshot.StatusChoices.values:
|
||||
raise HttpError(400, f'Invalid status: {data.status}')
|
||||
raise HttpError(400, f"Invalid status: {data.status}")
|
||||
if not data.url.strip():
|
||||
raise HttpError(400, 'URL is required')
|
||||
raise HttpError(400, "URL is required")
|
||||
if data.depth not in (0, 1, 2, 3, 4):
|
||||
raise HttpError(400, 'depth must be between 0 and 4')
|
||||
raise HttpError(400, "depth must be between 0 and 4")
|
||||
|
||||
if data.crawl_id:
|
||||
crawl = Crawl.objects.get(id__icontains=data.crawl_id)
|
||||
crawl_tags = normalize_tag_list(crawl.tags_str.split(','))
|
||||
crawl_tags = normalize_tag_list(crawl.tags_str.split(","))
|
||||
tags = tags or crawl_tags
|
||||
else:
|
||||
crawl = Crawl.objects.create(
|
||||
urls=data.url,
|
||||
max_depth=max(data.depth, 0),
|
||||
tags_str=','.join(tags),
|
||||
tags_str=",".join(tags),
|
||||
status=Crawl.StatusChoices.QUEUED,
|
||||
retry_at=timezone.now(),
|
||||
created_by=request.user if isinstance(request.user, User) else None,
|
||||
)
|
||||
|
||||
snapshot_defaults = {
|
||||
'depth': data.depth,
|
||||
'title': data.title,
|
||||
'timestamp': str(timezone.now().timestamp()),
|
||||
'status': data.status or Snapshot.StatusChoices.QUEUED,
|
||||
'retry_at': timezone.now(),
|
||||
"depth": data.depth,
|
||||
"title": data.title,
|
||||
"timestamp": str(timezone.now().timestamp()),
|
||||
"status": data.status or Snapshot.StatusChoices.QUEUED,
|
||||
"retry_at": timezone.now(),
|
||||
}
|
||||
snapshot, _ = Snapshot.objects.get_or_create(
|
||||
url=data.url,
|
||||
@@ -309,17 +365,17 @@ def create_snapshot(request: HttpRequest, data: SnapshotCreateSchema):
|
||||
defaults=snapshot_defaults,
|
||||
)
|
||||
|
||||
update_fields: List[str] = []
|
||||
update_fields: list[str] = []
|
||||
if data.title is not None and snapshot.title != data.title:
|
||||
snapshot.title = data.title
|
||||
update_fields.append('title')
|
||||
update_fields.append("title")
|
||||
if data.status is not None and snapshot.status != data.status:
|
||||
if data.status not in Snapshot.StatusChoices.values:
|
||||
raise HttpError(400, f'Invalid status: {data.status}')
|
||||
raise HttpError(400, f"Invalid status: {data.status}")
|
||||
snapshot.status = data.status
|
||||
update_fields.append('status')
|
||||
update_fields.append("status")
|
||||
if update_fields:
|
||||
update_fields.append('modified_at')
|
||||
update_fields.append("modified_at")
|
||||
snapshot.save(update_fields=update_fields)
|
||||
|
||||
if tags:
|
||||
@@ -330,7 +386,7 @@ def create_snapshot(request: HttpRequest, data: SnapshotCreateSchema):
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
setattr(request, 'with_archiveresults', False)
|
||||
setattr(request, "with_archiveresults", False)
|
||||
return snapshot
|
||||
|
||||
|
||||
@@ -343,26 +399,26 @@ def patch_snapshot(request: HttpRequest, snapshot_id: str, data: SnapshotUpdateS
|
||||
snapshot = Snapshot.objects.get(Q(id__icontains=snapshot_id))
|
||||
|
||||
payload = data.dict(exclude_unset=True)
|
||||
update_fields = ['modified_at']
|
||||
tags = payload.pop('tags', None)
|
||||
update_fields = ["modified_at"]
|
||||
tags = payload.pop("tags", None)
|
||||
|
||||
if 'status' in payload:
|
||||
if payload['status'] not in Snapshot.StatusChoices.values:
|
||||
raise HttpError(400, f'Invalid status: {payload["status"]}')
|
||||
snapshot.status = payload['status']
|
||||
if snapshot.status == Snapshot.StatusChoices.SEALED and 'retry_at' not in payload:
|
||||
if "status" in payload:
|
||||
if payload["status"] not in Snapshot.StatusChoices.values:
|
||||
raise HttpError(400, f"Invalid status: {payload['status']}")
|
||||
snapshot.status = payload["status"]
|
||||
if snapshot.status == Snapshot.StatusChoices.SEALED and "retry_at" not in payload:
|
||||
snapshot.retry_at = None
|
||||
update_fields.append('status')
|
||||
update_fields.append("status")
|
||||
|
||||
if 'retry_at' in payload:
|
||||
snapshot.retry_at = payload['retry_at']
|
||||
update_fields.append('retry_at')
|
||||
if "retry_at" in payload:
|
||||
snapshot.retry_at = payload["retry_at"]
|
||||
update_fields.append("retry_at")
|
||||
|
||||
if tags is not None:
|
||||
snapshot.save_tags(normalize_tag_list(tags))
|
||||
|
||||
snapshot.save(update_fields=update_fields)
|
||||
setattr(request, 'with_archiveresults', False)
|
||||
setattr(request, "with_archiveresults", False)
|
||||
return snapshot
|
||||
|
||||
|
||||
@@ -373,17 +429,18 @@ def delete_snapshot(request: HttpRequest, snapshot_id: str):
|
||||
crawl_id_str = str(snapshot.crawl.pk)
|
||||
deleted_count, _ = snapshot.delete()
|
||||
return {
|
||||
'success': True,
|
||||
'snapshot_id': snapshot_id_str,
|
||||
'crawl_id': crawl_id_str,
|
||||
'deleted_count': deleted_count,
|
||||
"success": True,
|
||||
"snapshot_id": snapshot_id_str,
|
||||
"crawl_id": crawl_id_str,
|
||||
"deleted_count": deleted_count,
|
||||
}
|
||||
|
||||
|
||||
### Tag #########################################################################
|
||||
|
||||
|
||||
class TagSchema(Schema):
|
||||
TYPE: str = 'core.models.Tag'
|
||||
TYPE: str = "core.models.Tag"
|
||||
id: int
|
||||
modified_at: datetime
|
||||
created_at: datetime
|
||||
@@ -392,7 +449,7 @@ class TagSchema(Schema):
|
||||
name: str
|
||||
slug: str
|
||||
num_snapshots: int
|
||||
snapshots: List[SnapshotSchema]
|
||||
snapshots: list[SnapshotSchema]
|
||||
|
||||
@staticmethod
|
||||
def resolve_created_by_id(obj):
|
||||
@@ -402,7 +459,7 @@ class TagSchema(Schema):
|
||||
def resolve_created_by_username(obj):
|
||||
user_model = get_user_model()
|
||||
user = user_model.objects.get(id=obj.created_by_id)
|
||||
username = getattr(user, 'username', None)
|
||||
username = getattr(user, "username", None)
|
||||
return username if isinstance(username, str) else str(user)
|
||||
|
||||
@staticmethod
|
||||
@@ -411,58 +468,67 @@ class TagSchema(Schema):
|
||||
|
||||
@staticmethod
|
||||
def resolve_snapshots(obj, context):
|
||||
if bool(getattr(context['request'], 'with_snapshots', False)):
|
||||
if bool(getattr(context["request"], "with_snapshots", False)):
|
||||
return obj.snapshot_set.all().distinct()
|
||||
return Snapshot.objects.none()
|
||||
|
||||
|
||||
@router.get("/tags", response=List[TagSchema], url_name="get_tags")
|
||||
@router.get("/tags", response=list[TagSchema], url_name="get_tags")
|
||||
@paginate(CustomPagination)
|
||||
def get_tags(request: HttpRequest):
|
||||
setattr(request, 'with_snapshots', False)
|
||||
setattr(request, 'with_archiveresults', False)
|
||||
setattr(request, "with_snapshots", False)
|
||||
setattr(request, "with_archiveresults", False)
|
||||
return get_matching_tags()
|
||||
|
||||
|
||||
@router.get("/tag/{tag_id}", response=TagSchema, url_name="get_tag")
|
||||
def get_tag(request: HttpRequest, tag_id: str, with_snapshots: bool = True):
|
||||
setattr(request, 'with_snapshots', with_snapshots)
|
||||
setattr(request, 'with_archiveresults', False)
|
||||
setattr(request, "with_snapshots", with_snapshots)
|
||||
setattr(request, "with_archiveresults", False)
|
||||
try:
|
||||
return get_tag_by_ref(tag_id)
|
||||
except (Tag.DoesNotExist, ValidationError):
|
||||
raise HttpError(404, 'Tag not found')
|
||||
raise HttpError(404, "Tag not found")
|
||||
|
||||
|
||||
@router.get("/any/{id}", response=Union[SnapshotSchema, ArchiveResultSchema, TagSchema, CrawlSchema], url_name="get_any", summary="Get any object by its ID")
|
||||
@router.get(
|
||||
"/any/{id}",
|
||||
response=Union[SnapshotSchema, ArchiveResultSchema, TagSchema, CrawlSchema],
|
||||
url_name="get_any",
|
||||
summary="Get any object by its ID",
|
||||
)
|
||||
def get_any(request: HttpRequest, id: str):
|
||||
"""Get any object by its ID (e.g. snapshot, archiveresult, tag, crawl, etc.)."""
|
||||
setattr(request, 'with_snapshots', False)
|
||||
setattr(request, 'with_archiveresults', False)
|
||||
setattr(request, "with_snapshots", False)
|
||||
setattr(request, "with_archiveresults", False)
|
||||
|
||||
for getter in [get_snapshot, get_archiveresult, get_tag]:
|
||||
try:
|
||||
response = getter(request, id)
|
||||
if isinstance(response, Model):
|
||||
return redirect(f"/api/v1/{response._meta.app_label}/{response._meta.model_name}/{response.pk}?{request.META['QUERY_STRING']}")
|
||||
return redirect(
|
||||
f"/api/v1/{response._meta.app_label}/{response._meta.model_name}/{response.pk}?{request.META['QUERY_STRING']}",
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
try:
|
||||
from archivebox.api.v1_crawls import get_crawl
|
||||
|
||||
response = get_crawl(request, id)
|
||||
if isinstance(response, Model):
|
||||
return redirect(f"/api/v1/{response._meta.app_label}/{response._meta.model_name}/{response.pk}?{request.META['QUERY_STRING']}")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
raise HttpError(404, 'Object with given ID not found')
|
||||
raise HttpError(404, "Object with given ID not found")
|
||||
|
||||
|
||||
### Tag Editor API Endpoints #########################################################################
|
||||
|
||||
|
||||
class TagAutocompleteSchema(Schema):
|
||||
tags: List[dict]
|
||||
tags: list[dict]
|
||||
|
||||
|
||||
class TagCreateSchema(Schema):
|
||||
@@ -483,7 +549,7 @@ class TagSearchSnapshotSchema(Schema):
|
||||
favicon_url: str
|
||||
admin_url: str
|
||||
archive_url: str
|
||||
downloaded_at: Optional[str] = None
|
||||
downloaded_at: str | None = None
|
||||
|
||||
|
||||
class TagSearchCardSchema(Schema):
|
||||
@@ -497,11 +563,11 @@ class TagSearchCardSchema(Schema):
|
||||
export_jsonl_url: str
|
||||
rename_url: str
|
||||
delete_url: str
|
||||
snapshots: List[TagSearchSnapshotSchema]
|
||||
snapshots: list[TagSearchSnapshotSchema]
|
||||
|
||||
|
||||
class TagSearchResponseSchema(Schema):
|
||||
tags: List[TagSearchCardSchema]
|
||||
tags: list[TagSearchCardSchema]
|
||||
sort: str
|
||||
created_by: str
|
||||
year: str
|
||||
@@ -527,8 +593,8 @@ class TagDeleteResponseSchema(Schema):
|
||||
|
||||
class TagSnapshotRequestSchema(Schema):
|
||||
snapshot_id: str
|
||||
tag_name: Optional[str] = None
|
||||
tag_id: Optional[int] = None
|
||||
tag_name: str | None = None
|
||||
tag_id: int | None = None
|
||||
|
||||
|
||||
class TagSnapshotResponseSchema(Schema):
|
||||
@@ -541,10 +607,10 @@ class TagSnapshotResponseSchema(Schema):
|
||||
def search_tags(
|
||||
request: HttpRequest,
|
||||
q: str = "",
|
||||
sort: str = 'created_desc',
|
||||
created_by: str = '',
|
||||
year: str = '',
|
||||
has_snapshots: str = 'all',
|
||||
sort: str = "created_desc",
|
||||
created_by: str = "",
|
||||
year: str = "",
|
||||
has_snapshots: str = "all",
|
||||
):
|
||||
"""Return detailed tag cards for admin/live-search UIs."""
|
||||
normalized_sort = normalize_tag_sort(sort)
|
||||
@@ -552,7 +618,7 @@ def search_tags(
|
||||
normalized_year = normalize_created_year_filter(year)
|
||||
normalized_has_snapshots = normalize_has_snapshots_filter(has_snapshots)
|
||||
return {
|
||||
'tags': build_tag_cards(
|
||||
"tags": build_tag_cards(
|
||||
query=q,
|
||||
request=request,
|
||||
sort=normalized_sort,
|
||||
@@ -560,28 +626,28 @@ def search_tags(
|
||||
year=normalized_year,
|
||||
has_snapshots=normalized_has_snapshots,
|
||||
),
|
||||
'sort': normalized_sort,
|
||||
'created_by': normalized_created_by,
|
||||
'year': normalized_year,
|
||||
'has_snapshots': normalized_has_snapshots,
|
||||
"sort": normalized_sort,
|
||||
"created_by": normalized_created_by,
|
||||
"year": normalized_year,
|
||||
"has_snapshots": normalized_has_snapshots,
|
||||
}
|
||||
|
||||
|
||||
def _public_tag_listing_enabled() -> bool:
|
||||
explicit = getattr(settings, 'PUBLIC_SNAPSHOTS_LIST', None)
|
||||
explicit = getattr(settings, "PUBLIC_SNAPSHOTS_LIST", None)
|
||||
if explicit is not None:
|
||||
return bool(explicit)
|
||||
return bool(getattr(settings, 'PUBLIC_INDEX', SERVER_CONFIG.PUBLIC_INDEX))
|
||||
return bool(getattr(settings, "PUBLIC_INDEX", SERVER_CONFIG.PUBLIC_INDEX))
|
||||
|
||||
|
||||
def _request_has_tag_autocomplete_access(request: HttpRequest) -> bool:
|
||||
user = getattr(request, 'user', None)
|
||||
if getattr(user, 'is_authenticated', False):
|
||||
user = getattr(request, "user", None)
|
||||
if getattr(user, "is_authenticated", False):
|
||||
return True
|
||||
|
||||
token = request.GET.get('api_key') or request.headers.get('X-ArchiveBox-API-Key')
|
||||
auth_header = request.headers.get('Authorization', '')
|
||||
if not token and auth_header.lower().startswith('bearer '):
|
||||
token = request.GET.get("api_key") or request.headers.get("X-ArchiveBox-API-Key")
|
||||
auth_header = request.headers.get("Authorization", "")
|
||||
if not token and auth_header.lower().startswith("bearer "):
|
||||
token = auth_header.split(None, 1)[1].strip()
|
||||
|
||||
if token and auth_using_token(token=token, request=request):
|
||||
@@ -594,12 +660,12 @@ def _request_has_tag_autocomplete_access(request: HttpRequest) -> bool:
|
||||
def tags_autocomplete(request: HttpRequest, q: str = ""):
|
||||
"""Return tags matching the query for autocomplete."""
|
||||
if not _request_has_tag_autocomplete_access(request):
|
||||
raise HttpError(401, 'Authentication required')
|
||||
raise HttpError(401, "Authentication required")
|
||||
|
||||
tags = get_matching_tags(q)[:50 if not q else 20]
|
||||
tags = get_matching_tags(q)[: 50 if not q else 20]
|
||||
|
||||
return {
|
||||
'tags': [{'id': tag.pk, 'name': tag.name, 'slug': tag.slug, 'num_snapshots': getattr(tag, 'num_snapshots', 0)} for tag in tags]
|
||||
"tags": [{"id": tag.pk, "name": tag.name, "slug": tag.slug, "num_snapshots": getattr(tag, "num_snapshots", 0)} for tag in tags],
|
||||
}
|
||||
|
||||
|
||||
@@ -615,10 +681,10 @@ def tags_create(request: HttpRequest, data: TagCreateSchema):
|
||||
raise HttpError(400, str(err)) from err
|
||||
|
||||
return {
|
||||
'success': True,
|
||||
'tag_id': tag.pk,
|
||||
'tag_name': tag.name,
|
||||
'created': created,
|
||||
"success": True,
|
||||
"tag_id": tag.pk,
|
||||
"tag_name": tag.name,
|
||||
"created": created,
|
||||
}
|
||||
|
||||
|
||||
@@ -627,15 +693,15 @@ def rename_tag(request: HttpRequest, tag_id: int, data: TagUpdateSchema):
|
||||
try:
|
||||
tag = rename_tag_record(get_tag_by_ref(tag_id), data.name)
|
||||
except Tag.DoesNotExist as err:
|
||||
raise HttpError(404, 'Tag not found') from err
|
||||
raise HttpError(404, "Tag not found") from err
|
||||
except ValueError as err:
|
||||
raise HttpError(400, str(err)) from err
|
||||
|
||||
return {
|
||||
'success': True,
|
||||
'tag_id': tag.pk,
|
||||
'tag_name': tag.name,
|
||||
'slug': tag.slug,
|
||||
"success": True,
|
||||
"tag_id": tag.pk,
|
||||
"tag_name": tag.name,
|
||||
"slug": tag.slug,
|
||||
}
|
||||
|
||||
|
||||
@@ -644,13 +710,13 @@ def delete_tag(request: HttpRequest, tag_id: int):
|
||||
try:
|
||||
tag = get_tag_by_ref(tag_id)
|
||||
except Tag.DoesNotExist as err:
|
||||
raise HttpError(404, 'Tag not found') from err
|
||||
raise HttpError(404, "Tag not found") from err
|
||||
|
||||
deleted_count, _ = delete_tag_record(tag)
|
||||
return {
|
||||
'success': True,
|
||||
'tag_id': int(tag_id),
|
||||
'deleted_count': deleted_count,
|
||||
"success": True,
|
||||
"tag_id": int(tag_id),
|
||||
"deleted_count": deleted_count,
|
||||
}
|
||||
|
||||
|
||||
@@ -659,10 +725,10 @@ def tag_urls_export(request: HttpRequest, tag_id: int):
|
||||
try:
|
||||
tag = get_tag_by_ref(tag_id)
|
||||
except Tag.DoesNotExist as err:
|
||||
raise HttpError(404, 'Tag not found') from err
|
||||
raise HttpError(404, "Tag not found") from err
|
||||
|
||||
response = HttpResponse(export_tag_urls(tag), content_type='text/plain; charset=utf-8')
|
||||
response['Content-Disposition'] = f'attachment; filename="tag-{tag.slug}-urls.txt"'
|
||||
response = HttpResponse(export_tag_urls(tag), content_type="text/plain; charset=utf-8")
|
||||
response["Content-Disposition"] = f'attachment; filename="tag-{tag.slug}-urls.txt"'
|
||||
return response
|
||||
|
||||
|
||||
@@ -671,10 +737,10 @@ def tag_snapshots_export(request: HttpRequest, tag_id: int):
|
||||
try:
|
||||
tag = get_tag_by_ref(tag_id)
|
||||
except Tag.DoesNotExist as err:
|
||||
raise HttpError(404, 'Tag not found') from err
|
||||
raise HttpError(404, "Tag not found") from err
|
||||
|
||||
response = HttpResponse(export_tag_snapshots_jsonl(tag), content_type='application/x-ndjson; charset=utf-8')
|
||||
response['Content-Disposition'] = f'attachment; filename="tag-{tag.slug}-snapshots.jsonl"'
|
||||
response = HttpResponse(export_tag_snapshots_jsonl(tag), content_type="application/x-ndjson; charset=utf-8")
|
||||
response["Content-Disposition"] = f'attachment; filename="tag-{tag.slug}-snapshots.jsonl"'
|
||||
return response
|
||||
|
||||
|
||||
@@ -684,16 +750,16 @@ def tags_add_to_snapshot(request: HttpRequest, data: TagSnapshotRequestSchema):
|
||||
# Get the snapshot
|
||||
try:
|
||||
snapshot = Snapshot.objects.get(
|
||||
Q(id__startswith=data.snapshot_id) | Q(timestamp__startswith=data.snapshot_id)
|
||||
Q(id__startswith=data.snapshot_id) | Q(timestamp__startswith=data.snapshot_id),
|
||||
)
|
||||
except Snapshot.DoesNotExist:
|
||||
raise HttpError(404, 'Snapshot not found')
|
||||
raise HttpError(404, "Snapshot not found")
|
||||
except Snapshot.MultipleObjectsReturned:
|
||||
snapshot = Snapshot.objects.filter(
|
||||
Q(id__startswith=data.snapshot_id) | Q(timestamp__startswith=data.snapshot_id)
|
||||
Q(id__startswith=data.snapshot_id) | Q(timestamp__startswith=data.snapshot_id),
|
||||
).first()
|
||||
if snapshot is None:
|
||||
raise HttpError(404, 'Snapshot not found')
|
||||
raise HttpError(404, "Snapshot not found")
|
||||
|
||||
# Get or create the tag
|
||||
if data.tag_name:
|
||||
@@ -708,17 +774,17 @@ def tags_add_to_snapshot(request: HttpRequest, data: TagSnapshotRequestSchema):
|
||||
try:
|
||||
tag = get_tag_by_ref(data.tag_id)
|
||||
except Tag.DoesNotExist:
|
||||
raise HttpError(404, 'Tag not found')
|
||||
raise HttpError(404, "Tag not found")
|
||||
else:
|
||||
raise HttpError(400, 'Either tag_name or tag_id is required')
|
||||
raise HttpError(400, "Either tag_name or tag_id is required")
|
||||
|
||||
# Add the tag to the snapshot
|
||||
snapshot.tags.add(tag.pk)
|
||||
|
||||
return {
|
||||
'success': True,
|
||||
'tag_id': tag.pk,
|
||||
'tag_name': tag.name,
|
||||
"success": True,
|
||||
"tag_id": tag.pk,
|
||||
"tag_name": tag.name,
|
||||
}
|
||||
|
||||
|
||||
@@ -728,36 +794,36 @@ def tags_remove_from_snapshot(request: HttpRequest, data: TagSnapshotRequestSche
|
||||
# Get the snapshot
|
||||
try:
|
||||
snapshot = Snapshot.objects.get(
|
||||
Q(id__startswith=data.snapshot_id) | Q(timestamp__startswith=data.snapshot_id)
|
||||
Q(id__startswith=data.snapshot_id) | Q(timestamp__startswith=data.snapshot_id),
|
||||
)
|
||||
except Snapshot.DoesNotExist:
|
||||
raise HttpError(404, 'Snapshot not found')
|
||||
raise HttpError(404, "Snapshot not found")
|
||||
except Snapshot.MultipleObjectsReturned:
|
||||
snapshot = Snapshot.objects.filter(
|
||||
Q(id__startswith=data.snapshot_id) | Q(timestamp__startswith=data.snapshot_id)
|
||||
Q(id__startswith=data.snapshot_id) | Q(timestamp__startswith=data.snapshot_id),
|
||||
).first()
|
||||
if snapshot is None:
|
||||
raise HttpError(404, 'Snapshot not found')
|
||||
raise HttpError(404, "Snapshot not found")
|
||||
|
||||
# Get the tag
|
||||
if data.tag_id:
|
||||
try:
|
||||
tag = Tag.objects.get(pk=data.tag_id)
|
||||
except Tag.DoesNotExist:
|
||||
raise HttpError(404, 'Tag not found')
|
||||
raise HttpError(404, "Tag not found")
|
||||
elif data.tag_name:
|
||||
try:
|
||||
tag = Tag.objects.get(name__iexact=data.tag_name.strip())
|
||||
except Tag.DoesNotExist:
|
||||
raise HttpError(404, 'Tag not found')
|
||||
raise HttpError(404, "Tag not found")
|
||||
else:
|
||||
raise HttpError(400, 'Either tag_name or tag_id is required')
|
||||
raise HttpError(400, "Either tag_name or tag_id is required")
|
||||
|
||||
# Remove the tag from the snapshot
|
||||
snapshot.tags.remove(tag.pk)
|
||||
|
||||
return {
|
||||
'success': True,
|
||||
'tag_id': tag.pk,
|
||||
'tag_name': tag.name,
|
||||
"success": True,
|
||||
"tag_id": tag.pk,
|
||||
"tag_name": tag.name,
|
||||
}
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
__package__ = 'archivebox.api'
|
||||
__package__ = "archivebox.api"
|
||||
|
||||
from uuid import UUID
|
||||
from typing import List, Optional
|
||||
from datetime import datetime
|
||||
from django.http import HttpRequest
|
||||
from django.utils import timezone
|
||||
@@ -17,11 +16,11 @@ from archivebox.crawls.models import Crawl
|
||||
|
||||
from .auth import API_AUTH_METHODS
|
||||
|
||||
router = Router(tags=['Crawl Models'], auth=API_AUTH_METHODS)
|
||||
router = Router(tags=["Crawl Models"], auth=API_AUTH_METHODS)
|
||||
|
||||
|
||||
class CrawlSchema(Schema):
|
||||
TYPE: str = 'crawls.models.Crawl'
|
||||
TYPE: str = "crawls.models.Crawl"
|
||||
|
||||
id: UUID
|
||||
|
||||
@@ -35,6 +34,8 @@ class CrawlSchema(Schema):
|
||||
|
||||
urls: str
|
||||
max_depth: int
|
||||
max_urls: int
|
||||
max_size: int
|
||||
tags_str: str
|
||||
config: dict
|
||||
|
||||
@@ -48,12 +49,12 @@ class CrawlSchema(Schema):
|
||||
def resolve_created_by_username(obj):
|
||||
user_model = get_user_model()
|
||||
user = user_model.objects.get(id=obj.created_by_id)
|
||||
username = getattr(user, 'username', None)
|
||||
username = getattr(user, "username", None)
|
||||
return username if isinstance(username, str) else str(user)
|
||||
|
||||
@staticmethod
|
||||
def resolve_snapshots(obj, context):
|
||||
if bool(getattr(context['request'], 'with_snapshots', False)):
|
||||
if bool(getattr(context["request"], "with_snapshots", False)):
|
||||
return obj.snapshot_set.all().distinct()
|
||||
return Snapshot.objects.none()
|
||||
|
||||
@@ -61,17 +62,19 @@ class CrawlSchema(Schema):
|
||||
class CrawlUpdateSchema(Schema):
|
||||
status: str | None = None
|
||||
retry_at: datetime | None = None
|
||||
tags: Optional[List[str]] = None
|
||||
tags: list[str] | None = None
|
||||
tags_str: str | None = None
|
||||
|
||||
|
||||
class CrawlCreateSchema(Schema):
|
||||
urls: List[str]
|
||||
urls: list[str]
|
||||
max_depth: int = 0
|
||||
tags: Optional[List[str]] = None
|
||||
tags_str: str = ''
|
||||
label: str = ''
|
||||
notes: str = ''
|
||||
max_urls: int = 0
|
||||
max_size: int = 0
|
||||
tags: list[str] | None = None
|
||||
tags_str: str = ""
|
||||
label: str = ""
|
||||
notes: str = ""
|
||||
config: dict = {}
|
||||
|
||||
|
||||
@@ -82,13 +85,13 @@ class CrawlDeleteResponseSchema(Schema):
|
||||
deleted_snapshots: int
|
||||
|
||||
|
||||
def normalize_tag_list(tags: Optional[List[str]] = None, tags_str: str = '') -> List[str]:
|
||||
def normalize_tag_list(tags: list[str] | None = None, tags_str: str = "") -> list[str]:
|
||||
if tags is not None:
|
||||
return [tag.strip() for tag in tags if tag and tag.strip()]
|
||||
return [tag.strip() for tag in tags_str.split(',') if tag.strip()]
|
||||
return [tag.strip() for tag in tags_str.split(",") if tag.strip()]
|
||||
|
||||
|
||||
@router.get("/crawls", response=List[CrawlSchema], url_name="get_crawls")
|
||||
@router.get("/crawls", response=list[CrawlSchema], url_name="get_crawls")
|
||||
def get_crawls(request: HttpRequest):
|
||||
return Crawl.objects.all().distinct()
|
||||
|
||||
@@ -97,15 +100,21 @@ def get_crawls(request: HttpRequest):
|
||||
def create_crawl(request: HttpRequest, data: CrawlCreateSchema):
|
||||
urls = [url.strip() for url in data.urls if url and url.strip()]
|
||||
if not urls:
|
||||
raise HttpError(400, 'At least one URL is required')
|
||||
raise HttpError(400, "At least one URL is required")
|
||||
if data.max_depth not in (0, 1, 2, 3, 4):
|
||||
raise HttpError(400, 'max_depth must be between 0 and 4')
|
||||
raise HttpError(400, "max_depth must be between 0 and 4")
|
||||
if data.max_urls < 0:
|
||||
raise HttpError(400, "max_urls must be >= 0")
|
||||
if data.max_size < 0:
|
||||
raise HttpError(400, "max_size must be >= 0")
|
||||
|
||||
tags = normalize_tag_list(data.tags, data.tags_str)
|
||||
crawl = Crawl.objects.create(
|
||||
urls='\n'.join(urls),
|
||||
urls="\n".join(urls),
|
||||
max_depth=data.max_depth,
|
||||
tags_str=','.join(tags),
|
||||
max_urls=data.max_urls,
|
||||
max_size=data.max_size,
|
||||
tags_str=",".join(tags),
|
||||
label=data.label,
|
||||
notes=data.notes,
|
||||
config=data.config,
|
||||
@@ -116,25 +125,26 @@ def create_crawl(request: HttpRequest, data: CrawlCreateSchema):
|
||||
crawl.create_snapshots_from_urls()
|
||||
return crawl
|
||||
|
||||
|
||||
@router.get("/crawl/{crawl_id}", response=CrawlSchema | str, url_name="get_crawl")
|
||||
def get_crawl(request: HttpRequest, crawl_id: str, as_rss: bool=False, with_snapshots: bool=False, with_archiveresults: bool=False):
|
||||
def get_crawl(request: HttpRequest, crawl_id: str, as_rss: bool = False, with_snapshots: bool = False, with_archiveresults: bool = False):
|
||||
"""Get a specific Crawl by id."""
|
||||
setattr(request, 'with_snapshots', with_snapshots)
|
||||
setattr(request, 'with_archiveresults', with_archiveresults)
|
||||
setattr(request, "with_snapshots", with_snapshots)
|
||||
setattr(request, "with_archiveresults", with_archiveresults)
|
||||
crawl = Crawl.objects.get(id__icontains=crawl_id)
|
||||
|
||||
|
||||
if crawl and as_rss:
|
||||
# return snapshots as XML rss feed
|
||||
urls = [
|
||||
{'url': snapshot.url, 'title': snapshot.title, 'bookmarked_at': snapshot.bookmarked_at, 'tags': snapshot.tags_str}
|
||||
{"url": snapshot.url, "title": snapshot.title, "bookmarked_at": snapshot.bookmarked_at, "tags": snapshot.tags_str}
|
||||
for snapshot in crawl.snapshot_set.all()
|
||||
]
|
||||
xml = '<rss version="2.0"><channel>'
|
||||
for url in urls:
|
||||
xml += f'<item><url>{url["url"]}</url><title>{url["title"]}</title><bookmarked_at>{url["bookmarked_at"]}</bookmarked_at><tags>{url["tags"]}</tags></item>'
|
||||
xml += '</channel></rss>'
|
||||
xml += f"<item><url>{url['url']}</url><title>{url['title']}</title><bookmarked_at>{url['bookmarked_at']}</bookmarked_at><tags>{url['tags']}</tags></item>"
|
||||
xml += "</channel></rss>"
|
||||
return xml
|
||||
|
||||
|
||||
return crawl
|
||||
|
||||
|
||||
@@ -143,29 +153,29 @@ def patch_crawl(request: HttpRequest, crawl_id: str, data: CrawlUpdateSchema):
|
||||
"""Update a crawl (e.g., set status=sealed to cancel queued work)."""
|
||||
crawl = Crawl.objects.get(id__icontains=crawl_id)
|
||||
payload = data.dict(exclude_unset=True)
|
||||
update_fields = ['modified_at']
|
||||
update_fields = ["modified_at"]
|
||||
|
||||
tags = payload.pop('tags', None)
|
||||
tags_str = payload.pop('tags_str', None)
|
||||
tags = payload.pop("tags", None)
|
||||
tags_str = payload.pop("tags_str", None)
|
||||
if tags is not None or tags_str is not None:
|
||||
crawl.tags_str = ','.join(normalize_tag_list(tags, tags_str or ''))
|
||||
update_fields.append('tags_str')
|
||||
crawl.tags_str = ",".join(normalize_tag_list(tags, tags_str or ""))
|
||||
update_fields.append("tags_str")
|
||||
|
||||
if 'status' in payload:
|
||||
if payload['status'] not in Crawl.StatusChoices.values:
|
||||
raise HttpError(400, f'Invalid status: {payload["status"]}')
|
||||
crawl.status = payload['status']
|
||||
if crawl.status == Crawl.StatusChoices.SEALED and 'retry_at' not in payload:
|
||||
if "status" in payload:
|
||||
if payload["status"] not in Crawl.StatusChoices.values:
|
||||
raise HttpError(400, f"Invalid status: {payload['status']}")
|
||||
crawl.status = payload["status"]
|
||||
if crawl.status == Crawl.StatusChoices.SEALED and "retry_at" not in payload:
|
||||
crawl.retry_at = None
|
||||
update_fields.append('status')
|
||||
update_fields.append("status")
|
||||
|
||||
if 'retry_at' in payload:
|
||||
crawl.retry_at = payload['retry_at']
|
||||
update_fields.append('retry_at')
|
||||
if "retry_at" in payload:
|
||||
crawl.retry_at = payload["retry_at"]
|
||||
update_fields.append("retry_at")
|
||||
|
||||
crawl.save(update_fields=update_fields)
|
||||
|
||||
if payload.get('status') == Crawl.StatusChoices.SEALED:
|
||||
if payload.get("status") == Crawl.StatusChoices.SEALED:
|
||||
Snapshot.objects.filter(
|
||||
crawl=crawl,
|
||||
status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED],
|
||||
@@ -184,8 +194,8 @@ def delete_crawl(request: HttpRequest, crawl_id: str):
|
||||
snapshot_count = crawl.snapshot_set.count()
|
||||
deleted_count, _ = crawl.delete()
|
||||
return {
|
||||
'success': True,
|
||||
'crawl_id': crawl_id_str,
|
||||
'deleted_count': deleted_count,
|
||||
'deleted_snapshots': snapshot_count,
|
||||
"success": True,
|
||||
"crawl_id": crawl_id_str,
|
||||
"deleted_count": deleted_count,
|
||||
"deleted_snapshots": snapshot_count,
|
||||
}
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
__package__ = 'archivebox.api'
|
||||
__package__ = "archivebox.api"
|
||||
|
||||
from uuid import UUID
|
||||
from typing import Annotated, List, Optional
|
||||
from typing import Annotated
|
||||
from datetime import datetime
|
||||
|
||||
from django.http import HttpRequest
|
||||
@@ -12,16 +12,18 @@ from ninja.pagination import paginate
|
||||
from archivebox.api.v1_core import CustomPagination
|
||||
|
||||
|
||||
router = Router(tags=['Machine and Dependencies'])
|
||||
router = Router(tags=["Machine and Dependencies"])
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Machine Schemas
|
||||
# ============================================================================
|
||||
|
||||
|
||||
class MachineSchema(Schema):
|
||||
"""Schema for Machine model."""
|
||||
TYPE: str = 'machine.Machine'
|
||||
|
||||
TYPE: str = "machine.Machine"
|
||||
id: UUID
|
||||
created_at: datetime
|
||||
modified_at: datetime
|
||||
@@ -43,22 +45,24 @@ class MachineSchema(Schema):
|
||||
|
||||
|
||||
class MachineFilterSchema(FilterSchema):
|
||||
id: Annotated[Optional[str], FilterLookup('id__startswith')] = None
|
||||
hostname: Annotated[Optional[str], FilterLookup('hostname__icontains')] = None
|
||||
os_platform: Annotated[Optional[str], FilterLookup('os_platform__icontains')] = None
|
||||
os_arch: Annotated[Optional[str], FilterLookup('os_arch')] = None
|
||||
hw_in_docker: Annotated[Optional[bool], FilterLookup('hw_in_docker')] = None
|
||||
hw_in_vm: Annotated[Optional[bool], FilterLookup('hw_in_vm')] = None
|
||||
bin_providers: Annotated[Optional[str], FilterLookup('bin_providers__icontains')] = None
|
||||
id: Annotated[str | None, FilterLookup("id__startswith")] = None
|
||||
hostname: Annotated[str | None, FilterLookup("hostname__icontains")] = None
|
||||
os_platform: Annotated[str | None, FilterLookup("os_platform__icontains")] = None
|
||||
os_arch: Annotated[str | None, FilterLookup("os_arch")] = None
|
||||
hw_in_docker: Annotated[bool | None, FilterLookup("hw_in_docker")] = None
|
||||
hw_in_vm: Annotated[bool | None, FilterLookup("hw_in_vm")] = None
|
||||
bin_providers: Annotated[str | None, FilterLookup("bin_providers__icontains")] = None
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Binary Schemas
|
||||
# ============================================================================
|
||||
|
||||
|
||||
class BinarySchema(Schema):
|
||||
"""Schema for Binary model."""
|
||||
TYPE: str = 'machine.Binary'
|
||||
|
||||
TYPE: str = "machine.Binary"
|
||||
id: UUID
|
||||
created_at: datetime
|
||||
modified_at: datetime
|
||||
@@ -85,23 +89,25 @@ class BinarySchema(Schema):
|
||||
|
||||
|
||||
class BinaryFilterSchema(FilterSchema):
|
||||
id: Annotated[Optional[str], FilterLookup('id__startswith')] = None
|
||||
name: Annotated[Optional[str], FilterLookup('name__icontains')] = None
|
||||
binprovider: Annotated[Optional[str], FilterLookup('binprovider')] = None
|
||||
status: Annotated[Optional[str], FilterLookup('status')] = None
|
||||
machine_id: Annotated[Optional[str], FilterLookup('machine_id__startswith')] = None
|
||||
version: Annotated[Optional[str], FilterLookup('version__icontains')] = None
|
||||
id: Annotated[str | None, FilterLookup("id__startswith")] = None
|
||||
name: Annotated[str | None, FilterLookup("name__icontains")] = None
|
||||
binprovider: Annotated[str | None, FilterLookup("binprovider")] = None
|
||||
status: Annotated[str | None, FilterLookup("status")] = None
|
||||
machine_id: Annotated[str | None, FilterLookup("machine_id__startswith")] = None
|
||||
version: Annotated[str | None, FilterLookup("version__icontains")] = None
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Machine Endpoints
|
||||
# ============================================================================
|
||||
|
||||
@router.get("/machines", response=List[MachineSchema], url_name="get_machines")
|
||||
|
||||
@router.get("/machines", response=list[MachineSchema], url_name="get_machines")
|
||||
@paginate(CustomPagination)
|
||||
def get_machines(request: HttpRequest, filters: Query[MachineFilterSchema]):
|
||||
"""List all machines."""
|
||||
from archivebox.machine.models import Machine
|
||||
|
||||
return filters.filter(Machine.objects.all()).distinct()
|
||||
|
||||
|
||||
@@ -109,6 +115,7 @@ def get_machines(request: HttpRequest, filters: Query[MachineFilterSchema]):
|
||||
def get_current_machine(request: HttpRequest):
|
||||
"""Get the current machine."""
|
||||
from archivebox.machine.models import Machine
|
||||
|
||||
return Machine.current()
|
||||
|
||||
|
||||
@@ -117,6 +124,7 @@ def get_machine(request: HttpRequest, machine_id: str):
|
||||
"""Get a specific machine by ID."""
|
||||
from archivebox.machine.models import Machine
|
||||
from django.db.models import Q
|
||||
|
||||
return Machine.objects.get(Q(id__startswith=machine_id) | Q(hostname__iexact=machine_id))
|
||||
|
||||
|
||||
@@ -127,23 +135,27 @@ def get_machine(request: HttpRequest, machine_id: str):
|
||||
# Binary Endpoints
|
||||
# ============================================================================
|
||||
|
||||
@router.get("/binaries", response=List[BinarySchema], url_name="get_binaries")
|
||||
|
||||
@router.get("/binaries", response=list[BinarySchema], url_name="get_binaries")
|
||||
@paginate(CustomPagination)
|
||||
def get_binaries(request: HttpRequest, filters: Query[BinaryFilterSchema]):
|
||||
"""List all binaries."""
|
||||
from archivebox.machine.models import Binary
|
||||
return filters.filter(Binary.objects.all().select_related('machine')).distinct()
|
||||
|
||||
return filters.filter(Binary.objects.all().select_related("machine")).distinct()
|
||||
|
||||
|
||||
@router.get("/binary/{binary_id}", response=BinarySchema, url_name="get_binary")
|
||||
def get_binary(request: HttpRequest, binary_id: str):
|
||||
"""Get a specific binary by ID."""
|
||||
from archivebox.machine.models import Binary
|
||||
return Binary.objects.select_related('machine').get(id__startswith=binary_id)
|
||||
|
||||
return Binary.objects.select_related("machine").get(id__startswith=binary_id)
|
||||
|
||||
|
||||
@router.get("/binary/by-name/{name}", response=List[BinarySchema], url_name="get_binaries_by_name")
|
||||
@router.get("/binary/by-name/{name}", response=list[BinarySchema], url_name="get_binaries_by_name")
|
||||
def get_binaries_by_name(request: HttpRequest, name: str):
|
||||
"""Get all binaries with the given name."""
|
||||
from archivebox.machine.models import Binary
|
||||
return list(Binary.objects.filter(name__iexact=name).select_related('machine'))
|
||||
|
||||
return list(Binary.objects.filter(name__iexact=name).select_related("machine"))
|
||||
|
||||
@@ -1 +1 @@
|
||||
__package__ = 'archivebox.base_models'
|
||||
__package__ = "archivebox.base_models"
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
"""Base admin classes for models using UUIDv7."""
|
||||
|
||||
__package__ = 'archivebox.base_models'
|
||||
__package__ = "archivebox.base_models"
|
||||
|
||||
import json
|
||||
from collections.abc import Mapping
|
||||
@@ -32,11 +32,12 @@ class KeyValueWidget(forms.Widget):
|
||||
with + and - buttons to add/remove rows.
|
||||
Includes autocomplete for available config keys from the plugin system.
|
||||
"""
|
||||
|
||||
template_name = "" # We render manually
|
||||
|
||||
class Media:
|
||||
css = {
|
||||
'all': []
|
||||
"all": [],
|
||||
}
|
||||
js = []
|
||||
|
||||
@@ -44,17 +45,18 @@ class KeyValueWidget(forms.Widget):
|
||||
"""Get available config options from plugins."""
|
||||
try:
|
||||
from archivebox.hooks import discover_plugin_configs
|
||||
|
||||
plugin_configs = discover_plugin_configs()
|
||||
options: dict[str, ConfigOption] = {}
|
||||
for plugin_name, schema in plugin_configs.items():
|
||||
for key, prop in schema.get('properties', {}).items():
|
||||
for key, prop in schema.get("properties", {}).items():
|
||||
option: ConfigOption = {
|
||||
'plugin': plugin_name,
|
||||
'type': prop.get('type', 'string'),
|
||||
'default': prop.get('default', ''),
|
||||
'description': prop.get('description', ''),
|
||||
"plugin": plugin_name,
|
||||
"type": prop.get("type", "string"),
|
||||
"default": prop.get("default", ""),
|
||||
"description": prop.get("description", ""),
|
||||
}
|
||||
for schema_key in ('enum', 'pattern', 'minimum', 'maximum'):
|
||||
for schema_key in ("enum", "pattern", "minimum", "maximum"):
|
||||
if schema_key in prop:
|
||||
option[schema_key] = prop[schema_key]
|
||||
options[key] = option
|
||||
@@ -85,11 +87,11 @@ class KeyValueWidget(forms.Widget):
|
||||
) -> SafeString:
|
||||
data = self._parse_value(value)
|
||||
|
||||
widget_id = attrs.get('id', name) if attrs else name
|
||||
widget_id = attrs.get("id", name) if attrs else name
|
||||
config_options = self._get_config_options()
|
||||
|
||||
# Build datalist options
|
||||
datalist_options = '\n'.join(
|
||||
datalist_options = "\n".join(
|
||||
f'<option value="{self._escape(key)}">{self._escape(opt["description"][:60] or opt["type"])}</option>'
|
||||
for key, opt in sorted(config_options.items())
|
||||
)
|
||||
@@ -111,7 +113,7 @@ class KeyValueWidget(forms.Widget):
|
||||
html += self._render_row(widget_id, key, val_str)
|
||||
|
||||
# Always add one empty row for new entries
|
||||
html += self._render_row(widget_id, '', '')
|
||||
html += self._render_row(widget_id, "", "")
|
||||
|
||||
html += f'''
|
||||
</div>
|
||||
@@ -669,8 +671,8 @@ class KeyValueWidget(forms.Widget):
|
||||
def _escape(self, s: object) -> str:
|
||||
"""Escape HTML special chars in attribute values."""
|
||||
if not s:
|
||||
return ''
|
||||
return str(s).replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;')
|
||||
return ""
|
||||
return str(s).replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;").replace('"', "&quot;")
|
||||
|
||||
def value_from_datadict(
|
||||
self,
|
||||
@@ -678,8 +680,8 @@ class KeyValueWidget(forms.Widget):
|
||||
files: object,
|
||||
name: str,
|
||||
) -> str:
|
||||
value = data.get(name, '{}')
|
||||
return value if isinstance(value, str) else '{}'
|
||||
value = data.get(name, "{}")
|
||||
return value if isinstance(value, str) else "{}"
|
||||
|
||||
|
||||
class ConfigEditorMixin(admin.ModelAdmin):
|
||||
@@ -696,14 +698,20 @@ class ConfigEditorMixin(admin.ModelAdmin):
|
||||
**kwargs: object,
|
||||
) -> forms.Field | None:
|
||||
"""Use KeyValueWidget for the config JSON field."""
|
||||
if db_field.name == 'config':
|
||||
kwargs['widget'] = KeyValueWidget()
|
||||
if db_field.name == "config":
|
||||
kwargs["widget"] = KeyValueWidget()
|
||||
return super().formfield_for_dbfield(db_field, request, **kwargs)
|
||||
|
||||
|
||||
class BaseModelAdmin(DjangoObjectActions, admin.ModelAdmin):
|
||||
list_display = ('id', 'created_at', 'created_by')
|
||||
readonly_fields = ('id', 'created_at', 'modified_at')
|
||||
list_display = ("id", "created_at", "created_by")
|
||||
readonly_fields = ("id", "created_at", "modified_at")
|
||||
show_search_mode_selector = False
|
||||
|
||||
def get_default_search_mode(self) -> str:
|
||||
# The shared changelist template always asks every admin for a default
|
||||
# search mode, even when the search-mode toggle is hidden.
|
||||
return "meta"
|
||||
|
||||
def get_form(
|
||||
self,
|
||||
@@ -713,6 +721,6 @@ class BaseModelAdmin(DjangoObjectActions, admin.ModelAdmin):
|
||||
**kwargs: object,
|
||||
):
|
||||
form = super().get_form(request, obj, change=change, **kwargs)
|
||||
if 'created_by' in form.base_fields:
|
||||
form.base_fields['created_by'].initial = request.user
|
||||
if "created_by" in form.base_fields:
|
||||
form.base_fields["created_by"].initial = request.user
|
||||
return form
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
"""Base models using UUIDv7 for all id fields."""
|
||||
|
||||
__package__ = 'archivebox.base_models'
|
||||
__package__ = "archivebox.base_models"
|
||||
|
||||
from archivebox.uuid_compat import uuid7
|
||||
from pathlib import Path
|
||||
@@ -15,22 +15,22 @@ from django.conf import settings
|
||||
from django_stubs_ext.db.models import TypedModelMeta
|
||||
|
||||
|
||||
|
||||
def get_or_create_system_user_pk(username='system'):
|
||||
def get_or_create_system_user_pk(username="system"):
|
||||
User = get_user_model()
|
||||
# If there's exactly one superuser, use that for all system operations
|
||||
if User.objects.filter(is_superuser=True).count() == 1:
|
||||
return User.objects.filter(is_superuser=True).values_list('pk', flat=True)[0]
|
||||
return User.objects.filter(is_superuser=True).values_list("pk", flat=True)[0]
|
||||
# Otherwise get or create the system user
|
||||
user, _ = User.objects.get_or_create(
|
||||
username=username,
|
||||
defaults={'is_staff': True, 'is_superuser': True, 'email': '', 'password': '!'}
|
||||
defaults={"is_staff": True, "is_superuser": True, "email": "", "password": "!"},
|
||||
)
|
||||
return user.pk
|
||||
|
||||
|
||||
class AutoDateTimeField(models.DateTimeField):
|
||||
"""DateTimeField that automatically updates on save (legacy compatibility)."""
|
||||
|
||||
def pre_save(self, model_instance, add):
|
||||
if add or not getattr(model_instance, self.attname):
|
||||
value = timezone.now()
|
||||
@@ -43,13 +43,19 @@ class ModelWithUUID(models.Model):
|
||||
id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
|
||||
created_at = models.DateTimeField(default=timezone.now, db_index=True)
|
||||
modified_at = models.DateTimeField(auto_now=True)
|
||||
created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False, db_index=True)
|
||||
created_by = models.ForeignKey(
|
||||
settings.AUTH_USER_MODEL,
|
||||
on_delete=models.CASCADE,
|
||||
default=get_or_create_system_user_pk,
|
||||
null=False,
|
||||
db_index=True,
|
||||
)
|
||||
|
||||
class Meta(TypedModelMeta):
|
||||
abstract = True
|
||||
|
||||
def __str__(self) -> str:
|
||||
return f'[{self.id}] {self.__class__.__name__}'
|
||||
return f"[{self.id}] {self.__class__.__name__}"
|
||||
|
||||
@property
|
||||
def admin_change_url(self) -> str:
|
||||
@@ -57,17 +63,17 @@ class ModelWithUUID(models.Model):
|
||||
|
||||
@property
|
||||
def api_url(self) -> str:
|
||||
return str(reverse_lazy('api-1:get_any', args=[self.id]))
|
||||
return str(reverse_lazy("api-1:get_any", args=[self.id]))
|
||||
|
||||
@property
|
||||
def api_docs_url(self) -> str:
|
||||
return f'/api/v1/docs#/{self._meta.app_label.title()}%20Models/api_v1_{self._meta.app_label}_get_{self._meta.db_table}'
|
||||
|
||||
return f"/api/v1/docs#/{self._meta.app_label.title()}%20Models/api_v1_{self._meta.app_label}_get_{self._meta.db_table}"
|
||||
|
||||
|
||||
class ModelWithNotes(models.Model):
|
||||
"""Mixin for models with a notes field."""
|
||||
notes = models.TextField(blank=True, null=False, default='')
|
||||
|
||||
notes = models.TextField(blank=True, null=False, default="")
|
||||
|
||||
class Meta(TypedModelMeta):
|
||||
abstract = True
|
||||
@@ -75,6 +81,7 @@ class ModelWithNotes(models.Model):
|
||||
|
||||
class ModelWithHealthStats(models.Model):
|
||||
"""Mixin for models with health tracking fields."""
|
||||
|
||||
num_uses_failed = models.PositiveIntegerField(default=0)
|
||||
num_uses_succeeded = models.PositiveIntegerField(default=0)
|
||||
|
||||
@@ -88,12 +95,13 @@ class ModelWithHealthStats(models.Model):
|
||||
|
||||
def increment_health_stats(self, success: bool):
|
||||
"""Atomically increment success or failure counter using F() expression."""
|
||||
field = 'num_uses_succeeded' if success else 'num_uses_failed'
|
||||
field = "num_uses_succeeded" if success else "num_uses_failed"
|
||||
type(self).objects.filter(pk=self.pk).update(**{field: F(field) + 1})
|
||||
|
||||
|
||||
class ModelWithConfig(models.Model):
|
||||
"""Mixin for models with a JSON config field."""
|
||||
|
||||
config = models.JSONField(default=dict, null=True, blank=True, editable=True)
|
||||
|
||||
class Meta(TypedModelMeta):
|
||||
@@ -111,7 +119,7 @@ class ModelWithOutputDir(ModelWithUUID):
|
||||
|
||||
@property
|
||||
def output_dir_parent(self) -> str:
|
||||
return f'{self._meta.model_name}s'
|
||||
return f"{self._meta.model_name}s"
|
||||
|
||||
@property
|
||||
def output_dir_name(self) -> str:
|
||||
@@ -119,7 +127,7 @@ class ModelWithOutputDir(ModelWithUUID):
|
||||
|
||||
@property
|
||||
def output_dir_str(self) -> str:
|
||||
return f'{self.output_dir_parent}/{self.output_dir_name}'
|
||||
return f"{self.output_dir_parent}/{self.output_dir_name}"
|
||||
|
||||
@property
|
||||
def output_dir(self) -> Path:
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox'
|
||||
__package__ = "archivebox.cli"
|
||||
__command__ = "archivebox"
|
||||
import os
|
||||
import sys
|
||||
from importlib import import_module
|
||||
@@ -10,55 +10,55 @@ from rich import print
|
||||
from archivebox.config.version import VERSION
|
||||
|
||||
|
||||
|
||||
if '--debug' in sys.argv:
|
||||
os.environ['DEBUG'] = 'True'
|
||||
sys.argv.remove('--debug')
|
||||
if "--debug" in sys.argv:
|
||||
os.environ["DEBUG"] = "True"
|
||||
sys.argv.remove("--debug")
|
||||
|
||||
|
||||
class ArchiveBoxGroup(click.Group):
|
||||
"""lazy loading click group for archivebox commands"""
|
||||
|
||||
meta_commands = {
|
||||
'help': 'archivebox.cli.archivebox_help.main',
|
||||
'version': 'archivebox.cli.archivebox_version.main',
|
||||
'mcp': 'archivebox.cli.archivebox_mcp.main',
|
||||
"help": "archivebox.cli.archivebox_help.main",
|
||||
"version": "archivebox.cli.archivebox_version.main",
|
||||
"mcp": "archivebox.cli.archivebox_mcp.main",
|
||||
}
|
||||
setup_commands = {
|
||||
'init': 'archivebox.cli.archivebox_init.main',
|
||||
'install': 'archivebox.cli.archivebox_install.main',
|
||||
"init": "archivebox.cli.archivebox_init.main",
|
||||
"install": "archivebox.cli.archivebox_install.main",
|
||||
}
|
||||
# Model commands (CRUD operations via subcommands)
|
||||
model_commands = {
|
||||
'crawl': 'archivebox.cli.archivebox_crawl.main',
|
||||
'snapshot': 'archivebox.cli.archivebox_snapshot.main',
|
||||
'archiveresult': 'archivebox.cli.archivebox_archiveresult.main',
|
||||
'tag': 'archivebox.cli.archivebox_tag.main',
|
||||
'binary': 'archivebox.cli.archivebox_binary.main',
|
||||
'process': 'archivebox.cli.archivebox_process.main',
|
||||
'machine': 'archivebox.cli.archivebox_machine.main',
|
||||
'persona': 'archivebox.cli.archivebox_persona.main',
|
||||
"crawl": "archivebox.cli.archivebox_crawl.main",
|
||||
"snapshot": "archivebox.cli.archivebox_snapshot.main",
|
||||
"archiveresult": "archivebox.cli.archivebox_archiveresult.main",
|
||||
"tag": "archivebox.cli.archivebox_tag.main",
|
||||
"binary": "archivebox.cli.archivebox_binary.main",
|
||||
"process": "archivebox.cli.archivebox_process.main",
|
||||
"machine": "archivebox.cli.archivebox_machine.main",
|
||||
"persona": "archivebox.cli.archivebox_persona.main",
|
||||
}
|
||||
archive_commands = {
|
||||
# High-level commands
|
||||
'add': 'archivebox.cli.archivebox_add.main',
|
||||
'extract': 'archivebox.cli.archivebox_extract.main',
|
||||
'list': 'archivebox.cli.archivebox_list.main',
|
||||
'remove': 'archivebox.cli.archivebox_remove.main',
|
||||
'run': 'archivebox.cli.archivebox_run.main',
|
||||
'update': 'archivebox.cli.archivebox_update.main',
|
||||
'status': 'archivebox.cli.archivebox_status.main',
|
||||
'search': 'archivebox.cli.archivebox_search.main',
|
||||
'config': 'archivebox.cli.archivebox_config.main',
|
||||
'schedule': 'archivebox.cli.archivebox_schedule.main',
|
||||
'server': 'archivebox.cli.archivebox_server.main',
|
||||
'shell': 'archivebox.cli.archivebox_shell.main',
|
||||
'manage': 'archivebox.cli.archivebox_manage.main',
|
||||
"add": "archivebox.cli.archivebox_add.main",
|
||||
"extract": "archivebox.cli.archivebox_extract.main",
|
||||
"list": "archivebox.cli.archivebox_list.main",
|
||||
"remove": "archivebox.cli.archivebox_remove.main",
|
||||
"run": "archivebox.cli.archivebox_run.main",
|
||||
"update": "archivebox.cli.archivebox_update.main",
|
||||
"status": "archivebox.cli.archivebox_status.main",
|
||||
"search": "archivebox.cli.archivebox_search.main",
|
||||
"config": "archivebox.cli.archivebox_config.main",
|
||||
"schedule": "archivebox.cli.archivebox_schedule.main",
|
||||
"server": "archivebox.cli.archivebox_server.main",
|
||||
"shell": "archivebox.cli.archivebox_shell.main",
|
||||
"manage": "archivebox.cli.archivebox_manage.main",
|
||||
# Introspection commands
|
||||
'pluginmap': 'archivebox.cli.archivebox_pluginmap.main',
|
||||
"pluginmap": "archivebox.cli.archivebox_pluginmap.main",
|
||||
}
|
||||
legacy_model_commands = {
|
||||
'crawl': 'archivebox.cli.archivebox_crawl_compat.main',
|
||||
'snapshot': 'archivebox.cli.archivebox_snapshot_compat.main',
|
||||
"crawl": "archivebox.cli.archivebox_crawl_compat.main",
|
||||
"snapshot": "archivebox.cli.archivebox_snapshot_compat.main",
|
||||
}
|
||||
all_subcommands = {
|
||||
**meta_commands,
|
||||
@@ -67,15 +67,15 @@ class ArchiveBoxGroup(click.Group):
|
||||
**archive_commands,
|
||||
}
|
||||
renamed_commands = {
|
||||
'setup': 'install',
|
||||
'import': 'add',
|
||||
'archive': 'add',
|
||||
"setup": "install",
|
||||
"import": "add",
|
||||
"archive": "add",
|
||||
}
|
||||
legacy_model_subcommands = {
|
||||
'crawl': {'create', 'list', 'update', 'delete'},
|
||||
'snapshot': {'create', 'list', 'update', 'delete'},
|
||||
"crawl": {"create", "list", "update", "delete"},
|
||||
"snapshot": {"create", "list", "update", "delete"},
|
||||
}
|
||||
|
||||
|
||||
@classmethod
|
||||
def get_canonical_name(cls, cmd_name):
|
||||
return cls.renamed_commands.get(cmd_name, cmd_name)
|
||||
@@ -90,23 +90,22 @@ class ArchiveBoxGroup(click.Group):
|
||||
except ValueError:
|
||||
return False
|
||||
|
||||
remaining_args = sys.argv[arg_idx + 1:]
|
||||
remaining_args = sys.argv[arg_idx + 1 :]
|
||||
if not remaining_args:
|
||||
return False
|
||||
|
||||
first_arg = remaining_args[0]
|
||||
if first_arg in ('-h', '--help'):
|
||||
if first_arg in ("-h", "--help"):
|
||||
return False
|
||||
|
||||
return first_arg not in cls.legacy_model_subcommands[cmd_name]
|
||||
|
||||
|
||||
def get_command(self, ctx, cmd_name):
|
||||
# handle renamed commands
|
||||
if cmd_name in self.renamed_commands:
|
||||
new_name = self.renamed_commands[cmd_name]
|
||||
print(
|
||||
f' [violet]Hint:[/violet] `archivebox {cmd_name}` has been renamed to `archivebox {new_name}`',
|
||||
f" [violet]Hint:[/violet] `archivebox {cmd_name}` has been renamed to `archivebox {new_name}`",
|
||||
file=sys.stderr,
|
||||
)
|
||||
cmd_name = new_name
|
||||
@@ -114,11 +113,11 @@ class ArchiveBoxGroup(click.Group):
|
||||
|
||||
if self._should_use_legacy_model_command(cmd_name):
|
||||
return self._lazy_load(self.legacy_model_commands[cmd_name])
|
||||
|
||||
|
||||
# handle lazy loading of commands
|
||||
if cmd_name in self.all_subcommands:
|
||||
return self._lazy_load(cmd_name)
|
||||
|
||||
|
||||
# fall-back to using click's default command lookup
|
||||
return super().get_command(ctx, cmd_name)
|
||||
|
||||
@@ -127,72 +126,74 @@ class ArchiveBoxGroup(click.Group):
|
||||
import_path = cls.all_subcommands.get(cmd_name_or_path)
|
||||
if import_path is None:
|
||||
import_path = cmd_name_or_path
|
||||
modname, funcname = import_path.rsplit('.', 1)
|
||||
|
||||
modname, funcname = import_path.rsplit(".", 1)
|
||||
|
||||
# print(f'LAZY LOADING {import_path}')
|
||||
mod = import_module(modname)
|
||||
func = getattr(mod, funcname)
|
||||
|
||||
if not hasattr(func, '__doc__'):
|
||||
raise ValueError(f'lazy loading of {import_path} failed - no docstring found on method')
|
||||
|
||||
|
||||
if not hasattr(func, "__doc__"):
|
||||
raise ValueError(f"lazy loading of {import_path} failed - no docstring found on method")
|
||||
|
||||
# if not isinstance(cmd, click.BaseCommand):
|
||||
# raise ValueError(f'lazy loading of {import_path} failed - not a click command')
|
||||
|
||||
# raise ValueError(f'lazy loading of {import_path} failed - not a click command')
|
||||
|
||||
return func
|
||||
|
||||
|
||||
@click.group(cls=ArchiveBoxGroup, invoke_without_command=True)
|
||||
@click.option('--help', '-h', is_flag=True, help='Show help')
|
||||
@click.version_option(VERSION, '-v', '--version', package_name='archivebox', message='%(version)s')
|
||||
@click.option("--help", "-h", is_flag=True, help="Show help")
|
||||
@click.version_option(VERSION, "-v", "--version", package_name="archivebox", message="%(version)s")
|
||||
@click.pass_context
|
||||
def cli(ctx, help=False):
|
||||
"""ArchiveBox: The self-hosted internet archive"""
|
||||
|
||||
|
||||
subcommand = ArchiveBoxGroup.get_canonical_name(ctx.invoked_subcommand)
|
||||
|
||||
|
||||
# if --help is passed or no subcommand is given, show custom help message
|
||||
if help or ctx.invoked_subcommand is None:
|
||||
ctx.invoke(ctx.command.get_command(ctx, 'help'))
|
||||
|
||||
ctx.invoke(ctx.command.get_command(ctx, "help"))
|
||||
|
||||
# if the subcommand is in archive_commands or model_commands,
|
||||
# then we need to set up the django environment and check that we're in a valid data folder
|
||||
if subcommand in ArchiveBoxGroup.archive_commands or subcommand in ArchiveBoxGroup.model_commands:
|
||||
# print('SETUP DJANGO AND CHECK DATA FOLDER')
|
||||
try:
|
||||
if subcommand == 'server':
|
||||
run_in_debug = '--reload' in sys.argv or os.environ.get('DEBUG') in ('1', 'true', 'True', 'TRUE', 'yes')
|
||||
if subcommand == "server":
|
||||
run_in_debug = "--reload" in sys.argv or os.environ.get("DEBUG") in ("1", "true", "True", "TRUE", "yes")
|
||||
if run_in_debug:
|
||||
os.environ['ARCHIVEBOX_RUNSERVER'] = '1'
|
||||
if '--reload' in sys.argv:
|
||||
os.environ['ARCHIVEBOX_AUTORELOAD'] = '1'
|
||||
os.environ["ARCHIVEBOX_RUNSERVER"] = "1"
|
||||
if "--reload" in sys.argv:
|
||||
os.environ["ARCHIVEBOX_AUTORELOAD"] = "1"
|
||||
from archivebox.config.common import STORAGE_CONFIG
|
||||
os.environ['ARCHIVEBOX_RUNSERVER_PIDFILE'] = str(STORAGE_CONFIG.TMP_DIR / 'runserver.pid')
|
||||
|
||||
os.environ["ARCHIVEBOX_RUNSERVER_PIDFILE"] = str(STORAGE_CONFIG.TMP_DIR / "runserver.pid")
|
||||
|
||||
from archivebox.config.django import setup_django
|
||||
from archivebox.misc.checks import check_data_folder
|
||||
|
||||
setup_django()
|
||||
check_data_folder()
|
||||
except Exception as e:
|
||||
print(f'[red][X] Error setting up Django or checking data folder: {e}[/red]', file=sys.stderr)
|
||||
if subcommand not in ('manage', 'shell'): # not all management commands need django to be setup beforehand
|
||||
print(f"[red][X] Error setting up Django or checking data folder: {e}[/red]", file=sys.stderr)
|
||||
if subcommand not in ("manage", "shell"): # not all management commands need django to be setup beforehand
|
||||
raise
|
||||
|
||||
|
||||
|
||||
def main(args=None, prog_name=None, stdin=None):
|
||||
# show `docker run archivebox xyz` in help messages if running in docker
|
||||
IN_DOCKER = os.environ.get('IN_DOCKER', False) in ('1', 'true', 'True', 'TRUE', 'yes')
|
||||
IN_DOCKER = os.environ.get("IN_DOCKER", False) in ("1", "true", "True", "TRUE", "yes")
|
||||
IS_TTY = sys.stdin.isatty()
|
||||
prog_name = prog_name or (f'docker compose run{"" if IS_TTY else " -T"} archivebox' if IN_DOCKER else 'archivebox')
|
||||
|
||||
prog_name = prog_name or (f"docker compose run{'' if IS_TTY else ' -T'} archivebox" if IN_DOCKER else "archivebox")
|
||||
|
||||
# stdin param allows passing input data from caller (used by __main__.py)
|
||||
# currently not used by click-based CLI, but kept for backwards compatibility
|
||||
|
||||
try:
|
||||
cli(args=args, prog_name=prog_name)
|
||||
except KeyboardInterrupt:
|
||||
print('\n\n[red][X] Got CTRL+C. Exiting...[/red]')
|
||||
print("\n\n[red][X] Got CTRL+C. Exiting...[/red]")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox add'
|
||||
__package__ = "archivebox.cli"
|
||||
__command__ = "archivebox add"
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
@@ -14,6 +14,7 @@ from django.utils import timezone
|
||||
from django.db.models import QuerySet
|
||||
|
||||
from archivebox.misc.util import enforce_types, docstring
|
||||
from archivebox.misc.util import parse_filesize_to_bytes
|
||||
from archivebox import CONSTANTS
|
||||
from archivebox.config.common import ARCHIVING_CONFIG, SERVER_CONFIG
|
||||
from archivebox.config.permissions import USER, HOSTNAME
|
||||
@@ -29,34 +30,38 @@ def _collect_input_urls(args: tuple[str, ...]) -> list[str]:
|
||||
|
||||
urls: list[str] = []
|
||||
for record in read_args_or_stdin(args):
|
||||
url = record.get('url')
|
||||
url = record.get("url")
|
||||
if isinstance(url, str) and url:
|
||||
urls.append(url)
|
||||
|
||||
urls_field = record.get('urls')
|
||||
urls_field = record.get("urls")
|
||||
if isinstance(urls_field, str):
|
||||
for line in urls_field.splitlines():
|
||||
line = line.strip()
|
||||
if line and not line.startswith('#'):
|
||||
if line and not line.startswith("#"):
|
||||
urls.append(line)
|
||||
|
||||
return urls
|
||||
|
||||
|
||||
@enforce_types
|
||||
def add(urls: str | list[str],
|
||||
depth: int | str=0,
|
||||
tag: str='',
|
||||
url_allowlist: str='',
|
||||
url_denylist: str='',
|
||||
parser: str="auto",
|
||||
plugins: str="",
|
||||
persona: str='Default',
|
||||
overwrite: bool=False,
|
||||
update: bool | None=None,
|
||||
index_only: bool=False,
|
||||
bg: bool=False,
|
||||
created_by_id: int | None=None) -> tuple['Crawl', QuerySet['Snapshot']]:
|
||||
def add(
|
||||
urls: str | list[str],
|
||||
depth: int | str = 0,
|
||||
max_urls: int = 0,
|
||||
max_size: int | str = 0,
|
||||
tag: str = "",
|
||||
url_allowlist: str = "",
|
||||
url_denylist: str = "",
|
||||
parser: str = "auto",
|
||||
plugins: str = "",
|
||||
persona: str = "Default",
|
||||
overwrite: bool = False,
|
||||
update: bool | None = None,
|
||||
index_only: bool = False,
|
||||
bg: bool = False,
|
||||
created_by_id: int | None = None,
|
||||
) -> tuple["Crawl", QuerySet["Snapshot"]]:
|
||||
"""Add a new URL or list of URLs to your archive.
|
||||
|
||||
The flow is:
|
||||
@@ -72,8 +77,15 @@ def add(urls: str | list[str],
|
||||
from rich import print
|
||||
|
||||
depth = int(depth)
|
||||
max_urls = int(max_urls or 0)
|
||||
max_size = parse_filesize_to_bytes(max_size)
|
||||
|
||||
assert depth in (0, 1, 2, 3, 4), 'Depth must be 0-4'
|
||||
if depth not in (0, 1, 2, 3, 4):
|
||||
raise ValueError("Depth must be 0-4")
|
||||
if max_urls < 0:
|
||||
raise ValueError("max_urls must be >= 0")
|
||||
if max_size < 0:
|
||||
raise ValueError("max_size must be >= 0")
|
||||
|
||||
# import models once django is set up
|
||||
from archivebox.core.models import Snapshot
|
||||
@@ -91,47 +103,49 @@ def add(urls: str | list[str],
|
||||
update = not ARCHIVING_CONFIG.ONLY_NEW
|
||||
|
||||
# 1. Save the provided URLs to sources/2024-11-05__23-59-59__cli_add.txt
|
||||
sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__cli_add.txt'
|
||||
sources_file = CONSTANTS.SOURCES_DIR / f"{timezone.now().strftime('%Y-%m-%d__%H-%M-%S')}__cli_add.txt"
|
||||
sources_file.parent.mkdir(parents=True, exist_ok=True)
|
||||
sources_file.write_text(urls if isinstance(urls, str) else '\n'.join(urls))
|
||||
sources_file.write_text(urls if isinstance(urls, str) else "\n".join(urls))
|
||||
|
||||
# 2. Create a new Crawl with inline URLs
|
||||
cli_args = [*sys.argv]
|
||||
if cli_args[0].lower().endswith('archivebox'):
|
||||
cli_args[0] = 'archivebox'
|
||||
cmd_str = ' '.join(cli_args)
|
||||
if cli_args[0].lower().endswith("archivebox"):
|
||||
cli_args[0] = "archivebox"
|
||||
cmd_str = " ".join(cli_args)
|
||||
|
||||
timestamp = timezone.now().strftime("%Y-%m-%d__%H-%M-%S")
|
||||
|
||||
# Read URLs directly into crawl
|
||||
urls_content = sources_file.read_text()
|
||||
persona_name = (persona or 'Default').strip() or 'Default'
|
||||
plugins = plugins or str(get_config().get('PLUGINS') or '')
|
||||
persona_name = (persona or "Default").strip() or "Default"
|
||||
plugins = plugins or str(get_config().get("PLUGINS") or "")
|
||||
persona_obj, _ = Persona.objects.get_or_create(name=persona_name)
|
||||
persona_obj.ensure_dirs()
|
||||
|
||||
crawl = Crawl.objects.create(
|
||||
urls=urls_content,
|
||||
max_depth=depth,
|
||||
max_urls=max_urls,
|
||||
max_size=max_size,
|
||||
tags_str=tag,
|
||||
persona_id=persona_obj.id,
|
||||
label=f'{USER}@{HOSTNAME} $ {cmd_str} [{timestamp}]',
|
||||
label=f"{USER}@{HOSTNAME} $ {cmd_str} [{timestamp}]",
|
||||
created_by_id=created_by_id,
|
||||
config={
|
||||
'ONLY_NEW': not update,
|
||||
'INDEX_ONLY': index_only,
|
||||
'OVERWRITE': overwrite,
|
||||
'PLUGINS': plugins,
|
||||
'DEFAULT_PERSONA': persona_name,
|
||||
'PARSER': parser,
|
||||
**({'URL_ALLOWLIST': url_allowlist} if url_allowlist else {}),
|
||||
**({'URL_DENYLIST': url_denylist} if url_denylist else {}),
|
||||
}
|
||||
"ONLY_NEW": not update,
|
||||
"INDEX_ONLY": index_only,
|
||||
"OVERWRITE": overwrite,
|
||||
"PLUGINS": plugins,
|
||||
"DEFAULT_PERSONA": persona_name,
|
||||
"PARSER": parser,
|
||||
**({"URL_ALLOWLIST": url_allowlist} if url_allowlist else {}),
|
||||
**({"URL_DENYLIST": url_denylist} if url_denylist else {}),
|
||||
},
|
||||
)
|
||||
|
||||
print(f'[green]\\[+] Created Crawl {crawl.id} with max_depth={depth}[/green]')
|
||||
first_url = crawl.get_urls_list()[0] if crawl.get_urls_list() else ''
|
||||
print(f' [dim]First URL: {first_url}[/dim]')
|
||||
print(f"[green]\\[+] Created Crawl {crawl.id} with max_depth={depth}[/green]")
|
||||
first_url = crawl.get_urls_list()[0] if crawl.get_urls_list() else ""
|
||||
print(f" [dim]First URL: {first_url}[/dim]")
|
||||
|
||||
# 3. The CrawlMachine will create Snapshots from all URLs when started
|
||||
# Parser extractors run on snapshots and discover more URLs
|
||||
@@ -139,20 +153,21 @@ def add(urls: str | list[str],
|
||||
|
||||
if index_only:
|
||||
# Just create the crawl but don't start processing
|
||||
print('[yellow]\\[*] Index-only mode - crawl created but not started[/yellow]')
|
||||
print("[yellow]\\[*] Index-only mode - crawl created but not started[/yellow]")
|
||||
# Create snapshots for all URLs in the crawl
|
||||
for url in crawl.get_urls_list():
|
||||
snapshot, _ = Snapshot.objects.update_or_create(
|
||||
crawl=crawl, url=url,
|
||||
crawl=crawl,
|
||||
url=url,
|
||||
defaults={
|
||||
'status': Snapshot.INITIAL_STATE,
|
||||
'retry_at': timezone.now(),
|
||||
'timestamp': str(timezone.now().timestamp()),
|
||||
'depth': 0,
|
||||
"status": Snapshot.INITIAL_STATE,
|
||||
"retry_at": timezone.now(),
|
||||
"timestamp": str(timezone.now().timestamp()),
|
||||
"depth": 0,
|
||||
},
|
||||
)
|
||||
if tag:
|
||||
snapshot.save_tags(tag.split(','))
|
||||
snapshot.save_tags(tag.split(","))
|
||||
snapshot.ensure_crawl_symlink()
|
||||
return crawl, crawl.snapshot_set.all()
|
||||
|
||||
@@ -168,10 +183,12 @@ def add(urls: str | list[str],
|
||||
|
||||
if bg:
|
||||
# Background mode: just queue work and return (background runner via server will pick it up)
|
||||
print('[yellow]\\[*] URLs queued. The background runner will process them (run `archivebox server` or `archivebox run --daemon` if not already running).[/yellow]')
|
||||
print(
|
||||
"[yellow]\\[*] URLs queued. The background runner will process them (run `archivebox server` or `archivebox run --daemon` if not already running).[/yellow]",
|
||||
)
|
||||
else:
|
||||
# Foreground mode: run full crawl runner until all work is done
|
||||
print('[green]\\[*] Starting crawl runner to process crawl...[/green]')
|
||||
print("[green]\\[*] Starting crawl runner to process crawl...[/green]")
|
||||
run_crawl(str(crawl.id))
|
||||
|
||||
# Print summary for foreground runs
|
||||
@@ -179,7 +196,10 @@ def add(urls: str | list[str],
|
||||
crawl.refresh_from_db()
|
||||
snapshots_count = crawl.snapshot_set.count()
|
||||
try:
|
||||
total_bytes = sum(s.archive_size for s in crawl.snapshot_set.all())
|
||||
from django.db.models import Count, Sum
|
||||
|
||||
totals = crawl.snapshot_set.aggregate(snapshot_count=Count("id"), total_bytes=Sum("archiveresult__output_size"))
|
||||
total_bytes = int(totals["total_bytes"] or 0) if totals["snapshot_count"] else 0
|
||||
except Exception:
|
||||
total_bytes, _, _ = get_dir_size(crawl.output_dir)
|
||||
total_size = printable_filesize(total_bytes)
|
||||
@@ -197,23 +217,23 @@ def add(urls: str | list[str],
|
||||
# Output dir relative to DATA_DIR
|
||||
try:
|
||||
rel_output = Path(crawl.output_dir).relative_to(CONSTANTS.DATA_DIR)
|
||||
rel_output_str = f'./{rel_output}'
|
||||
rel_output_str = f"./{rel_output}"
|
||||
except Exception:
|
||||
rel_output_str = str(crawl.output_dir)
|
||||
|
||||
bind_addr = SERVER_CONFIG.BIND_ADDR or '127.0.0.1:8000'
|
||||
if bind_addr.startswith('http://') or bind_addr.startswith('https://'):
|
||||
bind_addr = SERVER_CONFIG.BIND_ADDR or "127.0.0.1:8000"
|
||||
if bind_addr.startswith("http://") or bind_addr.startswith("https://"):
|
||||
base_url = bind_addr
|
||||
else:
|
||||
base_url = f'http://{bind_addr}'
|
||||
admin_url = f'{base_url}/admin/crawls/crawl/{crawl.id}/change/'
|
||||
base_url = f"http://{bind_addr}"
|
||||
admin_url = f"{base_url}/admin/crawls/crawl/{crawl.id}/change/"
|
||||
|
||||
print('\n[bold]crawl output saved to:[/bold]')
|
||||
print(f' {rel_output_str}')
|
||||
print(f' {admin_url}')
|
||||
print(f'\n[bold]total urls snapshotted:[/bold] {snapshots_count}')
|
||||
print(f'[bold]total size:[/bold] {total_size}')
|
||||
print(f'[bold]total time:[/bold] {duration_str}')
|
||||
print("\n[bold]crawl output saved to:[/bold]")
|
||||
print(f" {rel_output_str}")
|
||||
print(f" {admin_url}")
|
||||
print(f"\n[bold]total urls snapshotted:[/bold] {snapshots_count}")
|
||||
print(f"[bold]total size:[/bold] {total_size}")
|
||||
print(f"[bold]total time:[/bold] {duration_str}")
|
||||
except Exception:
|
||||
# Summary is best-effort; avoid failing the command if something goes wrong
|
||||
pass
|
||||
@@ -224,29 +244,43 @@ def add(urls: str | list[str],
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option('--depth', '-d', type=click.Choice([str(i) for i in range(5)]), default='0', help='Recursively archive linked pages up to N hops away')
|
||||
@click.option('--tag', '-t', default='', help='Comma-separated list of tags to add to each snapshot e.g. tag1,tag2,tag3')
|
||||
@click.option('--url-allowlist', '--domain-allowlist', default='', help='Comma-separated URL/domain allowlist for this crawl')
|
||||
@click.option('--url-denylist', '--domain-denylist', default='', help='Comma-separated URL/domain denylist for this crawl')
|
||||
@click.option('--parser', default='auto', help='Parser for reading input URLs (auto, txt, html, rss, json, jsonl, netscape, ...)')
|
||||
@click.option('--plugins', '-p', default='', help='Comma-separated list of plugins to run e.g. title,favicon,screenshot,singlefile,...')
|
||||
@click.option('--persona', default='Default', help='Authentication profile to use when archiving')
|
||||
@click.option('--overwrite', '-F', is_flag=True, help='Overwrite existing data if URLs have been archived previously')
|
||||
@click.option('--update', is_flag=True, default=None, help='Retry any previously skipped/failed URLs when re-adding them')
|
||||
@click.option('--index-only', is_flag=True, help='Just add the URLs to the index without archiving them now')
|
||||
@click.option('--bg', is_flag=True, help='Run archiving in background (queue work and return immediately)')
|
||||
@click.argument('urls', nargs=-1, type=click.Path())
|
||||
@click.option(
|
||||
"--depth",
|
||||
"-d",
|
||||
type=click.Choice([str(i) for i in range(5)]),
|
||||
default="0",
|
||||
help="Recursively archive linked pages up to N hops away",
|
||||
)
|
||||
@click.option("--max-urls", type=int, default=0, help="Maximum number of URLs to snapshot for this crawl (0 = unlimited)")
|
||||
@click.option("--max-size", default="0", help="Maximum total crawl size in bytes or units like 45mb / 1gb (0 = unlimited)")
|
||||
@click.option("--tag", "-t", default="", help="Comma-separated list of tags to add to each snapshot e.g. tag1,tag2,tag3")
|
||||
@click.option("--url-allowlist", "--domain-allowlist", default="", help="Comma-separated URL/domain allowlist for this crawl")
|
||||
@click.option("--url-denylist", "--domain-denylist", default="", help="Comma-separated URL/domain denylist for this crawl")
|
||||
@click.option("--parser", default="auto", help="Parser for reading input URLs (auto, txt, html, rss, json, jsonl, netscape, ...)")
|
||||
@click.option("--plugins", "-p", default="", help="Comma-separated list of plugins to run e.g. title,favicon,screenshot,singlefile,...")
|
||||
@click.option("--persona", default="Default", help="Authentication profile to use when archiving")
|
||||
@click.option("--overwrite", "-F", is_flag=True, help="Overwrite existing data if URLs have been archived previously")
|
||||
@click.option("--update", is_flag=True, default=None, help="Retry any previously skipped/failed URLs when re-adding them")
|
||||
@click.option("--index-only", is_flag=True, help="Just add the URLs to the index without archiving them now")
|
||||
@click.option("--bg", is_flag=True, help="Run archiving in background (queue work and return immediately)")
|
||||
@click.argument("urls", nargs=-1, type=click.Path())
|
||||
@docstring(add.__doc__)
|
||||
def main(**kwargs):
|
||||
"""Add a new URL or list of URLs to your archive"""
|
||||
|
||||
raw_urls = kwargs.pop('urls')
|
||||
raw_urls = kwargs.pop("urls")
|
||||
urls = _collect_input_urls(raw_urls)
|
||||
if not urls:
|
||||
raise click.UsageError('No URLs provided. Pass URLs as arguments or via stdin.')
|
||||
raise click.UsageError("No URLs provided. Pass URLs as arguments or via stdin.")
|
||||
if int(kwargs.get("max_urls") or 0) < 0:
|
||||
raise click.BadParameter("max_urls must be 0 or a positive integer.", param_hint="--max-urls")
|
||||
try:
|
||||
kwargs["max_size"] = parse_filesize_to_bytes(kwargs.get("max_size"))
|
||||
except ValueError as err:
|
||||
raise click.BadParameter(str(err), param_hint="--max-size") from err
|
||||
|
||||
add(urls=urls, **kwargs)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -30,11 +30,10 @@ Examples:
|
||||
archivebox archiveresult list --status=failed | archivebox run
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox archiveresult'
|
||||
__package__ = "archivebox.cli"
|
||||
__command__ = "archivebox archiveresult"
|
||||
|
||||
import sys
|
||||
from typing import Optional
|
||||
|
||||
import rich_click as click
|
||||
from rich import print as rprint
|
||||
@@ -42,13 +41,13 @@ from rich import print as rprint
|
||||
from archivebox.cli.cli_utils import apply_filters
|
||||
|
||||
|
||||
def build_archiveresult_request(snapshot_id: str, plugin: str, hook_name: str = '', status: str = 'queued') -> dict:
|
||||
def build_archiveresult_request(snapshot_id: str, plugin: str, hook_name: str = "", status: str = "queued") -> dict:
|
||||
return {
|
||||
'type': 'ArchiveResult',
|
||||
'snapshot_id': str(snapshot_id),
|
||||
'plugin': plugin,
|
||||
'hook_name': hook_name,
|
||||
'status': status,
|
||||
"type": "ArchiveResult",
|
||||
"snapshot_id": str(snapshot_id),
|
||||
"plugin": plugin,
|
||||
"hook_name": hook_name,
|
||||
"status": status,
|
||||
}
|
||||
|
||||
|
||||
@@ -56,10 +55,11 @@ def build_archiveresult_request(snapshot_id: str, plugin: str, hook_name: str =
|
||||
# CREATE
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def create_archiveresults(
|
||||
snapshot_id: Optional[str] = None,
|
||||
plugin: Optional[str] = None,
|
||||
status: str = 'queued',
|
||||
snapshot_id: str | None = None,
|
||||
plugin: str | None = None,
|
||||
status: str = "queued",
|
||||
) -> int:
|
||||
"""
|
||||
Create ArchiveResult request records for Snapshots.
|
||||
@@ -86,13 +86,13 @@ def create_archiveresults(
|
||||
snapshots = [Snapshot.objects.get(id=snapshot_id)]
|
||||
pass_through_records = []
|
||||
except Snapshot.DoesNotExist:
|
||||
rprint(f'[red]Snapshot not found: {snapshot_id}[/red]', file=sys.stderr)
|
||||
rprint(f"[red]Snapshot not found: {snapshot_id}[/red]", file=sys.stderr)
|
||||
return 1
|
||||
else:
|
||||
# Read from stdin
|
||||
records = list(read_stdin())
|
||||
if not records:
|
||||
rprint('[yellow]No Snapshot records provided via stdin[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No Snapshot records provided via stdin[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Separate snapshot records from pass-through records
|
||||
@@ -100,17 +100,17 @@ def create_archiveresults(
|
||||
pass_through_records = []
|
||||
|
||||
for record in records:
|
||||
record_type = record.get('type', '')
|
||||
record_type = record.get("type", "")
|
||||
|
||||
if record_type == TYPE_SNAPSHOT:
|
||||
# Pass through the Snapshot record itself
|
||||
pass_through_records.append(record)
|
||||
if record.get('id'):
|
||||
snapshot_ids.append(record['id'])
|
||||
if record.get("id"):
|
||||
snapshot_ids.append(record["id"])
|
||||
|
||||
elif record_type == TYPE_ARCHIVERESULT:
|
||||
# ArchiveResult records: pass through if they have an id
|
||||
if record.get('id'):
|
||||
if record.get("id"):
|
||||
pass_through_records.append(record)
|
||||
# If no id, we could create it, but for now just pass through
|
||||
else:
|
||||
@@ -120,9 +120,9 @@ def create_archiveresults(
|
||||
# Other typed records (Crawl, Tag, etc): pass through
|
||||
pass_through_records.append(record)
|
||||
|
||||
elif record.get('id'):
|
||||
elif record.get("id"):
|
||||
# Untyped record with id - assume it's a snapshot ID
|
||||
snapshot_ids.append(record['id'])
|
||||
snapshot_ids.append(record["id"])
|
||||
|
||||
# Output pass-through records first
|
||||
if not is_tty:
|
||||
@@ -131,15 +131,15 @@ def create_archiveresults(
|
||||
|
||||
if not snapshot_ids:
|
||||
if pass_through_records:
|
||||
rprint(f'[dim]Passed through {len(pass_through_records)} records, no new snapshots to process[/dim]', file=sys.stderr)
|
||||
rprint(f"[dim]Passed through {len(pass_through_records)} records, no new snapshots to process[/dim]", file=sys.stderr)
|
||||
return 0
|
||||
rprint('[yellow]No valid Snapshot IDs in input[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No valid Snapshot IDs in input[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
snapshots = list(Snapshot.objects.filter(id__in=snapshot_ids))
|
||||
|
||||
if not snapshots:
|
||||
rprint('[yellow]No matching snapshots found[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No matching snapshots found[/yellow]", file=sys.stderr)
|
||||
return 0 if pass_through_records else 1
|
||||
|
||||
created_count = 0
|
||||
@@ -150,7 +150,7 @@ def create_archiveresults(
|
||||
created_count += 1
|
||||
else:
|
||||
config = get_config(crawl=snapshot.crawl, snapshot=snapshot)
|
||||
hooks = discover_hooks('Snapshot', config=config)
|
||||
hooks = discover_hooks("Snapshot", config=config)
|
||||
for hook_path in hooks:
|
||||
hook_name = hook_path.name
|
||||
plugin_name = hook_path.parent.name
|
||||
@@ -158,7 +158,7 @@ def create_archiveresults(
|
||||
write_record(build_archiveresult_request(snapshot.id, plugin_name, hook_name=hook_name, status=status))
|
||||
created_count += 1
|
||||
|
||||
rprint(f'[green]Created {created_count} archive result request records[/green]', file=sys.stderr)
|
||||
rprint(f"[green]Created {created_count} archive result request records[/green]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -166,11 +166,12 @@ def create_archiveresults(
|
||||
# LIST
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def list_archiveresults(
|
||||
status: Optional[str] = None,
|
||||
plugin: Optional[str] = None,
|
||||
snapshot_id: Optional[str] = None,
|
||||
limit: Optional[int] = None,
|
||||
status: str | None = None,
|
||||
plugin: str | None = None,
|
||||
snapshot_id: str | None = None,
|
||||
limit: int | None = None,
|
||||
) -> int:
|
||||
"""
|
||||
List ArchiveResults as JSONL with optional filters.
|
||||
@@ -183,13 +184,13 @@ def list_archiveresults(
|
||||
|
||||
is_tty = sys.stdout.isatty()
|
||||
|
||||
queryset = ArchiveResult.objects.all().order_by('-start_ts')
|
||||
queryset = ArchiveResult.objects.all().order_by("-start_ts")
|
||||
|
||||
# Apply filters
|
||||
filter_kwargs = {
|
||||
'status': status,
|
||||
'plugin': plugin,
|
||||
'snapshot_id': snapshot_id,
|
||||
"status": status,
|
||||
"plugin": plugin,
|
||||
"snapshot_id": snapshot_id,
|
||||
}
|
||||
queryset = apply_filters(queryset, filter_kwargs, limit=limit)
|
||||
|
||||
@@ -197,20 +198,22 @@ def list_archiveresults(
|
||||
for result in queryset:
|
||||
if is_tty:
|
||||
status_color = {
|
||||
'queued': 'yellow',
|
||||
'started': 'blue',
|
||||
'succeeded': 'green',
|
||||
'failed': 'red',
|
||||
'skipped': 'dim',
|
||||
'noresults': 'dim',
|
||||
'backoff': 'magenta',
|
||||
}.get(result.status, 'dim')
|
||||
rprint(f'[{status_color}]{result.status:10}[/{status_color}] {result.plugin:15} [dim]{result.id}[/dim] {result.snapshot.url[:40]}')
|
||||
"queued": "yellow",
|
||||
"started": "blue",
|
||||
"succeeded": "green",
|
||||
"failed": "red",
|
||||
"skipped": "dim",
|
||||
"noresults": "dim",
|
||||
"backoff": "magenta",
|
||||
}.get(result.status, "dim")
|
||||
rprint(
|
||||
f"[{status_color}]{result.status:10}[/{status_color}] {result.plugin:15} [dim]{result.id}[/dim] {result.snapshot.url[:40]}",
|
||||
)
|
||||
else:
|
||||
write_record(result.to_json())
|
||||
count += 1
|
||||
|
||||
rprint(f'[dim]Listed {count} archive results[/dim]', file=sys.stderr)
|
||||
rprint(f"[dim]Listed {count} archive results[/dim]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -218,8 +221,9 @@ def list_archiveresults(
|
||||
# UPDATE
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def update_archiveresults(
|
||||
status: Optional[str] = None,
|
||||
status: str | None = None,
|
||||
) -> int:
|
||||
"""
|
||||
Update ArchiveResults from stdin JSONL.
|
||||
@@ -238,12 +242,12 @@ def update_archiveresults(
|
||||
|
||||
records = list(read_stdin())
|
||||
if not records:
|
||||
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
updated_count = 0
|
||||
for record in records:
|
||||
result_id = record.get('id')
|
||||
result_id = record.get("id")
|
||||
if not result_id:
|
||||
continue
|
||||
|
||||
@@ -261,10 +265,10 @@ def update_archiveresults(
|
||||
write_record(result.to_json())
|
||||
|
||||
except ArchiveResult.DoesNotExist:
|
||||
rprint(f'[yellow]ArchiveResult not found: {result_id}[/yellow]', file=sys.stderr)
|
||||
rprint(f"[yellow]ArchiveResult not found: {result_id}[/yellow]", file=sys.stderr)
|
||||
continue
|
||||
|
||||
rprint(f'[green]Updated {updated_count} archive results[/green]', file=sys.stderr)
|
||||
rprint(f"[green]Updated {updated_count} archive results[/green]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -272,6 +276,7 @@ def update_archiveresults(
|
||||
# DELETE
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def delete_archiveresults(yes: bool = False, dry_run: bool = False) -> int:
|
||||
"""
|
||||
Delete ArchiveResults from stdin JSONL.
|
||||
@@ -287,37 +292,37 @@ def delete_archiveresults(yes: bool = False, dry_run: bool = False) -> int:
|
||||
|
||||
records = list(read_stdin())
|
||||
if not records:
|
||||
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
result_ids = [r.get('id') for r in records if r.get('id')]
|
||||
result_ids = [r.get("id") for r in records if r.get("id")]
|
||||
|
||||
if not result_ids:
|
||||
rprint('[yellow]No valid archive result IDs in input[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No valid archive result IDs in input[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
results = ArchiveResult.objects.filter(id__in=result_ids)
|
||||
count = results.count()
|
||||
|
||||
if count == 0:
|
||||
rprint('[yellow]No matching archive results found[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No matching archive results found[/yellow]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
if dry_run:
|
||||
rprint(f'[yellow]Would delete {count} archive results (dry run)[/yellow]', file=sys.stderr)
|
||||
rprint(f"[yellow]Would delete {count} archive results (dry run)[/yellow]", file=sys.stderr)
|
||||
for result in results[:10]:
|
||||
rprint(f' [dim]{result.id}[/dim] {result.plugin} {result.snapshot.url[:40]}', file=sys.stderr)
|
||||
rprint(f" [dim]{result.id}[/dim] {result.plugin} {result.snapshot.url[:40]}", file=sys.stderr)
|
||||
if count > 10:
|
||||
rprint(f' ... and {count - 10} more', file=sys.stderr)
|
||||
rprint(f" ... and {count - 10} more", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
if not yes:
|
||||
rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr)
|
||||
rprint("[red]Use --yes to confirm deletion[/red]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Perform deletion
|
||||
deleted_count, _ = results.delete()
|
||||
rprint(f'[green]Deleted {deleted_count} archive results[/green]', file=sys.stderr)
|
||||
rprint(f"[green]Deleted {deleted_count} archive results[/green]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -325,51 +330,58 @@ def delete_archiveresults(yes: bool = False, dry_run: bool = False) -> int:
|
||||
# CLI Commands
|
||||
# =============================================================================
|
||||
|
||||
|
||||
@click.group()
|
||||
def main():
|
||||
"""Manage ArchiveResult records (plugin extraction results)."""
|
||||
pass
|
||||
|
||||
|
||||
@main.command('create')
|
||||
@click.option('--snapshot-id', help='Snapshot ID to create results for')
|
||||
@click.option('--plugin', '-p', help='Plugin name (e.g., screenshot, singlefile)')
|
||||
@click.option('--status', '-s', default='queued', help='Initial status (default: queued)')
|
||||
def create_cmd(snapshot_id: Optional[str], plugin: Optional[str], status: str):
|
||||
@main.command("create")
|
||||
@click.option("--snapshot-id", help="Snapshot ID to create results for")
|
||||
@click.option("--plugin", "-p", help="Plugin name (e.g., screenshot, singlefile)")
|
||||
@click.option("--status", "-s", default="queued", help="Initial status (default: queued)")
|
||||
def create_cmd(snapshot_id: str | None, plugin: str | None, status: str):
|
||||
"""Create ArchiveResults for Snapshots from stdin JSONL."""
|
||||
sys.exit(create_archiveresults(snapshot_id=snapshot_id, plugin=plugin, status=status))
|
||||
|
||||
|
||||
@main.command('list')
|
||||
@click.option('--status', '-s', help='Filter by status (queued, started, succeeded, failed, skipped)')
|
||||
@click.option('--plugin', '-p', help='Filter by plugin name')
|
||||
@click.option('--snapshot-id', help='Filter by snapshot ID')
|
||||
@click.option('--limit', '-n', type=int, help='Limit number of results')
|
||||
def list_cmd(status: Optional[str], plugin: Optional[str],
|
||||
snapshot_id: Optional[str], limit: Optional[int]):
|
||||
@main.command("list")
|
||||
@click.option("--status", "-s", help="Filter by status (queued, started, succeeded, failed, skipped)")
|
||||
@click.option("--plugin", "-p", help="Filter by plugin name")
|
||||
@click.option("--snapshot-id", help="Filter by snapshot ID")
|
||||
@click.option("--limit", "-n", type=int, help="Limit number of results")
|
||||
def list_cmd(
|
||||
status: str | None,
|
||||
plugin: str | None,
|
||||
snapshot_id: str | None,
|
||||
limit: int | None,
|
||||
):
|
||||
"""List ArchiveResults as JSONL."""
|
||||
sys.exit(list_archiveresults(
|
||||
status=status,
|
||||
plugin=plugin,
|
||||
snapshot_id=snapshot_id,
|
||||
limit=limit,
|
||||
))
|
||||
sys.exit(
|
||||
list_archiveresults(
|
||||
status=status,
|
||||
plugin=plugin,
|
||||
snapshot_id=snapshot_id,
|
||||
limit=limit,
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@main.command('update')
|
||||
@click.option('--status', '-s', help='Set status')
|
||||
def update_cmd(status: Optional[str]):
|
||||
@main.command("update")
|
||||
@click.option("--status", "-s", help="Set status")
|
||||
def update_cmd(status: str | None):
|
||||
"""Update ArchiveResults from stdin JSONL."""
|
||||
sys.exit(update_archiveresults(status=status))
|
||||
|
||||
|
||||
@main.command('delete')
|
||||
@click.option('--yes', '-y', is_flag=True, help='Confirm deletion')
|
||||
@click.option('--dry-run', is_flag=True, help='Show what would be deleted')
|
||||
@main.command("delete")
|
||||
@click.option("--yes", "-y", is_flag=True, help="Confirm deletion")
|
||||
@click.option("--dry-run", is_flag=True, help="Show what would be deleted")
|
||||
def delete_cmd(yes: bool, dry_run: bool):
|
||||
"""Delete ArchiveResults from stdin JSONL."""
|
||||
sys.exit(delete_archiveresults(yes=yes, dry_run=dry_run))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -25,11 +25,10 @@ Examples:
|
||||
archivebox binary list --name=chrome | archivebox binary delete --yes
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox binary'
|
||||
__package__ = "archivebox.cli"
|
||||
__command__ = "archivebox binary"
|
||||
|
||||
import sys
|
||||
from typing import Optional
|
||||
|
||||
import rich_click as click
|
||||
from rich import print as rprint
|
||||
@@ -41,10 +40,11 @@ from archivebox.cli.cli_utils import apply_filters
|
||||
# CREATE
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def create_binary(
|
||||
name: str,
|
||||
abspath: str,
|
||||
version: str = '',
|
||||
version: str = "",
|
||||
) -> int:
|
||||
"""
|
||||
Create/register a Binary.
|
||||
@@ -59,7 +59,7 @@ def create_binary(
|
||||
is_tty = sys.stdout.isatty()
|
||||
|
||||
if not name or not abspath:
|
||||
rprint('[red]Both --name and --abspath are required[/red]', file=sys.stderr)
|
||||
rprint("[red]Both --name and --abspath are required[/red]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
try:
|
||||
@@ -76,28 +76,30 @@ def create_binary(
|
||||
# Mirror the Binary model lifecycle used elsewhere in the system so CLI
|
||||
# records are owned by the current machine and can be safely piped into
|
||||
# `archivebox run` without creating invalid rows missing machine_id.
|
||||
binary = Binary.from_json({
|
||||
'name': name,
|
||||
'abspath': abspath,
|
||||
'version': version,
|
||||
'binproviders': 'env',
|
||||
'binprovider': 'env',
|
||||
})
|
||||
binary = Binary.from_json(
|
||||
{
|
||||
"name": name,
|
||||
"abspath": abspath,
|
||||
"version": version,
|
||||
"binproviders": "env",
|
||||
"binprovider": "env",
|
||||
},
|
||||
)
|
||||
if binary is None:
|
||||
raise ValueError('failed to create binary record')
|
||||
raise ValueError("failed to create binary record")
|
||||
|
||||
if not is_tty:
|
||||
write_record(binary.to_json())
|
||||
|
||||
if created:
|
||||
rprint(f'[green]Created binary: {name} at {abspath}[/green]', file=sys.stderr)
|
||||
rprint(f"[green]Created binary: {name} at {abspath}[/green]", file=sys.stderr)
|
||||
else:
|
||||
rprint(f'[dim]Binary already exists: {name} at {abspath}[/dim]', file=sys.stderr)
|
||||
rprint(f"[dim]Binary already exists: {name} at {abspath}[/dim]", file=sys.stderr)
|
||||
|
||||
return 0
|
||||
|
||||
except Exception as e:
|
||||
rprint(f'[red]Error creating binary: {e}[/red]', file=sys.stderr)
|
||||
rprint(f"[red]Error creating binary: {e}[/red]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
|
||||
@@ -105,11 +107,12 @@ def create_binary(
|
||||
# LIST
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def list_binaries(
|
||||
name: Optional[str] = None,
|
||||
abspath__icontains: Optional[str] = None,
|
||||
version__icontains: Optional[str] = None,
|
||||
limit: Optional[int] = None,
|
||||
name: str | None = None,
|
||||
abspath__icontains: str | None = None,
|
||||
version__icontains: str | None = None,
|
||||
limit: int | None = None,
|
||||
) -> int:
|
||||
"""
|
||||
List Binaries as JSONL with optional filters.
|
||||
@@ -122,25 +125,25 @@ def list_binaries(
|
||||
|
||||
is_tty = sys.stdout.isatty()
|
||||
|
||||
queryset = Binary.objects.all().order_by('name', '-modified_at', '-created_at')
|
||||
queryset = Binary.objects.all().order_by("name", "-modified_at", "-created_at")
|
||||
|
||||
# Apply filters
|
||||
filter_kwargs = {
|
||||
'name': name,
|
||||
'abspath__icontains': abspath__icontains,
|
||||
'version__icontains': version__icontains,
|
||||
"name": name,
|
||||
"abspath__icontains": abspath__icontains,
|
||||
"version__icontains": version__icontains,
|
||||
}
|
||||
queryset = apply_filters(queryset, filter_kwargs, limit=limit)
|
||||
|
||||
count = 0
|
||||
for binary in queryset:
|
||||
if is_tty:
|
||||
rprint(f'[cyan]{binary.name:20}[/cyan] [dim]{binary.version:15}[/dim] {binary.abspath}')
|
||||
rprint(f"[cyan]{binary.name:20}[/cyan] [dim]{binary.version:15}[/dim] {binary.abspath}")
|
||||
else:
|
||||
write_record(binary.to_json())
|
||||
count += 1
|
||||
|
||||
rprint(f'[dim]Listed {count} binaries[/dim]', file=sys.stderr)
|
||||
rprint(f"[dim]Listed {count} binaries[/dim]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -148,9 +151,10 @@ def list_binaries(
|
||||
# UPDATE
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def update_binaries(
|
||||
version: Optional[str] = None,
|
||||
abspath: Optional[str] = None,
|
||||
version: str | None = None,
|
||||
abspath: str | None = None,
|
||||
) -> int:
|
||||
"""
|
||||
Update Binaries from stdin JSONL.
|
||||
@@ -169,12 +173,12 @@ def update_binaries(
|
||||
|
||||
records = list(read_stdin())
|
||||
if not records:
|
||||
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
updated_count = 0
|
||||
for record in records:
|
||||
binary_id = record.get('id')
|
||||
binary_id = record.get("id")
|
||||
if not binary_id:
|
||||
continue
|
||||
|
||||
@@ -194,10 +198,10 @@ def update_binaries(
|
||||
write_record(binary.to_json())
|
||||
|
||||
except Binary.DoesNotExist:
|
||||
rprint(f'[yellow]Binary not found: {binary_id}[/yellow]', file=sys.stderr)
|
||||
rprint(f"[yellow]Binary not found: {binary_id}[/yellow]", file=sys.stderr)
|
||||
continue
|
||||
|
||||
rprint(f'[green]Updated {updated_count} binaries[/green]', file=sys.stderr)
|
||||
rprint(f"[green]Updated {updated_count} binaries[/green]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -205,6 +209,7 @@ def update_binaries(
|
||||
# DELETE
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def delete_binaries(yes: bool = False, dry_run: bool = False) -> int:
|
||||
"""
|
||||
Delete Binaries from stdin JSONL.
|
||||
@@ -220,35 +225,35 @@ def delete_binaries(yes: bool = False, dry_run: bool = False) -> int:
|
||||
|
||||
records = list(read_stdin())
|
||||
if not records:
|
||||
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
binary_ids = [r.get('id') for r in records if r.get('id')]
|
||||
binary_ids = [r.get("id") for r in records if r.get("id")]
|
||||
|
||||
if not binary_ids:
|
||||
rprint('[yellow]No valid binary IDs in input[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No valid binary IDs in input[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
binaries = Binary.objects.filter(id__in=binary_ids)
|
||||
count = binaries.count()
|
||||
|
||||
if count == 0:
|
||||
rprint('[yellow]No matching binaries found[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No matching binaries found[/yellow]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
if dry_run:
|
||||
rprint(f'[yellow]Would delete {count} binaries (dry run)[/yellow]', file=sys.stderr)
|
||||
rprint(f"[yellow]Would delete {count} binaries (dry run)[/yellow]", file=sys.stderr)
|
||||
for binary in binaries:
|
||||
rprint(f' {binary.name} {binary.abspath}', file=sys.stderr)
|
||||
rprint(f" {binary.name} {binary.abspath}", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
if not yes:
|
||||
rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr)
|
||||
rprint("[red]Use --yes to confirm deletion[/red]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Perform deletion
|
||||
deleted_count, _ = binaries.delete()
|
||||
rprint(f'[green]Deleted {deleted_count} binaries[/green]', file=sys.stderr)
|
||||
rprint(f"[green]Deleted {deleted_count} binaries[/green]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -256,52 +261,59 @@ def delete_binaries(yes: bool = False, dry_run: bool = False) -> int:
|
||||
# CLI Commands
|
||||
# =============================================================================
|
||||
|
||||
|
||||
@click.group()
|
||||
def main():
|
||||
"""Manage Binary records (detected executables)."""
|
||||
pass
|
||||
|
||||
|
||||
@main.command('create')
|
||||
@click.option('--name', '-n', required=True, help='Binary name (e.g., chrome, wget)')
|
||||
@click.option('--abspath', '-p', required=True, help='Absolute path to binary')
|
||||
@click.option('--version', '-v', default='', help='Binary version')
|
||||
@main.command("create")
|
||||
@click.option("--name", "-n", required=True, help="Binary name (e.g., chrome, wget)")
|
||||
@click.option("--abspath", "-p", required=True, help="Absolute path to binary")
|
||||
@click.option("--version", "-v", default="", help="Binary version")
|
||||
def create_cmd(name: str, abspath: str, version: str):
|
||||
"""Create/register a Binary."""
|
||||
sys.exit(create_binary(name=name, abspath=abspath, version=version))
|
||||
|
||||
|
||||
@main.command('list')
|
||||
@click.option('--name', '-n', help='Filter by name')
|
||||
@click.option('--abspath__icontains', help='Filter by path contains')
|
||||
@click.option('--version__icontains', help='Filter by version contains')
|
||||
@click.option('--limit', type=int, help='Limit number of results')
|
||||
def list_cmd(name: Optional[str], abspath__icontains: Optional[str],
|
||||
version__icontains: Optional[str], limit: Optional[int]):
|
||||
@main.command("list")
|
||||
@click.option("--name", "-n", help="Filter by name")
|
||||
@click.option("--abspath__icontains", help="Filter by path contains")
|
||||
@click.option("--version__icontains", help="Filter by version contains")
|
||||
@click.option("--limit", type=int, help="Limit number of results")
|
||||
def list_cmd(
|
||||
name: str | None,
|
||||
abspath__icontains: str | None,
|
||||
version__icontains: str | None,
|
||||
limit: int | None,
|
||||
):
|
||||
"""List Binaries as JSONL."""
|
||||
sys.exit(list_binaries(
|
||||
name=name,
|
||||
abspath__icontains=abspath__icontains,
|
||||
version__icontains=version__icontains,
|
||||
limit=limit,
|
||||
))
|
||||
sys.exit(
|
||||
list_binaries(
|
||||
name=name,
|
||||
abspath__icontains=abspath__icontains,
|
||||
version__icontains=version__icontains,
|
||||
limit=limit,
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@main.command('update')
|
||||
@click.option('--version', '-v', help='Set version')
|
||||
@click.option('--abspath', '-p', help='Set path')
|
||||
def update_cmd(version: Optional[str], abspath: Optional[str]):
|
||||
@main.command("update")
|
||||
@click.option("--version", "-v", help="Set version")
|
||||
@click.option("--abspath", "-p", help="Set path")
|
||||
def update_cmd(version: str | None, abspath: str | None):
|
||||
"""Update Binaries from stdin JSONL."""
|
||||
sys.exit(update_binaries(version=version, abspath=abspath))
|
||||
|
||||
|
||||
@main.command('delete')
|
||||
@click.option('--yes', '-y', is_flag=True, help='Confirm deletion')
|
||||
@click.option('--dry-run', is_flag=True, help='Show what would be deleted')
|
||||
@main.command("delete")
|
||||
@click.option("--yes", "-y", is_flag=True, help="Confirm deletion")
|
||||
@click.option("--dry-run", is_flag=True, help="Show what would be deleted")
|
||||
def delete_cmd(yes: bool, dry_run: bool):
|
||||
"""Delete Binaries from stdin JSONL."""
|
||||
sys.exit(delete_binaries(yes=yes, dry_run=dry_run))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__package__ = "archivebox.cli"
|
||||
|
||||
import sys
|
||||
import rich_click as click
|
||||
@@ -12,12 +12,14 @@ from archivebox.misc.toml_util import CustomTOMLEncoder
|
||||
|
||||
|
||||
@enforce_types
|
||||
def config(*keys,
|
||||
get: bool=False,
|
||||
set: bool=False,
|
||||
search: bool=False,
|
||||
reset: bool=False,
|
||||
**kwargs) -> None:
|
||||
def config(
|
||||
*keys,
|
||||
get: bool = False,
|
||||
set: bool = False,
|
||||
search: bool = False,
|
||||
reset: bool = False,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
"""Get and set your ArchiveBox project configuration values"""
|
||||
|
||||
from archivebox.misc.checks import check_data_folder
|
||||
@@ -29,8 +31,8 @@ def config(*keys,
|
||||
|
||||
FLAT_CONFIG = get_flat_config()
|
||||
CONFIGS = get_all_configs()
|
||||
|
||||
config_options: list[str] = list(kwargs.pop('key=value', []) or keys or [f'{key}={val}' for key, val in kwargs.items()])
|
||||
|
||||
config_options: list[str] = list(kwargs.pop("key=value", []) or keys or [f"{key}={val}" for key, val in kwargs.items()])
|
||||
no_args = not (get or set or reset or config_options)
|
||||
|
||||
matching_config = {}
|
||||
@@ -39,19 +41,19 @@ def config(*keys,
|
||||
config_options = [get_real_name(key) for key in config_options]
|
||||
matching_config = {key: FLAT_CONFIG[key] for key in config_options if key in FLAT_CONFIG}
|
||||
for config_section in CONFIGS.values():
|
||||
aliases = getattr(config_section, 'aliases', {})
|
||||
|
||||
aliases = getattr(config_section, "aliases", {})
|
||||
|
||||
for search_key in config_options:
|
||||
# search all aliases in the section
|
||||
for alias_key, key in aliases.items():
|
||||
if search_key.lower() in alias_key.lower():
|
||||
matching_config[key] = dict(config_section)[key]
|
||||
|
||||
|
||||
# search all keys and values in the section
|
||||
for existing_key, value in dict(config_section).items():
|
||||
if search_key.lower() in existing_key.lower() or search_key.lower() in str(value).lower():
|
||||
matching_config[existing_key] = value
|
||||
|
||||
|
||||
print(printable_config(matching_config))
|
||||
raise SystemExit(not matching_config)
|
||||
|
||||
@@ -61,23 +63,23 @@ def config(*keys,
|
||||
matching_config = {key: FLAT_CONFIG[key] for key in config_options if key in FLAT_CONFIG}
|
||||
failed_config = [key for key in config_options if key not in FLAT_CONFIG]
|
||||
if failed_config:
|
||||
print('\n[red][X] These options failed to get[/red]')
|
||||
print(' {}'.format('\n '.join(config_options)))
|
||||
print("\n[red][X] These options failed to get[/red]")
|
||||
print(" {}".format("\n ".join(config_options)))
|
||||
raise SystemExit(1)
|
||||
else:
|
||||
matching_config = FLAT_CONFIG
|
||||
|
||||
# Display core config sections
|
||||
for config_section in CONFIGS.values():
|
||||
section_header = getattr(config_section, 'toml_section_header', '')
|
||||
section_header = getattr(config_section, "toml_section_header", "")
|
||||
if isinstance(section_header, str) and section_header:
|
||||
print(f'[grey53]\\[{section_header}][/grey53]')
|
||||
print(f"[grey53]\\[{section_header}][/grey53]")
|
||||
else:
|
||||
print('[grey53]\\[CONSTANTS] # (read-only)[/grey53]')
|
||||
print("[grey53]\\[CONSTANTS] # (read-only)[/grey53]")
|
||||
|
||||
kv_in_section = {key: val for key, val in dict(config_section).items() if key in matching_config}
|
||||
print(benedict(kv_in_section).to_toml(encoder=CustomTOMLEncoder()).strip().replace('\n\n', '\n'))
|
||||
print('[grey53]################################################################[/grey53]')
|
||||
print(benedict(kv_in_section).to_toml(encoder=CustomTOMLEncoder()).strip().replace("\n\n", "\n"))
|
||||
print("[grey53]################################################################[/grey53]")
|
||||
|
||||
# Display plugin config section
|
||||
from archivebox.hooks import discover_plugin_configs
|
||||
@@ -87,17 +89,17 @@ def config(*keys,
|
||||
|
||||
# Collect all plugin config keys
|
||||
for plugin_name, schema in plugin_configs.items():
|
||||
if 'properties' not in schema:
|
||||
if "properties" not in schema:
|
||||
continue
|
||||
for key in schema['properties'].keys():
|
||||
for key in schema["properties"].keys():
|
||||
if key in matching_config:
|
||||
plugin_keys[key] = matching_config[key]
|
||||
|
||||
# Display all plugin config in single [PLUGINS] section
|
||||
if plugin_keys:
|
||||
print('[grey53]\\[PLUGINS][/grey53]')
|
||||
print(benedict(plugin_keys).to_toml(encoder=CustomTOMLEncoder()).strip().replace('\n\n', '\n'))
|
||||
print('[grey53]################################################################[/grey53]')
|
||||
print("[grey53]\\[PLUGINS][/grey53]")
|
||||
print(benedict(plugin_keys).to_toml(encoder=CustomTOMLEncoder()).strip().replace("\n\n", "\n"))
|
||||
print("[grey53]################################################################[/grey53]")
|
||||
|
||||
raise SystemExit(not matching_config)
|
||||
|
||||
@@ -105,18 +107,20 @@ def config(*keys,
|
||||
new_config = {}
|
||||
failed_options = []
|
||||
for line in config_options:
|
||||
if line.startswith('#') or not line.strip():
|
||||
if line.startswith("#") or not line.strip():
|
||||
continue
|
||||
if '=' not in line:
|
||||
print('[red][X] Config KEY=VALUE must have an = sign in it[/red]')
|
||||
print(f' {line}')
|
||||
if "=" not in line:
|
||||
print("[red][X] Config KEY=VALUE must have an = sign in it[/red]")
|
||||
print(f" {line}")
|
||||
raise SystemExit(2)
|
||||
|
||||
raw_key, val = line.split('=', 1)
|
||||
raw_key, val = line.split("=", 1)
|
||||
raw_key = raw_key.upper().strip()
|
||||
key = get_real_name(raw_key)
|
||||
if key != raw_key:
|
||||
print(f'[yellow][i] Note: The config option {raw_key} has been renamed to {key}, please use the new name going forwards.[/yellow]')
|
||||
print(
|
||||
f"[yellow][i] Note: The config option {raw_key} has been renamed to {key}, please use the new name going forwards.[/yellow]",
|
||||
)
|
||||
|
||||
if key in FLAT_CONFIG:
|
||||
new_config[key] = val.strip()
|
||||
@@ -136,38 +140,38 @@ def config(*keys,
|
||||
|
||||
if side_effect_changes:
|
||||
print(file=sys.stderr)
|
||||
print('[yellow][i] Note: This change also affected these other options that depended on it:[/yellow]', file=sys.stderr)
|
||||
print(' {}'.format(printable_config(side_effect_changes, prefix=' ')), file=sys.stderr)
|
||||
print("[yellow][i] Note: This change also affected these other options that depended on it:[/yellow]", file=sys.stderr)
|
||||
print(" {}".format(printable_config(side_effect_changes, prefix=" ")), file=sys.stderr)
|
||||
|
||||
if failed_options:
|
||||
print()
|
||||
print('[red][X] These options failed to set (check for typos):[/red]')
|
||||
print(' {}'.format('\n '.join(failed_options)))
|
||||
print("[red][X] These options failed to set (check for typos):[/red]")
|
||||
print(" {}".format("\n ".join(failed_options)))
|
||||
raise SystemExit(1)
|
||||
|
||||
elif reset:
|
||||
print('[red][X] This command is not implemented yet.[/red]')
|
||||
print(' Please manually remove the relevant lines from your config file:')
|
||||
print("[red][X] This command is not implemented yet.[/red]")
|
||||
print(" Please manually remove the relevant lines from your config file:")
|
||||
raise SystemExit(2)
|
||||
|
||||
else:
|
||||
print('[red][X] You must pass either --get or --set, or no arguments to get the whole config.[/red]')
|
||||
print(' archivebox config')
|
||||
print(' archivebox config --get SOME_KEY')
|
||||
print(' archivebox config --set SOME_KEY=SOME_VALUE')
|
||||
print("[red][X] You must pass either --get or --set, or no arguments to get the whole config.[/red]")
|
||||
print(" archivebox config")
|
||||
print(" archivebox config --get SOME_KEY")
|
||||
print(" archivebox config --set SOME_KEY=SOME_VALUE")
|
||||
raise SystemExit(2)
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option('--search', is_flag=True, help='Search config KEYs, VALUEs, and ALIASES for the given term')
|
||||
@click.option('--get', is_flag=True, help='Get the value for the given config KEYs')
|
||||
@click.option('--set', is_flag=True, help='Set the given KEY=VALUE config values')
|
||||
@click.option('--reset', is_flag=True, help='Reset the given KEY config values to their defaults')
|
||||
@click.argument('KEY=VALUE', nargs=-1, type=str)
|
||||
@click.option("--search", is_flag=True, help="Search config KEYs, VALUEs, and ALIASES for the given term")
|
||||
@click.option("--get", is_flag=True, help="Get the value for the given config KEYs")
|
||||
@click.option("--set", is_flag=True, help="Set the given KEY=VALUE config values")
|
||||
@click.option("--reset", is_flag=True, help="Reset the given KEY config values to their defaults")
|
||||
@click.argument("KEY=VALUE", nargs=-1, type=str)
|
||||
@docstring(config.__doc__)
|
||||
def main(**kwargs) -> None:
|
||||
config(**kwargs)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -30,11 +30,11 @@ Examples:
|
||||
archivebox crawl create https://example.com | archivebox snapshot create | archivebox run
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox crawl'
|
||||
__package__ = "archivebox.cli"
|
||||
__command__ = "archivebox crawl"
|
||||
|
||||
import sys
|
||||
from typing import Optional, Iterable
|
||||
from collections.abc import Iterable
|
||||
|
||||
import rich_click as click
|
||||
from rich import print as rprint
|
||||
@@ -46,12 +46,13 @@ from archivebox.cli.cli_utils import apply_filters
|
||||
# CREATE
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def create_crawl(
|
||||
urls: Iterable[str],
|
||||
depth: int = 0,
|
||||
tag: str = '',
|
||||
status: str = 'queued',
|
||||
created_by_id: Optional[int] = None,
|
||||
tag: str = "",
|
||||
status: str = "queued",
|
||||
created_by_id: int | None = None,
|
||||
) -> int:
|
||||
"""
|
||||
Create a Crawl job from URLs.
|
||||
@@ -74,7 +75,7 @@ def create_crawl(
|
||||
records = list(read_args_or_stdin(urls))
|
||||
|
||||
if not records:
|
||||
rprint('[yellow]No URLs provided. Pass URLs as arguments or via stdin.[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No URLs provided. Pass URLs as arguments or via stdin.[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Separate pass-through records from URL records
|
||||
@@ -82,29 +83,29 @@ def create_crawl(
|
||||
pass_through_records = []
|
||||
|
||||
for record in records:
|
||||
record_type = record.get('type', '')
|
||||
record_type = record.get("type", "")
|
||||
|
||||
# Pass-through: output records that aren't URL/Crawl types
|
||||
if record_type and record_type != TYPE_CRAWL and not record.get('url') and not record.get('urls'):
|
||||
if record_type and record_type != TYPE_CRAWL and not record.get("url") and not record.get("urls"):
|
||||
pass_through_records.append(record)
|
||||
continue
|
||||
|
||||
# Handle existing Crawl records (just pass through with id)
|
||||
if record_type == TYPE_CRAWL and record.get('id'):
|
||||
if record_type == TYPE_CRAWL and record.get("id"):
|
||||
pass_through_records.append(record)
|
||||
continue
|
||||
|
||||
# Collect URLs
|
||||
url = record.get('url')
|
||||
url = record.get("url")
|
||||
if url:
|
||||
url_list.append(url)
|
||||
|
||||
# Handle 'urls' field (newline-separated)
|
||||
urls_field = record.get('urls')
|
||||
urls_field = record.get("urls")
|
||||
if urls_field:
|
||||
for line in urls_field.split('\n'):
|
||||
for line in urls_field.split("\n"):
|
||||
line = line.strip()
|
||||
if line and not line.startswith('#'):
|
||||
if line and not line.startswith("#"):
|
||||
url_list.append(line)
|
||||
|
||||
# Output pass-through records first
|
||||
@@ -115,44 +116,44 @@ def create_crawl(
|
||||
if not url_list:
|
||||
if pass_through_records:
|
||||
# If we had pass-through records but no URLs, that's OK
|
||||
rprint(f'[dim]Passed through {len(pass_through_records)} records, no new URLs[/dim]', file=sys.stderr)
|
||||
rprint(f"[dim]Passed through {len(pass_through_records)} records, no new URLs[/dim]", file=sys.stderr)
|
||||
return 0
|
||||
rprint('[red]No valid URLs found[/red]', file=sys.stderr)
|
||||
rprint("[red]No valid URLs found[/red]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
try:
|
||||
# Build crawl record with all URLs as newline-separated string
|
||||
crawl_record = {
|
||||
'urls': '\n'.join(url_list),
|
||||
'max_depth': depth,
|
||||
'tags_str': tag,
|
||||
'status': status,
|
||||
'label': '',
|
||||
"urls": "\n".join(url_list),
|
||||
"max_depth": depth,
|
||||
"tags_str": tag,
|
||||
"status": status,
|
||||
"label": "",
|
||||
}
|
||||
|
||||
crawl = Crawl.from_json(crawl_record, overrides={'created_by_id': created_by_id})
|
||||
crawl = Crawl.from_json(crawl_record, overrides={"created_by_id": created_by_id})
|
||||
if not crawl:
|
||||
rprint('[red]Failed to create crawl[/red]', file=sys.stderr)
|
||||
rprint("[red]Failed to create crawl[/red]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Output JSONL record (only when piped)
|
||||
if not is_tty:
|
||||
write_record(crawl.to_json())
|
||||
|
||||
rprint(f'[green]Created crawl with {len(url_list)} URLs[/green]', file=sys.stderr)
|
||||
rprint(f"[green]Created crawl with {len(url_list)} URLs[/green]", file=sys.stderr)
|
||||
|
||||
# If TTY, show human-readable output
|
||||
if is_tty:
|
||||
rprint(f' [dim]{crawl.id}[/dim]', file=sys.stderr)
|
||||
rprint(f" [dim]{crawl.id}[/dim]", file=sys.stderr)
|
||||
for url in url_list[:5]: # Show first 5 URLs
|
||||
rprint(f' {url[:70]}', file=sys.stderr)
|
||||
rprint(f" {url[:70]}", file=sys.stderr)
|
||||
if len(url_list) > 5:
|
||||
rprint(f' ... and {len(url_list) - 5} more', file=sys.stderr)
|
||||
rprint(f" ... and {len(url_list) - 5} more", file=sys.stderr)
|
||||
|
||||
return 0
|
||||
|
||||
except Exception as e:
|
||||
rprint(f'[red]Error creating crawl: {e}[/red]', file=sys.stderr)
|
||||
rprint(f"[red]Error creating crawl: {e}[/red]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
|
||||
@@ -160,11 +161,12 @@ def create_crawl(
|
||||
# LIST
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def list_crawls(
|
||||
status: Optional[str] = None,
|
||||
urls__icontains: Optional[str] = None,
|
||||
max_depth: Optional[int] = None,
|
||||
limit: Optional[int] = None,
|
||||
status: str | None = None,
|
||||
urls__icontains: str | None = None,
|
||||
max_depth: int | None = None,
|
||||
limit: int | None = None,
|
||||
) -> int:
|
||||
"""
|
||||
List Crawls as JSONL with optional filters.
|
||||
@@ -177,13 +179,13 @@ def list_crawls(
|
||||
|
||||
is_tty = sys.stdout.isatty()
|
||||
|
||||
queryset = Crawl.objects.all().order_by('-created_at')
|
||||
queryset = Crawl.objects.all().order_by("-created_at")
|
||||
|
||||
# Apply filters
|
||||
filter_kwargs = {
|
||||
'status': status,
|
||||
'urls__icontains': urls__icontains,
|
||||
'max_depth': max_depth,
|
||||
"status": status,
|
||||
"urls__icontains": urls__icontains,
|
||||
"max_depth": max_depth,
|
||||
}
|
||||
queryset = apply_filters(queryset, filter_kwargs, limit=limit)
|
||||
|
||||
@@ -191,17 +193,17 @@ def list_crawls(
|
||||
for crawl in queryset:
|
||||
if is_tty:
|
||||
status_color = {
|
||||
'queued': 'yellow',
|
||||
'started': 'blue',
|
||||
'sealed': 'green',
|
||||
}.get(crawl.status, 'dim')
|
||||
url_preview = crawl.urls[:50].replace('\n', ' ')
|
||||
rprint(f'[{status_color}]{crawl.status:8}[/{status_color}] [dim]{crawl.id}[/dim] {url_preview}...')
|
||||
"queued": "yellow",
|
||||
"started": "blue",
|
||||
"sealed": "green",
|
||||
}.get(crawl.status, "dim")
|
||||
url_preview = crawl.urls[:50].replace("\n", " ")
|
||||
rprint(f"[{status_color}]{crawl.status:8}[/{status_color}] [dim]{crawl.id}[/dim] {url_preview}...")
|
||||
else:
|
||||
write_record(crawl.to_json())
|
||||
count += 1
|
||||
|
||||
rprint(f'[dim]Listed {count} crawls[/dim]', file=sys.stderr)
|
||||
rprint(f"[dim]Listed {count} crawls[/dim]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -209,9 +211,10 @@ def list_crawls(
|
||||
# UPDATE
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def update_crawls(
|
||||
status: Optional[str] = None,
|
||||
max_depth: Optional[int] = None,
|
||||
status: str | None = None,
|
||||
max_depth: int | None = None,
|
||||
) -> int:
|
||||
"""
|
||||
Update Crawls from stdin JSONL.
|
||||
@@ -232,12 +235,12 @@ def update_crawls(
|
||||
|
||||
records = list(read_stdin())
|
||||
if not records:
|
||||
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
updated_count = 0
|
||||
for record in records:
|
||||
crawl_id = record.get('id')
|
||||
crawl_id = record.get("id")
|
||||
if not crawl_id:
|
||||
continue
|
||||
|
||||
@@ -258,10 +261,10 @@ def update_crawls(
|
||||
write_record(crawl.to_json())
|
||||
|
||||
except Crawl.DoesNotExist:
|
||||
rprint(f'[yellow]Crawl not found: {crawl_id}[/yellow]', file=sys.stderr)
|
||||
rprint(f"[yellow]Crawl not found: {crawl_id}[/yellow]", file=sys.stderr)
|
||||
continue
|
||||
|
||||
rprint(f'[green]Updated {updated_count} crawls[/green]', file=sys.stderr)
|
||||
rprint(f"[green]Updated {updated_count} crawls[/green]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -269,6 +272,7 @@ def update_crawls(
|
||||
# DELETE
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def delete_crawls(yes: bool = False, dry_run: bool = False) -> int:
|
||||
"""
|
||||
Delete Crawls from stdin JSONL.
|
||||
@@ -284,36 +288,36 @@ def delete_crawls(yes: bool = False, dry_run: bool = False) -> int:
|
||||
|
||||
records = list(read_stdin())
|
||||
if not records:
|
||||
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
crawl_ids = [r.get('id') for r in records if r.get('id')]
|
||||
crawl_ids = [r.get("id") for r in records if r.get("id")]
|
||||
|
||||
if not crawl_ids:
|
||||
rprint('[yellow]No valid crawl IDs in input[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No valid crawl IDs in input[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
crawls = Crawl.objects.filter(id__in=crawl_ids)
|
||||
count = crawls.count()
|
||||
|
||||
if count == 0:
|
||||
rprint('[yellow]No matching crawls found[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No matching crawls found[/yellow]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
if dry_run:
|
||||
rprint(f'[yellow]Would delete {count} crawls (dry run)[/yellow]', file=sys.stderr)
|
||||
rprint(f"[yellow]Would delete {count} crawls (dry run)[/yellow]", file=sys.stderr)
|
||||
for crawl in crawls:
|
||||
url_preview = crawl.urls[:50].replace('\n', ' ')
|
||||
rprint(f' [dim]{crawl.id}[/dim] {url_preview}...', file=sys.stderr)
|
||||
url_preview = crawl.urls[:50].replace("\n", " ")
|
||||
rprint(f" [dim]{crawl.id}[/dim] {url_preview}...", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
if not yes:
|
||||
rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr)
|
||||
rprint("[red]Use --yes to confirm deletion[/red]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Perform deletion
|
||||
deleted_count, _ = crawls.delete()
|
||||
rprint(f'[green]Deleted {deleted_count} crawls[/green]', file=sys.stderr)
|
||||
rprint(f"[green]Deleted {deleted_count} crawls[/green]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -321,53 +325,60 @@ def delete_crawls(yes: bool = False, dry_run: bool = False) -> int:
|
||||
# CLI Commands
|
||||
# =============================================================================
|
||||
|
||||
|
||||
@click.group()
|
||||
def main():
|
||||
"""Manage Crawl records."""
|
||||
pass
|
||||
|
||||
|
||||
@main.command('create')
|
||||
@click.argument('urls', nargs=-1)
|
||||
@click.option('--depth', '-d', type=int, default=0, help='Max crawl depth (default: 0)')
|
||||
@click.option('--tag', '-t', default='', help='Comma-separated tags to add')
|
||||
@click.option('--status', '-s', default='queued', help='Initial status (default: queued)')
|
||||
@main.command("create")
|
||||
@click.argument("urls", nargs=-1)
|
||||
@click.option("--depth", "-d", type=int, default=0, help="Max crawl depth (default: 0)")
|
||||
@click.option("--tag", "-t", default="", help="Comma-separated tags to add")
|
||||
@click.option("--status", "-s", default="queued", help="Initial status (default: queued)")
|
||||
def create_cmd(urls: tuple, depth: int, tag: str, status: str):
|
||||
"""Create a Crawl job from URLs or stdin."""
|
||||
sys.exit(create_crawl(urls, depth=depth, tag=tag, status=status))
|
||||
|
||||
|
||||
@main.command('list')
|
||||
@click.option('--status', '-s', help='Filter by status (queued, started, sealed)')
|
||||
@click.option('--urls__icontains', help='Filter by URLs contains')
|
||||
@click.option('--max-depth', type=int, help='Filter by max depth')
|
||||
@click.option('--limit', '-n', type=int, help='Limit number of results')
|
||||
def list_cmd(status: Optional[str], urls__icontains: Optional[str],
|
||||
max_depth: Optional[int], limit: Optional[int]):
|
||||
@main.command("list")
|
||||
@click.option("--status", "-s", help="Filter by status (queued, started, sealed)")
|
||||
@click.option("--urls__icontains", help="Filter by URLs contains")
|
||||
@click.option("--max-depth", type=int, help="Filter by max depth")
|
||||
@click.option("--limit", "-n", type=int, help="Limit number of results")
|
||||
def list_cmd(
|
||||
status: str | None,
|
||||
urls__icontains: str | None,
|
||||
max_depth: int | None,
|
||||
limit: int | None,
|
||||
):
|
||||
"""List Crawls as JSONL."""
|
||||
sys.exit(list_crawls(
|
||||
status=status,
|
||||
urls__icontains=urls__icontains,
|
||||
max_depth=max_depth,
|
||||
limit=limit,
|
||||
))
|
||||
sys.exit(
|
||||
list_crawls(
|
||||
status=status,
|
||||
urls__icontains=urls__icontains,
|
||||
max_depth=max_depth,
|
||||
limit=limit,
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@main.command('update')
|
||||
@click.option('--status', '-s', help='Set status')
|
||||
@click.option('--max-depth', type=int, help='Set max depth')
|
||||
def update_cmd(status: Optional[str], max_depth: Optional[int]):
|
||||
@main.command("update")
|
||||
@click.option("--status", "-s", help="Set status")
|
||||
@click.option("--max-depth", type=int, help="Set max depth")
|
||||
def update_cmd(status: str | None, max_depth: int | None):
|
||||
"""Update Crawls from stdin JSONL."""
|
||||
sys.exit(update_crawls(status=status, max_depth=max_depth))
|
||||
|
||||
|
||||
@main.command('delete')
|
||||
@click.option('--yes', '-y', is_flag=True, help='Confirm deletion')
|
||||
@click.option('--dry-run', is_flag=True, help='Show what would be deleted')
|
||||
@main.command("delete")
|
||||
@click.option("--yes", "-y", is_flag=True, help="Confirm deletion")
|
||||
@click.option("--dry-run", is_flag=True, help="Show what would be deleted")
|
||||
def delete_cmd(yes: bool, dry_run: bool):
|
||||
"""Delete Crawls from stdin JSONL."""
|
||||
sys.exit(delete_crawls(yes=yes, dry_run=dry_run))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox crawl'
|
||||
__package__ = "archivebox.cli"
|
||||
__command__ = "archivebox crawl"
|
||||
|
||||
import sys
|
||||
|
||||
@@ -10,12 +10,12 @@ import rich_click as click
|
||||
from archivebox.cli.archivebox_add import add
|
||||
|
||||
|
||||
@click.command(context_settings={'ignore_unknown_options': True})
|
||||
@click.option('--depth', '-d', type=int, default=0, help='Max crawl depth (default: 0)')
|
||||
@click.option('--tag', '-t', default='', help='Comma-separated tags to add')
|
||||
@click.option('--status', '-s', default='queued', help='Initial status (default: queued)')
|
||||
@click.option('--wait/--no-wait', 'wait', default=True, help='Accepted for backwards compatibility')
|
||||
@click.argument('urls', nargs=-1)
|
||||
@click.command(context_settings={"ignore_unknown_options": True})
|
||||
@click.option("--depth", "-d", type=int, default=0, help="Max crawl depth (default: 0)")
|
||||
@click.option("--tag", "-t", default="", help="Comma-separated tags to add")
|
||||
@click.option("--status", "-s", default="queued", help="Initial status (default: queued)")
|
||||
@click.option("--wait/--no-wait", "wait", default=True, help="Accepted for backwards compatibility")
|
||||
@click.argument("urls", nargs=-1)
|
||||
def main(depth: int, tag: str, status: str, wait: bool, urls: tuple[str, ...]):
|
||||
"""Backwards-compatible `archivebox crawl URL...` entrypoint."""
|
||||
del status, wait
|
||||
@@ -23,5 +23,5 @@ def main(depth: int, tag: str, status: str, wait: bool, urls: tuple[str, ...]):
|
||||
sys.exit(0)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -27,8 +27,8 @@ Examples:
|
||||
archivebox crawl https://example.com | archivebox snapshot | archivebox extract
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox extract'
|
||||
__package__ = "archivebox.cli"
|
||||
__command__ = "archivebox extract"
|
||||
|
||||
import sys
|
||||
from collections import defaultdict
|
||||
@@ -52,51 +52,52 @@ def process_archiveresult_by_id(archiveresult_id: str) -> int:
|
||||
try:
|
||||
archiveresult = ArchiveResult.objects.get(id=archiveresult_id)
|
||||
except ArchiveResult.DoesNotExist:
|
||||
rprint(f'[red]ArchiveResult {archiveresult_id} not found[/red]', file=sys.stderr)
|
||||
rprint(f"[red]ArchiveResult {archiveresult_id} not found[/red]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
rprint(f'[blue]Extracting {archiveresult.plugin} for {archiveresult.snapshot.url}[/blue]', file=sys.stderr)
|
||||
rprint(f"[blue]Extracting {archiveresult.plugin} for {archiveresult.snapshot.url}[/blue]", file=sys.stderr)
|
||||
|
||||
try:
|
||||
archiveresult.reset_for_retry()
|
||||
snapshot = archiveresult.snapshot
|
||||
snapshot.status = snapshot.StatusChoices.QUEUED
|
||||
snapshot.retry_at = timezone.now()
|
||||
snapshot.save(update_fields=['status', 'retry_at', 'modified_at'])
|
||||
snapshot.save(update_fields=["status", "retry_at", "modified_at"])
|
||||
|
||||
crawl = snapshot.crawl
|
||||
if crawl.status != crawl.StatusChoices.STARTED:
|
||||
crawl.status = crawl.StatusChoices.QUEUED
|
||||
crawl.retry_at = timezone.now()
|
||||
crawl.save(update_fields=['status', 'retry_at', 'modified_at'])
|
||||
crawl.save(update_fields=["status", "retry_at", "modified_at"])
|
||||
|
||||
run_crawl(str(crawl.id), snapshot_ids=[str(snapshot.id)], selected_plugins=[archiveresult.plugin])
|
||||
archiveresult.refresh_from_db()
|
||||
|
||||
if archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED:
|
||||
print(f'[green]Extraction succeeded: {archiveresult.output_str}[/green]')
|
||||
print(f"[green]Extraction succeeded: {archiveresult.output_str}[/green]")
|
||||
return 0
|
||||
elif archiveresult.status == ArchiveResult.StatusChoices.NORESULTS:
|
||||
print(f'[dim]Extraction completed with no results: {archiveresult.output_str}[/dim]')
|
||||
print(f"[dim]Extraction completed with no results: {archiveresult.output_str}[/dim]")
|
||||
return 0
|
||||
elif archiveresult.status == ArchiveResult.StatusChoices.FAILED:
|
||||
print(f'[red]Extraction failed: {archiveresult.output_str}[/red]', file=sys.stderr)
|
||||
print(f"[red]Extraction failed: {archiveresult.output_str}[/red]", file=sys.stderr)
|
||||
return 1
|
||||
else:
|
||||
# Still in progress or backoff - not a failure
|
||||
print(f'[yellow]Extraction status: {archiveresult.status}[/yellow]')
|
||||
print(f"[yellow]Extraction status: {archiveresult.status}[/yellow]")
|
||||
return 0
|
||||
|
||||
except Exception as e:
|
||||
print(f'[red]Extraction error: {type(e).__name__}: {e}[/red]', file=sys.stderr)
|
||||
print(f"[red]Extraction error: {type(e).__name__}: {e}[/red]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
|
||||
def run_plugins(
|
||||
args: tuple,
|
||||
records: list[dict] | None = None,
|
||||
plugins: str = '',
|
||||
plugins: str = "",
|
||||
wait: bool = True,
|
||||
emit_results: bool = True,
|
||||
) -> int:
|
||||
"""
|
||||
Run plugins on Snapshots from input.
|
||||
@@ -111,16 +112,18 @@ def run_plugins(
|
||||
from django.utils import timezone
|
||||
|
||||
from archivebox.misc.jsonl import (
|
||||
read_args_or_stdin, write_record,
|
||||
TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
|
||||
read_args_or_stdin,
|
||||
write_record,
|
||||
TYPE_SNAPSHOT,
|
||||
TYPE_ARCHIVERESULT,
|
||||
)
|
||||
from archivebox.core.models import Snapshot, ArchiveResult
|
||||
from archivebox.core.models import Snapshot
|
||||
from archivebox.services.runner import run_crawl
|
||||
|
||||
is_tty = sys.stdout.isatty()
|
||||
|
||||
# Parse comma-separated plugins list once (reused in creation and filtering)
|
||||
plugins_list = [p.strip() for p in plugins.split(',') if p.strip()] if plugins else []
|
||||
plugins_list = [p.strip() for p in plugins.split(",") if p.strip()] if plugins else []
|
||||
|
||||
# Parse stdin/args exactly once per CLI invocation.
|
||||
# `main()` may already have consumed stdin to distinguish Snapshot input from
|
||||
@@ -130,41 +133,41 @@ def run_plugins(
|
||||
records = list(read_args_or_stdin(args))
|
||||
|
||||
if not records:
|
||||
rprint('[yellow]No snapshots provided. Pass snapshot IDs as arguments or via stdin.[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No snapshots provided. Pass snapshot IDs as arguments or via stdin.[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Gather snapshot IDs and optional plugin constraints to process
|
||||
snapshot_ids = set()
|
||||
requested_plugins_by_snapshot: dict[str, set[str]] = defaultdict(set)
|
||||
for record in records:
|
||||
record_type = record.get('type')
|
||||
record_type = record.get("type")
|
||||
|
||||
if record_type == TYPE_SNAPSHOT:
|
||||
snapshot_id = record.get('id')
|
||||
snapshot_id = record.get("id")
|
||||
if snapshot_id:
|
||||
snapshot_ids.add(snapshot_id)
|
||||
elif record.get('url'):
|
||||
elif record.get("url"):
|
||||
# Look up by URL (get most recent if multiple exist)
|
||||
snap = Snapshot.objects.filter(url=record['url']).order_by('-created_at').first()
|
||||
snap = Snapshot.objects.filter(url=record["url"]).order_by("-created_at").first()
|
||||
if snap:
|
||||
snapshot_ids.add(str(snap.id))
|
||||
else:
|
||||
rprint(f'[yellow]Snapshot not found for URL: {record["url"]}[/yellow]', file=sys.stderr)
|
||||
rprint(f"[yellow]Snapshot not found for URL: {record['url']}[/yellow]", file=sys.stderr)
|
||||
|
||||
elif record_type == TYPE_ARCHIVERESULT:
|
||||
snapshot_id = record.get('snapshot_id')
|
||||
snapshot_id = record.get("snapshot_id")
|
||||
if snapshot_id:
|
||||
snapshot_ids.add(snapshot_id)
|
||||
plugin_name = record.get('plugin')
|
||||
plugin_name = record.get("plugin")
|
||||
if plugin_name and not plugins_list:
|
||||
requested_plugins_by_snapshot[str(snapshot_id)].add(str(plugin_name))
|
||||
|
||||
elif 'id' in record:
|
||||
elif "id" in record:
|
||||
# Assume it's a snapshot ID
|
||||
snapshot_ids.add(record['id'])
|
||||
snapshot_ids.add(record["id"])
|
||||
|
||||
if not snapshot_ids:
|
||||
rprint('[red]No valid snapshot IDs found in input[/red]', file=sys.stderr)
|
||||
rprint("[red]No valid snapshot IDs found in input[/red]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Get snapshots and ensure they have pending ArchiveResults
|
||||
@@ -173,17 +176,13 @@ def run_plugins(
|
||||
try:
|
||||
snapshot = Snapshot.objects.get(id=snapshot_id)
|
||||
except Snapshot.DoesNotExist:
|
||||
rprint(f'[yellow]Snapshot {snapshot_id} not found[/yellow]', file=sys.stderr)
|
||||
rprint(f"[yellow]Snapshot {snapshot_id} not found[/yellow]", file=sys.stderr)
|
||||
continue
|
||||
|
||||
for plugin_name in requested_plugins_by_snapshot.get(str(snapshot.id), set()):
|
||||
existing_result = snapshot.archiveresult_set.filter(plugin=plugin_name).order_by('-created_at').first()
|
||||
if existing_result and existing_result.status in [
|
||||
ArchiveResult.StatusChoices.FAILED,
|
||||
ArchiveResult.StatusChoices.SKIPPED,
|
||||
ArchiveResult.StatusChoices.NORESULTS,
|
||||
ArchiveResult.StatusChoices.BACKOFF,
|
||||
]:
|
||||
requested_plugin_names = set(plugins_list) | requested_plugins_by_snapshot.get(str(snapshot.id), set())
|
||||
for plugin_name in requested_plugin_names:
|
||||
existing_result = snapshot.archiveresult_set.filter(plugin=plugin_name).order_by("-created_at").first()
|
||||
if existing_result:
|
||||
existing_result.reset_for_retry()
|
||||
|
||||
# Reset snapshot status to allow processing
|
||||
@@ -195,34 +194,39 @@ def run_plugins(
|
||||
processed_count += 1
|
||||
|
||||
if processed_count == 0:
|
||||
rprint('[red]No snapshots to process[/red]', file=sys.stderr)
|
||||
rprint("[red]No snapshots to process[/red]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
rprint(f'[blue]Queued {processed_count} snapshots for extraction[/blue]', file=sys.stderr)
|
||||
rprint(f"[blue]Queued {processed_count} snapshots for extraction[/blue]", file=sys.stderr)
|
||||
|
||||
# Run orchestrator if --wait (default)
|
||||
if wait:
|
||||
rprint('[blue]Running plugins...[/blue]', file=sys.stderr)
|
||||
rprint("[blue]Running plugins...[/blue]", file=sys.stderr)
|
||||
snapshot_ids_by_crawl: dict[str, set[str]] = defaultdict(set)
|
||||
for snapshot_id in snapshot_ids:
|
||||
try:
|
||||
snapshot = Snapshot.objects.only('id', 'crawl_id').get(id=snapshot_id)
|
||||
snapshot = Snapshot.objects.only("id", "crawl_id").get(id=snapshot_id)
|
||||
except Snapshot.DoesNotExist:
|
||||
continue
|
||||
snapshot_ids_by_crawl[str(snapshot.crawl_id)].add(str(snapshot.id))
|
||||
|
||||
for crawl_id, crawl_snapshot_ids in snapshot_ids_by_crawl.items():
|
||||
selected_plugins = plugins_list or sorted({
|
||||
plugin
|
||||
for snapshot_id in crawl_snapshot_ids
|
||||
for plugin in requested_plugins_by_snapshot.get(str(snapshot_id), set())
|
||||
}) or None
|
||||
selected_plugins = (
|
||||
plugins_list
|
||||
or sorted(
|
||||
{plugin for snapshot_id in crawl_snapshot_ids for plugin in requested_plugins_by_snapshot.get(str(snapshot_id), set())},
|
||||
)
|
||||
or None
|
||||
)
|
||||
run_crawl(
|
||||
crawl_id,
|
||||
snapshot_ids=sorted(crawl_snapshot_ids),
|
||||
selected_plugins=selected_plugins,
|
||||
)
|
||||
|
||||
if not emit_results:
|
||||
return 0
|
||||
|
||||
# Output results as JSONL (when piped) or human-readable (when TTY)
|
||||
for snapshot_id in snapshot_ids:
|
||||
try:
|
||||
@@ -234,11 +238,14 @@ def run_plugins(
|
||||
for result in results:
|
||||
if is_tty:
|
||||
status_color = {
|
||||
'succeeded': 'green',
|
||||
'failed': 'red',
|
||||
'skipped': 'yellow',
|
||||
}.get(result.status, 'dim')
|
||||
rprint(f' [{status_color}]{result.status}[/{status_color}] {result.plugin} → {result.output_str or ""}', file=sys.stderr)
|
||||
"succeeded": "green",
|
||||
"failed": "red",
|
||||
"skipped": "yellow",
|
||||
}.get(result.status, "dim")
|
||||
rprint(
|
||||
f" [{status_color}]{result.status}[/{status_color}] {result.plugin} → {result.output_str or ''}",
|
||||
file=sys.stderr,
|
||||
)
|
||||
else:
|
||||
write_record(result.to_json())
|
||||
except Snapshot.DoesNotExist:
|
||||
@@ -250,18 +257,20 @@ def run_plugins(
|
||||
def is_archiveresult_id(value: str) -> bool:
    """Check if value looks like an ArchiveResult UUID.

    Args:
        value: Candidate identifier (the caller passes a record's ``id`` or
            ``url`` field).

    Returns:
        True only when ``value`` is a hyphenated 8-4-4-4-12 hex UUID string
        (case-insensitive) AND an ArchiveResult row with that id exists.
        False for anything else, including non-string values.
    """
    import re

    # fullmatch() rather than match() with ^...$: `$` also matches just before
    # a trailing newline, which would let "<uuid>\n" slip through to the DB
    # lookup below.
    uuid_pattern = re.compile(r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}", re.I)

    # Records may carry non-string ids; treat them as "not an ArchiveResult id"
    # instead of letting the regex raise TypeError.
    if not isinstance(value, str) or not uuid_pattern.fullmatch(value):
        return False

    # Verify it's actually an ArchiveResult (not a Snapshot or other object)
    from archivebox.core.models import ArchiveResult

    return ArchiveResult.objects.filter(id=value).exists()
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option('--plugins', '--plugin', '-p', default='', help='Comma-separated list of plugins to run (e.g., screenshot,singlefile)')
|
||||
@click.option('--wait/--no-wait', default=True, help='Wait for plugins to complete (default: wait)')
|
||||
@click.argument('args', nargs=-1)
|
||||
@click.option("--plugins", "--plugin", "-p", default="", help="Comma-separated list of plugins to run (e.g., screenshot,singlefile)")
|
||||
@click.option("--wait/--no-wait", default=True, help="Wait for plugins to complete (default: wait)")
|
||||
@click.argument("args", nargs=-1)
|
||||
def main(plugins: str, wait: bool, args: tuple):
|
||||
"""Run plugins on Snapshots, or process existing ArchiveResults by ID"""
|
||||
from archivebox.misc.jsonl import read_args_or_stdin
|
||||
@@ -271,14 +280,12 @@ def main(plugins: str, wait: bool, args: tuple):
|
||||
|
||||
if not records:
|
||||
from rich import print as rprint
|
||||
rprint('[yellow]No Snapshot IDs or ArchiveResult IDs provided. Pass as arguments or via stdin.[/yellow]', file=sys.stderr)
|
||||
|
||||
rprint("[yellow]No Snapshot IDs or ArchiveResult IDs provided. Pass as arguments or via stdin.[/yellow]", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
# Check if input looks like existing ArchiveResult IDs to process
|
||||
all_are_archiveresult_ids = all(
|
||||
is_archiveresult_id(r.get('id') or r.get('url', ''))
|
||||
for r in records
|
||||
)
|
||||
all_are_archiveresult_ids = all(is_archiveresult_id(r.get("id") or r.get("url", "")) for r in records)
|
||||
|
||||
if all_are_archiveresult_ids:
|
||||
# Process existing ArchiveResults by ID
|
||||
@@ -286,9 +293,9 @@ def main(plugins: str, wait: bool, args: tuple):
|
||||
|
||||
exit_code = 0
|
||||
for record in records:
|
||||
archiveresult_id = record.get('id') or record.get('url')
|
||||
archiveresult_id = record.get("id") or record.get("url")
|
||||
if not isinstance(archiveresult_id, str):
|
||||
rprint(f'[red]Invalid ArchiveResult input: {record}[/red]', file=sys.stderr)
|
||||
rprint(f"[red]Invalid ArchiveResult input: {record}[/red]", file=sys.stderr)
|
||||
exit_code = 1
|
||||
continue
|
||||
result = process_archiveresult_by_id(archiveresult_id)
|
||||
@@ -300,5 +307,5 @@ def main(plugins: str, wait: bool, args: tuple):
|
||||
sys.exit(run_plugins(args, records=records, plugins=plugins, wait=wait))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
#!/usr/bin/env python3
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox help'
|
||||
__package__ = "archivebox.cli"
|
||||
__command__ = "archivebox help"
|
||||
|
||||
import os
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import click
|
||||
@@ -17,33 +17,44 @@ def help() -> None:
|
||||
from archivebox.config import CONSTANTS
|
||||
from archivebox.config.permissions import IN_DOCKER
|
||||
from archivebox.misc.logging_util import log_cli_command
|
||||
|
||||
log_cli_command('help', [], None, '.')
|
||||
|
||||
COMMANDS_HELP_TEXT = '\n '.join(
|
||||
f'[green]{cmd.ljust(20)}[/green] {ArchiveBoxGroup._lazy_load(cmd).__doc__}'
|
||||
for cmd in ArchiveBoxGroup.meta_commands.keys()
|
||||
) + '\n\n ' + '\n '.join(
|
||||
f'[green]{cmd.ljust(20)}[/green] {ArchiveBoxGroup._lazy_load(cmd).__doc__}'
|
||||
for cmd in ArchiveBoxGroup.setup_commands.keys()
|
||||
) + '\n\n ' + '\n '.join(
|
||||
f'[green]{cmd.ljust(20)}[/green] {ArchiveBoxGroup._lazy_load(cmd).__doc__}'
|
||||
for cmd in ArchiveBoxGroup.archive_commands.keys()
|
||||
|
||||
log_cli_command("help", [], None, ".")
|
||||
|
||||
COMMANDS_HELP_TEXT = (
|
||||
"\n ".join(
|
||||
f"[green]{cmd.ljust(20)}[/green] {ArchiveBoxGroup._lazy_load(cmd).__doc__}" for cmd in ArchiveBoxGroup.meta_commands.keys()
|
||||
)
|
||||
+ "\n\n "
|
||||
+ "\n ".join(
|
||||
f"[green]{cmd.ljust(20)}[/green] {ArchiveBoxGroup._lazy_load(cmd).__doc__}" for cmd in ArchiveBoxGroup.setup_commands.keys()
|
||||
)
|
||||
+ "\n\n "
|
||||
+ "\n ".join(
|
||||
f"[green]{cmd.ljust(20)}[/green] {ArchiveBoxGroup._lazy_load(cmd).__doc__}" for cmd in ArchiveBoxGroup.archive_commands.keys()
|
||||
)
|
||||
)
|
||||
|
||||
DOCKER_USAGE = '''
|
||||
|
||||
DOCKER_USAGE = (
|
||||
"""
|
||||
[dodger_blue3]Docker Usage:[/dodger_blue3]
|
||||
[grey53]# using Docker Compose:[/grey53]
|
||||
[blue]docker compose run[/blue] [dark_green]archivebox[/dark_green] [green]\\[command][/green] [green3][...args][/green3] [violet][--help][/violet] [grey53][--version][/grey53]
|
||||
|
||||
[grey53]# using Docker:[/grey53]
|
||||
[blue]docker run[/blue] -v [light_slate_blue]$PWD:/data[/light_slate_blue] [grey53]-p 8000:8000[/grey53] -it [dark_green]archivebox/archivebox[/dark_green] [green]\\[command][/green] [green3][...args][/green3] [violet][--help][/violet] [grey53][--version][/grey53]
|
||||
''' if IN_DOCKER else ''
|
||||
DOCKER_DOCS = '\n [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#usage]https://github.com/ArchiveBox/ArchiveBox/wiki/Docker[/link]' if IN_DOCKER else ''
|
||||
DOCKER_OUTSIDE_HINT = "\n [grey53]# outside of Docker:[/grey53]" if IN_DOCKER else ''
|
||||
DOCKER_CMD_PREFIX = "[blue]docker ... [/blue]" if IN_DOCKER else ''
|
||||
"""
|
||||
if IN_DOCKER
|
||||
else ""
|
||||
)
|
||||
DOCKER_DOCS = (
|
||||
"\n [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#usage]https://github.com/ArchiveBox/ArchiveBox/wiki/Docker[/link]"
|
||||
if IN_DOCKER
|
||||
else ""
|
||||
)
|
||||
DOCKER_OUTSIDE_HINT = "\n [grey53]# outside of Docker:[/grey53]" if IN_DOCKER else ""
|
||||
DOCKER_CMD_PREFIX = "[blue]docker ... [/blue]" if IN_DOCKER else ""
|
||||
|
||||
print(f'''{DOCKER_USAGE}
|
||||
print(f"""{DOCKER_USAGE}
|
||||
[deep_sky_blue4]Usage:[/deep_sky_blue4]{DOCKER_OUTSIDE_HINT}
|
||||
[dark_green]archivebox[/dark_green] [green]\\[command][/green] [green3][...args][/green3] [violet][--help][/violet] [grey53][--version][/grey53]
|
||||
|
||||
@@ -54,12 +65,11 @@ def help() -> None:
|
||||
[link=https://github.com/ArchiveBox/ArchiveBox/wiki]https://github.com/ArchiveBox/ArchiveBox/wiki[/link]{DOCKER_DOCS}
|
||||
[link=https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#cli-usage]https://github.com/ArchiveBox/ArchiveBox/wiki/Usage[/link]
|
||||
[link=https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration]https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration[/link]
|
||||
''')
|
||||
|
||||
|
||||
""")
|
||||
|
||||
if os.access(CONSTANTS.ARCHIVE_DIR, os.R_OK) and CONSTANTS.ARCHIVE_DIR.is_dir():
|
||||
pretty_out_dir = str(CONSTANTS.DATA_DIR).replace(str(Path('~').expanduser()), '~')
|
||||
EXAMPLE_USAGE = f'''
|
||||
pretty_out_dir = str(CONSTANTS.DATA_DIR).replace(str(Path("~").expanduser()), "~")
|
||||
EXAMPLE_USAGE = f"""
|
||||
[light_slate_blue]DATA DIR[/light_slate_blue]: [yellow]{pretty_out_dir}[/yellow]
|
||||
|
||||
[violet]Hint:[/violet] [i]Common maintenance tasks:[/i]
|
||||
@@ -73,33 +83,49 @@ def help() -> None:
|
||||
[dark_green]archivebox[/dark_green] [green]list[/green] --sort=timestamp --csv=timestamp,downloaded_at,url,title
|
||||
[dark_green]archivebox[/dark_green] [green]schedule[/green] --every=day --depth=1 "https://example.com/some/feed.rss"
|
||||
[dark_green]archivebox[/dark_green] [green]server[/green] [blue]0.0.0.0:8000[/blue] [grey53]# Start the Web UI / API server[/grey53]
|
||||
'''
|
||||
print(Panel(EXAMPLE_USAGE, expand=False, border_style='grey53', title='[green3]:white_check_mark: A collection [light_slate_blue]DATA DIR[/light_slate_blue] is currently active[/green3]', subtitle='Commands run inside this dir will only apply to this collection.'))
|
||||
"""
|
||||
print(
|
||||
Panel(
|
||||
EXAMPLE_USAGE,
|
||||
expand=False,
|
||||
border_style="grey53",
|
||||
title="[green3]:white_check_mark: A collection [light_slate_blue]DATA DIR[/light_slate_blue] is currently active[/green3]",
|
||||
subtitle="Commands run inside this dir will only apply to this collection.",
|
||||
),
|
||||
)
|
||||
else:
|
||||
DATA_SETUP_HELP = '\n'
|
||||
DATA_SETUP_HELP = "\n"
|
||||
if IN_DOCKER:
|
||||
DATA_SETUP_HELP += '[violet]Hint:[/violet] When using Docker, you need to mount a volume to use as your data dir:\n'
|
||||
DATA_SETUP_HELP += ' docker run [violet]-v /some/path/data:/data[/violet] archivebox/archivebox ...\n\n'
|
||||
DATA_SETUP_HELP += 'To load an [dark_blue]existing[/dark_blue] collection:\n'
|
||||
DATA_SETUP_HELP += ' 1. [green]cd[/green] ~/archivebox/data [grey53]# go into existing [light_slate_blue]DATA DIR[/light_slate_blue] (can be anywhere)[/grey53]\n'
|
||||
DATA_SETUP_HELP += f' 2. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]init[/green] [grey53]# migrate to latest version (safe to run multiple times)[/grey53]\n'
|
||||
DATA_SETUP_HELP += f' 3. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]install[/green] [grey53]# auto-update all plugins (wget, chrome, singlefile, etc.)[/grey53]\n'
|
||||
DATA_SETUP_HELP += f' 4. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]help[/green] [grey53]# ...get help with next steps... [/grey53]\n\n'
|
||||
DATA_SETUP_HELP += 'To start a [sea_green1]new[/sea_green1] collection:\n'
|
||||
DATA_SETUP_HELP += ' 1. [green]mkdir[/green] ~/archivebox/data [grey53]# create a new, empty [light_slate_blue]DATA DIR[/light_slate_blue] (can be anywhere)[/grey53]\n'
|
||||
DATA_SETUP_HELP += ' 2. [green]cd[/green] ~/archivebox/data [grey53]# cd into the new directory[/grey53]\n'
|
||||
DATA_SETUP_HELP += f' 3. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]init[/green] [grey53]# initialize ArchiveBox in the new data dir[/grey53]\n'
|
||||
DATA_SETUP_HELP += f' 4. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]install[/green] [grey53]# auto-install all plugins (wget, chrome, singlefile, etc.)[/grey53]\n'
|
||||
DATA_SETUP_HELP += f' 5. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]help[/green] [grey53]# ... get help with next steps... [/grey53]\n'
|
||||
print(Panel(DATA_SETUP_HELP, expand=False, border_style='grey53', title='[red]:cross_mark: No collection is currently active[/red]', subtitle='All archivebox [green]commands[/green] should be run from inside a collection [light_slate_blue]DATA DIR[/light_slate_blue]'))
|
||||
|
||||
DATA_SETUP_HELP += "[violet]Hint:[/violet] When using Docker, you need to mount a volume to use as your data dir:\n"
|
||||
DATA_SETUP_HELP += " docker run [violet]-v /some/path/data:/data[/violet] archivebox/archivebox ...\n\n"
|
||||
DATA_SETUP_HELP += "To load an [dark_blue]existing[/dark_blue] collection:\n"
|
||||
DATA_SETUP_HELP += " 1. [green]cd[/green] ~/archivebox/data [grey53]# go into existing [light_slate_blue]DATA DIR[/light_slate_blue] (can be anywhere)[/grey53]\n"
|
||||
DATA_SETUP_HELP += f" 2. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]init[/green] [grey53]# migrate to latest version (safe to run multiple times)[/grey53]\n"
|
||||
DATA_SETUP_HELP += f" 3. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]install[/green] [grey53]# auto-update all plugins (wget, chrome, singlefile, etc.)[/grey53]\n"
|
||||
DATA_SETUP_HELP += f" 4. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]help[/green] [grey53]# ...get help with next steps... [/grey53]\n\n"
|
||||
DATA_SETUP_HELP += "To start a [sea_green1]new[/sea_green1] collection:\n"
|
||||
DATA_SETUP_HELP += " 1. [green]mkdir[/green] ~/archivebox/data [grey53]# create a new, empty [light_slate_blue]DATA DIR[/light_slate_blue] (can be anywhere)[/grey53]\n"
|
||||
DATA_SETUP_HELP += " 2. [green]cd[/green] ~/archivebox/data [grey53]# cd into the new directory[/grey53]\n"
|
||||
DATA_SETUP_HELP += f" 3. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]init[/green] [grey53]# initialize ArchiveBox in the new data dir[/grey53]\n"
|
||||
DATA_SETUP_HELP += f" 4. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]install[/green] [grey53]# auto-install all plugins (wget, chrome, singlefile, etc.)[/grey53]\n"
|
||||
DATA_SETUP_HELP += f" 5. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]help[/green] [grey53]# ... get help with next steps... [/grey53]\n"
|
||||
print(
|
||||
Panel(
|
||||
DATA_SETUP_HELP,
|
||||
expand=False,
|
||||
border_style="grey53",
|
||||
title="[red]:cross_mark: No collection is currently active[/red]",
|
||||
subtitle="All archivebox [green]commands[/green] should be run from inside a collection [light_slate_blue]DATA DIR[/light_slate_blue]",
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@click.command()
@click.option("--help", "-h", is_flag=True, help="Show help")
def main(**kwargs):
    """Print the ArchiveBox help message and usage"""
    # kwargs only ever carries the declared --help/-h flag and is never
    # consulted: the help screen is rendered whether or not the flag is given.
    outcome = help()
    return outcome
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -1,11 +1,11 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__package__ = "archivebox.cli"
|
||||
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Mapping
|
||||
from collections.abc import Mapping
|
||||
|
||||
from rich import print
|
||||
import rich_click as click
|
||||
@@ -14,12 +14,12 @@ from archivebox.misc.util import docstring, enforce_types
|
||||
|
||||
|
||||
def _normalize_snapshot_record(link_dict: Mapping[str, object]) -> tuple[str, dict[str, object]] | None:
|
||||
url = link_dict.get('url')
|
||||
url = link_dict.get("url")
|
||||
if not isinstance(url, str) or not url:
|
||||
return None
|
||||
|
||||
record: dict[str, object] = {'url': url}
|
||||
for key in ('timestamp', 'title', 'tags', 'sources'):
|
||||
record: dict[str, object] = {"url": url}
|
||||
for key in ("timestamp", "title", "tags", "sources"):
|
||||
value = link_dict.get(key)
|
||||
if value is not None:
|
||||
record[key] = value
|
||||
@@ -27,15 +27,15 @@ def _normalize_snapshot_record(link_dict: Mapping[str, object]) -> tuple[str, di
|
||||
|
||||
|
||||
@enforce_types
|
||||
def init(force: bool=False, quick: bool=False, install: bool=False) -> None:
|
||||
def init(force: bool = False, quick: bool = False, install: bool = False) -> None:
|
||||
"""Initialize a new ArchiveBox collection in the current directory"""
|
||||
|
||||
|
||||
from archivebox.config import CONSTANTS, VERSION, DATA_DIR
|
||||
from archivebox.config.common import SERVER_CONFIG
|
||||
from archivebox.config.collection import write_config_file
|
||||
from archivebox.misc.legacy import parse_json_main_index, parse_json_links_details
|
||||
from archivebox.misc.db import apply_migrations
|
||||
|
||||
|
||||
# if os.access(out_dir / CONSTANTS.JSON_INDEX_FILENAME, os.F_OK):
|
||||
# print("[red]:warning: This folder contains a JSON index. It is deprecated, and will no longer be kept up to date automatically.[/red]", file=sys.stderr)
|
||||
# print("[red] You can run `archivebox list --json --with-headers > static_index.json` to manually generate it.[/red]", file=sys.stderr)
|
||||
@@ -43,69 +43,71 @@ def init(force: bool=False, quick: bool=False, install: bool=False) -> None:
|
||||
is_empty = not len(set(os.listdir(DATA_DIR)) - CONSTANTS.ALLOWED_IN_DATA_DIR)
|
||||
existing_index = os.path.isfile(CONSTANTS.DATABASE_FILE)
|
||||
if is_empty and not existing_index:
|
||||
print(f'[turquoise4][+] Initializing a new ArchiveBox v{VERSION} collection...[/turquoise4]')
|
||||
print('[green]----------------------------------------------------------------------[/green]')
|
||||
print(f"[turquoise4][+] Initializing a new ArchiveBox v{VERSION} collection...[/turquoise4]")
|
||||
print("[green]----------------------------------------------------------------------[/green]")
|
||||
elif existing_index:
|
||||
# TODO: properly detect and print the existing version in current index as well
|
||||
print(f'[green][*] Verifying and updating existing ArchiveBox collection to v{VERSION}...[/green]')
|
||||
print('[green]----------------------------------------------------------------------[/green]')
|
||||
print(f"[green][*] Verifying and updating existing ArchiveBox collection to v{VERSION}...[/green]")
|
||||
print("[green]----------------------------------------------------------------------[/green]")
|
||||
else:
|
||||
if force:
|
||||
print('[red][!] This folder appears to already have files in it, but no index.sqlite3 is present.[/red]')
|
||||
print('[red] Because --force was passed, ArchiveBox will initialize anyway (which may overwrite existing files).[/red]')
|
||||
print("[red][!] This folder appears to already have files in it, but no index.sqlite3 is present.[/red]")
|
||||
print("[red] Because --force was passed, ArchiveBox will initialize anyway (which may overwrite existing files).[/red]")
|
||||
else:
|
||||
print(
|
||||
("[red][X] This folder appears to already have files in it, but no index.sqlite3 present.[/red]\n\n"
|
||||
"[red][X] This folder appears to already have files in it, but no index.sqlite3 present.[/red]\n\n"
|
||||
" You must run init in a completely empty directory, or an existing data folder.\n\n"
|
||||
" [violet]Hint:[/violet] To import an existing data folder make sure to cd into the folder first, \n"
|
||||
" then run and run 'archivebox init' to pick up where you left off.\n\n"
|
||||
" (Always make sure your data folder is backed up first before updating ArchiveBox)"
|
||||
)
|
||||
" (Always make sure your data folder is backed up first before updating ArchiveBox)",
|
||||
)
|
||||
raise SystemExit(2)
|
||||
|
||||
if existing_index:
|
||||
print('\n[green][*] Verifying archive folder structure...[/green]')
|
||||
print("\n[green][*] Verifying archive folder structure...[/green]")
|
||||
else:
|
||||
print('\n[green][+] Building archive folder structure...[/green]')
|
||||
|
||||
print(f' + ./{CONSTANTS.ARCHIVE_DIR.relative_to(DATA_DIR)}, ./{CONSTANTS.SOURCES_DIR.relative_to(DATA_DIR)}, ./{CONSTANTS.LOGS_DIR.relative_to(DATA_DIR)}...')
|
||||
print("\n[green][+] Building archive folder structure...[/green]")
|
||||
|
||||
print(
|
||||
f" + ./{CONSTANTS.ARCHIVE_DIR.relative_to(DATA_DIR)}, ./{CONSTANTS.SOURCES_DIR.relative_to(DATA_DIR)}, ./{CONSTANTS.LOGS_DIR.relative_to(DATA_DIR)}...",
|
||||
)
|
||||
Path(CONSTANTS.SOURCES_DIR).mkdir(exist_ok=True)
|
||||
Path(CONSTANTS.ARCHIVE_DIR).mkdir(exist_ok=True)
|
||||
Path(CONSTANTS.LOGS_DIR).mkdir(exist_ok=True)
|
||||
|
||||
print(f' + ./{CONSTANTS.CONFIG_FILE.relative_to(DATA_DIR)}...')
|
||||
|
||||
|
||||
print(f" + ./{CONSTANTS.CONFIG_FILE.relative_to(DATA_DIR)}...")
|
||||
|
||||
# create the .archivebox_id file with a unique ID for this collection
|
||||
from archivebox.config.paths import _get_collection_id
|
||||
_get_collection_id(DATA_DIR, force_create=True)
|
||||
|
||||
# create the ArchiveBox.conf file
|
||||
write_config_file({'SECRET_KEY': SERVER_CONFIG.SECRET_KEY})
|
||||
|
||||
_get_collection_id(DATA_DIR, force_create=True)
|
||||
|
||||
# create the ArchiveBox.conf file
|
||||
write_config_file({"SECRET_KEY": SERVER_CONFIG.SECRET_KEY})
|
||||
|
||||
if os.access(CONSTANTS.DATABASE_FILE, os.F_OK):
|
||||
print('\n[green][*] Verifying main SQL index and running any migrations needed...[/green]')
|
||||
print("\n[green][*] Verifying main SQL index and running any migrations needed...[/green]")
|
||||
else:
|
||||
print('\n[green][+] Building main SQL index and running initial migrations...[/green]')
|
||||
|
||||
print("\n[green][+] Building main SQL index and running initial migrations...[/green]")
|
||||
|
||||
from archivebox.config.django import setup_django
|
||||
|
||||
setup_django()
|
||||
|
||||
|
||||
for migration_line in apply_migrations(DATA_DIR):
|
||||
sys.stdout.write(f' {migration_line}\n')
|
||||
sys.stdout.write(f" {migration_line}\n")
|
||||
|
||||
assert os.path.isfile(CONSTANTS.DATABASE_FILE) and os.access(CONSTANTS.DATABASE_FILE, os.R_OK)
|
||||
print()
|
||||
print(f' √ ./{CONSTANTS.DATABASE_FILE.relative_to(DATA_DIR)}')
|
||||
|
||||
print(f" √ ./{CONSTANTS.DATABASE_FILE.relative_to(DATA_DIR)}")
|
||||
|
||||
# from django.contrib.auth.models import User
|
||||
# if SHELL_CONFIG.IS_TTY and not User.objects.filter(is_superuser=True).exclude(username='system').exists():
|
||||
# print('{green}[+] Creating admin user account...{reset}'.format(**SHELL_CONFIG.ANSI))
|
||||
# call_command("createsuperuser", interactive=True)
|
||||
|
||||
print()
|
||||
print('[dodger_blue3][*] Checking links from indexes and archive folders (safe to Ctrl+C)...[/dodger_blue3]')
|
||||
print("[dodger_blue3][*] Checking links from indexes and archive folders (safe to Ctrl+C)...[/dodger_blue3]")
|
||||
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
@@ -114,10 +116,10 @@ def init(force: bool=False, quick: bool=False, install: bool=False) -> None:
|
||||
|
||||
if existing_index:
|
||||
all_links = Snapshot.objects.all()
|
||||
print(f' √ Loaded {all_links.count()} links from existing main index.')
|
||||
print(f" √ Loaded {all_links.count()} links from existing main index.")
|
||||
|
||||
if quick:
|
||||
print(' > Skipping orphan snapshot import (quick mode)')
|
||||
print(" > Skipping orphan snapshot import (quick mode)")
|
||||
else:
|
||||
try:
|
||||
# Import orphaned links from legacy JSON indexes
|
||||
@@ -131,7 +133,7 @@ def init(force: bool=False, quick: bool=False, install: bool=False) -> None:
|
||||
orphaned_json_links[url] = record
|
||||
if orphaned_json_links:
|
||||
pending_links.update(orphaned_json_links)
|
||||
print(f' [yellow]√ Added {len(orphaned_json_links)} orphaned links from existing JSON index...[/yellow]')
|
||||
print(f" [yellow]√ Added {len(orphaned_json_links)} orphaned links from existing JSON index...[/yellow]")
|
||||
|
||||
orphaned_data_dir_links: dict[str, dict[str, object]] = {}
|
||||
for link_dict in parse_json_links_details(DATA_DIR):
|
||||
@@ -143,7 +145,7 @@ def init(force: bool=False, quick: bool=False, install: bool=False) -> None:
|
||||
orphaned_data_dir_links[url] = record
|
||||
if orphaned_data_dir_links:
|
||||
pending_links.update(orphaned_data_dir_links)
|
||||
print(f' [yellow]√ Added {len(orphaned_data_dir_links)} orphaned links from existing archive directories.[/yellow]')
|
||||
print(f" [yellow]√ Added {len(orphaned_data_dir_links)} orphaned links from existing archive directories.[/yellow]")
|
||||
|
||||
if pending_links:
|
||||
for link_dict in pending_links.values():
|
||||
@@ -151,42 +153,44 @@ def init(force: bool=False, quick: bool=False, install: bool=False) -> None:
|
||||
|
||||
# Hint for orphaned snapshot directories
|
||||
print()
|
||||
print(' [violet]Hint:[/violet] To import orphaned snapshot directories and reconcile filesystem state, run:')
|
||||
print(' archivebox update')
|
||||
print(" [violet]Hint:[/violet] To import orphaned snapshot directories and reconcile filesystem state, run:")
|
||||
print(" archivebox update")
|
||||
|
||||
except (KeyboardInterrupt, SystemExit):
|
||||
print(file=sys.stderr)
|
||||
print('[yellow]:stop_sign: Stopped checking archive directories due to Ctrl-C/SIGTERM[/yellow]', file=sys.stderr)
|
||||
print(' Your archive data is safe, but you should re-run `archivebox init` to finish the process later.', file=sys.stderr)
|
||||
print("[yellow]:stop_sign: Stopped checking archive directories due to Ctrl-C/SIGTERM[/yellow]", file=sys.stderr)
|
||||
print(" Your archive data is safe, but you should re-run `archivebox init` to finish the process later.", file=sys.stderr)
|
||||
print(file=sys.stderr)
|
||||
print(' [violet]Hint:[/violet] In the future you can run a quick init without checking dirs like so:', file=sys.stderr)
|
||||
print(' archivebox init --quick', file=sys.stderr)
|
||||
print(" [violet]Hint:[/violet] In the future you can run a quick init without checking dirs like so:", file=sys.stderr)
|
||||
print(" archivebox init --quick", file=sys.stderr)
|
||||
raise SystemExit(1)
|
||||
|
||||
print('\n[green]----------------------------------------------------------------------[/green]')
|
||||
print("\n[green]----------------------------------------------------------------------[/green]")
|
||||
|
||||
from django.contrib.auth.models import User
|
||||
|
||||
if (SERVER_CONFIG.ADMIN_USERNAME and SERVER_CONFIG.ADMIN_PASSWORD) and not User.objects.filter(username=SERVER_CONFIG.ADMIN_USERNAME).exists():
|
||||
print('[green][+] Found ADMIN_USERNAME and ADMIN_PASSWORD configuration options, creating new admin user.[/green]')
|
||||
if (SERVER_CONFIG.ADMIN_USERNAME and SERVER_CONFIG.ADMIN_PASSWORD) and not User.objects.filter(
|
||||
username=SERVER_CONFIG.ADMIN_USERNAME,
|
||||
).exists():
|
||||
print("[green][+] Found ADMIN_USERNAME and ADMIN_PASSWORD configuration options, creating new admin user.[/green]")
|
||||
User.objects.create_superuser(username=SERVER_CONFIG.ADMIN_USERNAME, password=SERVER_CONFIG.ADMIN_PASSWORD)
|
||||
|
||||
if existing_index:
|
||||
print('[green][√] Done. Verified and updated the existing ArchiveBox collection.[/green]')
|
||||
print("[green][√] Done. Verified and updated the existing ArchiveBox collection.[/green]")
|
||||
else:
|
||||
print(f'[green][√] Done. A new ArchiveBox collection was initialized ({len(all_links) + len(pending_links)} links).[/green]')
|
||||
print(f"[green][√] Done. A new ArchiveBox collection was initialized ({len(all_links) + len(pending_links)} links).[/green]")
|
||||
|
||||
|
||||
CONSTANTS.PERSONAS_DIR.mkdir(parents=True, exist_ok=True)
|
||||
CONSTANTS.DEFAULT_TMP_DIR.mkdir(parents=True, exist_ok=True)
|
||||
CONSTANTS.DEFAULT_LIB_DIR.mkdir(parents=True, exist_ok=True)
|
||||
(CONSTANTS.DEFAULT_LIB_DIR / 'bin').mkdir(parents=True, exist_ok=True)
|
||||
(CONSTANTS.DEFAULT_LIB_DIR / "bin").mkdir(parents=True, exist_ok=True)
|
||||
|
||||
from archivebox.config.common import STORAGE_CONFIG
|
||||
from archivebox.config.paths import get_or_create_working_tmp_dir, get_or_create_working_lib_dir
|
||||
|
||||
STORAGE_CONFIG.TMP_DIR.mkdir(parents=True, exist_ok=True)
|
||||
STORAGE_CONFIG.LIB_DIR.mkdir(parents=True, exist_ok=True)
|
||||
(STORAGE_CONFIG.LIB_DIR / 'bin').mkdir(parents=True, exist_ok=True)
|
||||
(STORAGE_CONFIG.LIB_DIR / "bin").mkdir(parents=True, exist_ok=True)
|
||||
|
||||
working_tmp_dir = get_or_create_working_tmp_dir(autofix=True, quiet=True)
|
||||
if working_tmp_dir:
|
||||
@@ -195,33 +199,35 @@ def init(force: bool=False, quick: bool=False, install: bool=False) -> None:
|
||||
working_lib_dir = get_or_create_working_lib_dir(autofix=True, quiet=True)
|
||||
if working_lib_dir:
|
||||
working_lib_dir.mkdir(parents=True, exist_ok=True)
|
||||
(working_lib_dir / 'bin').mkdir(parents=True, exist_ok=True)
|
||||
|
||||
(working_lib_dir / "bin").mkdir(parents=True, exist_ok=True)
|
||||
|
||||
if install:
|
||||
from archivebox.cli.archivebox_install import install as install_method
|
||||
|
||||
install_method()
|
||||
|
||||
if Snapshot.objects.count() < 25: # hide the hints for experienced users
|
||||
if Snapshot.objects.count() < 25: # hide the hints for experienced users
|
||||
print()
|
||||
print(' [violet]Hint:[/violet] To view your archive index, run:')
|
||||
print(' archivebox server # then visit [deep_sky_blue4][link=http://127.0.0.1:8000]http://127.0.0.1:8000[/link][/deep_sky_blue4]')
|
||||
print(" [violet]Hint:[/violet] To view your archive index, run:")
|
||||
print(
|
||||
" archivebox server # then visit [deep_sky_blue4][link=http://127.0.0.1:8000]http://127.0.0.1:8000[/link][/deep_sky_blue4]",
|
||||
)
|
||||
print()
|
||||
print(' To add new links, you can run:')
|
||||
print(" To add new links, you can run:")
|
||||
print(" archivebox add < ~/some/path/to/list_of_links.txt")
|
||||
print()
|
||||
print(' For more usage and examples, run:')
|
||||
print(' archivebox help')
|
||||
|
||||
print(" For more usage and examples, run:")
|
||||
print(" archivebox help")
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option('--force', '-f', is_flag=True, help='Ignore unrecognized files in current directory and initialize anyway')
|
||||
@click.option('--quick', '-q', is_flag=True, help='Run any updates or migrations without rechecking all snapshot dirs')
|
||||
@click.option('--install', '-s', is_flag=True, help='Automatically install dependencies and extras used for archiving')
|
||||
@click.option("--force", "-f", is_flag=True, help="Ignore unrecognized files in current directory and initialize anyway")
|
||||
@click.option("--quick", "-q", is_flag=True, help="Run any updates or migrations without rechecking all snapshot dirs")
|
||||
@click.option("--install", "-s", is_flag=True, help="Automatically install dependencies and extras used for archiving")
|
||||
@docstring(init.__doc__)
|
||||
def main(**kwargs) -> None:
|
||||
init(**kwargs)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__package__ = "archivebox.cli"
|
||||
|
||||
import os
|
||||
|
||||
@@ -11,7 +11,7 @@ from archivebox.misc.util import docstring, enforce_types
|
||||
|
||||
|
||||
@enforce_types
|
||||
def install(binaries: tuple[str, ...] = (), binproviders: str = '*', dry_run: bool = False) -> None:
|
||||
def install(binaries: tuple[str, ...] = (), binproviders: str = "*", dry_run: bool = False) -> None:
|
||||
"""Detect and install ArchiveBox dependencies by running the abx-dl install flow
|
||||
|
||||
Examples:
|
||||
@@ -31,33 +31,34 @@ def install(binaries: tuple[str, ...] = (), binproviders: str = '*', dry_run: bo
|
||||
|
||||
# Show what we're installing
|
||||
if binaries:
|
||||
print(f'\n[green][+] Installing specific binaries: {", ".join(binaries)}[/green]')
|
||||
print(f"\n[green][+] Installing specific binaries: {', '.join(binaries)}[/green]")
|
||||
else:
|
||||
print('\n[green][+] Detecting and installing all ArchiveBox dependencies...[/green]')
|
||||
print("\n[green][+] Detecting and installing all ArchiveBox dependencies...[/green]")
|
||||
|
||||
if binproviders != '*':
|
||||
print(f'[green][+] Using providers: {binproviders}[/green]')
|
||||
if binproviders != "*":
|
||||
print(f"[green][+] Using providers: {binproviders}[/green]")
|
||||
|
||||
if IS_ROOT:
|
||||
EUID = os.geteuid()
|
||||
print()
|
||||
print(f'[yellow]:warning: Running as UID=[blue]{EUID}[/blue].[/yellow]')
|
||||
print(f' DATA_DIR will be owned by [blue]{ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/blue].')
|
||||
print(f"[yellow]:warning: Running as UID=[blue]{EUID}[/blue].[/yellow]")
|
||||
print(f" DATA_DIR will be owned by [blue]{ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/blue].")
|
||||
print()
|
||||
|
||||
if dry_run:
|
||||
print('[dim]Dry run - would run the abx-dl install flow[/dim]')
|
||||
print("[dim]Dry run - would run the abx-dl install flow[/dim]")
|
||||
return
|
||||
|
||||
# Set up Django
|
||||
from archivebox.config.django import setup_django
|
||||
|
||||
setup_django()
|
||||
|
||||
plugin_names = list(binaries)
|
||||
if binproviders != '*':
|
||||
plugin_names.extend(provider.strip() for provider in binproviders.split(',') if provider.strip())
|
||||
if binproviders != "*":
|
||||
plugin_names.extend(provider.strip() for provider in binproviders.split(",") if provider.strip())
|
||||
|
||||
print('[+] Running installer via abx-dl bus...')
|
||||
print("[+] Running installer via abx-dl bus...")
|
||||
print()
|
||||
|
||||
from archivebox.services.runner import run_install
|
||||
@@ -68,28 +69,36 @@ def install(binaries: tuple[str, ...] = (), binproviders: str = '*', dry_run: bo
|
||||
|
||||
# Check for superuser
|
||||
from django.contrib.auth import get_user_model
|
||||
|
||||
User = get_user_model()
|
||||
|
||||
if not User.objects.filter(is_superuser=True).exclude(username='system').exists():
|
||||
stderr('\n[+] Don\'t forget to create a new admin user for the Web UI...', color='green')
|
||||
stderr(' archivebox manage createsuperuser')
|
||||
if not User.objects.filter(is_superuser=True).exclude(username="system").exists():
|
||||
stderr("\n[+] Don't forget to create a new admin user for the Web UI...", color="green")
|
||||
stderr(" archivebox manage createsuperuser")
|
||||
|
||||
print()
|
||||
|
||||
# Show version to display full status including installed binaries
|
||||
# Django is already loaded, so just import and call the function directly
|
||||
from archivebox.cli.archivebox_version import version as show_version
|
||||
|
||||
show_version(quiet=False)
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.argument('binaries', nargs=-1, type=str, required=False)
|
||||
@click.option('--binproviders', '-p', default='*', help='Comma-separated list of providers to use (pip,npm,brew,apt,env,custom) or * for all', show_default=True)
|
||||
@click.option('--dry-run', '-d', is_flag=True, help='Show what would happen without actually running', default=False)
|
||||
@click.argument("binaries", nargs=-1, type=str, required=False)
|
||||
@click.option(
|
||||
"--binproviders",
|
||||
"-p",
|
||||
default="*",
|
||||
help="Comma-separated list of providers to use (pip,npm,brew,apt,env,custom) or * for all",
|
||||
show_default=True,
|
||||
)
|
||||
@click.option("--dry-run", "-d", is_flag=True, help="Show what would happen without actually running", default=False)
|
||||
@docstring(install.__doc__)
|
||||
def main(**kwargs) -> None:
|
||||
install(**kwargs)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -1,10 +1,9 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox list'
|
||||
__package__ = "archivebox.cli"
|
||||
__command__ = "archivebox list"
|
||||
|
||||
import sys
|
||||
from typing import Optional
|
||||
|
||||
import rich_click as click
|
||||
|
||||
@@ -12,31 +11,47 @@ from archivebox.cli.archivebox_snapshot import list_snapshots
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option('--status', '-s', help='Filter by status (queued, started, sealed)')
|
||||
@click.option('--url__icontains', help='Filter by URL contains')
|
||||
@click.option('--url__istartswith', help='Filter by URL starts with')
|
||||
@click.option('--tag', '-t', help='Filter by tag name')
|
||||
@click.option('--crawl-id', help='Filter by crawl ID')
|
||||
@click.option('--limit', '-n', type=int, help='Limit number of results')
|
||||
@click.option('--sort', '-o', type=str, help='Field to sort by, e.g. url, created_at, bookmarked_at, downloaded_at')
|
||||
@click.option('--csv', '-C', type=str, help='Print output as CSV with the provided fields, e.g.: timestamp,url,title')
|
||||
@click.option('--with-headers', is_flag=True, help='Include column headers in structured output')
|
||||
def main(status: Optional[str], url__icontains: Optional[str], url__istartswith: Optional[str],
|
||||
tag: Optional[str], crawl_id: Optional[str], limit: Optional[int],
|
||||
sort: Optional[str], csv: Optional[str], with_headers: bool) -> None:
|
||||
@click.option("--status", "-s", help="Filter by status (queued, started, sealed)")
|
||||
@click.option("--url__icontains", help="Filter by URL contains")
|
||||
@click.option("--url__istartswith", help="Filter by URL starts with")
|
||||
@click.option("--tag", "-t", help="Filter by tag name")
|
||||
@click.option("--crawl-id", help="Filter by crawl ID")
|
||||
@click.option("--limit", "-n", type=int, help="Limit number of results")
|
||||
@click.option("--sort", "-o", type=str, help="Field to sort by, e.g. url, created_at, bookmarked_at, downloaded_at")
|
||||
@click.option("--csv", "-C", type=str, help="Print output as CSV with the provided fields, e.g.: timestamp,url,title")
|
||||
@click.option("--with-headers", is_flag=True, help="Include column headers in structured output")
|
||||
@click.option("--search", type=click.Choice(["meta", "content", "contents", "deep"]), help="Search mode to use for the query")
|
||||
@click.argument("query", nargs=-1)
|
||||
def main(
|
||||
status: str | None,
|
||||
url__icontains: str | None,
|
||||
url__istartswith: str | None,
|
||||
tag: str | None,
|
||||
crawl_id: str | None,
|
||||
limit: int | None,
|
||||
sort: str | None,
|
||||
csv: str | None,
|
||||
with_headers: bool,
|
||||
search: str | None,
|
||||
query: tuple[str, ...],
|
||||
) -> None:
|
||||
"""List Snapshots."""
|
||||
sys.exit(list_snapshots(
|
||||
status=status,
|
||||
url__icontains=url__icontains,
|
||||
url__istartswith=url__istartswith,
|
||||
tag=tag,
|
||||
crawl_id=crawl_id,
|
||||
limit=limit,
|
||||
sort=sort,
|
||||
csv=csv,
|
||||
with_headers=with_headers,
|
||||
))
|
||||
sys.exit(
|
||||
list_snapshots(
|
||||
status=status,
|
||||
url__icontains=url__icontains,
|
||||
url__istartswith=url__istartswith,
|
||||
tag=tag,
|
||||
crawl_id=crawl_id,
|
||||
limit=limit,
|
||||
sort=sort,
|
||||
csv=csv,
|
||||
with_headers=with_headers,
|
||||
search=search,
|
||||
query=" ".join(query),
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -19,11 +19,10 @@ Examples:
|
||||
archivebox machine list --hostname__icontains=myserver
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox machine'
|
||||
__package__ = "archivebox.cli"
|
||||
__command__ = "archivebox machine"
|
||||
|
||||
import sys
|
||||
from typing import Optional
|
||||
|
||||
import rich_click as click
|
||||
from rich import print as rprint
|
||||
@@ -35,10 +34,11 @@ from archivebox.cli.cli_utils import apply_filters
|
||||
# LIST
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def list_machines(
|
||||
hostname__icontains: Optional[str] = None,
|
||||
os_platform: Optional[str] = None,
|
||||
limit: Optional[int] = None,
|
||||
hostname__icontains: str | None = None,
|
||||
os_platform: str | None = None,
|
||||
limit: int | None = None,
|
||||
) -> int:
|
||||
"""
|
||||
List Machines as JSONL with optional filters.
|
||||
@@ -51,24 +51,24 @@ def list_machines(
|
||||
|
||||
is_tty = sys.stdout.isatty()
|
||||
|
||||
queryset = Machine.objects.all().order_by('-created_at')
|
||||
queryset = Machine.objects.all().order_by("-created_at")
|
||||
|
||||
# Apply filters
|
||||
filter_kwargs = {
|
||||
'hostname__icontains': hostname__icontains,
|
||||
'os_platform': os_platform,
|
||||
"hostname__icontains": hostname__icontains,
|
||||
"os_platform": os_platform,
|
||||
}
|
||||
queryset = apply_filters(queryset, filter_kwargs, limit=limit)
|
||||
|
||||
count = 0
|
||||
for machine in queryset:
|
||||
if is_tty:
|
||||
rprint(f'[cyan]{machine.hostname:30}[/cyan] [dim]{machine.os_platform:10}[/dim] {machine.id}')
|
||||
rprint(f"[cyan]{machine.hostname:30}[/cyan] [dim]{machine.os_platform:10}[/dim] {machine.id}")
|
||||
else:
|
||||
write_record(machine.to_json())
|
||||
count += 1
|
||||
|
||||
rprint(f'[dim]Listed {count} machines[/dim]', file=sys.stderr)
|
||||
rprint(f"[dim]Listed {count} machines[/dim]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -76,24 +76,27 @@ def list_machines(
|
||||
# CLI Commands
|
||||
# =============================================================================
|
||||
|
||||
|
||||
@click.group()
|
||||
def main():
|
||||
"""Manage Machine records (read-only, system-managed)."""
|
||||
pass
|
||||
|
||||
|
||||
@main.command('list')
|
||||
@click.option('--hostname__icontains', help='Filter by hostname contains')
|
||||
@click.option('--os-platform', help='Filter by OS platform')
|
||||
@click.option('--limit', '-n', type=int, help='Limit number of results')
|
||||
def list_cmd(hostname__icontains: Optional[str], os_platform: Optional[str], limit: Optional[int]):
|
||||
@main.command("list")
|
||||
@click.option("--hostname__icontains", help="Filter by hostname contains")
|
||||
@click.option("--os-platform", help="Filter by OS platform")
|
||||
@click.option("--limit", "-n", type=int, help="Limit number of results")
|
||||
def list_cmd(hostname__icontains: str | None, os_platform: str | None, limit: int | None):
|
||||
"""List Machines as JSONL."""
|
||||
sys.exit(list_machines(
|
||||
hostname__icontains=hostname__icontains,
|
||||
os_platform=os_platform,
|
||||
limit=limit,
|
||||
))
|
||||
sys.exit(
|
||||
list_machines(
|
||||
hostname__icontains=hostname__icontains,
|
||||
os_platform=os_platform,
|
||||
limit=limit,
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -1,33 +1,34 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__package__ = "archivebox.cli"
|
||||
|
||||
import rich_click as click
|
||||
from archivebox.misc.util import docstring, enforce_types
|
||||
|
||||
|
||||
@enforce_types
|
||||
def manage(args: list[str] | None=None) -> None:
|
||||
def manage(args: list[str] | None = None) -> None:
|
||||
"""Run an ArchiveBox Django management command"""
|
||||
|
||||
from archivebox.config.common import SHELL_CONFIG
|
||||
from archivebox.misc.logging import stderr
|
||||
|
||||
if (args and "createsuperuser" in args) and (SHELL_CONFIG.IN_DOCKER and not SHELL_CONFIG.IS_TTY):
|
||||
stderr('[!] Warning: you need to pass -it to use interactive commands in docker', color='lightyellow')
|
||||
stderr(' docker run -it archivebox manage {}'.format(' '.join(args or ['...'])), color='lightyellow')
|
||||
stderr('')
|
||||
stderr("[!] Warning: you need to pass -it to use interactive commands in docker", color="lightyellow")
|
||||
stderr(" docker run -it archivebox manage {}".format(" ".join(args or ["..."])), color="lightyellow")
|
||||
stderr("")
|
||||
|
||||
from django.core.management import execute_from_command_line
|
||||
execute_from_command_line(['manage.py', *(args or ['help'])])
|
||||
|
||||
execute_from_command_line(["manage.py", *(args or ["help"])])
|
||||
|
||||
|
||||
@click.command(add_help_option=False, context_settings=dict(ignore_unknown_options=True))
|
||||
@click.argument('args', nargs=-1)
|
||||
@click.argument("args", nargs=-1)
|
||||
@docstring(manage.__doc__)
|
||||
def main(args: list[str] | None=None) -> None:
|
||||
def main(args: list[str] | None = None) -> None:
|
||||
manage(args=args)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -6,8 +6,8 @@ Start the Model Context Protocol (MCP) server in stdio mode.
|
||||
Exposes all ArchiveBox CLI commands as MCP tools for AI agents.
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox mcp'
|
||||
__package__ = "archivebox.cli"
|
||||
__command__ = "archivebox mcp"
|
||||
|
||||
import rich_click as click
|
||||
|
||||
@@ -45,5 +45,5 @@ def main(**kwargs):
|
||||
mcp()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -24,8 +24,8 @@ Examples:
|
||||
archivebox persona list --name=old | archivebox persona delete --yes
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox persona'
|
||||
__package__ = "archivebox.cli"
|
||||
__command__ = "archivebox persona"
|
||||
|
||||
import os
|
||||
import sys
|
||||
@@ -35,7 +35,7 @@ import subprocess
|
||||
import tempfile
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Optional, Iterable
|
||||
from collections.abc import Iterable
|
||||
from collections import OrderedDict
|
||||
|
||||
import rich_click as click
|
||||
@@ -49,134 +49,145 @@ from archivebox.personas import importers as persona_importers
|
||||
# Browser Profile Locations
|
||||
# =============================================================================
|
||||
|
||||
def get_chrome_user_data_dir() -> Optional[Path]:
|
||||
|
||||
def get_chrome_user_data_dir() -> Path | None:
|
||||
"""Get the default Chrome user data directory for the current platform."""
|
||||
system = platform.system()
|
||||
home = Path.home()
|
||||
|
||||
if system == 'Darwin': # macOS
|
||||
if system == "Darwin": # macOS
|
||||
candidates = [
|
||||
home / 'Library' / 'Application Support' / 'Google' / 'Chrome',
|
||||
home / 'Library' / 'Application Support' / 'Chromium',
|
||||
home / "Library" / "Application Support" / "Google" / "Chrome",
|
||||
home / "Library" / "Application Support" / "Chromium",
|
||||
]
|
||||
elif system == 'Linux':
|
||||
elif system == "Linux":
|
||||
candidates = [
|
||||
home / '.config' / 'google-chrome',
|
||||
home / '.config' / 'chromium',
|
||||
home / '.config' / 'chrome',
|
||||
home / 'snap' / 'chromium' / 'common' / 'chromium',
|
||||
home / ".config" / "google-chrome",
|
||||
home / ".config" / "chromium",
|
||||
home / ".config" / "chrome",
|
||||
home / "snap" / "chromium" / "common" / "chromium",
|
||||
]
|
||||
elif system == 'Windows':
|
||||
local_app_data = Path(os.environ.get('LOCALAPPDATA', home / 'AppData' / 'Local'))
|
||||
elif system == "Windows":
|
||||
local_app_data = Path(os.environ.get("LOCALAPPDATA", home / "AppData" / "Local"))
|
||||
candidates = [
|
||||
local_app_data / 'Google' / 'Chrome' / 'User Data',
|
||||
local_app_data / 'Chromium' / 'User Data',
|
||||
local_app_data / "Google" / "Chrome" / "User Data",
|
||||
local_app_data / "Chromium" / "User Data",
|
||||
]
|
||||
else:
|
||||
candidates = []
|
||||
|
||||
for candidate in candidates:
|
||||
if candidate.exists() and (candidate / 'Default').exists():
|
||||
if candidate.exists() and (candidate / "Default").exists():
|
||||
return candidate
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def get_brave_user_data_dir() -> Optional[Path]:
|
||||
def get_brave_user_data_dir() -> Path | None:
|
||||
"""Get the default Brave user data directory for the current platform."""
|
||||
system = platform.system()
|
||||
home = Path.home()
|
||||
|
||||
if system == 'Darwin':
|
||||
if system == "Darwin":
|
||||
candidates = [
|
||||
home / 'Library' / 'Application Support' / 'BraveSoftware' / 'Brave-Browser',
|
||||
home / "Library" / "Application Support" / "BraveSoftware" / "Brave-Browser",
|
||||
]
|
||||
elif system == 'Linux':
|
||||
elif system == "Linux":
|
||||
candidates = [
|
||||
home / '.config' / 'BraveSoftware' / 'Brave-Browser',
|
||||
home / ".config" / "BraveSoftware" / "Brave-Browser",
|
||||
]
|
||||
elif system == 'Windows':
|
||||
local_app_data = Path(os.environ.get('LOCALAPPDATA', home / 'AppData' / 'Local'))
|
||||
elif system == "Windows":
|
||||
local_app_data = Path(os.environ.get("LOCALAPPDATA", home / "AppData" / "Local"))
|
||||
candidates = [
|
||||
local_app_data / 'BraveSoftware' / 'Brave-Browser' / 'User Data',
|
||||
local_app_data / "BraveSoftware" / "Brave-Browser" / "User Data",
|
||||
]
|
||||
else:
|
||||
candidates = []
|
||||
|
||||
for candidate in candidates:
|
||||
if candidate.exists() and (candidate / 'Default').exists():
|
||||
if candidate.exists() and (candidate / "Default").exists():
|
||||
return candidate
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def get_edge_user_data_dir() -> Optional[Path]:
|
||||
def get_edge_user_data_dir() -> Path | None:
|
||||
"""Get the default Edge user data directory for the current platform."""
|
||||
system = platform.system()
|
||||
home = Path.home()
|
||||
|
||||
if system == 'Darwin':
|
||||
if system == "Darwin":
|
||||
candidates = [
|
||||
home / 'Library' / 'Application Support' / 'Microsoft Edge',
|
||||
home / "Library" / "Application Support" / "Microsoft Edge",
|
||||
]
|
||||
elif system == 'Linux':
|
||||
elif system == "Linux":
|
||||
candidates = [
|
||||
home / '.config' / 'microsoft-edge',
|
||||
home / '.config' / 'microsoft-edge-beta',
|
||||
home / '.config' / 'microsoft-edge-dev',
|
||||
home / ".config" / "microsoft-edge",
|
||||
home / ".config" / "microsoft-edge-beta",
|
||||
home / ".config" / "microsoft-edge-dev",
|
||||
]
|
||||
elif system == 'Windows':
|
||||
local_app_data = Path(os.environ.get('LOCALAPPDATA', home / 'AppData' / 'Local'))
|
||||
elif system == "Windows":
|
||||
local_app_data = Path(os.environ.get("LOCALAPPDATA", home / "AppData" / "Local"))
|
||||
candidates = [
|
||||
local_app_data / 'Microsoft' / 'Edge' / 'User Data',
|
||||
local_app_data / "Microsoft" / "Edge" / "User Data",
|
||||
]
|
||||
else:
|
||||
candidates = []
|
||||
|
||||
for candidate in candidates:
|
||||
if candidate.exists() and (candidate / 'Default').exists():
|
||||
if candidate.exists() and (candidate / "Default").exists():
|
||||
return candidate
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def get_browser_binary(browser: str) -> Optional[str]:
|
||||
def get_browser_binary(browser: str) -> str | None:
|
||||
system = platform.system()
|
||||
home = Path.home()
|
||||
browser = browser.lower()
|
||||
|
||||
if system == 'Darwin':
|
||||
if system == "Darwin":
|
||||
candidates = {
|
||||
'chrome': ['/Applications/Google Chrome.app/Contents/MacOS/Google Chrome'],
|
||||
'chromium': ['/Applications/Chromium.app/Contents/MacOS/Chromium'],
|
||||
'brave': ['/Applications/Brave Browser.app/Contents/MacOS/Brave Browser'],
|
||||
'edge': ['/Applications/Microsoft Edge.app/Contents/MacOS/Microsoft Edge'],
|
||||
"chrome": ["/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"],
|
||||
"chromium": ["/Applications/Chromium.app/Contents/MacOS/Chromium"],
|
||||
"brave": ["/Applications/Brave Browser.app/Contents/MacOS/Brave Browser"],
|
||||
"edge": ["/Applications/Microsoft Edge.app/Contents/MacOS/Microsoft Edge"],
|
||||
}.get(browser, [])
|
||||
elif system == 'Linux':
|
||||
elif system == "Linux":
|
||||
candidates = {
|
||||
'chrome': ['/usr/bin/google-chrome', '/usr/bin/google-chrome-stable', '/usr/bin/google-chrome-beta', '/usr/bin/google-chrome-unstable'],
|
||||
'chromium': ['/usr/bin/chromium', '/usr/bin/chromium-browser'],
|
||||
'brave': ['/usr/bin/brave-browser', '/usr/bin/brave-browser-beta', '/usr/bin/brave-browser-nightly'],
|
||||
'edge': ['/usr/bin/microsoft-edge', '/usr/bin/microsoft-edge-stable', '/usr/bin/microsoft-edge-beta', '/usr/bin/microsoft-edge-dev'],
|
||||
"chrome": [
|
||||
"/usr/bin/google-chrome",
|
||||
"/usr/bin/google-chrome-stable",
|
||||
"/usr/bin/google-chrome-beta",
|
||||
"/usr/bin/google-chrome-unstable",
|
||||
],
|
||||
"chromium": ["/usr/bin/chromium", "/usr/bin/chromium-browser"],
|
||||
"brave": ["/usr/bin/brave-browser", "/usr/bin/brave-browser-beta", "/usr/bin/brave-browser-nightly"],
|
||||
"edge": [
|
||||
"/usr/bin/microsoft-edge",
|
||||
"/usr/bin/microsoft-edge-stable",
|
||||
"/usr/bin/microsoft-edge-beta",
|
||||
"/usr/bin/microsoft-edge-dev",
|
||||
],
|
||||
}.get(browser, [])
|
||||
elif system == 'Windows':
|
||||
local_app_data = Path(os.environ.get('LOCALAPPDATA', home / 'AppData' / 'Local'))
|
||||
elif system == "Windows":
|
||||
local_app_data = Path(os.environ.get("LOCALAPPDATA", home / "AppData" / "Local"))
|
||||
candidates = {
|
||||
'chrome': [
|
||||
str(local_app_data / 'Google' / 'Chrome' / 'Application' / 'chrome.exe'),
|
||||
'C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe',
|
||||
'C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe',
|
||||
"chrome": [
|
||||
str(local_app_data / "Google" / "Chrome" / "Application" / "chrome.exe"),
|
||||
"C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe",
|
||||
"C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe",
|
||||
],
|
||||
'chromium': [str(local_app_data / 'Chromium' / 'Application' / 'chrome.exe')],
|
||||
'brave': [
|
||||
str(local_app_data / 'BraveSoftware' / 'Brave-Browser' / 'Application' / 'brave.exe'),
|
||||
'C:\\Program Files\\BraveSoftware\\Brave-Browser\\Application\\brave.exe',
|
||||
'C:\\Program Files (x86)\\BraveSoftware\\Brave-Browser\\Application\\brave.exe',
|
||||
"chromium": [str(local_app_data / "Chromium" / "Application" / "chrome.exe")],
|
||||
"brave": [
|
||||
str(local_app_data / "BraveSoftware" / "Brave-Browser" / "Application" / "brave.exe"),
|
||||
"C:\\Program Files\\BraveSoftware\\Brave-Browser\\Application\\brave.exe",
|
||||
"C:\\Program Files (x86)\\BraveSoftware\\Brave-Browser\\Application\\brave.exe",
|
||||
],
|
||||
'edge': [
|
||||
str(local_app_data / 'Microsoft' / 'Edge' / 'Application' / 'msedge.exe'),
|
||||
'C:\\Program Files\\Microsoft\\Edge\\Application\\msedge.exe',
|
||||
'C:\\Program Files (x86)\\Microsoft\\Edge\\Application\\msedge.exe',
|
||||
"edge": [
|
||||
str(local_app_data / "Microsoft" / "Edge" / "Application" / "msedge.exe"),
|
||||
"C:\\Program Files\\Microsoft\\Edge\\Application\\msedge.exe",
|
||||
"C:\\Program Files (x86)\\Microsoft\\Edge\\Application\\msedge.exe",
|
||||
],
|
||||
}.get(browser, [])
|
||||
else:
|
||||
@@ -190,13 +201,13 @@ def get_browser_binary(browser: str) -> Optional[str]:
|
||||
|
||||
|
||||
BROWSER_PROFILE_FINDERS = {
|
||||
'chrome': get_chrome_user_data_dir,
|
||||
'chromium': get_chrome_user_data_dir, # Same locations
|
||||
'brave': get_brave_user_data_dir,
|
||||
'edge': get_edge_user_data_dir,
|
||||
"chrome": get_chrome_user_data_dir,
|
||||
"chromium": get_chrome_user_data_dir, # Same locations
|
||||
"brave": get_brave_user_data_dir,
|
||||
"edge": get_edge_user_data_dir,
|
||||
}
|
||||
|
||||
CHROMIUM_BROWSERS = {'chrome', 'chromium', 'brave', 'edge'}
|
||||
CHROMIUM_BROWSERS = {"chrome", "chromium", "brave", "edge"}
|
||||
|
||||
|
||||
# =============================================================================
|
||||
@@ -204,12 +215,12 @@ CHROMIUM_BROWSERS = {'chrome', 'chromium', 'brave', 'edge'}
|
||||
# =============================================================================
|
||||
|
||||
NETSCAPE_COOKIE_HEADER = [
|
||||
'# Netscape HTTP Cookie File',
|
||||
'# https://curl.se/docs/http-cookies.html',
|
||||
'# This file was generated by ArchiveBox persona cookie extraction',
|
||||
'#',
|
||||
'# Format: domain\\tincludeSubdomains\\tpath\\tsecure\\texpiry\\tname\\tvalue',
|
||||
'',
|
||||
"# Netscape HTTP Cookie File",
|
||||
"# https://curl.se/docs/http-cookies.html",
|
||||
"# This file was generated by ArchiveBox persona cookie extraction",
|
||||
"#",
|
||||
"# Format: domain\\tincludeSubdomains\\tpath\\tsecure\\texpiry\\tname\\tvalue",
|
||||
"",
|
||||
]
|
||||
|
||||
|
||||
@@ -219,9 +230,9 @@ def _parse_netscape_cookies(path: Path) -> "OrderedDict[tuple[str, str, str], tu
|
||||
return cookies
|
||||
|
||||
for line in path.read_text().splitlines():
|
||||
if not line or line.startswith('#'):
|
||||
if not line or line.startswith("#"):
|
||||
continue
|
||||
parts = line.split('\t')
|
||||
parts = line.split("\t")
|
||||
if len(parts) < 7:
|
||||
continue
|
||||
domain, include_subdomains, cookie_path, secure, expiry, name, value = parts[:7]
|
||||
@@ -233,8 +244,8 @@ def _parse_netscape_cookies(path: Path) -> "OrderedDict[tuple[str, str, str], tu
|
||||
def _write_netscape_cookies(path: Path, cookies: "OrderedDict[tuple[str, str, str], tuple[str, str, str, str, str, str, str]]") -> None:
|
||||
lines = list(NETSCAPE_COOKIE_HEADER)
|
||||
for cookie in cookies.values():
|
||||
lines.append('\t'.join(cookie))
|
||||
path.write_text('\n'.join(lines) + '\n')
|
||||
lines.append("\t".join(cookie))
|
||||
path.write_text("\n".join(lines) + "\n")
|
||||
|
||||
|
||||
def _merge_netscape_cookies(existing_file: Path, new_file: Path) -> None:
|
||||
@@ -259,52 +270,52 @@ def extract_cookies_via_cdp(
|
||||
from archivebox.config.common import STORAGE_CONFIG
|
||||
|
||||
# Find the cookie extraction script
|
||||
chrome_plugin_dir = Path(__file__).parent.parent / 'plugins' / 'chrome'
|
||||
extract_script = chrome_plugin_dir / 'extract_cookies.js'
|
||||
chrome_plugin_dir = Path(__file__).parent.parent / "plugins" / "chrome"
|
||||
extract_script = chrome_plugin_dir / "extract_cookies.js"
|
||||
|
||||
if not extract_script.exists():
|
||||
rprint(f'[yellow]Cookie extraction script not found at {extract_script}[/yellow]', file=sys.stderr)
|
||||
rprint(f"[yellow]Cookie extraction script not found at {extract_script}[/yellow]", file=sys.stderr)
|
||||
return False
|
||||
|
||||
# Get node modules dir
|
||||
node_modules_dir = STORAGE_CONFIG.LIB_DIR / 'npm' / 'node_modules'
|
||||
node_modules_dir = STORAGE_CONFIG.LIB_DIR / "npm" / "node_modules"
|
||||
|
||||
# Set up environment
|
||||
env = os.environ.copy()
|
||||
env['NODE_MODULES_DIR'] = str(node_modules_dir)
|
||||
env['CHROME_USER_DATA_DIR'] = str(user_data_dir)
|
||||
env['CHROME_HEADLESS'] = 'true'
|
||||
env["NODE_MODULES_DIR"] = str(node_modules_dir)
|
||||
env["CHROME_USER_DATA_DIR"] = str(user_data_dir)
|
||||
env["CHROME_HEADLESS"] = "true"
|
||||
if chrome_binary:
|
||||
env['CHROME_BINARY'] = str(chrome_binary)
|
||||
env["CHROME_BINARY"] = str(chrome_binary)
|
||||
output_path = output_file
|
||||
temp_output = None
|
||||
temp_dir = None
|
||||
if output_file.exists():
|
||||
temp_dir = Path(tempfile.mkdtemp(prefix='ab_cookies_'))
|
||||
temp_output = temp_dir / 'cookies.txt'
|
||||
temp_dir = Path(tempfile.mkdtemp(prefix="ab_cookies_"))
|
||||
temp_output = temp_dir / "cookies.txt"
|
||||
output_path = temp_output
|
||||
if profile_dir:
|
||||
extra_arg = f'--profile-directory={profile_dir}'
|
||||
existing_extra = env.get('CHROME_ARGS_EXTRA', '').strip()
|
||||
extra_arg = f"--profile-directory={profile_dir}"
|
||||
existing_extra = env.get("CHROME_ARGS_EXTRA", "").strip()
|
||||
args_list = []
|
||||
if existing_extra:
|
||||
if existing_extra.startswith('['):
|
||||
if existing_extra.startswith("["):
|
||||
try:
|
||||
parsed = json.loads(existing_extra)
|
||||
if isinstance(parsed, list):
|
||||
args_list.extend(str(x) for x in parsed)
|
||||
except Exception:
|
||||
args_list.extend([s.strip() for s in existing_extra.split(',') if s.strip()])
|
||||
args_list.extend([s.strip() for s in existing_extra.split(",") if s.strip()])
|
||||
else:
|
||||
args_list.extend([s.strip() for s in existing_extra.split(',') if s.strip()])
|
||||
args_list.extend([s.strip() for s in existing_extra.split(",") if s.strip()])
|
||||
args_list.append(extra_arg)
|
||||
env['CHROME_ARGS_EXTRA'] = json.dumps(args_list)
|
||||
env["CHROME_ARGS_EXTRA"] = json.dumps(args_list)
|
||||
|
||||
env['COOKIES_OUTPUT_FILE'] = str(output_path)
|
||||
env["COOKIES_OUTPUT_FILE"] = str(output_path)
|
||||
|
||||
try:
|
||||
result = subprocess.run(
|
||||
['node', str(extract_script)],
|
||||
["node", str(extract_script)],
|
||||
env=env,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
@@ -316,17 +327,17 @@ def extract_cookies_via_cdp(
|
||||
_merge_netscape_cookies(output_file, temp_output)
|
||||
return True
|
||||
else:
|
||||
rprint(f'[yellow]Cookie extraction failed: {result.stderr}[/yellow]', file=sys.stderr)
|
||||
rprint(f"[yellow]Cookie extraction failed: {result.stderr}[/yellow]", file=sys.stderr)
|
||||
return False
|
||||
|
||||
except subprocess.TimeoutExpired:
|
||||
rprint('[yellow]Cookie extraction timed out[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]Cookie extraction timed out[/yellow]", file=sys.stderr)
|
||||
return False
|
||||
except FileNotFoundError:
|
||||
rprint('[yellow]Node.js not found. Cannot extract cookies.[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]Node.js not found. Cannot extract cookies.[/yellow]", file=sys.stderr)
|
||||
return False
|
||||
except Exception as e:
|
||||
rprint(f'[yellow]Cookie extraction error: {e}[/yellow]', file=sys.stderr)
|
||||
rprint(f"[yellow]Cookie extraction error: {e}[/yellow]", file=sys.stderr)
|
||||
return False
|
||||
finally:
|
||||
if temp_dir and temp_dir.exists():
|
||||
@@ -337,6 +348,7 @@ def extract_cookies_via_cdp(
|
||||
# Validation Helpers
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def validate_persona_name(name: str) -> tuple[bool, str]:
|
||||
"""
|
||||
Validate persona name to prevent path traversal attacks.
|
||||
@@ -348,19 +360,19 @@ def validate_persona_name(name: str) -> tuple[bool, str]:
|
||||
return False, "Persona name cannot be empty"
|
||||
|
||||
# Check for path separators
|
||||
if '/' in name or '\\' in name:
|
||||
if "/" in name or "\\" in name:
|
||||
return False, "Persona name cannot contain path separators (/ or \\)"
|
||||
|
||||
# Check for parent directory references
|
||||
if '..' in name:
|
||||
if ".." in name:
|
||||
return False, "Persona name cannot contain parent directory references (..)"
|
||||
|
||||
# Check for hidden files/directories
|
||||
if name.startswith('.'):
|
||||
if name.startswith("."):
|
||||
return False, "Persona name cannot start with a dot (.)"
|
||||
|
||||
# Ensure name doesn't contain null bytes or other dangerous chars
|
||||
if '\x00' in name or '\n' in name or '\r' in name:
|
||||
if "\x00" in name or "\n" in name or "\r" in name:
|
||||
return False, "Persona name contains invalid characters"
|
||||
|
||||
return True, ""
|
||||
@@ -394,10 +406,11 @@ def ensure_path_within_personas_dir(persona_path: Path) -> bool:
|
||||
# CREATE
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def create_personas(
|
||||
names: Iterable[str],
|
||||
import_from: Optional[str] = None,
|
||||
profile: Optional[str] = None,
|
||||
import_from: str | None = None,
|
||||
profile: str | None = None,
|
||||
) -> int:
|
||||
"""
|
||||
Create Personas from names.
|
||||
@@ -416,7 +429,7 @@ def create_personas(
|
||||
name_list = list(names) if names else []
|
||||
|
||||
if not name_list:
|
||||
rprint('[yellow]No persona names provided. Pass names as arguments.[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No persona names provided. Pass names as arguments.[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Validate import source if specified
|
||||
@@ -424,23 +437,23 @@ def create_personas(
|
||||
if import_from:
|
||||
import_from = import_from.lower()
|
||||
if import_from not in BROWSER_PROFILE_FINDERS:
|
||||
rprint(f'[red]Unknown browser: {import_from}[/red]', file=sys.stderr)
|
||||
rprint(f'[dim]Supported browsers: {", ".join(BROWSER_PROFILE_FINDERS.keys())}[/dim]', file=sys.stderr)
|
||||
rprint(f"[red]Unknown browser: {import_from}[/red]", file=sys.stderr)
|
||||
rprint(f"[dim]Supported browsers: {', '.join(BROWSER_PROFILE_FINDERS.keys())}[/dim]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
source_profile_dir = BROWSER_PROFILE_FINDERS[import_from]()
|
||||
if not source_profile_dir:
|
||||
rprint(f'[red]Could not find {import_from} profile directory[/red]', file=sys.stderr)
|
||||
rprint(f"[red]Could not find {import_from} profile directory[/red]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
rprint(f'[dim]Found {import_from} profile: {source_profile_dir}[/dim]', file=sys.stderr)
|
||||
rprint(f"[dim]Found {import_from} profile: {source_profile_dir}[/dim]", file=sys.stderr)
|
||||
|
||||
if profile is None and (source_profile_dir / 'Default').exists():
|
||||
profile = 'Default'
|
||||
if profile is None and (source_profile_dir / "Default").exists():
|
||||
profile = "Default"
|
||||
|
||||
browser_binary = get_browser_binary(import_from)
|
||||
if browser_binary:
|
||||
rprint(f'[dim]Using {import_from} binary: {browser_binary}[/dim]', file=sys.stderr)
|
||||
rprint(f"[dim]Using {import_from} binary: {browser_binary}[/dim]", file=sys.stderr)
|
||||
|
||||
created_count = 0
|
||||
for name in name_list:
|
||||
@@ -459,11 +472,11 @@ def create_personas(
|
||||
if created:
|
||||
persona.ensure_dirs()
|
||||
created_count += 1
|
||||
rprint(f'[green]Created persona: {name}[/green]', file=sys.stderr)
|
||||
rprint(f"[green]Created persona: {name}[/green]", file=sys.stderr)
|
||||
else:
|
||||
rprint(f'[dim]Persona already exists: {name}[/dim]', file=sys.stderr)
|
||||
rprint(f"[dim]Persona already exists: {name}[/dim]", file=sys.stderr)
|
||||
|
||||
cookies_file = Path(persona.path) / 'cookies.txt'
|
||||
cookies_file = Path(persona.path) / "cookies.txt"
|
||||
|
||||
# Import browser profile if requested
|
||||
if import_from in CHROMIUM_BROWSERS and source_profile_dir is not None:
|
||||
@@ -477,29 +490,31 @@ def create_personas(
|
||||
capture_storage=False,
|
||||
)
|
||||
except Exception as e:
|
||||
rprint(f'[red]Failed to import browser profile: {e}[/red]', file=sys.stderr)
|
||||
rprint(f"[red]Failed to import browser profile: {e}[/red]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
if import_result.profile_copied:
|
||||
rprint('[green]Copied browser profile to persona[/green]', file=sys.stderr)
|
||||
rprint("[green]Copied browser profile to persona[/green]", file=sys.stderr)
|
||||
if import_result.cookies_imported:
|
||||
rprint(f'[green]Extracted cookies to {cookies_file}[/green]', file=sys.stderr)
|
||||
rprint(f"[green]Extracted cookies to {cookies_file}[/green]", file=sys.stderr)
|
||||
elif not import_result.profile_copied:
|
||||
rprint('[yellow]Could not import cookies automatically.[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]Could not import cookies automatically.[/yellow]", file=sys.stderr)
|
||||
|
||||
for warning in import_result.warnings:
|
||||
rprint(f'[yellow]{warning}[/yellow]', file=sys.stderr)
|
||||
rprint(f"[yellow]{warning}[/yellow]", file=sys.stderr)
|
||||
|
||||
if not is_tty:
|
||||
write_record({
|
||||
'id': str(persona.id) if hasattr(persona, 'id') else None,
|
||||
'name': persona.name,
|
||||
'path': str(persona.path),
|
||||
'CHROME_USER_DATA_DIR': persona.CHROME_USER_DATA_DIR,
|
||||
'COOKIES_FILE': persona.COOKIES_FILE,
|
||||
})
|
||||
write_record(
|
||||
{
|
||||
"id": str(persona.id) if hasattr(persona, "id") else None,
|
||||
"name": persona.name,
|
||||
"path": str(persona.path),
|
||||
"CHROME_USER_DATA_DIR": persona.CHROME_USER_DATA_DIR,
|
||||
"COOKIES_FILE": persona.COOKIES_FILE,
|
||||
},
|
||||
)
|
||||
|
||||
rprint(f'[green]Created {created_count} new persona(s)[/green]', file=sys.stderr)
|
||||
rprint(f"[green]Created {created_count} new persona(s)[/green]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -507,10 +522,11 @@ def create_personas(
|
||||
# LIST
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def list_personas(
|
||||
name: Optional[str] = None,
|
||||
name__icontains: Optional[str] = None,
|
||||
limit: Optional[int] = None,
|
||||
name: str | None = None,
|
||||
name__icontains: str | None = None,
|
||||
limit: int | None = None,
|
||||
) -> int:
|
||||
"""
|
||||
List Personas as JSONL with optional filters.
|
||||
@@ -523,33 +539,35 @@ def list_personas(
|
||||
|
||||
is_tty = sys.stdout.isatty()
|
||||
|
||||
queryset = Persona.objects.all().order_by('name')
|
||||
queryset = Persona.objects.all().order_by("name")
|
||||
|
||||
# Apply filters
|
||||
filter_kwargs = {
|
||||
'name': name,
|
||||
'name__icontains': name__icontains,
|
||||
"name": name,
|
||||
"name__icontains": name__icontains,
|
||||
}
|
||||
queryset = apply_filters(queryset, filter_kwargs, limit=limit)
|
||||
|
||||
count = 0
|
||||
for persona in queryset:
|
||||
cookies_status = '[green]✓[/green]' if persona.COOKIES_FILE else '[dim]✗[/dim]'
|
||||
chrome_status = '[green]✓[/green]' if Path(persona.CHROME_USER_DATA_DIR).exists() else '[dim]✗[/dim]'
|
||||
cookies_status = "[green]✓[/green]" if persona.COOKIES_FILE else "[dim]✗[/dim]"
|
||||
chrome_status = "[green]✓[/green]" if Path(persona.CHROME_USER_DATA_DIR).exists() else "[dim]✗[/dim]"
|
||||
|
||||
if is_tty:
|
||||
rprint(f'[cyan]{persona.name:20}[/cyan] cookies:{cookies_status} chrome:{chrome_status} [dim]{persona.path}[/dim]')
|
||||
rprint(f"[cyan]{persona.name:20}[/cyan] cookies:{cookies_status} chrome:{chrome_status} [dim]{persona.path}[/dim]")
|
||||
else:
|
||||
write_record({
|
||||
'id': str(persona.id) if hasattr(persona, 'id') else None,
|
||||
'name': persona.name,
|
||||
'path': str(persona.path),
|
||||
'CHROME_USER_DATA_DIR': persona.CHROME_USER_DATA_DIR,
|
||||
'COOKIES_FILE': persona.COOKIES_FILE,
|
||||
})
|
||||
write_record(
|
||||
{
|
||||
"id": str(persona.id) if hasattr(persona, "id") else None,
|
||||
"name": persona.name,
|
||||
"path": str(persona.path),
|
||||
"CHROME_USER_DATA_DIR": persona.CHROME_USER_DATA_DIR,
|
||||
"COOKIES_FILE": persona.COOKIES_FILE,
|
||||
},
|
||||
)
|
||||
count += 1
|
||||
|
||||
rprint(f'[dim]Listed {count} persona(s)[/dim]', file=sys.stderr)
|
||||
rprint(f"[dim]Listed {count} persona(s)[/dim]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -557,7 +575,8 @@ def list_personas(
|
||||
# UPDATE
|
||||
# =============================================================================
|
||||
|
||||
def update_personas(name: Optional[str] = None) -> int:
|
||||
|
||||
def update_personas(name: str | None = None) -> int:
|
||||
"""
|
||||
Update Personas from stdin JSONL.
|
||||
|
||||
@@ -575,13 +594,13 @@ def update_personas(name: Optional[str] = None) -> int:
|
||||
|
||||
records = list(read_stdin())
|
||||
if not records:
|
||||
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
updated_count = 0
|
||||
for record in records:
|
||||
persona_id = record.get('id')
|
||||
old_name = record.get('name')
|
||||
persona_id = record.get("id")
|
||||
old_name = record.get("name")
|
||||
|
||||
if not persona_id and not old_name:
|
||||
continue
|
||||
@@ -613,17 +632,19 @@ def update_personas(name: Optional[str] = None) -> int:
|
||||
updated_count += 1
|
||||
|
||||
if not is_tty:
|
||||
write_record({
|
||||
'id': str(persona.id) if hasattr(persona, 'id') else None,
|
||||
'name': persona.name,
|
||||
'path': str(persona.path),
|
||||
})
|
||||
write_record(
|
||||
{
|
||||
"id": str(persona.id) if hasattr(persona, "id") else None,
|
||||
"name": persona.name,
|
||||
"path": str(persona.path),
|
||||
},
|
||||
)
|
||||
|
||||
except Persona.DoesNotExist:
|
||||
rprint(f'[yellow]Persona not found: {persona_id or old_name}[/yellow]', file=sys.stderr)
|
||||
rprint(f"[yellow]Persona not found: {persona_id or old_name}[/yellow]", file=sys.stderr)
|
||||
continue
|
||||
|
||||
rprint(f'[green]Updated {updated_count} persona(s)[/green]', file=sys.stderr)
|
||||
rprint(f"[green]Updated {updated_count} persona(s)[/green]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -631,6 +652,7 @@ def update_personas(name: Optional[str] = None) -> int:
|
||||
# DELETE
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def delete_personas(yes: bool = False, dry_run: bool = False) -> int:
|
||||
"""
|
||||
Delete Personas from stdin JSONL.
|
||||
@@ -646,23 +668,24 @@ def delete_personas(yes: bool = False, dry_run: bool = False) -> int:
|
||||
|
||||
records = list(read_stdin())
|
||||
if not records:
|
||||
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Collect persona IDs or names
|
||||
persona_ids = []
|
||||
persona_names = []
|
||||
for r in records:
|
||||
if r.get('id'):
|
||||
persona_ids.append(r['id'])
|
||||
elif r.get('name'):
|
||||
persona_names.append(r['name'])
|
||||
if r.get("id"):
|
||||
persona_ids.append(r["id"])
|
||||
elif r.get("name"):
|
||||
persona_names.append(r["name"])
|
||||
|
||||
if not persona_ids and not persona_names:
|
||||
rprint('[yellow]No valid persona IDs or names in input[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No valid persona IDs or names in input[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
from django.db.models import Q
|
||||
|
||||
query = Q()
|
||||
if persona_ids:
|
||||
query |= Q(id__in=persona_ids)
|
||||
@@ -673,17 +696,17 @@ def delete_personas(yes: bool = False, dry_run: bool = False) -> int:
|
||||
count = personas.count()
|
||||
|
||||
if count == 0:
|
||||
rprint('[yellow]No matching personas found[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No matching personas found[/yellow]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
if dry_run:
|
||||
rprint(f'[yellow]Would delete {count} persona(s) (dry run)[/yellow]', file=sys.stderr)
|
||||
rprint(f"[yellow]Would delete {count} persona(s) (dry run)[/yellow]", file=sys.stderr)
|
||||
for persona in personas:
|
||||
rprint(f' {persona.name} ({persona.path})', file=sys.stderr)
|
||||
rprint(f" {persona.name} ({persona.path})", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
if not yes:
|
||||
rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr)
|
||||
rprint("[red]Use --yes to confirm deletion[/red]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Delete persona directories and database records
|
||||
@@ -701,7 +724,7 @@ def delete_personas(yes: bool = False, dry_run: bool = False) -> int:
|
||||
persona.delete()
|
||||
deleted_count += 1
|
||||
|
||||
rprint(f'[green]Deleted {deleted_count} persona(s)[/green]', file=sys.stderr)
|
||||
rprint(f"[green]Deleted {deleted_count} persona(s)[/green]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -709,44 +732,45 @@ def delete_personas(yes: bool = False, dry_run: bool = False) -> int:
|
||||
# CLI Commands
|
||||
# =============================================================================
|
||||
|
||||
|
||||
@click.group()
|
||||
def main():
|
||||
"""Manage Persona records (browser profiles)."""
|
||||
pass
|
||||
|
||||
|
||||
@main.command('create')
|
||||
@click.argument('names', nargs=-1)
|
||||
@click.option('--import', 'import_from', help='Import profile from browser (chrome, chromium, brave, edge)')
|
||||
@click.option('--profile', help='Profile directory name under the user data dir (e.g. Default, Profile 1)')
|
||||
def create_cmd(names: tuple, import_from: Optional[str], profile: Optional[str]):
|
||||
@main.command("create")
|
||||
@click.argument("names", nargs=-1)
|
||||
@click.option("--import", "import_from", help="Import profile from browser (chrome, chromium, brave, edge)")
|
||||
@click.option("--profile", help="Profile directory name under the user data dir (e.g. Default, Profile 1)")
|
||||
def create_cmd(names: tuple, import_from: str | None, profile: str | None):
|
||||
"""Create Personas, optionally importing from a browser profile."""
|
||||
sys.exit(create_personas(names, import_from=import_from, profile=profile))
|
||||
|
||||
|
||||
@main.command('list')
|
||||
@click.option('--name', help='Filter by exact name')
|
||||
@click.option('--name__icontains', help='Filter by name contains')
|
||||
@click.option('--limit', '-n', type=int, help='Limit number of results')
|
||||
def list_cmd(name: Optional[str], name__icontains: Optional[str], limit: Optional[int]):
|
||||
@main.command("list")
|
||||
@click.option("--name", help="Filter by exact name")
|
||||
@click.option("--name__icontains", help="Filter by name contains")
|
||||
@click.option("--limit", "-n", type=int, help="Limit number of results")
|
||||
def list_cmd(name: str | None, name__icontains: str | None, limit: int | None):
|
||||
"""List Personas as JSONL."""
|
||||
sys.exit(list_personas(name=name, name__icontains=name__icontains, limit=limit))
|
||||
|
||||
|
||||
@main.command('update')
|
||||
@click.option('--name', '-n', help='Set new name')
|
||||
def update_cmd(name: Optional[str]):
|
||||
@main.command("update")
|
||||
@click.option("--name", "-n", help="Set new name")
|
||||
def update_cmd(name: str | None):
|
||||
"""Update Personas from stdin JSONL."""
|
||||
sys.exit(update_personas(name=name))
|
||||
|
||||
|
||||
@main.command('delete')
|
||||
@click.option('--yes', '-y', is_flag=True, help='Confirm deletion')
|
||||
@click.option('--dry-run', is_flag=True, help='Show what would be deleted')
|
||||
@main.command("delete")
|
||||
@click.option("--yes", "-y", is_flag=True, help="Confirm deletion")
|
||||
@click.option("--dry-run", is_flag=True, help="Show what would be deleted")
|
||||
def delete_cmd(yes: bool, dry_run: bool):
|
||||
"""Delete Personas from stdin JSONL."""
|
||||
sys.exit(delete_personas(yes=yes, dry_run=dry_run))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -1,8 +1,7 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__package__ = "archivebox.cli"
|
||||
|
||||
from typing import Optional
|
||||
|
||||
import rich_click as click
|
||||
|
||||
@@ -137,7 +136,7 @@ BINARY_MACHINE_DIAGRAM = """
|
||||
@enforce_types
|
||||
def pluginmap(
|
||||
show_disabled: bool = False,
|
||||
model: Optional[str] = None,
|
||||
model: str | None = None,
|
||||
quiet: bool = False,
|
||||
) -> dict:
|
||||
"""
|
||||
@@ -164,25 +163,25 @@ def pluginmap(
|
||||
|
||||
# Model event types that can have hooks
|
||||
model_events = {
|
||||
'Crawl': {
|
||||
'description': 'Hooks run when a Crawl starts (QUEUED→STARTED)',
|
||||
'machine': 'CrawlMachine',
|
||||
'diagram': CRAWL_MACHINE_DIAGRAM,
|
||||
"Crawl": {
|
||||
"description": "Hooks run when a Crawl starts (QUEUED→STARTED)",
|
||||
"machine": "CrawlMachine",
|
||||
"diagram": CRAWL_MACHINE_DIAGRAM,
|
||||
},
|
||||
'CrawlEnd': {
|
||||
'description': 'Hooks run when a Crawl finishes (STARTED→SEALED)',
|
||||
'machine': 'CrawlMachine',
|
||||
'diagram': None, # Part of CrawlMachine
|
||||
"CrawlEnd": {
|
||||
"description": "Hooks run when a Crawl finishes (STARTED→SEALED)",
|
||||
"machine": "CrawlMachine",
|
||||
"diagram": None, # Part of CrawlMachine
|
||||
},
|
||||
'Snapshot': {
|
||||
'description': 'Hooks run for each Snapshot (creates ArchiveResults)',
|
||||
'machine': 'SnapshotMachine',
|
||||
'diagram': SNAPSHOT_MACHINE_DIAGRAM,
|
||||
"Snapshot": {
|
||||
"description": "Hooks run for each Snapshot (creates ArchiveResults)",
|
||||
"machine": "SnapshotMachine",
|
||||
"diagram": SNAPSHOT_MACHINE_DIAGRAM,
|
||||
},
|
||||
'Binary': {
|
||||
'description': 'Hooks for installing binary dependencies (providers)',
|
||||
'machine': 'BinaryMachine',
|
||||
'diagram': BINARY_MACHINE_DIAGRAM,
|
||||
"Binary": {
|
||||
"description": "Hooks for installing binary dependencies (providers)",
|
||||
"machine": "BinaryMachine",
|
||||
"diagram": BINARY_MACHINE_DIAGRAM,
|
||||
},
|
||||
}
|
||||
|
||||
@@ -195,16 +194,16 @@ def pluginmap(
|
||||
model_events = {model: model_events[model]}
|
||||
|
||||
result = {
|
||||
'models': {},
|
||||
'plugins_dir': str(BUILTIN_PLUGINS_DIR),
|
||||
'user_plugins_dir': str(USER_PLUGINS_DIR),
|
||||
"models": {},
|
||||
"plugins_dir": str(BUILTIN_PLUGINS_DIR),
|
||||
"user_plugins_dir": str(USER_PLUGINS_DIR),
|
||||
}
|
||||
|
||||
if not quiet:
|
||||
prnt()
|
||||
prnt('[bold cyan]ArchiveBox Plugin Map[/bold cyan]')
|
||||
prnt(f'[dim]Built-in plugins: {BUILTIN_PLUGINS_DIR}[/dim]')
|
||||
prnt(f'[dim]User plugins: {USER_PLUGINS_DIR}[/dim]')
|
||||
prnt("[bold cyan]ArchiveBox Plugin Map[/bold cyan]")
|
||||
prnt(f"[dim]Built-in plugins: {BUILTIN_PLUGINS_DIR}[/dim]")
|
||||
prnt(f"[dim]User plugins: {USER_PLUGINS_DIR}[/dim]")
|
||||
prnt()
|
||||
|
||||
for event_name, info in model_events.items():
|
||||
@@ -218,88 +217,93 @@ def pluginmap(
|
||||
plugin_name = hook_path.parent.name
|
||||
is_bg = is_background_hook(hook_path.name)
|
||||
|
||||
hook_infos.append({
|
||||
'path': str(hook_path),
|
||||
'name': hook_path.name,
|
||||
'plugin': plugin_name,
|
||||
'is_background': is_bg,
|
||||
'extension': hook_path.suffix,
|
||||
})
|
||||
hook_infos.append(
|
||||
{
|
||||
"path": str(hook_path),
|
||||
"name": hook_path.name,
|
||||
"plugin": plugin_name,
|
||||
"is_background": is_bg,
|
||||
"extension": hook_path.suffix,
|
||||
},
|
||||
)
|
||||
|
||||
result['models'][event_name] = {
|
||||
'description': info['description'],
|
||||
'machine': info['machine'],
|
||||
'hooks': hook_infos,
|
||||
'hook_count': len(hook_infos),
|
||||
result["models"][event_name] = {
|
||||
"description": info["description"],
|
||||
"machine": info["machine"],
|
||||
"hooks": hook_infos,
|
||||
"hook_count": len(hook_infos),
|
||||
}
|
||||
|
||||
if not quiet:
|
||||
# Show diagram if this model has one
|
||||
if info.get('diagram'):
|
||||
assert info['diagram'] is not None
|
||||
prnt(Panel(
|
||||
info['diagram'],
|
||||
title=f'[bold green]{info["machine"]}[/bold green]',
|
||||
border_style='green',
|
||||
expand=False,
|
||||
))
|
||||
if info.get("diagram"):
|
||||
assert info["diagram"] is not None
|
||||
prnt(
|
||||
Panel(
|
||||
info["diagram"],
|
||||
title=f"[bold green]{info['machine']}[/bold green]",
|
||||
border_style="green",
|
||||
expand=False,
|
||||
),
|
||||
)
|
||||
prnt()
|
||||
|
||||
# Create hooks table
|
||||
table = Table(
|
||||
title=f'[bold yellow]on_{event_name}__* Hooks[/bold yellow] ({len(hooks)} found)',
|
||||
title=f"[bold yellow]on_{event_name}__* Hooks[/bold yellow] ({len(hooks)} found)",
|
||||
box=box.ROUNDED,
|
||||
show_header=True,
|
||||
header_style='bold magenta',
|
||||
header_style="bold magenta",
|
||||
)
|
||||
table.add_column('Plugin', style='cyan', width=20)
|
||||
table.add_column('Hook Name', style='green')
|
||||
table.add_column('BG', justify='center', width=4)
|
||||
table.add_column('Type', justify='center', width=5)
|
||||
table.add_column("Plugin", style="cyan", width=20)
|
||||
table.add_column("Hook Name", style="green")
|
||||
table.add_column("BG", justify="center", width=4)
|
||||
table.add_column("Type", justify="center", width=5)
|
||||
|
||||
# Sort lexicographically by hook name
|
||||
sorted_hooks = sorted(hook_infos, key=lambda h: h['name'])
|
||||
sorted_hooks = sorted(hook_infos, key=lambda h: h["name"])
|
||||
|
||||
for hook in sorted_hooks:
|
||||
bg_marker = '[yellow]bg[/yellow]' if hook['is_background'] else ''
|
||||
ext = hook['extension'].lstrip('.')
|
||||
bg_marker = "[yellow]bg[/yellow]" if hook["is_background"] else ""
|
||||
ext = hook["extension"].lstrip(".")
|
||||
table.add_row(
|
||||
hook['plugin'],
|
||||
hook['name'],
|
||||
hook["plugin"],
|
||||
hook["name"],
|
||||
bg_marker,
|
||||
ext,
|
||||
)
|
||||
|
||||
prnt(table)
|
||||
prnt()
|
||||
prnt(f'[dim]{info["description"]}[/dim]')
|
||||
prnt(f"[dim]{info['description']}[/dim]")
|
||||
prnt()
|
||||
|
||||
# Summary
|
||||
if not quiet:
|
||||
total_hooks = sum(m['hook_count'] for m in result['models'].values())
|
||||
prnt(f'[bold]Total hooks discovered: {total_hooks}[/bold]')
|
||||
total_hooks = sum(m["hook_count"] for m in result["models"].values())
|
||||
prnt(f"[bold]Total hooks discovered: {total_hooks}[/bold]")
|
||||
prnt()
|
||||
prnt('[dim]Hook naming convention: on_{Model}__{XX}_{description}[.bg].{ext}[/dim]')
|
||||
prnt('[dim] - XX: Two-digit lexicographic order (00-99)[/dim]')
|
||||
prnt('[dim] - .bg: Background hook (non-blocking)[/dim]')
|
||||
prnt('[dim] - ext: py, sh, or js[/dim]')
|
||||
prnt("[dim]Hook naming convention: on_{Model}__{XX}_{description}[.bg].{ext}[/dim]")
|
||||
prnt("[dim] - XX: Two-digit lexicographic order (00-99)[/dim]")
|
||||
prnt("[dim] - .bg: Background hook (non-blocking)[/dim]")
|
||||
prnt("[dim] - ext: py, sh, or js[/dim]")
|
||||
prnt()
|
||||
|
||||
return result
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option('--show-disabled', '-a', is_flag=True, help='Show hooks from disabled plugins too')
|
||||
@click.option('--model', '-m', type=str, default=None, help='Filter to specific model (Crawl, Snapshot, Binary, CrawlEnd)')
|
||||
@click.option('--quiet', '-q', is_flag=True, help='Output JSON only, no ASCII diagrams')
|
||||
@click.option("--show-disabled", "-a", is_flag=True, help="Show hooks from disabled plugins too")
|
||||
@click.option("--model", "-m", type=str, default=None, help="Filter to specific model (Crawl, Snapshot, Binary, CrawlEnd)")
|
||||
@click.option("--quiet", "-q", is_flag=True, help="Output JSON only, no ASCII diagrams")
|
||||
@docstring(pluginmap.__doc__)
|
||||
def main(**kwargs):
|
||||
import json
|
||||
|
||||
result = pluginmap(**kwargs)
|
||||
if kwargs.get('quiet'):
|
||||
if kwargs.get("quiet"):
|
||||
print(json.dumps(result, indent=2))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -22,11 +22,10 @@ Examples:
|
||||
archivebox process list --limit=10
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox process'
|
||||
__package__ = "archivebox.cli"
|
||||
__command__ = "archivebox process"
|
||||
|
||||
import sys
|
||||
from typing import Optional
|
||||
|
||||
import rich_click as click
|
||||
from rich import print as rprint
|
||||
@@ -38,10 +37,11 @@ from archivebox.cli.cli_utils import apply_filters
|
||||
# LIST
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def list_processes(
|
||||
binary_name: Optional[str] = None,
|
||||
machine_id: Optional[str] = None,
|
||||
limit: Optional[int] = None,
|
||||
binary_name: str | None = None,
|
||||
machine_id: str | None = None,
|
||||
limit: int | None = None,
|
||||
) -> int:
|
||||
"""
|
||||
List Processes as JSONL with optional filters.
|
||||
@@ -54,29 +54,29 @@ def list_processes(
|
||||
|
||||
is_tty = sys.stdout.isatty()
|
||||
|
||||
queryset = Process.objects.all().select_related('binary', 'machine').order_by('-start_ts')
|
||||
queryset = Process.objects.all().select_related("binary", "machine").order_by("-start_ts")
|
||||
|
||||
# Apply filters
|
||||
filter_kwargs = {}
|
||||
if binary_name:
|
||||
filter_kwargs['binary__name'] = binary_name
|
||||
filter_kwargs["binary__name"] = binary_name
|
||||
if machine_id:
|
||||
filter_kwargs['machine_id'] = machine_id
|
||||
filter_kwargs["machine_id"] = machine_id
|
||||
|
||||
queryset = apply_filters(queryset, filter_kwargs, limit=limit)
|
||||
|
||||
count = 0
|
||||
for process in queryset:
|
||||
if is_tty:
|
||||
binary_name_str = process.binary.name if process.binary else 'unknown'
|
||||
exit_code = process.exit_code if process.exit_code is not None else '?'
|
||||
status_color = 'green' if process.exit_code == 0 else 'red' if process.exit_code else 'yellow'
|
||||
rprint(f'[{status_color}]exit={exit_code:3}[/{status_color}] [cyan]{binary_name_str:15}[/cyan] [dim]{process.id}[/dim]')
|
||||
binary_name_str = process.binary.name if process.binary else "unknown"
|
||||
exit_code = process.exit_code if process.exit_code is not None else "?"
|
||||
status_color = "green" if process.exit_code == 0 else "red" if process.exit_code else "yellow"
|
||||
rprint(f"[{status_color}]exit={exit_code:3}[/{status_color}] [cyan]{binary_name_str:15}[/cyan] [dim]{process.id}[/dim]")
|
||||
else:
|
||||
write_record(process.to_json())
|
||||
count += 1
|
||||
|
||||
rprint(f'[dim]Listed {count} processes[/dim]', file=sys.stderr)
|
||||
rprint(f"[dim]Listed {count} processes[/dim]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -84,24 +84,27 @@ def list_processes(
|
||||
# CLI Commands
|
||||
# =============================================================================
|
||||
|
||||
|
||||
@click.group()
|
||||
def main():
|
||||
"""Manage Process records (read-only, system-managed)."""
|
||||
pass
|
||||
|
||||
|
||||
@main.command("list")
@click.option("--binary-name", "-b", help="Filter by binary name")
@click.option("--machine-id", "-m", help="Filter by machine ID")
@click.option("--limit", "-n", type=int, help="Limit number of results")
def list_cmd(binary_name: str | None, machine_id: str | None, limit: int | None):
    """List Processes as JSONL."""
    # list_processes() hands back the integer exit status for this command;
    # forward it unchanged as the process exit code.
    exit_status = list_processes(
        binary_name=binary_name,
        machine_id=machine_id,
        limit=limit,
    )
    sys.exit(exit_status)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -1,11 +1,11 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox remove'
|
||||
__package__ = "archivebox.cli"
|
||||
__command__ = "archivebox remove"
|
||||
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from typing import Iterable
|
||||
from collections.abc import Iterable
|
||||
|
||||
import rich_click as click
|
||||
|
||||
@@ -26,25 +26,27 @@ from archivebox.misc.logging_util import (
|
||||
|
||||
|
||||
@enforce_types
|
||||
def remove(filter_patterns: Iterable[str]=(),
|
||||
filter_type: str='exact',
|
||||
snapshots: QuerySet | None=None,
|
||||
after: float | None=None,
|
||||
before: float | None=None,
|
||||
yes: bool=False,
|
||||
delete: bool=False,
|
||||
out_dir: Path=DATA_DIR) -> QuerySet:
|
||||
def remove(
|
||||
filter_patterns: Iterable[str] = (),
|
||||
filter_type: str = "exact",
|
||||
snapshots: QuerySet | None = None,
|
||||
after: float | None = None,
|
||||
before: float | None = None,
|
||||
yes: bool = False,
|
||||
delete: bool = False,
|
||||
out_dir: Path = DATA_DIR,
|
||||
) -> QuerySet:
|
||||
"""Remove the specified URLs from the archive"""
|
||||
|
||||
|
||||
setup_django()
|
||||
check_data_folder()
|
||||
|
||||
|
||||
from archivebox.cli.archivebox_search import get_snapshots
|
||||
|
||||
pattern_list = list(filter_patterns)
|
||||
|
||||
log_list_started(pattern_list or None, filter_type)
|
||||
timer = TimedProgress(360, prefix=' ')
|
||||
timer = TimedProgress(360, prefix=" ")
|
||||
try:
|
||||
snapshots = get_snapshots(
|
||||
snapshots=snapshots,
|
||||
@@ -63,7 +65,7 @@ def remove(filter_patterns: Iterable[str]=(),
|
||||
log_list_finished(snapshots)
|
||||
log_removal_started(snapshots, yes=yes, delete=delete)
|
||||
|
||||
timer = TimedProgress(360, prefix=' ')
|
||||
timer = TimedProgress(360, prefix=" ")
|
||||
try:
|
||||
for snapshot in snapshots:
|
||||
if delete:
|
||||
@@ -88,17 +90,23 @@ def remove(filter_patterns: Iterable[str]=(),
|
||||
|
||||
|
||||
@click.command()
@click.option(
    "--yes",
    is_flag=True,
    help="Remove links instantly without prompting to confirm",
)
@click.option(
    "--delete",
    is_flag=True,
    help="Delete the archived content and metadata folder in addition to removing from index",
)
@click.option(
    "--before",
    type=float,
    help="Remove only URLs bookmarked before timestamp",
)
@click.option(
    "--after",
    type=float,
    help="Remove only URLs bookmarked after timestamp",
)
@click.option(
    "--filter-type",
    "-f",
    type=click.Choice(("exact", "substring", "domain", "regex", "tag")),
    default="exact",
    help="Type of pattern matching to use when filtering URLs",
)
@click.argument("filter_patterns", nargs=-1)
@docstring(remove.__doc__)
def main(**kwargs):
    """Remove the specified URLs from the archive"""
    # Thin CLI shim: every parsed option/argument name matches a keyword
    # parameter of remove(), so the mapping is forwarded untouched.
    remove(**kwargs)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -37,8 +37,8 @@ Examples:
|
||||
archivebox run --binary-id=019b7e90-5a8e-712c-9877-2c70eebe80ad
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox run'
|
||||
__package__ = "archivebox.cli"
|
||||
__command__ = "archivebox run"
|
||||
|
||||
import sys
|
||||
from collections import defaultdict
|
||||
@@ -87,8 +87,8 @@ def process_stdin_records() -> int:
|
||||
binary_ids: list[str] = []
|
||||
|
||||
for record in records:
|
||||
record_type = record.get('type', '')
|
||||
record_id = record.get('id')
|
||||
record_type = record.get("type", "")
|
||||
record_id = record.get("id")
|
||||
|
||||
try:
|
||||
if record_type == TYPE_CRAWL:
|
||||
@@ -97,10 +97,10 @@ def process_stdin_records() -> int:
|
||||
try:
|
||||
crawl = Crawl.objects.get(id=record_id)
|
||||
except Crawl.DoesNotExist:
|
||||
crawl = Crawl.from_json(record, overrides={'created_by_id': created_by_id})
|
||||
crawl = Crawl.from_json(record, overrides={"created_by_id": created_by_id})
|
||||
else:
|
||||
# New crawl - create it
|
||||
crawl = Crawl.from_json(record, overrides={'created_by_id': created_by_id})
|
||||
crawl = Crawl.from_json(record, overrides={"created_by_id": created_by_id})
|
||||
|
||||
if crawl:
|
||||
crawl.retry_at = timezone.now()
|
||||
@@ -112,16 +112,16 @@ def process_stdin_records() -> int:
|
||||
output_records.append(crawl.to_json())
|
||||
queued_count += 1
|
||||
|
||||
elif record_type == TYPE_SNAPSHOT or (record.get('url') and not record_type):
|
||||
elif record_type == TYPE_SNAPSHOT or (record.get("url") and not record_type):
|
||||
if record_id:
|
||||
# Existing snapshot - re-queue
|
||||
try:
|
||||
snapshot = Snapshot.objects.get(id=record_id)
|
||||
except Snapshot.DoesNotExist:
|
||||
snapshot = Snapshot.from_json(record, overrides={'created_by_id': created_by_id})
|
||||
snapshot = Snapshot.from_json(record, overrides={"created_by_id": created_by_id})
|
||||
else:
|
||||
# New snapshot - create it
|
||||
snapshot = Snapshot.from_json(record, overrides={'created_by_id': created_by_id})
|
||||
snapshot = Snapshot.from_json(record, overrides={"created_by_id": created_by_id})
|
||||
|
||||
if snapshot:
|
||||
snapshot.retry_at = timezone.now()
|
||||
@@ -132,7 +132,7 @@ def process_stdin_records() -> int:
|
||||
crawl.retry_at = timezone.now()
|
||||
if crawl.status != Crawl.StatusChoices.STARTED:
|
||||
crawl.status = Crawl.StatusChoices.QUEUED
|
||||
crawl.save(update_fields=['status', 'retry_at', 'modified_at'])
|
||||
crawl.save(update_fields=["status", "retry_at", "modified_at"])
|
||||
crawl_id = str(snapshot.crawl_id)
|
||||
snapshot_ids_by_crawl[crawl_id].add(str(snapshot.id))
|
||||
run_all_plugins_for_crawl.add(crawl_id)
|
||||
@@ -149,11 +149,16 @@ def process_stdin_records() -> int:
|
||||
else:
|
||||
archiveresult = None
|
||||
|
||||
snapshot_id = record.get('snapshot_id')
|
||||
plugin_name = record.get('plugin')
|
||||
snapshot_id = record.get("snapshot_id")
|
||||
plugin_name = record.get("plugin")
|
||||
snapshot = None
|
||||
if archiveresult:
|
||||
if archiveresult.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED, ArchiveResult.StatusChoices.NORESULTS, ArchiveResult.StatusChoices.BACKOFF]:
|
||||
if archiveresult.status in [
|
||||
ArchiveResult.StatusChoices.FAILED,
|
||||
ArchiveResult.StatusChoices.SKIPPED,
|
||||
ArchiveResult.StatusChoices.NORESULTS,
|
||||
ArchiveResult.StatusChoices.BACKOFF,
|
||||
]:
|
||||
archiveresult.reset_for_retry()
|
||||
snapshot = archiveresult.snapshot
|
||||
plugin_name = plugin_name or archiveresult.plugin
|
||||
@@ -167,12 +172,12 @@ def process_stdin_records() -> int:
|
||||
snapshot.retry_at = timezone.now()
|
||||
if snapshot.status != Snapshot.StatusChoices.STARTED:
|
||||
snapshot.status = Snapshot.StatusChoices.QUEUED
|
||||
snapshot.save(update_fields=['status', 'retry_at', 'modified_at'])
|
||||
snapshot.save(update_fields=["status", "retry_at", "modified_at"])
|
||||
crawl = snapshot.crawl
|
||||
crawl.retry_at = timezone.now()
|
||||
if crawl.status != Crawl.StatusChoices.STARTED:
|
||||
crawl.status = Crawl.StatusChoices.QUEUED
|
||||
crawl.save(update_fields=['status', 'retry_at', 'modified_at'])
|
||||
crawl.save(update_fields=["status", "retry_at", "modified_at"])
|
||||
crawl_id = str(snapshot.crawl_id)
|
||||
snapshot_ids_by_crawl[crawl_id].add(str(snapshot.id))
|
||||
if plugin_name:
|
||||
@@ -203,7 +208,7 @@ def process_stdin_records() -> int:
|
||||
output_records.append(record)
|
||||
|
||||
except Exception as e:
|
||||
rprint(f'[yellow]Error processing record: {e}[/yellow]', file=sys.stderr)
|
||||
rprint(f"[yellow]Error processing record: {e}[/yellow]", file=sys.stderr)
|
||||
continue
|
||||
|
||||
# Output all processed records (for chaining)
|
||||
@@ -212,10 +217,10 @@ def process_stdin_records() -> int:
|
||||
write_record(rec)
|
||||
|
||||
if queued_count == 0:
|
||||
rprint('[yellow]No records to process[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No records to process[/yellow]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
rprint(f'[blue]Processing {queued_count} records...[/blue]', file=sys.stderr)
|
||||
rprint(f"[blue]Processing {queued_count} records...[/blue]", file=sys.stderr)
|
||||
|
||||
for binary_id in binary_ids:
|
||||
run_binary(binary_id)
|
||||
@@ -245,13 +250,14 @@ def run_runner(daemon: bool = False) -> int:
|
||||
from archivebox.services.runner import recover_orphaned_crawls, recover_orphaned_snapshots, run_pending_crawls
|
||||
|
||||
Process.cleanup_stale_running()
|
||||
Process.cleanup_orphaned_workers()
|
||||
recover_orphaned_snapshots()
|
||||
recover_orphaned_crawls()
|
||||
Machine.current()
|
||||
current = Process.current()
|
||||
if current.process_type != Process.TypeChoices.ORCHESTRATOR:
|
||||
current.process_type = Process.TypeChoices.ORCHESTRATOR
|
||||
current.save(update_fields=['process_type', 'modified_at'])
|
||||
current.save(update_fields=["process_type", "modified_at"])
|
||||
|
||||
try:
|
||||
run_pending_crawls(daemon=daemon)
|
||||
@@ -259,21 +265,21 @@ def run_runner(daemon: bool = False) -> int:
|
||||
except KeyboardInterrupt:
|
||||
return 0
|
||||
except Exception as e:
|
||||
rprint(f'[red]Runner error: {type(e).__name__}: {e}[/red]', file=sys.stderr)
|
||||
rprint(f"[red]Runner error: {type(e).__name__}: {e}[/red]", file=sys.stderr)
|
||||
return 1
|
||||
finally:
|
||||
current.refresh_from_db()
|
||||
if current.status != Process.StatusChoices.EXITED:
|
||||
current.status = Process.StatusChoices.EXITED
|
||||
current.ended_at = current.ended_at or timezone.now()
|
||||
current.save(update_fields=['status', 'ended_at', 'modified_at'])
|
||||
current.save(update_fields=["status", "ended_at", "modified_at"])
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option('--daemon', '-d', is_flag=True, help="Run forever (don't exit on idle)")
|
||||
@click.option('--crawl-id', help="Run the crawl runner for a specific crawl only")
|
||||
@click.option('--snapshot-id', help="Run one snapshot through its crawl")
|
||||
@click.option('--binary-id', help="Run one queued binary install directly on the bus")
|
||||
@click.option("--daemon", "-d", is_flag=True, help="Run forever (don't exit on idle)")
|
||||
@click.option("--crawl-id", help="Run the crawl runner for a specific crawl only")
|
||||
@click.option("--snapshot-id", help="Run one snapshot through its crawl")
|
||||
@click.option("--binary-id", help="Run one queued binary install directly on the bus")
|
||||
def main(daemon: bool, crawl_id: str, snapshot_id: str, binary_id: str):
|
||||
"""
|
||||
Process queued work.
|
||||
@@ -297,21 +303,24 @@ def main(daemon: bool, crawl_id: str, snapshot_id: str, binary_id: str):
|
||||
except KeyboardInterrupt:
|
||||
sys.exit(0)
|
||||
except Exception as e:
|
||||
rprint(f'[red]Runner error: {type(e).__name__}: {e}[/red]', file=sys.stderr)
|
||||
rprint(f"[red]Runner error: {type(e).__name__}: {e}[/red]", file=sys.stderr)
|
||||
import traceback
|
||||
|
||||
traceback.print_exc()
|
||||
sys.exit(1)
|
||||
|
||||
if crawl_id:
|
||||
try:
|
||||
from archivebox.services.runner import run_crawl
|
||||
|
||||
run_crawl(crawl_id)
|
||||
sys.exit(0)
|
||||
except KeyboardInterrupt:
|
||||
sys.exit(0)
|
||||
except Exception as e:
|
||||
rprint(f'[red]Runner error: {type(e).__name__}: {e}[/red]', file=sys.stderr)
|
||||
rprint(f"[red]Runner error: {type(e).__name__}: {e}[/red]", file=sys.stderr)
|
||||
import traceback
|
||||
|
||||
traceback.print_exc()
|
||||
sys.exit(1)
|
||||
|
||||
@@ -333,17 +342,18 @@ def run_snapshot_worker(snapshot_id: str) -> int:
|
||||
from archivebox.services.runner import run_crawl
|
||||
|
||||
try:
|
||||
snapshot = Snapshot.objects.select_related('crawl').get(id=snapshot_id)
|
||||
snapshot = Snapshot.objects.select_related("crawl").get(id=snapshot_id)
|
||||
run_crawl(str(snapshot.crawl_id), snapshot_ids=[str(snapshot.id)])
|
||||
return 0
|
||||
except KeyboardInterrupt:
|
||||
return 0
|
||||
except Exception as e:
|
||||
rprint(f'[red]Runner error: {type(e).__name__}: {e}[/red]', file=sys.stderr)
|
||||
rprint(f"[red]Runner error: {type(e).__name__}: {e}[/red]", file=sys.stderr)
|
||||
import traceback
|
||||
|
||||
traceback.print_exc()
|
||||
return 1
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__package__ = "archivebox.cli"
|
||||
|
||||
import rich_click as click
|
||||
from rich import print
|
||||
@@ -10,18 +10,20 @@ from archivebox.config.common import ARCHIVING_CONFIG
|
||||
|
||||
|
||||
@enforce_types
|
||||
def schedule(add: bool = False,
|
||||
show: bool = False,
|
||||
clear: bool = False,
|
||||
foreground: bool = False,
|
||||
run_all: bool = False,
|
||||
quiet: bool = False,
|
||||
every: str | None = None,
|
||||
tag: str = '',
|
||||
depth: int | str = 0,
|
||||
overwrite: bool = False,
|
||||
update: bool = not ARCHIVING_CONFIG.ONLY_NEW,
|
||||
import_path: str | None = None):
|
||||
def schedule(
|
||||
add: bool = False,
|
||||
show: bool = False,
|
||||
clear: bool = False,
|
||||
foreground: bool = False,
|
||||
run_all: bool = False,
|
||||
quiet: bool = False,
|
||||
every: str | None = None,
|
||||
tag: str = "",
|
||||
depth: int | str = 0,
|
||||
overwrite: bool = False,
|
||||
update: bool = not ARCHIVING_CONFIG.ONLY_NEW,
|
||||
import_path: str | None = None,
|
||||
):
|
||||
"""Manage database-backed scheduled crawls processed by the crawl runner."""
|
||||
|
||||
from django.utils import timezone
|
||||
@@ -33,55 +35,51 @@ def schedule(add: bool = False,
|
||||
|
||||
depth = int(depth)
|
||||
result: dict[str, object] = {
|
||||
'created_schedule_ids': [],
|
||||
'disabled_count': 0,
|
||||
'run_all_enqueued': 0,
|
||||
'active_schedule_ids': [],
|
||||
"created_schedule_ids": [],
|
||||
"disabled_count": 0,
|
||||
"run_all_enqueued": 0,
|
||||
"active_schedule_ids": [],
|
||||
}
|
||||
|
||||
def _active_schedules():
|
||||
return CrawlSchedule.objects.filter(is_enabled=True).select_related('template').order_by('created_at')
|
||||
return CrawlSchedule.objects.filter(is_enabled=True).select_related("template").order_by("created_at")
|
||||
|
||||
if clear:
|
||||
disabled_count = CrawlSchedule.objects.filter(is_enabled=True).update(
|
||||
is_enabled=False,
|
||||
modified_at=timezone.now(),
|
||||
)
|
||||
result['disabled_count'] = disabled_count
|
||||
print(f'[green]\\[√] Disabled {disabled_count} scheduled crawl(s).[/green]')
|
||||
result["disabled_count"] = disabled_count
|
||||
print(f"[green]\\[√] Disabled {disabled_count} scheduled crawl(s).[/green]")
|
||||
|
||||
if every or add:
|
||||
schedule_str = (every or 'day').strip()
|
||||
schedule_str = (every or "day").strip()
|
||||
validate_schedule(schedule_str)
|
||||
|
||||
created_by_id = get_or_create_system_user_pk()
|
||||
is_update_schedule = not import_path
|
||||
template_urls = import_path or 'archivebox://update'
|
||||
template_label = (
|
||||
f'Scheduled import: {template_urls}'
|
||||
if import_path else
|
||||
'Scheduled ArchiveBox update'
|
||||
)[:64]
|
||||
template_urls = import_path or "archivebox://update"
|
||||
template_label = (f"Scheduled import: {template_urls}" if import_path else "Scheduled ArchiveBox update")[:64]
|
||||
template_notes = (
|
||||
f'Created by archivebox schedule for {template_urls}'
|
||||
if import_path else
|
||||
'Created by archivebox schedule to queue recurring archivebox://update maintenance crawls.'
|
||||
f"Created by archivebox schedule for {template_urls}"
|
||||
if import_path
|
||||
else "Created by archivebox schedule to queue recurring archivebox://update maintenance crawls."
|
||||
)
|
||||
|
||||
template = Crawl.objects.create(
|
||||
urls=template_urls,
|
||||
max_depth=0 if is_update_schedule else depth,
|
||||
tags_str='' if is_update_schedule else tag,
|
||||
tags_str="" if is_update_schedule else tag,
|
||||
label=template_label,
|
||||
notes=template_notes,
|
||||
created_by_id=created_by_id,
|
||||
status=Crawl.StatusChoices.SEALED,
|
||||
retry_at=None,
|
||||
config={
|
||||
'ONLY_NEW': not update,
|
||||
'OVERWRITE': overwrite,
|
||||
'DEPTH': 0 if is_update_schedule else depth,
|
||||
'SCHEDULE_KIND': 'update' if is_update_schedule else 'crawl',
|
||||
"ONLY_NEW": not update,
|
||||
"OVERWRITE": overwrite,
|
||||
"DEPTH": 0 if is_update_schedule else depth,
|
||||
"SCHEDULE_KIND": "update" if is_update_schedule else "crawl",
|
||||
},
|
||||
)
|
||||
crawl_schedule = CrawlSchedule.objects.create(
|
||||
@@ -92,31 +90,31 @@ def schedule(add: bool = False,
|
||||
notes=template_notes,
|
||||
created_by_id=created_by_id,
|
||||
)
|
||||
result['created_schedule_ids'] = [str(crawl_schedule.id)]
|
||||
result["created_schedule_ids"] = [str(crawl_schedule.id)]
|
||||
|
||||
schedule_type = 'maintenance update' if is_update_schedule else 'crawl'
|
||||
print(f'[green]\\[√] Created scheduled {schedule_type}.[/green]')
|
||||
print(f' id={crawl_schedule.id}')
|
||||
print(f' every={crawl_schedule.schedule}')
|
||||
print(f' next_run={crawl_schedule.next_run_at.isoformat()}')
|
||||
schedule_type = "maintenance update" if is_update_schedule else "crawl"
|
||||
print(f"[green]\\[√] Created scheduled {schedule_type}.[/green]")
|
||||
print(f" id={crawl_schedule.id}")
|
||||
print(f" every={crawl_schedule.schedule}")
|
||||
print(f" next_run={crawl_schedule.next_run_at.isoformat()}")
|
||||
if import_path:
|
||||
print(f' source={import_path}')
|
||||
print(f" source={import_path}")
|
||||
|
||||
schedules = list(_active_schedules())
|
||||
result['active_schedule_ids'] = [str(schedule.id) for schedule in schedules]
|
||||
result["active_schedule_ids"] = [str(schedule.id) for schedule in schedules]
|
||||
|
||||
if show:
|
||||
if schedules:
|
||||
print(f'[green]\\[*] Active scheduled crawls: {len(schedules)}[/green]')
|
||||
print(f"[green]\\[*] Active scheduled crawls: {len(schedules)}[/green]")
|
||||
for scheduled_crawl in schedules:
|
||||
template = scheduled_crawl.template
|
||||
print(
|
||||
f' - id={scheduled_crawl.id} every={scheduled_crawl.schedule} '
|
||||
f'next_run={scheduled_crawl.next_run_at.isoformat()} '
|
||||
f'source={template.urls.splitlines()[0] if template.urls else ""}'
|
||||
f" - id={scheduled_crawl.id} every={scheduled_crawl.schedule} "
|
||||
f"next_run={scheduled_crawl.next_run_at.isoformat()} "
|
||||
f"source={template.urls.splitlines()[0] if template.urls else ''}",
|
||||
)
|
||||
else:
|
||||
print('[yellow]\\[*] No scheduled crawls are enabled.[/yellow]')
|
||||
print("[yellow]\\[*] No scheduled crawls are enabled.[/yellow]")
|
||||
|
||||
if run_all:
|
||||
enqueued = 0
|
||||
@@ -124,13 +122,17 @@ def schedule(add: bool = False,
|
||||
for scheduled_crawl in schedules:
|
||||
scheduled_crawl.enqueue(queued_at=now)
|
||||
enqueued += 1
|
||||
result['run_all_enqueued'] = enqueued
|
||||
print(f'[green]\\[*] Enqueued {enqueued} scheduled crawl(s) immediately.[/green]')
|
||||
result["run_all_enqueued"] = enqueued
|
||||
print(f"[green]\\[*] Enqueued {enqueued} scheduled crawl(s) immediately.[/green]")
|
||||
if enqueued:
|
||||
print('[yellow]\\[*] Start `archivebox server`, `archivebox run --daemon`, or `archivebox schedule --foreground` to process the queued crawls.[/yellow]')
|
||||
print(
|
||||
"[yellow]\\[*] Start `archivebox server`, `archivebox run --daemon`, or `archivebox schedule --foreground` to process the queued crawls.[/yellow]",
|
||||
)
|
||||
|
||||
if foreground:
|
||||
print('[green]\\[*] Starting global crawl runner in foreground mode. It will materialize scheduled crawls and process queued work.[/green]')
|
||||
print(
|
||||
"[green]\\[*] Starting global crawl runner in foreground mode. It will materialize scheduled crawls and process queued work.[/green]",
|
||||
)
|
||||
run_pending_crawls(daemon=True)
|
||||
|
||||
if quiet:
|
||||
@@ -138,33 +140,38 @@ def schedule(add: bool = False,
|
||||
|
||||
if not any((every, add, show, clear, foreground, run_all)):
|
||||
if schedules:
|
||||
print('[green]\\[*] Active scheduled crawls:[/green]')
|
||||
print("[green]\\[*] Active scheduled crawls:[/green]")
|
||||
for scheduled_crawl in schedules:
|
||||
print(f' - {scheduled_crawl.id} every={scheduled_crawl.schedule} next_run={scheduled_crawl.next_run_at.isoformat()}')
|
||||
print(f" - {scheduled_crawl.id} every={scheduled_crawl.schedule} next_run={scheduled_crawl.next_run_at.isoformat()}")
|
||||
else:
|
||||
print('[yellow]\\[*] No scheduled crawls are enabled.[/yellow]')
|
||||
print("[yellow]\\[*] No scheduled crawls are enabled.[/yellow]")
|
||||
|
||||
return result
|
||||
|
||||
|
||||
@click.command()
@click.option("--quiet", "-q", is_flag=True, help="Return structured results without extra summary output")
@click.option("--add", is_flag=True, help="Create a new scheduled crawl")
@click.option("--every", type=str, help='Run on an alias like daily/weekly/monthly or a cron expression such as "0 */6 * * *"')
@click.option("--tag", "-t", default="", help="Comma-separated tags to apply to scheduled crawl snapshots")
@click.option(
    "--depth",
    # Choices are the strings "0".."4"; schedule() converts the value with int().
    type=click.Choice(list(map(str, range(5)))),
    default="0",
    help="Recursively archive linked pages up to N hops away",
)
@click.option("--overwrite", is_flag=True, help="Overwrite existing data if URLs have been archived previously")
@click.option("--update", is_flag=True, help="Retry previously failed/skipped URLs when scheduled crawls run")
@click.option("--clear", is_flag=True, help="Disable all currently enabled schedules")
@click.option("--show", is_flag=True, help="Print all currently enabled schedules")
@click.option("--foreground", "-f", is_flag=True, help="Run the global crawl runner in the foreground (no crontab required)")
@click.option("--run-all", is_flag=True, help="Enqueue all enabled schedules immediately and process them once")
@click.argument("import_path", required=False)
@docstring(schedule.__doc__)
def main(**kwargs):
    """Manage database-backed scheduled crawls processed by the crawl runner."""
    # Thin CLI shim: parsed options are forwarded as-is to schedule(); its
    # returned result dict is intentionally not used here.
    schedule(**kwargs)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -1,11 +1,12 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox search'
|
||||
__package__ = "archivebox.cli"
|
||||
__command__ = "archivebox search"
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Callable
|
||||
from typing import TYPE_CHECKING
|
||||
from collections.abc import Callable
|
||||
|
||||
import rich_click as click
|
||||
|
||||
@@ -20,30 +21,28 @@ if TYPE_CHECKING:
|
||||
|
||||
def _domain_filter(pattern: str) -> Q:
    """Build a Q matching URLs that start with http://, https:// or ftp:// plus *pattern*."""
    # NOTE(review): this is a case-insensitive *prefix* match, so a pattern of
    # "example.com" also matches "example.com.other.org" -- confirm whether a
    # host-boundary check is wanted.
    via_http = Q(url__istartswith=f"http://{pattern}")
    via_https = Q(url__istartswith=f"https://{pattern}")
    via_ftp = Q(url__istartswith=f"ftp://{pattern}")
    return via_http | via_https | via_ftp


# Filter types for URL matching
# Maps each filter-type name to a builder that turns one user-supplied pattern
# into a Django Q object.
LINK_FILTERS: dict[str, Callable[[str], Q]] = {
    "exact": lambda pattern: Q(url=pattern),
    "substring": lambda pattern: Q(url__icontains=pattern),
    "regex": lambda pattern: Q(url__iregex=pattern),
    "domain": _domain_filter,
    "tag": lambda pattern: Q(tags__name=pattern),
    "timestamp": lambda pattern: Q(timestamp=pattern),
}
|
||||
|
||||
STATUS_CHOICES = ['indexed', 'archived', 'unarchived']
|
||||
STATUS_CHOICES = ["indexed", "archived", "unarchived"]
|
||||
|
||||
|
||||
def _apply_pattern_filters(
|
||||
snapshots: QuerySet['Snapshot', 'Snapshot'],
|
||||
snapshots: QuerySet["Snapshot", "Snapshot"],
|
||||
filter_patterns: list[str],
|
||||
filter_type: str,
|
||||
) -> QuerySet['Snapshot', 'Snapshot']:
|
||||
) -> QuerySet["Snapshot", "Snapshot"]:
|
||||
filter_builder = LINK_FILTERS.get(filter_type)
|
||||
if filter_builder is None:
|
||||
stderr()
|
||||
stderr(f'[X] Got invalid pattern for --filter-type={filter_type}', color='red')
|
||||
stderr(f"[X] Got invalid pattern for --filter-type={filter_type}", color="red")
|
||||
raise SystemExit(2)
|
||||
|
||||
query = Q()
|
||||
@@ -53,7 +52,7 @@ def _apply_pattern_filters(
|
||||
|
||||
|
||||
def _snapshots_to_json(
|
||||
snapshots: QuerySet['Snapshot', 'Snapshot'],
|
||||
snapshots: QuerySet["Snapshot", "Snapshot"],
|
||||
*,
|
||||
with_headers: bool,
|
||||
) -> str:
|
||||
@@ -63,31 +62,35 @@ def _snapshots_to_json(
|
||||
from archivebox.config.common import SERVER_CONFIG
|
||||
from archivebox.misc.util import to_json
|
||||
|
||||
main_index_header = {
|
||||
'info': 'This is an index of site data archived by ArchiveBox: The self-hosted web archive.',
|
||||
'schema': 'archivebox.index.json',
|
||||
'copyright_info': SERVER_CONFIG.FOOTER_INFO,
|
||||
'meta': {
|
||||
'project': 'ArchiveBox',
|
||||
'version': VERSION,
|
||||
'git_sha': VERSION,
|
||||
'website': 'https://ArchiveBox.io',
|
||||
'docs': 'https://github.com/ArchiveBox/ArchiveBox/wiki',
|
||||
'source': 'https://github.com/ArchiveBox/ArchiveBox',
|
||||
'issues': 'https://github.com/ArchiveBox/ArchiveBox/issues',
|
||||
'dependencies': {},
|
||||
},
|
||||
} if with_headers else {}
|
||||
main_index_header = (
|
||||
{
|
||||
"info": "This is an index of site data archived by ArchiveBox: The self-hosted web archive.",
|
||||
"schema": "archivebox.index.json",
|
||||
"copyright_info": SERVER_CONFIG.FOOTER_INFO,
|
||||
"meta": {
|
||||
"project": "ArchiveBox",
|
||||
"version": VERSION,
|
||||
"git_sha": VERSION,
|
||||
"website": "https://ArchiveBox.io",
|
||||
"docs": "https://github.com/ArchiveBox/ArchiveBox/wiki",
|
||||
"source": "https://github.com/ArchiveBox/ArchiveBox",
|
||||
"issues": "https://github.com/ArchiveBox/ArchiveBox/issues",
|
||||
"dependencies": {},
|
||||
},
|
||||
}
|
||||
if with_headers
|
||||
else {}
|
||||
)
|
||||
|
||||
snapshot_dicts = [snapshot.to_dict(extended=True) for snapshot in snapshots.iterator(chunk_size=500)]
|
||||
output: dict[str, object] | list[dict[str, object]]
|
||||
if with_headers:
|
||||
output = {
|
||||
**main_index_header,
|
||||
'num_links': len(snapshot_dicts),
|
||||
'updated': datetime.now(tz.utc),
|
||||
'last_run_cmd': sys.argv,
|
||||
'links': snapshot_dicts,
|
||||
"num_links": len(snapshot_dicts),
|
||||
"updated": datetime.now(tz.utc),
|
||||
"last_run_cmd": sys.argv,
|
||||
"links": snapshot_dicts,
|
||||
}
|
||||
else:
|
||||
output = snapshot_dicts
|
||||
@@ -96,18 +99,18 @@ def _snapshots_to_json(
|
||||
|
||||
|
||||
def _snapshots_to_csv(
    snapshots: QuerySet["Snapshot", "Snapshot"],
    *,
    cols: list[str],
    with_headers: bool,
) -> str:
    """Render *snapshots* as comma-separated text, one line per snapshot.

    Args:
        snapshots: Queryset to export; consumed via ``.iterator(chunk_size=500)``.
        cols: Column names handed to each ``snapshot.to_csv()`` call and, when
            requested, joined with commas to form the header line.
        with_headers: When true, emit the header line before the data rows.

    Returns:
        The lines joined with ``"\n"`` (no trailing newline). Without headers
        the output begins directly with the first data row.
    """
    # Only add a header entry when one was requested. Joining an empty-string
    # placeholder instead would put a stray blank line at the top of
    # header-less output.
    lines = [",".join(cols)] if with_headers else []
    lines.extend(snapshot.to_csv(cols=cols, separator=",") for snapshot in snapshots.iterator(chunk_size=500))
    return "\n".join(lines)
|
||||
|
||||
|
||||
def _snapshots_to_html(
|
||||
snapshots: QuerySet['Snapshot', 'Snapshot'],
|
||||
snapshots: QuerySet["Snapshot", "Snapshot"],
|
||||
*,
|
||||
with_headers: bool,
|
||||
) -> str:
|
||||
@@ -119,26 +122,31 @@ def _snapshots_to_html(
|
||||
from archivebox.config.common import SERVER_CONFIG
|
||||
from archivebox.config.version import get_COMMIT_HASH
|
||||
|
||||
template = 'static_index.html' if with_headers else 'minimal_index.html'
|
||||
template = "static_index.html" if with_headers else "minimal_index.html"
|
||||
snapshot_list = list(snapshots.iterator(chunk_size=500))
|
||||
|
||||
return render_to_string(template, {
|
||||
'version': VERSION,
|
||||
'git_sha': get_COMMIT_HASH() or VERSION,
|
||||
'num_links': str(len(snapshot_list)),
|
||||
'date_updated': datetime.now(tz.utc).strftime('%Y-%m-%d'),
|
||||
'time_updated': datetime.now(tz.utc).strftime('%Y-%m-%d %H:%M'),
|
||||
'links': snapshot_list,
|
||||
'FOOTER_INFO': SERVER_CONFIG.FOOTER_INFO,
|
||||
})
|
||||
return render_to_string(
|
||||
template,
|
||||
{
|
||||
"version": VERSION,
|
||||
"git_sha": get_COMMIT_HASH() or VERSION,
|
||||
"num_links": str(len(snapshot_list)),
|
||||
"date_updated": datetime.now(tz.utc).strftime("%Y-%m-%d"),
|
||||
"time_updated": datetime.now(tz.utc).strftime("%Y-%m-%d %H:%M"),
|
||||
"links": snapshot_list,
|
||||
"FOOTER_INFO": SERVER_CONFIG.FOOTER_INFO,
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
def get_snapshots(snapshots: QuerySet['Snapshot', 'Snapshot'] | None=None,
|
||||
filter_patterns: list[str] | None=None,
|
||||
filter_type: str='substring',
|
||||
after: float | None=None,
|
||||
before: float | None=None,
|
||||
out_dir: Path=DATA_DIR) -> QuerySet['Snapshot', 'Snapshot']:
|
||||
def get_snapshots(
|
||||
snapshots: QuerySet["Snapshot", "Snapshot"] | None = None,
|
||||
filter_patterns: list[str] | None = None,
|
||||
filter_type: str = "substring",
|
||||
after: float | None = None,
|
||||
before: float | None = None,
|
||||
out_dir: Path = DATA_DIR,
|
||||
) -> QuerySet["Snapshot", "Snapshot"]:
|
||||
"""Filter and return Snapshots matching the given criteria."""
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
@@ -155,29 +163,31 @@ def get_snapshots(snapshots: QuerySet['Snapshot', 'Snapshot'] | None=None,
|
||||
result = _apply_pattern_filters(result, filter_patterns, filter_type)
|
||||
|
||||
# Prefetch crawl relationship to avoid N+1 queries when accessing output_dir
|
||||
result = result.select_related('crawl', 'crawl__created_by')
|
||||
result = result.select_related("crawl", "crawl__created_by")
|
||||
|
||||
if not result.exists():
|
||||
stderr('[!] No Snapshots matched your filters:', filter_patterns, f'({filter_type})', color='lightyellow')
|
||||
stderr("[!] No Snapshots matched your filters:", filter_patterns, f"({filter_type})", color="lightyellow")
|
||||
|
||||
return result
|
||||
|
||||
|
||||
@enforce_types
|
||||
def search(filter_patterns: list[str] | None=None,
|
||||
filter_type: str='substring',
|
||||
status: str='indexed',
|
||||
before: float | None=None,
|
||||
after: float | None=None,
|
||||
sort: str | None=None,
|
||||
json: bool=False,
|
||||
html: bool=False,
|
||||
csv: str | None=None,
|
||||
with_headers: bool=False):
|
||||
def search(
|
||||
filter_patterns: list[str] | None = None,
|
||||
filter_type: str = "substring",
|
||||
status: str = "indexed",
|
||||
before: float | None = None,
|
||||
after: float | None = None,
|
||||
sort: str | None = None,
|
||||
json: bool = False,
|
||||
html: bool = False,
|
||||
csv: str | None = None,
|
||||
with_headers: bool = False,
|
||||
):
|
||||
"""List, filter, and export information about archive entries"""
|
||||
|
||||
if with_headers and not (json or html or csv):
|
||||
stderr('[X] --with-headers requires --json, --html or --csv\n', color='red')
|
||||
stderr("[X] --with-headers requires --json, --html or --csv\n", color="red")
|
||||
raise SystemExit(2)
|
||||
|
||||
# Query DB directly - no filesystem scanning
|
||||
@@ -189,9 +199,9 @@ def search(filter_patterns: list[str] | None=None,
|
||||
)
|
||||
|
||||
# Apply status filter
|
||||
if status == 'archived':
|
||||
if status == "archived":
|
||||
snapshots = snapshots.filter(downloaded_at__isnull=False)
|
||||
elif status == 'unarchived':
|
||||
elif status == "unarchived":
|
||||
snapshots = snapshots.filter(downloaded_at__isnull=True)
|
||||
# 'indexed' = all snapshots (no filter)
|
||||
|
||||
@@ -204,9 +214,10 @@ def search(filter_patterns: list[str] | None=None,
|
||||
elif html:
|
||||
output = _snapshots_to_html(snapshots, with_headers=with_headers)
|
||||
elif csv:
|
||||
output = _snapshots_to_csv(snapshots, cols=csv.split(','), with_headers=with_headers)
|
||||
output = _snapshots_to_csv(snapshots, cols=csv.split(","), with_headers=with_headers)
|
||||
else:
|
||||
from archivebox.misc.logging_util import printable_folders
|
||||
|
||||
# Convert to dict for printable_folders
|
||||
folders: dict[str, Snapshot | None] = {str(snapshot.output_dir): snapshot for snapshot in snapshots}
|
||||
output = printable_folders(folders, with_headers)
|
||||
@@ -214,28 +225,33 @@ def search(filter_patterns: list[str] | None=None,
|
||||
# Structured exports must be written directly to stdout.
|
||||
# rich.print() reflows long lines to console width, which corrupts JSON/CSV/HTML output.
|
||||
sys.stdout.write(output)
|
||||
if not output.endswith('\n'):
|
||||
sys.stdout.write('\n')
|
||||
if not output.endswith("\n"):
|
||||
sys.stdout.write("\n")
|
||||
return output
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option('--filter-type', '-f', type=click.Choice(['search', *LINK_FILTERS.keys()]), default='substring', help='Pattern matching type for filtering URLs')
|
||||
@click.option('--status', '-s', type=click.Choice(STATUS_CHOICES), default='indexed', help='List snapshots with the given status')
|
||||
@click.option('--before', '-b', type=float, help='List snapshots bookmarked before the given UNIX timestamp')
|
||||
@click.option('--after', '-a', type=float, help='List snapshots bookmarked after the given UNIX timestamp')
|
||||
@click.option('--sort', '-o', type=str, help='Field to sort by, e.g. url, created_at, bookmarked_at, downloaded_at')
|
||||
@click.option('--json', '-J', is_flag=True, help='Print output in JSON format')
|
||||
@click.option('--html', '-M', is_flag=True, help='Print output in HTML format (suitable for viewing statically without a server)')
|
||||
@click.option('--csv', '-C', type=str, help='Print output as CSV with the provided fields, e.g.: created_at,url,title')
|
||||
@click.option('--with-headers', '-H', is_flag=True, help='Include extra CSV/HTML headers in the output')
|
||||
@click.help_option('--help', '-h')
|
||||
@click.argument('filter_patterns', nargs=-1)
|
||||
@click.option(
|
||||
"--filter-type",
|
||||
"-f",
|
||||
type=click.Choice(["search", *LINK_FILTERS.keys()]),
|
||||
default="substring",
|
||||
help="Pattern matching type for filtering URLs",
|
||||
)
|
||||
@click.option("--status", "-s", type=click.Choice(STATUS_CHOICES), default="indexed", help="List snapshots with the given status")
|
||||
@click.option("--before", "-b", type=float, help="List snapshots bookmarked before the given UNIX timestamp")
|
||||
@click.option("--after", "-a", type=float, help="List snapshots bookmarked after the given UNIX timestamp")
|
||||
@click.option("--sort", "-o", type=str, help="Field to sort by, e.g. url, created_at, bookmarked_at, downloaded_at")
|
||||
@click.option("--json", "-J", is_flag=True, help="Print output in JSON format")
|
||||
@click.option("--html", "-M", is_flag=True, help="Print output in HTML format (suitable for viewing statically without a server)")
|
||||
@click.option("--csv", "-C", type=str, help="Print output as CSV with the provided fields, e.g.: created_at,url,title")
|
||||
@click.option("--with-headers", "-H", is_flag=True, help="Include extra CSV/HTML headers in the output")
|
||||
@click.help_option("--help", "-h")
|
||||
@click.argument("filter_patterns", nargs=-1)
|
||||
@docstring(search.__doc__)
|
||||
def main(**kwargs):
|
||||
return search(**kwargs)
|
||||
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__package__ = "archivebox.cli"
|
||||
|
||||
from typing import Iterable
|
||||
from collections.abc import Iterable
|
||||
import sys
|
||||
|
||||
import rich_click as click
|
||||
@@ -15,20 +15,23 @@ from archivebox.config.common import SERVER_CONFIG
|
||||
def stop_existing_background_runner(*, machine, process_model, supervisor=None, stop_worker_fn=None, log=print) -> int:
|
||||
"""Stop any existing orchestrator process so the server can take ownership."""
|
||||
process_model.cleanup_stale_running(machine=machine)
|
||||
process_model.cleanup_orphaned_workers()
|
||||
|
||||
running_runners = list(process_model.objects.filter(
|
||||
machine=machine,
|
||||
status=process_model.StatusChoices.RUNNING,
|
||||
process_type=process_model.TypeChoices.ORCHESTRATOR,
|
||||
).order_by('created_at'))
|
||||
running_runners = list(
|
||||
process_model.objects.filter(
|
||||
machine=machine,
|
||||
status=process_model.StatusChoices.RUNNING,
|
||||
process_type=process_model.TypeChoices.ORCHESTRATOR,
|
||||
).order_by("created_at"),
|
||||
)
|
||||
|
||||
if not running_runners:
|
||||
return 0
|
||||
|
||||
log('[yellow][*] Stopping existing ArchiveBox background runner...[/yellow]')
|
||||
log("[yellow][*] Stopping existing ArchiveBox background runner...[/yellow]")
|
||||
|
||||
if supervisor is not None and stop_worker_fn is not None:
|
||||
for worker_name in ('worker_runner', 'worker_runner_watch'):
|
||||
for worker_name in ("worker_runner", "worker_runner_watch"):
|
||||
try:
|
||||
stop_worker_fn(supervisor, worker_name)
|
||||
except Exception:
|
||||
@@ -47,23 +50,70 @@ def stop_existing_background_runner(*, machine, process_model, supervisor=None,
|
||||
return len(running_runners)
|
||||
|
||||
|
||||
def _read_supervisor_worker_command(worker_name: str) -> str:
    """Return the ``command=`` value from a worker's supervisord config file.

    The config is looked up as ``<worker_name>.conf`` inside the
    ``WORKERS_DIR_NAME`` directory that sits next to the supervisord socket
    file.

    Args:
        worker_name: Name of the supervisord worker, e.g. ``"worker_daphne"``.

    Returns:
        The stripped text following ``command=`` on the first line that starts
        with that prefix, or ``""`` when the config file does not exist or has
        no such line.
    """
    from archivebox.workers.supervisord_util import WORKERS_DIR_NAME, get_sock_file

    conf_path = get_sock_file().parent / WORKERS_DIR_NAME / f"{worker_name}.conf"
    if not conf_path.exists():
        return ""

    prefix = "command="
    # Lazily scan the file; only the first matching line is ever consumed.
    commands = (
        entry[len(prefix):].strip()
        for entry in conf_path.read_text().splitlines()
        if entry.startswith(prefix)
    )
    return next(commands, "")
|
||||
|
||||
|
||||
def _worker_command_matches_bind(command: str, host: str, port: str) -> bool:
|
||||
if not command:
|
||||
return False
|
||||
return f"{host}:{port}" in command or (f"--bind={host}" in command and f"--port={port}" in command)
|
||||
|
||||
|
||||
def stop_existing_server_workers(*, supervisor, stop_worker_fn, host: str, port: str, log=print) -> int:
    """Stop running ArchiveBox web workers that already own the requested bind.

    Checks ``worker_runserver`` and ``worker_daphne`` in that order. A worker
    is stopped only when supervisord reports it as ``RUNNING`` and its
    configured command targets ``host:port``.

    Args:
        supervisor: Supervisord client exposing ``getProcessInfo(name)``; a
            falsy value means no worker is inspected.
        stop_worker_fn: Callable invoked as ``stop_worker_fn(supervisor, name)``
            for each worker to stop.
        host: Bind host the new server wants to use.
        port: Bind port the new server wants to use, as a string.
        log: Callable used to print the one-time takeover notice.

    Returns:
        The number of workers that were stopped.
    """
    stopped_count = 0

    for name in ("worker_runserver", "worker_daphne"):
        # Best-effort lookup: any failure talking to supervisord is treated
        # the same as "worker not running".
        try:
            info = supervisor.getProcessInfo(name) if supervisor else None
        except Exception:
            info = None

        is_running = isinstance(info, dict) and info.get("statename") == "RUNNING"
        if is_running and _worker_command_matches_bind(_read_supervisor_worker_command(name), host, port):
            if not stopped_count:
                # Announce the takeover once, before the first worker is stopped.
                log("[yellow][*] Taking over existing ArchiveBox web server on same port...[/yellow]")
            stop_worker_fn(supervisor, name)
            stopped_count += 1

    return stopped_count
|
||||
|
||||
|
||||
@enforce_types
|
||||
def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
|
||||
reload: bool=False,
|
||||
init: bool=False,
|
||||
debug: bool=False,
|
||||
daemonize: bool=False,
|
||||
nothreading: bool=False) -> None:
|
||||
def server(
|
||||
runserver_args: Iterable[str] = (SERVER_CONFIG.BIND_ADDR,),
|
||||
reload: bool = False,
|
||||
init: bool = False,
|
||||
debug: bool = False,
|
||||
daemonize: bool = False,
|
||||
nothreading: bool = False,
|
||||
) -> None:
|
||||
"""Run the ArchiveBox HTTP server"""
|
||||
|
||||
runserver_args = list(runserver_args)
|
||||
|
||||
|
||||
if init:
|
||||
from archivebox.cli.archivebox_init import init as archivebox_init
|
||||
|
||||
archivebox_init(quick=True)
|
||||
print()
|
||||
|
||||
from archivebox.misc.checks import check_data_folder
|
||||
|
||||
check_data_folder()
|
||||
|
||||
from archivebox.config.common import SHELL_CONFIG
|
||||
@@ -73,22 +123,24 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
|
||||
SHELL_CONFIG.DEBUG = True
|
||||
|
||||
from django.contrib.auth.models import User
|
||||
|
||||
if not User.objects.filter(is_superuser=True).exclude(username='system').exists():
|
||||
|
||||
if not User.objects.filter(is_superuser=True).exclude(username="system").exists():
|
||||
print()
|
||||
print('[violet]Hint:[/violet] To create an [bold]admin username & password[/bold] for the [deep_sky_blue3][underline][link=http://{host}:{port}/admin]Admin UI[/link][/underline][/deep_sky_blue3], run:')
|
||||
print(' [green]archivebox manage createsuperuser[/green]')
|
||||
print(
|
||||
"[violet]Hint:[/violet] To create an [bold]admin username & password[/bold] for the [deep_sky_blue3][underline][link=http://{host}:{port}/admin]Admin UI[/link][/underline][/deep_sky_blue3], run:",
|
||||
)
|
||||
print(" [green]archivebox manage createsuperuser[/green]")
|
||||
print()
|
||||
|
||||
host = '127.0.0.1'
|
||||
port = '8000'
|
||||
|
||||
host = "127.0.0.1"
|
||||
port = "8000"
|
||||
|
||||
try:
|
||||
host_and_port = [arg for arg in runserver_args if arg.replace('.', '').replace(':', '').isdigit()][0]
|
||||
if ':' in host_and_port:
|
||||
host, port = host_and_port.split(':')
|
||||
host_and_port = [arg for arg in runserver_args if arg.replace(".", "").replace(":", "").isdigit()][0]
|
||||
if ":" in host_and_port:
|
||||
host, port = host_and_port.split(":")
|
||||
else:
|
||||
if '.' in host_and_port:
|
||||
if "." in host_and_port:
|
||||
host = host_and_port
|
||||
else:
|
||||
port = host_and_port
|
||||
@@ -104,66 +156,80 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
|
||||
)
|
||||
from archivebox.machine.models import Machine, Process
|
||||
|
||||
# Check if port is already in use
|
||||
if is_port_in_use(host, int(port)):
|
||||
print(f'[red][X] Error: Port {port} is already in use[/red]')
|
||||
print(f' Another process (possibly daphne or runserver) is already listening on {host}:{port}')
|
||||
print(' Stop the conflicting process or choose a different port')
|
||||
sys.exit(1)
|
||||
|
||||
machine = Machine.current()
|
||||
supervisor = get_existing_supervisord_process()
|
||||
stop_existing_background_runner(
|
||||
machine=machine,
|
||||
process_model=Process,
|
||||
supervisor=get_existing_supervisord_process(),
|
||||
supervisor=supervisor,
|
||||
stop_worker_fn=stop_worker,
|
||||
)
|
||||
if supervisor:
|
||||
stop_existing_server_workers(
|
||||
supervisor=supervisor,
|
||||
stop_worker_fn=stop_worker,
|
||||
host=host,
|
||||
port=port,
|
||||
)
|
||||
|
||||
# Check if port is already in use
|
||||
if is_port_in_use(host, int(port)):
|
||||
print(f"[red][X] Error: Port {port} is already in use[/red]")
|
||||
print(f" Another process (possibly daphne or runserver) is already listening on {host}:{port}")
|
||||
print(" Stop the conflicting process or choose a different port")
|
||||
sys.exit(1)
|
||||
|
||||
supervisor = get_existing_supervisord_process()
|
||||
if supervisor:
|
||||
server_worker_name = 'worker_runserver' if run_in_debug else 'worker_daphne'
|
||||
server_worker_name = "worker_runserver" if run_in_debug else "worker_daphne"
|
||||
server_proc = get_worker(supervisor, server_worker_name)
|
||||
server_state = server_proc.get('statename') if isinstance(server_proc, dict) else None
|
||||
if server_state == 'RUNNING':
|
||||
runner_proc = get_worker(supervisor, 'worker_runner')
|
||||
runner_watch_proc = get_worker(supervisor, 'worker_runner_watch')
|
||||
runner_state = runner_proc.get('statename') if isinstance(runner_proc, dict) else None
|
||||
runner_watch_state = runner_watch_proc.get('statename') if isinstance(runner_watch_proc, dict) else None
|
||||
print('[red][X] Error: ArchiveBox server is already running[/red]')
|
||||
print(f' [green]√[/green] Web server ({server_worker_name}) is RUNNING on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
|
||||
if runner_state == 'RUNNING':
|
||||
print(' [green]√[/green] Background runner (worker_runner) is RUNNING')
|
||||
if runner_watch_state == 'RUNNING':
|
||||
print(' [green]√[/green] Reload watcher (worker_runner_watch) is RUNNING')
|
||||
server_state = server_proc.get("statename") if isinstance(server_proc, dict) else None
|
||||
if server_state == "RUNNING":
|
||||
runner_proc = get_worker(supervisor, "worker_runner")
|
||||
runner_watch_proc = get_worker(supervisor, "worker_runner_watch")
|
||||
runner_state = runner_proc.get("statename") if isinstance(runner_proc, dict) else None
|
||||
runner_watch_state = runner_watch_proc.get("statename") if isinstance(runner_watch_proc, dict) else None
|
||||
print("[red][X] Error: ArchiveBox server is already running[/red]")
|
||||
print(
|
||||
f" [green]√[/green] Web server ({server_worker_name}) is RUNNING on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]",
|
||||
)
|
||||
if runner_state == "RUNNING":
|
||||
print(" [green]√[/green] Background runner (worker_runner) is RUNNING")
|
||||
if runner_watch_state == "RUNNING":
|
||||
print(" [green]√[/green] Reload watcher (worker_runner_watch) is RUNNING")
|
||||
print()
|
||||
print('[yellow]To stop the existing server, run:[/yellow]')
|
||||
print("[yellow]To stop the existing server, run:[/yellow]")
|
||||
print(' pkill -f "archivebox server"')
|
||||
print(' pkill -f supervisord')
|
||||
print(" pkill -f supervisord")
|
||||
sys.exit(1)
|
||||
|
||||
if run_in_debug:
|
||||
print('[green][+] Starting ArchiveBox webserver in DEBUG mode...[/green]')
|
||||
print("[green][+] Starting ArchiveBox webserver in DEBUG mode...[/green]")
|
||||
else:
|
||||
print('[green][+] Starting ArchiveBox webserver...[/green]')
|
||||
print(f' [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
|
||||
print(f' [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]')
|
||||
print(' > Writing ArchiveBox error log to ./logs/errors.log')
|
||||
print("[green][+] Starting ArchiveBox webserver...[/green]")
|
||||
print(
|
||||
f" [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]",
|
||||
)
|
||||
print(
|
||||
f" [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]",
|
||||
)
|
||||
print(" > Writing ArchiveBox error log to ./logs/errors.log")
|
||||
print()
|
||||
start_server_workers(host=host, port=port, daemonize=daemonize, debug=run_in_debug, reload=reload, nothreading=nothreading)
|
||||
print("\n[i][green][🟩] ArchiveBox server shut down gracefully.[/green][/i]")
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.argument('runserver_args', nargs=-1)
|
||||
@click.option('--reload', is_flag=True, help='Enable auto-reloading when code or templates change')
|
||||
@click.option('--debug', is_flag=True, help='Enable DEBUG=True mode with more verbose errors')
|
||||
@click.option('--nothreading', is_flag=True, help='Force runserver to run in single-threaded mode')
|
||||
@click.option('--init', is_flag=True, help='Run a full archivebox init/upgrade before starting the server')
|
||||
@click.option('--daemonize', is_flag=True, help='Run the server in the background as a daemon')
|
||||
@click.argument("runserver_args", nargs=-1)
|
||||
@click.option("--reload", is_flag=True, help="Enable auto-reloading when code or templates change")
|
||||
@click.option("--debug", is_flag=True, help="Enable DEBUG=True mode with more verbose errors")
|
||||
@click.option("--nothreading", is_flag=True, help="Force runserver to run in single-threaded mode")
|
||||
@click.option("--init", is_flag=True, help="Run a full archivebox init/upgrade before starting the server")
|
||||
@click.option("--daemonize", is_flag=True, help="Run the server in the background as a daemon")
|
||||
@docstring(server.__doc__)
|
||||
def main(**kwargs):
|
||||
server(**kwargs)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -1,27 +1,28 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__package__ = "archivebox.cli"
|
||||
|
||||
from typing import Iterable
|
||||
from collections.abc import Iterable
|
||||
|
||||
import rich_click as click
|
||||
|
||||
from archivebox.misc.util import docstring
|
||||
|
||||
|
||||
def shell(args: Iterable[str]=()) -> None:
|
||||
def shell(args: Iterable[str] = ()) -> None:
|
||||
"""Enter an interactive ArchiveBox Django shell"""
|
||||
|
||||
from django.core.management import call_command
|
||||
|
||||
call_command("shell_plus", *args)
|
||||
|
||||
|
||||
@click.command(add_help_option=False, context_settings=dict(ignore_unknown_options=True))
|
||||
@click.argument('args', nargs=-1)
|
||||
@click.argument("args", nargs=-1)
|
||||
@docstring(shell.__doc__)
|
||||
def main(args: Iterable[str]=()) -> None:
|
||||
def main(args: Iterable[str] = ()) -> None:
|
||||
shell(args=args)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -27,14 +27,16 @@ Examples:
|
||||
archivebox snapshot list --url__icontains=spam.com | archivebox snapshot delete --yes
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox snapshot'
|
||||
__package__ = "archivebox.cli"
|
||||
__command__ = "archivebox snapshot"
|
||||
|
||||
import sys
|
||||
from typing import Optional, Iterable
|
||||
from collections.abc import Iterable
|
||||
|
||||
import rich_click as click
|
||||
from rich import print as rprint
|
||||
from django.db.models import Q, Sum
|
||||
from django.db.models.functions import Coalesce
|
||||
|
||||
from archivebox.cli.cli_utils import apply_filters
|
||||
|
||||
@@ -43,12 +45,13 @@ from archivebox.cli.cli_utils import apply_filters
|
||||
# CREATE
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def create_snapshots(
|
||||
urls: Iterable[str],
|
||||
tag: str = '',
|
||||
status: str = 'queued',
|
||||
tag: str = "",
|
||||
status: str = "queued",
|
||||
depth: int = 0,
|
||||
created_by_id: Optional[int] = None,
|
||||
created_by_id: int | None = None,
|
||||
) -> int:
|
||||
"""
|
||||
Create Snapshots from URLs or stdin JSONL (Crawl or Snapshot records).
|
||||
@@ -59,8 +62,10 @@ def create_snapshots(
|
||||
1: Failure
|
||||
"""
|
||||
from archivebox.misc.jsonl import (
|
||||
read_args_or_stdin, write_record,
|
||||
TYPE_SNAPSHOT, TYPE_CRAWL
|
||||
read_args_or_stdin,
|
||||
write_record,
|
||||
TYPE_SNAPSHOT,
|
||||
TYPE_CRAWL,
|
||||
)
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
from archivebox.core.models import Snapshot
|
||||
@@ -73,7 +78,7 @@ def create_snapshots(
|
||||
records = list(read_args_or_stdin(urls))
|
||||
|
||||
if not records:
|
||||
rprint('[yellow]No URLs or Crawls provided. Pass URLs as arguments or via stdin.[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No URLs or Crawls provided. Pass URLs as arguments or via stdin.[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Process each record - handle Crawls and plain URLs/Snapshots
|
||||
@@ -81,7 +86,7 @@ def create_snapshots(
|
||||
pass_through_count = 0
|
||||
|
||||
for record in records:
|
||||
record_type = record.get('type', '')
|
||||
record_type = record.get("type", "")
|
||||
|
||||
try:
|
||||
if record_type == TYPE_CRAWL:
|
||||
@@ -91,14 +96,14 @@ def create_snapshots(
|
||||
|
||||
# Input is a Crawl - get or create it, then create Snapshots for its URLs
|
||||
crawl = None
|
||||
crawl_id = record.get('id')
|
||||
crawl_id = record.get("id")
|
||||
if crawl_id:
|
||||
try:
|
||||
crawl = Crawl.objects.get(id=crawl_id)
|
||||
except Crawl.DoesNotExist:
|
||||
crawl = Crawl.from_json(record, overrides={'created_by_id': created_by_id})
|
||||
crawl = Crawl.from_json(record, overrides={"created_by_id": created_by_id})
|
||||
else:
|
||||
crawl = Crawl.from_json(record, overrides={'created_by_id': created_by_id})
|
||||
crawl = Crawl.from_json(record, overrides={"created_by_id": created_by_id})
|
||||
|
||||
if not crawl:
|
||||
continue
|
||||
@@ -109,27 +114,27 @@ def create_snapshots(
|
||||
if tag:
|
||||
merged_tags = f"{merged_tags},{tag}" if merged_tags else tag
|
||||
snapshot_record = {
|
||||
'url': url,
|
||||
'tags': merged_tags,
|
||||
'crawl_id': str(crawl.id),
|
||||
'depth': depth,
|
||||
'status': status,
|
||||
"url": url,
|
||||
"tags": merged_tags,
|
||||
"crawl_id": str(crawl.id),
|
||||
"depth": depth,
|
||||
"status": status,
|
||||
}
|
||||
snapshot = Snapshot.from_json(snapshot_record, overrides={'created_by_id': created_by_id})
|
||||
snapshot = Snapshot.from_json(snapshot_record, overrides={"created_by_id": created_by_id})
|
||||
if snapshot:
|
||||
created_snapshots.append(snapshot)
|
||||
if not is_tty:
|
||||
write_record(snapshot.to_json())
|
||||
|
||||
elif record_type == TYPE_SNAPSHOT or record.get('url'):
|
||||
elif record_type == TYPE_SNAPSHOT or record.get("url"):
|
||||
# Input is a Snapshot or plain URL
|
||||
if tag and not record.get('tags'):
|
||||
record['tags'] = tag
|
||||
if tag and not record.get("tags"):
|
||||
record["tags"] = tag
|
||||
if status:
|
||||
record['status'] = status
|
||||
record['depth'] = record.get('depth', depth)
|
||||
record["status"] = status
|
||||
record["depth"] = record.get("depth", depth)
|
||||
|
||||
snapshot = Snapshot.from_json(record, overrides={'created_by_id': created_by_id})
|
||||
snapshot = Snapshot.from_json(record, overrides={"created_by_id": created_by_id})
|
||||
if snapshot:
|
||||
created_snapshots.append(snapshot)
|
||||
if not is_tty:
|
||||
@@ -142,21 +147,21 @@ def create_snapshots(
|
||||
pass_through_count += 1
|
||||
|
||||
except Exception as e:
|
||||
rprint(f'[red]Error creating snapshot: {e}[/red]', file=sys.stderr)
|
||||
rprint(f"[red]Error creating snapshot: {e}[/red]", file=sys.stderr)
|
||||
continue
|
||||
|
||||
if not created_snapshots:
|
||||
if pass_through_count > 0:
|
||||
rprint(f'[dim]Passed through {pass_through_count} records, no new snapshots[/dim]', file=sys.stderr)
|
||||
rprint(f"[dim]Passed through {pass_through_count} records, no new snapshots[/dim]", file=sys.stderr)
|
||||
return 0
|
||||
rprint('[red]No snapshots created[/red]', file=sys.stderr)
|
||||
rprint("[red]No snapshots created[/red]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
rprint(f'[green]Created {len(created_snapshots)} snapshots[/green]', file=sys.stderr)
|
||||
rprint(f"[green]Created {len(created_snapshots)} snapshots[/green]", file=sys.stderr)
|
||||
|
||||
if is_tty:
|
||||
for snapshot in created_snapshots:
|
||||
rprint(f' [dim]{snapshot.id}[/dim] {snapshot.url[:60]}', file=sys.stderr)
|
||||
rprint(f" [dim]{snapshot.id}[/dim] {snapshot.url[:60]}", file=sys.stderr)
|
||||
|
||||
return 0
|
||||
|
||||
@@ -165,16 +170,19 @@ def create_snapshots(
|
||||
# LIST
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def list_snapshots(
|
||||
status: Optional[str] = None,
|
||||
url__icontains: Optional[str] = None,
|
||||
url__istartswith: Optional[str] = None,
|
||||
tag: Optional[str] = None,
|
||||
crawl_id: Optional[str] = None,
|
||||
limit: Optional[int] = None,
|
||||
sort: Optional[str] = None,
|
||||
csv: Optional[str] = None,
|
||||
status: str | None = None,
|
||||
url__icontains: str | None = None,
|
||||
url__istartswith: str | None = None,
|
||||
tag: str | None = None,
|
||||
crawl_id: str | None = None,
|
||||
limit: int | None = None,
|
||||
sort: str | None = None,
|
||||
csv: str | None = None,
|
||||
with_headers: bool = False,
|
||||
search: str | None = None,
|
||||
query: str | None = None,
|
||||
) -> int:
|
||||
"""
|
||||
List Snapshots as JSONL with optional filters.
|
||||
@@ -184,64 +192,106 @@ def list_snapshots(
|
||||
"""
|
||||
from archivebox.misc.jsonl import write_record
|
||||
from archivebox.core.models import Snapshot
|
||||
from archivebox.search import (
|
||||
get_default_search_mode,
|
||||
get_search_mode,
|
||||
prioritize_metadata_matches,
|
||||
query_search_index,
|
||||
)
|
||||
|
||||
if with_headers and not csv:
|
||||
rprint('[red]--with-headers requires --csv[/red]', file=sys.stderr)
|
||||
rprint("[red]--with-headers requires --csv[/red]", file=sys.stderr)
|
||||
return 2
|
||||
|
||||
is_tty = sys.stdout.isatty() and not csv
|
||||
|
||||
queryset = Snapshot.objects.all().order_by('-created_at')
|
||||
queryset = Snapshot.objects.annotate(output_size_sum=Coalesce(Sum("archiveresult__output_size"), 0)).order_by("-created_at")
|
||||
|
||||
# Apply filters
|
||||
filter_kwargs = {
|
||||
'status': status,
|
||||
'url__icontains': url__icontains,
|
||||
'url__istartswith': url__istartswith,
|
||||
'crawl_id': crawl_id,
|
||||
"status": status,
|
||||
"url__icontains": url__icontains,
|
||||
"url__istartswith": url__istartswith,
|
||||
"crawl_id": crawl_id,
|
||||
}
|
||||
queryset = apply_filters(queryset, filter_kwargs, limit=limit)
|
||||
queryset = apply_filters(queryset, filter_kwargs)
|
||||
|
||||
# Tag filter requires special handling (M2M)
|
||||
if tag:
|
||||
queryset = queryset.filter(tags__name__iexact=tag)
|
||||
|
||||
query = (query or "").strip()
|
||||
if query:
|
||||
metadata_qs = queryset.filter(
|
||||
Q(title__icontains=query) | Q(url__icontains=query) | Q(timestamp__icontains=query) | Q(tags__name__icontains=query),
|
||||
)
|
||||
requested_search_mode = (search or "").strip().lower()
|
||||
if requested_search_mode == "content":
|
||||
requested_search_mode = "contents"
|
||||
search_mode = get_default_search_mode() if not requested_search_mode else get_search_mode(requested_search_mode)
|
||||
|
||||
if search_mode == "meta":
|
||||
queryset = metadata_qs
|
||||
else:
|
||||
try:
|
||||
deep_qsearch = None
|
||||
if search_mode == "deep":
|
||||
qsearch = query_search_index(query, search_mode="contents")
|
||||
deep_qsearch = query_search_index(query, search_mode="deep")
|
||||
else:
|
||||
qsearch = query_search_index(query, search_mode=search_mode)
|
||||
queryset = prioritize_metadata_matches(
|
||||
queryset,
|
||||
metadata_qs,
|
||||
qsearch,
|
||||
deep_queryset=deep_qsearch,
|
||||
ordering=("-created_at",) if not sort else None,
|
||||
)
|
||||
except Exception as err:
|
||||
rprint(
|
||||
f"[yellow]Search backend error, falling back to metadata search: {err}[/yellow]",
|
||||
file=sys.stderr,
|
||||
)
|
||||
queryset = metadata_qs
|
||||
|
||||
if sort:
|
||||
queryset = queryset.order_by(sort)
|
||||
if limit:
|
||||
queryset = queryset[:limit]
|
||||
|
||||
count = 0
|
||||
if csv:
|
||||
cols = [col.strip() for col in csv.split(',') if col.strip()]
|
||||
cols = [col.strip() for col in csv.split(",") if col.strip()]
|
||||
if not cols:
|
||||
rprint('[red]No CSV columns provided[/red]', file=sys.stderr)
|
||||
rprint("[red]No CSV columns provided[/red]", file=sys.stderr)
|
||||
return 2
|
||||
rows: list[str] = []
|
||||
if with_headers:
|
||||
rows.append(','.join(cols))
|
||||
rows.append(",".join(cols))
|
||||
for snapshot in queryset.iterator(chunk_size=500):
|
||||
rows.append(snapshot.to_csv(cols=cols, separator=','))
|
||||
rows.append(snapshot.to_csv(cols=cols, separator=","))
|
||||
count += 1
|
||||
output = '\n'.join(rows)
|
||||
output = "\n".join(rows)
|
||||
if output:
|
||||
sys.stdout.write(output)
|
||||
if not output.endswith('\n'):
|
||||
sys.stdout.write('\n')
|
||||
rprint(f'[dim]Listed {count} snapshots[/dim]', file=sys.stderr)
|
||||
if not output.endswith("\n"):
|
||||
sys.stdout.write("\n")
|
||||
rprint(f"[dim]Listed {count} snapshots[/dim]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
for snapshot in queryset:
|
||||
if is_tty:
|
||||
status_color = {
|
||||
'queued': 'yellow',
|
||||
'started': 'blue',
|
||||
'sealed': 'green',
|
||||
}.get(snapshot.status, 'dim')
|
||||
rprint(f'[{status_color}]{snapshot.status:8}[/{status_color}] [dim]{snapshot.id}[/dim] {snapshot.url[:60]}')
|
||||
"queued": "yellow",
|
||||
"started": "blue",
|
||||
"sealed": "green",
|
||||
}.get(snapshot.status, "dim")
|
||||
rprint(f"[{status_color}]{snapshot.status:8}[/{status_color}] [dim]{snapshot.id}[/dim] {snapshot.url[:60]}")
|
||||
else:
|
||||
write_record(snapshot.to_json())
|
||||
count += 1
|
||||
|
||||
rprint(f'[dim]Listed {count} snapshots[/dim]', file=sys.stderr)
|
||||
rprint(f"[dim]Listed {count} snapshots[/dim]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -249,9 +299,10 @@ def list_snapshots(
|
||||
# UPDATE
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def update_snapshots(
|
||||
status: Optional[str] = None,
|
||||
tag: Optional[str] = None,
|
||||
status: str | None = None,
|
||||
tag: str | None = None,
|
||||
) -> int:
|
||||
"""
|
||||
Update Snapshots from stdin JSONL.
|
||||
@@ -272,12 +323,12 @@ def update_snapshots(
|
||||
|
||||
records = list(read_stdin())
|
||||
if not records:
|
||||
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
updated_count = 0
|
||||
for record in records:
|
||||
snapshot_id = record.get('id')
|
||||
snapshot_id = record.get("id")
|
||||
if not snapshot_id:
|
||||
continue
|
||||
|
||||
@@ -292,6 +343,7 @@ def update_snapshots(
|
||||
# Add tag to existing tags
|
||||
snapshot.save() # Ensure saved before M2M
|
||||
from archivebox.core.models import Tag
|
||||
|
||||
tag_obj, _ = Tag.objects.get_or_create(name=tag)
|
||||
snapshot.tags.add(tag_obj)
|
||||
|
||||
@@ -302,10 +354,10 @@ def update_snapshots(
|
||||
write_record(snapshot.to_json())
|
||||
|
||||
except Snapshot.DoesNotExist:
|
||||
rprint(f'[yellow]Snapshot not found: {snapshot_id}[/yellow]', file=sys.stderr)
|
||||
rprint(f"[yellow]Snapshot not found: {snapshot_id}[/yellow]", file=sys.stderr)
|
||||
continue
|
||||
|
||||
rprint(f'[green]Updated {updated_count} snapshots[/green]', file=sys.stderr)
|
||||
rprint(f"[green]Updated {updated_count} snapshots[/green]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -313,6 +365,7 @@ def update_snapshots(
|
||||
# DELETE
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def delete_snapshots(yes: bool = False, dry_run: bool = False) -> int:
|
||||
"""
|
||||
Delete Snapshots from stdin JSONL.
|
||||
@@ -328,35 +381,35 @@ def delete_snapshots(yes: bool = False, dry_run: bool = False) -> int:
|
||||
|
||||
records = list(read_stdin())
|
||||
if not records:
|
||||
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
snapshot_ids = [r.get('id') for r in records if r.get('id')]
|
||||
snapshot_ids = [r.get("id") for r in records if r.get("id")]
|
||||
|
||||
if not snapshot_ids:
|
||||
rprint('[yellow]No valid snapshot IDs in input[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No valid snapshot IDs in input[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
snapshots = Snapshot.objects.filter(id__in=snapshot_ids)
|
||||
count = snapshots.count()
|
||||
|
||||
if count == 0:
|
||||
rprint('[yellow]No matching snapshots found[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No matching snapshots found[/yellow]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
if dry_run:
|
||||
rprint(f'[yellow]Would delete {count} snapshots (dry run)[/yellow]', file=sys.stderr)
|
||||
rprint(f"[yellow]Would delete {count} snapshots (dry run)[/yellow]", file=sys.stderr)
|
||||
for snapshot in snapshots:
|
||||
rprint(f' [dim]{snapshot.id}[/dim] {snapshot.url[:60]}', file=sys.stderr)
|
||||
rprint(f" [dim]{snapshot.id}[/dim] {snapshot.url[:60]}", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
if not yes:
|
||||
rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr)
|
||||
rprint("[red]Use --yes to confirm deletion[/red]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Perform deletion
|
||||
deleted_count, _ = snapshots.delete()
|
||||
rprint(f'[green]Deleted {deleted_count} snapshots[/green]', file=sys.stderr)
|
||||
rprint(f"[green]Deleted {deleted_count} snapshots[/green]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -364,57 +417,81 @@ def delete_snapshots(yes: bool = False, dry_run: bool = False) -> int:
|
||||
# CLI Commands
|
||||
# =============================================================================
|
||||
|
||||
|
||||
@click.group()
|
||||
def main():
|
||||
"""Manage Snapshot records."""
|
||||
pass
|
||||
|
||||
|
||||
@main.command('create')
|
||||
@click.argument('urls', nargs=-1)
|
||||
@click.option('--tag', '-t', default='', help='Comma-separated tags to add')
|
||||
@click.option('--status', '-s', default='queued', help='Initial status (default: queued)')
|
||||
@click.option('--depth', '-d', type=int, default=0, help='Crawl depth (default: 0)')
|
||||
@main.command("create")
|
||||
@click.argument("urls", nargs=-1)
|
||||
@click.option("--tag", "-t", default="", help="Comma-separated tags to add")
|
||||
@click.option("--status", "-s", default="queued", help="Initial status (default: queued)")
|
||||
@click.option("--depth", "-d", type=int, default=0, help="Crawl depth (default: 0)")
|
||||
def create_cmd(urls: tuple, tag: str, status: str, depth: int):
|
||||
"""Create Snapshots from URLs or stdin JSONL."""
|
||||
sys.exit(create_snapshots(urls, tag=tag, status=status, depth=depth))
|
||||
|
||||
|
||||
@main.command('list')
|
||||
@click.option('--status', '-s', help='Filter by status (queued, started, sealed)')
|
||||
@click.option('--url__icontains', help='Filter by URL contains')
|
||||
@click.option('--url__istartswith', help='Filter by URL starts with')
|
||||
@click.option('--tag', '-t', help='Filter by tag name')
|
||||
@click.option('--crawl-id', help='Filter by crawl ID')
|
||||
@click.option('--limit', '-n', type=int, help='Limit number of results')
|
||||
def list_cmd(status: Optional[str], url__icontains: Optional[str], url__istartswith: Optional[str],
|
||||
tag: Optional[str], crawl_id: Optional[str], limit: Optional[int]):
|
||||
@main.command("list")
|
||||
@click.option("--status", "-s", help="Filter by status (queued, started, sealed)")
|
||||
@click.option("--url__icontains", help="Filter by URL contains")
|
||||
@click.option("--url__istartswith", help="Filter by URL starts with")
|
||||
@click.option("--tag", "-t", help="Filter by tag name")
|
||||
@click.option("--crawl-id", help="Filter by crawl ID")
|
||||
@click.option("--limit", "-n", type=int, help="Limit number of results")
|
||||
@click.option("--sort", "-o", type=str, help="Field to sort by, e.g. url, created_at, bookmarked_at, downloaded_at")
|
||||
@click.option("--csv", "-C", type=str, help="Print output as CSV with the provided fields, e.g.: timestamp,url,title")
|
||||
@click.option("--with-headers", is_flag=True, help="Include column headers in structured output")
|
||||
@click.option("--search", type=click.Choice(["meta", "content", "contents", "deep"]), help="Search mode to use for the query")
|
||||
@click.argument("query", nargs=-1)
|
||||
def list_cmd(
|
||||
status: str | None,
|
||||
url__icontains: str | None,
|
||||
url__istartswith: str | None,
|
||||
tag: str | None,
|
||||
crawl_id: str | None,
|
||||
limit: int | None,
|
||||
sort: str | None,
|
||||
csv: str | None,
|
||||
with_headers: bool,
|
||||
search: str | None,
|
||||
query: tuple[str, ...],
|
||||
):
|
||||
"""List Snapshots as JSONL."""
|
||||
sys.exit(list_snapshots(
|
||||
status=status,
|
||||
url__icontains=url__icontains,
|
||||
url__istartswith=url__istartswith,
|
||||
tag=tag,
|
||||
crawl_id=crawl_id,
|
||||
limit=limit,
|
||||
))
|
||||
sys.exit(
|
||||
list_snapshots(
|
||||
status=status,
|
||||
url__icontains=url__icontains,
|
||||
url__istartswith=url__istartswith,
|
||||
tag=tag,
|
||||
crawl_id=crawl_id,
|
||||
limit=limit,
|
||||
sort=sort,
|
||||
csv=csv,
|
||||
with_headers=with_headers,
|
||||
search=search,
|
||||
query=" ".join(query),
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@main.command('update')
|
||||
@click.option('--status', '-s', help='Set status')
|
||||
@click.option('--tag', '-t', help='Add tag')
|
||||
def update_cmd(status: Optional[str], tag: Optional[str]):
|
||||
@main.command("update")
|
||||
@click.option("--status", "-s", help="Set status")
|
||||
@click.option("--tag", "-t", help="Add tag")
|
||||
def update_cmd(status: str | None, tag: str | None):
|
||||
"""Update Snapshots from stdin JSONL."""
|
||||
sys.exit(update_snapshots(status=status, tag=tag))
|
||||
|
||||
|
||||
@main.command('delete')
|
||||
@click.option('--yes', '-y', is_flag=True, help='Confirm deletion')
|
||||
@click.option('--dry-run', is_flag=True, help='Show what would be deleted')
|
||||
@main.command("delete")
|
||||
@click.option("--yes", "-y", is_flag=True, help="Confirm deletion")
|
||||
@click.option("--dry-run", is_flag=True, help="Show what would be deleted")
|
||||
def delete_cmd(yes: bool, dry_run: bool):
|
||||
"""Delete Snapshots from stdin JSONL."""
|
||||
sys.exit(delete_snapshots(yes=yes, dry_run=dry_run))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox snapshot'
|
||||
__package__ = "archivebox.cli"
|
||||
__command__ = "archivebox snapshot"
|
||||
|
||||
import sys
|
||||
|
||||
@@ -10,15 +10,15 @@ import rich_click as click
|
||||
from archivebox.cli.archivebox_snapshot import create_snapshots
|
||||
|
||||
|
||||
@click.command(context_settings={'ignore_unknown_options': True})
|
||||
@click.option('--tag', '-t', default='', help='Comma-separated tags to add')
|
||||
@click.option('--status', '-s', default='queued', help='Initial status (default: queued)')
|
||||
@click.option('--depth', '-d', type=int, default=0, help='Crawl depth (default: 0)')
|
||||
@click.argument('urls', nargs=-1)
|
||||
@click.command(context_settings={"ignore_unknown_options": True})
|
||||
@click.option("--tag", "-t", default="", help="Comma-separated tags to add")
|
||||
@click.option("--status", "-s", default="queued", help="Initial status (default: queued)")
|
||||
@click.option("--depth", "-d", type=int, default=0, help="Crawl depth (default: 0)")
|
||||
@click.argument("urls", nargs=-1)
|
||||
def main(tag: str, status: str, depth: int, urls: tuple[str, ...]):
|
||||
"""Backwards-compatible `archivebox snapshot URL...` entrypoint."""
|
||||
sys.exit(create_snapshots(urls, tag=tag, status=status, depth=depth))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__package__ = "archivebox.cli"
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
@@ -16,31 +16,34 @@ from archivebox.misc.logging_util import printable_filesize
|
||||
|
||||
|
||||
@enforce_types
|
||||
def status(out_dir: Path=DATA_DIR) -> None:
|
||||
def status(out_dir: Path = DATA_DIR) -> None:
|
||||
"""Print out some info and statistics about the archive collection"""
|
||||
|
||||
from django.contrib.auth import get_user_model
|
||||
from django.db.models import Sum
|
||||
from django.db.models.functions import Coalesce
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
User = get_user_model()
|
||||
|
||||
print('[green]\\[*] Scanning archive main index...[/green]')
|
||||
print(f'[yellow] {out_dir}/*[/yellow]')
|
||||
num_bytes, num_dirs, num_files = get_dir_size(out_dir, recursive=False, pattern='index.')
|
||||
print("[green]\\[*] Scanning archive main index...[/green]")
|
||||
print(f"[yellow] {out_dir}/*[/yellow]")
|
||||
num_bytes, num_dirs, num_files = get_dir_size(out_dir, recursive=False, pattern="index.")
|
||||
size = printable_filesize(num_bytes)
|
||||
print(f' Index size: {size} across {num_files} files')
|
||||
print(f" Index size: {size} across {num_files} files")
|
||||
print()
|
||||
|
||||
links = list(Snapshot.objects.all())
|
||||
links = list(Snapshot.objects.annotate(output_size_sum=Coalesce(Sum("archiveresult__output_size"), 0)))
|
||||
num_sql_links = len(links)
|
||||
num_link_details = sum(1 for link in parse_json_links_details(out_dir=out_dir))
|
||||
print(f' > SQL Main Index: {num_sql_links} links'.ljust(36), f'(found in {CONSTANTS.SQL_INDEX_FILENAME})')
|
||||
print(f' > JSON Link Details: {num_link_details} links'.ljust(36), f'(found in {ARCHIVE_DIR.name}/*/index.json)')
|
||||
print(f" > SQL Main Index: {num_sql_links} links".ljust(36), f"(found in {CONSTANTS.SQL_INDEX_FILENAME})")
|
||||
print(f" > JSON Link Details: {num_link_details} links".ljust(36), f"(found in {ARCHIVE_DIR.name}/*/index.json)")
|
||||
print()
|
||||
print('[green]\\[*] Scanning archive data directories...[/green]')
|
||||
users_dir = out_dir / 'users'
|
||||
print("[green]\\[*] Scanning archive data directories...[/green]")
|
||||
users_dir = out_dir / "users"
|
||||
scan_roots = [root for root in (ARCHIVE_DIR, users_dir) if root.exists()]
|
||||
scan_roots_display = ', '.join(str(root) for root in scan_roots) if scan_roots else str(ARCHIVE_DIR)
|
||||
print(f'[yellow] {scan_roots_display}[/yellow]')
|
||||
scan_roots_display = ", ".join(str(root) for root in scan_roots) if scan_roots else str(ARCHIVE_DIR)
|
||||
print(f"[yellow] {scan_roots_display}[/yellow]")
|
||||
num_bytes = num_dirs = num_files = 0
|
||||
for root in scan_roots:
|
||||
root_bytes, root_dirs, root_files = get_dir_size(root)
|
||||
@@ -48,80 +51,66 @@ def status(out_dir: Path=DATA_DIR) -> None:
|
||||
num_dirs += root_dirs
|
||||
num_files += root_files
|
||||
size = printable_filesize(num_bytes)
|
||||
print(f' Size: {size} across {num_files} files in {num_dirs} directories')
|
||||
print(f" Size: {size} across {num_files} files in {num_dirs} directories")
|
||||
|
||||
# Use DB as source of truth for snapshot status
|
||||
num_indexed = len(links)
|
||||
num_archived = sum(1 for snapshot in links if snapshot.is_archived)
|
||||
num_unarchived = max(num_indexed - num_archived, 0)
|
||||
print(f' > indexed: {num_indexed}'.ljust(36), '(total snapshots in DB)')
|
||||
print(f' > archived: {num_archived}'.ljust(36), '(snapshots with archived content)')
|
||||
print(f' > unarchived: {num_unarchived}'.ljust(36), '(snapshots pending archiving)')
|
||||
print(f" > indexed: {num_indexed}".ljust(36), "(total snapshots in DB)")
|
||||
print(f" > archived: {num_archived}".ljust(36), "(snapshots with archived content)")
|
||||
print(f" > unarchived: {num_unarchived}".ljust(36), "(snapshots pending archiving)")
|
||||
|
||||
# Count snapshot directories on filesystem across both legacy and current layouts.
|
||||
expected_snapshot_dirs = {
|
||||
str(Path(snapshot.output_dir).resolve())
|
||||
for snapshot in links
|
||||
if Path(snapshot.output_dir).exists()
|
||||
}
|
||||
expected_snapshot_dirs = {str(Path(snapshot.output_dir).resolve()) for snapshot in links if Path(snapshot.output_dir).exists()}
|
||||
discovered_snapshot_dirs = set()
|
||||
|
||||
if ARCHIVE_DIR.exists():
|
||||
discovered_snapshot_dirs.update(
|
||||
str(entry.resolve())
|
||||
for entry in ARCHIVE_DIR.iterdir()
|
||||
if entry.is_dir()
|
||||
)
|
||||
discovered_snapshot_dirs.update(str(entry.resolve()) for entry in ARCHIVE_DIR.iterdir() if entry.is_dir())
|
||||
|
||||
if users_dir.exists():
|
||||
discovered_snapshot_dirs.update(
|
||||
str(entry.resolve())
|
||||
for entry in users_dir.glob('*/snapshots/*/*/*')
|
||||
if entry.is_dir()
|
||||
)
|
||||
discovered_snapshot_dirs.update(str(entry.resolve()) for entry in users_dir.glob("*/snapshots/*/*/*") if entry.is_dir())
|
||||
|
||||
orphaned_dirs = sorted(discovered_snapshot_dirs - expected_snapshot_dirs)
|
||||
num_present = len(discovered_snapshot_dirs)
|
||||
num_valid = len(discovered_snapshot_dirs & expected_snapshot_dirs)
|
||||
print()
|
||||
print(f' > present: {num_present}'.ljust(36), '(snapshot directories on disk)')
|
||||
print(f' > [green]valid:[/green] {num_valid}'.ljust(36), ' (directories with matching DB entry)')
|
||||
print(f" > present: {num_present}".ljust(36), "(snapshot directories on disk)")
|
||||
print(f" > [green]valid:[/green] {num_valid}".ljust(36), " (directories with matching DB entry)")
|
||||
|
||||
num_orphaned = len(orphaned_dirs)
|
||||
print(f' > [red]orphaned:[/red] {num_orphaned}'.ljust(36), ' (directories without matching DB entry)')
|
||||
print(f" > [red]orphaned:[/red] {num_orphaned}".ljust(36), " (directories without matching DB entry)")
|
||||
|
||||
if num_indexed:
|
||||
print(' [violet]Hint:[/violet] You can list snapshots by status like so:')
|
||||
print(' [green]archivebox list --status=<status> (e.g. archived, queued, etc.)[/green]')
|
||||
print(" [violet]Hint:[/violet] You can list snapshots by status like so:")
|
||||
print(" [green]archivebox list --status=<status> (e.g. archived, queued, etc.)[/green]")
|
||||
|
||||
if orphaned_dirs:
|
||||
print(' [violet]Hint:[/violet] To automatically import orphaned data directories into the main index, run:')
|
||||
print(' [green]archivebox init[/green]')
|
||||
print(" [violet]Hint:[/violet] To automatically import orphaned data directories into the main index, run:")
|
||||
print(" [green]archivebox init[/green]")
|
||||
|
||||
print()
|
||||
print('[green]\\[*] Scanning recent archive changes and user logins:[/green]')
|
||||
print(f'[yellow] {CONSTANTS.LOGS_DIR}/*[/yellow]')
|
||||
admin_users = User.objects.filter(is_superuser=True).exclude(username='system')
|
||||
print("[green]\\[*] Scanning recent archive changes and user logins:[/green]")
|
||||
print(f"[yellow] {CONSTANTS.LOGS_DIR}/*[/yellow]")
|
||||
admin_users = User.objects.filter(is_superuser=True).exclude(username="system")
|
||||
users = [user.get_username() for user in admin_users]
|
||||
print(f' UI users {len(users)}: {", ".join(users)}')
|
||||
last_login = admin_users.order_by('last_login').last()
|
||||
print(f" UI users {len(users)}: {', '.join(users)}")
|
||||
last_login = admin_users.order_by("last_login").last()
|
||||
if last_login:
|
||||
print(f' Last UI login: {last_login.get_username()} @ {str(last_login.last_login)[:16]}')
|
||||
last_downloaded = Snapshot.objects.order_by('downloaded_at').last()
|
||||
print(f" Last UI login: {last_login.get_username()} @ {str(last_login.last_login)[:16]}")
|
||||
last_downloaded = Snapshot.objects.order_by("downloaded_at").last()
|
||||
if last_downloaded:
|
||||
print(f' Last changes: {str(last_downloaded.downloaded_at)[:16]}')
|
||||
print(f" Last changes: {str(last_downloaded.downloaded_at)[:16]}")
|
||||
|
||||
if not users:
|
||||
print()
|
||||
print(' [violet]Hint:[/violet] You can create an admin user by running:')
|
||||
print(' [green]archivebox manage createsuperuser[/green]')
|
||||
print(" [violet]Hint:[/violet] You can create an admin user by running:")
|
||||
print(" [green]archivebox manage createsuperuser[/green]")
|
||||
|
||||
print()
|
||||
recent_snapshots = sorted(
|
||||
links,
|
||||
key=lambda snapshot: (
|
||||
snapshot.downloaded_at or snapshot.modified_at or snapshot.created_at
|
||||
),
|
||||
key=lambda snapshot: snapshot.downloaded_at or snapshot.modified_at or snapshot.created_at,
|
||||
reverse=True,
|
||||
)[:10]
|
||||
for snapshot in recent_snapshots:
|
||||
@@ -129,14 +118,14 @@ def status(out_dir: Path=DATA_DIR) -> None:
|
||||
continue
|
||||
print(
|
||||
(
|
||||
'[grey53] '
|
||||
f' > {str(snapshot.downloaded_at)[:16]} '
|
||||
f'[{snapshot.num_outputs} {("X", "√")[snapshot.is_archived]} {printable_filesize(snapshot.archive_size)}] '
|
||||
"[grey53] "
|
||||
f" > {str(snapshot.downloaded_at)[:16]} "
|
||||
f"[{snapshot.num_outputs} {('X', '√')[snapshot.is_archived]} {printable_filesize(snapshot.archive_size)}] "
|
||||
f'"{snapshot.title}": {snapshot.url}'
|
||||
'[/grey53]'
|
||||
)[:SHELL_CONFIG.TERM_WIDTH],
|
||||
"[/grey53]"
|
||||
)[: SHELL_CONFIG.TERM_WIDTH],
|
||||
)
|
||||
print('[grey53] ...')
|
||||
print("[grey53] ...")
|
||||
|
||||
|
||||
@click.command()
|
||||
@@ -146,5 +135,5 @@ def main(**kwargs):
|
||||
status(**kwargs)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -27,11 +27,11 @@ Examples:
|
||||
archivebox tag list --name=unused | archivebox tag delete --yes
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox tag'
|
||||
__package__ = "archivebox.cli"
|
||||
__command__ = "archivebox tag"
|
||||
|
||||
import sys
|
||||
from typing import Optional, Iterable
|
||||
from collections.abc import Iterable
|
||||
|
||||
import rich_click as click
|
||||
from rich import print as rprint
|
||||
@@ -43,6 +43,7 @@ from archivebox.cli.cli_utils import apply_filters
|
||||
# CREATE
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def create_tags(names: Iterable[str]) -> int:
|
||||
"""
|
||||
Create Tags from names.
|
||||
@@ -60,7 +61,7 @@ def create_tags(names: Iterable[str]) -> int:
|
||||
name_list = list(names) if names else []
|
||||
|
||||
if not name_list:
|
||||
rprint('[yellow]No tag names provided. Pass names as arguments.[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No tag names provided. Pass names as arguments.[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
created_count = 0
|
||||
@@ -76,11 +77,11 @@ def create_tags(names: Iterable[str]) -> int:
|
||||
|
||||
if created:
|
||||
created_count += 1
|
||||
rprint(f'[green]Created tag: {name}[/green]', file=sys.stderr)
|
||||
rprint(f"[green]Created tag: {name}[/green]", file=sys.stderr)
|
||||
else:
|
||||
rprint(f'[dim]Tag already exists: {name}[/dim]', file=sys.stderr)
|
||||
rprint(f"[dim]Tag already exists: {name}[/dim]", file=sys.stderr)
|
||||
|
||||
rprint(f'[green]Created {created_count} new tags[/green]', file=sys.stderr)
|
||||
rprint(f"[green]Created {created_count} new tags[/green]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -88,10 +89,11 @@ def create_tags(names: Iterable[str]) -> int:
|
||||
# LIST
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def list_tags(
|
||||
name: Optional[str] = None,
|
||||
name__icontains: Optional[str] = None,
|
||||
limit: Optional[int] = None,
|
||||
name: str | None = None,
|
||||
name__icontains: str | None = None,
|
||||
limit: int | None = None,
|
||||
) -> int:
|
||||
"""
|
||||
List Tags as JSONL with optional filters.
|
||||
@@ -104,12 +106,12 @@ def list_tags(
|
||||
|
||||
is_tty = sys.stdout.isatty()
|
||||
|
||||
queryset = Tag.objects.all().order_by('name')
|
||||
queryset = Tag.objects.all().order_by("name")
|
||||
|
||||
# Apply filters
|
||||
filter_kwargs = {
|
||||
'name': name,
|
||||
'name__icontains': name__icontains,
|
||||
"name": name,
|
||||
"name__icontains": name__icontains,
|
||||
}
|
||||
queryset = apply_filters(queryset, filter_kwargs, limit=limit)
|
||||
|
||||
@@ -117,12 +119,12 @@ def list_tags(
|
||||
for tag in queryset:
|
||||
snapshot_count = tag.snapshot_set.count()
|
||||
if is_tty:
|
||||
rprint(f'[cyan]{tag.name:30}[/cyan] [dim]({snapshot_count} snapshots)[/dim]')
|
||||
rprint(f"[cyan]{tag.name:30}[/cyan] [dim]({snapshot_count} snapshots)[/dim]")
|
||||
else:
|
||||
write_record(tag.to_json())
|
||||
count += 1
|
||||
|
||||
rprint(f'[dim]Listed {count} tags[/dim]', file=sys.stderr)
|
||||
rprint(f"[dim]Listed {count} tags[/dim]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -130,7 +132,8 @@ def list_tags(
|
||||
# UPDATE
|
||||
# =============================================================================
|
||||
|
||||
def update_tags(name: Optional[str] = None) -> int:
|
||||
|
||||
def update_tags(name: str | None = None) -> int:
|
||||
"""
|
||||
Update Tags from stdin JSONL.
|
||||
|
||||
@@ -148,13 +151,13 @@ def update_tags(name: Optional[str] = None) -> int:
|
||||
|
||||
records = list(read_stdin())
|
||||
if not records:
|
||||
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
updated_count = 0
|
||||
for record in records:
|
||||
tag_id = record.get('id')
|
||||
old_name = record.get('name')
|
||||
tag_id = record.get("id")
|
||||
old_name = record.get("name")
|
||||
|
||||
if not tag_id and not old_name:
|
||||
continue
|
||||
@@ -176,10 +179,10 @@ def update_tags(name: Optional[str] = None) -> int:
|
||||
write_record(tag.to_json())
|
||||
|
||||
except Tag.DoesNotExist:
|
||||
rprint(f'[yellow]Tag not found: {tag_id or old_name}[/yellow]', file=sys.stderr)
|
||||
rprint(f"[yellow]Tag not found: {tag_id or old_name}[/yellow]", file=sys.stderr)
|
||||
continue
|
||||
|
||||
rprint(f'[green]Updated {updated_count} tags[/green]', file=sys.stderr)
|
||||
rprint(f"[green]Updated {updated_count} tags[/green]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -187,6 +190,7 @@ def update_tags(name: Optional[str] = None) -> int:
|
||||
# DELETE
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def delete_tags(yes: bool = False, dry_run: bool = False) -> int:
|
||||
"""
|
||||
Delete Tags from stdin JSONL.
|
||||
@@ -202,23 +206,24 @@ def delete_tags(yes: bool = False, dry_run: bool = False) -> int:
|
||||
|
||||
records = list(read_stdin())
|
||||
if not records:
|
||||
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Collect tag IDs or names
|
||||
tag_ids = []
|
||||
tag_names = []
|
||||
for r in records:
|
||||
if r.get('id'):
|
||||
tag_ids.append(r['id'])
|
||||
elif r.get('name'):
|
||||
tag_names.append(r['name'])
|
||||
if r.get("id"):
|
||||
tag_ids.append(r["id"])
|
||||
elif r.get("name"):
|
||||
tag_names.append(r["name"])
|
||||
|
||||
if not tag_ids and not tag_names:
|
||||
rprint('[yellow]No valid tag IDs or names in input[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No valid tag IDs or names in input[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
from django.db.models import Q
|
||||
|
||||
query = Q()
|
||||
if tag_ids:
|
||||
query |= Q(id__in=tag_ids)
|
||||
@@ -229,22 +234,22 @@ def delete_tags(yes: bool = False, dry_run: bool = False) -> int:
|
||||
count = tags.count()
|
||||
|
||||
if count == 0:
|
||||
rprint('[yellow]No matching tags found[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No matching tags found[/yellow]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
if dry_run:
|
||||
rprint(f'[yellow]Would delete {count} tags (dry run)[/yellow]', file=sys.stderr)
|
||||
rprint(f"[yellow]Would delete {count} tags (dry run)[/yellow]", file=sys.stderr)
|
||||
for tag in tags:
|
||||
rprint(f' {tag.name}', file=sys.stderr)
|
||||
rprint(f" {tag.name}", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
if not yes:
|
||||
rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr)
|
||||
rprint("[red]Use --yes to confirm deletion[/red]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Perform deletion
|
||||
deleted_count, _ = tags.delete()
|
||||
rprint(f'[green]Deleted {deleted_count} tags[/green]', file=sys.stderr)
|
||||
rprint(f"[green]Deleted {deleted_count} tags[/green]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -252,42 +257,43 @@ def delete_tags(yes: bool = False, dry_run: bool = False) -> int:
|
||||
# CLI Commands
|
||||
# =============================================================================
|
||||
|
||||
|
||||
@click.group()
|
||||
def main():
|
||||
"""Manage Tag records."""
|
||||
pass
|
||||
|
||||
|
||||
@main.command('create')
|
||||
@click.argument('names', nargs=-1)
|
||||
@main.command("create")
|
||||
@click.argument("names", nargs=-1)
|
||||
def create_cmd(names: tuple):
|
||||
"""Create Tags from names."""
|
||||
sys.exit(create_tags(names))
|
||||
|
||||
|
||||
@main.command('list')
|
||||
@click.option('--name', help='Filter by exact name')
|
||||
@click.option('--name__icontains', help='Filter by name contains')
|
||||
@click.option('--limit', '-n', type=int, help='Limit number of results')
|
||||
def list_cmd(name: Optional[str], name__icontains: Optional[str], limit: Optional[int]):
|
||||
@main.command("list")
|
||||
@click.option("--name", help="Filter by exact name")
|
||||
@click.option("--name__icontains", help="Filter by name contains")
|
||||
@click.option("--limit", "-n", type=int, help="Limit number of results")
|
||||
def list_cmd(name: str | None, name__icontains: str | None, limit: int | None):
|
||||
"""List Tags as JSONL."""
|
||||
sys.exit(list_tags(name=name, name__icontains=name__icontains, limit=limit))
|
||||
|
||||
|
||||
@main.command('update')
|
||||
@click.option('--name', '-n', help='Set new name')
|
||||
def update_cmd(name: Optional[str]):
|
||||
@main.command("update")
|
||||
@click.option("--name", "-n", help="Set new name")
|
||||
def update_cmd(name: str | None):
|
||||
"""Update Tags from stdin JSONL."""
|
||||
sys.exit(update_tags(name=name))
|
||||
|
||||
|
||||
@main.command('delete')
|
||||
@click.option('--yes', '-y', is_flag=True, help='Confirm deletion')
|
||||
@click.option('--dry-run', is_flag=True, help='Show what would be deleted')
|
||||
@main.command("delete")
|
||||
@click.option("--yes", "-y", is_flag=True, help="Confirm deletion")
|
||||
@click.option("--dry-run", is_flag=True, help="Show what would be deleted")
|
||||
def delete_cmd(yes: bool, dry_run: bool):
|
||||
"""Delete Tags from stdin JSONL."""
|
||||
sys.exit(delete_tags(yes=yes, dry_run=dry_run))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -1,11 +1,12 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__package__ = "archivebox.cli"
|
||||
|
||||
import os
|
||||
import time
|
||||
|
||||
from typing import TYPE_CHECKING, Callable, Iterable
|
||||
from typing import TYPE_CHECKING, Any
|
||||
from collections.abc import Callable, Iterable
|
||||
from pathlib import Path
|
||||
|
||||
import rich_click as click
|
||||
@@ -20,24 +21,22 @@ if TYPE_CHECKING:
|
||||
|
||||
|
||||
LINK_FILTERS: dict[str, Callable[[str], Q]] = {
|
||||
'exact': lambda pattern: Q(url=pattern),
|
||||
'substring': lambda pattern: Q(url__icontains=pattern),
|
||||
'regex': lambda pattern: Q(url__iregex=pattern),
|
||||
'domain': lambda pattern: (
|
||||
Q(url__istartswith=f'http://{pattern}')
|
||||
| Q(url__istartswith=f'https://{pattern}')
|
||||
| Q(url__istartswith=f'ftp://{pattern}')
|
||||
"exact": lambda pattern: Q(url=pattern),
|
||||
"substring": lambda pattern: Q(url__icontains=pattern),
|
||||
"regex": lambda pattern: Q(url__iregex=pattern),
|
||||
"domain": lambda pattern: (
|
||||
Q(url__istartswith=f"http://{pattern}") | Q(url__istartswith=f"https://{pattern}") | Q(url__istartswith=f"ftp://{pattern}")
|
||||
),
|
||||
'tag': lambda pattern: Q(tags__name=pattern),
|
||||
'timestamp': lambda pattern: Q(timestamp=pattern),
|
||||
"tag": lambda pattern: Q(tags__name=pattern),
|
||||
"timestamp": lambda pattern: Q(timestamp=pattern),
|
||||
}
|
||||
|
||||
|
||||
def _apply_pattern_filters(
|
||||
snapshots: QuerySet['Snapshot', 'Snapshot'],
|
||||
snapshots: QuerySet["Snapshot", "Snapshot"],
|
||||
filter_patterns: list[str],
|
||||
filter_type: str,
|
||||
) -> QuerySet['Snapshot', 'Snapshot']:
|
||||
) -> QuerySet["Snapshot", "Snapshot"]:
|
||||
filter_builder = LINK_FILTERS.get(filter_type)
|
||||
if filter_builder is None:
|
||||
raise SystemExit(2)
|
||||
@@ -48,21 +47,120 @@ def _apply_pattern_filters(
|
||||
return snapshots.filter(query)
|
||||
|
||||
|
||||
def _get_snapshot_crawl(snapshot: 'Snapshot') -> 'Crawl | None':
|
||||
def _get_snapshot_crawl(snapshot: "Snapshot") -> "Crawl | None":
|
||||
try:
|
||||
return snapshot.crawl
|
||||
except ObjectDoesNotExist:
|
||||
return None
|
||||
|
||||
|
||||
def _get_search_indexing_plugins() -> list[str]:
    """Return the sorted names of search-backend plugins able to index snapshots.

    A discovered plugin is selected when all of the following hold:
      - its name starts with ``search_backend_``;
      - the rest of its name is among the backends returned by ``get_search_backends()``;
      - at least one of its hooks has "Snapshot" in its name and "index"
        (case-insensitive) in its name.
    """
    from abx_dl.models import discover_plugins
    from archivebox.hooks import get_search_backends

    prefix = "search_backend_"
    enabled_backends = set(get_search_backends())

    def has_snapshot_index_hook(plugin) -> bool:
        for hook in plugin.hooks:
            if "Snapshot" in hook.name and "index" in hook.name.lower():
                return True
        return False

    selected: list[str] = []
    for name, plugin in discover_plugins().items():
        if not name.startswith(prefix):
            continue
        if name.removeprefix(prefix) not in enabled_backends:
            continue
        if has_snapshot_index_hook(plugin):
            selected.append(name)

    selected.sort()
    return selected
|
||||
|
||||
|
||||
def _build_filtered_snapshots_queryset(
    *,
    filter_patterns: Iterable[str],
    filter_type: str,
    before: float | None,
    after: float | None,
    resume: str | None = None,
):
    """Build the Snapshot queryset selected by the given filters.

    Args:
        filter_patterns: Patterns handed to ``_apply_pattern_filters``; skipped when falsy.
        filter_type: Filter type forwarded to ``_apply_pattern_filters``.
        before: Unix timestamp; keep snapshots with ``bookmarked_at`` strictly earlier.
        after: Unix timestamp; keep snapshots with ``bookmarked_at`` strictly later.
        resume: When set, keep snapshots whose ``timestamp`` is <= this value.

    Returns:
        The filtered queryset with ``crawl`` select_related, ordered by
        ``bookmarked_at`` descending.
    """
    from archivebox.core.models import Snapshot
    from datetime import datetime

    queryset = Snapshot.objects.all()

    if filter_patterns:
        queryset = _apply_pattern_filters(queryset, list(filter_patterns), filter_type)

    # Bounds are tested for truthiness, so a bound of 0 is skipped just like None.
    # NOTE(review): fromtimestamp() yields a naive local-time datetime — confirm
    # this matches the project's timezone settings.
    for lookup, bound in (("bookmarked_at__lt", before), ("bookmarked_at__gt", after)):
        if bound:
            queryset = queryset.filter(**{lookup: datetime.fromtimestamp(bound)})

    if resume:
        queryset = queryset.filter(timestamp__lte=resume)

    return queryset.select_related("crawl").order_by("-bookmarked_at")
|
||||
|
||||
|
||||
def reindex_snapshots(
    snapshots: QuerySet["Snapshot", "Snapshot"],
    *,
    search_plugins: list[str],
    batch_size: int,
) -> dict[str, int]:
    """Queue and run the given search plugins for every snapshot in the queryset.

    Args:
        snapshots: Queryset of snapshots to reindex.
        search_plugins: Plugin names to run for each snapshot.
        batch_size: Chunk size used when iterating the queryset.

    Returns:
        Counters keyed "processed", "reconciled", "queued" and "reindexed".

    Raises:
        SystemExit: With run_plugins()'s exit code when that code is non-zero.
    """
    from archivebox.cli.archivebox_extract import run_plugins

    stats = {"processed": 0, "reconciled": 0, "queued": 0, "reindexed": 0}
    # One record per (snapshot, plugin) pair, collected for a single run_plugins() call.
    records: list[dict[str, str]] = []

    total = snapshots.count()
    print(f"[*] Reindexing {total} snapshots with search plugins: {', '.join(search_plugins)}")

    for snapshot in snapshots.iterator(chunk_size=batch_size):
        stats["processed"] += 1

        # Snapshots whose crawl cannot be loaded are counted as processed but not queued.
        if _get_snapshot_crawl(snapshot) is None:
            continue

        output_dir = Path(snapshot.output_dir)
        has_directory = output_dir.exists() and output_dir.is_dir()
        if has_directory:
            # Only reconcile against index.json when the snapshot directory exists on disk.
            snapshot.reconcile_with_index_json()
            stats["reconciled"] += 1

        for plugin_name in search_plugins:
            # Most recently created result for this plugin, if any.
            existing_result = snapshot.archiveresult_set.filter(plugin=plugin_name).order_by("-created_at").first()
            if existing_result:
                # presumably returns the result to a retryable state — TODO confirm against the model
                existing_result.reset_for_retry()
            records.append(
                {
                    "type": "ArchiveResult",
                    "snapshot_id": str(snapshot.id),
                    "plugin": plugin_name,
                },
            )
            stats["queued"] += 1

    # Nothing was queued, so run_plugins() is not called at all.
    if not records:
        return stats

    exit_code = run_plugins(
        args=(),
        records=records,
        wait=True,
        emit_results=False,
    )
    if exit_code != 0:
        raise SystemExit(exit_code)

    # Reached only when run_plugins() returned 0; every queued record is counted as reindexed.
    stats["reindexed"] = len(records)
    return stats
|
||||
|
||||
|
||||
@enforce_types
|
||||
def update(filter_patterns: Iterable[str] = (),
|
||||
filter_type: str = 'exact',
|
||||
before: float | None = None,
|
||||
after: float | None = None,
|
||||
resume: str | None = None,
|
||||
batch_size: int = 100,
|
||||
continuous: bool = False) -> None:
|
||||
def update(
|
||||
filter_patterns: Iterable[str] = (),
|
||||
filter_type: str = "exact",
|
||||
before: float | None = None,
|
||||
after: float | None = None,
|
||||
resume: str | None = None,
|
||||
batch_size: int = 100,
|
||||
continuous: bool = False,
|
||||
index_only: bool = False,
|
||||
) -> None:
|
||||
"""
|
||||
Update snapshots: migrate old dirs, reconcile DB, and re-queue for archiving.
|
||||
|
||||
@@ -77,41 +175,69 @@ def update(filter_patterns: Iterable[str] = (),
|
||||
|
||||
from rich import print
|
||||
from archivebox.config.django import setup_django
|
||||
|
||||
setup_django()
|
||||
|
||||
from django.core.management import call_command
|
||||
|
||||
# Run migrations first to ensure DB schema is up-to-date
|
||||
print('[*] Checking for pending migrations...')
|
||||
print("[*] Checking for pending migrations...")
|
||||
try:
|
||||
call_command('migrate', '--no-input', verbosity=0)
|
||||
call_command("migrate", "--no-input", verbosity=0)
|
||||
except Exception as e:
|
||||
print(f'[!] Warning: Migration check failed: {e}')
|
||||
print(f"[!] Warning: Migration check failed: {e}")
|
||||
|
||||
while True:
|
||||
if filter_patterns or before or after:
|
||||
if index_only:
|
||||
search_plugins = _get_search_indexing_plugins()
|
||||
if not search_plugins:
|
||||
print("[*] No search indexing plugins are available, nothing to backfill.")
|
||||
break
|
||||
|
||||
if not (filter_patterns or before or after):
|
||||
print("[*] Phase 1: Draining old archive/ directories (0.8.x → 0.9.x migration)...")
|
||||
drain_old_archive_dirs(
|
||||
resume_from=resume,
|
||||
batch_size=batch_size,
|
||||
)
|
||||
|
||||
snapshots = _build_filtered_snapshots_queryset(
|
||||
filter_patterns=filter_patterns,
|
||||
filter_type=filter_type,
|
||||
before=before,
|
||||
after=after,
|
||||
resume=resume,
|
||||
)
|
||||
stats = reindex_snapshots(
|
||||
snapshots,
|
||||
search_plugins=search_plugins,
|
||||
batch_size=batch_size,
|
||||
)
|
||||
print_index_stats(stats)
|
||||
elif filter_patterns or before or after:
|
||||
# Filtered mode: query DB only
|
||||
print('[*] Processing filtered snapshots from database...')
|
||||
print("[*] Processing filtered snapshots from database...")
|
||||
stats = process_filtered_snapshots(
|
||||
filter_patterns=filter_patterns,
|
||||
filter_type=filter_type,
|
||||
before=before,
|
||||
after=after,
|
||||
batch_size=batch_size
|
||||
resume=resume,
|
||||
batch_size=batch_size,
|
||||
)
|
||||
print_stats(stats)
|
||||
else:
|
||||
# Full mode: drain old dirs + process DB
|
||||
stats_combined = {'phase1': {}, 'phase2': {}}
|
||||
stats_combined = {"phase1": {}, "phase2": {}}
|
||||
|
||||
print('[*] Phase 1: Draining old archive/ directories (0.8.x → 0.9.x migration)...')
|
||||
stats_combined['phase1'] = drain_old_archive_dirs(
|
||||
print("[*] Phase 1: Draining old archive/ directories (0.8.x → 0.9.x migration)...")
|
||||
stats_combined["phase1"] = drain_old_archive_dirs(
|
||||
resume_from=resume,
|
||||
batch_size=batch_size
|
||||
batch_size=batch_size,
|
||||
)
|
||||
|
||||
print('[*] Phase 2: Processing all database snapshots (most recent first)...')
|
||||
stats_combined['phase2'] = process_all_db_snapshots(batch_size=batch_size)
|
||||
print("[*] Phase 2: Processing all database snapshots (most recent first)...")
|
||||
stats_combined["phase2"] = process_all_db_snapshots(batch_size=batch_size, resume=resume)
|
||||
|
||||
# Phase 3: Deduplication (disabled for now)
|
||||
# print('[*] Phase 3: Deduplicating...')
|
||||
@@ -122,7 +248,7 @@ def update(filter_patterns: Iterable[str] = (),
|
||||
if not continuous:
|
||||
break
|
||||
|
||||
print('[yellow]Sleeping 60s before next pass...[/yellow]')
|
||||
print("[yellow]Sleeping 60s before next pass...[/yellow]")
|
||||
time.sleep(60)
|
||||
resume = None
|
||||
|
||||
@@ -144,34 +270,34 @@ def drain_old_archive_dirs(resume_from: str | None = None, batch_size: int = 100
|
||||
from archivebox.config import CONSTANTS
|
||||
from django.db import transaction
|
||||
|
||||
stats = {'processed': 0, 'migrated': 0, 'skipped': 0, 'invalid': 0}
|
||||
stats = {"processed": 0, "migrated": 0, "skipped": 0, "invalid": 0}
|
||||
|
||||
archive_dir = CONSTANTS.ARCHIVE_DIR
|
||||
if not archive_dir.exists():
|
||||
return stats
|
||||
|
||||
print('[DEBUG Phase1] Scanning for old directories in archive/...')
|
||||
print("[DEBUG Phase1] Scanning for old directories in archive/...")
|
||||
|
||||
# Scan for real directories only (skip symlinks - they're already migrated)
|
||||
all_entries = list(os.scandir(archive_dir))
|
||||
print(f'[DEBUG Phase1] Total entries in archive/: {len(all_entries)}')
|
||||
print(f"[DEBUG Phase1] Total entries in archive/: {len(all_entries)}")
|
||||
entries = [
|
||||
(e.stat().st_mtime, e.path)
|
||||
for e in all_entries
|
||||
if e.is_dir(follow_symlinks=False) # Skip symlinks
|
||||
]
|
||||
entries.sort(reverse=True) # Newest first
|
||||
print(f'[DEBUG Phase1] Real directories (not symlinks): {len(entries)}')
|
||||
print(f'[*] Found {len(entries)} old directories to drain')
|
||||
print(f"[DEBUG Phase1] Real directories (not symlinks): {len(entries)}")
|
||||
print(f"[*] Found {len(entries)} old directories to drain")
|
||||
|
||||
for mtime, entry_path in entries:
|
||||
entry_path = Path(entry_path)
|
||||
|
||||
# Resume from timestamp if specified
|
||||
if resume_from and entry_path.name < resume_from:
|
||||
if resume_from and entry_path.name > resume_from:
|
||||
continue
|
||||
|
||||
stats['processed'] += 1
|
||||
stats["processed"] += 1
|
||||
|
||||
# Try to load existing snapshot from DB
|
||||
snapshot = Snapshot.load_from_directory(entry_path)
|
||||
@@ -182,16 +308,16 @@ def drain_old_archive_dirs(resume_from: str | None = None, batch_size: int = 100
|
||||
if not snapshot:
|
||||
# Invalid directory - move to invalid/
|
||||
Snapshot.move_directory_to_invalid(entry_path)
|
||||
stats['invalid'] += 1
|
||||
stats["invalid"] += 1
|
||||
print(f" [{stats['processed']}] Invalid: {entry_path.name}")
|
||||
continue
|
||||
|
||||
try:
|
||||
snapshot.save()
|
||||
stats['migrated'] += 1
|
||||
stats["migrated"] += 1
|
||||
print(f" [{stats['processed']}] Imported orphaned snapshot: {entry_path.name}")
|
||||
except Exception as e:
|
||||
stats['skipped'] += 1
|
||||
stats["skipped"] += 1
|
||||
print(f" [{stats['processed']}] Skipped (error: {e}): {entry_path.name}")
|
||||
continue
|
||||
|
||||
@@ -201,30 +327,35 @@ def drain_old_archive_dirs(resume_from: str | None = None, batch_size: int = 100
|
||||
if not has_valid_crawl:
|
||||
# Create a new crawl (created_by will default to system user)
|
||||
from archivebox.crawls.models import Crawl
|
||||
|
||||
crawl = Crawl.objects.create(urls=snapshot.url)
|
||||
# Use queryset update to avoid triggering save() hooks
|
||||
from archivebox.core.models import Snapshot as SnapshotModel
|
||||
|
||||
SnapshotModel.objects.filter(pk=snapshot.pk).update(crawl=crawl)
|
||||
# Refresh the instance
|
||||
snapshot.crawl = crawl
|
||||
print(f"[DEBUG Phase1] Created missing crawl for snapshot {str(snapshot.id)[:8]}")
|
||||
|
||||
# Check if needs migration (0.8.x → 0.9.x)
|
||||
print(f"[DEBUG Phase1] Snapshot {str(snapshot.id)[:8]}: fs_version={snapshot.fs_version}, needs_migration={snapshot.fs_migration_needed}")
|
||||
print(
|
||||
f"[DEBUG Phase1] Snapshot {str(snapshot.id)[:8]}: fs_version={snapshot.fs_version}, needs_migration={snapshot.fs_migration_needed}",
|
||||
)
|
||||
if snapshot.fs_migration_needed:
|
||||
try:
|
||||
# Calculate paths using actual directory (entry_path), not snapshot.timestamp
|
||||
# because snapshot.timestamp might be truncated
|
||||
old_dir = entry_path
|
||||
new_dir = snapshot.get_storage_path_for_version('0.9.0')
|
||||
new_dir = snapshot.get_storage_path_for_version("0.9.0")
|
||||
print(f"[DEBUG Phase1] Migrating {old_dir.name} → {new_dir}")
|
||||
|
||||
# Manually migrate files
|
||||
if not new_dir.exists() and old_dir.exists():
|
||||
new_dir.mkdir(parents=True, exist_ok=True)
|
||||
import shutil
|
||||
|
||||
file_count = 0
|
||||
for old_file in old_dir.rglob('*'):
|
||||
for old_file in old_dir.rglob("*"):
|
||||
if old_file.is_file():
|
||||
rel_path = old_file.relative_to(old_dir)
|
||||
new_file = new_dir / rel_path
|
||||
@@ -236,7 +367,8 @@ def drain_old_archive_dirs(resume_from: str | None = None, batch_size: int = 100
|
||||
|
||||
# Update only fs_version field using queryset update (bypasses validation)
|
||||
from archivebox.core.models import Snapshot as SnapshotModel
|
||||
SnapshotModel.objects.filter(pk=snapshot.pk).update(fs_version='0.9.0')
|
||||
|
||||
SnapshotModel.objects.filter(pk=snapshot.pk).update(fs_version="0.9.0")
|
||||
|
||||
# Commit the transaction
|
||||
transaction.commit()
|
||||
@@ -245,22 +377,22 @@ def drain_old_archive_dirs(resume_from: str | None = None, batch_size: int = 100
|
||||
if old_dir.exists() and old_dir != new_dir:
|
||||
snapshot._cleanup_old_migration_dir(old_dir, new_dir)
|
||||
|
||||
stats['migrated'] += 1
|
||||
stats["migrated"] += 1
|
||||
print(f" [{stats['processed']}] Migrated: {entry_path.name}")
|
||||
except Exception as e:
|
||||
stats['skipped'] += 1
|
||||
stats["skipped"] += 1
|
||||
print(f" [{stats['processed']}] Skipped (error: {e}): {entry_path.name}")
|
||||
else:
|
||||
stats['skipped'] += 1
|
||||
stats["skipped"] += 1
|
||||
|
||||
if stats['processed'] % batch_size == 0:
|
||||
if stats["processed"] % batch_size == 0:
|
||||
transaction.commit()
|
||||
|
||||
transaction.commit()
|
||||
return stats
|
||||
|
||||
|
||||
def process_all_db_snapshots(batch_size: int = 100) -> dict[str, int]:
|
||||
def process_all_db_snapshots(batch_size: int = 100, resume: str | None = None) -> dict[str, int]:
|
||||
"""
|
||||
O(n) scan over entire DB from most recent to least recent.
|
||||
|
||||
@@ -275,24 +407,30 @@ def process_all_db_snapshots(batch_size: int = 100) -> dict[str, int]:
|
||||
from django.db import transaction
|
||||
from django.utils import timezone
|
||||
|
||||
stats = {'processed': 0, 'reconciled': 0, 'queued': 0}
|
||||
stats = {"processed": 0, "reconciled": 0, "queued": 0}
|
||||
|
||||
total = Snapshot.objects.count()
|
||||
print(f'[*] Processing {total} snapshots from database (most recent first)...')
|
||||
queryset = Snapshot.objects.all()
|
||||
if resume:
|
||||
queryset = queryset.filter(timestamp__lte=resume)
|
||||
total = queryset.count()
|
||||
print(f"[*] Processing {total} snapshots from database (most recent first)...")
|
||||
|
||||
# Process from most recent to least recent
|
||||
for snapshot in Snapshot.objects.select_related('crawl').order_by('-bookmarked_at').iterator(chunk_size=batch_size):
|
||||
stats['processed'] += 1
|
||||
for snapshot in queryset.select_related("crawl").order_by("-bookmarked_at").iterator(chunk_size=batch_size):
|
||||
stats["processed"] += 1
|
||||
|
||||
# Skip snapshots with missing crawl references (orphaned by migration errors)
|
||||
if _get_snapshot_crawl(snapshot) is None:
|
||||
continue
|
||||
|
||||
try:
|
||||
print(f"[DEBUG Phase2] Snapshot {str(snapshot.id)[:8]}: fs_version={snapshot.fs_version}, needs_migration={snapshot.fs_migration_needed}")
|
||||
print(
|
||||
f"[DEBUG Phase2] Snapshot {str(snapshot.id)[:8]}: fs_version={snapshot.fs_version}, needs_migration={snapshot.fs_migration_needed}",
|
||||
)
|
||||
|
||||
# Check if snapshot has a directory on disk
|
||||
from pathlib import Path
|
||||
|
||||
output_dir = Path(snapshot.output_dir)
|
||||
has_directory = output_dir.exists() and output_dir.is_dir()
|
||||
|
||||
@@ -313,22 +451,23 @@ def process_all_db_snapshots(batch_size: int = 100) -> dict[str, int]:
|
||||
print(f"[DEBUG Phase2] Orphan snapshot {str(snapshot.id)[:8]} - marking as migrated without filesystem operation")
|
||||
# Use queryset update to set fs_version without triggering save() hooks
|
||||
from archivebox.core.models import Snapshot as SnapshotModel
|
||||
SnapshotModel.objects.filter(pk=snapshot.pk).update(fs_version='0.9.0')
|
||||
snapshot.fs_version = '0.9.0'
|
||||
|
||||
SnapshotModel.objects.filter(pk=snapshot.pk).update(fs_version="0.9.0")
|
||||
snapshot.fs_version = "0.9.0"
|
||||
|
||||
# Queue for archiving (state machine will handle it)
|
||||
snapshot.status = Snapshot.StatusChoices.QUEUED
|
||||
snapshot.retry_at = timezone.now()
|
||||
snapshot.save()
|
||||
|
||||
stats['reconciled'] += 1 if has_directory else 0
|
||||
stats['queued'] += 1
|
||||
stats["reconciled"] += 1 if has_directory else 0
|
||||
stats["queued"] += 1
|
||||
except Exception as e:
|
||||
# Skip snapshots that can't be processed (e.g., missing crawl)
|
||||
print(f" [!] Skipping snapshot {snapshot.id}: {e}")
|
||||
continue
|
||||
|
||||
if stats['processed'] % batch_size == 0:
|
||||
if stats["processed"] % batch_size == 0:
|
||||
transaction.commit()
|
||||
print(f" [{stats['processed']}/{total}] Processed...")
|
||||
|
||||
@@ -341,31 +480,28 @@ def process_filtered_snapshots(
|
||||
filter_type: str,
|
||||
before: float | None,
|
||||
after: float | None,
|
||||
batch_size: int
|
||||
resume: str | None,
|
||||
batch_size: int,
|
||||
) -> dict[str, int]:
|
||||
"""Process snapshots matching filters (DB query only)."""
|
||||
from archivebox.core.models import Snapshot
|
||||
from django.db import transaction
|
||||
from django.utils import timezone
|
||||
from datetime import datetime
|
||||
|
||||
stats = {'processed': 0, 'reconciled': 0, 'queued': 0}
|
||||
stats = {"processed": 0, "reconciled": 0, "queued": 0}
|
||||
|
||||
snapshots = Snapshot.objects.all()
|
||||
|
||||
if filter_patterns:
|
||||
snapshots = _apply_pattern_filters(snapshots, list(filter_patterns), filter_type)
|
||||
|
||||
if before:
|
||||
snapshots = snapshots.filter(bookmarked_at__lt=datetime.fromtimestamp(before))
|
||||
if after:
|
||||
snapshots = snapshots.filter(bookmarked_at__gt=datetime.fromtimestamp(after))
|
||||
snapshots = _build_filtered_snapshots_queryset(
|
||||
filter_patterns=filter_patterns,
|
||||
filter_type=filter_type,
|
||||
before=before,
|
||||
after=after,
|
||||
resume=resume,
|
||||
)
|
||||
|
||||
total = snapshots.count()
|
||||
print(f'[*] Found {total} matching snapshots')
|
||||
print(f"[*] Found {total} matching snapshots")
|
||||
|
||||
for snapshot in snapshots.select_related('crawl').iterator(chunk_size=batch_size):
|
||||
stats['processed'] += 1
|
||||
for snapshot in snapshots.select_related("crawl").iterator(chunk_size=batch_size):
|
||||
stats["processed"] += 1
|
||||
|
||||
# Skip snapshots with missing crawl references
|
||||
if _get_snapshot_crawl(snapshot) is None:
|
||||
@@ -384,14 +520,14 @@ def process_filtered_snapshots(
|
||||
snapshot.retry_at = timezone.now()
|
||||
snapshot.save()
|
||||
|
||||
stats['reconciled'] += 1
|
||||
stats['queued'] += 1
|
||||
stats["reconciled"] += 1
|
||||
stats["queued"] += 1
|
||||
except Exception as e:
|
||||
# Skip snapshots that can't be processed
|
||||
print(f" [!] Skipping snapshot {snapshot.id}: {e}")
|
||||
continue
|
||||
|
||||
if stats['processed'] % batch_size == 0:
|
||||
if stats["processed"] % batch_size == 0:
|
||||
transaction.commit()
|
||||
print(f" [{stats['processed']}/{total}] Processed...")
|
||||
|
||||
@@ -405,9 +541,9 @@ def print_stats(stats: dict):
|
||||
|
||||
print(f"""
|
||||
[green]Update Complete[/green]
|
||||
Processed: {stats['processed']}
|
||||
Reconciled: {stats['reconciled']}
|
||||
Queued: {stats['queued']}
|
||||
Processed: {stats["processed"]}
|
||||
Reconciled: {stats["reconciled"]}
|
||||
Queued: {stats["queued"]}
|
||||
""")
|
||||
|
||||
|
||||
@@ -415,37 +551,50 @@ def print_combined_stats(stats_combined: dict):
|
||||
"""Print statistics for full mode."""
|
||||
from rich import print
|
||||
|
||||
s1 = stats_combined['phase1']
|
||||
s2 = stats_combined['phase2']
|
||||
s1 = stats_combined["phase1"]
|
||||
s2 = stats_combined["phase2"]
|
||||
|
||||
print(f"""
|
||||
[green]Archive Update Complete[/green]
|
||||
|
||||
Phase 1 (Drain Old Dirs):
|
||||
Checked: {s1.get('processed', 0)}
|
||||
Migrated: {s1.get('migrated', 0)}
|
||||
Skipped: {s1.get('skipped', 0)}
|
||||
Invalid: {s1.get('invalid', 0)}
|
||||
Checked: {s1.get("processed", 0)}
|
||||
Migrated: {s1.get("migrated", 0)}
|
||||
Skipped: {s1.get("skipped", 0)}
|
||||
Invalid: {s1.get("invalid", 0)}
|
||||
|
||||
Phase 2 (Process DB):
|
||||
Processed: {s2.get('processed', 0)}
|
||||
Reconciled: {s2.get('reconciled', 0)}
|
||||
Queued: {s2.get('queued', 0)}
|
||||
Processed: {s2.get("processed", 0)}
|
||||
Reconciled: {s2.get("reconciled", 0)}
|
||||
Queued: {s2.get("queued", 0)}
|
||||
""")
|
||||
|
||||
|
||||
def print_index_stats(stats: dict[str, Any]) -> None:
    """Print the search-reindex summary.

    Args:
        stats: Must contain the keys "processed", "reconciled", "queued"
            and "reindexed"; a missing key raises KeyError.
    """
    # Local import shadows the builtin print so the [green]...[/green] markup is rendered by rich.
    from rich import print

    print(f"""
[green]Search Reindex Complete[/green]
Processed: {stats["processed"]}
Reconciled: {stats["reconciled"]}
Queued: {stats["queued"]}
Reindexed: {stats["reindexed"]}
""")
|
||||
|
||||
|
||||
@click.command()
@click.option("--resume", type=str, help="Resume from timestamp")
@click.option("--before", type=float, help="Only snapshots before timestamp")
@click.option("--after", type=float, help="Only snapshots after timestamp")
@click.option("--filter-type", "-t", type=click.Choice(["exact", "substring", "regex", "domain", "tag", "timestamp"]), default="exact")
@click.option("--batch-size", type=int, default=100, help="Commit every N snapshots")
@click.option("--continuous", is_flag=True, help="Run continuously as background worker")
@click.option("--index-only", is_flag=True, help="Backfill available search indexes from existing archived content")
@click.argument("filter_patterns", nargs=-1)
@docstring(update.__doc__)
def main(**kwargs):
    # CLI entry point: click gathers the options and arguments declared above
    # into kwargs, which are forwarded unchanged to update().
    # The help text presumably comes from update.__doc__ via @docstring — TODO confirm.
    update(**kwargs)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -1,12 +1,12 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__package__ = "archivebox.cli"
|
||||
|
||||
import sys
|
||||
import os
|
||||
import platform
|
||||
from pathlib import Path
|
||||
from typing import Iterable
|
||||
from collections.abc import Iterable
|
||||
|
||||
import rich_click as click
|
||||
|
||||
@@ -14,19 +14,22 @@ from archivebox.misc.util import docstring, enforce_types
|
||||
|
||||
|
||||
@enforce_types
|
||||
def version(quiet: bool=False,
|
||||
binaries: Iterable[str]=()) -> list[str]:
|
||||
def version(
|
||||
quiet: bool = False,
|
||||
binaries: Iterable[str] = (),
|
||||
) -> list[str]:
|
||||
"""Print the ArchiveBox version, debug metadata, and installed dependency versions"""
|
||||
|
||||
|
||||
# fast path for just getting the version and exiting; don't do any slower imports
|
||||
from archivebox.config.version import VERSION
|
||||
|
||||
print(VERSION)
|
||||
if quiet or '--version' in sys.argv:
|
||||
if quiet or "--version" in sys.argv:
|
||||
return []
|
||||
|
||||
|
||||
from rich.panel import Panel
|
||||
from rich.console import Console
|
||||
|
||||
|
||||
from archivebox.config import CONSTANTS, DATA_DIR
|
||||
from archivebox.config.version import get_COMMIT_HASH, get_BUILD_TIME
|
||||
from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, RUNNING_AS_UID, RUNNING_AS_GID, IN_DOCKER
|
||||
@@ -34,78 +37,89 @@ def version(quiet: bool=False,
|
||||
from archivebox.config.common import SHELL_CONFIG, STORAGE_CONFIG, SEARCH_BACKEND_CONFIG
|
||||
from archivebox.misc.logging_util import printable_folder_status
|
||||
from archivebox.config.configset import get_config
|
||||
|
||||
|
||||
console = Console()
|
||||
prnt = console.print
|
||||
|
||||
|
||||
# Check if LDAP is enabled (simple config lookup)
|
||||
config = get_config()
|
||||
LDAP_ENABLED = config.get('LDAP_ENABLED', False)
|
||||
LDAP_ENABLED = config.get("LDAP_ENABLED", False)
|
||||
|
||||
p = platform.uname()
|
||||
COMMIT_HASH = get_COMMIT_HASH()
|
||||
prnt(
|
||||
'[dark_green]ArchiveBox[/dark_green] [dark_goldenrod]v{}[/dark_goldenrod]'.format(CONSTANTS.VERSION),
|
||||
f'COMMIT_HASH={COMMIT_HASH[:7] if COMMIT_HASH else "unknown"}',
|
||||
f'BUILD_TIME={get_BUILD_TIME()}',
|
||||
f"[dark_green]ArchiveBox[/dark_green] [dark_goldenrod]v{CONSTANTS.VERSION}[/dark_goldenrod]",
|
||||
f"COMMIT_HASH={COMMIT_HASH[:7] if COMMIT_HASH else 'unknown'}",
|
||||
f"BUILD_TIME={get_BUILD_TIME()}",
|
||||
)
|
||||
prnt(
|
||||
f'IN_DOCKER={IN_DOCKER}',
|
||||
f'IN_QEMU={SHELL_CONFIG.IN_QEMU}',
|
||||
f'ARCH={p.machine}',
|
||||
f'OS={p.system}',
|
||||
f'PLATFORM={platform.platform()}',
|
||||
f'PYTHON={sys.implementation.name.title()}' + (' (venv)' if CONSTANTS.IS_INSIDE_VENV else ''),
|
||||
f"IN_DOCKER={IN_DOCKER}",
|
||||
f"IN_QEMU={SHELL_CONFIG.IN_QEMU}",
|
||||
f"ARCH={p.machine}",
|
||||
f"OS={p.system}",
|
||||
f"PLATFORM={platform.platform()}",
|
||||
f"PYTHON={sys.implementation.name.title()}" + (" (venv)" if CONSTANTS.IS_INSIDE_VENV else ""),
|
||||
)
|
||||
|
||||
|
||||
try:
|
||||
OUTPUT_IS_REMOTE_FS = get_data_locations().DATA_DIR.is_mount or get_data_locations().ARCHIVE_DIR.is_mount
|
||||
except Exception:
|
||||
OUTPUT_IS_REMOTE_FS = False
|
||||
|
||||
|
||||
try:
|
||||
DATA_DIR_STAT = CONSTANTS.DATA_DIR.stat()
|
||||
prnt(
|
||||
f'EUID={os.geteuid()}:{os.getegid()} UID={RUNNING_AS_UID}:{RUNNING_AS_GID} PUID={ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}',
|
||||
f'FS_UID={DATA_DIR_STAT.st_uid}:{DATA_DIR_STAT.st_gid}',
|
||||
f'FS_PERMS={STORAGE_CONFIG.OUTPUT_PERMISSIONS}',
|
||||
f'FS_ATOMIC={STORAGE_CONFIG.ENFORCE_ATOMIC_WRITES}',
|
||||
f'FS_REMOTE={OUTPUT_IS_REMOTE_FS}',
|
||||
f"EUID={os.geteuid()}:{os.getegid()} UID={RUNNING_AS_UID}:{RUNNING_AS_GID} PUID={ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}",
|
||||
f"FS_UID={DATA_DIR_STAT.st_uid}:{DATA_DIR_STAT.st_gid}",
|
||||
f"FS_PERMS={STORAGE_CONFIG.OUTPUT_PERMISSIONS}",
|
||||
f"FS_ATOMIC={STORAGE_CONFIG.ENFORCE_ATOMIC_WRITES}",
|
||||
f"FS_REMOTE={OUTPUT_IS_REMOTE_FS}",
|
||||
)
|
||||
except Exception:
|
||||
prnt(
|
||||
f'EUID={os.geteuid()}:{os.getegid()} UID={RUNNING_AS_UID}:{RUNNING_AS_GID} PUID={ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}',
|
||||
f"EUID={os.geteuid()}:{os.getegid()} UID={RUNNING_AS_UID}:{RUNNING_AS_GID} PUID={ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}",
|
||||
)
|
||||
|
||||
|
||||
prnt(
|
||||
f'DEBUG={SHELL_CONFIG.DEBUG}',
|
||||
f'IS_TTY={SHELL_CONFIG.IS_TTY}',
|
||||
f'SUDO={CONSTANTS.IS_ROOT}',
|
||||
f'ID={CONSTANTS.MACHINE_ID}:{CONSTANTS.COLLECTION_ID}',
|
||||
f'SEARCH_BACKEND={SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE}',
|
||||
f'LDAP={LDAP_ENABLED}',
|
||||
f"DEBUG={SHELL_CONFIG.DEBUG}",
|
||||
f"IS_TTY={SHELL_CONFIG.IS_TTY}",
|
||||
f"SUDO={CONSTANTS.IS_ROOT}",
|
||||
f"ID={CONSTANTS.MACHINE_ID}:{CONSTANTS.COLLECTION_ID}",
|
||||
f"SEARCH_BACKEND={SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE}",
|
||||
f"LDAP={LDAP_ENABLED}",
|
||||
)
|
||||
prnt()
|
||||
|
||||
|
||||
if not (os.access(CONSTANTS.ARCHIVE_DIR, os.R_OK) and os.access(CONSTANTS.CONFIG_FILE, os.R_OK)):
|
||||
PANEL_TEXT = '\n'.join((
|
||||
'',
|
||||
'[violet]Hint:[/violet] [green]cd[/green] into a collection [blue]DATA_DIR[/blue] and run [green]archivebox version[/green] again...',
|
||||
' [grey53]OR[/grey53] run [green]archivebox init[/green] to create a new collection in the current dir.',
|
||||
'',
|
||||
' [i][grey53](this is [red]REQUIRED[/red] if you are opening a Github Issue to get help)[/grey53][/i]',
|
||||
'',
|
||||
))
|
||||
prnt(Panel(PANEL_TEXT, expand=False, border_style='grey53', title='[red]:exclamation: No collection [blue]DATA_DIR[/blue] is currently active[/red]', subtitle='Full version info is only available when inside a collection [light_slate_blue]DATA DIR[/light_slate_blue]'))
|
||||
PANEL_TEXT = "\n".join(
|
||||
(
|
||||
"",
|
||||
"[violet]Hint:[/violet] [green]cd[/green] into a collection [blue]DATA_DIR[/blue] and run [green]archivebox version[/green] again...",
|
||||
" [grey53]OR[/grey53] run [green]archivebox init[/green] to create a new collection in the current dir.",
|
||||
"",
|
||||
" [i][grey53](this is [red]REQUIRED[/red] if you are opening a Github Issue to get help)[/grey53][/i]",
|
||||
"",
|
||||
),
|
||||
)
|
||||
prnt(
|
||||
Panel(
|
||||
PANEL_TEXT,
|
||||
expand=False,
|
||||
border_style="grey53",
|
||||
title="[red]:exclamation: No collection [blue]DATA_DIR[/blue] is currently active[/red]",
|
||||
subtitle="Full version info is only available when inside a collection [light_slate_blue]DATA DIR[/light_slate_blue]",
|
||||
),
|
||||
)
|
||||
prnt()
|
||||
return []
|
||||
|
||||
prnt('[pale_green1][i] Binary Dependencies:[/pale_green1]')
|
||||
prnt("[pale_green1][i] Binary Dependencies:[/pale_green1]")
|
||||
failures = []
|
||||
|
||||
# Setup Django before importing models
|
||||
try:
|
||||
from archivebox.config.django import setup_django
|
||||
|
||||
setup_django()
|
||||
|
||||
from archivebox.machine.models import Machine, Binary
|
||||
@@ -113,12 +127,17 @@ def version(quiet: bool=False,
|
||||
machine = Machine.current()
|
||||
|
||||
# Get all binaries from the database with timeout protection
|
||||
all_installed = Binary.objects.filter(
|
||||
machine=machine
|
||||
).exclude(abspath='').exclude(abspath__isnull=True).order_by('name')
|
||||
all_installed = (
|
||||
Binary.objects.filter(
|
||||
machine=machine,
|
||||
)
|
||||
.exclude(abspath="")
|
||||
.exclude(abspath__isnull=True)
|
||||
.order_by("name")
|
||||
)
|
||||
|
||||
if not all_installed.exists():
|
||||
prnt('', '[grey53]No binaries detected. Run [green]archivebox install[/green] to detect dependencies.[/grey53]')
|
||||
prnt("", "[grey53]No binaries detected. Run [green]archivebox install[/green] to detect dependencies.[/grey53]")
|
||||
else:
|
||||
for installed in all_installed:
|
||||
# Skip if user specified specific binaries and this isn't one
|
||||
@@ -126,71 +145,91 @@ def version(quiet: bool=False,
|
||||
continue
|
||||
|
||||
if installed.is_valid:
|
||||
display_path = installed.abspath.replace(str(DATA_DIR), '.').replace(str(Path('~').expanduser()), '~')
|
||||
version_str = (installed.version or 'unknown')[:15]
|
||||
provider = (installed.binprovider or 'env')[:8]
|
||||
prnt('', '[green]√[/green]', '', installed.name.ljust(18), version_str.ljust(16), provider.ljust(8), display_path, overflow='ignore', crop=False)
|
||||
display_path = installed.abspath.replace(str(DATA_DIR), ".").replace(str(Path("~").expanduser()), "~")
|
||||
version_str = (installed.version or "unknown")[:15]
|
||||
provider = (installed.binprovider or "env")[:8]
|
||||
prnt(
|
||||
"",
|
||||
"[green]√[/green]",
|
||||
"",
|
||||
installed.name.ljust(18),
|
||||
version_str.ljust(16),
|
||||
provider.ljust(8),
|
||||
display_path,
|
||||
overflow="ignore",
|
||||
crop=False,
|
||||
)
|
||||
else:
|
||||
prnt('', '[red]X[/red]', '', installed.name.ljust(18), '[grey53]not installed[/grey53]', overflow='ignore', crop=False)
|
||||
prnt("", "[red]X[/red]", "", installed.name.ljust(18), "[grey53]not installed[/grey53]", overflow="ignore", crop=False)
|
||||
failures.append(installed.name)
|
||||
|
||||
# Show hint if no binaries are installed yet
|
||||
has_any_installed = Binary.objects.filter(machine=machine).exclude(abspath='').exists()
|
||||
has_any_installed = Binary.objects.filter(machine=machine).exclude(abspath="").exists()
|
||||
if not has_any_installed:
|
||||
prnt()
|
||||
prnt('', '[grey53]Run [green]archivebox install[/green] to detect and install dependencies.[/grey53]')
|
||||
prnt("", "[grey53]Run [green]archivebox install[/green] to detect and install dependencies.[/grey53]")
|
||||
|
||||
except Exception as e:
|
||||
# Handle database errors gracefully (locked, missing, etc.)
|
||||
prnt()
|
||||
prnt('', f'[yellow]Warning: Could not query binaries from database: {e}[/yellow]')
|
||||
prnt('', '[grey53]Run [green]archivebox init[/green] and [green]archivebox install[/green] to set up dependencies.[/grey53]')
|
||||
prnt("", f"[yellow]Warning: Could not query binaries from database: {e}[/yellow]")
|
||||
prnt("", "[grey53]Run [green]archivebox init[/green] and [green]archivebox install[/green] to set up dependencies.[/grey53]")
|
||||
|
||||
if not binaries:
|
||||
# Show code and data locations
|
||||
prnt()
|
||||
prnt('[deep_sky_blue3][i] Code locations:[/deep_sky_blue3]')
|
||||
prnt("[deep_sky_blue3][i] Code locations:[/deep_sky_blue3]")
|
||||
try:
|
||||
for name, path in get_code_locations().items():
|
||||
if isinstance(name, str) and isinstance(path, dict):
|
||||
prnt(printable_folder_status(name, path), overflow='ignore', crop=False)
|
||||
prnt(printable_folder_status(name, path), overflow="ignore", crop=False)
|
||||
except Exception as e:
|
||||
prnt(f' [red]Error getting code locations: {e}[/red]')
|
||||
prnt(f" [red]Error getting code locations: {e}[/red]")
|
||||
|
||||
prnt()
|
||||
if os.access(CONSTANTS.ARCHIVE_DIR, os.R_OK) or os.access(CONSTANTS.CONFIG_FILE, os.R_OK):
|
||||
prnt('[bright_yellow][i] Data locations:[/bright_yellow]')
|
||||
prnt("[bright_yellow][i] Data locations:[/bright_yellow]")
|
||||
try:
|
||||
for name, path in get_data_locations().items():
|
||||
if isinstance(name, str) and isinstance(path, dict):
|
||||
prnt(printable_folder_status(name, path), overflow='ignore', crop=False)
|
||||
prnt(printable_folder_status(name, path), overflow="ignore", crop=False)
|
||||
except Exception as e:
|
||||
prnt(f' [red]Error getting data locations: {e}[/red]')
|
||||
|
||||
prnt(f" [red]Error getting data locations: {e}[/red]")
|
||||
|
||||
try:
|
||||
from archivebox.misc.checks import check_data_dir_permissions
|
||||
|
||||
check_data_dir_permissions()
|
||||
except Exception:
|
||||
pass
|
||||
else:
|
||||
prnt()
|
||||
prnt('[red][i] Data locations:[/red] (not in a data directory)')
|
||||
|
||||
prnt("[red][i] Data locations:[/red] (not in a data directory)")
|
||||
|
||||
prnt()
|
||||
|
||||
|
||||
if failures:
|
||||
prnt('[red]Error:[/red] [yellow]Failed to detect the following binaries:[/yellow]')
|
||||
prnt(f' [red]{", ".join(failures)}[/red]')
|
||||
prnt("[red]Error:[/red] [yellow]Failed to detect the following binaries:[/yellow]")
|
||||
prnt(f" [red]{', '.join(failures)}[/red]")
|
||||
prnt()
|
||||
prnt('[violet]Hint:[/violet] To install missing binaries automatically, run:')
|
||||
prnt(' [green]archivebox install[/green]')
|
||||
prnt("[violet]Hint:[/violet] To install missing binaries automatically, run:")
|
||||
prnt(" [green]archivebox install[/green]")
|
||||
prnt()
|
||||
return failures
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option('--quiet', '-q', is_flag=True, help='Only print ArchiveBox version number and nothing else. (equivalent to archivebox --version)')
|
||||
@click.option('--binaries', '-b', help='Select binaries to detect DEFAULT=curl,wget,git,yt-dlp,chrome,single-file,readability-extractor,postlight-parser,... (all)')
|
||||
@click.option(
|
||||
"--quiet",
|
||||
"-q",
|
||||
is_flag=True,
|
||||
help="Only print ArchiveBox version number and nothing else. (equivalent to archivebox --version)",
|
||||
)
|
||||
@click.option(
|
||||
"--binaries",
|
||||
"-b",
|
||||
help="Select binaries to detect DEFAULT=curl,wget,git,yt-dlp,chrome,single-file,readability-extractor,postlight-parser,... (all)",
|
||||
)
|
||||
@docstring(version.__doc__)
|
||||
def main(**kwargs):
|
||||
failures = version(**kwargs)
|
||||
@@ -198,5 +237,5 @@ def main(**kwargs):
|
||||
raise SystemExit(1)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -5,12 +5,10 @@ This module contains common utilities used across multiple CLI commands,
|
||||
extracted to avoid code duplication.
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
|
||||
from typing import Optional
|
||||
__package__ = "archivebox.cli"
|
||||
|
||||
|
||||
def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None):
|
||||
def apply_filters(queryset, filter_kwargs: dict, limit: int | None = None):
|
||||
"""
|
||||
Apply Django-style filters from CLI kwargs to a QuerySet.
|
||||
|
||||
@@ -31,11 +29,11 @@ def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None):
|
||||
"""
|
||||
filters = {}
|
||||
for key, value in filter_kwargs.items():
|
||||
if value is None or key in ('limit', 'offset'):
|
||||
if value is None or key in ("limit", "offset"):
|
||||
continue
|
||||
# Handle CSV lists for __in filters
|
||||
if key.endswith('__in') and isinstance(value, str):
|
||||
value = [v.strip() for v in value.split(',')]
|
||||
if key.endswith("__in") and isinstance(value, str):
|
||||
value = [v.strip() for v in value.split(",")]
|
||||
filters[key] = value
|
||||
|
||||
if filters:
|
||||
|
||||
@@ -5,16 +5,16 @@ This module provides backwards-compatible config exports for extractors
|
||||
and other modules that expect to import config values directly.
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.config'
|
||||
__package__ = "archivebox.config"
|
||||
__order__ = 200
|
||||
|
||||
from .paths import (
|
||||
PACKAGE_DIR, # noqa
|
||||
DATA_DIR, # noqa
|
||||
ARCHIVE_DIR, # noqa
|
||||
PACKAGE_DIR,
|
||||
DATA_DIR,
|
||||
ARCHIVE_DIR,
|
||||
)
|
||||
from .constants import CONSTANTS, CONSTANTS_CONFIG, PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR # noqa
|
||||
from .version import VERSION # noqa
|
||||
from .constants import CONSTANTS, CONSTANTS_CONFIG, PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR # noqa
|
||||
from .version import VERSION # noqa
|
||||
|
||||
|
||||
###############################################################################
|
||||
@@ -22,15 +22,18 @@ from .version import VERSION # noqa
|
||||
# These provide backwards compatibility with extractors that import from ..config
|
||||
###############################################################################
|
||||
|
||||
|
||||
def _get_config():
|
||||
"""Lazy import to avoid circular imports."""
|
||||
from .common import ARCHIVING_CONFIG, STORAGE_CONFIG
|
||||
|
||||
return ARCHIVING_CONFIG, STORAGE_CONFIG
|
||||
|
||||
|
||||
# Direct exports (evaluated at import time for backwards compat)
|
||||
# These are recalculated each time the module attribute is accessed
|
||||
|
||||
|
||||
def __getattr__(name: str):
|
||||
"""
|
||||
Module-level __getattr__ for lazy config loading.
|
||||
@@ -40,38 +43,38 @@ def __getattr__(name: str):
|
||||
"""
|
||||
|
||||
# Generic timeout settings (used by multiple plugins)
|
||||
if name == 'TIMEOUT':
|
||||
if name == "TIMEOUT":
|
||||
cfg, _ = _get_config()
|
||||
return cfg.TIMEOUT
|
||||
|
||||
# Generic SSL/Security settings (used by multiple plugins)
|
||||
if name == 'CHECK_SSL_VALIDITY':
|
||||
if name == "CHECK_SSL_VALIDITY":
|
||||
cfg, _ = _get_config()
|
||||
return cfg.CHECK_SSL_VALIDITY
|
||||
|
||||
# Generic storage settings (used by multiple plugins)
|
||||
if name == 'RESTRICT_FILE_NAMES':
|
||||
if name == "RESTRICT_FILE_NAMES":
|
||||
_, storage = _get_config()
|
||||
return storage.RESTRICT_FILE_NAMES
|
||||
|
||||
# Generic user agent / cookies (used by multiple plugins)
|
||||
if name == 'COOKIES_FILE':
|
||||
if name == "COOKIES_FILE":
|
||||
cfg, _ = _get_config()
|
||||
return cfg.COOKIES_FILE
|
||||
if name == 'USER_AGENT':
|
||||
if name == "USER_AGENT":
|
||||
cfg, _ = _get_config()
|
||||
return cfg.USER_AGENT
|
||||
|
||||
# Generic resolution settings (used by multiple plugins)
|
||||
if name == 'RESOLUTION':
|
||||
if name == "RESOLUTION":
|
||||
cfg, _ = _get_config()
|
||||
return cfg.RESOLUTION
|
||||
|
||||
# Allowlist/Denylist patterns (compiled regexes)
|
||||
if name == 'SAVE_ALLOWLIST_PTN':
|
||||
if name == "SAVE_ALLOWLIST_PTN":
|
||||
cfg, _ = _get_config()
|
||||
return cfg.SAVE_ALLOWLIST_PTNS
|
||||
if name == 'SAVE_DENYLIST_PTN':
|
||||
if name == "SAVE_DENYLIST_PTN":
|
||||
cfg, _ = _get_config()
|
||||
return cfg.SAVE_DENYLIST_PTNS
|
||||
|
||||
@@ -90,12 +93,13 @@ def get_CONFIG():
|
||||
SEARCH_BACKEND_CONFIG,
|
||||
)
|
||||
from .ldap import LDAP_CONFIG
|
||||
|
||||
return {
|
||||
'SHELL_CONFIG': SHELL_CONFIG,
|
||||
'STORAGE_CONFIG': STORAGE_CONFIG,
|
||||
'GENERAL_CONFIG': GENERAL_CONFIG,
|
||||
'SERVER_CONFIG': SERVER_CONFIG,
|
||||
'ARCHIVING_CONFIG': ARCHIVING_CONFIG,
|
||||
'SEARCHBACKEND_CONFIG': SEARCH_BACKEND_CONFIG,
|
||||
'LDAP_CONFIG': LDAP_CONFIG,
|
||||
"SHELL_CONFIG": SHELL_CONFIG,
|
||||
"STORAGE_CONFIG": STORAGE_CONFIG,
|
||||
"GENERAL_CONFIG": GENERAL_CONFIG,
|
||||
"SERVER_CONFIG": SERVER_CONFIG,
|
||||
"ARCHIVING_CONFIG": ARCHIVING_CONFIG,
|
||||
"SEARCHBACKEND_CONFIG": SEARCH_BACKEND_CONFIG,
|
||||
"LDAP_CONFIG": LDAP_CONFIG,
|
||||
}
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
__package__ = 'archivebox.config'
|
||||
__package__ = "archivebox.config"
|
||||
|
||||
import os
|
||||
import json
|
||||
from typing import Any, Optional, Type, Tuple, Dict
|
||||
from typing import Any
|
||||
|
||||
from pathlib import Path
|
||||
from configparser import ConfigParser
|
||||
@@ -27,13 +27,15 @@ def get_real_name(key: str) -> str:
|
||||
return key
|
||||
|
||||
|
||||
def load_config_val(key: str,
|
||||
default: Any=None,
|
||||
type: Optional[Type]=None,
|
||||
aliases: Optional[Tuple[str, ...]]=None,
|
||||
config: Optional[benedict]=None,
|
||||
env_vars: Optional[os._Environ]=None,
|
||||
config_file_vars: Optional[Dict[str, str]]=None) -> Any:
|
||||
def load_config_val(
|
||||
key: str,
|
||||
default: Any = None,
|
||||
type: type | None = None,
|
||||
aliases: tuple[str, ...] | None = None,
|
||||
config: benedict | None = None,
|
||||
env_vars: os._Environ | None = None,
|
||||
config_file_vars: dict[str, str] | None = None,
|
||||
) -> Any:
|
||||
"""parse bool, int, and str key=value pairs from env"""
|
||||
|
||||
assert isinstance(config, dict)
|
||||
@@ -67,8 +69,8 @@ def load_config_val(key: str,
|
||||
assert isinstance(val, str)
|
||||
|
||||
# calculate value based on expected type
|
||||
BOOL_TRUEIES = ('true', 'yes', '1')
|
||||
BOOL_FALSEIES = ('false', 'no', '0')
|
||||
BOOL_TRUEIES = ("true", "yes", "1")
|
||||
BOOL_FALSEIES = ("false", "no", "0")
|
||||
|
||||
if type is bool:
|
||||
if val.lower() in BOOL_TRUEIES:
|
||||
@@ -76,28 +78,28 @@ def load_config_val(key: str,
|
||||
elif val.lower() in BOOL_FALSEIES:
|
||||
return False
|
||||
else:
|
||||
raise ValueError(f'Invalid configuration option {key}={val} (expected a boolean: True/False)')
|
||||
raise ValueError(f"Invalid configuration option {key}={val} (expected a boolean: True/False)")
|
||||
|
||||
elif type is str:
|
||||
if val.lower() in (*BOOL_TRUEIES, *BOOL_FALSEIES):
|
||||
raise ValueError(f'Invalid configuration option {key}={val} (expected a string, but value looks like a boolean)')
|
||||
raise ValueError(f"Invalid configuration option {key}={val} (expected a string, but value looks like a boolean)")
|
||||
return val.strip()
|
||||
|
||||
elif type is int:
|
||||
if not val.strip().isdigit():
|
||||
raise ValueError(f'Invalid configuration option {key}={val} (expected an integer)')
|
||||
raise ValueError(f"Invalid configuration option {key}={val} (expected an integer)")
|
||||
return int(val.strip())
|
||||
|
||||
elif type is list or type is dict:
|
||||
return json.loads(val)
|
||||
|
||||
|
||||
elif type is Path:
|
||||
return Path(val)
|
||||
|
||||
raise Exception('Config values can only be str, bool, int, or json')
|
||||
raise Exception("Config values can only be str, bool, int, or json")
|
||||
|
||||
|
||||
def load_config_file() -> Optional[benedict]:
|
||||
def load_config_file() -> benedict | None:
|
||||
"""load the ini-formatted config file from DATA_DIR/Archivebox.conf"""
|
||||
|
||||
config_path = CONSTANTS.CONFIG_FILE
|
||||
@@ -105,17 +107,16 @@ def load_config_file() -> Optional[benedict]:
|
||||
config_file = CaseConfigParser()
|
||||
config_file.read(config_path)
|
||||
# flatten into one namespace
|
||||
config_file_vars = benedict({
|
||||
key.upper(): val
|
||||
for section, options in config_file.items()
|
||||
for key, val in options.items()
|
||||
})
|
||||
config_file_vars = benedict({key.upper(): val for section, options in config_file.items() for key, val in options.items()})
|
||||
# print('[i] Loaded config file', os.path.abspath(config_path))
|
||||
# print(config_file_vars)
|
||||
return config_file_vars
|
||||
return None
|
||||
|
||||
|
||||
class PluginConfigSection:
|
||||
"""Pseudo-section for all plugin config keys written to [PLUGINS] section in ArchiveBox.conf"""
|
||||
|
||||
toml_section_header = "PLUGINS"
|
||||
|
||||
def __init__(self, key: str):
|
||||
@@ -144,8 +145,14 @@ def section_for_key(key: str) -> Any:
|
||||
)
|
||||
|
||||
# First check core config sections
|
||||
for section in [SHELL_CONFIG, STORAGE_CONFIG, GENERAL_CONFIG,
|
||||
SERVER_CONFIG, ARCHIVING_CONFIG, SEARCH_BACKEND_CONFIG]:
|
||||
for section in [
|
||||
SHELL_CONFIG,
|
||||
STORAGE_CONFIG,
|
||||
GENERAL_CONFIG,
|
||||
SERVER_CONFIG,
|
||||
ARCHIVING_CONFIG,
|
||||
SEARCH_BACKEND_CONFIG,
|
||||
]:
|
||||
if hasattr(section, key):
|
||||
return section
|
||||
|
||||
@@ -154,20 +161,19 @@ def section_for_key(key: str) -> Any:
|
||||
|
||||
plugin_configs = discover_plugin_configs()
|
||||
for plugin_name, schema in plugin_configs.items():
|
||||
if 'properties' in schema and key in schema['properties']:
|
||||
if "properties" in schema and key in schema["properties"]:
|
||||
# All plugin config goes to [PLUGINS] section
|
||||
return PluginConfigSection(key)
|
||||
|
||||
raise ValueError(f'No config section found for key: {key}')
|
||||
raise ValueError(f"No config section found for key: {key}")
|
||||
|
||||
|
||||
def write_config_file(config: Dict[str, str]) -> benedict:
|
||||
def write_config_file(config: dict[str, str]) -> benedict:
|
||||
"""load the ini-formatted config file from DATA_DIR/Archivebox.conf"""
|
||||
|
||||
from archivebox.misc.system import atomic_write
|
||||
|
||||
CONFIG_HEADER = (
|
||||
"""# This is the config file for your ArchiveBox collection.
|
||||
CONFIG_HEADER = """# This is the config file for your ArchiveBox collection.
|
||||
#
|
||||
# You can add options here manually in INI format, or automatically by running:
|
||||
# archivebox config --set KEY=VALUE
|
||||
@@ -178,7 +184,7 @@ def write_config_file(config: Dict[str, str]) -> benedict:
|
||||
# A list of all possible config with documentation and examples can be found here:
|
||||
# https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration
|
||||
|
||||
""")
|
||||
"""
|
||||
|
||||
config_path = CONSTANTS.CONFIG_FILE
|
||||
|
||||
@@ -188,57 +194,56 @@ def write_config_file(config: Dict[str, str]) -> benedict:
|
||||
config_file = CaseConfigParser()
|
||||
config_file.read(config_path)
|
||||
|
||||
with open(config_path, 'r', encoding='utf-8') as old:
|
||||
atomic_write(f'{config_path}.bak', old.read())
|
||||
with open(config_path, encoding="utf-8") as old:
|
||||
atomic_write(f"{config_path}.bak", old.read())
|
||||
|
||||
# Set up sections in empty config file
|
||||
for key, val in config.items():
|
||||
section = section_for_key(key)
|
||||
assert section is not None
|
||||
|
||||
if not hasattr(section, 'toml_section_header'):
|
||||
raise ValueError(f'{key} is read-only (defined in {type(section).__module__}.{type(section).__name__}). Refusing to set.')
|
||||
|
||||
|
||||
if not hasattr(section, "toml_section_header"):
|
||||
raise ValueError(f"{key} is read-only (defined in {type(section).__module__}.{type(section).__name__}). Refusing to set.")
|
||||
|
||||
section_name = section.toml_section_header
|
||||
|
||||
|
||||
if section_name in config_file:
|
||||
existing_config = dict(config_file[section_name])
|
||||
else:
|
||||
existing_config = {}
|
||||
|
||||
|
||||
config_file[section_name] = benedict({**existing_config, key: val})
|
||||
section.update_in_place(warn=False, persist=False, **{key: val})
|
||||
|
||||
with open(config_path, 'w+', encoding='utf-8') as new:
|
||||
with open(config_path, "w+", encoding="utf-8") as new:
|
||||
config_file.write(new)
|
||||
|
||||
updated_config = {}
|
||||
try:
|
||||
# validate the updated_config by attempting to re-parse it
|
||||
from archivebox.config.configset import get_flat_config
|
||||
|
||||
updated_config = {**load_all_config(), **get_flat_config()}
|
||||
except BaseException: # lgtm [py/catch-base-exception]
|
||||
except BaseException: # lgtm [py/catch-base-exception]
|
||||
# something went horribly wrong, revert to the previous version
|
||||
with open(f'{config_path}.bak', 'r', encoding='utf-8') as old:
|
||||
with open(f"{config_path}.bak", encoding="utf-8") as old:
|
||||
atomic_write(config_path, old.read())
|
||||
|
||||
raise
|
||||
|
||||
if os.access(f'{config_path}.bak', os.F_OK):
|
||||
os.remove(f'{config_path}.bak')
|
||||
if os.access(f"{config_path}.bak", os.F_OK):
|
||||
os.remove(f"{config_path}.bak")
|
||||
|
||||
return benedict({
|
||||
key.upper(): updated_config.get(key.upper())
|
||||
for key in config.keys()
|
||||
})
|
||||
return benedict({key.upper(): updated_config.get(key.upper()) for key in config.keys()})
|
||||
|
||||
|
||||
|
||||
def load_config(defaults: Dict[str, Any],
|
||||
config: Optional[benedict]=None,
|
||||
out_dir: Optional[str]=None,
|
||||
env_vars: Optional[os._Environ]=None,
|
||||
config_file_vars: Optional[Dict[str, str]]=None) -> benedict:
|
||||
def load_config(
|
||||
defaults: dict[str, Any],
|
||||
config: benedict | None = None,
|
||||
out_dir: str | None = None,
|
||||
env_vars: os._Environ | None = None,
|
||||
config_file_vars: dict[str, str] | None = None,
|
||||
) -> benedict:
|
||||
|
||||
env_vars = env_vars or os.environ
|
||||
config_file_vars = config_file_vars or load_config_file()
|
||||
@@ -249,9 +254,9 @@ def load_config(defaults: Dict[str, Any],
|
||||
# print('LOADING CONFIG KEY:', key, 'DEFAULT=', default)
|
||||
extended_config[key] = load_config_val(
|
||||
key,
|
||||
default=default['default'],
|
||||
type=default.get('type'),
|
||||
aliases=default.get('aliases'),
|
||||
default=default["default"],
|
||||
type=default.get("type"),
|
||||
aliases=default.get("aliases"),
|
||||
config=extended_config,
|
||||
env_vars=env_vars,
|
||||
config_file_vars=config_file_vars,
|
||||
@@ -260,19 +265,20 @@ def load_config(defaults: Dict[str, Any],
|
||||
raise SystemExit(0)
|
||||
except Exception as e:
|
||||
stderr()
|
||||
stderr(f'[X] Error while loading configuration value: {key}', color='red', config=extended_config)
|
||||
stderr(' {}: {}'.format(e.__class__.__name__, e))
|
||||
stderr(f"[X] Error while loading configuration value: {key}", color="red", config=extended_config)
|
||||
stderr(f" {e.__class__.__name__}: {e}")
|
||||
stderr()
|
||||
stderr(' Check your config for mistakes and try again (your archive data is unaffected).')
|
||||
stderr(" Check your config for mistakes and try again (your archive data is unaffected).")
|
||||
stderr()
|
||||
stderr(' For config documentation and examples see:')
|
||||
stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration')
|
||||
stderr(" For config documentation and examples see:")
|
||||
stderr(" https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration")
|
||||
stderr()
|
||||
# raise
|
||||
# raise SystemExit(2)
|
||||
|
||||
return benedict(extended_config)
|
||||
|
||||
|
||||
def load_all_config():
|
||||
"""Load all config sections and return as a flat dict."""
|
||||
from archivebox.config.common import (
|
||||
@@ -283,11 +289,17 @@ def load_all_config():
|
||||
ARCHIVING_CONFIG,
|
||||
SEARCH_BACKEND_CONFIG,
|
||||
)
|
||||
|
||||
|
||||
flat_config = benedict()
|
||||
|
||||
for config_section in [SHELL_CONFIG, STORAGE_CONFIG, GENERAL_CONFIG,
|
||||
SERVER_CONFIG, ARCHIVING_CONFIG, SEARCH_BACKEND_CONFIG]:
|
||||
|
||||
for config_section in [
|
||||
SHELL_CONFIG,
|
||||
STORAGE_CONFIG,
|
||||
GENERAL_CONFIG,
|
||||
SERVER_CONFIG,
|
||||
ARCHIVING_CONFIG,
|
||||
SEARCH_BACKEND_CONFIG,
|
||||
]:
|
||||
flat_config.update(dict(config_section))
|
||||
|
||||
|
||||
return flat_config
|
||||
|
||||
@@ -4,7 +4,7 @@ import re
|
||||
import secrets
|
||||
import sys
|
||||
import shutil
|
||||
from typing import ClassVar, Dict, Optional, List
|
||||
from typing import ClassVar
|
||||
from pathlib import Path
|
||||
|
||||
from rich.console import Console
|
||||
@@ -39,8 +39,8 @@ class ShellConfig(BaseConfigSet):
|
||||
IN_DOCKER: bool = Field(default=IN_DOCKER)
|
||||
IN_QEMU: bool = Field(default=False)
|
||||
|
||||
ANSI: Dict[str, str] = Field(
|
||||
default_factory=lambda: CONSTANTS.DEFAULT_CLI_COLORS if sys.stdout.isatty() else CONSTANTS.DISABLED_CLI_COLORS
|
||||
ANSI: dict[str, str] = Field(
|
||||
default_factory=lambda: CONSTANTS.DEFAULT_CLI_COLORS if sys.stdout.isatty() else CONSTANTS.DISABLED_CLI_COLORS,
|
||||
)
|
||||
|
||||
@property
|
||||
@@ -50,7 +50,7 @@ class ShellConfig(BaseConfigSet):
|
||||
return shutil.get_terminal_size((140, 10)).columns
|
||||
|
||||
@property
|
||||
def COMMIT_HASH(self) -> Optional[str]:
|
||||
def COMMIT_HASH(self) -> str | None:
|
||||
return get_COMMIT_HASH()
|
||||
|
||||
@property
|
||||
@@ -112,7 +112,7 @@ class ServerConfig(BaseConfigSet):
|
||||
"danger-onedomain-fullreplay",
|
||||
)
|
||||
|
||||
SECRET_KEY: str = Field(default_factory=lambda: ''.join(secrets.choice("abcdefghijklmnopqrstuvwxyz0123456789_") for _ in range(50)))
|
||||
SECRET_KEY: str = Field(default_factory=lambda: "".join(secrets.choice("abcdefghijklmnopqrstuvwxyz0123456789_") for _ in range(50)))
|
||||
BIND_ADDR: str = Field(default="127.0.0.1:8000")
|
||||
LISTEN_HOST: str = Field(default="archivebox.localhost:8000")
|
||||
ADMIN_BASE_URL: str = Field(default="")
|
||||
@@ -124,7 +124,7 @@ class ServerConfig(BaseConfigSet):
|
||||
SNAPSHOTS_PER_PAGE: int = Field(default=40)
|
||||
PREVIEW_ORIGINALS: bool = Field(default=True)
|
||||
FOOTER_INFO: str = Field(
|
||||
default="Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests."
|
||||
default="Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.",
|
||||
)
|
||||
# CUSTOM_TEMPLATES_DIR: Path = Field(default=None) # this is now a constant
|
||||
|
||||
@@ -132,8 +132,8 @@ class ServerConfig(BaseConfigSet):
|
||||
PUBLIC_SNAPSHOTS: bool = Field(default=True)
|
||||
PUBLIC_ADD_VIEW: bool = Field(default=False)
|
||||
|
||||
ADMIN_USERNAME: Optional[str] = Field(default=None)
|
||||
ADMIN_PASSWORD: Optional[str] = Field(default=None)
|
||||
ADMIN_USERNAME: str | None = Field(default=None)
|
||||
ADMIN_PASSWORD: str | None = Field(default=None)
|
||||
|
||||
REVERSE_PROXY_USER_HEADER: str = Field(default="Remote-User")
|
||||
REVERSE_PROXY_WHITELIST: str = Field(default="")
|
||||
@@ -234,22 +234,22 @@ class ArchivingConfig(BaseConfigSet):
|
||||
RESOLUTION: str = Field(default="1440,2000")
|
||||
CHECK_SSL_VALIDITY: bool = Field(default=True)
|
||||
USER_AGENT: str = Field(
|
||||
default=f"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)"
|
||||
default=f"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)",
|
||||
)
|
||||
COOKIES_FILE: Path | None = Field(default=None)
|
||||
|
||||
URL_DENYLIST: str = Field(default=r"\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$", alias="URL_BLACKLIST")
|
||||
URL_ALLOWLIST: str | None = Field(default=None, alias="URL_WHITELIST")
|
||||
|
||||
SAVE_ALLOWLIST: Dict[str, List[str]] = Field(default={}) # mapping of regex patterns to list of archive methods
|
||||
SAVE_DENYLIST: Dict[str, List[str]] = Field(default={})
|
||||
SAVE_ALLOWLIST: dict[str, list[str]] = Field(default={}) # mapping of regex patterns to list of archive methods
|
||||
SAVE_DENYLIST: dict[str, list[str]] = Field(default={})
|
||||
|
||||
DEFAULT_PERSONA: str = Field(default="Default")
|
||||
|
||||
def warn_if_invalid(self) -> None:
|
||||
if int(self.TIMEOUT) < 5:
|
||||
rprint(f"[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.TIMEOUT} seconds)[/red]", file=sys.stderr)
|
||||
rprint(" You must allow *at least* 5 seconds for indexing and archive methods to run succesfully.", file=sys.stderr)
|
||||
rprint(" You must allow *at least* 5 seconds for indexing and archive methods to run successfully.", file=sys.stderr)
|
||||
rprint(" (Setting it to somewhere between 30 and 3000 seconds is recommended)", file=sys.stderr)
|
||||
rprint(file=sys.stderr)
|
||||
rprint(" If you want to make ArchiveBox run faster, disable specific archive methods instead:", file=sys.stderr)
|
||||
@@ -274,7 +274,7 @@ class ArchivingConfig(BaseConfigSet):
|
||||
return re.compile(self.URL_DENYLIST, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS)
|
||||
|
||||
@property
|
||||
def SAVE_ALLOWLIST_PTNS(self) -> Dict[re.Pattern, List[str]]:
|
||||
def SAVE_ALLOWLIST_PTNS(self) -> dict[re.Pattern, list[str]]:
|
||||
return (
|
||||
{
|
||||
# regexp: methods list
|
||||
@@ -286,7 +286,7 @@ class ArchivingConfig(BaseConfigSet):
|
||||
)
|
||||
|
||||
@property
|
||||
def SAVE_DENYLIST_PTNS(self) -> Dict[re.Pattern, List[str]]:
|
||||
def SAVE_DENYLIST_PTNS(self) -> dict[re.Pattern, list[str]]:
|
||||
return (
|
||||
{
|
||||
# regexp: methods list
|
||||
|
||||
@@ -11,7 +11,7 @@ __package__ = "archivebox.config"
|
||||
import os
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Optional, Type, Tuple
|
||||
from typing import Any
|
||||
from configparser import ConfigParser
|
||||
|
||||
from pydantic_settings import BaseSettings, PydanticBaseSettingsSource, SettingsConfigDict
|
||||
@@ -28,17 +28,18 @@ class IniConfigSettingsSource(PydanticBaseSettingsSource):
|
||||
Flattens all sections into a single namespace.
|
||||
"""
|
||||
|
||||
def get_field_value(self, field: Any, field_name: str) -> Tuple[Any, str, bool]:
|
||||
def get_field_value(self, field: Any, field_name: str) -> tuple[Any, str, bool]:
|
||||
config_vals = self._load_config_file()
|
||||
field_value = config_vals.get(field_name.upper())
|
||||
return field_value, field_name, False
|
||||
|
||||
def __call__(self) -> Dict[str, Any]:
|
||||
def __call__(self) -> dict[str, Any]:
|
||||
return self._load_config_file()
|
||||
|
||||
def _load_config_file(self) -> Dict[str, Any]:
|
||||
def _load_config_file(self) -> dict[str, Any]:
|
||||
try:
|
||||
from archivebox.config.constants import CONSTANTS
|
||||
|
||||
config_path = CONSTANTS.CONFIG_FILE
|
||||
except ImportError:
|
||||
return {}
|
||||
@@ -78,25 +79,25 @@ class BaseConfigSet(BaseSettings):
|
||||
@classmethod
|
||||
def settings_customise_sources(
|
||||
cls,
|
||||
settings_cls: Type[BaseSettings],
|
||||
settings_cls: type[BaseSettings],
|
||||
init_settings: PydanticBaseSettingsSource,
|
||||
env_settings: PydanticBaseSettingsSource,
|
||||
dotenv_settings: PydanticBaseSettingsSource,
|
||||
file_secret_settings: PydanticBaseSettingsSource,
|
||||
) -> Tuple[PydanticBaseSettingsSource, ...]:
|
||||
) -> tuple[PydanticBaseSettingsSource, ...]:
|
||||
"""
|
||||
Define the order of settings sources (first = highest priority).
|
||||
"""
|
||||
return (
|
||||
init_settings, # 1. Passed to __init__
|
||||
env_settings, # 2. Environment variables
|
||||
init_settings, # 1. Passed to __init__
|
||||
env_settings, # 2. Environment variables
|
||||
IniConfigSettingsSource(settings_cls), # 3. ArchiveBox.conf file
|
||||
# dotenv_settings, # Skip .env files
|
||||
# file_secret_settings, # Skip secrets files
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def load_from_file(cls, config_path: Path) -> Dict[str, str]:
|
||||
def load_from_file(cls, config_path: Path) -> dict[str, str]:
|
||||
"""Load config values from INI file."""
|
||||
if not config_path.exists():
|
||||
return {}
|
||||
@@ -120,14 +121,14 @@ class BaseConfigSet(BaseSettings):
|
||||
|
||||
|
||||
def get_config(
|
||||
defaults: Optional[Dict] = None,
|
||||
defaults: dict | None = None,
|
||||
persona: Any = None,
|
||||
user: Any = None,
|
||||
crawl: Any = None,
|
||||
snapshot: Any = None,
|
||||
archiveresult: Any = None,
|
||||
machine: Any = None,
|
||||
) -> Dict[str, Any]:
|
||||
) -> dict[str, Any]:
|
||||
"""
|
||||
Get merged config from all sources.
|
||||
|
||||
@@ -176,7 +177,7 @@ def get_config(
|
||||
if persona_id:
|
||||
persona = Persona.objects.filter(id=persona_id).first()
|
||||
if persona is None:
|
||||
raise Persona.DoesNotExist(f'Crawl {getattr(crawl, "id", None)} references missing Persona {persona_id}')
|
||||
raise Persona.DoesNotExist(f"Crawl {getattr(crawl, 'id', None)} references missing Persona {persona_id}")
|
||||
|
||||
if persona is None:
|
||||
crawl_config = getattr(crawl, "config", None) or {}
|
||||
@@ -200,6 +201,7 @@ def get_config(
|
||||
# Add plugin config defaults from JSONSchema config.json files
|
||||
try:
|
||||
from archivebox.hooks import get_config_defaults_from_plugins
|
||||
|
||||
plugin_defaults = get_config_defaults_from_plugins()
|
||||
config.update(plugin_defaults)
|
||||
except ImportError:
|
||||
@@ -224,6 +226,7 @@ def get_config(
|
||||
# Default to current machine if not provided
|
||||
try:
|
||||
from archivebox.machine.models import Machine
|
||||
|
||||
machine = Machine.current()
|
||||
except Exception:
|
||||
pass # Machine might not be available during early init
|
||||
@@ -246,16 +249,17 @@ def get_config(
|
||||
# Also check plugin config aliases in environment
|
||||
try:
|
||||
from archivebox.hooks import discover_plugin_configs
|
||||
|
||||
plugin_configs = discover_plugin_configs()
|
||||
for plugin_name, schema in plugin_configs.items():
|
||||
for key, prop_schema in schema.get('properties', {}).items():
|
||||
for key, prop_schema in schema.get("properties", {}).items():
|
||||
# Check x-aliases
|
||||
for alias in prop_schema.get('x-aliases', []):
|
||||
for alias in prop_schema.get("x-aliases", []):
|
||||
if alias in os.environ and key not in os.environ:
|
||||
config[key] = _parse_env_value(os.environ[alias], config.get(key))
|
||||
break
|
||||
# Check x-fallback
|
||||
fallback = prop_schema.get('x-fallback')
|
||||
fallback = prop_schema.get("x-fallback")
|
||||
if fallback and fallback in config and key not in config:
|
||||
config[key] = config[fallback]
|
||||
except ImportError:
|
||||
@@ -275,33 +279,34 @@ def get_config(
|
||||
|
||||
# Add crawl path aliases for hooks that need shared crawl state.
|
||||
if crawl and hasattr(crawl, "output_dir"):
|
||||
config['CRAWL_OUTPUT_DIR'] = str(crawl.output_dir)
|
||||
config['CRAWL_DIR'] = str(crawl.output_dir)
|
||||
config['CRAWL_ID'] = str(getattr(crawl, "id", "")) if getattr(crawl, "id", None) else config.get('CRAWL_ID')
|
||||
config["CRAWL_OUTPUT_DIR"] = str(crawl.output_dir)
|
||||
config["CRAWL_DIR"] = str(crawl.output_dir)
|
||||
config["CRAWL_ID"] = str(getattr(crawl, "id", "")) if getattr(crawl, "id", None) else config.get("CRAWL_ID")
|
||||
|
||||
# Apply snapshot config overrides (highest priority)
|
||||
if snapshot and hasattr(snapshot, "config") and snapshot.config:
|
||||
config.update(snapshot.config)
|
||||
|
||||
if snapshot:
|
||||
config['SNAPSHOT_ID'] = str(getattr(snapshot, "id", "")) if getattr(snapshot, "id", None) else config.get('SNAPSHOT_ID')
|
||||
config['SNAPSHOT_DEPTH'] = int(getattr(snapshot, "depth", 0) or 0)
|
||||
config["SNAPSHOT_ID"] = str(getattr(snapshot, "id", "")) if getattr(snapshot, "id", None) else config.get("SNAPSHOT_ID")
|
||||
config["SNAPSHOT_DEPTH"] = int(getattr(snapshot, "depth", 0) or 0)
|
||||
if hasattr(snapshot, "output_dir"):
|
||||
config['SNAP_DIR'] = str(snapshot.output_dir)
|
||||
config["SNAP_DIR"] = str(snapshot.output_dir)
|
||||
if getattr(snapshot, "crawl_id", None):
|
||||
config['CRAWL_ID'] = str(snapshot.crawl_id)
|
||||
config["CRAWL_ID"] = str(snapshot.crawl_id)
|
||||
|
||||
# Normalize all aliases to canonical names (after all sources merged)
|
||||
# This handles aliases that came from user/crawl/snapshot configs, not just env
|
||||
try:
|
||||
from archivebox.hooks import discover_plugin_configs
|
||||
|
||||
plugin_configs = discover_plugin_configs()
|
||||
aliases_to_normalize = {} # {alias_key: canonical_key}
|
||||
|
||||
# Build alias mapping from all plugin schemas
|
||||
for plugin_name, schema in plugin_configs.items():
|
||||
for canonical_key, prop_schema in schema.get('properties', {}).items():
|
||||
for alias in prop_schema.get('x-aliases', []):
|
||||
for canonical_key, prop_schema in schema.get("properties", {}).items():
|
||||
for alias in prop_schema.get("x-aliases", []):
|
||||
aliases_to_normalize[alias] = canonical_key
|
||||
|
||||
# Normalize: copy alias values to canonical keys (aliases take precedence)
|
||||
@@ -314,10 +319,14 @@ def get_config(
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
if not config.get("DATA_DIR"):
|
||||
config["DATA_DIR"] = str(CONSTANTS.DATA_DIR)
|
||||
config["ABX_RUNTIME"] = "archivebox"
|
||||
|
||||
return config
|
||||
|
||||
|
||||
def get_flat_config() -> Dict[str, Any]:
|
||||
def get_flat_config() -> dict[str, Any]:
|
||||
"""
|
||||
Get a flat dictionary of all config values.
|
||||
|
||||
@@ -326,20 +335,24 @@ def get_flat_config() -> Dict[str, Any]:
|
||||
return get_config()
|
||||
|
||||
|
||||
def get_all_configs() -> Dict[str, BaseConfigSet]:
|
||||
def get_all_configs() -> dict[str, BaseConfigSet]:
|
||||
"""
|
||||
Get all config section objects as a dictionary.
|
||||
|
||||
Replaces abx.pm.hook.get_CONFIGS()
|
||||
"""
|
||||
from archivebox.config.common import (
|
||||
SHELL_CONFIG, SERVER_CONFIG, ARCHIVING_CONFIG, SEARCH_BACKEND_CONFIG
|
||||
SHELL_CONFIG,
|
||||
SERVER_CONFIG,
|
||||
ARCHIVING_CONFIG,
|
||||
SEARCH_BACKEND_CONFIG,
|
||||
)
|
||||
|
||||
return {
|
||||
'SHELL_CONFIG': SHELL_CONFIG,
|
||||
'SERVER_CONFIG': SERVER_CONFIG,
|
||||
'ARCHIVING_CONFIG': ARCHIVING_CONFIG,
|
||||
'SEARCH_BACKEND_CONFIG': SEARCH_BACKEND_CONFIG,
|
||||
"SHELL_CONFIG": SHELL_CONFIG,
|
||||
"SERVER_CONFIG": SERVER_CONFIG,
|
||||
"ARCHIVING_CONFIG": ARCHIVING_CONFIG,
|
||||
"SEARCH_BACKEND_CONFIG": SEARCH_BACKEND_CONFIG,
|
||||
}
|
||||
|
||||
|
||||
@@ -394,7 +407,7 @@ DEFAULT_WORKER_CONCURRENCY = {
|
||||
}
|
||||
|
||||
|
||||
def get_worker_concurrency() -> Dict[str, int]:
|
||||
def get_worker_concurrency() -> dict[str, int]:
|
||||
"""
|
||||
Get worker concurrency settings.
|
||||
|
||||
|
||||
@@ -5,17 +5,16 @@ Constants are for things that never change at runtime.
|
||||
DATA_DIR will never change at runtime, but you can run
|
||||
archivebox from inside a different DATA_DIR on the same machine.
|
||||
|
||||
This is loaded very early in the archivebox startup flow, so nothing in this file
|
||||
or imported from this file should import anything from archivebox.config.common,
|
||||
This is loaded very early in the archivebox startup flow, so nothing in this file
|
||||
or imported from this file should import anything from archivebox.config.common,
|
||||
django, other INSTALLED_APPS, or anything else that is not in a standard library.
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.config'
|
||||
__package__ = "archivebox.config"
|
||||
|
||||
import re
|
||||
import sys
|
||||
|
||||
from typing import Dict
|
||||
from pathlib import Path
|
||||
|
||||
from benedict import benedict
|
||||
@@ -46,184 +45,235 @@ from .version import detect_installed_version
|
||||
|
||||
|
||||
class ConstantsDict:
|
||||
PACKAGE_DIR: Path = PACKAGE_DIR
|
||||
DATA_DIR: Path = DATA_DIR
|
||||
ARCHIVE_DIR: Path = ARCHIVE_DIR
|
||||
|
||||
MACHINE_TYPE: str = get_machine_type()
|
||||
MACHINE_ID: str = get_machine_id()
|
||||
COLLECTION_ID: str = get_collection_id(DATA_DIR)
|
||||
|
||||
PACKAGE_DIR: Path = PACKAGE_DIR
|
||||
DATA_DIR: Path = DATA_DIR
|
||||
ARCHIVE_DIR: Path = ARCHIVE_DIR
|
||||
|
||||
MACHINE_TYPE: str = get_machine_type()
|
||||
MACHINE_ID: str = get_machine_id()
|
||||
COLLECTION_ID: str = get_collection_id(DATA_DIR)
|
||||
|
||||
# Host system
|
||||
VERSION: str = detect_installed_version(PACKAGE_DIR)
|
||||
IN_DOCKER: bool = IN_DOCKER
|
||||
|
||||
VERSION: str = detect_installed_version(PACKAGE_DIR)
|
||||
IN_DOCKER: bool = IN_DOCKER
|
||||
|
||||
# Permissions
|
||||
IS_ROOT: bool = IS_ROOT
|
||||
ARCHIVEBOX_USER: int = ARCHIVEBOX_USER
|
||||
ARCHIVEBOX_GROUP: int = ARCHIVEBOX_GROUP
|
||||
RUNNING_AS_UID: int = RUNNING_AS_UID
|
||||
RUNNING_AS_GID: int = RUNNING_AS_GID
|
||||
DEFAULT_PUID: int = DEFAULT_PUID
|
||||
DEFAULT_PGID: int = DEFAULT_PGID
|
||||
IS_INSIDE_VENV: bool = sys.prefix != sys.base_prefix
|
||||
|
||||
IS_ROOT: bool = IS_ROOT
|
||||
ARCHIVEBOX_USER: int = ARCHIVEBOX_USER
|
||||
ARCHIVEBOX_GROUP: int = ARCHIVEBOX_GROUP
|
||||
RUNNING_AS_UID: int = RUNNING_AS_UID
|
||||
RUNNING_AS_GID: int = RUNNING_AS_GID
|
||||
DEFAULT_PUID: int = DEFAULT_PUID
|
||||
DEFAULT_PGID: int = DEFAULT_PGID
|
||||
IS_INSIDE_VENV: bool = sys.prefix != sys.base_prefix
|
||||
|
||||
# Source code dirs
|
||||
PACKAGE_DIR_NAME: str = PACKAGE_DIR.name
|
||||
TEMPLATES_DIR_NAME: str = 'templates'
|
||||
TEMPLATES_DIR: Path = PACKAGE_DIR / TEMPLATES_DIR_NAME
|
||||
STATIC_DIR_NAME: str = 'static'
|
||||
STATIC_DIR: Path = TEMPLATES_DIR / STATIC_DIR_NAME
|
||||
PACKAGE_DIR_NAME: str = PACKAGE_DIR.name
|
||||
TEMPLATES_DIR_NAME: str = "templates"
|
||||
TEMPLATES_DIR: Path = PACKAGE_DIR / TEMPLATES_DIR_NAME
|
||||
STATIC_DIR_NAME: str = "static"
|
||||
STATIC_DIR: Path = TEMPLATES_DIR / STATIC_DIR_NAME
|
||||
|
||||
# Data dirs
|
||||
ARCHIVE_DIR_NAME: str = 'archive'
|
||||
SOURCES_DIR_NAME: str = 'sources'
|
||||
PERSONAS_DIR_NAME: str = 'personas'
|
||||
CACHE_DIR_NAME: str = 'cache'
|
||||
LOGS_DIR_NAME: str = 'logs'
|
||||
CUSTOM_PLUGINS_DIR_NAME: str = 'custom_plugins'
|
||||
CUSTOM_TEMPLATES_DIR_NAME: str = 'custom_templates'
|
||||
ARCHIVE_DIR: Path = DATA_DIR / ARCHIVE_DIR_NAME
|
||||
SOURCES_DIR: Path = DATA_DIR / SOURCES_DIR_NAME
|
||||
PERSONAS_DIR: Path = DATA_DIR / PERSONAS_DIR_NAME
|
||||
LOGS_DIR: Path = DATA_DIR / LOGS_DIR_NAME
|
||||
CACHE_DIR: Path = DATA_DIR / CACHE_DIR_NAME
|
||||
CUSTOM_TEMPLATES_DIR: Path = DATA_DIR / CUSTOM_TEMPLATES_DIR_NAME
|
||||
USER_PLUGINS_DIR: Path = DATA_DIR / CUSTOM_PLUGINS_DIR_NAME
|
||||
ARCHIVE_DIR_NAME: str = "archive"
|
||||
SOURCES_DIR_NAME: str = "sources"
|
||||
PERSONAS_DIR_NAME: str = "personas"
|
||||
CACHE_DIR_NAME: str = "cache"
|
||||
LOGS_DIR_NAME: str = "logs"
|
||||
CUSTOM_PLUGINS_DIR_NAME: str = "custom_plugins"
|
||||
CUSTOM_TEMPLATES_DIR_NAME: str = "custom_templates"
|
||||
ARCHIVE_DIR: Path = DATA_DIR / ARCHIVE_DIR_NAME
|
||||
SOURCES_DIR: Path = DATA_DIR / SOURCES_DIR_NAME
|
||||
PERSONAS_DIR: Path = DATA_DIR / PERSONAS_DIR_NAME
|
||||
LOGS_DIR: Path = DATA_DIR / LOGS_DIR_NAME
|
||||
CACHE_DIR: Path = DATA_DIR / CACHE_DIR_NAME
|
||||
CUSTOM_TEMPLATES_DIR: Path = DATA_DIR / CUSTOM_TEMPLATES_DIR_NAME
|
||||
USER_PLUGINS_DIR: Path = DATA_DIR / CUSTOM_PLUGINS_DIR_NAME
|
||||
|
||||
# Data dir files
|
||||
CONFIG_FILENAME: str = 'ArchiveBox.conf'
|
||||
SQL_INDEX_FILENAME: str = 'index.sqlite3'
|
||||
CONFIG_FILE: Path = DATA_DIR / CONFIG_FILENAME
|
||||
DATABASE_FILE: Path = DATA_DIR / SQL_INDEX_FILENAME
|
||||
|
||||
JSON_INDEX_FILENAME: str = 'index.json'
|
||||
JSONL_INDEX_FILENAME: str = 'index.jsonl'
|
||||
HTML_INDEX_FILENAME: str = 'index.html'
|
||||
ROBOTS_TXT_FILENAME: str = 'robots.txt'
|
||||
FAVICON_FILENAME: str = 'favicon.ico'
|
||||
|
||||
# Runtime dirs
|
||||
TMP_DIR_NAME: str = 'tmp'
|
||||
DEFAULT_TMP_DIR: Path = DATA_DIR / TMP_DIR_NAME / MACHINE_ID # ./data/tmp/abc3244323
|
||||
CONFIG_FILENAME: str = "ArchiveBox.conf"
|
||||
SQL_INDEX_FILENAME: str = "index.sqlite3"
|
||||
CONFIG_FILE: Path = DATA_DIR / CONFIG_FILENAME
|
||||
DATABASE_FILE: Path = DATA_DIR / SQL_INDEX_FILENAME
|
||||
|
||||
LIB_DIR_NAME: str = 'lib'
|
||||
DEFAULT_LIB_DIR: Path = DATA_DIR / LIB_DIR_NAME / MACHINE_TYPE # ./data/lib/arm64-linux-docker
|
||||
DEFAULT_LIB_BIN_DIR: Path = DEFAULT_LIB_DIR / 'bin' # ./data/lib/arm64-linux-docker/bin
|
||||
JSON_INDEX_FILENAME: str = "index.json"
|
||||
JSONL_INDEX_FILENAME: str = "index.jsonl"
|
||||
HTML_INDEX_FILENAME: str = "index.html"
|
||||
ROBOTS_TXT_FILENAME: str = "robots.txt"
|
||||
FAVICON_FILENAME: str = "favicon.ico"
|
||||
|
||||
# Runtime dirs
|
||||
TMP_DIR_NAME: str = "tmp"
|
||||
DEFAULT_TMP_DIR: Path = DATA_DIR / TMP_DIR_NAME / MACHINE_ID # ./data/tmp/abc3244323
|
||||
|
||||
LIB_DIR_NAME: str = "lib"
|
||||
DEFAULT_LIB_DIR: Path = DATA_DIR / LIB_DIR_NAME / MACHINE_TYPE # ./data/lib/arm64-linux-docker
|
||||
DEFAULT_LIB_BIN_DIR: Path = DEFAULT_LIB_DIR / "bin" # ./data/lib/arm64-linux-docker/bin
|
||||
|
||||
# Config constants
|
||||
TIMEZONE: str = 'UTC'
|
||||
DEFAULT_CLI_COLORS: Dict[str, str] = DEFAULT_CLI_COLORS
|
||||
DISABLED_CLI_COLORS: Dict[str, str] = benedict({k: '' for k in DEFAULT_CLI_COLORS})
|
||||
TIMEZONE: str = "UTC"
|
||||
DEFAULT_CLI_COLORS: dict[str, str] = DEFAULT_CLI_COLORS
|
||||
DISABLED_CLI_COLORS: dict[str, str] = benedict({k: "" for k in DEFAULT_CLI_COLORS})
|
||||
|
||||
# Hard safety limits (seconds)
|
||||
MAX_HOOK_RUNTIME_SECONDS: int = 60 * 60 * 12 # 12 hours
|
||||
MAX_SNAPSHOT_RUNTIME_SECONDS: int = 60 * 60 * 12 # 12 hours
|
||||
MAX_HOOK_RUNTIME_SECONDS: int = 60 * 60 * 12 # 12 hours
|
||||
MAX_SNAPSHOT_RUNTIME_SECONDS: int = 60 * 60 * 12 # 12 hours
|
||||
|
||||
ALLOWDENYLIST_REGEX_FLAGS: int = re.IGNORECASE | re.UNICODE | re.MULTILINE
|
||||
ALLOWDENYLIST_REGEX_FLAGS: int = re.IGNORECASE | re.UNICODE | re.MULTILINE
|
||||
|
||||
STATICFILE_EXTENSIONS: frozenset[str] = frozenset((
|
||||
# 99.999% of the time, URLs ending in these extensions are static files
|
||||
# that can be downloaded as-is, not html pages that need to be rendered
|
||||
'gif', 'jpeg', 'jpg', 'png', 'tif', 'tiff', 'wbmp', 'ico', 'jng', 'bmp',
|
||||
'svg', 'svgz', 'webp', 'ps', 'eps', 'ai',
|
||||
'mp3', 'mp4', 'm4a', 'mpeg', 'mpg', 'mkv', 'mov', 'webm', 'm4v',
|
||||
'flv', 'wmv', 'avi', 'ogg', 'ts', 'm3u8',
|
||||
'pdf', 'txt', 'rtf', 'rtfd', 'doc', 'docx', 'ppt', 'pptx', 'xls', 'xlsx',
|
||||
'atom', 'rss', 'css', 'js', 'json',
|
||||
'dmg', 'iso', 'img',
|
||||
'rar', 'war', 'hqx', 'zip', 'gz', 'bz2', '7z',
|
||||
STATICFILE_EXTENSIONS: frozenset[str] = frozenset(
|
||||
(
|
||||
# 99.999% of the time, URLs ending in these extensions are static files
|
||||
# that can be downloaded as-is, not html pages that need to be rendered
|
||||
"gif",
|
||||
"jpeg",
|
||||
"jpg",
|
||||
"png",
|
||||
"tif",
|
||||
"tiff",
|
||||
"wbmp",
|
||||
"ico",
|
||||
"jng",
|
||||
"bmp",
|
||||
"svg",
|
||||
"svgz",
|
||||
"webp",
|
||||
"ps",
|
||||
"eps",
|
||||
"ai",
|
||||
"mp3",
|
||||
"mp4",
|
||||
"m4a",
|
||||
"mpeg",
|
||||
"mpg",
|
||||
"mkv",
|
||||
"mov",
|
||||
"webm",
|
||||
"m4v",
|
||||
"flv",
|
||||
"wmv",
|
||||
"avi",
|
||||
"ogg",
|
||||
"ts",
|
||||
"m3u8",
|
||||
"pdf",
|
||||
"txt",
|
||||
"rtf",
|
||||
"rtfd",
|
||||
"doc",
|
||||
"docx",
|
||||
"ppt",
|
||||
"pptx",
|
||||
"xls",
|
||||
"xlsx",
|
||||
"atom",
|
||||
"rss",
|
||||
"css",
|
||||
"js",
|
||||
"json",
|
||||
"dmg",
|
||||
"iso",
|
||||
"img",
|
||||
"rar",
|
||||
"war",
|
||||
"hqx",
|
||||
"zip",
|
||||
"gz",
|
||||
"bz2",
|
||||
"7z",
|
||||
# Less common extensions to consider adding later
|
||||
# jar, swf, bin, com, exe, dll, deb
|
||||
# ear, hqx, eot, wmlc, kml, kmz, cco, jardiff, jnlp, run, msi, msp, msm,
|
||||
# pl pm, prc pdb, rar, rpm, sea, sit, tcl tk, der, pem, crt, xpi, xspf,
|
||||
# ra, mng, asx, asf, 3gpp, 3gp, mid, midi, kar, jad, wml, htc, mml
|
||||
# These are always treated as pages, not as static files, never add them:
|
||||
# html, htm, shtml, xhtml, xml, aspx, php, cgi
|
||||
),
|
||||
)
|
||||
|
||||
# Less common extensions to consider adding later
|
||||
# jar, swf, bin, com, exe, dll, deb
|
||||
# ear, hqx, eot, wmlc, kml, kmz, cco, jardiff, jnlp, run, msi, msp, msm,
|
||||
# pl pm, prc pdb, rar, rpm, sea, sit, tcl tk, der, pem, crt, xpi, xspf,
|
||||
# ra, mng, asx, asf, 3gpp, 3gp, mid, midi, kar, jad, wml, htc, mml
|
||||
|
||||
# These are always treated as pages, not as static files, never add them:
|
||||
# html, htm, shtml, xhtml, xml, aspx, php, cgi
|
||||
))
|
||||
|
||||
PIP_RELATED_NAMES: frozenset[str] = frozenset((
|
||||
".venv",
|
||||
"venv",
|
||||
"virtualenv",
|
||||
".virtualenv",
|
||||
))
|
||||
NPM_RELATED_NAMES: frozenset[str] = frozenset((
|
||||
"node_modules",
|
||||
"package.json",
|
||||
"package-lock.json",
|
||||
"yarn.lock",
|
||||
))
|
||||
PIP_RELATED_NAMES: frozenset[str] = frozenset(
|
||||
(
|
||||
".venv",
|
||||
"venv",
|
||||
"virtualenv",
|
||||
".virtualenv",
|
||||
),
|
||||
)
|
||||
NPM_RELATED_NAMES: frozenset[str] = frozenset(
|
||||
(
|
||||
"node_modules",
|
||||
"package.json",
|
||||
"package-lock.json",
|
||||
"yarn.lock",
|
||||
),
|
||||
)
|
||||
|
||||
# When initializing archivebox in a new directory, we check to make sure the dir is
|
||||
# actually empty so that we dont clobber someone's home directory or desktop by accident.
|
||||
# These files are exceptions to the is_empty check when we're trying to init a new dir,
|
||||
# as they could be from a previous archivebox version, system artifacts, dependencies, etc.
|
||||
ALLOWED_IN_DATA_DIR: frozenset[str] = frozenset((
|
||||
*PIP_RELATED_NAMES,
|
||||
*NPM_RELATED_NAMES,
|
||||
|
||||
### Dirs:
|
||||
ARCHIVE_DIR_NAME,
|
||||
SOURCES_DIR_NAME,
|
||||
LOGS_DIR_NAME,
|
||||
CACHE_DIR_NAME,
|
||||
LIB_DIR_NAME,
|
||||
TMP_DIR_NAME,
|
||||
PERSONAS_DIR_NAME,
|
||||
CUSTOM_TEMPLATES_DIR_NAME,
|
||||
CUSTOM_PLUGINS_DIR_NAME,
|
||||
"invalid",
|
||||
"users",
|
||||
"machine",
|
||||
# Backwards compatibility with old directory names
|
||||
"user_plugins", # old name for USER_PLUGINS_DIR (now 'plugins')
|
||||
"user_templates", # old name for CUSTOM_TEMPLATES_DIR (now 'templates')
|
||||
"static", # created by old static exports <v0.6.0
|
||||
"sonic", # created by docker bind mount / sonic FTS process
|
||||
".git",
|
||||
".svn",
|
||||
|
||||
### Files:
|
||||
CONFIG_FILENAME,
|
||||
SQL_INDEX_FILENAME,
|
||||
f"{SQL_INDEX_FILENAME}-wal",
|
||||
f"{SQL_INDEX_FILENAME}-shm",
|
||||
"search.sqlite3",
|
||||
"queue.sqlite3",
|
||||
"queue.sqlite3-wal",
|
||||
"queue.sqlite3-shm",
|
||||
JSON_INDEX_FILENAME,
|
||||
JSONL_INDEX_FILENAME,
|
||||
HTML_INDEX_FILENAME,
|
||||
ROBOTS_TXT_FILENAME,
|
||||
FAVICON_FILENAME,
|
||||
CONFIG_FILENAME,
|
||||
f"{CONFIG_FILENAME}.bak",
|
||||
f".{CONFIG_FILENAME}.bak",
|
||||
"static_index.json",
|
||||
".DS_Store",
|
||||
".gitignore",
|
||||
"lost+found",
|
||||
".DS_Store",
|
||||
".env",
|
||||
".collection_id",
|
||||
".archivebox_id",
|
||||
"Dockerfile",
|
||||
))
|
||||
|
||||
ALLOWED_IN_DATA_DIR: frozenset[str] = frozenset(
|
||||
(
|
||||
*PIP_RELATED_NAMES,
|
||||
*NPM_RELATED_NAMES,
|
||||
### Dirs:
|
||||
ARCHIVE_DIR_NAME,
|
||||
SOURCES_DIR_NAME,
|
||||
LOGS_DIR_NAME,
|
||||
CACHE_DIR_NAME,
|
||||
LIB_DIR_NAME,
|
||||
TMP_DIR_NAME,
|
||||
PERSONAS_DIR_NAME,
|
||||
CUSTOM_TEMPLATES_DIR_NAME,
|
||||
CUSTOM_PLUGINS_DIR_NAME,
|
||||
"invalid",
|
||||
"users",
|
||||
"machine",
|
||||
# Backwards compatibility with old directory names
|
||||
"user_plugins", # old name for USER_PLUGINS_DIR (now 'plugins')
|
||||
"user_templates", # old name for CUSTOM_TEMPLATES_DIR (now 'templates')
|
||||
"static", # created by old static exports <v0.6.0
|
||||
"sonic", # created by docker bind mount / sonic FTS process
|
||||
".git",
|
||||
".svn",
|
||||
### Files:
|
||||
CONFIG_FILENAME,
|
||||
SQL_INDEX_FILENAME,
|
||||
f"{SQL_INDEX_FILENAME}-wal",
|
||||
f"{SQL_INDEX_FILENAME}-shm",
|
||||
"search.sqlite3",
|
||||
"queue.sqlite3",
|
||||
"queue.sqlite3-wal",
|
||||
"queue.sqlite3-shm",
|
||||
JSON_INDEX_FILENAME,
|
||||
JSONL_INDEX_FILENAME,
|
||||
HTML_INDEX_FILENAME,
|
||||
ROBOTS_TXT_FILENAME,
|
||||
FAVICON_FILENAME,
|
||||
CONFIG_FILENAME,
|
||||
f"{CONFIG_FILENAME}.bak",
|
||||
f".{CONFIG_FILENAME}.bak",
|
||||
"static_index.json",
|
||||
".DS_Store",
|
||||
".gitignore",
|
||||
"lost+found",
|
||||
".DS_Store",
|
||||
".env",
|
||||
".collection_id",
|
||||
".archivebox_id",
|
||||
"Dockerfile",
|
||||
),
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def __getitem__(cls, key: str):
|
||||
# so it behaves like a dict[key] == dict.key or object attr
|
||||
return getattr(cls, key)
|
||||
|
||||
|
||||
@classmethod
|
||||
def __benedict__(cls):
|
||||
# when casting to benedict, only include uppercase keys that don't start with an underscore
|
||||
return benedict({key: value for key, value in cls.__dict__.items() if key.isupper() and not key.startswith('_')})
|
||||
return benedict({key: value for key, value in cls.__dict__.items() if key.isupper() and not key.startswith("_")})
|
||||
|
||||
|
||||
CONSTANTS = ConstantsDict
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
__package__ = 'archivebox.config'
|
||||
__package__ = "archivebox.config"
|
||||
|
||||
import os
|
||||
import sys
|
||||
@@ -17,9 +17,9 @@ from .common import SHELL_CONFIG
|
||||
|
||||
|
||||
if not SHELL_CONFIG.USE_COLOR:
|
||||
os.environ['NO_COLOR'] = '1'
|
||||
os.environ["NO_COLOR"] = "1"
|
||||
if not SHELL_CONFIG.SHOW_PROGRESS:
|
||||
os.environ['TERM'] = 'dumb'
|
||||
os.environ["TERM"] = "dumb"
|
||||
|
||||
# recreate rich console obj based on new config values
|
||||
STDOUT = CONSOLE = Console()
|
||||
@@ -32,7 +32,8 @@ def setup_django_minimal():
|
||||
# os.environ.setdefault('ARCHIVEBOX_DATA_DIR', str(CONSTANTS.DATA_DIR))
|
||||
# os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings')
|
||||
# django.setup()
|
||||
raise Exception('dont use this anymore')
|
||||
raise Exception("dont use this anymore")
|
||||
|
||||
|
||||
DJANGO_SET_UP = False
|
||||
|
||||
@@ -61,15 +62,18 @@ def setup_django(check_db=False, in_memory_db=False) -> None:
|
||||
# This warning can be triggered during django.setup() but is safe to ignore
|
||||
# since we're doing intentional setup operations
|
||||
import warnings
|
||||
warnings.filterwarnings('ignore',
|
||||
message='.*Accessing the database during app initialization.*',
|
||||
category=RuntimeWarning)
|
||||
|
||||
warnings.filterwarnings(
|
||||
"ignore",
|
||||
message=".*Accessing the database during app initialization.*",
|
||||
category=RuntimeWarning,
|
||||
)
|
||||
|
||||
try:
|
||||
from django.core.management import call_command
|
||||
|
||||
if in_memory_db:
|
||||
raise Exception('dont use this anymore')
|
||||
raise Exception("dont use this anymore")
|
||||
|
||||
# some commands dont store a long-lived sqlite3 db file on disk.
|
||||
# in those cases we create a temporary in-memory db and run the migrations
|
||||
@@ -84,19 +88,22 @@ def setup_django(check_db=False, in_memory_db=False) -> None:
|
||||
try:
|
||||
django.setup()
|
||||
except Exception as e:
|
||||
is_using_meta_cmd = any(ignored_subcommand in sys.argv for ignored_subcommand in ('help', 'version', '--help', '--version'))
|
||||
is_using_meta_cmd = any(ignored_subcommand in sys.argv for ignored_subcommand in ("help", "version", "--help", "--version"))
|
||||
if not is_using_meta_cmd:
|
||||
# show error message to user only if they're not running a meta command / just trying to get help
|
||||
STDERR.print()
|
||||
STDERR.print(Panel(
|
||||
f'\n[red]{e.__class__.__name__}[/red]: [yellow]{e}[/yellow]\nPlease check your config and [blue]DATA_DIR[/blue] permissions.\n',
|
||||
title='\n\n[red][X] Error while trying to load database![/red]',
|
||||
subtitle='[grey53]NO WRITES CAN BE PERFORMED[/grey53]',
|
||||
expand=False,
|
||||
style='bold red',
|
||||
))
|
||||
STDERR.print(
|
||||
Panel(
|
||||
f"\n[red]{e.__class__.__name__}[/red]: [yellow]{e}[/yellow]\nPlease check your config and [blue]DATA_DIR[/blue] permissions.\n",
|
||||
title="\n\n[red][X] Error while trying to load database![/red]",
|
||||
subtitle="[grey53]NO WRITES CAN BE PERFORMED[/grey53]",
|
||||
expand=False,
|
||||
style="bold red",
|
||||
),
|
||||
)
|
||||
STDERR.print()
|
||||
import traceback
|
||||
|
||||
traceback.print_exc()
|
||||
return
|
||||
|
||||
@@ -104,28 +111,29 @@ def setup_django(check_db=False, in_memory_db=False) -> None:
|
||||
from archivebox.core.settings_logging import ERROR_LOG as DEFAULT_ERROR_LOG
|
||||
|
||||
# log startup message to the error log
|
||||
error_log = getattr(settings, 'ERROR_LOG', DEFAULT_ERROR_LOG)
|
||||
with open(error_log, "a", encoding='utf-8') as f:
|
||||
command = ' '.join(sys.argv)
|
||||
ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S')
|
||||
error_log = getattr(settings, "ERROR_LOG", DEFAULT_ERROR_LOG)
|
||||
with open(error_log, "a", encoding="utf-8") as f:
|
||||
command = " ".join(sys.argv)
|
||||
ts = datetime.now(timezone.utc).strftime("%Y-%m-%d__%H:%M:%S")
|
||||
f.write(f"\n> {command}; TS={ts} VERSION={CONSTANTS.VERSION} IN_DOCKER={SHELL_CONFIG.IN_DOCKER} IS_TTY={SHELL_CONFIG.IS_TTY}\n")
|
||||
|
||||
if check_db:
|
||||
# make sure the data dir is owned by a non-root user
|
||||
if CONSTANTS.DATA_DIR.stat().st_uid == 0:
|
||||
STDERR.print('[red][X] Error: ArchiveBox DATA_DIR cannot be owned by root![/red]')
|
||||
STDERR.print(f' {CONSTANTS.DATA_DIR}')
|
||||
STDERR.print("[red][X] Error: ArchiveBox DATA_DIR cannot be owned by root![/red]")
|
||||
STDERR.print(f" {CONSTANTS.DATA_DIR}")
|
||||
STDERR.print()
|
||||
STDERR.print('[violet]Hint:[/violet] Are you running archivebox in the right folder? (and as a non-root user?)')
|
||||
STDERR.print(' cd path/to/your/archive/data')
|
||||
STDERR.print(' archivebox [command]')
|
||||
STDERR.print("[violet]Hint:[/violet] Are you running archivebox in the right folder? (and as a non-root user?)")
|
||||
STDERR.print(" cd path/to/your/archive/data")
|
||||
STDERR.print(" archivebox [command]")
|
||||
STDERR.print()
|
||||
raise SystemExit(9)
|
||||
|
||||
# Create cache table in DB if needed
|
||||
try:
|
||||
from django.core.cache import cache
|
||||
cache.get('test', None)
|
||||
|
||||
cache.get("test", None)
|
||||
except django.db.utils.OperationalError:
|
||||
call_command("createcachetable", verbosity=0)
|
||||
|
||||
@@ -133,12 +141,14 @@ def setup_django(check_db=False, in_memory_db=False) -> None:
|
||||
# the sqlite3 whenever we init from scratch to avoid multiple threads
|
||||
# sharing the same connection by accident
|
||||
from django.db import connections
|
||||
|
||||
for conn in connections.all():
|
||||
conn.close_if_unusable_or_obsolete()
|
||||
|
||||
sql_index_path = CONSTANTS.DATABASE_FILE
|
||||
assert os.access(sql_index_path, os.F_OK), (
|
||||
f'No database file {sql_index_path} found in: {CONSTANTS.DATA_DIR} (Are you in an ArchiveBox collection directory?)')
|
||||
f"No database file {sql_index_path} found in: {CONSTANTS.DATA_DIR} (Are you in an ArchiveBox collection directory?)"
|
||||
)
|
||||
|
||||
# https://docs.pydantic.dev/logfire/integrations/django/ Logfire Debugging
|
||||
# if settings.DEBUG_LOGFIRE:
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
__package__ = "archivebox.config"
|
||||
|
||||
from typing import Optional
|
||||
from pydantic import Field
|
||||
|
||||
from archivebox.config.configset import BaseConfigSet
|
||||
@@ -13,13 +12,14 @@ class LDAPConfig(BaseConfigSet):
|
||||
Only loads and validates if django-auth-ldap is installed.
|
||||
These settings integrate with Django's LDAP authentication backend.
|
||||
"""
|
||||
|
||||
toml_section_header: str = "LDAP_CONFIG"
|
||||
|
||||
LDAP_ENABLED: bool = Field(default=False)
|
||||
LDAP_SERVER_URI: Optional[str] = Field(default=None)
|
||||
LDAP_BIND_DN: Optional[str] = Field(default=None)
|
||||
LDAP_BIND_PASSWORD: Optional[str] = Field(default=None)
|
||||
LDAP_USER_BASE: Optional[str] = Field(default=None)
|
||||
LDAP_SERVER_URI: str | None = Field(default=None)
|
||||
LDAP_BIND_DN: str | None = Field(default=None)
|
||||
LDAP_BIND_PASSWORD: str | None = Field(default=None)
|
||||
LDAP_USER_BASE: str | None = Field(default=None)
|
||||
LDAP_USER_FILTER: str = Field(default="(uid=%(user)s)")
|
||||
LDAP_USERNAME_ATTR: str = Field(default="username")
|
||||
LDAP_FIRSTNAME_ATTR: str = Field(default="givenName")
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
__package__ = 'archivebox.config'
|
||||
__package__ = "archivebox.config"
|
||||
|
||||
import os
|
||||
import socket
|
||||
@@ -15,24 +15,25 @@ from .permissions import SudoPermission, IS_ROOT, ARCHIVEBOX_USER
|
||||
|
||||
#############################################################################################
|
||||
|
||||
PACKAGE_DIR: Path = Path(__file__).resolve().parent.parent # archivebox source code dir
|
||||
DATA_DIR: Path = Path(os.environ.get('DATA_DIR', os.getcwd())).resolve() # archivebox user data dir
|
||||
ARCHIVE_DIR: Path = DATA_DIR / 'archive' # archivebox snapshot data dir
|
||||
PACKAGE_DIR: Path = Path(__file__).resolve().parent.parent # archivebox source code dir
|
||||
DATA_DIR: Path = Path(os.environ.get("DATA_DIR", os.getcwd())).resolve() # archivebox user data dir
|
||||
ARCHIVE_DIR: Path = DATA_DIR / "archive" # archivebox snapshot data dir
|
||||
|
||||
IN_DOCKER = os.environ.get('IN_DOCKER', False) in ('1', 'true', 'True', 'TRUE', 'yes')
|
||||
IN_DOCKER = os.environ.get("IN_DOCKER", False) in ("1", "true", "True", "TRUE", "yes")
|
||||
|
||||
DATABASE_FILE = DATA_DIR / 'index.sqlite3'
|
||||
DATABASE_FILE = DATA_DIR / "index.sqlite3"
|
||||
|
||||
#############################################################################################
|
||||
|
||||
|
||||
def _get_collection_id(DATA_DIR=DATA_DIR, force_create=False) -> str:
|
||||
collection_id_file = DATA_DIR / '.archivebox_id'
|
||||
|
||||
collection_id_file = DATA_DIR / ".archivebox_id"
|
||||
|
||||
try:
|
||||
return collection_id_file.read_text().strip()
|
||||
except (OSError, FileNotFoundError, PermissionError):
|
||||
pass
|
||||
|
||||
|
||||
# hash the machine_id + collection dir path + creation time to get a unique collection_id
|
||||
machine_id = get_machine_id()
|
||||
collection_path = DATA_DIR.resolve()
|
||||
@@ -40,55 +41,60 @@ def _get_collection_id(DATA_DIR=DATA_DIR, force_create=False) -> str:
|
||||
creation_date = DATA_DIR.stat().st_ctime
|
||||
except Exception:
|
||||
creation_date = datetime.now().isoformat()
|
||||
collection_id = hashlib.sha256(f'{machine_id}:{collection_path}@{creation_date}'.encode()).hexdigest()[:8]
|
||||
|
||||
collection_id = hashlib.sha256(f"{machine_id}:{collection_path}@{creation_date}".encode()).hexdigest()[:8]
|
||||
|
||||
try:
|
||||
# only persist collection_id file if we already have an index.sqlite3 file present
|
||||
# otherwise we might be running in a directory that is not a collection, no point creating cruft files
|
||||
collection_is_active = os.path.isfile(DATABASE_FILE) and os.path.isdir(ARCHIVE_DIR) and os.access(DATA_DIR, os.W_OK)
|
||||
if collection_is_active or force_create:
|
||||
collection_id_file.write_text(collection_id)
|
||||
|
||||
|
||||
# if we're running as root right now, make sure the collection_id file is owned by the archivebox user
|
||||
if IS_ROOT:
|
||||
with SudoPermission(uid=0):
|
||||
if ARCHIVEBOX_USER == 0:
|
||||
os.system(f'chmod 777 "{collection_id_file}"')
|
||||
else:
|
||||
else:
|
||||
os.system(f'chown {ARCHIVEBOX_USER} "{collection_id_file}"')
|
||||
except (OSError, FileNotFoundError, PermissionError):
|
||||
pass
|
||||
return collection_id
|
||||
|
||||
|
||||
@cache
|
||||
def get_collection_id(DATA_DIR=DATA_DIR) -> str:
|
||||
"""Get a short, stable, unique ID for the current collection (e.g. abc45678)"""
|
||||
return _get_collection_id(DATA_DIR=DATA_DIR)
|
||||
|
||||
|
||||
@cache
|
||||
def get_machine_id() -> str:
|
||||
"""Get a short, stable, unique ID for the current machine (e.g. abc45678)"""
|
||||
|
||||
MACHINE_ID = 'unknown'
|
||||
|
||||
MACHINE_ID = "unknown"
|
||||
try:
|
||||
import machineid
|
||||
MACHINE_ID = machineid.hashed_id('archivebox')[:8]
|
||||
|
||||
MACHINE_ID = machineid.hashed_id("archivebox")[:8]
|
||||
except Exception:
|
||||
try:
|
||||
import uuid
|
||||
import hashlib
|
||||
|
||||
MACHINE_ID = hashlib.sha256(str(uuid.getnode()).encode()).hexdigest()[:8]
|
||||
except Exception:
|
||||
pass
|
||||
return MACHINE_ID
|
||||
|
||||
|
||||
@cache
|
||||
def get_machine_type() -> str:
|
||||
"""Get a short, stable, unique type identifier for the current machine (e.g. linux-x86_64-docker)"""
|
||||
|
||||
OS: str = platform.system().lower() # darwin, linux, etc.
|
||||
ARCH: str = platform.machine().lower() # arm64, x86_64, aarch64, etc.
|
||||
LIB_DIR_SCOPE: str = f'{ARCH}-{OS}-docker' if IN_DOCKER else f'{ARCH}-{OS}'
|
||||
|
||||
OS: str = platform.system().lower() # darwin, linux, etc.
|
||||
ARCH: str = platform.machine().lower() # arm64, x86_64, aarch64, etc.
|
||||
LIB_DIR_SCOPE: str = f"{ARCH}-{OS}-docker" if IN_DOCKER else f"{ARCH}-{OS}"
|
||||
return LIB_DIR_SCOPE
|
||||
|
||||
|
||||
@@ -97,27 +103,28 @@ def dir_is_writable(dir_path: Path, uid: int | None = None, gid: int | None = No
|
||||
current_uid, current_gid = os.geteuid(), os.getegid()
|
||||
uid, gid = uid or current_uid, gid or current_gid
|
||||
|
||||
test_file = dir_path / '.permissions_test'
|
||||
test_file = dir_path / ".permissions_test"
|
||||
try:
|
||||
with SudoPermission(uid=uid, fallback=fallback):
|
||||
test_file.exists()
|
||||
test_file.write_text(f'Checking if PUID={uid} PGID={gid} can write to dir')
|
||||
test_file.write_text(f"Checking if PUID={uid} PGID={gid} can write to dir")
|
||||
test_file.unlink()
|
||||
return True
|
||||
except (IOError, OSError, PermissionError):
|
||||
if chown:
|
||||
except (OSError, PermissionError):
|
||||
if chown:
|
||||
# try fixing it using sudo permissions
|
||||
with SudoPermission(uid=uid, fallback=fallback):
|
||||
os.system(f'chown {uid}:{gid} "{dir_path}" 2>/dev/null')
|
||||
return dir_is_writable(dir_path, uid=uid, gid=gid, fallback=fallback, chown=False)
|
||||
return False
|
||||
|
||||
|
||||
def assert_dir_can_contain_unix_sockets(dir_path: Path) -> bool:
|
||||
"""Check if a given directory can contain unix sockets (e.g. /tmp/supervisord.sock)"""
|
||||
from archivebox.misc.logging_util import pretty_path
|
||||
|
||||
|
||||
try:
|
||||
socket_path = str(dir_path / '.test_socket.sock')
|
||||
socket_path = str(dir_path / ".test_socket.sock")
|
||||
s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
|
||||
try:
|
||||
os.remove(socket_path)
|
||||
@@ -130,8 +137,8 @@ def assert_dir_can_contain_unix_sockets(dir_path: Path) -> bool:
|
||||
except OSError:
|
||||
pass
|
||||
except Exception as e:
|
||||
raise Exception(f'ArchiveBox failed to create a test UNIX socket file in {pretty_path(dir_path, color=False)}') from e
|
||||
|
||||
raise Exception(f"ArchiveBox failed to create a test UNIX socket file in {pretty_path(dir_path, color=False)}") from e
|
||||
|
||||
return True
|
||||
|
||||
|
||||
@@ -143,8 +150,9 @@ def create_and_chown_dir(dir_path: Path) -> None:
|
||||
|
||||
|
||||
def tmp_dir_socket_path_is_short_enough(dir_path: Path) -> bool:
|
||||
socket_file = dir_path.absolute().resolve() / 'supervisord.sock'
|
||||
return len(f'file://{socket_file}') <= 96
|
||||
socket_file = dir_path.absolute().resolve() / "supervisord.sock"
|
||||
return len(f"file://{socket_file}") <= 96
|
||||
|
||||
|
||||
@cache
|
||||
def get_or_create_working_tmp_dir(autofix=True, quiet=True):
|
||||
@@ -154,14 +162,18 @@ def get_or_create_working_tmp_dir(autofix=True, quiet=True):
|
||||
|
||||
# try a few potential directories in order of preference
|
||||
CANDIDATES = [
|
||||
STORAGE_CONFIG.TMP_DIR, # <user-specified>
|
||||
CONSTANTS.DEFAULT_TMP_DIR, # ./data/tmp/<machine_id>
|
||||
Path('/var/run/archivebox') / get_collection_id(), # /var/run/archivebox/abc5d8512
|
||||
Path('/tmp') / 'archivebox' / get_collection_id(), # /tmp/archivebox/abc5d8512
|
||||
Path('~/.tmp/archivebox').expanduser() / get_collection_id(), # ~/.tmp/archivebox/abc5d8512
|
||||
Path(tempfile.gettempdir()) / 'archivebox' / get_collection_id(), # /var/folders/qy/6tpfrpx100j1t4l312nz683m0000gn/T/archivebox/abc5d8512
|
||||
Path(tempfile.gettempdir()) / 'archivebox' / get_collection_id()[:4], # /var/folders/qy/6tpfrpx100j1t4l312nz683m0000gn/T/archivebox/abc5d
|
||||
Path(tempfile.gettempdir()) / 'abx' / get_collection_id()[:4], # /var/folders/qy/6tpfrpx100j1t4l312nz683m0000gn/T/abx/abc5
|
||||
STORAGE_CONFIG.TMP_DIR, # <user-specified>
|
||||
CONSTANTS.DEFAULT_TMP_DIR, # ./data/tmp/<machine_id>
|
||||
Path("/var/run/archivebox") / get_collection_id(), # /var/run/archivebox/abc5d8512
|
||||
Path("/tmp") / "archivebox" / get_collection_id(), # /tmp/archivebox/abc5d8512
|
||||
Path("~/.tmp/archivebox").expanduser() / get_collection_id(), # ~/.tmp/archivebox/abc5d8512
|
||||
Path(tempfile.gettempdir())
|
||||
/ "archivebox"
|
||||
/ get_collection_id(), # /var/folders/qy/6tpfrpx100j1t4l312nz683m0000gn/T/archivebox/abc5d8512
|
||||
Path(tempfile.gettempdir())
|
||||
/ "archivebox"
|
||||
/ get_collection_id()[:4], # /var/folders/qy/6tpfrpx100j1t4l312nz683m0000gn/T/archivebox/abc5d
|
||||
Path(tempfile.gettempdir()) / "abx" / get_collection_id()[:4], # /var/folders/qy/6tpfrpx100j1t4l312nz683m0000gn/T/abx/abc5
|
||||
]
|
||||
fallback_candidate = None
|
||||
for candidate in CANDIDATES:
|
||||
@@ -174,7 +186,12 @@ def get_or_create_working_tmp_dir(autofix=True, quiet=True):
|
||||
STORAGE_CONFIG.update_in_place(TMP_DIR=candidate)
|
||||
return candidate
|
||||
try:
|
||||
if fallback_candidate is None and candidate.exists() and dir_is_writable(candidate) and tmp_dir_socket_path_is_short_enough(candidate):
|
||||
if (
|
||||
fallback_candidate is None
|
||||
and candidate.exists()
|
||||
and dir_is_writable(candidate)
|
||||
and tmp_dir_socket_path_is_short_enough(candidate)
|
||||
):
|
||||
fallback_candidate = candidate
|
||||
except Exception:
|
||||
pass
|
||||
@@ -186,25 +203,28 @@ def get_or_create_working_tmp_dir(autofix=True, quiet=True):
|
||||
if autofix and STORAGE_CONFIG.TMP_DIR != fallback_candidate:
|
||||
STORAGE_CONFIG.update_in_place(TMP_DIR=fallback_candidate)
|
||||
return fallback_candidate
|
||||
|
||||
|
||||
if not quiet:
|
||||
raise OSError(f'ArchiveBox is unable to find a writable TMP_DIR, tried {CANDIDATES}!')
|
||||
raise OSError(f"ArchiveBox is unable to find a writable TMP_DIR, tried {CANDIDATES}!")
|
||||
|
||||
|
||||
@cache
|
||||
def get_or_create_working_lib_dir(autofix=True, quiet=False):
|
||||
from archivebox import CONSTANTS
|
||||
from archivebox.config.common import STORAGE_CONFIG
|
||||
from archivebox.misc.checks import check_lib_dir
|
||||
|
||||
|
||||
# try a few potential directories in order of preference
|
||||
CANDIDATES = [
|
||||
STORAGE_CONFIG.LIB_DIR, # <user-specified>
|
||||
CONSTANTS.DEFAULT_LIB_DIR, # ./data/lib/arm64-linux-docker
|
||||
Path('/usr/local/share/archivebox') / get_collection_id(), # /usr/local/share/archivebox/abc5
|
||||
*([Path('/opt/homebrew/share/archivebox') / get_collection_id()] if os.path.isfile('/opt/homebrew/bin/archivebox') else []), # /opt/homebrew/share/archivebox/abc5
|
||||
Path('~/.local/share/archivebox').expanduser() / get_collection_id(), # ~/.local/share/archivebox/abc5
|
||||
STORAGE_CONFIG.LIB_DIR, # <user-specified>
|
||||
CONSTANTS.DEFAULT_LIB_DIR, # ./data/lib/arm64-linux-docker
|
||||
Path("/usr/local/share/archivebox") / get_collection_id(), # /usr/local/share/archivebox/abc5
|
||||
*(
|
||||
[Path("/opt/homebrew/share/archivebox") / get_collection_id()] if os.path.isfile("/opt/homebrew/bin/archivebox") else []
|
||||
), # /opt/homebrew/share/archivebox/abc5
|
||||
Path("~/.local/share/archivebox").expanduser() / get_collection_id(), # ~/.local/share/archivebox/abc5
|
||||
]
|
||||
|
||||
|
||||
for candidate in CANDIDATES:
|
||||
try:
|
||||
create_and_chown_dir(candidate)
|
||||
@@ -214,10 +234,9 @@ def get_or_create_working_lib_dir(autofix=True, quiet=False):
|
||||
if autofix and STORAGE_CONFIG.LIB_DIR != candidate:
|
||||
STORAGE_CONFIG.update_in_place(LIB_DIR=candidate)
|
||||
return candidate
|
||||
|
||||
if not quiet:
|
||||
raise OSError(f'ArchiveBox is unable to find a writable LIB_DIR, tried {CANDIDATES}!')
|
||||
|
||||
if not quiet:
|
||||
raise OSError(f"ArchiveBox is unable to find a writable LIB_DIR, tried {CANDIDATES}!")
|
||||
|
||||
|
||||
@cache
|
||||
@@ -229,57 +248,68 @@ def get_data_locations():
|
||||
tmp_dir = get_or_create_working_tmp_dir(autofix=True, quiet=True) or STORAGE_CONFIG.TMP_DIR
|
||||
except Exception:
|
||||
tmp_dir = STORAGE_CONFIG.TMP_DIR
|
||||
|
||||
return benedict({
|
||||
"DATA_DIR": {
|
||||
"path": DATA_DIR.resolve(),
|
||||
"enabled": True,
|
||||
"is_valid": os.path.isdir(DATA_DIR) and os.access(DATA_DIR, os.R_OK) and os.access(DATA_DIR, os.W_OK),
|
||||
"is_mount": os.path.ismount(DATA_DIR.resolve()),
|
||||
|
||||
return benedict(
|
||||
{
|
||||
"DATA_DIR": {
|
||||
"path": DATA_DIR.resolve(),
|
||||
"enabled": True,
|
||||
"is_valid": os.path.isdir(DATA_DIR) and os.access(DATA_DIR, os.R_OK) and os.access(DATA_DIR, os.W_OK),
|
||||
"is_mount": os.path.ismount(DATA_DIR.resolve()),
|
||||
},
|
||||
"CONFIG_FILE": {
|
||||
"path": CONSTANTS.CONFIG_FILE.resolve(),
|
||||
"enabled": True,
|
||||
"is_valid": os.path.isfile(CONSTANTS.CONFIG_FILE)
|
||||
and os.access(CONSTANTS.CONFIG_FILE, os.R_OK)
|
||||
and os.access(CONSTANTS.CONFIG_FILE, os.W_OK),
|
||||
},
|
||||
"SQL_INDEX": {
|
||||
"path": DATABASE_FILE.resolve(),
|
||||
"enabled": True,
|
||||
"is_valid": os.path.isfile(DATABASE_FILE) and os.access(DATABASE_FILE, os.R_OK) and os.access(DATABASE_FILE, os.W_OK),
|
||||
"is_mount": os.path.ismount(DATABASE_FILE.resolve()),
|
||||
},
|
||||
"ARCHIVE_DIR": {
|
||||
"path": ARCHIVE_DIR.resolve(),
|
||||
"enabled": True,
|
||||
"is_valid": os.path.isdir(ARCHIVE_DIR) and os.access(ARCHIVE_DIR, os.R_OK) and os.access(ARCHIVE_DIR, os.W_OK),
|
||||
"is_mount": os.path.ismount(ARCHIVE_DIR.resolve()),
|
||||
},
|
||||
"SOURCES_DIR": {
|
||||
"path": CONSTANTS.SOURCES_DIR.resolve(),
|
||||
"enabled": True,
|
||||
"is_valid": os.path.isdir(CONSTANTS.SOURCES_DIR)
|
||||
and os.access(CONSTANTS.SOURCES_DIR, os.R_OK)
|
||||
and os.access(CONSTANTS.SOURCES_DIR, os.W_OK),
|
||||
},
|
||||
"PERSONAS_DIR": {
|
||||
"path": CONSTANTS.PERSONAS_DIR.resolve(),
|
||||
"enabled": os.path.isdir(CONSTANTS.PERSONAS_DIR),
|
||||
"is_valid": os.path.isdir(CONSTANTS.PERSONAS_DIR)
|
||||
and os.access(CONSTANTS.PERSONAS_DIR, os.R_OK)
|
||||
and os.access(CONSTANTS.PERSONAS_DIR, os.W_OK), # read + write
|
||||
},
|
||||
"LOGS_DIR": {
|
||||
"path": CONSTANTS.LOGS_DIR.resolve(),
|
||||
"enabled": True,
|
||||
"is_valid": os.path.isdir(CONSTANTS.LOGS_DIR)
|
||||
and os.access(CONSTANTS.LOGS_DIR, os.R_OK)
|
||||
and os.access(CONSTANTS.LOGS_DIR, os.W_OK), # read + write
|
||||
},
|
||||
"TMP_DIR": {
|
||||
"path": tmp_dir.resolve(),
|
||||
"enabled": True,
|
||||
"is_valid": os.path.isdir(tmp_dir) and os.access(tmp_dir, os.R_OK) and os.access(tmp_dir, os.W_OK), # read + write
|
||||
},
|
||||
# "CACHE_DIR": {
|
||||
# "path": CACHE_DIR.resolve(),
|
||||
# "enabled": True,
|
||||
# "is_valid": os.access(CACHE_DIR, os.R_OK) and os.access(CACHE_DIR, os.W_OK), # read + write
|
||||
# },
|
||||
},
|
||||
"CONFIG_FILE": {
|
||||
"path": CONSTANTS.CONFIG_FILE.resolve(),
|
||||
"enabled": True,
|
||||
"is_valid": os.path.isfile(CONSTANTS.CONFIG_FILE) and os.access(CONSTANTS.CONFIG_FILE, os.R_OK) and os.access(CONSTANTS.CONFIG_FILE, os.W_OK),
|
||||
},
|
||||
"SQL_INDEX": {
|
||||
"path": DATABASE_FILE.resolve(),
|
||||
"enabled": True,
|
||||
"is_valid": os.path.isfile(DATABASE_FILE) and os.access(DATABASE_FILE, os.R_OK) and os.access(DATABASE_FILE, os.W_OK),
|
||||
"is_mount": os.path.ismount(DATABASE_FILE.resolve()),
|
||||
},
|
||||
"ARCHIVE_DIR": {
|
||||
"path": ARCHIVE_DIR.resolve(),
|
||||
"enabled": True,
|
||||
"is_valid": os.path.isdir(ARCHIVE_DIR) and os.access(ARCHIVE_DIR, os.R_OK) and os.access(ARCHIVE_DIR, os.W_OK),
|
||||
"is_mount": os.path.ismount(ARCHIVE_DIR.resolve()),
|
||||
},
|
||||
"SOURCES_DIR": {
|
||||
"path": CONSTANTS.SOURCES_DIR.resolve(),
|
||||
"enabled": True,
|
||||
"is_valid": os.path.isdir(CONSTANTS.SOURCES_DIR) and os.access(CONSTANTS.SOURCES_DIR, os.R_OK) and os.access(CONSTANTS.SOURCES_DIR, os.W_OK),
|
||||
},
|
||||
"PERSONAS_DIR": {
|
||||
"path": CONSTANTS.PERSONAS_DIR.resolve(),
|
||||
"enabled": os.path.isdir(CONSTANTS.PERSONAS_DIR),
|
||||
"is_valid": os.path.isdir(CONSTANTS.PERSONAS_DIR) and os.access(CONSTANTS.PERSONAS_DIR, os.R_OK) and os.access(CONSTANTS.PERSONAS_DIR, os.W_OK), # read + write
|
||||
},
|
||||
"LOGS_DIR": {
|
||||
"path": CONSTANTS.LOGS_DIR.resolve(),
|
||||
"enabled": True,
|
||||
"is_valid": os.path.isdir(CONSTANTS.LOGS_DIR) and os.access(CONSTANTS.LOGS_DIR, os.R_OK) and os.access(CONSTANTS.LOGS_DIR, os.W_OK), # read + write
|
||||
},
|
||||
'TMP_DIR': {
|
||||
'path': tmp_dir.resolve(),
|
||||
'enabled': True,
|
||||
'is_valid': os.path.isdir(tmp_dir) and os.access(tmp_dir, os.R_OK) and os.access(tmp_dir, os.W_OK), # read + write
|
||||
},
|
||||
# "CACHE_DIR": {
|
||||
# "path": CACHE_DIR.resolve(),
|
||||
# "enabled": True,
|
||||
# "is_valid": os.access(CACHE_DIR, os.R_OK) and os.access(CACHE_DIR, os.W_OK), # read + write
|
||||
# },
|
||||
})
|
||||
)
|
||||
|
||||
|
||||
@cache
|
||||
def get_code_locations():
|
||||
@@ -291,41 +321,45 @@ def get_code_locations():
|
||||
except Exception:
|
||||
lib_dir = STORAGE_CONFIG.LIB_DIR
|
||||
|
||||
lib_bin_dir = lib_dir / 'bin'
|
||||
|
||||
return benedict({
|
||||
'PACKAGE_DIR': {
|
||||
'path': (PACKAGE_DIR).resolve(),
|
||||
'enabled': True,
|
||||
'is_valid': os.access(PACKAGE_DIR / '__main__.py', os.X_OK), # executable
|
||||
},
|
||||
'TEMPLATES_DIR': {
|
||||
'path': CONSTANTS.TEMPLATES_DIR.resolve(),
|
||||
'enabled': True,
|
||||
'is_valid': os.access(CONSTANTS.STATIC_DIR, os.R_OK) and os.access(CONSTANTS.STATIC_DIR, os.X_OK), # read + list
|
||||
},
|
||||
'CUSTOM_TEMPLATES_DIR': {
|
||||
'path': STORAGE_CONFIG.CUSTOM_TEMPLATES_DIR.resolve(),
|
||||
'enabled': os.path.isdir(STORAGE_CONFIG.CUSTOM_TEMPLATES_DIR),
|
||||
'is_valid': os.path.isdir(STORAGE_CONFIG.CUSTOM_TEMPLATES_DIR) and os.access(STORAGE_CONFIG.CUSTOM_TEMPLATES_DIR, os.R_OK), # read
|
||||
},
|
||||
'USER_PLUGINS_DIR': {
|
||||
'path': CONSTANTS.USER_PLUGINS_DIR.resolve(),
|
||||
'enabled': os.path.isdir(CONSTANTS.USER_PLUGINS_DIR),
|
||||
'is_valid': os.path.isdir(CONSTANTS.USER_PLUGINS_DIR) and os.access(CONSTANTS.USER_PLUGINS_DIR, os.R_OK), # read
|
||||
},
|
||||
'LIB_DIR': {
|
||||
'path': lib_dir.resolve(),
|
||||
'enabled': True,
|
||||
'is_valid': os.path.isdir(lib_dir) and os.access(lib_dir, os.R_OK) and os.access(lib_dir, os.W_OK), # read + write
|
||||
},
|
||||
'LIB_BIN_DIR': {
|
||||
'path': lib_bin_dir.resolve(),
|
||||
'enabled': True,
|
||||
'is_valid': os.path.isdir(lib_bin_dir) and os.access(lib_bin_dir, os.R_OK) and os.access(lib_bin_dir, os.W_OK), # read + write
|
||||
},
|
||||
})
|
||||
lib_bin_dir = lib_dir / "bin"
|
||||
|
||||
return benedict(
|
||||
{
|
||||
"PACKAGE_DIR": {
|
||||
"path": (PACKAGE_DIR).resolve(),
|
||||
"enabled": True,
|
||||
"is_valid": os.access(PACKAGE_DIR / "__main__.py", os.X_OK), # executable
|
||||
},
|
||||
"TEMPLATES_DIR": {
|
||||
"path": CONSTANTS.TEMPLATES_DIR.resolve(),
|
||||
"enabled": True,
|
||||
"is_valid": os.access(CONSTANTS.STATIC_DIR, os.R_OK) and os.access(CONSTANTS.STATIC_DIR, os.X_OK), # read + list
|
||||
},
|
||||
"CUSTOM_TEMPLATES_DIR": {
|
||||
"path": STORAGE_CONFIG.CUSTOM_TEMPLATES_DIR.resolve(),
|
||||
"enabled": os.path.isdir(STORAGE_CONFIG.CUSTOM_TEMPLATES_DIR),
|
||||
"is_valid": os.path.isdir(STORAGE_CONFIG.CUSTOM_TEMPLATES_DIR)
|
||||
and os.access(STORAGE_CONFIG.CUSTOM_TEMPLATES_DIR, os.R_OK), # read
|
||||
},
|
||||
"USER_PLUGINS_DIR": {
|
||||
"path": CONSTANTS.USER_PLUGINS_DIR.resolve(),
|
||||
"enabled": os.path.isdir(CONSTANTS.USER_PLUGINS_DIR),
|
||||
"is_valid": os.path.isdir(CONSTANTS.USER_PLUGINS_DIR) and os.access(CONSTANTS.USER_PLUGINS_DIR, os.R_OK), # read
|
||||
},
|
||||
"LIB_DIR": {
|
||||
"path": lib_dir.resolve(),
|
||||
"enabled": True,
|
||||
"is_valid": os.path.isdir(lib_dir) and os.access(lib_dir, os.R_OK) and os.access(lib_dir, os.W_OK), # read + write
|
||||
},
|
||||
"LIB_BIN_DIR": {
|
||||
"path": lib_bin_dir.resolve(),
|
||||
"enabled": True,
|
||||
"is_valid": os.path.isdir(lib_bin_dir)
|
||||
and os.access(lib_bin_dir, os.R_OK)
|
||||
and os.access(lib_bin_dir, os.W_OK), # read + write
|
||||
},
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
# @cache
|
||||
@@ -340,9 +374,9 @@ def get_code_locations():
|
||||
# - ok to have a long path (doesnt contain SOCKETS)
|
||||
# """
|
||||
# from .version import detect_installed_version
|
||||
|
||||
|
||||
# HOST_DIRS = PlatformDirs(appname='archivebox', appauthor='ArchiveBox', version=detect_installed_version(), opinion=True, ensure_exists=False)
|
||||
|
||||
|
||||
# lib_dir = tempfile.gettempdir()
|
||||
# try:
|
||||
# if 'SYSTEM_LIB_DIR' in os.environ:
|
||||
@@ -350,7 +384,7 @@ def get_code_locations():
|
||||
# else:
|
||||
# with SudoPermission(uid=ARCHIVEBOX_USER, fallback=True):
|
||||
# lib_dir = HOST_DIRS.site_data_path
|
||||
|
||||
|
||||
# # Docker: /usr/local/share/archivebox/0.8.5
|
||||
# # Ubuntu: /usr/local/share/archivebox/0.8.5
|
||||
# # macOS: /Library/Application Support/archivebox
|
||||
@@ -358,16 +392,16 @@ def get_code_locations():
|
||||
# with SudoPermission(uid=0, fallback=True):
|
||||
# lib_dir.mkdir(parents=True, exist_ok=True)
|
||||
# except PermissionError:
|
||||
# # our user cannot
|
||||
# # our user cannot
|
||||
# lib_dir = HOST_DIRS.user_data_path
|
||||
# lib_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
|
||||
# if IS_ROOT or not dir_is_writable(lib_dir, uid=ARCHIVEBOX_USER):
|
||||
# if IS_ROOT:
|
||||
# # make sure lib dir is owned by the archivebox user, not root
|
||||
# with SudoPermission(uid=0):
|
||||
# if ARCHIVEBOX_USER == 0:
|
||||
# # print(f'[yellow]:warning: Waring: Creating SYSTEM_LIB_DIR {lib_dir} with mode 777 so that non-root archivebox users can share it.[/yellow] (caches shared libs used by archivebox for performance)', file=sys.stderr)
|
||||
# # print(f'[yellow]:warning: Warning: Creating SYSTEM_LIB_DIR {lib_dir} with mode 777 so that non-root archivebox users can share it.[/yellow] (caches shared libs used by archivebox for performance)', file=sys.stderr)
|
||||
# os.system(f'chmod -R 777 "{lib_dir}"')
|
||||
# else:
|
||||
# os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{lib_dir}"')
|
||||
@@ -376,9 +410,9 @@ def get_code_locations():
|
||||
# except (PermissionError, AssertionError):
|
||||
# # raise PermissionError(f'SYSTEM_LIB_DIR {lib_dir} is not writable by archivebox user {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}')
|
||||
# print(f'[red]:cross_mark: ERROR: SYSTEM_LIB_DIR {lib_dir} is not writable by archivebox user {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/red]', file=sys.stderr)
|
||||
|
||||
|
||||
# return lib_dir
|
||||
|
||||
|
||||
# @cache
|
||||
# def get_TMP_DIR():
|
||||
# """
|
||||
@@ -390,9 +424,9 @@ def get_code_locations():
|
||||
# - must be cleared on every archivebox version upgrade
|
||||
# """
|
||||
# from .version import detect_installed_version
|
||||
|
||||
|
||||
# HOST_DIRS = PlatformDirs(appname='archivebox', appauthor='ArchiveBox', version=detect_installed_version(), opinion=True, ensure_exists=False)
|
||||
|
||||
|
||||
# # print('DATA_DIR OWNED BY:', ARCHIVEBOX_USER, ARCHIVEBOX_GROUP)
|
||||
# # print('RUNNING AS:', self.PUID, self.PGID)
|
||||
# run_dir = tempfile.gettempdir()
|
||||
@@ -405,7 +439,7 @@ def get_code_locations():
|
||||
# if IS_ROOT:
|
||||
# with SudoPermission(uid=0, fallback=False):
|
||||
# if ARCHIVEBOX_USER == 0:
|
||||
# # print(f'[yellow]:warning: Waring: Creating SYSTEM_TMP_DIR {run_dir} with mode 777 so that non-root archivebox users can access it.[/yellow]', file=sys.stderr)
|
||||
# # print(f'[yellow]:warning: Warning: Creating SYSTEM_TMP_DIR {run_dir} with mode 777 so that non-root archivebox users can access it.[/yellow]', file=sys.stderr)
|
||||
# os.system(f'chmod -R 777 "{run_dir}"')
|
||||
# else:
|
||||
# os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{run_dir}"')
|
||||
@@ -413,30 +447,30 @@ def get_code_locations():
|
||||
# raise PermissionError()
|
||||
# assert len(str(run_dir / 'supervisord.conf')) < 95, 'SYSTEM_TMP_DIR path is too long, please set SYSTEM_TMP_DIR env variable to a shorter path (unfortunately unix requires socket paths be < 108 chars)'
|
||||
# return run_dir
|
||||
|
||||
|
||||
# run_dir = (HOST_DIRS.site_runtime_path / get_collection_id(DATA_DIR=DATA_DIR)).resolve()
|
||||
# try:
|
||||
# assert len(str(run_dir)) + len('/supervisord.sock') < 95
|
||||
# except AssertionError:
|
||||
# run_dir = Path(tempfile.gettempdir()).resolve() / 'archivebox' / get_collection_id(DATA_DIR=DATA_DIR)
|
||||
# assert len(str(run_dir)) + len('/supervisord.sock') < 95, 'SYSTEM_TMP_DIR path is too long, please set SYSTEM_TMP_DIR env variable to a shorter path (unfortunately unix requires socket paths be < 108 chars)'
|
||||
|
||||
|
||||
# with SudoPermission(uid=0, fallback=True):
|
||||
# run_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
|
||||
# if IS_ROOT or not dir_is_writable(run_dir, uid=ARCHIVEBOX_USER):
|
||||
# if IS_ROOT:
|
||||
# with SudoPermission(uid=0):
|
||||
# if ARCHIVEBOX_USER == 0:
|
||||
# # print(f'[yellow]:warning: Waring: Creating SYSTEM_TMP_DIR {run_dir} with mode 777 so that non-root archivebox users can access it.[/yellow]', file=sys.stderr)
|
||||
# # print(f'[yellow]:warning: Warning: Creating SYSTEM_TMP_DIR {run_dir} with mode 777 so that non-root archivebox users can access it.[/yellow]', file=sys.stderr)
|
||||
# os.system(f'chmod -R 777 "{run_dir}"')
|
||||
# else:
|
||||
# os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{run_dir}"')
|
||||
# else:
|
||||
# raise PermissionError()
|
||||
|
||||
|
||||
# except (PermissionError, AssertionError):
|
||||
# # raise PermissionError(f'SYSTEM_TMP_DIR {run_dir} is not writable by archivebox user {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}')
|
||||
# print(f'[red]:cross_mark: ERROR: SYSTEM_TMP_DIR {run_dir} is not writable by archivebox user {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/red]', file=sys.stderr)
|
||||
|
||||
|
||||
# return run_dir
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
__package__ = 'archivebox.config'
|
||||
__package__ = "archivebox.config"
|
||||
|
||||
import os
|
||||
import pwd
|
||||
@@ -17,26 +17,26 @@ from contextlib import contextmanager
|
||||
DATA_DIR = Path(os.getcwd())
|
||||
|
||||
try:
|
||||
DATA_DIR_STAT = DATA_DIR.stat()
|
||||
DATA_DIR_UID = DATA_DIR_STAT.st_uid
|
||||
DATA_DIR_GID = DATA_DIR_STAT.st_gid
|
||||
DATA_DIR_STAT = DATA_DIR.stat()
|
||||
DATA_DIR_UID = DATA_DIR_STAT.st_uid
|
||||
DATA_DIR_GID = DATA_DIR_STAT.st_gid
|
||||
except PermissionError:
|
||||
DATA_DIR_UID = 0
|
||||
DATA_DIR_GID = 0
|
||||
DATA_DIR_UID = 0
|
||||
DATA_DIR_GID = 0
|
||||
|
||||
DEFAULT_PUID = 911
|
||||
DEFAULT_PGID = 911
|
||||
RUNNING_AS_UID = os.getuid()
|
||||
RUNNING_AS_GID = os.getgid()
|
||||
EUID = os.geteuid()
|
||||
EGID = os.getegid()
|
||||
SUDO_UID = int(os.environ.get('SUDO_UID', 0))
|
||||
SUDO_GID = int(os.environ.get('SUDO_GID', 0))
|
||||
USER: str = Path('~').expanduser().resolve().name
|
||||
HOSTNAME: str = cast(str, max([socket.gethostname(), platform.node()], key=len))
|
||||
DEFAULT_PUID = 911
|
||||
DEFAULT_PGID = 911
|
||||
RUNNING_AS_UID = os.getuid()
|
||||
RUNNING_AS_GID = os.getgid()
|
||||
EUID = os.geteuid()
|
||||
EGID = os.getegid()
|
||||
SUDO_UID = int(os.environ.get("SUDO_UID", 0))
|
||||
SUDO_GID = int(os.environ.get("SUDO_GID", 0))
|
||||
USER: str = Path("~").expanduser().resolve().name
|
||||
HOSTNAME: str = cast(str, max([socket.gethostname(), platform.node()], key=len))
|
||||
|
||||
IS_ROOT = RUNNING_AS_UID == 0
|
||||
IN_DOCKER = os.environ.get('IN_DOCKER', False) in ('1', 'true', 'True', 'TRUE', 'yes')
|
||||
IN_DOCKER = os.environ.get("IN_DOCKER", False) in ("1", "true", "True", "TRUE", "yes")
|
||||
# IN_DOCKER_COMPOSE = # TODO: figure out a way to detect if running in docker compose
|
||||
|
||||
|
||||
@@ -47,74 +47,79 @@ if RUNNING_AS_UID == 0:
|
||||
# if we are running as root it's really hard to figure out what the correct archivebox user should be
|
||||
# as a last resort instead of setting DATA_DIR ownership to 0:0 (which breaks it for non-root users)
|
||||
# check if 911:911 archivebox user exists on host system, and use it instead of 0
|
||||
if pwd.getpwuid(DEFAULT_PUID).pw_name == 'archivebox':
|
||||
if pwd.getpwuid(DEFAULT_PUID).pw_name == "archivebox":
|
||||
FALLBACK_UID = DEFAULT_PUID
|
||||
FALLBACK_GID = DEFAULT_PGID
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
os.environ.setdefault('PUID', str(DATA_DIR_UID or EUID or RUNNING_AS_UID or FALLBACK_UID))
|
||||
os.environ.setdefault('PGID', str(DATA_DIR_GID or EGID or RUNNING_AS_GID or FALLBACK_GID))
|
||||
os.environ.setdefault("PUID", str(DATA_DIR_UID or EUID or RUNNING_AS_UID or FALLBACK_UID))
|
||||
os.environ.setdefault("PGID", str(DATA_DIR_GID or EGID or RUNNING_AS_GID or FALLBACK_GID))
|
||||
|
||||
ARCHIVEBOX_USER = int(os.environ['PUID'])
|
||||
ARCHIVEBOX_GROUP = int(os.environ['PGID'])
|
||||
ARCHIVEBOX_USER = int(os.environ["PUID"])
|
||||
ARCHIVEBOX_GROUP = int(os.environ["PGID"])
|
||||
if not USER:
|
||||
try:
|
||||
# alternative method 1 to get username
|
||||
USER = pwd.getpwuid(ARCHIVEBOX_USER).pw_name
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
if not USER:
|
||||
try:
|
||||
# alternative method 2 to get username
|
||||
import getpass
|
||||
|
||||
USER = getpass.getuser()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
if not USER:
|
||||
try:
|
||||
# alternative method 3 to get username
|
||||
USER = os.getlogin() or 'archivebox'
|
||||
USER = os.getlogin() or "archivebox"
|
||||
except Exception:
|
||||
USER = 'archivebox'
|
||||
|
||||
USER = "archivebox"
|
||||
|
||||
ARCHIVEBOX_USER_EXISTS = False
|
||||
try:
|
||||
pwd.getpwuid(ARCHIVEBOX_USER)
|
||||
ARCHIVEBOX_USER_EXISTS = True
|
||||
except Exception:
|
||||
ARCHIVEBOX_USER_EXISTS = False
|
||||
|
||||
|
||||
|
||||
#############################################################################################
|
||||
|
||||
|
||||
def drop_privileges():
|
||||
"""If running as root, drop privileges to the user that owns the data dir (or PUID)"""
|
||||
|
||||
|
||||
# always run archivebox as the user that owns the data dir, never as root
|
||||
if os.getuid() == 0:
|
||||
# drop permissions to the user that owns the data dir / provided PUID
|
||||
if os.geteuid() != ARCHIVEBOX_USER and ARCHIVEBOX_USER != 0 and ARCHIVEBOX_USER_EXISTS:
|
||||
# drop our effective UID to the archivebox user's UID
|
||||
os.seteuid(ARCHIVEBOX_USER)
|
||||
|
||||
|
||||
# update environment variables so that subprocesses dont try to write to /root
|
||||
pw_record = pwd.getpwuid(ARCHIVEBOX_USER)
|
||||
os.environ['HOME'] = pw_record.pw_dir
|
||||
os.environ['LOGNAME'] = pw_record.pw_name
|
||||
os.environ['USER'] = pw_record.pw_name
|
||||
os.environ["HOME"] = pw_record.pw_dir
|
||||
os.environ["LOGNAME"] = pw_record.pw_name
|
||||
os.environ["USER"] = pw_record.pw_name
|
||||
|
||||
if ARCHIVEBOX_USER == 0 or not ARCHIVEBOX_USER_EXISTS:
|
||||
print('[yellow]:warning: Running as [red]root[/red] is not recommended and may make your [blue]DATA_DIR[/blue] inaccessible to other users on your system.[/yellow] (use [blue]sudo[/blue] instead)', file=sys.stderr)
|
||||
print(
|
||||
"[yellow]:warning: Running as [red]root[/red] is not recommended and may make your [blue]DATA_DIR[/blue] inaccessible to other users on your system.[/yellow] (use [blue]sudo[/blue] instead)",
|
||||
file=sys.stderr,
|
||||
)
|
||||
|
||||
|
||||
@contextmanager
|
||||
def SudoPermission(uid=0, fallback=False):
|
||||
"""Attempt to run code with sudo permissions for a given user (or root)"""
|
||||
|
||||
|
||||
if os.geteuid() == uid:
|
||||
# no need to change effective UID, we are already that user
|
||||
yield
|
||||
@@ -125,7 +130,7 @@ def SudoPermission(uid=0, fallback=False):
|
||||
os.seteuid(uid)
|
||||
except PermissionError as err:
|
||||
if not fallback:
|
||||
raise PermissionError(f'Not enough permissions to run code as uid={uid}, please retry with sudo') from err
|
||||
raise PermissionError(f"Not enough permissions to run code as uid={uid}, please retry with sudo") from err
|
||||
try:
|
||||
# yield back to the caller so they can run code inside context as root
|
||||
yield
|
||||
@@ -135,4 +140,4 @@ def SudoPermission(uid=0, fallback=False):
|
||||
os.seteuid(ARCHIVEBOX_USER)
|
||||
except PermissionError as err:
|
||||
if not fallback:
|
||||
raise PermissionError(f'Failed to revert uid={uid} back to {ARCHIVEBOX_USER} after running code with sudo') from err
|
||||
raise PermissionError(f"Failed to revert uid={uid} back to {ARCHIVEBOX_USER} after running code with sudo") from err
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
__package__ = 'archivebox.config'
|
||||
__package__ = "archivebox.config"
|
||||
|
||||
import os
|
||||
import importlib.metadata
|
||||
@@ -6,71 +6,71 @@ import importlib.metadata
|
||||
from pathlib import Path
|
||||
from functools import cache
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
|
||||
#############################################################################################
|
||||
|
||||
IN_DOCKER = os.environ.get('IN_DOCKER', False) in ('1', 'true', 'True', 'TRUE', 'yes')
|
||||
IN_DOCKER = os.environ.get("IN_DOCKER", False) in ("1", "true", "True", "TRUE", "yes")
|
||||
|
||||
PACKAGE_DIR: Path = Path(__file__).resolve().parent.parent # archivebox source code dir
|
||||
DATA_DIR: Path = Path(os.environ.get('DATA_DIR', os.getcwd())).resolve() # archivebox user data dir
|
||||
ARCHIVE_DIR: Path = DATA_DIR / 'archive' # archivebox snapshot data dir
|
||||
PACKAGE_DIR: Path = Path(__file__).resolve().parent.parent # archivebox source code dir
|
||||
DATA_DIR: Path = Path(os.environ.get("DATA_DIR", os.getcwd())).resolve() # archivebox user data dir
|
||||
ARCHIVE_DIR: Path = DATA_DIR / "archive" # archivebox snapshot data dir
|
||||
|
||||
#############################################################################################
|
||||
|
||||
|
||||
@cache
|
||||
def detect_installed_version(PACKAGE_DIR: Path=PACKAGE_DIR):
|
||||
def detect_installed_version(PACKAGE_DIR: Path = PACKAGE_DIR):
|
||||
"""Autodetect the installed archivebox version by using pip package metadata, pyproject.toml file, or package.json file"""
|
||||
try:
|
||||
# if in production install, use pip-installed package metadata
|
||||
return importlib.metadata.version('archivebox').strip()
|
||||
return importlib.metadata.version("archivebox").strip()
|
||||
except importlib.metadata.PackageNotFoundError:
|
||||
pass
|
||||
|
||||
try:
|
||||
# if in dev Git repo dir, use pyproject.toml file
|
||||
pyproject_config = (PACKAGE_DIR.parent / 'pyproject.toml').read_text().split('\n')
|
||||
pyproject_config = (PACKAGE_DIR.parent / "pyproject.toml").read_text().split("\n")
|
||||
for line in pyproject_config:
|
||||
if line.startswith('version = '):
|
||||
return line.split(' = ', 1)[-1].strip('"').strip()
|
||||
if line.startswith("version = "):
|
||||
return line.split(" = ", 1)[-1].strip('"').strip()
|
||||
except FileNotFoundError:
|
||||
# building docs, pyproject.toml is not available
|
||||
pass
|
||||
|
||||
# raise Exception('Failed to detect installed archivebox version!')
|
||||
return 'dev'
|
||||
return "dev"
|
||||
|
||||
|
||||
@cache
|
||||
def get_COMMIT_HASH() -> Optional[str]:
|
||||
def get_COMMIT_HASH() -> str | None:
|
||||
try:
|
||||
git_dir = PACKAGE_DIR.parent / '.git'
|
||||
ref = (git_dir / 'HEAD').read_text().strip().split(' ')[-1]
|
||||
git_dir = PACKAGE_DIR.parent / ".git"
|
||||
ref = (git_dir / "HEAD").read_text().strip().split(" ")[-1]
|
||||
commit_hash = git_dir.joinpath(ref).read_text().strip()
|
||||
return commit_hash
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
try:
|
||||
return list((PACKAGE_DIR.parent / '.git/refs/heads/').glob('*'))[0].read_text().strip()
|
||||
return list((PACKAGE_DIR.parent / ".git/refs/heads/").glob("*"))[0].read_text().strip()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
return None
|
||||
|
||||
|
||||
|
||||
@cache
|
||||
def get_BUILD_TIME() -> str:
|
||||
if IN_DOCKER:
|
||||
try:
|
||||
# if we're in the archivebox official docker image, /VERSION.txt will contain the build time
|
||||
docker_build_end_time = Path('/VERSION.txt').read_text().rsplit('BUILD_END_TIME=')[-1].split('\n', 1)[0]
|
||||
docker_build_end_time = Path("/VERSION.txt").read_text().rsplit("BUILD_END_TIME=")[-1].split("\n", 1)[0]
|
||||
return docker_build_end_time
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
src_last_modified_unix_timestamp = (PACKAGE_DIR / 'README.md').stat().st_mtime
|
||||
return datetime.fromtimestamp(src_last_modified_unix_timestamp).strftime('%Y-%m-%d %H:%M:%S %s')
|
||||
src_last_modified_unix_timestamp = (PACKAGE_DIR / "README.md").stat().st_mtime
|
||||
return datetime.fromtimestamp(src_last_modified_unix_timestamp).strftime("%Y-%m-%d %H:%M:%S %s")
|
||||
|
||||
|
||||
# def get_versions_available_on_github(config):
|
||||
@@ -78,14 +78,14 @@ def get_BUILD_TIME() -> str:
|
||||
# returns a dictionary containing the ArchiveBox GitHub release info for
|
||||
# the recommended upgrade version and the currently installed version
|
||||
# """
|
||||
|
||||
|
||||
# # we only want to perform the (relatively expensive) check for new versions
|
||||
# # when its most relevant, e.g. when the user runs a long-running command
|
||||
# subcommand_run_by_user = sys.argv[3] if len(sys.argv) > 3 else 'help'
|
||||
# long_running_commands = ('add', 'schedule', 'update', 'status', 'server')
|
||||
# if subcommand_run_by_user not in long_running_commands:
|
||||
# return None
|
||||
|
||||
|
||||
# github_releases_api = "https://api.github.com/repos/ArchiveBox/ArchiveBox/releases"
|
||||
# response = requests.get(github_releases_api)
|
||||
# if response.status_code != 200:
|
||||
@@ -104,7 +104,7 @@ def get_BUILD_TIME() -> str:
|
||||
# break
|
||||
|
||||
# current_version = current_version or all_releases[-1]
|
||||
|
||||
|
||||
# # recommended version is whatever comes after current_version in the release list
|
||||
# # (perhaps too conservative to only recommend upgrading one version at a time, but it's safest)
|
||||
# try:
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
__package__ = 'archivebox.config'
|
||||
__package__ = "archivebox.config"
|
||||
|
||||
import html
|
||||
import json
|
||||
@@ -6,7 +6,8 @@ import os
|
||||
import inspect
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Any, Callable, Dict
|
||||
from typing import Any
|
||||
from collections.abc import Callable
|
||||
from urllib.parse import quote, urlencode
|
||||
from django.http import HttpRequest
|
||||
from django.utils import timezone
|
||||
@@ -21,30 +22,48 @@ from archivebox.misc.util import parse_date
|
||||
|
||||
from archivebox.machine.models import Binary
|
||||
|
||||
ABX_PLUGINS_DOCS_BASE_URL = 'https://archivebox.github.io/abx-plugins/'
|
||||
ABX_PLUGINS_GITHUB_BASE_URL = 'https://github.com/ArchiveBox/abx-plugins/tree/main/abx_plugins/plugins/'
|
||||
LIVE_CONFIG_BASE_URL = '/admin/environment/config/'
|
||||
ENVIRONMENT_BINARIES_BASE_URL = '/admin/environment/binaries/'
|
||||
INSTALLED_BINARIES_BASE_URL = '/admin/machine/binary/'
|
||||
ABX_PLUGINS_DOCS_BASE_URL = "https://archivebox.github.io/abx-plugins/"
|
||||
ABX_PLUGINS_GITHUB_BASE_URL = "https://github.com/ArchiveBox/abx-plugins/tree/main/abx_plugins/plugins/"
|
||||
LIVE_CONFIG_BASE_URL = "/admin/environment/config/"
|
||||
ENVIRONMENT_BINARIES_BASE_URL = "/admin/environment/binaries/"
|
||||
INSTALLED_BINARIES_BASE_URL = "/admin/machine/binary/"
|
||||
|
||||
|
||||
# Common binaries to check for
|
||||
KNOWN_BINARIES = [
|
||||
'wget', 'curl', 'chromium', 'chrome', 'google-chrome', 'google-chrome-stable',
|
||||
'node', 'npm', 'npx', 'yt-dlp',
|
||||
'git', 'singlefile', 'readability-extractor', 'mercury-parser',
|
||||
'python3', 'python', 'bash', 'zsh',
|
||||
'ffmpeg', 'ripgrep', 'rg', 'sonic', 'archivebox',
|
||||
"wget",
|
||||
"curl",
|
||||
"chromium",
|
||||
"chrome",
|
||||
"google-chrome",
|
||||
"google-chrome-stable",
|
||||
"node",
|
||||
"npm",
|
||||
"npx",
|
||||
"yt-dlp",
|
||||
"git",
|
||||
"singlefile",
|
||||
"readability-extractor",
|
||||
"mercury-parser",
|
||||
"python3",
|
||||
"python",
|
||||
"bash",
|
||||
"zsh",
|
||||
"ffmpeg",
|
||||
"ripgrep",
|
||||
"rg",
|
||||
"sonic",
|
||||
"archivebox",
|
||||
]
|
||||
|
||||
CANONICAL_BINARY_ALIASES = {
|
||||
'youtube-dl': 'yt-dlp',
|
||||
'ytdlp': 'yt-dlp',
|
||||
"youtube-dl": "yt-dlp",
|
||||
"ytdlp": "yt-dlp",
|
||||
}
|
||||
|
||||
|
||||
def is_superuser(request: HttpRequest) -> bool:
|
||||
return bool(getattr(request.user, 'is_superuser', False))
|
||||
return bool(getattr(request.user, "is_superuser", False))
|
||||
|
||||
|
||||
def format_parsed_datetime(value: object) -> str:
|
||||
@@ -55,9 +74,9 @@ def format_parsed_datetime(value: object) -> str:
|
||||
JSON_TOKEN_RE = re.compile(
|
||||
r'(?P<key>"(?:\\u[a-fA-F0-9]{4}|\\[^u]|[^\\"])*")(?=\s*:)'
|
||||
r'|(?P<string>"(?:\\u[a-fA-F0-9]{4}|\\[^u]|[^\\"])*")'
|
||||
r'|(?P<boolean>\btrue\b|\bfalse\b)'
|
||||
r'|(?P<null>\bnull\b)'
|
||||
r'|(?P<number>-?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?)'
|
||||
r"|(?P<boolean>\btrue\b|\bfalse\b)"
|
||||
r"|(?P<null>\bnull\b)"
|
||||
r"|(?P<number>-?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?)",
|
||||
)
|
||||
|
||||
|
||||
@@ -65,13 +84,14 @@ def render_code_block(text: str, *, highlighted: bool = False) -> str:
|
||||
code = html.escape(text, quote=False)
|
||||
|
||||
if highlighted:
|
||||
|
||||
def _wrap_token(match: re.Match[str]) -> str:
|
||||
styles = {
|
||||
'key': 'color: #0550ae;',
|
||||
'string': 'color: #0a7f45;',
|
||||
'boolean': 'color: #8250df; font-weight: 600;',
|
||||
'null': 'color: #6e7781; font-style: italic;',
|
||||
'number': 'color: #b35900;',
|
||||
"key": "color: #0550ae;",
|
||||
"string": "color: #0a7f45;",
|
||||
"boolean": "color: #8250df; font-weight: 600;",
|
||||
"null": "color: #6e7781; font-style: italic;",
|
||||
"number": "color: #b35900;",
|
||||
}
|
||||
token_type = next(name for name, value in match.groupdict().items() if value is not None)
|
||||
return f'<span style="{styles[token_type]}">{match.group(0)}</span>'
|
||||
@@ -82,9 +102,9 @@ def render_code_block(text: str, *, highlighted: bool = False) -> str:
|
||||
'<pre style="max-height: 600px; overflow: auto; background: #f6f8fa; '
|
||||
'border: 1px solid #d0d7de; border-radius: 6px; padding: 12px; margin: 0;">'
|
||||
'<code style="font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, '
|
||||
'\'Liberation Mono\', monospace; white-space: pre; line-height: 1.5;">'
|
||||
f'{code}'
|
||||
'</code></pre>'
|
||||
"'Liberation Mono', monospace; white-space: pre; line-height: 1.5;\">"
|
||||
f"{code}"
|
||||
"</code></pre>"
|
||||
)
|
||||
|
||||
|
||||
@@ -93,34 +113,35 @@ def render_highlighted_json_block(value: Any) -> str:
|
||||
|
||||
|
||||
def get_plugin_docs_url(plugin_name: str) -> str:
|
||||
return f'{ABX_PLUGINS_DOCS_BASE_URL}#{plugin_name}'
|
||||
return f"{ABX_PLUGINS_DOCS_BASE_URL}#{plugin_name}"
|
||||
|
||||
|
||||
def get_plugin_hook_source_url(plugin_name: str, hook_name: str) -> str:
|
||||
return f'{ABX_PLUGINS_GITHUB_BASE_URL}{quote(plugin_name)}/{quote(hook_name)}'
|
||||
return f"{ABX_PLUGINS_GITHUB_BASE_URL}{quote(plugin_name)}/{quote(hook_name)}"
|
||||
|
||||
|
||||
def get_live_config_url(key: str) -> str:
|
||||
return f'{LIVE_CONFIG_BASE_URL}{quote(key)}/'
|
||||
return f"{LIVE_CONFIG_BASE_URL}{quote(key)}/"
|
||||
|
||||
|
||||
def get_environment_binary_url(name: str) -> str:
|
||||
return f'{ENVIRONMENT_BINARIES_BASE_URL}{quote(name)}/'
|
||||
return f"{ENVIRONMENT_BINARIES_BASE_URL}{quote(name)}/"
|
||||
|
||||
|
||||
def get_installed_binary_change_url(name: str, binary: Any) -> str | None:
|
||||
binary_id = getattr(binary, 'id', None)
|
||||
binary_id = getattr(binary, "id", None)
|
||||
if not binary_id:
|
||||
return None
|
||||
|
||||
base_url = getattr(binary, 'admin_change_url', None) or f'{INSTALLED_BINARIES_BASE_URL}{binary_id}/change/'
|
||||
changelist_filters = urlencode({'q': canonical_binary_name(name)})
|
||||
return f'{base_url}?{urlencode({"_changelist_filters": changelist_filters})}'
|
||||
base_url = getattr(binary, "admin_change_url", None) or f"{INSTALLED_BINARIES_BASE_URL}{binary_id}/change/"
|
||||
changelist_filters = urlencode({"q": canonical_binary_name(name)})
|
||||
return f"{base_url}?{urlencode({'_changelist_filters': changelist_filters})}"
|
||||
|
||||
|
||||
def get_machine_admin_url() -> str | None:
|
||||
try:
|
||||
from archivebox.machine.models import Machine
|
||||
|
||||
return Machine.current().admin_change_url
|
||||
except Exception:
|
||||
return None
|
||||
@@ -130,12 +151,14 @@ def render_code_tag_list(values: list[str]) -> str:
|
||||
if not values:
|
||||
return '<span style="color: #6e7781;">(none)</span>'
|
||||
|
||||
tags = ''.join(
|
||||
str(format_html(
|
||||
'<code style="display: inline-block; margin: 0 6px 6px 0; padding: 2px 6px; '
|
||||
'background: #f6f8fa; border: 1px solid #d0d7de; border-radius: 999px;">{}</code>',
|
||||
value,
|
||||
))
|
||||
tags = "".join(
|
||||
str(
|
||||
format_html(
|
||||
'<code style="display: inline-block; margin: 0 6px 6px 0; padding: 2px 6px; '
|
||||
'background: #f6f8fa; border: 1px solid #d0d7de; border-radius: 999px;">{}</code>',
|
||||
value,
|
||||
),
|
||||
)
|
||||
for value in values
|
||||
)
|
||||
return f'<div style="display: flex; flex-wrap: wrap;">{tags}</div>'
|
||||
@@ -143,22 +166,21 @@ def render_code_tag_list(values: list[str]) -> str:
|
||||
|
||||
def render_plugin_metadata_html(config: dict[str, Any]) -> str:
|
||||
rows = (
|
||||
('Title', config.get('title') or '(none)'),
|
||||
('Description', config.get('description') or '(none)'),
|
||||
('Required Plugins', mark_safe(render_link_tag_list(config.get('required_plugins') or [], get_plugin_docs_url))),
|
||||
('Required Binaries', mark_safe(render_link_tag_list(config.get('required_binaries') or [], get_environment_binary_url))),
|
||||
('Output MIME Types', mark_safe(render_code_tag_list(config.get('output_mimetypes') or []))),
|
||||
("Title", config.get("title") or "(none)"),
|
||||
("Description", config.get("description") or "(none)"),
|
||||
("Required Plugins", mark_safe(render_link_tag_list(config.get("required_plugins") or [], get_plugin_docs_url))),
|
||||
("Required Binaries", mark_safe(render_link_tag_list(config.get("required_binaries") or [], get_environment_binary_url))),
|
||||
("Output MIME Types", mark_safe(render_code_tag_list(config.get("output_mimetypes") or []))),
|
||||
)
|
||||
|
||||
rendered_rows = ''.join(
|
||||
str(format_html(
|
||||
'<div style="margin: 0 0 14px 0;">'
|
||||
'<div style="font-weight: 600; margin-bottom: 4px;">{}</div>'
|
||||
'<div>{}</div>'
|
||||
'</div>',
|
||||
label,
|
||||
value,
|
||||
))
|
||||
rendered_rows = "".join(
|
||||
str(
|
||||
format_html(
|
||||
'<div style="margin: 0 0 14px 0;"><div style="font-weight: 600; margin-bottom: 4px;">{}</div><div>{}</div></div>',
|
||||
label,
|
||||
value,
|
||||
),
|
||||
)
|
||||
for label, value in rows
|
||||
)
|
||||
return f'<div style="margin: 4px 0 0 0;">{rendered_rows}</div>'
|
||||
@@ -171,20 +193,28 @@ def render_link_tag_list(values: list[str], url_resolver: Callable[[str], str] |
|
||||
tags = []
|
||||
for value in values:
|
||||
if url_resolver is None:
|
||||
tags.append(str(format_html(
|
||||
'<code style="display: inline-block; margin: 0 6px 6px 0; padding: 2px 6px; '
|
||||
'background: #f6f8fa; border: 1px solid #d0d7de; border-radius: 999px;">{}</code>',
|
||||
value,
|
||||
)))
|
||||
tags.append(
|
||||
str(
|
||||
format_html(
|
||||
'<code style="display: inline-block; margin: 0 6px 6px 0; padding: 2px 6px; '
|
||||
'background: #f6f8fa; border: 1px solid #d0d7de; border-radius: 999px;">{}</code>',
|
||||
value,
|
||||
),
|
||||
),
|
||||
)
|
||||
else:
|
||||
tags.append(str(format_html(
|
||||
'<a href="{}" style="text-decoration: none;">'
|
||||
'<code style="display: inline-block; margin: 0 6px 6px 0; padding: 2px 6px; '
|
||||
'background: #f6f8fa; border: 1px solid #d0d7de; border-radius: 999px;">{}</code>'
|
||||
'</a>',
|
||||
url_resolver(value),
|
||||
value,
|
||||
)))
|
||||
tags.append(
|
||||
str(
|
||||
format_html(
|
||||
'<a href="{}" style="text-decoration: none;">'
|
||||
'<code style="display: inline-block; margin: 0 6px 6px 0; padding: 2px 6px; '
|
||||
'background: #f6f8fa; border: 1px solid #d0d7de; border-radius: 999px;">{}</code>'
|
||||
"</a>",
|
||||
url_resolver(value),
|
||||
value,
|
||||
),
|
||||
),
|
||||
)
|
||||
return f'<div style="display: flex; flex-wrap: wrap;">{"".join(tags)}</div>'
|
||||
|
||||
|
||||
@@ -195,21 +225,21 @@ def render_property_links(prop_name: str, prop_info: dict[str, Any], machine_adm
|
||||
if machine_admin_url:
|
||||
links.append(str(format_html('<a href="{}">Edit override</a>', machine_admin_url)))
|
||||
|
||||
fallback = prop_info.get('x-fallback')
|
||||
fallback = prop_info.get("x-fallback")
|
||||
if isinstance(fallback, str) and fallback:
|
||||
links.append(str(format_html('<a href="{}">Fallback: <code>{}</code></a>', get_live_config_url(fallback), fallback)))
|
||||
|
||||
aliases = prop_info.get('x-aliases') or []
|
||||
aliases = prop_info.get("x-aliases") or []
|
||||
if isinstance(aliases, list):
|
||||
for alias in aliases:
|
||||
if isinstance(alias, str) and alias:
|
||||
links.append(str(format_html('<a href="{}">Alias: <code>{}</code></a>', get_live_config_url(alias), alias)))
|
||||
|
||||
default = prop_info.get('default')
|
||||
if prop_name.endswith('_BINARY') and isinstance(default, str) and default:
|
||||
default = prop_info.get("default")
|
||||
if prop_name.endswith("_BINARY") and isinstance(default, str) and default:
|
||||
links.append(str(format_html('<a href="{}">Binary: <code>{}</code></a>', get_environment_binary_url(default), default)))
|
||||
|
||||
return ' '.join(links)
|
||||
return " ".join(links)
|
||||
|
||||
|
||||
def render_config_properties_html(properties: dict[str, Any], machine_admin_url: str | None) -> str:
|
||||
@@ -221,42 +251,48 @@ def render_config_properties_html(properties: dict[str, Any], machine_admin_url:
|
||||
header_links.insert(0, str(format_html('<a href="{}">Machine Config Editor</a>', machine_admin_url)))
|
||||
|
||||
cards = [
|
||||
f'<div style="margin: 0 0 16px 0;">{" | ".join(header_links)}</div>'
|
||||
f'<div style="margin: 0 0 16px 0;">{" | ".join(header_links)}</div>',
|
||||
]
|
||||
|
||||
for prop_name, prop_info in properties.items():
|
||||
prop_type = prop_info.get('type', 'unknown')
|
||||
prop_type = prop_info.get("type", "unknown")
|
||||
if isinstance(prop_type, list):
|
||||
prop_type = ' | '.join(str(type_name) for type_name in prop_type)
|
||||
prop_desc = prop_info.get('description', '')
|
||||
prop_type = " | ".join(str(type_name) for type_name in prop_type)
|
||||
prop_desc = prop_info.get("description", "")
|
||||
|
||||
default_html = ''
|
||||
if 'default' in prop_info:
|
||||
default_html = str(format_html(
|
||||
'<div style="margin-top: 6px;"><b>Default:</b> <code>{}</code></div>',
|
||||
prop_info['default'],
|
||||
))
|
||||
default_html = ""
|
||||
if "default" in prop_info:
|
||||
default_html = str(
|
||||
format_html(
|
||||
'<div style="margin-top: 6px;"><b>Default:</b> <code>{}</code></div>',
|
||||
prop_info["default"],
|
||||
),
|
||||
)
|
||||
|
||||
description_html = prop_desc or mark_safe('<span style="color: #6e7781;">(no description)</span>')
|
||||
cards.append(str(format_html(
|
||||
'<div style="margin: 0 0 14px 0; padding: 12px; background: #f6f8fa; border: 1px solid #d0d7de; border-radius: 6px;">'
|
||||
'<div style="margin-bottom: 6px;">'
|
||||
'<a href="{}" style="font-weight: 600;"><code>{}</code></a>'
|
||||
' <span style="color: #6e7781;">({})</span>'
|
||||
'</div>'
|
||||
'<div style="margin-bottom: 6px;">{}</div>'
|
||||
'<div style="font-size: 0.95em;">{}</div>'
|
||||
'{}'
|
||||
'</div>',
|
||||
get_live_config_url(prop_name),
|
||||
prop_name,
|
||||
prop_type,
|
||||
description_html,
|
||||
mark_safe(render_property_links(prop_name, prop_info, machine_admin_url)),
|
||||
mark_safe(default_html),
|
||||
)))
|
||||
cards.append(
|
||||
str(
|
||||
format_html(
|
||||
'<div style="margin: 0 0 14px 0; padding: 12px; background: #f6f8fa; border: 1px solid #d0d7de; border-radius: 6px;">'
|
||||
'<div style="margin-bottom: 6px;">'
|
||||
'<a href="{}" style="font-weight: 600;"><code>{}</code></a>'
|
||||
' <span style="color: #6e7781;">({})</span>'
|
||||
"</div>"
|
||||
'<div style="margin-bottom: 6px;">{}</div>'
|
||||
'<div style="font-size: 0.95em;">{}</div>'
|
||||
"{}"
|
||||
"</div>",
|
||||
get_live_config_url(prop_name),
|
||||
prop_name,
|
||||
prop_type,
|
||||
description_html,
|
||||
mark_safe(render_property_links(prop_name, prop_info, machine_admin_url)),
|
||||
mark_safe(default_html),
|
||||
),
|
||||
),
|
||||
)
|
||||
|
||||
return ''.join(cards)
|
||||
return "".join(cards)
|
||||
|
||||
|
||||
def render_hook_links_html(plugin_name: str, hooks: list[str], source: str) -> str:
|
||||
@@ -265,40 +301,47 @@ def render_hook_links_html(plugin_name: str, hooks: list[str], source: str) -> s
|
||||
|
||||
items = []
|
||||
for hook_name in hooks:
|
||||
if source == 'builtin':
|
||||
items.append(str(format_html(
|
||||
'<div style="margin: 0 0 8px 0;">'
|
||||
'<a href="{}" target="_blank" rel="noopener noreferrer"><code>{}</code></a>'
|
||||
'</div>',
|
||||
get_plugin_hook_source_url(plugin_name, hook_name),
|
||||
hook_name,
|
||||
)))
|
||||
if source == "builtin":
|
||||
items.append(
|
||||
str(
|
||||
format_html(
|
||||
'<div style="margin: 0 0 8px 0;"><a href="{}" target="_blank" rel="noopener noreferrer"><code>{}</code></a></div>',
|
||||
get_plugin_hook_source_url(plugin_name, hook_name),
|
||||
hook_name,
|
||||
),
|
||||
),
|
||||
)
|
||||
else:
|
||||
items.append(str(format_html(
|
||||
'<div style="margin: 0 0 8px 0;"><code>{}</code></div>',
|
||||
hook_name,
|
||||
)))
|
||||
return ''.join(items)
|
||||
items.append(
|
||||
str(
|
||||
format_html(
|
||||
'<div style="margin: 0 0 8px 0;"><code>{}</code></div>',
|
||||
hook_name,
|
||||
),
|
||||
),
|
||||
)
|
||||
return "".join(items)
|
||||
|
||||
|
||||
def render_binary_detail_description(name: str, merged: dict[str, Any], db_binary: Any) -> str:
|
||||
installed_binary_url = get_installed_binary_change_url(name, db_binary)
|
||||
|
||||
if installed_binary_url:
|
||||
return str(format_html(
|
||||
'<code>{}</code><br/>'
|
||||
'<a href="{}">View Installed Binary Record</a>',
|
||||
merged['abspath'],
|
||||
installed_binary_url,
|
||||
))
|
||||
return str(
|
||||
format_html(
|
||||
'<code>{}</code><br/><a href="{}">View Installed Binary Record</a>',
|
||||
merged["abspath"],
|
||||
installed_binary_url,
|
||||
),
|
||||
)
|
||||
|
||||
return str(format_html('<code>{}</code>', merged['abspath']))
|
||||
return str(format_html("<code>{}</code>", merged["abspath"]))
|
||||
|
||||
|
||||
def obj_to_yaml(obj: Any, indent: int = 0) -> str:
|
||||
indent_str = " " * indent
|
||||
if indent == 0:
|
||||
indent_str = '\n' # put extra newline between top-level entries
|
||||
indent_str = "\n" # put extra newline between top-level entries
|
||||
|
||||
if isinstance(obj, dict):
|
||||
if not obj:
|
||||
@@ -326,11 +369,11 @@ def obj_to_yaml(obj: Any, indent: int = 0) -> str:
|
||||
return f" {str(obj)}"
|
||||
|
||||
elif callable(obj):
|
||||
source = '\n'.join(
|
||||
'' if 'def ' in line else line
|
||||
for line in inspect.getsource(obj).split('\n')
|
||||
if line.strip()
|
||||
).split('lambda: ')[-1].rstrip(',')
|
||||
source = (
|
||||
"\n".join("" if "def " in line else line for line in inspect.getsource(obj).split("\n") if line.strip())
|
||||
.split("lambda: ")[-1]
|
||||
.rstrip(",")
|
||||
)
|
||||
return f" {indent_str} " + source.replace("\n", f"\n{indent_str} ")
|
||||
|
||||
else:
|
||||
@@ -350,67 +393,64 @@ def _binary_sort_key(binary: Binary) -> tuple[int, int, int, Any]:
|
||||
)
|
||||
|
||||
|
||||
def get_db_binaries_by_name() -> Dict[str, Binary]:
|
||||
grouped: Dict[str, list[Binary]] = {}
|
||||
def get_db_binaries_by_name() -> dict[str, Binary]:
|
||||
grouped: dict[str, list[Binary]] = {}
|
||||
for binary in Binary.objects.all():
|
||||
grouped.setdefault(canonical_binary_name(binary.name), []).append(binary)
|
||||
|
||||
return {
|
||||
name: max(records, key=_binary_sort_key)
|
||||
for name, records in grouped.items()
|
||||
}
|
||||
return {name: max(records, key=_binary_sort_key) for name, records in grouped.items()}
|
||||
|
||||
|
||||
def serialize_binary_record(name: str, binary: Binary | None) -> Dict[str, Any]:
|
||||
def serialize_binary_record(name: str, binary: Binary | None) -> dict[str, Any]:
|
||||
is_installed = bool(binary and binary.status == Binary.StatusChoices.INSTALLED)
|
||||
return {
|
||||
'name': canonical_binary_name(name),
|
||||
'version': str(getattr(binary, 'version', '') or ''),
|
||||
'binprovider': str(getattr(binary, 'binprovider', '') or ''),
|
||||
'abspath': str(getattr(binary, 'abspath', '') or ''),
|
||||
'sha256': str(getattr(binary, 'sha256', '') or ''),
|
||||
'status': str(getattr(binary, 'status', '') or ''),
|
||||
'is_available': is_installed and bool(getattr(binary, 'abspath', '') or ''),
|
||||
"name": canonical_binary_name(name),
|
||||
"version": str(getattr(binary, "version", "") or ""),
|
||||
"binprovider": str(getattr(binary, "binprovider", "") or ""),
|
||||
"abspath": str(getattr(binary, "abspath", "") or ""),
|
||||
"sha256": str(getattr(binary, "sha256", "") or ""),
|
||||
"status": str(getattr(binary, "status", "") or ""),
|
||||
"is_available": is_installed and bool(getattr(binary, "abspath", "") or ""),
|
||||
}
|
||||
|
||||
|
||||
def get_filesystem_plugins() -> Dict[str, Dict[str, Any]]:
|
||||
def get_filesystem_plugins() -> dict[str, dict[str, Any]]:
|
||||
"""Discover plugins from filesystem directories."""
|
||||
import json
|
||||
from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR
|
||||
|
||||
plugins = {}
|
||||
|
||||
for base_dir, source in [(BUILTIN_PLUGINS_DIR, 'builtin'), (USER_PLUGINS_DIR, 'user')]:
|
||||
for base_dir, source in [(BUILTIN_PLUGINS_DIR, "builtin"), (USER_PLUGINS_DIR, "user")]:
|
||||
if not base_dir.exists():
|
||||
continue
|
||||
|
||||
for plugin_dir in base_dir.iterdir():
|
||||
if plugin_dir.is_dir() and not plugin_dir.name.startswith('_'):
|
||||
plugin_id = f'{source}.{plugin_dir.name}'
|
||||
if plugin_dir.is_dir() and not plugin_dir.name.startswith("_"):
|
||||
plugin_id = f"{source}.{plugin_dir.name}"
|
||||
|
||||
# Find hook scripts
|
||||
hooks = []
|
||||
for ext in ('sh', 'py', 'js'):
|
||||
hooks.extend(plugin_dir.glob(f'on_*__*.{ext}'))
|
||||
for ext in ("sh", "py", "js"):
|
||||
hooks.extend(plugin_dir.glob(f"on_*__*.{ext}"))
|
||||
|
||||
# Load config.json if it exists
|
||||
config_file = plugin_dir / 'config.json'
|
||||
config_file = plugin_dir / "config.json"
|
||||
config_data = None
|
||||
if config_file.exists():
|
||||
try:
|
||||
with open(config_file, 'r') as f:
|
||||
with open(config_file) as f:
|
||||
config_data = json.load(f)
|
||||
except (json.JSONDecodeError, IOError):
|
||||
except (json.JSONDecodeError, OSError):
|
||||
config_data = None
|
||||
|
||||
plugins[plugin_id] = {
|
||||
'id': plugin_id,
|
||||
'name': plugin_dir.name,
|
||||
'path': str(plugin_dir),
|
||||
'source': source,
|
||||
'hooks': [str(h.name) for h in hooks],
|
||||
'config': config_data,
|
||||
"id": plugin_id,
|
||||
"name": plugin_dir.name,
|
||||
"path": str(plugin_dir),
|
||||
"source": source,
|
||||
"hooks": [str(h.name) for h in hooks],
|
||||
"config": config_data,
|
||||
}
|
||||
|
||||
return plugins
|
||||
@@ -418,7 +458,7 @@ def get_filesystem_plugins() -> Dict[str, Dict[str, Any]]:
|
||||
|
||||
@render_with_table_view
|
||||
def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
|
||||
assert is_superuser(request), 'Must be a superuser to view configuration settings.'
|
||||
assert is_superuser(request), "Must be a superuser to view configuration settings."
|
||||
|
||||
rows = {
|
||||
"Binary Name": [],
|
||||
@@ -433,16 +473,16 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
|
||||
for name in all_binary_names:
|
||||
merged = serialize_binary_record(name, db_binaries.get(name))
|
||||
|
||||
rows['Binary Name'].append(ItemLink(name, key=name))
|
||||
rows["Binary Name"].append(ItemLink(name, key=name))
|
||||
|
||||
if merged['is_available']:
|
||||
rows['Found Version'].append(f"✅ {merged['version']}" if merged['version'] else '✅ found')
|
||||
rows['Provided By'].append(merged['binprovider'] or '-')
|
||||
rows['Found Abspath'].append(merged['abspath'] or '-')
|
||||
if merged["is_available"]:
|
||||
rows["Found Version"].append(f"✅ {merged['version']}" if merged["version"] else "✅ found")
|
||||
rows["Provided By"].append(merged["binprovider"] or "-")
|
||||
rows["Found Abspath"].append(merged["abspath"] or "-")
|
||||
else:
|
||||
rows['Found Version'].append('❌ missing')
|
||||
rows['Provided By'].append('-')
|
||||
rows['Found Abspath'].append('-')
|
||||
rows["Found Version"].append("❌ missing")
|
||||
rows["Provided By"].append("-")
|
||||
rows["Found Abspath"].append("-")
|
||||
|
||||
return TableContext(
|
||||
title="Binaries",
|
||||
@@ -452,23 +492,23 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
|
||||
|
||||
@render_with_item_view
|
||||
def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
|
||||
assert is_superuser(request), 'Must be a superuser to view configuration settings.'
|
||||
assert is_superuser(request), "Must be a superuser to view configuration settings."
|
||||
key = canonical_binary_name(key)
|
||||
|
||||
db_binary = get_db_binaries_by_name().get(key)
|
||||
merged = serialize_binary_record(key, db_binary)
|
||||
|
||||
if merged['is_available']:
|
||||
if merged["is_available"]:
|
||||
section: SectionData = {
|
||||
"name": key,
|
||||
"description": mark_safe(render_binary_detail_description(key, merged, db_binary)),
|
||||
"fields": {
|
||||
'name': key,
|
||||
'binprovider': merged['binprovider'] or '-',
|
||||
'abspath': merged['abspath'] or 'not found',
|
||||
'version': merged['version'] or 'unknown',
|
||||
'sha256': merged['sha256'],
|
||||
'status': merged['status'],
|
||||
"name": key,
|
||||
"binprovider": merged["binprovider"] or "-",
|
||||
"abspath": merged["abspath"] or "not found",
|
||||
"version": merged["version"] or "unknown",
|
||||
"sha256": merged["sha256"],
|
||||
"status": merged["status"],
|
||||
},
|
||||
"help_texts": {},
|
||||
}
|
||||
@@ -482,11 +522,11 @@ def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
|
||||
"name": key,
|
||||
"description": "No persisted Binary record found",
|
||||
"fields": {
|
||||
'name': key,
|
||||
'binprovider': merged['binprovider'] or 'not recorded',
|
||||
'abspath': merged['abspath'] or 'not recorded',
|
||||
'version': merged['version'] or 'N/A',
|
||||
'status': merged['status'] or 'unrecorded',
|
||||
"name": key,
|
||||
"binprovider": merged["binprovider"] or "not recorded",
|
||||
"abspath": merged["abspath"] or "not recorded",
|
||||
"version": merged["version"] or "N/A",
|
||||
"status": merged["status"] or "unrecorded",
|
||||
},
|
||||
"help_texts": {},
|
||||
}
|
||||
@@ -499,7 +539,7 @@ def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
|
||||
|
||||
@render_with_table_view
|
||||
def plugins_list_view(request: HttpRequest, **kwargs) -> TableContext:
|
||||
assert is_superuser(request), 'Must be a superuser to view configuration settings.'
|
||||
assert is_superuser(request), "Must be a superuser to view configuration settings."
|
||||
|
||||
rows = {
|
||||
"Name": [],
|
||||
@@ -512,26 +552,26 @@ def plugins_list_view(request: HttpRequest, **kwargs) -> TableContext:
|
||||
plugins = get_filesystem_plugins()
|
||||
|
||||
for plugin_id, plugin in plugins.items():
|
||||
rows['Name'].append(ItemLink(plugin['name'], key=plugin_id))
|
||||
rows['Source'].append(plugin['source'])
|
||||
rows['Path'].append(format_html('<code>{}</code>', plugin['path']))
|
||||
rows['Hooks'].append(', '.join(plugin['hooks']) or '(none)')
|
||||
rows["Name"].append(ItemLink(plugin["name"], key=plugin_id))
|
||||
rows["Source"].append(plugin["source"])
|
||||
rows["Path"].append(format_html("<code>{}</code>", plugin["path"]))
|
||||
rows["Hooks"].append(", ".join(plugin["hooks"]) or "(none)")
|
||||
|
||||
# Show config status
|
||||
if plugin.get('config'):
|
||||
config_properties = plugin['config'].get('properties', {})
|
||||
if plugin.get("config"):
|
||||
config_properties = plugin["config"].get("properties", {})
|
||||
config_count = len(config_properties)
|
||||
rows['Config'].append(f'✅ {config_count} properties' if config_count > 0 else '✅ present')
|
||||
rows["Config"].append(f"✅ {config_count} properties" if config_count > 0 else "✅ present")
|
||||
else:
|
||||
rows['Config'].append('❌ none')
|
||||
rows["Config"].append("❌ none")
|
||||
|
||||
if not plugins:
|
||||
# Show a helpful message when no plugins found
|
||||
rows['Name'].append('(no plugins found)')
|
||||
rows['Source'].append('-')
|
||||
rows['Path'].append(mark_safe('<code>abx_plugins/plugins/</code> or <code>data/custom_plugins/</code>'))
|
||||
rows['Hooks'].append('-')
|
||||
rows['Config'].append('-')
|
||||
rows["Name"].append("(no plugins found)")
|
||||
rows["Source"].append("-")
|
||||
rows["Path"].append(mark_safe("<code>abx_plugins/plugins/</code> or <code>data/custom_plugins/</code>"))
|
||||
rows["Hooks"].append("-")
|
||||
rows["Config"].append("-")
|
||||
|
||||
return TableContext(
|
||||
title="Installed plugins",
|
||||
@@ -541,7 +581,7 @@ def plugins_list_view(request: HttpRequest, **kwargs) -> TableContext:
|
||||
|
||||
@render_with_item_view
|
||||
def plugin_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
|
||||
assert is_superuser(request), 'Must be a superuser to view configuration settings.'
|
||||
assert is_superuser(request), "Must be a superuser to view configuration settings."
|
||||
|
||||
plugins = get_filesystem_plugins()
|
||||
|
||||
@@ -549,65 +589,75 @@ def plugin_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
|
||||
if not plugin:
|
||||
return ItemContext(
|
||||
slug=key,
|
||||
title=f'Plugin not found: {key}',
|
||||
title=f"Plugin not found: {key}",
|
||||
data=[],
|
||||
)
|
||||
|
||||
# Base fields that all plugins have
|
||||
docs_url = get_plugin_docs_url(plugin['name'])
|
||||
docs_url = get_plugin_docs_url(plugin["name"])
|
||||
machine_admin_url = get_machine_admin_url()
|
||||
fields = {
|
||||
"id": plugin['id'],
|
||||
"name": plugin['name'],
|
||||
"source": plugin['source'],
|
||||
"id": plugin["id"],
|
||||
"name": plugin["name"],
|
||||
"source": plugin["source"],
|
||||
}
|
||||
|
||||
sections: list[SectionData] = [{
|
||||
"name": plugin['name'],
|
||||
"description": format_html(
|
||||
'<code>{}</code><br/><a href="{}" target="_blank" rel="noopener noreferrer">ABX Plugin Docs</a>',
|
||||
plugin['path'],
|
||||
docs_url,
|
||||
),
|
||||
"fields": fields,
|
||||
"help_texts": {},
|
||||
}]
|
||||
|
||||
if plugin['hooks']:
|
||||
sections.append({
|
||||
"name": "Hooks",
|
||||
"description": mark_safe(render_hook_links_html(plugin['name'], plugin['hooks'], plugin['source'])),
|
||||
"fields": {},
|
||||
sections: list[SectionData] = [
|
||||
{
|
||||
"name": plugin["name"],
|
||||
"description": format_html(
|
||||
'<code>{}</code><br/><a href="{}" target="_blank" rel="noopener noreferrer">ABX Plugin Docs</a>',
|
||||
plugin["path"],
|
||||
docs_url,
|
||||
),
|
||||
"fields": fields,
|
||||
"help_texts": {},
|
||||
})
|
||||
},
|
||||
]
|
||||
|
||||
if plugin.get('config'):
|
||||
sections.append({
|
||||
"name": "Plugin Metadata",
|
||||
"description": mark_safe(render_plugin_metadata_html(plugin['config'])),
|
||||
"fields": {},
|
||||
"help_texts": {},
|
||||
})
|
||||
|
||||
sections.append({
|
||||
"name": "config.json",
|
||||
"description": mark_safe(render_highlighted_json_block(plugin['config'])),
|
||||
"fields": {},
|
||||
"help_texts": {},
|
||||
})
|
||||
|
||||
config_properties = plugin['config'].get('properties', {})
|
||||
if config_properties:
|
||||
sections.append({
|
||||
"name": "Config Properties",
|
||||
"description": mark_safe(render_config_properties_html(config_properties, machine_admin_url)),
|
||||
if plugin["hooks"]:
|
||||
sections.append(
|
||||
{
|
||||
"name": "Hooks",
|
||||
"description": mark_safe(render_hook_links_html(plugin["name"], plugin["hooks"], plugin["source"])),
|
||||
"fields": {},
|
||||
"help_texts": {},
|
||||
})
|
||||
},
|
||||
)
|
||||
|
||||
if plugin.get("config"):
|
||||
sections.append(
|
||||
{
|
||||
"name": "Plugin Metadata",
|
||||
"description": mark_safe(render_plugin_metadata_html(plugin["config"])),
|
||||
"fields": {},
|
||||
"help_texts": {},
|
||||
},
|
||||
)
|
||||
|
||||
sections.append(
|
||||
{
|
||||
"name": "config.json",
|
||||
"description": mark_safe(render_highlighted_json_block(plugin["config"])),
|
||||
"fields": {},
|
||||
"help_texts": {},
|
||||
},
|
||||
)
|
||||
|
||||
config_properties = plugin["config"].get("properties", {})
|
||||
if config_properties:
|
||||
sections.append(
|
||||
{
|
||||
"name": "Config Properties",
|
||||
"description": mark_safe(render_config_properties_html(config_properties, machine_admin_url)),
|
||||
"fields": {},
|
||||
"help_texts": {},
|
||||
},
|
||||
)
|
||||
|
||||
return ItemContext(
|
||||
slug=key,
|
||||
title=plugin['name'],
|
||||
title=plugin["name"],
|
||||
data=sections,
|
||||
)
|
||||
|
||||
@@ -648,20 +698,20 @@ def worker_list_view(request: HttpRequest, **kwargs) -> TableContext:
|
||||
all_config[config_name] = config_data
|
||||
|
||||
# Add top row for supervisord process manager
|
||||
rows["Name"].append(ItemLink('supervisord', key='supervisord'))
|
||||
rows["Name"].append(ItemLink("supervisord", key="supervisord"))
|
||||
supervisor_state = supervisor.getState()
|
||||
rows["State"].append(str(supervisor_state.get('statename') if isinstance(supervisor_state, dict) else ''))
|
||||
rows['PID'].append(str(supervisor.getPID()))
|
||||
rows["Started"].append('-')
|
||||
rows["Command"].append('supervisord --configuration=tmp/supervisord.conf')
|
||||
rows["State"].append(str(supervisor_state.get("statename") if isinstance(supervisor_state, dict) else ""))
|
||||
rows["PID"].append(str(supervisor.getPID()))
|
||||
rows["Started"].append("-")
|
||||
rows["Command"].append("supervisord --configuration=tmp/supervisord.conf")
|
||||
rows["Logfile"].append(
|
||||
format_html(
|
||||
'<a href="/admin/environment/logs/{}/">{}</a>',
|
||||
'supervisord',
|
||||
'logs/supervisord.log',
|
||||
)
|
||||
"supervisord",
|
||||
"logs/supervisord.log",
|
||||
),
|
||||
)
|
||||
rows['Exit Status'].append('0')
|
||||
rows["Exit Status"].append("0")
|
||||
|
||||
# Add a row for each worker process managed by supervisord
|
||||
process_items = supervisor.getAllProcessInfo()
|
||||
@@ -678,15 +728,15 @@ def worker_list_view(request: HttpRequest, **kwargs) -> TableContext:
|
||||
|
||||
rows["Name"].append(ItemLink(proc_name, key=proc_name))
|
||||
rows["State"].append(str(proc_data.get("statename") or ""))
|
||||
rows['PID'].append(proc_description.replace('pid ', ''))
|
||||
rows["PID"].append(proc_description.replace("pid ", ""))
|
||||
rows["Started"].append(format_parsed_datetime(proc_start))
|
||||
rows["Command"].append(str(proc_config.get("command") or ""))
|
||||
rows["Logfile"].append(
|
||||
format_html(
|
||||
'<a href="/admin/environment/logs/{}/">{}</a>',
|
||||
proc_logfile.split("/")[-1].split('.')[0],
|
||||
proc_logfile.split("/")[-1].split(".")[0],
|
||||
proc_logfile,
|
||||
)
|
||||
),
|
||||
)
|
||||
rows["Exit Status"].append(str(proc_data.get("exitstatus") or ""))
|
||||
|
||||
@@ -708,8 +758,8 @@ def worker_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
|
||||
supervisor = get_existing_supervisord_process()
|
||||
if supervisor is None:
|
||||
return ItemContext(
|
||||
slug='none',
|
||||
title='error: No running supervisord process.',
|
||||
slug="none",
|
||||
title="error: No running supervisord process.",
|
||||
data=[],
|
||||
)
|
||||
|
||||
@@ -721,7 +771,7 @@ def worker_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
|
||||
if isinstance(config_data, dict):
|
||||
all_config.append(config_data)
|
||||
|
||||
if key == 'supervisord':
|
||||
if key == "supervisord":
|
||||
relevant_config = CONFIG_FILE.read_text()
|
||||
relevant_logs = str(supervisor.readLog(0, 10_000_000))
|
||||
start_ts = [line for line in relevant_logs.split("\n") if "RPC interface 'supervisor' initialized" in line][-1].split(",", 1)[0]
|
||||
@@ -729,7 +779,7 @@ def worker_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
|
||||
uptime = str(timezone.now() - start_dt).split(".")[0] if start_dt else ""
|
||||
supervisor_state = supervisor.getState()
|
||||
|
||||
proc: Dict[str, object] = {
|
||||
proc: dict[str, object] = {
|
||||
"name": "supervisord",
|
||||
"pid": supervisor.getPID(),
|
||||
"statename": str(supervisor_state.get("statename") if isinstance(supervisor_state, dict) else ""),
|
||||
@@ -737,12 +787,12 @@ def worker_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
|
||||
"stop": None,
|
||||
"exitstatus": "",
|
||||
"stdout_logfile": "logs/supervisord.log",
|
||||
"description": f'pid 000, uptime {uptime}',
|
||||
"description": f"pid 000, uptime {uptime}",
|
||||
}
|
||||
else:
|
||||
worker_data = get_worker(supervisor, key)
|
||||
proc = worker_data if isinstance(worker_data, dict) else {}
|
||||
relevant_config = next((config for config in all_config if config.get('name') == key), {})
|
||||
relevant_config = next((config for config in all_config if config.get("name") == key), {})
|
||||
log_result = supervisor.tailProcessStdoutLog(key, 0, 10_000_000)
|
||||
relevant_logs = str(log_result[0] if isinstance(log_result, tuple) else log_result)
|
||||
|
||||
@@ -775,7 +825,6 @@ def worker_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
|
||||
def log_list_view(request: HttpRequest, **kwargs) -> TableContext:
|
||||
assert is_superuser(request), "Must be a superuser to view configuration settings."
|
||||
|
||||
|
||||
log_files: list[Path] = []
|
||||
for logfile in sorted(CONSTANTS.LOGS_DIR.glob("*.log"), key=os.path.getmtime)[::-1]:
|
||||
if isinstance(logfile, Path):
|
||||
@@ -793,14 +842,14 @@ def log_list_view(request: HttpRequest, **kwargs) -> TableContext:
|
||||
st = logfile.stat()
|
||||
rows["Name"].append(ItemLink("logs" + str(logfile).rsplit("/logs", 1)[-1], key=logfile.name))
|
||||
rows["Last Updated"].append(format_parsed_datetime(st.st_mtime))
|
||||
rows["Size"].append(f'{st.st_size//1000} kb')
|
||||
rows["Size"].append(f"{st.st_size // 1000} kb")
|
||||
|
||||
with open(logfile, 'rb') as f:
|
||||
with open(logfile, "rb") as f:
|
||||
try:
|
||||
f.seek(-1024, os.SEEK_END)
|
||||
except OSError:
|
||||
f.seek(0)
|
||||
last_lines = f.read().decode('utf-8', errors='replace').split("\n")
|
||||
last_lines = f.read().decode("utf-8", errors="replace").split("\n")
|
||||
non_empty_lines = [line for line in last_lines if line.strip()]
|
||||
rows["Most Recent Lines"].append(non_empty_lines[-1])
|
||||
|
||||
@@ -814,7 +863,7 @@ def log_list_view(request: HttpRequest, **kwargs) -> TableContext:
|
||||
def log_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
|
||||
assert is_superuser(request), "Must be a superuser to view configuration settings."
|
||||
|
||||
log_file = [logfile for logfile in CONSTANTS.LOGS_DIR.glob('*.log') if key in logfile.name][0]
|
||||
log_file = [logfile for logfile in CONSTANTS.LOGS_DIR.glob("*.log") if key in logfile.name][0]
|
||||
|
||||
log_text = log_file.read_text()
|
||||
log_stat = log_file.stat()
|
||||
@@ -824,7 +873,7 @@ def log_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
|
||||
"description": key,
|
||||
"fields": {
|
||||
"Path": str(log_file),
|
||||
"Size": f"{log_stat.st_size//1000} kb",
|
||||
"Size": f"{log_stat.st_size // 1000} kb",
|
||||
"Last Updated": format_parsed_datetime(log_stat.st_mtime),
|
||||
"Tail": "\n".join(log_text[-10_000:].split("\n")[-20:]),
|
||||
"Full Log": log_text,
|
||||
|
||||
@@ -1,10 +1,11 @@
|
||||
__package__ = 'archivebox.core'
|
||||
__package__ = "archivebox.core"
|
||||
__order__ = 100
|
||||
|
||||
|
||||
def register_admin(admin_site):
|
||||
"""Register the core.models views (Snapshot, ArchiveResult, Tag, etc.) with the admin site"""
|
||||
from archivebox.core.admin import register_admin as do_register
|
||||
|
||||
do_register(admin_site)
|
||||
|
||||
|
||||
@@ -17,11 +18,12 @@ def get_CONFIG():
|
||||
ARCHIVING_CONFIG,
|
||||
SEARCH_BACKEND_CONFIG,
|
||||
)
|
||||
|
||||
return {
|
||||
'SHELL_CONFIG': SHELL_CONFIG,
|
||||
'STORAGE_CONFIG': STORAGE_CONFIG,
|
||||
'GENERAL_CONFIG': GENERAL_CONFIG,
|
||||
'SERVER_CONFIG': SERVER_CONFIG,
|
||||
'ARCHIVING_CONFIG': ARCHIVING_CONFIG,
|
||||
'SEARCHBACKEND_CONFIG': SEARCH_BACKEND_CONFIG,
|
||||
"SHELL_CONFIG": SHELL_CONFIG,
|
||||
"STORAGE_CONFIG": STORAGE_CONFIG,
|
||||
"GENERAL_CONFIG": GENERAL_CONFIG,
|
||||
"SERVER_CONFIG": SERVER_CONFIG,
|
||||
"ARCHIVING_CONFIG": ARCHIVING_CONFIG,
|
||||
"SEARCHBACKEND_CONFIG": SEARCH_BACKEND_CONFIG,
|
||||
}
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
__package__ = 'archivebox.core'
|
||||
__package__ = "archivebox.core"
|
||||
|
||||
from django.contrib.auth import get_user_model
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
__package__ = 'archivebox.core'
|
||||
__package__ = "archivebox.core"
|
||||
|
||||
import html
|
||||
import json
|
||||
@@ -21,57 +21,45 @@ from django.utils.text import smart_split
|
||||
|
||||
from archivebox.config import DATA_DIR
|
||||
from archivebox.config.common import SERVER_CONFIG
|
||||
from archivebox.misc.paginators import AccelleratedPaginator
|
||||
from archivebox.misc.paginators import AcceleratedPaginator
|
||||
from archivebox.base_models.admin import BaseModelAdmin
|
||||
from archivebox.hooks import get_plugin_icon
|
||||
from archivebox.core.host_utils import build_snapshot_url
|
||||
from archivebox.core.widgets import InlineTagEditorWidget
|
||||
from archivebox.core.views import LIVE_PLUGIN_BASE_URL
|
||||
from archivebox.machine.env_utils import env_to_shell_exports
|
||||
|
||||
|
||||
from archivebox.core.models import ArchiveResult, Snapshot
|
||||
|
||||
|
||||
def _stringify_env_value(value) -> str:
|
||||
if value is None:
|
||||
return ''
|
||||
if isinstance(value, str):
|
||||
return value
|
||||
return json.dumps(value, separators=(',', ':'))
|
||||
|
||||
|
||||
def _quote_shell_string(value: str) -> str:
|
||||
return "'" + str(value).replace("'", "'\"'\"'") + "'"
|
||||
|
||||
|
||||
def _get_replay_source_url(result: ArchiveResult) -> str:
|
||||
process_env = getattr(getattr(result, 'process', None), 'env', None) or {}
|
||||
return str(process_env.get('SOURCE_URL') or result.snapshot.url or '')
|
||||
process_env = getattr(getattr(result, "process", None), "env", None) or {}
|
||||
return str(process_env.get("SOURCE_URL") or result.snapshot.url or "")
|
||||
|
||||
|
||||
def build_abx_dl_display_command(result: ArchiveResult) -> str:
|
||||
source_url = _get_replay_source_url(result)
|
||||
plugin_name = str(result.plugin or '').strip()
|
||||
plugin_name = str(result.plugin or "").strip()
|
||||
if not plugin_name and not source_url:
|
||||
return 'abx-dl'
|
||||
return "abx-dl"
|
||||
if not source_url:
|
||||
return f'abx-dl --plugins={plugin_name}'
|
||||
return f'abx-dl --plugins={plugin_name} {_quote_shell_string(source_url)}'
|
||||
return f"abx-dl --plugins={plugin_name}"
|
||||
return f"abx-dl --plugins={plugin_name} {_quote_shell_string(source_url)}"
|
||||
|
||||
|
||||
def build_abx_dl_replay_command(result: ArchiveResult) -> str:
|
||||
display_command = build_abx_dl_display_command(result)
|
||||
process = getattr(result, 'process', None)
|
||||
env = getattr(process, 'env', None) or {}
|
||||
env_items = ' '.join(
|
||||
f'{key}={shlex.quote(_stringify_env_value(value))}'
|
||||
for key, value in sorted(env.items())
|
||||
if value is not None
|
||||
)
|
||||
process = getattr(result, "process", None)
|
||||
env_items = env_to_shell_exports(getattr(process, "env", None) or {})
|
||||
snapshot_dir = shlex.quote(str(result.snapshot_dir))
|
||||
if env_items:
|
||||
return f'cd {snapshot_dir}; env {env_items} {display_command}'
|
||||
return f'cd {snapshot_dir}; {display_command}'
|
||||
return f"cd {snapshot_dir}; env {env_items} {display_command}"
|
||||
return f"cd {snapshot_dir}; {display_command}"
|
||||
|
||||
|
||||
def get_plugin_admin_url(plugin_name: str) -> str:
|
||||
@@ -81,50 +69,87 @@ def get_plugin_admin_url(plugin_name: str) -> str:
|
||||
if plugin_dir:
|
||||
builtin_root = BUILTIN_PLUGINS_DIR.resolve()
|
||||
if plugin_dir.is_relative_to(builtin_root):
|
||||
return f'{LIVE_PLUGIN_BASE_URL}builtin.{quote(plugin_name)}/'
|
||||
return f"{LIVE_PLUGIN_BASE_URL}builtin.{quote(plugin_name)}/"
|
||||
|
||||
user_root = USER_PLUGINS_DIR.resolve()
|
||||
if plugin_dir.is_relative_to(user_root):
|
||||
return f'{LIVE_PLUGIN_BASE_URL}user.{quote(plugin_name)}/'
|
||||
return f"{LIVE_PLUGIN_BASE_URL}user.{quote(plugin_name)}/"
|
||||
|
||||
return f'{LIVE_PLUGIN_BASE_URL}builtin.{quote(plugin_name)}/'
|
||||
return f"{LIVE_PLUGIN_BASE_URL}builtin.{quote(plugin_name)}/"
|
||||
|
||||
|
||||
def render_archiveresults_list(archiveresults_qs, limit=50):
|
||||
"""Render a nice inline list view of archive results with status, plugin, output, and actions."""
|
||||
|
||||
results = list(archiveresults_qs.order_by('plugin').select_related('snapshot')[:limit])
|
||||
result_ids = list(archiveresults_qs.order_by("plugin").values_list("pk", flat=True)[:limit])
|
||||
if not result_ids:
|
||||
return mark_safe('<div style="color: #64748b; font-style: italic; padding: 16px 0;">No Archive Results yet...</div>')
|
||||
|
||||
results_by_id = {
|
||||
result.pk: result
|
||||
for result in ArchiveResult.objects.filter(pk__in=result_ids).select_related("snapshot", "process", "process__machine")
|
||||
}
|
||||
results = [results_by_id[result_id] for result_id in result_ids if result_id in results_by_id]
|
||||
|
||||
if not results:
|
||||
return mark_safe('<div style="color: #64748b; font-style: italic; padding: 16px 0;">No Archive Results yet...</div>')
|
||||
|
||||
# Status colors
|
||||
status_colors = {
|
||||
'succeeded': ('#166534', '#dcfce7'), # green
|
||||
'failed': ('#991b1b', '#fee2e2'), # red
|
||||
'queued': ('#6b7280', '#f3f4f6'), # gray
|
||||
'started': ('#92400e', '#fef3c7'), # amber
|
||||
'backoff': ('#92400e', '#fef3c7'),
|
||||
'skipped': ('#475569', '#f1f5f9'),
|
||||
'noresults': ('#475569', '#f1f5f9'),
|
||||
"succeeded": ("#166534", "#dcfce7"), # green
|
||||
"failed": ("#991b1b", "#fee2e2"), # red
|
||||
"queued": ("#6b7280", "#f3f4f6"), # gray
|
||||
"started": ("#92400e", "#fef3c7"), # amber
|
||||
"backoff": ("#92400e", "#fef3c7"),
|
||||
"skipped": ("#475569", "#f1f5f9"),
|
||||
"noresults": ("#475569", "#f1f5f9"),
|
||||
}
|
||||
|
||||
rows = []
|
||||
for idx, result in enumerate(results):
|
||||
status = result.status or 'queued'
|
||||
color, bg = status_colors.get(status, ('#6b7280', '#f3f4f6'))
|
||||
status = result.status or "queued"
|
||||
color, bg = status_colors.get(status, ("#6b7280", "#f3f4f6"))
|
||||
output_files = result.output_files or {}
|
||||
if isinstance(output_files, dict):
|
||||
output_file_count = len(output_files)
|
||||
elif isinstance(output_files, (list, tuple, set)):
|
||||
output_file_count = len(output_files)
|
||||
elif isinstance(output_files, str):
|
||||
try:
|
||||
parsed = json.loads(output_files)
|
||||
output_file_count = len(parsed) if isinstance(parsed, (dict, list, tuple, set)) else 0
|
||||
except Exception:
|
||||
output_file_count = 0
|
||||
else:
|
||||
output_file_count = 0
|
||||
|
||||
# Get plugin icon
|
||||
icon = get_plugin_icon(result.plugin)
|
||||
|
||||
# Format timestamp
|
||||
end_time = result.end_ts.strftime('%Y-%m-%d %H:%M:%S') if result.end_ts else '-'
|
||||
end_time = result.end_ts.strftime("%Y-%m-%d %H:%M:%S") if result.end_ts else "-"
|
||||
|
||||
process_display = "-"
|
||||
if result.process_id and result.process:
|
||||
process_display = f'''
|
||||
<a href="{reverse("admin:machine_process_change", args=[result.process_id])}"
|
||||
style="color: #2563eb; text-decoration: none; font-family: ui-monospace, monospace; font-size: 12px;"
|
||||
title="View process">{result.process.pid or "-"}</a>
|
||||
'''
|
||||
|
||||
machine_display = "-"
|
||||
if result.process_id and result.process and result.process.machine_id:
|
||||
machine_display = f'''
|
||||
<a href="{reverse("admin:machine_machine_change", args=[result.process.machine_id])}"
|
||||
style="color: #2563eb; text-decoration: none; font-size: 12px;"
|
||||
title="View machine">{result.process.machine.hostname}</a>
|
||||
'''
|
||||
|
||||
# Truncate output for display
|
||||
full_output = result.output_str or '-'
|
||||
full_output = result.output_str or "-"
|
||||
output_display = full_output[:60]
|
||||
if len(full_output) > 60:
|
||||
output_display += '...'
|
||||
output_display += "..."
|
||||
|
||||
display_cmd = build_abx_dl_display_command(result)
|
||||
replay_cmd = build_abx_dl_replay_command(result)
|
||||
@@ -132,23 +157,23 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
|
||||
cmd_attr = html.escape(replay_cmd, quote=True)
|
||||
|
||||
# Build output link - use embed_path() which checks output_files first
|
||||
embed_path = result.embed_path() if hasattr(result, 'embed_path') else None
|
||||
snapshot_id = str(getattr(result, 'snapshot_id', ''))
|
||||
if embed_path and result.status == 'succeeded':
|
||||
embed_path = result.embed_path() if hasattr(result, "embed_path") else None
|
||||
snapshot_id = str(getattr(result, "snapshot_id", ""))
|
||||
if embed_path and result.status == "succeeded":
|
||||
output_link = build_snapshot_url(snapshot_id, embed_path)
|
||||
else:
|
||||
output_link = build_snapshot_url(snapshot_id, '')
|
||||
output_link = build_snapshot_url(snapshot_id, "")
|
||||
|
||||
# Get version - try cmd_version field
|
||||
version = result.cmd_version if result.cmd_version else '-'
|
||||
version = result.cmd_version if result.cmd_version else "-"
|
||||
|
||||
# Unique ID for this row's expandable output
|
||||
row_id = f'output_{idx}_{str(result.id)[:8]}'
|
||||
row_id = f"output_{idx}_{str(result.id)[:8]}"
|
||||
|
||||
rows.append(f'''
|
||||
<tr style="border-bottom: 1px solid #f1f5f9; transition: background 0.15s;" onmouseover="this.style.background='#f8fafc'" onmouseout="this.style.background='transparent'">
|
||||
<td style="padding: 10px 12px; white-space: nowrap;">
|
||||
<a href="{reverse('admin:core_archiveresult_change', args=[result.id])}"
|
||||
<a href="{reverse("admin:core_archiveresult_change", args=[result.id])}"
|
||||
style="color: #2563eb; text-decoration: none; font-family: ui-monospace, monospace; font-size: 11px;"
|
||||
title="View/edit archive result">
|
||||
<code>{str(result.id)[-8:]}</code>
|
||||
@@ -178,9 +203,18 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
|
||||
{output_display}
|
||||
</span>
|
||||
</td>
|
||||
<td style="padding: 10px 12px; white-space: nowrap; color: #64748b; font-size: 12px; text-align: right;">
|
||||
{output_file_count}
|
||||
</td>
|
||||
<td style="padding: 10px 12px; white-space: nowrap; color: #64748b; font-size: 12px;">
|
||||
{end_time}
|
||||
</td>
|
||||
<td style="padding: 10px 12px; white-space: nowrap;">
|
||||
{process_display}
|
||||
</td>
|
||||
<td style="padding: 10px 12px; white-space: nowrap;">
|
||||
{machine_display}
|
||||
</td>
|
||||
<td style="padding: 10px 12px; white-space: nowrap; font-family: ui-monospace, monospace; font-size: 11px; color: #64748b;">
|
||||
{version}
|
||||
</td>
|
||||
@@ -189,14 +223,14 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
|
||||
<a href="{output_link}" target="_blank"
|
||||
style="padding: 4px 8px; background: #f1f5f9; border-radius: 4px; color: #475569; text-decoration: none; font-size: 11px;"
|
||||
title="View output">📄</a>
|
||||
<a href="{reverse('admin:core_archiveresult_change', args=[result.id])}"
|
||||
<a href="{reverse("admin:core_archiveresult_change", args=[result.id])}"
|
||||
style="padding: 4px 8px; background: #f1f5f9; border-radius: 4px; color: #475569; text-decoration: none; font-size: 11px;"
|
||||
title="Edit">✏️</a>
|
||||
</div>
|
||||
</td>
|
||||
</tr>
|
||||
<tr style="border-bottom: 1px solid #e2e8f0;">
|
||||
<td colspan="8" style="padding: 0 12px 10px 12px;">
|
||||
<td colspan="11" style="padding: 0 12px 10px 12px;">
|
||||
<details id="{row_id}" style="margin: 0;">
|
||||
<summary style="cursor: pointer; font-size: 11px; color: #94a3b8; user-select: none;">
|
||||
Details & Output
|
||||
@@ -205,7 +239,7 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
|
||||
<div style="font-size: 11px; color: #64748b; margin-bottom: 8px;">
|
||||
<span style="margin-right: 16px;"><b>ID:</b> <code>{str(result.id)}</code></span>
|
||||
<span style="margin-right: 16px;"><b>Version:</b> <code>{version}</code></span>
|
||||
<span style="margin-right: 16px;"><b>PWD:</b> <code>{result.pwd or '-'}</code></span>
|
||||
<span style="margin-right: 16px;"><b>PWD:</b> <code>{result.pwd or "-"}</code></span>
|
||||
</div>
|
||||
<div style="font-size: 11px; color: #64748b; margin-bottom: 8px;">
|
||||
<b>Output:</b>
|
||||
@@ -230,19 +264,19 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
|
||||
''')
|
||||
|
||||
total_count = archiveresults_qs.count()
|
||||
footer = ''
|
||||
footer = ""
|
||||
if total_count > limit:
|
||||
footer = f'''
|
||||
footer = f"""
|
||||
<tr>
|
||||
<td colspan="8" style="padding: 12px; text-align: center; color: #64748b; font-size: 13px; background: #f8fafc;">
|
||||
<td colspan="11" style="padding: 12px; text-align: center; color: #64748b; font-size: 13px; background: #f8fafc;">
|
||||
Showing {limit} of {total_count} results
|
||||
<a href="/admin/core/archiveresult/?snapshot__id__exact={results[0].snapshot_id if results else ''}"
|
||||
<a href="/admin/core/archiveresult/?snapshot__id__exact={results[0].snapshot_id if results else ""}"
|
||||
style="color: #2563eb;">View all →</a>
|
||||
</td>
|
||||
</tr>
|
||||
'''
|
||||
"""
|
||||
|
||||
return mark_safe(f'''
|
||||
return mark_safe(f"""
|
||||
<div style="border: 1px solid #e2e8f0; border-radius: 8px; overflow: hidden; background: #fff; width: 100%;">
|
||||
<table style="width: 100%; border-collapse: collapse; font-size: 14px;">
|
||||
<thead>
|
||||
@@ -252,86 +286,92 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
|
||||
<th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; width: 32px;"></th>
|
||||
<th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Plugin</th>
|
||||
<th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Output</th>
|
||||
<th style="padding: 10px 12px; text-align: right; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Files</th>
|
||||
<th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Completed</th>
|
||||
<th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Process</th>
|
||||
<th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Machine</th>
|
||||
<th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Version</th>
|
||||
<th style="padding: 10px 8px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Actions</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{''.join(rows)}
|
||||
{"".join(rows)}
|
||||
{footer}
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
''')
|
||||
|
||||
""")
|
||||
|
||||
|
||||
class ArchiveResultInline(admin.TabularInline):
|
||||
name = 'Archive Results Log'
|
||||
name = "Archive Results Log"
|
||||
model = ArchiveResult
|
||||
parent_model = Snapshot
|
||||
# fk_name = 'snapshot'
|
||||
extra = 0
|
||||
sort_fields = ('end_ts', 'plugin', 'output_str', 'status', 'cmd_version')
|
||||
readonly_fields = ('id', 'result_id', 'completed', 'command', 'version')
|
||||
fields = ('start_ts', 'end_ts', *readonly_fields, 'plugin', 'cmd', 'cmd_version', 'pwd', 'status', 'output_str')
|
||||
sort_fields = ("end_ts", "plugin", "output_str", "status", "cmd_version")
|
||||
readonly_fields = ("id", "result_id", "completed", "command", "version")
|
||||
fields = ("start_ts", "end_ts", *readonly_fields, "plugin", "cmd", "cmd_version", "pwd", "status", "output_str")
|
||||
# exclude = ('id',)
|
||||
ordering = ('end_ts',)
|
||||
ordering = ("end_ts",)
|
||||
show_change_link = True
|
||||
# # classes = ['collapse']
|
||||
|
||||
def get_parent_object_from_request(self, request):
|
||||
resolved = resolve(request.path_info)
|
||||
try:
|
||||
return self.parent_model.objects.get(pk=resolved.kwargs['object_id'])
|
||||
return self.parent_model.objects.get(pk=resolved.kwargs["object_id"])
|
||||
except (self.parent_model.DoesNotExist, ValidationError):
|
||||
return None
|
||||
|
||||
@admin.display(
|
||||
description='Completed',
|
||||
ordering='end_ts',
|
||||
description="Completed",
|
||||
ordering="end_ts",
|
||||
)
|
||||
def completed(self, obj):
|
||||
return format_html('<p style="white-space: nowrap">{}</p>', obj.end_ts.strftime('%Y-%m-%d %H:%M:%S'))
|
||||
return format_html('<p style="white-space: nowrap">{}</p>', obj.end_ts.strftime("%Y-%m-%d %H:%M:%S"))
|
||||
|
||||
def result_id(self, obj):
|
||||
return format_html('<a href="{}"><code style="font-size: 10px">[{}]</code></a>', reverse('admin:core_archiveresult_change', args=(obj.id,)), str(obj.id)[:8])
|
||||
|
||||
return format_html(
|
||||
'<a href="{}"><code style="font-size: 10px">[{}]</code></a>',
|
||||
reverse("admin:core_archiveresult_change", args=(obj.id,)),
|
||||
str(obj.id)[:8],
|
||||
)
|
||||
|
||||
def command(self, obj):
|
||||
return format_html('<small><code>{}</code></small>', " ".join(obj.cmd or []))
|
||||
|
||||
return format_html("<small><code>{}</code></small>", " ".join(obj.cmd or []))
|
||||
|
||||
def version(self, obj):
|
||||
return format_html('<small><code>{}</code></small>', obj.cmd_version or '-')
|
||||
|
||||
return format_html("<small><code>{}</code></small>", obj.cmd_version or "-")
|
||||
|
||||
def get_formset(self, request, obj=None, **kwargs):
|
||||
formset = super().get_formset(request, obj, **kwargs)
|
||||
snapshot = self.get_parent_object_from_request(request)
|
||||
form_class = getattr(formset, 'form', None)
|
||||
base_fields = getattr(form_class, 'base_fields', {})
|
||||
snapshot_output_dir = str(snapshot.output_dir) if snapshot else ''
|
||||
form_class = getattr(formset, "form", None)
|
||||
base_fields = getattr(form_class, "base_fields", {})
|
||||
snapshot_output_dir = str(snapshot.output_dir) if snapshot else ""
|
||||
|
||||
# import ipdb; ipdb.set_trace()
|
||||
# formset.form.base_fields['id'].widget = formset.form.base_fields['id'].hidden_widget()
|
||||
|
||||
|
||||
# default values for new entries
|
||||
base_fields['status'].initial = 'succeeded'
|
||||
base_fields['start_ts'].initial = timezone.now()
|
||||
base_fields['end_ts'].initial = timezone.now()
|
||||
base_fields['cmd_version'].initial = '-'
|
||||
base_fields['pwd'].initial = snapshot_output_dir
|
||||
base_fields['cmd'].initial = '["-"]'
|
||||
base_fields['output_str'].initial = 'Manually recorded cmd output...'
|
||||
base_fields["status"].initial = "succeeded"
|
||||
base_fields["start_ts"].initial = timezone.now()
|
||||
base_fields["end_ts"].initial = timezone.now()
|
||||
base_fields["cmd_version"].initial = "-"
|
||||
base_fields["pwd"].initial = snapshot_output_dir
|
||||
base_fields["cmd"].initial = '["-"]'
|
||||
base_fields["output_str"].initial = "Manually recorded cmd output..."
|
||||
|
||||
if obj is not None:
|
||||
# hidden values for existing entries and new entries
|
||||
base_fields['start_ts'].widget = base_fields['start_ts'].hidden_widget()
|
||||
base_fields['end_ts'].widget = base_fields['end_ts'].hidden_widget()
|
||||
base_fields['cmd'].widget = base_fields['cmd'].hidden_widget()
|
||||
base_fields['pwd'].widget = base_fields['pwd'].hidden_widget()
|
||||
base_fields['cmd_version'].widget = base_fields['cmd_version'].hidden_widget()
|
||||
base_fields["start_ts"].widget = base_fields["start_ts"].hidden_widget()
|
||||
base_fields["end_ts"].widget = base_fields["end_ts"].hidden_widget()
|
||||
base_fields["cmd"].widget = base_fields["cmd"].hidden_widget()
|
||||
base_fields["pwd"].widget = base_fields["pwd"].hidden_widget()
|
||||
base_fields["cmd_version"].widget = base_fields["cmd_version"].hidden_widget()
|
||||
return formset
|
||||
|
||||
|
||||
def get_readonly_fields(self, request, obj=None):
|
||||
if obj is not None:
|
||||
return self.readonly_fields
|
||||
@@ -339,62 +379,122 @@ class ArchiveResultInline(admin.TabularInline):
|
||||
return []
|
||||
|
||||
|
||||
|
||||
class ArchiveResultAdmin(BaseModelAdmin):
|
||||
list_display = ('details_link', 'created_at', 'snapshot_info', 'tags_inline', 'status_badge', 'plugin_with_icon', 'process_link', 'machine_link', 'cmd_str', 'output_str_display')
|
||||
list_display = (
|
||||
"details_link",
|
||||
"zip_link",
|
||||
"created_at",
|
||||
"snapshot_info",
|
||||
"tags_inline",
|
||||
"status_badge",
|
||||
"plugin_with_icon",
|
||||
"process_link",
|
||||
"machine_link",
|
||||
"cmd_str",
|
||||
"output_str_display",
|
||||
)
|
||||
list_display_links = None
|
||||
sort_fields = ('id', 'created_at', 'plugin', 'status')
|
||||
readonly_fields = ('cmd', 'cmd_version', 'pwd', 'cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'plugin_with_icon', 'process_link')
|
||||
search_fields = ()
|
||||
autocomplete_fields = ['snapshot']
|
||||
sort_fields = ("id", "created_at", "plugin", "status")
|
||||
readonly_fields = (
|
||||
"admin_actions",
|
||||
"cmd",
|
||||
"cmd_version",
|
||||
"pwd",
|
||||
"cmd_str",
|
||||
"snapshot_info",
|
||||
"tags_str",
|
||||
"created_at",
|
||||
"modified_at",
|
||||
"output_summary",
|
||||
"plugin_with_icon",
|
||||
"process_link",
|
||||
)
|
||||
search_fields = (
|
||||
"snapshot__id",
|
||||
"snapshot__url",
|
||||
"snapshot__tags__name",
|
||||
"snapshot__crawl_id",
|
||||
"plugin",
|
||||
"hook_name",
|
||||
"output_str",
|
||||
"output_json",
|
||||
"process__cmd",
|
||||
)
|
||||
autocomplete_fields = ["snapshot"]
|
||||
|
||||
fieldsets = (
|
||||
('Snapshot', {
|
||||
'fields': ('snapshot', 'snapshot_info', 'tags_str'),
|
||||
'classes': ('card', 'wide'),
|
||||
}),
|
||||
('Plugin', {
|
||||
'fields': ('plugin_with_icon', 'process_link', 'status'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Timing', {
|
||||
'fields': ('start_ts', 'end_ts', 'created_at', 'modified_at'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Command', {
|
||||
'fields': ('cmd', 'cmd_str', 'cmd_version', 'pwd'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Output', {
|
||||
'fields': ('output_str', 'output_json', 'output_files', 'output_size', 'output_mimetypes', 'output_summary'),
|
||||
'classes': ('card', 'wide'),
|
||||
}),
|
||||
(
|
||||
"Actions",
|
||||
{
|
||||
"fields": ("admin_actions",),
|
||||
"classes": ("card", "wide"),
|
||||
},
|
||||
),
|
||||
(
|
||||
"Snapshot",
|
||||
{
|
||||
"fields": ("snapshot", "snapshot_info", "tags_str"),
|
||||
"classes": ("card", "wide"),
|
||||
},
|
||||
),
|
||||
(
|
||||
"Plugin",
|
||||
{
|
||||
"fields": ("plugin_with_icon", "process_link", "status"),
|
||||
"classes": ("card",),
|
||||
},
|
||||
),
|
||||
(
|
||||
"Timing",
|
||||
{
|
||||
"fields": ("start_ts", "end_ts", "created_at", "modified_at"),
|
||||
"classes": ("card",),
|
||||
},
|
||||
),
|
||||
(
|
||||
"Command",
|
||||
{
|
||||
"fields": ("cmd", "cmd_str", "cmd_version", "pwd"),
|
||||
"classes": ("card",),
|
||||
},
|
||||
),
|
||||
(
|
||||
"Output",
|
||||
{
|
||||
"fields": ("output_str", "output_json", "output_files", "output_size", "output_mimetypes", "output_summary"),
|
||||
"classes": ("card", "wide"),
|
||||
},
|
||||
),
|
||||
)
|
||||
|
||||
list_filter = ('status', 'plugin', 'start_ts')
|
||||
ordering = ['-start_ts']
|
||||
list_filter = ("status", "plugin", "start_ts")
|
||||
ordering = ["-start_ts"]
|
||||
list_per_page = SERVER_CONFIG.SNAPSHOTS_PER_PAGE
|
||||
|
||||
paginator = AccelleratedPaginator
|
||||
paginator = AcceleratedPaginator
|
||||
save_on_top = True
|
||||
|
||||
actions = ['delete_selected']
|
||||
actions = ["delete_selected"]
|
||||
|
||||
class Meta:
|
||||
verbose_name = 'Archive Result'
|
||||
verbose_name_plural = 'Archive Results'
|
||||
verbose_name = "Archive Result"
|
||||
verbose_name_plural = "Archive Results"
|
||||
|
||||
def change_view(self, request, object_id, form_url="", extra_context=None):
|
||||
self.request = request
|
||||
return super().change_view(request, object_id, form_url, extra_context)
|
||||
|
||||
def changelist_view(self, request, extra_context=None):
|
||||
self.request = request
|
||||
return super().changelist_view(request, extra_context)
|
||||
|
||||
def get_queryset(self, request):
|
||||
return (
|
||||
super()
|
||||
.get_queryset(request)
|
||||
.select_related('snapshot', 'process')
|
||||
.prefetch_related('snapshot__tags')
|
||||
.annotate(snapshot_first_tag=Min('snapshot__tags__name'))
|
||||
.select_related("snapshot", "process")
|
||||
.prefetch_related("snapshot__tags")
|
||||
.annotate(snapshot_first_tag=Min("snapshot__tags__name"))
|
||||
)
|
||||
|
||||
def get_search_results(self, request, queryset, search_term):
|
||||
@@ -402,15 +502,14 @@ class ArchiveResultAdmin(BaseModelAdmin):
|
||||
return queryset, False
|
||||
|
||||
queryset = queryset.annotate(
|
||||
snapshot_id_text=Cast('snapshot__id', output_field=TextField()),
|
||||
snapshot_crawl_id_text=Cast('snapshot__crawl_id', output_field=TextField()),
|
||||
output_json_text=Cast('output_json', output_field=TextField()),
|
||||
cmd_text=Cast('process__cmd', output_field=TextField()),
|
||||
snapshot_id_text=Cast("snapshot__id", output_field=TextField()),
|
||||
snapshot_crawl_id_text=Cast("snapshot__crawl_id", output_field=TextField()),
|
||||
output_json_text=Cast("output_json", output_field=TextField()),
|
||||
cmd_text=Cast("process__cmd", output_field=TextField()),
|
||||
)
|
||||
|
||||
search_bits = [
|
||||
bit[1:-1] if len(bit) >= 2 and bit[0] == bit[-1] and bit[0] in {'"', "'"} else bit
|
||||
for bit in smart_split(search_term)
|
||||
bit[1:-1] if len(bit) >= 2 and bit[0] == bit[-1] and bit[0] in {'"', "'"} else bit for bit in smart_split(search_term)
|
||||
]
|
||||
search_bits = [bit.strip() for bit in search_bits if bit.strip()]
|
||||
if not search_bits:
|
||||
@@ -427,22 +526,44 @@ class ArchiveResultAdmin(BaseModelAdmin):
|
||||
| Q(hook_name__icontains=bit)
|
||||
| Q(output_str__icontains=bit)
|
||||
| Q(output_json_text__icontains=bit)
|
||||
| Q(cmd_text__icontains=bit)
|
||||
| Q(cmd_text__icontains=bit),
|
||||
)
|
||||
|
||||
return queryset.filter(reduce(and_, filters)).distinct(), True
|
||||
|
||||
@admin.display(description='Details', ordering='id')
|
||||
def get_snapshot_view_url(self, result: ArchiveResult) -> str:
|
||||
return build_snapshot_url(str(result.snapshot_id), request=getattr(self, "request", None))
|
||||
|
||||
def get_output_view_url(self, result: ArchiveResult) -> str:
|
||||
output_path = result.embed_path() if hasattr(result, "embed_path") else None
|
||||
if not output_path:
|
||||
output_path = result.plugin or ""
|
||||
return build_snapshot_url(str(result.snapshot_id), output_path, request=getattr(self, "request", None))
|
||||
|
||||
def get_output_files_url(self, result: ArchiveResult) -> str:
|
||||
return f"{build_snapshot_url(str(result.snapshot_id), result.plugin, request=getattr(self, 'request', None))}/?files=1"
|
||||
|
||||
def get_output_zip_url(self, result: ArchiveResult) -> str:
|
||||
return f"{self.get_output_files_url(result)}&download=zip"
|
||||
|
||||
@admin.display(description="Details", ordering="id")
|
||||
def details_link(self, result):
|
||||
return format_html(
|
||||
'<a href="{}"><code>{}</code></a>',
|
||||
reverse('admin:core_archiveresult_change', args=[result.id]),
|
||||
reverse("admin:core_archiveresult_change", args=[result.id]),
|
||||
str(result.id)[-8:],
|
||||
)
|
||||
|
||||
@admin.display(description="Zip")
|
||||
def zip_link(self, result):
|
||||
return format_html(
|
||||
'<a href="{}" class="archivebox-zip-button" data-loading-mode="spinner-only" onclick="return window.archiveboxHandleZipClick(this, event);" style="display:inline-flex; align-items:center; justify-content:center; gap:4px; width:48px; min-width:48px; height:24px; padding:0; box-sizing:border-box; border-radius:999px; border:1px solid #bfdbfe; background:#eff6ff; color:#1d4ed8; font-size:11px; font-weight:600; line-height:1; text-decoration:none;"><span class="archivebox-zip-spinner" aria-hidden="true"></span><span class="archivebox-zip-label">⬇ ZIP</span></a>',
|
||||
self.get_output_zip_url(result),
|
||||
)
|
||||
|
||||
@admin.display(
|
||||
description='Snapshot',
|
||||
ordering='snapshot__url',
|
||||
description="Snapshot",
|
||||
ordering="snapshot__url",
|
||||
)
|
||||
def snapshot_info(self, result):
|
||||
snapshot_id = str(result.snapshot_id)
|
||||
@@ -450,29 +571,28 @@ class ArchiveResultAdmin(BaseModelAdmin):
|
||||
'<a href="{}"><b><code>[{}]</code></b> {} {}</a><br/>',
|
||||
build_snapshot_url(snapshot_id, "index.html"),
|
||||
snapshot_id[:8],
|
||||
result.snapshot.bookmarked_at.strftime('%Y-%m-%d %H:%M'),
|
||||
result.snapshot.bookmarked_at.strftime("%Y-%m-%d %H:%M"),
|
||||
result.snapshot.url[:128],
|
||||
)
|
||||
|
||||
|
||||
@admin.display(
|
||||
description='Snapshot Tags'
|
||||
description="Snapshot Tags",
|
||||
)
|
||||
def tags_str(self, result):
|
||||
return result.snapshot.tags_str()
|
||||
|
||||
@admin.display(description='Tags', ordering='snapshot_first_tag')
|
||||
@admin.display(description="Tags", ordering="snapshot_first_tag")
|
||||
def tags_inline(self, result):
|
||||
widget = InlineTagEditorWidget(snapshot_id=str(result.snapshot_id), editable=False)
|
||||
tags_html = widget.render(
|
||||
name=f'tags_{result.snapshot_id}',
|
||||
name=f"tags_{result.snapshot_id}",
|
||||
value=result.snapshot.tags.all(),
|
||||
attrs={'id': f'tags_{result.snapshot_id}'},
|
||||
attrs={"id": f"tags_{result.snapshot_id}"},
|
||||
snapshot_id=str(result.snapshot_id),
|
||||
)
|
||||
return mark_safe(f'<span class="tags-inline-editor">{tags_html}</span>')
|
||||
|
||||
@admin.display(description='Status', ordering='status')
|
||||
@admin.display(description="Status", ordering="status")
|
||||
def status_badge(self, result):
|
||||
status = result.status or ArchiveResult.StatusChoices.QUEUED
|
||||
return format_html(
|
||||
@@ -482,7 +602,7 @@ class ArchiveResultAdmin(BaseModelAdmin):
|
||||
result.get_status_display() or status,
|
||||
)
|
||||
|
||||
@admin.display(description='Plugin', ordering='plugin')
|
||||
@admin.display(description="Plugin", ordering="plugin")
|
||||
def plugin_with_icon(self, result):
|
||||
icon = get_plugin_icon(result.plugin)
|
||||
return format_html(
|
||||
@@ -494,36 +614,36 @@ class ArchiveResultAdmin(BaseModelAdmin):
|
||||
result.plugin,
|
||||
)
|
||||
|
||||
@admin.display(description='Process', ordering='process__pid')
|
||||
@admin.display(description="Process", ordering="process__pid")
|
||||
def process_link(self, result):
|
||||
if not result.process_id:
|
||||
return '-'
|
||||
process_label = result.process.pid if result.process and result.process.pid else '-'
|
||||
return "-"
|
||||
process_label = result.process.pid if result.process and result.process.pid else "-"
|
||||
return format_html(
|
||||
'<a href="{}"><code>{}</code></a>',
|
||||
reverse('admin:machine_process_change', args=[result.process_id]),
|
||||
reverse("admin:machine_process_change", args=[result.process_id]),
|
||||
process_label,
|
||||
)
|
||||
|
||||
@admin.display(description='Machine', ordering='process__machine__hostname')
|
||||
@admin.display(description="Machine", ordering="process__machine__hostname")
|
||||
def machine_link(self, result):
|
||||
if not result.process_id or not result.process or not result.process.machine_id:
|
||||
return '-'
|
||||
return "-"
|
||||
machine = result.process.machine
|
||||
return format_html(
|
||||
'<a href="{}"><code>{}</code> {}</a>',
|
||||
reverse('admin:machine_machine_change', args=[machine.id]),
|
||||
reverse("admin:machine_machine_change", args=[machine.id]),
|
||||
str(machine.id)[:8],
|
||||
machine.hostname,
|
||||
)
|
||||
|
||||
@admin.display(description='Command')
|
||||
@admin.display(description="Command")
|
||||
def cmd_str(self, result):
|
||||
display_cmd = build_abx_dl_display_command(result)
|
||||
replay_cmd = build_abx_dl_replay_command(result)
|
||||
return format_html(
|
||||
'''
|
||||
<div style="position: relative; width: 300px; min-width: 300px; max-width: 300px; overflow: hidden; box-sizing: border-box;">
|
||||
"""
|
||||
<div style="position: relative; width: 100%; max-width: 100%; overflow: hidden; box-sizing: border-box;">
|
||||
<button type="button"
|
||||
data-command="{}"
|
||||
onclick="(function(btn){{var text=btn.dataset.command||''; if(navigator.clipboard&&navigator.clipboard.writeText){{navigator.clipboard.writeText(text);}} else {{var ta=document.createElement('textarea'); ta.value=text; document.body.appendChild(ta); ta.select(); document.execCommand('copy'); document.body.removeChild(ta);}}}})(this); return false;"
|
||||
@@ -534,7 +654,7 @@ class ArchiveResultAdmin(BaseModelAdmin):
|
||||
{}
|
||||
</code>
|
||||
</div>
|
||||
''',
|
||||
""",
|
||||
replay_cmd,
|
||||
replay_cmd,
|
||||
display_cmd,
|
||||
@@ -542,8 +662,8 @@ class ArchiveResultAdmin(BaseModelAdmin):
|
||||
|
||||
def output_display(self, result):
|
||||
# Determine output link path - use embed_path() which checks output_files
|
||||
embed_path = result.embed_path() if hasattr(result, 'embed_path') else None
|
||||
output_path = embed_path if (result.status == 'succeeded' and embed_path) else 'index.html'
|
||||
embed_path = result.embed_path() if hasattr(result, "embed_path") else None
|
||||
output_path = embed_path if (result.status == "succeeded" and embed_path) else "index.html"
|
||||
snapshot_id = str(result.snapshot_id)
|
||||
return format_html(
|
||||
'<a href="{}" class="output-link">↗️</a><pre>{}</pre>',
|
||||
@@ -551,13 +671,13 @@ class ArchiveResultAdmin(BaseModelAdmin):
|
||||
result.output_str,
|
||||
)
|
||||
|
||||
@admin.display(description='Output', ordering='output_str')
|
||||
@admin.display(description="Output", ordering="output_str")
|
||||
def output_str_display(self, result):
|
||||
output_text = str(result.output_str or '').strip()
|
||||
output_text = str(result.output_str or "").strip()
|
||||
if not output_text:
|
||||
return '-'
|
||||
return "-"
|
||||
|
||||
live_path = result.embed_path() if hasattr(result, 'embed_path') else None
|
||||
live_path = result.embed_path() if hasattr(result, "embed_path") else None
|
||||
if live_path:
|
||||
return format_html(
|
||||
'<a href="{}" title="{}"><code>{}</code></a>',
|
||||
@@ -572,8 +692,48 @@ class ArchiveResultAdmin(BaseModelAdmin):
|
||||
output_text,
|
||||
)
|
||||
|
||||
@admin.display(description="")
|
||||
def admin_actions(self, result):
|
||||
return format_html(
|
||||
"""
|
||||
<div style="display:flex; flex-wrap:wrap; gap:12px; align-items:center;">
|
||||
<a class="btn" style="display:inline-flex; align-items:center; gap:6px; padding:10px 16px; background:#f8fafc; border:1px solid #e2e8f0; border-radius:8px; color:#334155; text-decoration:none; font-size:14px; font-weight:500; transition:all 0.15s;"
|
||||
href="{}"
|
||||
onmouseover="this.style.background='#f1f5f9'; this.style.borderColor='#cbd5e1';"
|
||||
onmouseout="this.style.background='#f8fafc'; this.style.borderColor='#e2e8f0';">
|
||||
📄 View Output
|
||||
</a>
|
||||
<a class="btn" style="display:inline-flex; align-items:center; gap:6px; padding:10px 16px; background:#f8fafc; border:1px solid #e2e8f0; border-radius:8px; color:#334155; text-decoration:none; font-size:14px; font-weight:500; transition:all 0.15s;"
|
||||
href="{}"
|
||||
onmouseover="this.style.background='#f1f5f9'; this.style.borderColor='#cbd5e1';"
|
||||
onmouseout="this.style.background='#f8fafc'; this.style.borderColor='#e2e8f0';">
|
||||
📁 Output files
|
||||
</a>
|
||||
<a class="btn archivebox-zip-button" style="display:inline-flex; align-items:center; gap:6px; padding:10px 16px; background:#eff6ff; border:1px solid #bfdbfe; border-radius:8px; color:#1d4ed8; text-decoration:none; font-size:14px; font-weight:500; transition:all 0.15s;"
|
||||
href="{}"
|
||||
data-loading-label="Preparing..."
|
||||
onclick="return window.archiveboxHandleZipClick(this, event);"
|
||||
onmouseover="this.style.background='#dbeafe'; this.style.borderColor='#93c5fd';"
|
||||
onmouseout="this.style.background='#eff6ff'; this.style.borderColor='#bfdbfe';">
|
||||
<span class="archivebox-zip-spinner" aria-hidden="true"></span>
|
||||
<span class="archivebox-zip-label">⬇ Download Zip</span>
|
||||
</a>
|
||||
<a class="btn" style="display:inline-flex; align-items:center; gap:6px; padding:10px 16px; background:#f8fafc; border:1px solid #e2e8f0; border-radius:8px; color:#334155; text-decoration:none; font-size:14px; font-weight:500; transition:all 0.15s;"
|
||||
href="{}"
|
||||
onmouseover="this.style.background='#f1f5f9'; this.style.borderColor='#cbd5e1';"
|
||||
onmouseout="this.style.background='#f8fafc'; this.style.borderColor='#e2e8f0';">
|
||||
🗂 Snapshot
|
||||
</a>
|
||||
</div>
|
||||
""",
|
||||
self.get_output_view_url(result),
|
||||
self.get_output_files_url(result),
|
||||
self.get_output_zip_url(result),
|
||||
self.get_snapshot_view_url(result),
|
||||
)
|
||||
|
||||
def output_summary(self, result):
|
||||
snapshot_dir = Path(DATA_DIR) / str(result.pwd).split('data/', 1)[-1]
|
||||
snapshot_dir = Path(DATA_DIR) / str(result.pwd).split("data/", 1)[-1]
|
||||
output_html = format_html(
|
||||
'<pre style="display: inline-block">{}</pre><br/>',
|
||||
result.output_str,
|
||||
@@ -583,9 +743,13 @@ class ArchiveResultAdmin(BaseModelAdmin):
|
||||
'<a href="{}#all">See result files ...</a><br/><pre><code>',
|
||||
build_snapshot_url(snapshot_id, "index.html"),
|
||||
)
|
||||
embed_path = result.embed_path() if hasattr(result, 'embed_path') else ''
|
||||
path_from_embed = (snapshot_dir / (embed_path or ''))
|
||||
output_html += format_html('<i style="padding: 1px">{}</i><b style="padding-right: 20px">/</b><i>{}</i><br/><hr/>', str(snapshot_dir), str(embed_path))
|
||||
embed_path = result.embed_path() if hasattr(result, "embed_path") else ""
|
||||
path_from_embed = snapshot_dir / (embed_path or "")
|
||||
output_html += format_html(
|
||||
'<i style="padding: 1px">{}</i><b style="padding-right: 20px">/</b><i>{}</i><br/><hr/>',
|
||||
str(snapshot_dir),
|
||||
str(embed_path),
|
||||
)
|
||||
if os.access(path_from_embed, os.R_OK):
|
||||
root_dir = str(path_from_embed)
|
||||
else:
|
||||
@@ -594,19 +758,22 @@ class ArchiveResultAdmin(BaseModelAdmin):
|
||||
# print(root_dir, str(list(os.walk(root_dir))))
|
||||
|
||||
for root, dirs, files in os.walk(root_dir):
|
||||
depth = root.replace(root_dir, '').count(os.sep) + 1
|
||||
depth = root.replace(root_dir, "").count(os.sep) + 1
|
||||
if depth > 2:
|
||||
continue
|
||||
indent = ' ' * 4 * (depth)
|
||||
indent = " " * 4 * (depth)
|
||||
output_html += format_html('<b style="padding: 1px">{}{}/</b><br/>', indent, os.path.basename(root))
|
||||
indentation_str = ' ' * 4 * (depth + 1)
|
||||
indentation_str = " " * 4 * (depth + 1)
|
||||
for filename in sorted(files):
|
||||
is_hidden = filename.startswith('.')
|
||||
output_html += format_html('<span style="opacity: {}.2">{}{}</span><br/>', int(not is_hidden), indentation_str, filename.strip())
|
||||
|
||||
return output_html + mark_safe('</code></pre>')
|
||||
|
||||
is_hidden = filename.startswith(".")
|
||||
output_html += format_html(
|
||||
'<span style="opacity: {}.2">{}{}</span><br/>',
|
||||
int(not is_hidden),
|
||||
indentation_str,
|
||||
filename.strip(),
|
||||
)
|
||||
|
||||
return output_html + mark_safe("</code></pre>")
|
||||
|
||||
|
||||
def register_admin(admin_site):
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
__package__ = 'archivebox.core'
|
||||
__package__ = "archivebox.core"
|
||||
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
@@ -18,23 +18,23 @@ if TYPE_CHECKING:
|
||||
|
||||
|
||||
class ArchiveBoxAdmin(admin.AdminSite):
|
||||
site_header = 'ArchiveBox'
|
||||
index_title = 'Admin Views'
|
||||
site_title = 'Admin'
|
||||
namespace = 'admin'
|
||||
site_header = "ArchiveBox"
|
||||
index_title = "Admin Views"
|
||||
site_title = "Admin"
|
||||
namespace = "admin"
|
||||
|
||||
def get_app_list(self, request: 'HttpRequest', app_label: str | None = None) -> list['AppDict']:
|
||||
def get_app_list(self, request: "HttpRequest", app_label: str | None = None) -> list["AppDict"]:
|
||||
if app_label is None:
|
||||
return adv_get_app_list(self, request)
|
||||
return adv_get_app_list(self, request, app_label)
|
||||
|
||||
def admin_data_index_view(self, request: 'HttpRequest', **kwargs: Any) -> 'TemplateResponse':
|
||||
def admin_data_index_view(self, request: "HttpRequest", **kwargs: Any) -> "TemplateResponse":
|
||||
return adv_admin_data_index_view(self, request, **kwargs)
|
||||
|
||||
def get_admin_data_urls(self) -> list['URLResolver | URLPattern']:
|
||||
def get_admin_data_urls(self) -> list["URLResolver | URLPattern"]:
|
||||
return adv_get_admin_data_urls(self)
|
||||
|
||||
def get_urls(self) -> list['URLResolver | URLPattern']:
|
||||
def get_urls(self) -> list["URLResolver | URLPattern"]:
|
||||
return self.get_admin_data_urls() + super().get_urls()
|
||||
|
||||
|
||||
@@ -43,7 +43,6 @@ archivebox_admin = ArchiveBoxAdmin()
|
||||
# TODO: https://stackoverflow.com/questions/40760880/add-custom-button-to-django-admin-panel
|
||||
|
||||
|
||||
|
||||
############### Admin Data View sections are defined in settings.ADMIN_DATA_VIEWS #########
|
||||
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,4 +1,4 @@
|
||||
__package__ = 'archivebox.core'
|
||||
__package__ = "archivebox.core"
|
||||
|
||||
from urllib.parse import quote
|
||||
|
||||
@@ -28,92 +28,107 @@ from archivebox.core.host_utils import build_snapshot_url
|
||||
|
||||
class TagInline(admin.TabularInline):
|
||||
model = SnapshotTag
|
||||
fields = ('id', 'tag')
|
||||
fields = ("id", "tag")
|
||||
extra = 1
|
||||
max_num = 1000
|
||||
autocomplete_fields = (
|
||||
'tag',
|
||||
)
|
||||
autocomplete_fields = ("tag",)
|
||||
|
||||
|
||||
class TagAdminForm(forms.ModelForm):
|
||||
class Meta:
|
||||
model = Tag
|
||||
fields = '__all__'
|
||||
fields = "__all__"
|
||||
widgets = {
|
||||
'name': forms.TextInput(attrs={
|
||||
'placeholder': 'research, receipts, product-design...',
|
||||
'autocomplete': 'off',
|
||||
'spellcheck': 'false',
|
||||
'data-tag-name-input': '1',
|
||||
}),
|
||||
"name": forms.TextInput(
|
||||
attrs={
|
||||
"placeholder": "research, receipts, product-design...",
|
||||
"autocomplete": "off",
|
||||
"spellcheck": "false",
|
||||
"data-tag-name-input": "1",
|
||||
},
|
||||
),
|
||||
}
|
||||
|
||||
def clean_name(self):
|
||||
name = (self.cleaned_data.get('name') or '').strip()
|
||||
name = (self.cleaned_data.get("name") or "").strip()
|
||||
if not name:
|
||||
raise forms.ValidationError('Tag name is required.')
|
||||
raise forms.ValidationError("Tag name is required.")
|
||||
return name
|
||||
|
||||
|
||||
class TagAdmin(BaseModelAdmin):
|
||||
form = TagAdminForm
|
||||
change_list_template = 'admin/core/tag/change_list.html'
|
||||
change_form_template = 'admin/core/tag/change_form.html'
|
||||
list_display = ('name', 'num_snapshots', 'created_at', 'created_by')
|
||||
list_filter = ('created_at', 'created_by')
|
||||
search_fields = ('id', 'name', 'slug')
|
||||
readonly_fields = ('slug', 'id', 'created_at', 'modified_at', 'snapshots')
|
||||
actions = ['delete_selected']
|
||||
ordering = ['name', 'id']
|
||||
change_list_template = "admin/core/tag/change_list.html"
|
||||
change_form_template = "admin/core/tag/change_form.html"
|
||||
list_display = ("name", "num_snapshots", "created_at", "created_by")
|
||||
list_filter = ("created_at", "created_by")
|
||||
search_fields = ("id", "name", "slug")
|
||||
readonly_fields = ("slug", "id", "created_at", "modified_at", "snapshots")
|
||||
actions = ["delete_selected"]
|
||||
ordering = ["name", "id"]
|
||||
|
||||
fieldsets = (
|
||||
('Tag', {
|
||||
'fields': ('name', 'slug'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Metadata', {
|
||||
'fields': ('id', 'created_by', 'created_at', 'modified_at'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Recent Snapshots', {
|
||||
'fields': ('snapshots',),
|
||||
'classes': ('card', 'wide'),
|
||||
}),
|
||||
(
|
||||
"Tag",
|
||||
{
|
||||
"fields": ("name", "slug"),
|
||||
"classes": ("card",),
|
||||
},
|
||||
),
|
||||
(
|
||||
"Metadata",
|
||||
{
|
||||
"fields": ("id", "created_by", "created_at", "modified_at"),
|
||||
"classes": ("card",),
|
||||
},
|
||||
),
|
||||
(
|
||||
"Recent Snapshots",
|
||||
{
|
||||
"fields": ("snapshots",),
|
||||
"classes": ("card", "wide"),
|
||||
},
|
||||
),
|
||||
)
|
||||
|
||||
add_fieldsets = (
|
||||
('Tag', {
|
||||
'fields': ('name',),
|
||||
'classes': ('card', 'wide'),
|
||||
}),
|
||||
('Metadata', {
|
||||
'fields': ('created_by',),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
(
|
||||
"Tag",
|
||||
{
|
||||
"fields": ("name",),
|
||||
"classes": ("card", "wide"),
|
||||
},
|
||||
),
|
||||
(
|
||||
"Metadata",
|
||||
{
|
||||
"fields": ("created_by",),
|
||||
"classes": ("card",),
|
||||
},
|
||||
),
|
||||
)
|
||||
|
||||
def get_fieldsets(self, request: HttpRequest, obj: Tag | None = None):
|
||||
return self.fieldsets if obj else self.add_fieldsets
|
||||
|
||||
def changelist_view(self, request: HttpRequest, extra_context=None):
|
||||
query = (request.GET.get('q') or '').strip()
|
||||
sort = normalize_tag_sort((request.GET.get('sort') or 'created_desc').strip())
|
||||
created_by = normalize_created_by_filter((request.GET.get('created_by') or '').strip())
|
||||
year = normalize_created_year_filter((request.GET.get('year') or '').strip())
|
||||
has_snapshots = normalize_has_snapshots_filter((request.GET.get('has_snapshots') or 'all').strip())
|
||||
query = (request.GET.get("q") or "").strip()
|
||||
sort = normalize_tag_sort((request.GET.get("sort") or "created_desc").strip())
|
||||
created_by = normalize_created_by_filter((request.GET.get("created_by") or "").strip())
|
||||
year = normalize_created_year_filter((request.GET.get("year") or "").strip())
|
||||
has_snapshots = normalize_has_snapshots_filter((request.GET.get("has_snapshots") or "all").strip())
|
||||
extra_context = {
|
||||
**(extra_context or {}),
|
||||
'initial_query': query,
|
||||
'initial_sort': sort,
|
||||
'initial_created_by': created_by,
|
||||
'initial_year': year,
|
||||
'initial_has_snapshots': has_snapshots,
|
||||
'tag_sort_choices': TAG_SORT_CHOICES,
|
||||
'tag_has_snapshots_choices': TAG_HAS_SNAPSHOTS_CHOICES,
|
||||
'tag_created_by_choices': get_tag_creator_choices(),
|
||||
'tag_year_choices': get_tag_year_choices(),
|
||||
'initial_tag_cards': build_tag_cards(
|
||||
"initial_query": query,
|
||||
"initial_sort": sort,
|
||||
"initial_created_by": created_by,
|
||||
"initial_year": year,
|
||||
"initial_has_snapshots": has_snapshots,
|
||||
"tag_sort_choices": TAG_SORT_CHOICES,
|
||||
"tag_has_snapshots_choices": TAG_HAS_SNAPSHOTS_CHOICES,
|
||||
"tag_created_by_choices": get_tag_creator_choices(),
|
||||
"tag_year_choices": get_tag_year_choices(),
|
||||
"initial_tag_cards": build_tag_cards(
|
||||
query=query,
|
||||
request=request,
|
||||
sort=sort,
|
||||
@@ -121,62 +136,67 @@ class TagAdmin(BaseModelAdmin):
|
||||
year=year,
|
||||
has_snapshots=has_snapshots,
|
||||
),
|
||||
'tag_search_api_url': reverse('api-1:search_tags'),
|
||||
'tag_create_api_url': reverse('api-1:tags_create'),
|
||||
"tag_search_api_url": reverse("api-1:search_tags"),
|
||||
"tag_create_api_url": reverse("api-1:tags_create"),
|
||||
}
|
||||
return super().changelist_view(request, extra_context=extra_context)
|
||||
|
||||
def render_change_form(self, request, context, add=False, change=False, form_url='', obj=None):
|
||||
current_name = (request.POST.get('name') or '').strip()
|
||||
def render_change_form(self, request, context, add=False, change=False, form_url="", obj=None):
|
||||
current_name = (request.POST.get("name") or "").strip()
|
||||
if not current_name and obj:
|
||||
current_name = obj.name
|
||||
|
||||
similar_tag_cards = build_tag_cards(query=current_name, request=request, limit=12) if current_name else build_tag_cards(request=request, limit=12)
|
||||
similar_tag_cards = (
|
||||
build_tag_cards(query=current_name, request=request, limit=12) if current_name else build_tag_cards(request=request, limit=12)
|
||||
)
|
||||
if obj:
|
||||
similar_tag_cards = [card for card in similar_tag_cards if card['id'] != obj.pk]
|
||||
similar_tag_cards = [card for card in similar_tag_cards if card["id"] != obj.pk]
|
||||
|
||||
context.update({
|
||||
'tag_search_api_url': reverse('api-1:search_tags'),
|
||||
'tag_similar_cards': similar_tag_cards,
|
||||
'tag_similar_query': current_name,
|
||||
})
|
||||
context.update(
|
||||
{
|
||||
"tag_search_api_url": reverse("api-1:search_tags"),
|
||||
"tag_similar_cards": similar_tag_cards,
|
||||
"tag_similar_query": current_name,
|
||||
},
|
||||
)
|
||||
return super().render_change_form(request, context, add=add, change=change, form_url=form_url, obj=obj)
|
||||
|
||||
def response_add(self, request: HttpRequest, obj: Tag, post_url_continue=None):
|
||||
if IS_POPUP_VAR in request.POST or '_continue' in request.POST or '_addanother' in request.POST:
|
||||
if IS_POPUP_VAR in request.POST or "_continue" in request.POST or "_addanother" in request.POST:
|
||||
return super().response_add(request, obj, post_url_continue=post_url_continue)
|
||||
|
||||
self.message_user(request, f'Tag "{obj.name}" saved.', level=messages.SUCCESS)
|
||||
return self._redirect_to_changelist(obj.name)
|
||||
|
||||
def response_change(self, request: HttpRequest, obj: Tag):
|
||||
if IS_POPUP_VAR in request.POST or '_continue' in request.POST or '_addanother' in request.POST or '_saveasnew' in request.POST:
|
||||
if IS_POPUP_VAR in request.POST or "_continue" in request.POST or "_addanother" in request.POST or "_saveasnew" in request.POST:
|
||||
return super().response_change(request, obj)
|
||||
|
||||
self.message_user(request, f'Tag "{obj.name}" updated.', level=messages.SUCCESS)
|
||||
return self._redirect_to_changelist(obj.name)
|
||||
|
||||
def _redirect_to_changelist(self, query: str = '') -> HttpResponseRedirect:
|
||||
changelist_url = reverse('admin:core_tag_changelist')
|
||||
def _redirect_to_changelist(self, query: str = "") -> HttpResponseRedirect:
|
||||
changelist_url = reverse("admin:core_tag_changelist")
|
||||
if query:
|
||||
changelist_url = f'{changelist_url}?q={quote(query)}'
|
||||
changelist_url = f"{changelist_url}?q={quote(query)}"
|
||||
return HttpResponseRedirect(changelist_url)
|
||||
|
||||
@admin.display(description='Snapshots')
|
||||
@admin.display(description="Snapshots")
|
||||
def snapshots(self, tag: Tag):
|
||||
snapshots = tag.snapshot_set.select_related('crawl__created_by').order_by('-downloaded_at', '-created_at', '-pk')[:10]
|
||||
snapshots = tag.snapshot_set.select_related("crawl__created_by").order_by("-downloaded_at", "-created_at", "-pk")[:10]
|
||||
total_count = tag.snapshot_set.count()
|
||||
if not snapshots:
|
||||
return mark_safe(
|
||||
f'<p style="margin:0;color:#64748b;">No snapshots use this tag yet. '
|
||||
f'<a href="/admin/core/snapshot/?tags__id__exact={tag.id}">Open filtered snapshot list</a>.</p>'
|
||||
f'<a href="/admin/core/snapshot/?tags__id__exact={tag.id}">Open filtered snapshot list</a>.</p>',
|
||||
)
|
||||
|
||||
cards = []
|
||||
for snapshot in snapshots:
|
||||
title = (snapshot.title or '').strip() or snapshot.url
|
||||
cards.append(format_html(
|
||||
'''
|
||||
title = (snapshot.title or "").strip() or snapshot.url
|
||||
cards.append(
|
||||
format_html(
|
||||
"""
|
||||
<a href="{}" style="display:flex;align-items:center;gap:10px;padding:10px 12px;border:1px solid #e2e8f0;border-radius:12px;background:#fff;text-decoration:none;color:#0f172a;">
|
||||
<img src="{}" alt="" style="width:18px;height:18px;border-radius:4px;flex:0 0 auto;" onerror="this.style.display='none'">
|
||||
<span style="min-width:0;">
|
||||
@@ -184,23 +204,26 @@ class TagAdmin(BaseModelAdmin):
|
||||
<code style="display:block;color:#64748b;white-space:nowrap;overflow:hidden;text-overflow:ellipsis;">{}</code>
|
||||
</span>
|
||||
</a>
|
||||
''',
|
||||
reverse('admin:core_snapshot_change', args=[snapshot.pk]),
|
||||
build_snapshot_url(str(snapshot.pk), 'favicon.ico'),
|
||||
title[:120],
|
||||
snapshot.url[:120],
|
||||
))
|
||||
""",
|
||||
reverse("admin:core_snapshot_change", args=[snapshot.pk]),
|
||||
build_snapshot_url(str(snapshot.pk), "favicon.ico"),
|
||||
title[:120],
|
||||
snapshot.url[:120],
|
||||
),
|
||||
)
|
||||
|
||||
cards.append(format_html(
|
||||
'<a href="/admin/core/snapshot/?tags__id__exact={}" style="display:inline-flex;margin-top:10px;font-weight:600;">View all {} tagged snapshots</a>',
|
||||
tag.id,
|
||||
total_count,
|
||||
))
|
||||
return mark_safe('<div style="display:grid;gap:10px;">' + ''.join(cards) + '</div>')
|
||||
cards.append(
|
||||
format_html(
|
||||
'<a href="/admin/core/snapshot/?tags__id__exact={}" style="display:inline-flex;margin-top:10px;font-weight:600;">View all {} tagged snapshots</a>',
|
||||
tag.id,
|
||||
total_count,
|
||||
),
|
||||
)
|
||||
return mark_safe('<div style="display:grid;gap:10px;">' + "".join(cards) + "</div>")
|
||||
|
||||
@admin.display(description='Snapshots', ordering='num_snapshots')
|
||||
@admin.display(description="Snapshots", ordering="num_snapshots")
|
||||
def num_snapshots(self, tag: Tag):
|
||||
count = getattr(tag, 'num_snapshots', tag.snapshot_set.count())
|
||||
count = getattr(tag, "num_snapshots", tag.snapshot_set.count())
|
||||
return format_html(
|
||||
'<a href="/admin/core/snapshot/?tags__id__exact={}">{} total</a>',
|
||||
tag.id,
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
__package__ = 'archivebox.core'
|
||||
__package__ = "archivebox.core"
|
||||
|
||||
from django.contrib import admin
|
||||
from django.contrib.auth.admin import UserAdmin
|
||||
@@ -8,87 +8,100 @@ from django.utils.safestring import mark_safe
|
||||
|
||||
|
||||
class CustomUserAdmin(UserAdmin):
|
||||
sort_fields = ['id', 'email', 'username', 'is_superuser', 'last_login', 'date_joined']
|
||||
list_display = ['username', 'id', 'email', 'is_superuser', 'last_login', 'date_joined']
|
||||
readonly_fields = ('snapshot_set', 'archiveresult_set', 'tag_set', 'apitoken_set', 'outboundwebhook_set')
|
||||
sort_fields = ["id", "email", "username", "is_superuser", "last_login", "date_joined"]
|
||||
list_display = ["username", "id", "email", "is_superuser", "last_login", "date_joined"]
|
||||
readonly_fields = ("snapshot_set", "archiveresult_set", "tag_set", "apitoken_set", "outboundwebhook_set")
|
||||
|
||||
# Preserve Django's default user creation form and fieldsets
|
||||
# This ensures passwords are properly hashed and permissions are set correctly
|
||||
add_fieldsets = UserAdmin.add_fieldsets
|
||||
|
||||
# Extend fieldsets for change form only (not user creation)
|
||||
fieldsets = [*(UserAdmin.fieldsets or ()), ('Data', {'fields': readonly_fields})]
|
||||
fieldsets = [*(UserAdmin.fieldsets or ()), ("Data", {"fields": readonly_fields})]
|
||||
|
||||
@admin.display(description='Snapshots')
|
||||
@admin.display(description="Snapshots")
|
||||
def snapshot_set(self, obj):
|
||||
total_count = obj.snapshot_set.count()
|
||||
return mark_safe('<br/>'.join(
|
||||
format_html(
|
||||
'<code><a href="/admin/core/snapshot/{}/change"><b>[{}]</b></a></code> <b>📅 {}</b> {}',
|
||||
snap.pk,
|
||||
str(snap.id)[:8],
|
||||
snap.downloaded_at.strftime('%Y-%m-%d %H:%M') if snap.downloaded_at else 'pending...',
|
||||
snap.url[:64],
|
||||
return mark_safe(
|
||||
"<br/>".join(
|
||||
format_html(
|
||||
'<code><a href="/admin/core/snapshot/{}/change"><b>[{}]</b></a></code> <b>📅 {}</b> {}',
|
||||
snap.pk,
|
||||
str(snap.id)[:8],
|
||||
snap.downloaded_at.strftime("%Y-%m-%d %H:%M") if snap.downloaded_at else "pending...",
|
||||
snap.url[:64],
|
||||
)
|
||||
for snap in obj.snapshot_set.order_by("-modified_at")[:10]
|
||||
)
|
||||
for snap in obj.snapshot_set.order_by('-modified_at')[:10]
|
||||
) + f'<br/><a href="/admin/core/snapshot/?created_by__id__exact={obj.pk}">{total_count} total records...<a>')
|
||||
+ f'<br/><a href="/admin/core/snapshot/?created_by__id__exact={obj.pk}">{total_count} total records...<a>',
|
||||
)
|
||||
|
||||
@admin.display(description='Archive Result Logs')
|
||||
@admin.display(description="Archive Result Logs")
|
||||
def archiveresult_set(self, obj):
|
||||
total_count = obj.archiveresult_set.count()
|
||||
return mark_safe('<br/>'.join(
|
||||
format_html(
|
||||
'<code><a href="/admin/core/archiveresult/{}/change"><b>[{}]</b></a></code> <b>📅 {}</b> <b>📄 {}</b> {}',
|
||||
result.pk,
|
||||
str(result.id)[:8],
|
||||
result.snapshot.downloaded_at.strftime('%Y-%m-%d %H:%M') if result.snapshot.downloaded_at else 'pending...',
|
||||
result.extractor,
|
||||
result.snapshot.url[:64],
|
||||
return mark_safe(
|
||||
"<br/>".join(
|
||||
format_html(
|
||||
'<code><a href="/admin/core/archiveresult/{}/change"><b>[{}]</b></a></code> <b>📅 {}</b> <b>📄 {}</b> {}',
|
||||
result.pk,
|
||||
str(result.id)[:8],
|
||||
result.snapshot.downloaded_at.strftime("%Y-%m-%d %H:%M") if result.snapshot.downloaded_at else "pending...",
|
||||
result.extractor,
|
||||
result.snapshot.url[:64],
|
||||
)
|
||||
for result in obj.archiveresult_set.order_by("-modified_at")[:10]
|
||||
)
|
||||
for result in obj.archiveresult_set.order_by('-modified_at')[:10]
|
||||
) + f'<br/><a href="/admin/core/archiveresult/?created_by__id__exact={obj.pk}">{total_count} total records...<a>')
|
||||
+ f'<br/><a href="/admin/core/archiveresult/?created_by__id__exact={obj.pk}">{total_count} total records...<a>',
|
||||
)
|
||||
|
||||
@admin.display(description='Tags')
|
||||
@admin.display(description="Tags")
|
||||
def tag_set(self, obj):
|
||||
total_count = obj.tag_set.count()
|
||||
return mark_safe(', '.join(
|
||||
format_html(
|
||||
'<code><a href="/admin/core/tag/{}/change"><b>{}</b></a></code>',
|
||||
tag.pk,
|
||||
tag.name,
|
||||
return mark_safe(
|
||||
", ".join(
|
||||
format_html(
|
||||
'<code><a href="/admin/core/tag/{}/change"><b>{}</b></a></code>',
|
||||
tag.pk,
|
||||
tag.name,
|
||||
)
|
||||
for tag in obj.tag_set.order_by("-modified_at")[:10]
|
||||
)
|
||||
for tag in obj.tag_set.order_by('-modified_at')[:10]
|
||||
) + f'<br/><a href="/admin/core/tag/?created_by__id__exact={obj.pk}">{total_count} total records...<a>')
|
||||
+ f'<br/><a href="/admin/core/tag/?created_by__id__exact={obj.pk}">{total_count} total records...<a>',
|
||||
)
|
||||
|
||||
@admin.display(description='API Tokens')
|
||||
@admin.display(description="API Tokens")
|
||||
def apitoken_set(self, obj):
|
||||
total_count = obj.apitoken_set.count()
|
||||
return mark_safe('<br/>'.join(
|
||||
format_html(
|
||||
'<code><a href="/admin/api/apitoken/{}/change"><b>[{}]</b></a></code> {} (expires {})',
|
||||
apitoken.pk,
|
||||
str(apitoken.id)[:8],
|
||||
apitoken.token_redacted[:64],
|
||||
apitoken.expires,
|
||||
return mark_safe(
|
||||
"<br/>".join(
|
||||
format_html(
|
||||
'<code><a href="/admin/api/apitoken/{}/change"><b>[{}]</b></a></code> {} (expires {})',
|
||||
apitoken.pk,
|
||||
str(apitoken.id)[:8],
|
||||
apitoken.token_redacted[:64],
|
||||
apitoken.expires,
|
||||
)
|
||||
for apitoken in obj.apitoken_set.order_by("-modified_at")[:10]
|
||||
)
|
||||
for apitoken in obj.apitoken_set.order_by('-modified_at')[:10]
|
||||
) + f'<br/><a href="/admin/api/apitoken/?created_by__id__exact={obj.pk}">{total_count} total records...<a>')
|
||||
+ f'<br/><a href="/admin/api/apitoken/?created_by__id__exact={obj.pk}">{total_count} total records...<a>',
|
||||
)
|
||||
|
||||
@admin.display(description='API Outbound Webhooks')
|
||||
@admin.display(description="API Outbound Webhooks")
|
||||
def outboundwebhook_set(self, obj):
|
||||
total_count = obj.outboundwebhook_set.count()
|
||||
return mark_safe('<br/>'.join(
|
||||
format_html(
|
||||
'<code><a href="/admin/api/outboundwebhook/{}/change"><b>[{}]</b></a></code> {} -> {}',
|
||||
outboundwebhook.pk,
|
||||
str(outboundwebhook.id)[:8],
|
||||
outboundwebhook.referenced_model,
|
||||
outboundwebhook.endpoint,
|
||||
return mark_safe(
|
||||
"<br/>".join(
|
||||
format_html(
|
||||
'<code><a href="/admin/api/outboundwebhook/{}/change"><b>[{}]</b></a></code> {} -> {}',
|
||||
outboundwebhook.pk,
|
||||
str(outboundwebhook.id)[:8],
|
||||
outboundwebhook.referenced_model,
|
||||
outboundwebhook.endpoint,
|
||||
)
|
||||
for outboundwebhook in obj.outboundwebhook_set.order_by("-modified_at")[:10]
|
||||
)
|
||||
for outboundwebhook in obj.outboundwebhook_set.order_by('-modified_at')[:10]
|
||||
) + f'<br/><a href="/admin/api/outboundwebhook/?created_by__id__exact={obj.pk}">{total_count} total records...<a>')
|
||||
|
||||
|
||||
+ f'<br/><a href="/admin/api/outboundwebhook/?created_by__id__exact={obj.pk}">{total_count} total records...<a>',
|
||||
)
|
||||
|
||||
|
||||
def register_admin(admin_site):
|
||||
|
||||
@@ -1,12 +1,12 @@
|
||||
__package__ = 'archivebox.core'
|
||||
__package__ = "archivebox.core"
|
||||
|
||||
from django.apps import AppConfig
|
||||
import os
|
||||
|
||||
|
||||
class CoreConfig(AppConfig):
|
||||
name = 'archivebox.core'
|
||||
label = 'core'
|
||||
name = "archivebox.core"
|
||||
label = "core"
|
||||
|
||||
def ready(self):
|
||||
"""Register the archivebox.core.admin_site as the main django admin site"""
|
||||
@@ -14,29 +14,30 @@ class CoreConfig(AppConfig):
|
||||
from django.utils.autoreload import DJANGO_AUTORELOAD_ENV
|
||||
|
||||
from archivebox.core.admin_site import register_admin_site
|
||||
|
||||
register_admin_site()
|
||||
|
||||
# Import models to register state machines with the registry
|
||||
# Skip during makemigrations to avoid premature state machine access
|
||||
if 'makemigrations' not in sys.argv:
|
||||
if "makemigrations" not in sys.argv:
|
||||
from archivebox.core import models # noqa: F401
|
||||
|
||||
pidfile = os.environ.get('ARCHIVEBOX_RUNSERVER_PIDFILE')
|
||||
pidfile = os.environ.get("ARCHIVEBOX_RUNSERVER_PIDFILE")
|
||||
if pidfile:
|
||||
should_write_pid = True
|
||||
if os.environ.get('ARCHIVEBOX_AUTORELOAD') == '1':
|
||||
should_write_pid = os.environ.get(DJANGO_AUTORELOAD_ENV) == 'true'
|
||||
if os.environ.get("ARCHIVEBOX_AUTORELOAD") == "1":
|
||||
should_write_pid = os.environ.get(DJANGO_AUTORELOAD_ENV) == "true"
|
||||
if should_write_pid:
|
||||
try:
|
||||
with open(pidfile, 'w') as handle:
|
||||
with open(pidfile, "w") as handle:
|
||||
handle.write(str(os.getpid()))
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
def _should_prepare_runtime() -> bool:
|
||||
if os.environ.get('ARCHIVEBOX_RUNSERVER') == '1':
|
||||
if os.environ.get('ARCHIVEBOX_AUTORELOAD') == '1':
|
||||
return os.environ.get(DJANGO_AUTORELOAD_ENV) == 'true'
|
||||
if os.environ.get("ARCHIVEBOX_RUNSERVER") == "1":
|
||||
if os.environ.get("ARCHIVEBOX_AUTORELOAD") == "1":
|
||||
return os.environ.get(DJANGO_AUTORELOAD_ENV) == "true"
|
||||
return True
|
||||
return False
|
||||
|
||||
@@ -44,4 +45,5 @@ class CoreConfig(AppConfig):
|
||||
from archivebox.machine.models import Process, Machine
|
||||
|
||||
Process.cleanup_stale_running()
|
||||
Process.cleanup_orphaned_workers()
|
||||
Machine.current()
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
__package__ = 'archivebox.core'
|
||||
__package__ = "archivebox.core"
|
||||
|
||||
from django import forms
|
||||
from django.utils.html import format_html
|
||||
|
||||
from archivebox.misc.util import URL_REGEX, find_all_urls
|
||||
from archivebox.misc.util import URL_REGEX, find_all_urls, parse_filesize_to_bytes
|
||||
from taggit.utils import edit_string_for_tags, parse_tags
|
||||
from archivebox.base_models.admin import KeyValueWidget
|
||||
from archivebox.crawls.schedule_utils import validate_schedule
|
||||
@@ -13,11 +13,11 @@ from archivebox.hooks import get_plugins, discover_plugin_configs, get_plugin_ic
|
||||
from archivebox.personas.models import Persona
|
||||
|
||||
DEPTH_CHOICES = (
|
||||
('0', 'depth = 0 (archive just these URLs)'),
|
||||
('1', 'depth = 1 (+ URLs one hop away)'),
|
||||
('2', 'depth = 2 (+ URLs two hops away)'),
|
||||
('3', 'depth = 3 (+ URLs three hops away)'),
|
||||
('4', 'depth = 4 (+ URLs four hops away)'),
|
||||
("0", "depth = 0 (archive just these URLs)"),
|
||||
("1", "depth = 1 (+ URLs one hop away)"),
|
||||
("2", "depth = 2 (+ URLs two hops away)"),
|
||||
("3", "depth = 3 (+ URLs three hops away)"),
|
||||
("4", "depth = 4 (+ URLs four hops away)"),
|
||||
)
|
||||
|
||||
|
||||
@@ -28,7 +28,7 @@ def get_plugin_choices():
|
||||
|
||||
def get_plugin_choice_label(plugin_name: str, plugin_configs: dict[str, dict]) -> str:
|
||||
schema = plugin_configs.get(plugin_name, {})
|
||||
description = str(schema.get('description') or '').strip()
|
||||
description = str(schema.get("description") or "").strip()
|
||||
if not description:
|
||||
return plugin_name
|
||||
icon_html = get_plugin_icon(plugin_name)
|
||||
@@ -45,7 +45,7 @@ def get_plugin_choice_label(plugin_name: str, plugin_configs: dict[str, dict]) -
|
||||
def get_choice_field(form: forms.Form, name: str) -> forms.ChoiceField:
|
||||
field = form.fields[name]
|
||||
if not isinstance(field, forms.ChoiceField):
|
||||
raise TypeError(f'{name} must be a ChoiceField')
|
||||
raise TypeError(f"{name} must be a ChoiceField")
|
||||
return field
|
||||
|
||||
|
||||
@@ -54,10 +54,12 @@ class AddLinkForm(forms.Form):
|
||||
url = forms.CharField(
|
||||
label="URLs",
|
||||
strip=True,
|
||||
widget=forms.Textarea(attrs={
|
||||
'data-url-regex': URL_REGEX.pattern,
|
||||
}),
|
||||
required=True
|
||||
widget=forms.Textarea(
|
||||
attrs={
|
||||
"data-url-regex": URL_REGEX.pattern,
|
||||
},
|
||||
),
|
||||
required=True,
|
||||
)
|
||||
tag = forms.CharField(
|
||||
label="Tags",
|
||||
@@ -68,16 +70,41 @@ class AddLinkForm(forms.Form):
|
||||
depth = forms.ChoiceField(
|
||||
label="Archive depth",
|
||||
choices=DEPTH_CHOICES,
|
||||
initial='0',
|
||||
widget=forms.RadioSelect(attrs={"class": "depth-selection"})
|
||||
initial="0",
|
||||
widget=forms.RadioSelect(attrs={"class": "depth-selection"}),
|
||||
)
|
||||
max_urls = forms.IntegerField(
|
||||
label="Max URLs",
|
||||
required=False,
|
||||
min_value=0,
|
||||
initial=0,
|
||||
widget=forms.NumberInput(
|
||||
attrs={
|
||||
"min": 0,
|
||||
"step": 1,
|
||||
"placeholder": "0 = unlimited",
|
||||
},
|
||||
),
|
||||
)
|
||||
max_size = forms.CharField(
|
||||
label="Max size",
|
||||
required=False,
|
||||
initial="0",
|
||||
widget=forms.TextInput(
|
||||
attrs={
|
||||
"placeholder": "0 = unlimited, or e.g. 45mb / 1gb",
|
||||
},
|
||||
),
|
||||
)
|
||||
notes = forms.CharField(
|
||||
label="Notes",
|
||||
strip=True,
|
||||
required=False,
|
||||
widget=forms.TextInput(attrs={
|
||||
'placeholder': 'Optional notes about this crawl',
|
||||
})
|
||||
widget=forms.TextInput(
|
||||
attrs={
|
||||
"placeholder": "Optional notes about this crawl",
|
||||
},
|
||||
),
|
||||
)
|
||||
url_filters = forms.Field(
|
||||
label="URL allowlist / denylist",
|
||||
@@ -128,16 +155,18 @@ class AddLinkForm(forms.Form):
|
||||
label="Repeat schedule",
|
||||
max_length=64,
|
||||
required=False,
|
||||
widget=forms.TextInput(attrs={
|
||||
'placeholder': 'e.g., daily, weekly, 0 */6 * * * (every 6 hours)',
|
||||
})
|
||||
widget=forms.TextInput(
|
||||
attrs={
|
||||
"placeholder": "e.g., daily, weekly, 0 */6 * * * (every 6 hours)",
|
||||
},
|
||||
),
|
||||
)
|
||||
persona = forms.ModelChoiceField(
|
||||
label="Persona (authentication profile)",
|
||||
required=False,
|
||||
queryset=Persona.objects.none(),
|
||||
empty_label=None,
|
||||
to_field_name='name',
|
||||
to_field_name="name",
|
||||
)
|
||||
index_only = forms.BooleanField(
|
||||
label="Index only dry run (add crawl but don't archive yet)",
|
||||
@@ -155,8 +184,8 @@ class AddLinkForm(forms.Form):
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
default_persona = Persona.get_or_create_default()
|
||||
self.fields['persona'].queryset = Persona.objects.order_by('name')
|
||||
self.fields['persona'].initial = default_persona.name
|
||||
self.fields["persona"].queryset = Persona.objects.order_by("name")
|
||||
self.fields["persona"].initial = default_persona.name
|
||||
|
||||
# Get all plugins
|
||||
all_plugins = get_plugins()
|
||||
@@ -164,86 +193,136 @@ class AddLinkForm(forms.Form):
|
||||
|
||||
# Define plugin groups
|
||||
chrome_dependent = {
|
||||
'accessibility', 'chrome', 'consolelog', 'dom', 'headers',
|
||||
'parse_dom_outlinks', 'pdf', 'redirects', 'responses',
|
||||
'screenshot', 'seo', 'singlefile', 'ssl', 'staticfile', 'title'
|
||||
"accessibility",
|
||||
"chrome",
|
||||
"consolelog",
|
||||
"dom",
|
||||
"headers",
|
||||
"parse_dom_outlinks",
|
||||
"pdf",
|
||||
"redirects",
|
||||
"responses",
|
||||
"screenshot",
|
||||
"seo",
|
||||
"singlefile",
|
||||
"ssl",
|
||||
"staticfile",
|
||||
"title",
|
||||
}
|
||||
archiving = {
|
||||
'archivedotorg', 'defuddle', 'favicon', 'forumdl', 'gallerydl', 'git',
|
||||
'htmltotext', 'mercury', 'papersdl', 'readability', 'trafilatura', 'wget', 'ytdlp'
|
||||
"archivedotorg",
|
||||
"defuddle",
|
||||
"favicon",
|
||||
"forumdl",
|
||||
"gallerydl",
|
||||
"git",
|
||||
"htmltotext",
|
||||
"mercury",
|
||||
"papersdl",
|
||||
"readability",
|
||||
"trafilatura",
|
||||
"wget",
|
||||
"ytdlp",
|
||||
}
|
||||
parsing = {
|
||||
'parse_html_urls', 'parse_jsonl_urls',
|
||||
'parse_netscape_urls', 'parse_rss_urls', 'parse_txt_urls'
|
||||
"parse_html_urls",
|
||||
"parse_jsonl_urls",
|
||||
"parse_netscape_urls",
|
||||
"parse_rss_urls",
|
||||
"parse_txt_urls",
|
||||
}
|
||||
search = {
|
||||
'search_backend_ripgrep', 'search_backend_sonic', 'search_backend_sqlite'
|
||||
"search_backend_ripgrep",
|
||||
"search_backend_sonic",
|
||||
"search_backend_sqlite",
|
||||
}
|
||||
binary = {'apt', 'brew', 'custom', 'env', 'npm', 'pip'}
|
||||
extensions = {'twocaptcha', 'istilldontcareaboutcookies', 'ublock'}
|
||||
binary = {"apt", "brew", "custom", "env", "npm", "pip"}
|
||||
extensions = {"twocaptcha", "istilldontcareaboutcookies", "ublock"}
|
||||
|
||||
# Populate plugin field choices
|
||||
get_choice_field(self, 'chrome_plugins').choices = [
|
||||
get_choice_field(self, "chrome_plugins").choices = [
|
||||
(p, get_plugin_choice_label(p, plugin_configs)) for p in sorted(all_plugins) if p in chrome_dependent
|
||||
]
|
||||
get_choice_field(self, 'archiving_plugins').choices = [
|
||||
get_choice_field(self, "archiving_plugins").choices = [
|
||||
(p, get_plugin_choice_label(p, plugin_configs)) for p in sorted(all_plugins) if p in archiving
|
||||
]
|
||||
get_choice_field(self, 'parsing_plugins').choices = [
|
||||
get_choice_field(self, "parsing_plugins").choices = [
|
||||
(p, get_plugin_choice_label(p, plugin_configs)) for p in sorted(all_plugins) if p in parsing
|
||||
]
|
||||
get_choice_field(self, 'search_plugins').choices = [
|
||||
get_choice_field(self, "search_plugins").choices = [
|
||||
(p, get_plugin_choice_label(p, plugin_configs)) for p in sorted(all_plugins) if p in search
|
||||
]
|
||||
get_choice_field(self, 'binary_plugins').choices = [
|
||||
get_choice_field(self, "binary_plugins").choices = [
|
||||
(p, get_plugin_choice_label(p, plugin_configs)) for p in sorted(all_plugins) if p in binary
|
||||
]
|
||||
get_choice_field(self, 'extension_plugins').choices = [
|
||||
get_choice_field(self, "extension_plugins").choices = [
|
||||
(p, get_plugin_choice_label(p, plugin_configs)) for p in sorted(all_plugins) if p in extensions
|
||||
]
|
||||
|
||||
required_search_plugin = f'search_backend_{SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE}'.strip()
|
||||
search_choices = [choice[0] for choice in get_choice_field(self, 'search_plugins').choices]
|
||||
required_search_plugin = f"search_backend_{SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE}".strip()
|
||||
search_choices = [choice[0] for choice in get_choice_field(self, "search_plugins").choices]
|
||||
if required_search_plugin in search_choices:
|
||||
get_choice_field(self, 'search_plugins').initial = [required_search_plugin]
|
||||
get_choice_field(self, "search_plugins").initial = [required_search_plugin]
|
||||
|
||||
def clean(self):
|
||||
cleaned_data = super().clean() or {}
|
||||
|
||||
# Combine all plugin groups into single list
|
||||
all_selected_plugins = []
|
||||
for field in ['chrome_plugins', 'archiving_plugins', 'parsing_plugins',
|
||||
'search_plugins', 'binary_plugins', 'extension_plugins']:
|
||||
for field in [
|
||||
"chrome_plugins",
|
||||
"archiving_plugins",
|
||||
"parsing_plugins",
|
||||
"search_plugins",
|
||||
"binary_plugins",
|
||||
"extension_plugins",
|
||||
]:
|
||||
selected = cleaned_data.get(field)
|
||||
if isinstance(selected, list):
|
||||
all_selected_plugins.extend(selected)
|
||||
|
||||
# Store combined list for easy access
|
||||
cleaned_data['plugins'] = all_selected_plugins
|
||||
cleaned_data["plugins"] = all_selected_plugins
|
||||
|
||||
return cleaned_data
|
||||
|
||||
def clean_url(self):
|
||||
value = self.cleaned_data.get('url') or ''
|
||||
urls = '\n'.join(find_all_urls(value))
|
||||
value = self.cleaned_data.get("url") or ""
|
||||
urls = "\n".join(find_all_urls(value))
|
||||
if not urls:
|
||||
raise forms.ValidationError('Enter at least one valid URL.')
|
||||
raise forms.ValidationError("Enter at least one valid URL.")
|
||||
return urls
|
||||
|
||||
def clean_url_filters(self):
|
||||
from archivebox.crawls.models import Crawl
|
||||
|
||||
value = self.cleaned_data.get('url_filters') or {}
|
||||
value = self.cleaned_data.get("url_filters") or {}
|
||||
return {
|
||||
'allowlist': '\n'.join(Crawl.split_filter_patterns(value.get('allowlist', ''))),
|
||||
'denylist': '\n'.join(Crawl.split_filter_patterns(value.get('denylist', ''))),
|
||||
'same_domain_only': bool(value.get('same_domain_only')),
|
||||
"allowlist": "\n".join(Crawl.split_filter_patterns(value.get("allowlist", ""))),
|
||||
"denylist": "\n".join(Crawl.split_filter_patterns(value.get("denylist", ""))),
|
||||
"same_domain_only": bool(value.get("same_domain_only")),
|
||||
}
|
||||
|
||||
def clean_max_urls(self):
|
||||
value = self.cleaned_data.get("max_urls")
|
||||
return int(value or 0)
|
||||
|
||||
def clean_max_size(self):
|
||||
raw_value = str(self.cleaned_data.get("max_size") or "").strip()
|
||||
if not raw_value:
|
||||
return 0
|
||||
try:
|
||||
value = parse_filesize_to_bytes(raw_value)
|
||||
except ValueError as err:
|
||||
raise forms.ValidationError(str(err))
|
||||
if value < 0:
|
||||
raise forms.ValidationError("Max size must be 0 or a positive number of bytes.")
|
||||
return value
|
||||
|
||||
def clean_schedule(self):
|
||||
schedule = (self.cleaned_data.get('schedule') or '').strip()
|
||||
schedule = (self.cleaned_data.get("schedule") or "").strip()
|
||||
if not schedule:
|
||||
return ''
|
||||
return ""
|
||||
|
||||
try:
|
||||
validate_schedule(schedule)
|
||||
@@ -269,7 +348,7 @@ class TagField(forms.CharField):
|
||||
return parse_tags(value)
|
||||
except ValueError:
|
||||
raise forms.ValidationError(
|
||||
"Please provide a comma-separated list of tags."
|
||||
"Please provide a comma-separated list of tags.",
|
||||
)
|
||||
|
||||
def has_changed(self, initial, data):
|
||||
|
||||
@@ -1,7 +1,5 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from urllib.parse import urlparse
|
||||
|
||||
@@ -9,6 +7,7 @@ from archivebox.config.common import SERVER_CONFIG
|
||||
|
||||
|
||||
_SNAPSHOT_ID_RE = re.compile(r"^[0-9a-fA-F-]{8,36}$")
|
||||
_SNAPSHOT_SUBDOMAIN_RE = re.compile(r"^snap-(?P<suffix>[0-9a-fA-F]{12})$")
|
||||
|
||||
|
||||
def split_host_port(host: str) -> tuple[str, str | None]:
|
||||
@@ -71,21 +70,29 @@ def get_web_host() -> str:
|
||||
return urlparse(override).netloc.lower()
|
||||
return _build_listen_host("web")
|
||||
|
||||
|
||||
def get_api_host() -> str:
|
||||
if not SERVER_CONFIG.USES_SUBDOMAIN_ROUTING:
|
||||
return get_listen_host().lower()
|
||||
return _build_listen_host("api")
|
||||
|
||||
|
||||
def get_public_host() -> str:
|
||||
if not SERVER_CONFIG.USES_SUBDOMAIN_ROUTING:
|
||||
return get_listen_host().lower()
|
||||
return _build_listen_host("public")
|
||||
|
||||
|
||||
def get_snapshot_subdomain(snapshot_id: str) -> str:
|
||||
normalized = re.sub(r"[^0-9a-fA-F]", "", snapshot_id or "")
|
||||
suffix = (normalized[-12:] if len(normalized) >= 12 else normalized).lower()
|
||||
return f"snap-{suffix}"
|
||||
|
||||
|
||||
def get_snapshot_host(snapshot_id: str) -> str:
|
||||
if not SERVER_CONFIG.USES_SUBDOMAIN_ROUTING:
|
||||
return get_listen_host().lower()
|
||||
return _build_listen_host(snapshot_id)
|
||||
return _build_listen_host(get_snapshot_subdomain(snapshot_id))
|
||||
|
||||
|
||||
def get_original_host(domain: str) -> str:
|
||||
@@ -95,7 +102,16 @@ def get_original_host(domain: str) -> str:
|
||||
|
||||
|
||||
def is_snapshot_subdomain(subdomain: str) -> bool:
|
||||
return bool(_SNAPSHOT_ID_RE.match(subdomain or ""))
|
||||
value = (subdomain or "").strip()
|
||||
return bool(_SNAPSHOT_SUBDOMAIN_RE.match(value) or _SNAPSHOT_ID_RE.match(value))
|
||||
|
||||
|
||||
def get_snapshot_lookup_key(snapshot_ref: str) -> str:
|
||||
value = (snapshot_ref or "").strip().lower()
|
||||
match = _SNAPSHOT_SUBDOMAIN_RE.match(value)
|
||||
if match:
|
||||
return match.group("suffix")
|
||||
return value
|
||||
|
||||
|
||||
def get_listen_subdomain(request_host: str) -> str:
|
||||
@@ -141,22 +157,23 @@ def _build_base_url_for_host(host: str, request=None) -> str:
|
||||
|
||||
|
||||
def get_admin_base_url(request=None) -> str:
|
||||
if not SERVER_CONFIG.USES_SUBDOMAIN_ROUTING:
|
||||
return _build_base_url_for_host(get_listen_host(), request=request)
|
||||
override = _normalize_base_url(SERVER_CONFIG.ADMIN_BASE_URL)
|
||||
if override:
|
||||
return override
|
||||
if not SERVER_CONFIG.USES_SUBDOMAIN_ROUTING:
|
||||
return _build_base_url_for_host(get_listen_host(), request=request)
|
||||
return _build_base_url_for_host(get_admin_host(), request=request)
|
||||
|
||||
|
||||
def get_web_base_url(request=None) -> str:
|
||||
if not SERVER_CONFIG.USES_SUBDOMAIN_ROUTING:
|
||||
return _build_base_url_for_host(get_listen_host(), request=request)
|
||||
override = _normalize_base_url(SERVER_CONFIG.ARCHIVE_BASE_URL)
|
||||
if override:
|
||||
return override
|
||||
if not SERVER_CONFIG.USES_SUBDOMAIN_ROUTING:
|
||||
return _build_base_url_for_host(get_listen_host(), request=request)
|
||||
return _build_base_url_for_host(get_web_host(), request=request)
|
||||
|
||||
|
||||
def get_api_base_url(request=None) -> str:
|
||||
if not SERVER_CONFIG.USES_SUBDOMAIN_ROUTING:
|
||||
return _build_base_url_for_host(get_listen_host(), request=request)
|
||||
@@ -191,6 +208,7 @@ def build_admin_url(path: str = "", request=None) -> str:
|
||||
def build_web_url(path: str = "", request=None) -> str:
|
||||
return _build_url(get_web_base_url(request), path)
|
||||
|
||||
|
||||
def build_api_url(path: str = "", request=None) -> str:
|
||||
return _build_url(get_api_base_url(request), path)
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
__package__ = 'archivebox'
|
||||
__package__ = "archivebox"
|
||||
|
||||
from django.core.management.base import BaseCommand
|
||||
|
||||
@@ -6,13 +6,12 @@ from archivebox.cli import main as run_cli
|
||||
|
||||
|
||||
class Command(BaseCommand):
|
||||
help = 'Run an ArchiveBox CLI subcommand (e.g. add, remove, list, etc)'
|
||||
help = "Run an ArchiveBox CLI subcommand (e.g. add, remove, list, etc)"
|
||||
|
||||
def add_arguments(self, parser):
|
||||
parser.add_argument('subcommand', type=str, help='The subcommand you want to run')
|
||||
parser.add_argument('command_args', nargs='*', help='Arguments to pass to the subcommand')
|
||||
|
||||
parser.add_argument("subcommand", type=str, help="The subcommand you want to run")
|
||||
parser.add_argument("command_args", nargs="*", help="Arguments to pass to the subcommand")
|
||||
|
||||
def handle(self, *args, **kwargs):
|
||||
command_args = [kwargs['subcommand'], *kwargs['command_args']]
|
||||
command_args = [kwargs["subcommand"], *kwargs["command_args"]]
|
||||
run_cli(args=command_args)
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
__package__ = 'archivebox.core'
|
||||
__package__ = "archivebox.core"
|
||||
|
||||
import ipaddress
|
||||
import re
|
||||
@@ -16,6 +16,7 @@ from archivebox.config.common import SERVER_CONFIG
|
||||
from archivebox.config import VERSION
|
||||
from archivebox.config.version import get_COMMIT_HASH
|
||||
from archivebox.core.host_utils import (
|
||||
build_snapshot_url,
|
||||
build_admin_url,
|
||||
build_web_url,
|
||||
get_api_host,
|
||||
@@ -31,10 +32,10 @@ from archivebox.core.host_utils import (
|
||||
from archivebox.core.views import SnapshotHostView, OriginalDomainHostView
|
||||
|
||||
|
||||
def detect_timezone(request, activate: bool=True):
|
||||
gmt_offset = (request.COOKIES.get('GMT_OFFSET') or '').strip()
|
||||
def detect_timezone(request, activate: bool = True):
|
||||
gmt_offset = (request.COOKIES.get("GMT_OFFSET") or "").strip()
|
||||
tz = None
|
||||
if gmt_offset.replace('-', '').isdigit():
|
||||
if gmt_offset.replace("-", "").isdigit():
|
||||
tz = timezone.get_fixed_timezone(int(gmt_offset))
|
||||
if activate:
|
||||
timezone.activate(tz)
|
||||
@@ -53,11 +54,12 @@ def TimezoneMiddleware(get_response):
|
||||
def CacheControlMiddleware(get_response):
|
||||
snapshot_path_re = re.compile(r"^/[^/]+/\\d{8}/[^/]+/[0-9a-fA-F-]{8,36}/")
|
||||
static_cache_key = (get_COMMIT_HASH() or VERSION or "dev").strip()
|
||||
|
||||
def middleware(request):
|
||||
response = get_response(request)
|
||||
|
||||
if request.path.startswith('/static/'):
|
||||
rel_path = request.path[len('/static/'):]
|
||||
if request.path.startswith("/static/"):
|
||||
rel_path = request.path[len("/static/") :]
|
||||
static_path = finders.find(rel_path)
|
||||
if static_path:
|
||||
try:
|
||||
@@ -81,10 +83,10 @@ def CacheControlMiddleware(get_response):
|
||||
response.headers["Last-Modified"] = http_date(mtime)
|
||||
return response
|
||||
|
||||
if '/archive/' in request.path or '/static/' in request.path or snapshot_path_re.match(request.path):
|
||||
if not response.get('Cache-Control'):
|
||||
policy = 'public' if SERVER_CONFIG.PUBLIC_SNAPSHOTS else 'private'
|
||||
response['Cache-Control'] = f'{policy}, max-age=60, stale-while-revalidate=300'
|
||||
if "/archive/" in request.path or "/static/" in request.path or snapshot_path_re.match(request.path):
|
||||
if not response.get("Cache-Control"):
|
||||
policy = "public" if SERVER_CONFIG.PUBLIC_SNAPSHOTS else "private"
|
||||
response["Cache-Control"] = f"{policy}, max-age=60, stale-while-revalidate=300"
|
||||
# print('Set Cache-Control header to', response['Cache-Control'])
|
||||
return response
|
||||
|
||||
@@ -115,6 +117,10 @@ def ServerSecurityModeMiddleware(get_response):
|
||||
|
||||
|
||||
def HostRoutingMiddleware(get_response):
|
||||
snapshot_path_re = re.compile(
|
||||
r"^/(?P<username>[^/]+)/(?P<date>\d{4}(?:\d{2})?(?:\d{2})?)/(?P<domain>[^/]+)/(?P<snapshot_id>[0-9a-fA-F-]{8,36})(?:/(?P<path>.*))?$",
|
||||
)
|
||||
|
||||
def middleware(request):
|
||||
request_host = (request.get_host() or "").lower()
|
||||
admin_host = get_admin_host()
|
||||
@@ -124,6 +130,23 @@ def HostRoutingMiddleware(get_response):
|
||||
listen_host = get_listen_host()
|
||||
subdomain = get_listen_subdomain(request_host)
|
||||
|
||||
# Framework-owned assets must bypass snapshot/original-domain replay routing.
|
||||
# Otherwise pages on snapshot subdomains can receive HTML for JS/CSS requests.
|
||||
if request.path.startswith("/static/") or request.path in {"/favicon.ico", "/robots.txt"}:
|
||||
return get_response(request)
|
||||
|
||||
if SERVER_CONFIG.USES_SUBDOMAIN_ROUTING and not host_matches(request_host, admin_host):
|
||||
if (
|
||||
request.path == "/admin"
|
||||
or request.path.startswith("/admin/")
|
||||
or request.path == "/accounts"
|
||||
or request.path.startswith("/accounts/")
|
||||
):
|
||||
target = build_admin_url(request.path, request=request)
|
||||
if request.META.get("QUERY_STRING"):
|
||||
target = f"{target}?{request.META['QUERY_STRING']}"
|
||||
return redirect(target)
|
||||
|
||||
if not SERVER_CONFIG.USES_SUBDOMAIN_ROUTING:
|
||||
if host_matches(request_host, listen_host):
|
||||
return get_response(request)
|
||||
@@ -140,6 +163,16 @@ def HostRoutingMiddleware(get_response):
|
||||
return get_response(request)
|
||||
|
||||
if host_matches(request_host, admin_host):
|
||||
snapshot_match = snapshot_path_re.match(request.path)
|
||||
if SERVER_CONFIG.USES_SUBDOMAIN_ROUTING and snapshot_match:
|
||||
snapshot_id = snapshot_match.group("snapshot_id")
|
||||
replay_path = (snapshot_match.group("path") or "").strip("/")
|
||||
if replay_path == "index.html":
|
||||
replay_path = ""
|
||||
target = build_snapshot_url(snapshot_id, replay_path, request=request)
|
||||
if request.META.get("QUERY_STRING"):
|
||||
target = f"{target}?{request.META['QUERY_STRING']}"
|
||||
return redirect(target)
|
||||
return get_response(request)
|
||||
|
||||
if host_matches(request_host, api_host):
|
||||
@@ -160,16 +193,9 @@ def HostRoutingMiddleware(get_response):
|
||||
if host_matches(request_host, web_host):
|
||||
request.user = AnonymousUser()
|
||||
request._cached_user = request.user
|
||||
if request.path.startswith("/admin"):
|
||||
target = build_admin_url(request.path, request=request)
|
||||
if request.META.get("QUERY_STRING"):
|
||||
target = f"{target}?{request.META['QUERY_STRING']}"
|
||||
return redirect(target)
|
||||
return get_response(request)
|
||||
|
||||
if host_matches(request_host, public_host):
|
||||
request.user = AnonymousUser()
|
||||
request._cached_user = request.user
|
||||
return get_response(request)
|
||||
|
||||
if subdomain:
|
||||
@@ -196,24 +222,26 @@ def HostRoutingMiddleware(get_response):
|
||||
|
||||
return middleware
|
||||
|
||||
|
||||
class ReverseProxyAuthMiddleware(RemoteUserMiddleware):
|
||||
header = 'HTTP_{normalized}'.format(normalized=SERVER_CONFIG.REVERSE_PROXY_USER_HEADER.replace('-', '_').upper())
|
||||
header = "HTTP_{normalized}".format(normalized=SERVER_CONFIG.REVERSE_PROXY_USER_HEADER.replace("-", "_").upper())
|
||||
|
||||
def process_request(self, request):
|
||||
if SERVER_CONFIG.REVERSE_PROXY_WHITELIST == '':
|
||||
if SERVER_CONFIG.REVERSE_PROXY_WHITELIST == "":
|
||||
return
|
||||
|
||||
ip = request.META.get('REMOTE_ADDR')
|
||||
ip = request.META.get("REMOTE_ADDR")
|
||||
if not isinstance(ip, str):
|
||||
return
|
||||
|
||||
for cidr in SERVER_CONFIG.REVERSE_PROXY_WHITELIST.split(','):
|
||||
for cidr in SERVER_CONFIG.REVERSE_PROXY_WHITELIST.split(","):
|
||||
try:
|
||||
network = ipaddress.ip_network(cidr)
|
||||
except ValueError:
|
||||
raise ImproperlyConfigured(
|
||||
"The REVERSE_PROXY_WHITELIST config paramater is in invalid format, or "
|
||||
"contains invalid CIDR. Correct format is a coma-separated list of IPv4/IPv6 CIDRs.")
|
||||
"The REVERSE_PROXY_WHITELIST config parameter is in invalid format, or "
|
||||
"contains invalid CIDR. Correct format is a coma-separated list of IPv4/IPv6 CIDRs.",
|
||||
)
|
||||
|
||||
if ipaddress.ip_address(ip) in network:
|
||||
return super().process_request(request)
|
||||
|
||||
@@ -5,23 +5,21 @@ import uuid
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
initial = True
|
||||
|
||||
dependencies = [
|
||||
]
|
||||
dependencies = []
|
||||
|
||||
operations = [
|
||||
migrations.CreateModel(
|
||||
name='Snapshot',
|
||||
name="Snapshot",
|
||||
fields=[
|
||||
('id', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)),
|
||||
('url', models.URLField(unique=True)),
|
||||
('timestamp', models.CharField(default=None, max_length=32, null=True, unique=True)),
|
||||
('title', models.CharField(default=None, max_length=128, null=True)),
|
||||
('tags', models.CharField(default=None, max_length=256, null=True)),
|
||||
('added', models.DateTimeField(auto_now_add=True)),
|
||||
('updated', models.DateTimeField(default=None, null=True)),
|
||||
("id", models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)),
|
||||
("url", models.URLField(unique=True)),
|
||||
("timestamp", models.CharField(default=None, max_length=32, null=True, unique=True)),
|
||||
("title", models.CharField(default=None, max_length=128, null=True)),
|
||||
("tags", models.CharField(default=None, max_length=256, null=True)),
|
||||
("added", models.DateTimeField(auto_now_add=True)),
|
||||
("updated", models.DateTimeField(default=None, null=True)),
|
||||
],
|
||||
),
|
||||
]
|
||||
|
||||
@@ -4,15 +4,14 @@ from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0001_initial'),
|
||||
("core", "0001_initial"),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='timestamp',
|
||||
model_name="snapshot",
|
||||
name="timestamp",
|
||||
field=models.CharField(default=None, max_length=32, null=True),
|
||||
),
|
||||
]
|
||||
|
||||
@@ -4,35 +4,34 @@ from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0002_auto_20200625_1521'),
|
||||
("core", "0002_auto_20200625_1521"),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='added',
|
||||
model_name="snapshot",
|
||||
name="added",
|
||||
field=models.DateTimeField(auto_now_add=True, db_index=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='tags',
|
||||
model_name="snapshot",
|
||||
name="tags",
|
||||
field=models.CharField(db_index=True, default=None, max_length=256, null=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='timestamp',
|
||||
model_name="snapshot",
|
||||
name="timestamp",
|
||||
field=models.CharField(db_index=True, default=None, max_length=32, null=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='title',
|
||||
model_name="snapshot",
|
||||
name="title",
|
||||
field=models.CharField(db_index=True, default=None, max_length=128, null=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='updated',
|
||||
model_name="snapshot",
|
||||
name="updated",
|
||||
field=models.DateTimeField(db_index=True, default=None, null=True),
|
||||
),
|
||||
]
|
||||
|
||||
@@ -4,15 +4,14 @@ from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0003_auto_20200630_1034'),
|
||||
("core", "0003_auto_20200630_1034"),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='timestamp',
|
||||
model_name="snapshot",
|
||||
name="timestamp",
|
||||
field=models.CharField(db_index=True, default=None, max_length=32, unique=True),
|
||||
preserve_default=False,
|
||||
),
|
||||
|
||||
@@ -4,25 +4,24 @@ from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0004_auto_20200713_1552'),
|
||||
("core", "0004_auto_20200713_1552"),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='tags',
|
||||
model_name="snapshot",
|
||||
name="tags",
|
||||
field=models.CharField(blank=True, db_index=True, max_length=256, null=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='title',
|
||||
model_name="snapshot",
|
||||
name="title",
|
||||
field=models.CharField(blank=True, db_index=True, max_length=128, null=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='updated',
|
||||
model_name="snapshot",
|
||||
name="updated",
|
||||
field=models.DateTimeField(blank=True, db_index=True, null=True),
|
||||
),
|
||||
]
|
||||
|
||||
@@ -3,19 +3,18 @@
|
||||
from django.db import migrations, models
|
||||
from django.utils.text import slugify
|
||||
|
||||
|
||||
def forwards_func(apps, schema_editor):
|
||||
SnapshotModel = apps.get_model("core", "Snapshot")
|
||||
TagModel = apps.get_model("core", "Tag")
|
||||
|
||||
snapshots = SnapshotModel.objects.all()
|
||||
for snapshot in snapshots:
|
||||
tag_set = (
|
||||
set(tag.strip() for tag in (snapshot.tags_old or '').split(','))
|
||||
)
|
||||
tag_set = {tag.strip() for tag in (snapshot.tags_old or "").split(",")}
|
||||
tag_set.discard("")
|
||||
|
||||
for tag in tag_set:
|
||||
to_add, _ = TagModel.objects.get_or_create(name=tag, defaults={'slug': slugify(tag)})
|
||||
to_add, _ = TagModel.objects.get_or_create(name=tag, defaults={"slug": slugify(tag)})
|
||||
snapshot.tags.add(to_add)
|
||||
|
||||
|
||||
@@ -30,37 +29,36 @@ def reverse_func(apps, schema_editor):
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0005_auto_20200728_0326'),
|
||||
("core", "0005_auto_20200728_0326"),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.RenameField(
|
||||
model_name='snapshot',
|
||||
old_name='tags',
|
||||
new_name='tags_old',
|
||||
model_name="snapshot",
|
||||
old_name="tags",
|
||||
new_name="tags_old",
|
||||
),
|
||||
migrations.CreateModel(
|
||||
name='Tag',
|
||||
name="Tag",
|
||||
fields=[
|
||||
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
|
||||
('name', models.CharField(max_length=100, unique=True, verbose_name='name')),
|
||||
('slug', models.SlugField(max_length=100, unique=True, verbose_name='slug')),
|
||||
("id", models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")),
|
||||
("name", models.CharField(max_length=100, unique=True, verbose_name="name")),
|
||||
("slug", models.SlugField(max_length=100, unique=True, verbose_name="slug")),
|
||||
],
|
||||
options={
|
||||
'verbose_name': 'Tag',
|
||||
'verbose_name_plural': 'Tags',
|
||||
"verbose_name": "Tag",
|
||||
"verbose_name_plural": "Tags",
|
||||
},
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='tags',
|
||||
field=models.ManyToManyField(to='core.Tag'),
|
||||
model_name="snapshot",
|
||||
name="tags",
|
||||
field=models.ManyToManyField(to="core.Tag"),
|
||||
),
|
||||
migrations.RunPython(forwards_func, reverse_func),
|
||||
migrations.RemoveField(
|
||||
model_name='snapshot',
|
||||
name='tags_old',
|
||||
model_name="snapshot",
|
||||
name="tags_old",
|
||||
),
|
||||
]
|
||||
|
||||
@@ -9,13 +9,15 @@ import django.db.models.deletion
|
||||
# Handle old vs new import paths
|
||||
try:
|
||||
from archivebox.config import CONSTANTS
|
||||
|
||||
ARCHIVE_DIR = CONSTANTS.ARCHIVE_DIR
|
||||
except ImportError:
|
||||
try:
|
||||
from archivebox.config import CONFIG
|
||||
ARCHIVE_DIR = Path(CONFIG.get('ARCHIVE_DIR', './archive'))
|
||||
|
||||
ARCHIVE_DIR = Path(CONFIG.get("ARCHIVE_DIR", "./archive"))
|
||||
except ImportError:
|
||||
ARCHIVE_DIR = Path('./archive')
|
||||
ARCHIVE_DIR = Path("./archive")
|
||||
|
||||
try:
|
||||
from archivebox.misc.util import to_json
|
||||
@@ -29,6 +31,7 @@ try:
|
||||
JSONField = models.JSONField
|
||||
except AttributeError:
|
||||
import jsonfield
|
||||
|
||||
JSONField = jsonfield.JSONField
|
||||
|
||||
|
||||
@@ -41,7 +44,7 @@ def forwards_func(apps, schema_editor):
|
||||
out_dir = Path(ARCHIVE_DIR) / snapshot.timestamp
|
||||
|
||||
try:
|
||||
with open(out_dir / "index.json", "r") as f:
|
||||
with open(out_dir / "index.json") as f:
|
||||
fs_index = json.load(f)
|
||||
except Exception:
|
||||
continue
|
||||
@@ -56,37 +59,46 @@ def forwards_func(apps, schema_editor):
|
||||
snapshot=snapshot,
|
||||
pwd=result["pwd"],
|
||||
cmd=result.get("cmd") or [],
|
||||
cmd_version=result.get("cmd_version") or 'unknown',
|
||||
cmd_version=result.get("cmd_version") or "unknown",
|
||||
start_ts=result["start_ts"],
|
||||
end_ts=result["end_ts"],
|
||||
status=result["status"],
|
||||
output=result.get("output") or 'null',
|
||||
output=result.get("output") or "null",
|
||||
)
|
||||
except Exception as e:
|
||||
print(
|
||||
' ! Skipping import due to missing/invalid index.json:',
|
||||
" ! Skipping import due to missing/invalid index.json:",
|
||||
out_dir,
|
||||
e,
|
||||
'(open an issue with this index.json for help)',
|
||||
"(open an issue with this index.json for help)",
|
||||
)
|
||||
|
||||
|
||||
def verify_json_index_integrity(snapshot):
|
||||
results = snapshot.archiveresult_set.all()
|
||||
out_dir = Path(ARCHIVE_DIR) / snapshot.timestamp
|
||||
with open(out_dir / "index.json", "r") as f:
|
||||
with open(out_dir / "index.json") as f:
|
||||
index = json.load(f)
|
||||
|
||||
history = index["history"]
|
||||
index_results = [result for extractor in history for result in history[extractor]]
|
||||
flattened_results = [result["start_ts"] for result in index_results]
|
||||
|
||||
|
||||
missing_results = [result for result in results if result.start_ts.isoformat() not in flattened_results]
|
||||
|
||||
for missing in missing_results:
|
||||
index["history"][missing.extractor].append({"cmd": missing.cmd, "cmd_version": missing.cmd_version, "end_ts": missing.end_ts.isoformat(),
|
||||
"start_ts": missing.start_ts.isoformat(), "pwd": missing.pwd, "output": missing.output,
|
||||
"schema": "ArchiveResult", "status": missing.status})
|
||||
index["history"][missing.extractor].append(
|
||||
{
|
||||
"cmd": missing.cmd,
|
||||
"cmd_version": missing.cmd_version,
|
||||
"end_ts": missing.end_ts.isoformat(),
|
||||
"start_ts": missing.start_ts.isoformat(),
|
||||
"pwd": missing.pwd,
|
||||
"output": missing.output,
|
||||
"schema": "ArchiveResult",
|
||||
"status": missing.status,
|
||||
},
|
||||
)
|
||||
|
||||
json_index = to_json(index)
|
||||
with open(out_dir / "index.json", "w") as f:
|
||||
@@ -103,25 +115,47 @@ def reverse_func(apps, schema_editor):
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0006_auto_20201012_1520'),
|
||||
("core", "0006_auto_20201012_1520"),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.CreateModel(
|
||||
name='ArchiveResult',
|
||||
name="ArchiveResult",
|
||||
fields=[
|
||||
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
|
||||
('cmd', JSONField()),
|
||||
('pwd', models.CharField(max_length=256)),
|
||||
('cmd_version', models.CharField(max_length=32)),
|
||||
('status', models.CharField(choices=[('succeeded', 'succeeded'), ('failed', 'failed'), ('skipped', 'skipped')], max_length=16)),
|
||||
('output', models.CharField(max_length=512)),
|
||||
('start_ts', models.DateTimeField()),
|
||||
('end_ts', models.DateTimeField()),
|
||||
('extractor', models.CharField(choices=[('title', 'title'), ('favicon', 'favicon'), ('wget', 'wget'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('headers', 'headers'), ('archivedotorg', 'archivedotorg')], max_length=32)),
|
||||
('snapshot', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='core.Snapshot')),
|
||||
("id", models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")),
|
||||
("cmd", JSONField()),
|
||||
("pwd", models.CharField(max_length=256)),
|
||||
("cmd_version", models.CharField(max_length=32)),
|
||||
(
|
||||
"status",
|
||||
models.CharField(choices=[("succeeded", "succeeded"), ("failed", "failed"), ("skipped", "skipped")], max_length=16),
|
||||
),
|
||||
("output", models.CharField(max_length=512)),
|
||||
("start_ts", models.DateTimeField()),
|
||||
("end_ts", models.DateTimeField()),
|
||||
(
|
||||
"extractor",
|
||||
models.CharField(
|
||||
choices=[
|
||||
("title", "title"),
|
||||
("favicon", "favicon"),
|
||||
("wget", "wget"),
|
||||
("singlefile", "singlefile"),
|
||||
("pdf", "pdf"),
|
||||
("screenshot", "screenshot"),
|
||||
("dom", "dom"),
|
||||
("readability", "readability"),
|
||||
("mercury", "mercury"),
|
||||
("git", "git"),
|
||||
("media", "media"),
|
||||
("headers", "headers"),
|
||||
("archivedotorg", "archivedotorg"),
|
||||
],
|
||||
max_length=32,
|
||||
),
|
||||
),
|
||||
("snapshot", models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to="core.Snapshot")),
|
||||
],
|
||||
),
|
||||
migrations.RunPython(forwards_func, reverse_func),
|
||||
|
||||
@@ -4,15 +4,14 @@ from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0007_archiveresult'),
|
||||
("core", "0007_archiveresult"),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='cmd_version',
|
||||
model_name="archiveresult",
|
||||
name="cmd_version",
|
||||
field=models.CharField(blank=True, default=None, max_length=32, null=True),
|
||||
),
|
||||
]
|
||||
|
||||
@@ -4,15 +4,14 @@ from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0008_auto_20210105_1421'),
|
||||
("core", "0008_auto_20210105_1421"),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='updated',
|
||||
model_name="snapshot",
|
||||
name="updated",
|
||||
field=models.DateTimeField(auto_now=True, db_index=True, null=True),
|
||||
),
|
||||
]
|
||||
|
||||
@@ -4,15 +4,14 @@ from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0009_auto_20210216_1038'),
|
||||
("core", "0009_auto_20210216_1038"),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='start_ts',
|
||||
model_name="archiveresult",
|
||||
name="start_ts",
|
||||
field=models.DateTimeField(db_index=True),
|
||||
),
|
||||
]
|
||||
|
||||
@@ -5,20 +5,36 @@ import uuid
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0010_auto_20210216_1055'),
|
||||
("core", "0010_auto_20210216_1055"),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='uuid',
|
||||
model_name="archiveresult",
|
||||
name="uuid",
|
||||
field=models.UUIDField(default=uuid.uuid4, editable=False),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='extractor',
|
||||
field=models.CharField(choices=[('title', 'title'), ('favicon', 'favicon'), ('headers', 'headers'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('wget', 'wget'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('archivedotorg', 'archivedotorg')], max_length=32),
|
||||
model_name="archiveresult",
|
||||
name="extractor",
|
||||
field=models.CharField(
|
||||
choices=[
|
||||
("title", "title"),
|
||||
("favicon", "favicon"),
|
||||
("headers", "headers"),
|
||||
("singlefile", "singlefile"),
|
||||
("pdf", "pdf"),
|
||||
("screenshot", "screenshot"),
|
||||
("dom", "dom"),
|
||||
("wget", "wget"),
|
||||
("readability", "readability"),
|
||||
("mercury", "mercury"),
|
||||
("git", "git"),
|
||||
("media", "media"),
|
||||
("archivedotorg", "archivedotorg"),
|
||||
],
|
||||
max_length=32,
|
||||
),
|
||||
),
|
||||
]
|
||||
|
||||
@@ -4,20 +4,19 @@ from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0011_auto_20210216_1331'),
|
||||
("core", "0011_auto_20210216_1331"),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='cmd_version',
|
||||
model_name="archiveresult",
|
||||
name="cmd_version",
|
||||
field=models.CharField(blank=True, default=None, max_length=128, null=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='output',
|
||||
model_name="archiveresult",
|
||||
name="output",
|
||||
field=models.CharField(max_length=1024),
|
||||
),
|
||||
]
|
||||
|
||||
@@ -4,15 +4,14 @@ from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0012_auto_20210216_1425'),
|
||||
("core", "0012_auto_20210216_1425"),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='title',
|
||||
model_name="snapshot",
|
||||
name="title",
|
||||
field=models.CharField(blank=True, db_index=True, max_length=256, null=True),
|
||||
),
|
||||
]
|
||||
|
||||
@@ -4,15 +4,14 @@ from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0013_auto_20210218_0729'),
|
||||
("core", "0013_auto_20210218_0729"),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='title',
|
||||
model_name="snapshot",
|
||||
name="title",
|
||||
field=models.CharField(blank=True, db_index=True, max_length=1024, null=True),
|
||||
),
|
||||
]
|
||||
|
||||
@@ -4,15 +4,14 @@ from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0014_auto_20210218_0729'),
|
||||
("core", "0014_auto_20210218_0729"),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='title',
|
||||
model_name="snapshot",
|
||||
name="title",
|
||||
field=models.CharField(blank=True, db_index=True, max_length=512, null=True),
|
||||
),
|
||||
]
|
||||
|
||||
@@ -4,15 +4,14 @@ from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0015_auto_20210218_0730'),
|
||||
("core", "0015_auto_20210218_0730"),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='tags',
|
||||
field=models.ManyToManyField(blank=True, to='core.Tag'),
|
||||
model_name="snapshot",
|
||||
name="tags",
|
||||
field=models.ManyToManyField(blank=True, to="core.Tag"),
|
||||
),
|
||||
]
|
||||
|
||||
@@ -4,15 +4,14 @@ from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0016_auto_20210218_1204'),
|
||||
("core", "0016_auto_20210218_1204"),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='tag',
|
||||
name='slug',
|
||||
field=models.SlugField(blank=True, max_length=100, unique=True, verbose_name='slug'),
|
||||
model_name="tag",
|
||||
name="slug",
|
||||
field=models.SlugField(blank=True, max_length=100, unique=True, verbose_name="slug"),
|
||||
),
|
||||
]
|
||||
|
||||
@@ -4,20 +4,19 @@ from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0017_auto_20210219_0211'),
|
||||
("core", "0017_auto_20210219_0211"),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='tag',
|
||||
name='name',
|
||||
model_name="tag",
|
||||
name="name",
|
||||
field=models.CharField(max_length=100, unique=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='tag',
|
||||
name='slug',
|
||||
model_name="tag",
|
||||
name="slug",
|
||||
field=models.SlugField(blank=True, max_length=100, unique=True),
|
||||
),
|
||||
]
|
||||
|
||||
@@ -4,15 +4,14 @@ from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0018_auto_20210327_0952'),
|
||||
("core", "0018_auto_20210327_0952"),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='url',
|
||||
model_name="snapshot",
|
||||
name="url",
|
||||
field=models.URLField(db_index=True, unique=True),
|
||||
),
|
||||
]
|
||||
|
||||
@@ -4,20 +4,19 @@ from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0019_auto_20210401_0654'),
|
||||
("core", "0019_auto_20210401_0654"),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='id',
|
||||
field=models.AutoField(primary_key=True, serialize=False, verbose_name='ID'),
|
||||
model_name="archiveresult",
|
||||
name="id",
|
||||
field=models.AutoField(primary_key=True, serialize=False, verbose_name="ID"),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='tag',
|
||||
name='id',
|
||||
field=models.AutoField(primary_key=True, serialize=False, verbose_name='ID'),
|
||||
model_name="tag",
|
||||
name="id",
|
||||
field=models.AutoField(primary_key=True, serialize=False, verbose_name="ID"),
|
||||
),
|
||||
]
|
||||
|
||||
@@ -4,15 +4,31 @@ from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0020_auto_20210410_1031'),
|
||||
("core", "0020_auto_20210410_1031"),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='extractor',
|
||||
field=models.CharField(choices=[('favicon', 'favicon'), ('headers', 'headers'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('wget', 'wget'), ('title', 'title'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('archivedotorg', 'archivedotorg')], max_length=32),
|
||||
model_name="archiveresult",
|
||||
name="extractor",
|
||||
field=models.CharField(
|
||||
choices=[
|
||||
("favicon", "favicon"),
|
||||
("headers", "headers"),
|
||||
("singlefile", "singlefile"),
|
||||
("pdf", "pdf"),
|
||||
("screenshot", "screenshot"),
|
||||
("dom", "dom"),
|
||||
("wget", "wget"),
|
||||
("title", "title"),
|
||||
("readability", "readability"),
|
||||
("mercury", "mercury"),
|
||||
("git", "git"),
|
||||
("media", "media"),
|
||||
("archivedotorg", "archivedotorg"),
|
||||
],
|
||||
max_length=32,
|
||||
),
|
||||
),
|
||||
]
|
||||
|
||||
@@ -4,15 +4,32 @@ from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0021_auto_20220914_0934'),
|
||||
("core", "0021_auto_20220914_0934"),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='extractor',
|
||||
field=models.CharField(choices=[('favicon', 'favicon'), ('headers', 'headers'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('wget', 'wget'), ('title', 'title'), ('readability', 'readability'), ('mercury', 'mercury'), ('htmltotext', 'htmltotext'), ('git', 'git'), ('media', 'media'), ('archivedotorg', 'archivedotorg')], max_length=32),
|
||||
model_name="archiveresult",
|
||||
name="extractor",
|
||||
field=models.CharField(
|
||||
choices=[
|
||||
("favicon", "favicon"),
|
||||
("headers", "headers"),
|
||||
("singlefile", "singlefile"),
|
||||
("pdf", "pdf"),
|
||||
("screenshot", "screenshot"),
|
||||
("dom", "dom"),
|
||||
("wget", "wget"),
|
||||
("title", "title"),
|
||||
("readability", "readability"),
|
||||
("mercury", "mercury"),
|
||||
("htmltotext", "htmltotext"),
|
||||
("git", "git"),
|
||||
("media", "media"),
|
||||
("archivedotorg", "archivedotorg"),
|
||||
],
|
||||
max_length=32,
|
||||
),
|
||||
),
|
||||
]
|
||||
|
||||
@@ -16,6 +16,7 @@ def get_table_columns(table_name):
|
||||
def upgrade_core_tables(apps, schema_editor):
|
||||
"""Upgrade core tables from v0.7.2 or v0.8.6rc0 to v0.9.0."""
|
||||
from archivebox.uuid_compat import uuid7
|
||||
|
||||
cursor = connection.cursor()
|
||||
|
||||
# Check if core_archiveresult table exists
|
||||
@@ -30,11 +31,11 @@ def upgrade_core_tables(apps, schema_editor):
|
||||
has_data = row_count > 0
|
||||
|
||||
# Detect which version we're migrating from
|
||||
archiveresult_cols = get_table_columns('core_archiveresult')
|
||||
has_uuid = 'uuid' in archiveresult_cols
|
||||
has_abid = 'abid' in archiveresult_cols
|
||||
archiveresult_cols = get_table_columns("core_archiveresult")
|
||||
has_uuid = "uuid" in archiveresult_cols
|
||||
has_abid = "abid" in archiveresult_cols
|
||||
|
||||
print(f'DEBUG: ArchiveResult row_count={row_count}, has_data={has_data}, has_uuid={has_uuid}, has_abid={has_abid}')
|
||||
print(f"DEBUG: ArchiveResult row_count={row_count}, has_data={has_data}, has_uuid={has_uuid}, has_abid={has_abid}")
|
||||
|
||||
# ============================================================================
|
||||
# PART 1: Upgrade core_archiveresult table
|
||||
@@ -62,7 +63,7 @@ def upgrade_core_tables(apps, schema_editor):
|
||||
if has_data:
|
||||
if has_uuid and not has_abid:
|
||||
# Migrating from v0.7.2+ (has uuid column)
|
||||
print('Migrating ArchiveResult from v0.7.2+ schema (with uuid)...')
|
||||
print("Migrating ArchiveResult from v0.7.2+ schema (with uuid)...")
|
||||
cursor.execute("""
|
||||
INSERT OR IGNORE INTO core_archiveresult_new (
|
||||
id, uuid, snapshot_id, cmd, pwd, cmd_version,
|
||||
@@ -75,7 +76,7 @@ def upgrade_core_tables(apps, schema_editor):
|
||||
""")
|
||||
elif has_abid and not has_uuid:
|
||||
# Migrating from v0.8.6rc0 (has abid instead of uuid)
|
||||
print('Migrating ArchiveResult from v0.8.6rc0 schema...')
|
||||
print("Migrating ArchiveResult from v0.8.6rc0 schema...")
|
||||
cursor.execute("""
|
||||
INSERT OR IGNORE INTO core_archiveresult_new (
|
||||
id, uuid, snapshot_id, cmd, pwd, cmd_version,
|
||||
@@ -88,17 +89,34 @@ def upgrade_core_tables(apps, schema_editor):
|
||||
""")
|
||||
else:
|
||||
# Migrating from v0.7.2 (no uuid or abid column - generate fresh UUIDs)
|
||||
print('Migrating ArchiveResult from v0.7.2 schema (no uuid - generating UUIDs)...')
|
||||
cursor.execute("SELECT id, snapshot_id, cmd, pwd, cmd_version, start_ts, end_ts, status, extractor, output FROM core_archiveresult")
|
||||
print("Migrating ArchiveResult from v0.7.2 schema (no uuid - generating UUIDs)...")
|
||||
cursor.execute(
|
||||
"SELECT id, snapshot_id, cmd, pwd, cmd_version, start_ts, end_ts, status, extractor, output FROM core_archiveresult",
|
||||
)
|
||||
old_records = cursor.fetchall()
|
||||
for record in old_records:
|
||||
new_uuid = uuid7().hex
|
||||
cursor.execute("""
|
||||
cursor.execute(
|
||||
"""
|
||||
INSERT OR IGNORE INTO core_archiveresult_new (
|
||||
id, uuid, snapshot_id, cmd, pwd, cmd_version,
|
||||
start_ts, end_ts, status, extractor, output
|
||||
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
""", (record[0], new_uuid, record[1], record[2], record[3], record[4], record[5], record[6], record[7], record[8], record[9]))
|
||||
""",
|
||||
(
|
||||
record[0],
|
||||
new_uuid,
|
||||
record[1],
|
||||
record[2],
|
||||
record[3],
|
||||
record[4],
|
||||
record[5],
|
||||
record[6],
|
||||
record[7],
|
||||
record[8],
|
||||
record[9],
|
||||
),
|
||||
)
|
||||
|
||||
cursor.execute("DROP TABLE IF EXISTS core_archiveresult;")
|
||||
cursor.execute("ALTER TABLE core_archiveresult_new RENAME TO core_archiveresult;")
|
||||
@@ -149,13 +167,13 @@ def upgrade_core_tables(apps, schema_editor):
|
||||
|
||||
if snapshot_has_data:
|
||||
# Detect which version we're migrating from
|
||||
snapshot_cols = get_table_columns('core_snapshot')
|
||||
has_added = 'added' in snapshot_cols
|
||||
has_bookmarked_at = 'bookmarked_at' in snapshot_cols
|
||||
snapshot_cols = get_table_columns("core_snapshot")
|
||||
has_added = "added" in snapshot_cols
|
||||
has_bookmarked_at = "bookmarked_at" in snapshot_cols
|
||||
|
||||
if has_added and not has_bookmarked_at:
|
||||
# Migrating from v0.7.2 (has added/updated fields)
|
||||
print('Migrating Snapshot from v0.7.2 schema...')
|
||||
print("Migrating Snapshot from v0.7.2 schema...")
|
||||
# Transform added→bookmarked_at/created_at and updated→modified_at
|
||||
cursor.execute("""
|
||||
INSERT OR IGNORE INTO core_snapshot_new (
|
||||
@@ -173,28 +191,28 @@ def upgrade_core_tables(apps, schema_editor):
|
||||
""")
|
||||
elif has_bookmarked_at and not has_added:
|
||||
# Migrating from v0.8.6rc0 (already has bookmarked_at/created_at/modified_at)
|
||||
print('Migrating Snapshot from v0.8.6rc0 schema...')
|
||||
print("Migrating Snapshot from v0.8.6rc0 schema...")
|
||||
# Check what fields exist
|
||||
has_status = 'status' in snapshot_cols
|
||||
has_retry_at = 'retry_at' in snapshot_cols
|
||||
has_crawl_id = 'crawl_id' in snapshot_cols
|
||||
has_status = "status" in snapshot_cols
|
||||
has_retry_at = "retry_at" in snapshot_cols
|
||||
has_crawl_id = "crawl_id" in snapshot_cols
|
||||
|
||||
# Build column list based on what exists
|
||||
cols = ['id', 'url', 'timestamp', 'title', 'bookmarked_at', 'created_at', 'modified_at', 'downloaded_at']
|
||||
cols = ["id", "url", "timestamp", "title", "bookmarked_at", "created_at", "modified_at", "downloaded_at"]
|
||||
if has_crawl_id:
|
||||
cols.append('crawl_id')
|
||||
cols.append("crawl_id")
|
||||
if has_status:
|
||||
cols.append('status')
|
||||
cols.append("status")
|
||||
if has_retry_at:
|
||||
cols.append('retry_at')
|
||||
cols.append("retry_at")
|
||||
|
||||
cursor.execute(f"""
|
||||
INSERT OR IGNORE INTO core_snapshot_new ({', '.join(cols)})
|
||||
SELECT {', '.join(cols)}
|
||||
INSERT OR IGNORE INTO core_snapshot_new ({", ".join(cols)})
|
||||
SELECT {", ".join(cols)}
|
||||
FROM core_snapshot;
|
||||
""")
|
||||
else:
|
||||
print(f'Warning: Unexpected Snapshot schema - has_added={has_added}, has_bookmarked_at={has_bookmarked_at}')
|
||||
print(f"Warning: Unexpected Snapshot schema - has_added={has_added}, has_bookmarked_at={has_bookmarked_at}")
|
||||
|
||||
cursor.execute("DROP TABLE IF EXISTS core_snapshot;")
|
||||
cursor.execute("ALTER TABLE core_snapshot_new RENAME TO core_snapshot;")
|
||||
@@ -237,13 +255,13 @@ def upgrade_core_tables(apps, schema_editor):
|
||||
cursor.execute("PRAGMA table_info(core_tag)")
|
||||
tag_id_type = None
|
||||
for row in cursor.fetchall():
|
||||
if row[1] == 'id': # row[1] is column name
|
||||
if row[1] == "id": # row[1] is column name
|
||||
tag_id_type = row[2] # row[2] is type
|
||||
break
|
||||
|
||||
if tag_id_type and 'char' in tag_id_type.lower():
|
||||
if tag_id_type and "char" in tag_id_type.lower():
|
||||
# v0.8.6rc0: Tag IDs are UUIDs, need to convert to INTEGER
|
||||
print('Converting Tag IDs from UUID to INTEGER...')
|
||||
print("Converting Tag IDs from UUID to INTEGER...")
|
||||
|
||||
# Get all tags with their UUIDs
|
||||
cursor.execute("SELECT id, name, slug, created_at, modified_at, created_by_id FROM core_tag ORDER BY name")
|
||||
@@ -255,10 +273,13 @@ def upgrade_core_tables(apps, schema_editor):
|
||||
old_id, name, slug, created_at, modified_at, created_by_id = tag
|
||||
uuid_to_int_map[old_id] = i
|
||||
# Insert with new INTEGER ID
|
||||
cursor.execute("""
|
||||
cursor.execute(
|
||||
"""
|
||||
INSERT OR IGNORE INTO core_tag_new (id, name, slug, created_at, modified_at, created_by_id)
|
||||
VALUES (?, ?, ?, ?, ?, ?)
|
||||
""", (i, name, slug, created_at, modified_at, created_by_id))
|
||||
""",
|
||||
(i, name, slug, created_at, modified_at, created_by_id),
|
||||
)
|
||||
|
||||
# Update snapshot_tags to use new INTEGER IDs
|
||||
cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='core_snapshot_tags'")
|
||||
@@ -273,13 +294,16 @@ def upgrade_core_tables(apps, schema_editor):
|
||||
for st_id, snapshot_id, old_tag_id in snapshot_tags:
|
||||
new_tag_id = uuid_to_int_map.get(old_tag_id)
|
||||
if new_tag_id:
|
||||
cursor.execute("""
|
||||
cursor.execute(
|
||||
"""
|
||||
INSERT OR IGNORE INTO core_snapshot_tags (id, snapshot_id, tag_id)
|
||||
VALUES (?, ?, ?)
|
||||
""", (st_id, snapshot_id, new_tag_id))
|
||||
""",
|
||||
(st_id, snapshot_id, new_tag_id),
|
||||
)
|
||||
else:
|
||||
# v0.7.2: Tag IDs are already INTEGER
|
||||
print('Migrating Tag from v0.7.2 schema...')
|
||||
print("Migrating Tag from v0.7.2 schema...")
|
||||
cursor.execute("""
|
||||
INSERT OR IGNORE INTO core_tag_new (id, name, slug)
|
||||
SELECT id, name, slug
|
||||
@@ -294,15 +318,14 @@ def upgrade_core_tables(apps, schema_editor):
|
||||
cursor.execute("CREATE INDEX IF NOT EXISTS core_tag_created_by_id_idx ON core_tag(created_by_id);")
|
||||
|
||||
if has_data:
|
||||
print('✓ Core tables upgraded to v0.9.0')
|
||||
print("✓ Core tables upgraded to v0.9.0")
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0022_auto_20231023_2008'),
|
||||
('crawls', '0001_initial'),
|
||||
('auth', '0012_alter_user_first_name_max_length'),
|
||||
("core", "0022_auto_20231023_2008"),
|
||||
("crawls", "0001_initial"),
|
||||
("auth", "0012_alter_user_first_name_max_length"),
|
||||
]
|
||||
|
||||
operations = [
|
||||
@@ -317,60 +340,58 @@ class Migration(migrations.Migration):
|
||||
# NOTE: We do NOT remove extractor/output for ArchiveResult!
|
||||
# They are still in the database and will be removed by migration 0025
|
||||
# after copying their data to plugin/output_str.
|
||||
|
||||
# However, for Snapshot, we DO remove added/updated and ADD the new timestamp fields
|
||||
# because the SQL above already transformed them.
|
||||
migrations.RemoveField(model_name='snapshot', name='added'),
|
||||
migrations.RemoveField(model_name='snapshot', name='updated'),
|
||||
migrations.RemoveField(model_name="snapshot", name="added"),
|
||||
migrations.RemoveField(model_name="snapshot", name="updated"),
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='bookmarked_at',
|
||||
model_name="snapshot",
|
||||
name="bookmarked_at",
|
||||
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='created_at',
|
||||
model_name="snapshot",
|
||||
name="created_at",
|
||||
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='modified_at',
|
||||
model_name="snapshot",
|
||||
name="modified_at",
|
||||
field=models.DateTimeField(auto_now=True),
|
||||
),
|
||||
# Declare fs_version (already created in database with DEFAULT '0.8.0')
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='fs_version',
|
||||
model_name="snapshot",
|
||||
name="fs_version",
|
||||
field=models.CharField(
|
||||
max_length=10,
|
||||
default='0.8.0',
|
||||
help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().'
|
||||
default="0.8.0",
|
||||
help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().',
|
||||
),
|
||||
),
|
||||
|
||||
# SnapshotTag table already exists from v0.7.2, just declare it in state
|
||||
migrations.CreateModel(
|
||||
name='SnapshotTag',
|
||||
name="SnapshotTag",
|
||||
fields=[
|
||||
('id', models.AutoField(primary_key=True, serialize=False)),
|
||||
('snapshot', models.ForeignKey(to='core.Snapshot', db_column='snapshot_id', on_delete=models.CASCADE)),
|
||||
('tag', models.ForeignKey(to='core.Tag', db_column='tag_id', on_delete=models.CASCADE)),
|
||||
("id", models.AutoField(primary_key=True, serialize=False)),
|
||||
("snapshot", models.ForeignKey(to="core.Snapshot", db_column="snapshot_id", on_delete=models.CASCADE)),
|
||||
("tag", models.ForeignKey(to="core.Tag", db_column="tag_id", on_delete=models.CASCADE)),
|
||||
],
|
||||
options={
|
||||
'db_table': 'core_snapshot_tags',
|
||||
'unique_together': {('snapshot', 'tag')},
|
||||
"db_table": "core_snapshot_tags",
|
||||
"unique_together": {("snapshot", "tag")},
|
||||
},
|
||||
),
|
||||
# Declare that Snapshot.tags M2M already uses through=SnapshotTag (from v0.7.2)
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='tags',
|
||||
model_name="snapshot",
|
||||
name="tags",
|
||||
field=models.ManyToManyField(
|
||||
'Tag',
|
||||
"Tag",
|
||||
blank=True,
|
||||
related_name='snapshot_set',
|
||||
through='SnapshotTag',
|
||||
through_fields=('snapshot', 'tag'),
|
||||
related_name="snapshot_set",
|
||||
through="SnapshotTag",
|
||||
through_fields=("snapshot", "tag"),
|
||||
),
|
||||
),
|
||||
],
|
||||
|
||||
@@ -20,23 +20,27 @@ def create_default_crawl_and_assign_snapshots(apps, schema_editor):
|
||||
snapshots_without_crawl = cursor.fetchone()[0]
|
||||
|
||||
if snapshots_without_crawl == 0:
|
||||
print('✓ Fresh install or all snapshots already have crawls')
|
||||
print("✓ Fresh install or all snapshots already have crawls")
|
||||
return
|
||||
|
||||
# Get or create system user (pk=1)
|
||||
cursor.execute("SELECT id FROM auth_user WHERE id = 1")
|
||||
if not cursor.fetchone():
|
||||
cursor.execute("""
|
||||
cursor.execute(
|
||||
"""
|
||||
INSERT INTO auth_user (id, password, is_superuser, username, first_name, last_name, email, is_staff, is_active, date_joined)
|
||||
VALUES (1, '!', 1, 'system', '', '', '', 1, 1, ?)
|
||||
""", [datetime.now().isoformat()])
|
||||
""",
|
||||
[datetime.now().isoformat()],
|
||||
)
|
||||
|
||||
# Create a default crawl for migrated snapshots
|
||||
# At this point crawls_crawl is guaranteed to have v0.9.0 schema (crawls/0002 ran first)
|
||||
crawl_id = str(uuid_lib.uuid4())
|
||||
now = datetime.now().isoformat()
|
||||
|
||||
cursor.execute("""
|
||||
cursor.execute(
|
||||
"""
|
||||
INSERT INTO crawls_crawl (
|
||||
id, created_at, modified_at, num_uses_succeeded, num_uses_failed,
|
||||
urls, max_depth, tags_str, label, notes, output_dir,
|
||||
@@ -44,20 +48,21 @@ def create_default_crawl_and_assign_snapshots(apps, schema_editor):
|
||||
) VALUES (?, ?, ?, 0, 0, '', 0, '', 'Migrated from v0.7.2/v0.8.6',
|
||||
'Auto-created crawl for migrated snapshots', '',
|
||||
'sealed', ?, 1, NULL, '{}', NULL)
|
||||
""", [crawl_id, now, now, now])
|
||||
""",
|
||||
[crawl_id, now, now, now],
|
||||
)
|
||||
|
||||
# Assign all snapshots without a crawl to the default crawl
|
||||
cursor.execute("UPDATE core_snapshot SET crawl_id = ? WHERE crawl_id IS NULL", [crawl_id])
|
||||
|
||||
print(f'✓ Assigned {snapshots_without_crawl} snapshots to default crawl {crawl_id}')
|
||||
print(f"✓ Assigned {snapshots_without_crawl} snapshots to default crawl {crawl_id}")
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0023_upgrade_to_0_9_0'),
|
||||
('crawls', '0002_upgrade_from_0_8_6'),
|
||||
('auth', '0012_alter_user_first_name_max_length'),
|
||||
("core", "0023_upgrade_to_0_9_0"),
|
||||
("crawls", "0002_upgrade_from_0_8_6"),
|
||||
("auth", "0012_alter_user_first_name_max_length"),
|
||||
]
|
||||
|
||||
operations = [
|
||||
@@ -137,12 +142,12 @@ class Migration(migrations.Migration):
|
||||
],
|
||||
state_operations=[
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='crawl',
|
||||
model_name="snapshot",
|
||||
name="crawl",
|
||||
field=models.ForeignKey(
|
||||
on_delete=models.deletion.CASCADE,
|
||||
to='crawls.crawl',
|
||||
help_text='Crawl that created this snapshot'
|
||||
to="crawls.crawl",
|
||||
help_text="Crawl that created this snapshot",
|
||||
),
|
||||
),
|
||||
],
|
||||
|
||||
@@ -17,20 +17,24 @@ def copy_old_fields_to_new(apps, schema_editor):
|
||||
cursor.execute("PRAGMA table_info(core_archiveresult)")
|
||||
cols = {row[1] for row in cursor.fetchall()}
|
||||
|
||||
if 'extractor' in cols and 'plugin' in cols:
|
||||
if "extractor" in cols and "plugin" in cols:
|
||||
# Copy extractor -> plugin
|
||||
cursor.execute("UPDATE core_archiveresult SET plugin = COALESCE(extractor, '') WHERE plugin = '' OR plugin IS NULL")
|
||||
|
||||
if 'output' in cols and 'output_str' in cols:
|
||||
if "output" in cols and "output_str" in cols:
|
||||
# Copy output -> output_str
|
||||
cursor.execute("UPDATE core_archiveresult SET output_str = COALESCE(output, '') WHERE output_str = '' OR output_str IS NULL")
|
||||
|
||||
# Copy timestamps to new timestamp fields if they don't have values yet
|
||||
if 'start_ts' in cols and 'created_at' in cols:
|
||||
cursor.execute("UPDATE core_archiveresult SET created_at = COALESCE(start_ts, CURRENT_TIMESTAMP) WHERE created_at IS NULL OR created_at = ''")
|
||||
if "start_ts" in cols and "created_at" in cols:
|
||||
cursor.execute(
|
||||
"UPDATE core_archiveresult SET created_at = COALESCE(start_ts, CURRENT_TIMESTAMP) WHERE created_at IS NULL OR created_at = ''",
|
||||
)
|
||||
|
||||
if 'end_ts' in cols and 'modified_at' in cols:
|
||||
cursor.execute("UPDATE core_archiveresult SET modified_at = COALESCE(end_ts, start_ts, CURRENT_TIMESTAMP) WHERE modified_at IS NULL OR modified_at = ''")
|
||||
if "end_ts" in cols and "modified_at" in cols:
|
||||
cursor.execute(
|
||||
"UPDATE core_archiveresult SET modified_at = COALESCE(end_ts, start_ts, CURRENT_TIMESTAMP) WHERE modified_at IS NULL OR modified_at = ''",
|
||||
)
|
||||
|
||||
# NOTE: Snapshot timestamps (added→bookmarked_at, updated→modified_at) were already
|
||||
# transformed by migration 0023, so we don't need to copy them here.
|
||||
@@ -39,164 +43,191 @@ def copy_old_fields_to_new(apps, schema_editor):
|
||||
# Debug: Check Snapshot timestamps at end of RunPython
|
||||
cursor.execute("SELECT id, bookmarked_at, modified_at FROM core_snapshot LIMIT 2")
|
||||
snap_after = cursor.fetchall()
|
||||
print(f'DEBUG 0025: Snapshot timestamps at END of RunPython: {snap_after}')
|
||||
print(f"DEBUG 0025: Snapshot timestamps at END of RunPython: {snap_after}")
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0024_assign_default_crawl'),
|
||||
('crawls', '0001_initial'),
|
||||
("core", "0024_assign_default_crawl"),
|
||||
("crawls", "0001_initial"),
|
||||
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterModelOptions(
|
||||
name='archiveresult',
|
||||
options={'verbose_name': 'Archive Result', 'verbose_name_plural': 'Archive Results Log'},
|
||||
name="archiveresult",
|
||||
options={"verbose_name": "Archive Result", "verbose_name_plural": "Archive Results Log"},
|
||||
),
|
||||
migrations.AlterModelOptions(
|
||||
name='snapshot',
|
||||
options={'verbose_name': 'Snapshot', 'verbose_name_plural': 'Snapshots'},
|
||||
name="snapshot",
|
||||
options={"verbose_name": "Snapshot", "verbose_name_plural": "Snapshots"},
|
||||
),
|
||||
# NOTE: RemoveField for cmd, cmd_version, pwd moved to migration 0027
|
||||
# to allow data migration to Process records first
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='config',
|
||||
model_name="archiveresult",
|
||||
name="config",
|
||||
field=models.JSONField(blank=True, default=dict, null=True),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='created_at',
|
||||
model_name="archiveresult",
|
||||
name="created_at",
|
||||
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='hook_name',
|
||||
field=models.CharField(blank=True, db_index=True, default='', help_text='Full filename of the hook that executed (e.g., on_Snapshot__50_wget.py)', max_length=255),
|
||||
model_name="archiveresult",
|
||||
name="hook_name",
|
||||
field=models.CharField(
|
||||
blank=True,
|
||||
db_index=True,
|
||||
default="",
|
||||
help_text="Full filename of the hook that executed (e.g., on_Snapshot__50_wget.py)",
|
||||
max_length=255,
|
||||
),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='modified_at',
|
||||
model_name="archiveresult",
|
||||
name="modified_at",
|
||||
field=models.DateTimeField(auto_now=True),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='notes',
|
||||
field=models.TextField(blank=True, default=''),
|
||||
model_name="archiveresult",
|
||||
name="notes",
|
||||
field=models.TextField(blank=True, default=""),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='num_uses_failed',
|
||||
model_name="archiveresult",
|
||||
name="num_uses_failed",
|
||||
field=models.PositiveIntegerField(default=0),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='num_uses_succeeded',
|
||||
model_name="archiveresult",
|
||||
name="num_uses_succeeded",
|
||||
field=models.PositiveIntegerField(default=0),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='output_files',
|
||||
field=models.JSONField(default=dict, help_text='Dict of {relative_path: {metadata}}'),
|
||||
model_name="archiveresult",
|
||||
name="output_files",
|
||||
field=models.JSONField(default=dict, help_text="Dict of {relative_path: {metadata}}"),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='output_json',
|
||||
field=models.JSONField(blank=True, default=None, help_text='Structured metadata (headers, redirects, etc.)', null=True),
|
||||
model_name="archiveresult",
|
||||
name="output_json",
|
||||
field=models.JSONField(blank=True, default=None, help_text="Structured metadata (headers, redirects, etc.)", null=True),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='output_mimetypes',
|
||||
field=models.CharField(blank=True, default='', help_text='CSV of mimetypes sorted by size', max_length=512),
|
||||
model_name="archiveresult",
|
||||
name="output_mimetypes",
|
||||
field=models.CharField(blank=True, default="", help_text="CSV of mimetypes sorted by size", max_length=512),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='output_size',
|
||||
field=models.BigIntegerField(default=0, help_text='Total bytes of all output files'),
|
||||
model_name="archiveresult",
|
||||
name="output_size",
|
||||
field=models.BigIntegerField(default=0, help_text="Total bytes of all output files"),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='output_str',
|
||||
field=models.TextField(blank=True, default='', help_text='Human-readable output summary'),
|
||||
model_name="archiveresult",
|
||||
name="output_str",
|
||||
field=models.TextField(blank=True, default="", help_text="Human-readable output summary"),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='plugin',
|
||||
field=models.CharField(db_index=True, default='', max_length=32),
|
||||
model_name="archiveresult",
|
||||
name="plugin",
|
||||
field=models.CharField(db_index=True, default="", max_length=32),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='retry_at',
|
||||
model_name="archiveresult",
|
||||
name="retry_at",
|
||||
field=models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, null=True),
|
||||
),
|
||||
# NOTE: bookmarked_at and created_at already added by migration 0023
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='config',
|
||||
model_name="snapshot",
|
||||
name="config",
|
||||
field=models.JSONField(default=dict),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='current_step',
|
||||
field=models.PositiveSmallIntegerField(db_index=True, default=0, help_text='Current hook step being executed (0-9). Used for sequential hook execution.'),
|
||||
model_name="snapshot",
|
||||
name="current_step",
|
||||
field=models.PositiveSmallIntegerField(
|
||||
db_index=True,
|
||||
default=0,
|
||||
help_text="Current hook step being executed (0-9). Used for sequential hook execution.",
|
||||
),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='depth',
|
||||
model_name="snapshot",
|
||||
name="depth",
|
||||
field=models.PositiveSmallIntegerField(db_index=True, default=0),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='downloaded_at',
|
||||
model_name="snapshot",
|
||||
name="downloaded_at",
|
||||
field=models.DateTimeField(blank=True, db_index=True, default=None, editable=False, null=True),
|
||||
),
|
||||
# NOTE: fs_version already added by migration 0023 with default='0.8.0'
|
||||
# NOTE: modified_at already added by migration 0023
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='notes',
|
||||
field=models.TextField(blank=True, default=''),
|
||||
model_name="snapshot",
|
||||
name="notes",
|
||||
field=models.TextField(blank=True, default=""),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='num_uses_failed',
|
||||
model_name="snapshot",
|
||||
name="num_uses_failed",
|
||||
field=models.PositiveIntegerField(default=0),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='num_uses_succeeded',
|
||||
model_name="snapshot",
|
||||
name="num_uses_succeeded",
|
||||
field=models.PositiveIntegerField(default=0),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='parent_snapshot',
|
||||
field=models.ForeignKey(blank=True, help_text='Parent snapshot that discovered this URL (for recursive crawling)', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='child_snapshots', to='core.snapshot'),
|
||||
model_name="snapshot",
|
||||
name="parent_snapshot",
|
||||
field=models.ForeignKey(
|
||||
blank=True,
|
||||
help_text="Parent snapshot that discovered this URL (for recursive crawling)",
|
||||
null=True,
|
||||
on_delete=django.db.models.deletion.SET_NULL,
|
||||
related_name="child_snapshots",
|
||||
to="core.snapshot",
|
||||
),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='retry_at',
|
||||
model_name="snapshot",
|
||||
name="retry_at",
|
||||
field=models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, null=True),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='status',
|
||||
field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('sealed', 'Sealed')], db_index=True, default='queued', max_length=15),
|
||||
model_name="snapshot",
|
||||
name="status",
|
||||
field=models.CharField(
|
||||
choices=[("queued", "Queued"), ("started", "Started"), ("sealed", "Sealed")],
|
||||
db_index=True,
|
||||
default="queued",
|
||||
max_length=15,
|
||||
),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='tag',
|
||||
name='created_at',
|
||||
model_name="tag",
|
||||
name="created_at",
|
||||
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now, null=True),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='tag',
|
||||
name='created_by',
|
||||
field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, null=True, on_delete=django.db.models.deletion.CASCADE, related_name='tag_set', to=settings.AUTH_USER_MODEL),
|
||||
model_name="tag",
|
||||
name="created_by",
|
||||
field=models.ForeignKey(
|
||||
default=archivebox.base_models.models.get_or_create_system_user_pk,
|
||||
null=True,
|
||||
on_delete=django.db.models.deletion.CASCADE,
|
||||
related_name="tag_set",
|
||||
to=settings.AUTH_USER_MODEL,
|
||||
),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='tag',
|
||||
name='modified_at',
|
||||
model_name="tag",
|
||||
name="modified_at",
|
||||
field=models.DateTimeField(auto_now=True),
|
||||
),
|
||||
# Copy data from old field names to new field names after AddField operations
|
||||
@@ -206,75 +237,93 @@ class Migration(migrations.Migration):
|
||||
),
|
||||
# Now remove the old ArchiveResult fields after data has been copied
|
||||
migrations.RemoveField(
|
||||
model_name='archiveresult',
|
||||
name='extractor',
|
||||
model_name="archiveresult",
|
||||
name="extractor",
|
||||
),
|
||||
migrations.RemoveField(
|
||||
model_name='archiveresult',
|
||||
name='output',
|
||||
model_name="archiveresult",
|
||||
name="output",
|
||||
),
|
||||
# NOTE: Snapshot's added/updated were already removed by migration 0023
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='end_ts',
|
||||
model_name="archiveresult",
|
||||
name="end_ts",
|
||||
field=models.DateTimeField(blank=True, default=None, null=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='id',
|
||||
model_name="archiveresult",
|
||||
name="id",
|
||||
field=models.AutoField(editable=False, primary_key=True, serialize=False),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='start_ts',
|
||||
model_name="archiveresult",
|
||||
name="start_ts",
|
||||
field=models.DateTimeField(blank=True, default=None, null=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='status',
|
||||
field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('backoff', 'Waiting to retry'), ('succeeded', 'Succeeded'), ('failed', 'Failed'), ('skipped', 'Skipped')], db_index=True, default='queued', max_length=15),
|
||||
model_name="archiveresult",
|
||||
name="status",
|
||||
field=models.CharField(
|
||||
choices=[
|
||||
("queued", "Queued"),
|
||||
("started", "Started"),
|
||||
("backoff", "Waiting to retry"),
|
||||
("succeeded", "Succeeded"),
|
||||
("failed", "Failed"),
|
||||
("skipped", "Skipped"),
|
||||
],
|
||||
db_index=True,
|
||||
default="queued",
|
||||
max_length=15,
|
||||
),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='uuid',
|
||||
model_name="archiveresult",
|
||||
name="uuid",
|
||||
field=models.UUIDField(blank=True, db_index=True, default=uuid7, null=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='crawl',
|
||||
field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to='crawls.crawl'),
|
||||
model_name="snapshot",
|
||||
name="crawl",
|
||||
field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name="snapshot_set", to="crawls.crawl"),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='id',
|
||||
model_name="snapshot",
|
||||
name="id",
|
||||
field=models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='tags',
|
||||
field=models.ManyToManyField(blank=True, related_name='snapshot_set', through='core.SnapshotTag', through_fields=('snapshot', 'tag'), to='core.tag'),
|
||||
model_name="snapshot",
|
||||
name="tags",
|
||||
field=models.ManyToManyField(
|
||||
blank=True,
|
||||
related_name="snapshot_set",
|
||||
through="core.SnapshotTag",
|
||||
through_fields=("snapshot", "tag"),
|
||||
to="core.tag",
|
||||
),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='timestamp',
|
||||
model_name="snapshot",
|
||||
name="timestamp",
|
||||
field=models.CharField(db_index=True, editable=False, max_length=32, unique=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='url',
|
||||
model_name="snapshot",
|
||||
name="url",
|
||||
field=models.URLField(db_index=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='tag',
|
||||
name='slug',
|
||||
model_name="tag",
|
||||
name="slug",
|
||||
field=models.SlugField(editable=False, max_length=100, unique=True),
|
||||
),
|
||||
migrations.AddConstraint(
|
||||
model_name='snapshot',
|
||||
constraint=models.UniqueConstraint(fields=('url', 'crawl'), name='unique_url_per_crawl'),
|
||||
model_name="snapshot",
|
||||
constraint=models.UniqueConstraint(fields=("url", "crawl"), name="unique_url_per_crawl"),
|
||||
),
|
||||
migrations.AddConstraint(
|
||||
model_name='snapshot',
|
||||
constraint=models.UniqueConstraint(fields=('timestamp',), name='unique_timestamp'),
|
||||
model_name="snapshot",
|
||||
constraint=models.UniqueConstraint(fields=("timestamp",), name="unique_timestamp"),
|
||||
),
|
||||
]
|
||||
|
||||
@@ -5,24 +5,30 @@ from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0025_alter_archiveresult_options_alter_snapshot_options_and_more'),
|
||||
('machine', '0007_add_process_type_and_parent'),
|
||||
("core", "0025_alter_archiveresult_options_alter_snapshot_options_and_more"),
|
||||
("machine", "0007_add_process_type_and_parent"),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.RemoveField(
|
||||
model_name='archiveresult',
|
||||
name='num_uses_failed',
|
||||
model_name="archiveresult",
|
||||
name="num_uses_failed",
|
||||
),
|
||||
migrations.RemoveField(
|
||||
model_name='archiveresult',
|
||||
name='num_uses_succeeded',
|
||||
model_name="archiveresult",
|
||||
name="num_uses_succeeded",
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='process',
|
||||
field=models.OneToOneField(blank=True, help_text='Process execution details for this archive result', null=True, on_delete=django.db.models.deletion.PROTECT, related_name='archiveresult', to='machine.process'),
|
||||
model_name="archiveresult",
|
||||
name="process",
|
||||
field=models.OneToOneField(
|
||||
blank=True,
|
||||
help_text="Process execution details for this archive result",
|
||||
null=True,
|
||||
on_delete=django.db.models.deletion.PROTECT,
|
||||
related_name="archiveresult",
|
||||
to="machine.process",
|
||||
),
|
||||
),
|
||||
]
|
||||
|
||||
@@ -25,7 +25,7 @@ def parse_cmd_field(cmd_raw):
|
||||
return []
|
||||
|
||||
# Try to parse as JSON first
|
||||
if cmd_raw.startswith('['):
|
||||
if cmd_raw.startswith("["):
|
||||
try:
|
||||
parsed = json.loads(cmd_raw)
|
||||
if isinstance(parsed, list):
|
||||
@@ -45,7 +45,7 @@ def get_or_create_current_machine(cursor):
|
||||
|
||||
# Simple machine detection - get hostname as guid
|
||||
hostname = socket.gethostname()
|
||||
guid = f'host_{hostname}' # Simple but stable identifier
|
||||
guid = f"host_{hostname}" # Simple but stable identifier
|
||||
|
||||
# Check if machine exists
|
||||
cursor.execute("SELECT id FROM machine_machine WHERE guid = ?", [guid])
|
||||
@@ -64,9 +64,10 @@ def get_or_create_current_machine(cursor):
|
||||
machine_cols = {row[1] for row in cursor.fetchall()}
|
||||
|
||||
# Build INSERT statement based on available columns
|
||||
if 'config' in machine_cols:
|
||||
if "config" in machine_cols:
|
||||
# 0.9.x schema with config column
|
||||
cursor.execute("""
|
||||
cursor.execute(
|
||||
"""
|
||||
INSERT INTO machine_machine (
|
||||
id, created_at, modified_at, guid, hostname,
|
||||
hw_in_docker, hw_in_vm, hw_manufacturer, hw_product, hw_uuid,
|
||||
@@ -74,10 +75,13 @@ def get_or_create_current_machine(cursor):
|
||||
stats, config, num_uses_failed, num_uses_succeeded
|
||||
) VALUES (?, ?, ?, ?, ?, 0, 0, '', '', '',
|
||||
'', '', '', '', '', '{}', '{}', 0, 0)
|
||||
""", [machine_id, now, now, guid, hostname])
|
||||
""",
|
||||
[machine_id, now, now, guid, hostname],
|
||||
)
|
||||
else:
|
||||
# 0.8.x schema without config column
|
||||
cursor.execute("""
|
||||
cursor.execute(
|
||||
"""
|
||||
INSERT INTO machine_machine (
|
||||
id, created_at, modified_at, guid, hostname,
|
||||
hw_in_docker, hw_in_vm, hw_manufacturer, hw_product, hw_uuid,
|
||||
@@ -85,7 +89,9 @@ def get_or_create_current_machine(cursor):
|
||||
stats, num_uses_failed, num_uses_succeeded
|
||||
) VALUES (?, ?, ?, ?, ?, 0, 0, '', '', '',
|
||||
'', '', '', '', '', '{}', 0, 0)
|
||||
""", [machine_id, now, now, guid, hostname])
|
||||
""",
|
||||
[machine_id, now, now, guid, hostname],
|
||||
)
|
||||
|
||||
return machine_id
|
||||
|
||||
@@ -108,15 +114,18 @@ def get_or_create_binary(cursor, machine_id, name, abspath, version):
|
||||
|
||||
# If abspath is just a name without slashes, it's not a full path
|
||||
# Store it in both fields for simplicity
|
||||
if '/' not in abspath:
|
||||
if "/" not in abspath:
|
||||
# Not a full path - store as-is
|
||||
pass
|
||||
|
||||
# Check if binary exists with same machine, name, abspath, version
|
||||
cursor.execute("""
|
||||
cursor.execute(
|
||||
"""
|
||||
SELECT id FROM machine_binary
|
||||
WHERE machine_id = ? AND name = ? AND abspath = ? AND version = ?
|
||||
""", [machine_id, name, abspath, version])
|
||||
""",
|
||||
[machine_id, name, abspath, version],
|
||||
)
|
||||
|
||||
row = cursor.fetchone()
|
||||
if row:
|
||||
@@ -134,9 +143,10 @@ def get_or_create_binary(cursor, machine_id, name, abspath, version):
|
||||
# Use only columns that exist in current schema
|
||||
# 0.8.x schema: id, created_at, modified_at, machine_id, name, binprovider, abspath, version, sha256, num_uses_failed, num_uses_succeeded
|
||||
# 0.9.x schema adds: binproviders, overrides, status, retry_at, output_dir
|
||||
if 'binproviders' in binary_cols:
|
||||
if "binproviders" in binary_cols:
|
||||
# 0.9.x schema
|
||||
cursor.execute("""
|
||||
cursor.execute(
|
||||
"""
|
||||
INSERT INTO machine_binary (
|
||||
id, created_at, modified_at, machine_id,
|
||||
name, binproviders, overrides, binprovider, abspath, version, sha256,
|
||||
@@ -144,16 +154,21 @@ def get_or_create_binary(cursor, machine_id, name, abspath, version):
|
||||
num_uses_failed, num_uses_succeeded
|
||||
) VALUES (?, ?, ?, ?, ?, 'env', '{}', 'env', ?, ?, '',
|
||||
'succeeded', NULL, '', 0, 0)
|
||||
""", [binary_id, now, now, machine_id, name, abspath, version])
|
||||
""",
|
||||
[binary_id, now, now, machine_id, name, abspath, version],
|
||||
)
|
||||
else:
|
||||
# 0.8.x schema (simpler)
|
||||
cursor.execute("""
|
||||
cursor.execute(
|
||||
"""
|
||||
INSERT INTO machine_binary (
|
||||
id, created_at, modified_at, machine_id,
|
||||
name, binprovider, abspath, version, sha256,
|
||||
num_uses_failed, num_uses_succeeded
|
||||
) VALUES (?, ?, ?, ?, ?, 'env', ?, ?, '', 0, 0)
|
||||
""", [binary_id, now, now, machine_id, name, abspath, version])
|
||||
""",
|
||||
[binary_id, now, now, machine_id, name, abspath, version],
|
||||
)
|
||||
|
||||
return binary_id
|
||||
|
||||
@@ -169,15 +184,15 @@ def map_status(old_status):
|
||||
(process_status, exit_code) tuple
|
||||
"""
|
||||
status_map = {
|
||||
'queued': ('queued', None),
|
||||
'started': ('running', None),
|
||||
'backoff': ('queued', None),
|
||||
'succeeded': ('exited', 0),
|
||||
'failed': ('exited', 1),
|
||||
'skipped': ('exited', None), # Skipped = exited without error
|
||||
"queued": ("queued", None),
|
||||
"started": ("running", None),
|
||||
"backoff": ("queued", None),
|
||||
"succeeded": ("exited", 0),
|
||||
"failed": ("exited", 1),
|
||||
"skipped": ("exited", None), # Skipped = exited without error
|
||||
}
|
||||
|
||||
return status_map.get(old_status, ('queued', None))
|
||||
return status_map.get(old_status, ("queued", None))
|
||||
|
||||
|
||||
def create_process(cursor, machine_id, pwd, cmd, status, exit_code, started_at, ended_at, binary_id):
|
||||
@@ -197,9 +212,10 @@ def create_process(cursor, machine_id, pwd, cmd, status, exit_code, started_at,
|
||||
cmd_json = json.dumps(cmd)
|
||||
|
||||
# Set retry_at to now for queued processes, NULL otherwise
|
||||
retry_at = now if status == 'queued' else None
|
||||
retry_at = now if status == "queued" else None
|
||||
|
||||
cursor.execute("""
|
||||
cursor.execute(
|
||||
"""
|
||||
INSERT INTO machine_process (
|
||||
id, created_at, modified_at, machine_id, parent_id, process_type,
|
||||
pwd, cmd, env, timeout,
|
||||
@@ -213,14 +229,22 @@ def create_process(cursor, machine_id, pwd, cmd, status, exit_code, started_at,
|
||||
?, ?,
|
||||
?, NULL, NULL,
|
||||
?, ?)
|
||||
""", [
|
||||
process_id, now, now, machine_id,
|
||||
pwd, cmd_json,
|
||||
exit_code,
|
||||
started_at, ended_at,
|
||||
binary_id,
|
||||
status, retry_at
|
||||
])
|
||||
""",
|
||||
[
|
||||
process_id,
|
||||
now,
|
||||
now,
|
||||
machine_id,
|
||||
pwd,
|
||||
cmd_json,
|
||||
exit_code,
|
||||
started_at,
|
||||
ended_at,
|
||||
binary_id,
|
||||
status,
|
||||
retry_at,
|
||||
],
|
||||
)
|
||||
|
||||
return process_id
|
||||
|
||||
@@ -250,16 +274,18 @@ def copy_archiveresult_data_to_process(apps, schema_editor):
|
||||
cursor.execute("PRAGMA table_info(core_archiveresult)")
|
||||
cols = {row[1] for row in cursor.fetchall()}
|
||||
|
||||
print(f'DEBUG 0027: Columns found: {sorted(cols)}')
|
||||
print(f'DEBUG 0027: Has cmd={("cmd" in cols)}, pwd={("pwd" in cols)}, cmd_version={("cmd_version" in cols)}, process_id={("process_id" in cols)}')
|
||||
print(f"DEBUG 0027: Columns found: {sorted(cols)}")
|
||||
print(
|
||||
f"DEBUG 0027: Has cmd={('cmd' in cols)}, pwd={('pwd' in cols)}, cmd_version={('cmd_version' in cols)}, process_id={('process_id' in cols)}",
|
||||
)
|
||||
|
||||
if 'cmd' not in cols or 'pwd' not in cols or 'cmd_version' not in cols:
|
||||
print('✓ Fresh install or fields already removed - skipping data copy')
|
||||
if "cmd" not in cols or "pwd" not in cols or "cmd_version" not in cols:
|
||||
print("✓ Fresh install or fields already removed - skipping data copy")
|
||||
return
|
||||
|
||||
# Check if process_id field exists (should exist from 0026)
|
||||
if 'process_id' not in cols:
|
||||
print('✗ ERROR: process_id field not found. Migration 0026 must run first.')
|
||||
if "process_id" not in cols:
|
||||
print("✗ ERROR: process_id field not found. Migration 0026 must run first.")
|
||||
return
|
||||
|
||||
# Get or create Machine.current()
|
||||
@@ -278,10 +304,10 @@ def copy_archiveresult_data_to_process(apps, schema_editor):
|
||||
results = cursor.fetchall()
|
||||
|
||||
if not results:
|
||||
print('✓ No ArchiveResults need Process migration')
|
||||
print("✓ No ArchiveResults need Process migration")
|
||||
return
|
||||
|
||||
print(f'Migrating {len(results)} ArchiveResults to Process records...')
|
||||
print(f"Migrating {len(results)} ArchiveResults to Process records...")
|
||||
|
||||
migrated_count = 0
|
||||
skipped_count = 0
|
||||
@@ -291,42 +317,46 @@ def copy_archiveresult_data_to_process(apps, schema_editor):
|
||||
ar_id, snapshot_id, plugin, cmd_raw, pwd, cmd_version, status, start_ts, end_ts, created_at = row
|
||||
|
||||
if i == 0:
|
||||
print(f'DEBUG 0027: First row: ar_id={ar_id}, plugin={plugin}, cmd={cmd_raw[:50] if cmd_raw else None}, status={status}')
|
||||
print(f"DEBUG 0027: First row: ar_id={ar_id}, plugin={plugin}, cmd={cmd_raw[:50] if cmd_raw else None}, status={status}")
|
||||
|
||||
try:
|
||||
# Parse cmd field
|
||||
cmd_array = parse_cmd_field(cmd_raw)
|
||||
|
||||
if i == 0:
|
||||
print(f'DEBUG 0027: Parsed cmd: {cmd_array}')
|
||||
print(f"DEBUG 0027: Parsed cmd: {cmd_array}")
|
||||
|
||||
# Extract binary info from cmd[0] if available
|
||||
binary_id = None
|
||||
if cmd_array and cmd_array[0]:
|
||||
binary_name = Path(cmd_array[0]).name or plugin # Fallback to plugin name
|
||||
binary_abspath = cmd_array[0]
|
||||
binary_version = cmd_version or ''
|
||||
binary_version = cmd_version or ""
|
||||
|
||||
# Get or create Binary record
|
||||
binary_id = get_or_create_binary(
|
||||
cursor, machine_id, binary_name, binary_abspath, binary_version
|
||||
cursor,
|
||||
machine_id,
|
||||
binary_name,
|
||||
binary_abspath,
|
||||
binary_version,
|
||||
)
|
||||
|
||||
if i == 0:
|
||||
print(f'DEBUG 0027: Created Binary: id={binary_id}, name={binary_name}')
|
||||
print(f"DEBUG 0027: Created Binary: id={binary_id}, name={binary_name}")
|
||||
|
||||
# Map status
|
||||
process_status, exit_code = map_status(status)
|
||||
|
||||
# Set timestamps
|
||||
started_at = start_ts or created_at
|
||||
ended_at = end_ts if process_status == 'exited' else None
|
||||
ended_at = end_ts if process_status == "exited" else None
|
||||
|
||||
# Create Process record
|
||||
process_id = create_process(
|
||||
cursor=cursor,
|
||||
machine_id=machine_id,
|
||||
pwd=pwd or '',
|
||||
pwd=pwd or "",
|
||||
cmd=cmd_array,
|
||||
status=process_status,
|
||||
exit_code=exit_code,
|
||||
@@ -336,34 +366,34 @@ def copy_archiveresult_data_to_process(apps, schema_editor):
|
||||
)
|
||||
|
||||
if i == 0:
|
||||
print(f'DEBUG 0027: Created Process: id={process_id}')
|
||||
print(f"DEBUG 0027: Created Process: id={process_id}")
|
||||
|
||||
# Link ArchiveResult to Process
|
||||
cursor.execute(
|
||||
"UPDATE core_archiveresult SET process_id = ? WHERE id = ?",
|
||||
[process_id, ar_id]
|
||||
[process_id, ar_id],
|
||||
)
|
||||
|
||||
migrated_count += 1
|
||||
|
||||
if i == 0:
|
||||
print('DEBUG 0027: Linked ArchiveResult to Process')
|
||||
print("DEBUG 0027: Linked ArchiveResult to Process")
|
||||
|
||||
except Exception as e:
|
||||
print(f'✗ Error migrating ArchiveResult {ar_id}: {e}')
|
||||
print(f"✗ Error migrating ArchiveResult {ar_id}: {e}")
|
||||
import traceback
|
||||
|
||||
traceback.print_exc()
|
||||
error_count += 1
|
||||
continue
|
||||
|
||||
print(f'✓ Migration complete: {migrated_count} migrated, {skipped_count} skipped, {error_count} errors')
|
||||
print(f"✓ Migration complete: {migrated_count} migrated, {skipped_count} skipped, {error_count} errors")
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0026_add_process_to_archiveresult'),
|
||||
('machine', '0007_add_process_type_and_parent'),
|
||||
("core", "0026_add_process_to_archiveresult"),
|
||||
("machine", "0007_add_process_type_and_parent"),
|
||||
]
|
||||
|
||||
operations = [
|
||||
@@ -372,18 +402,17 @@ class Migration(migrations.Migration):
|
||||
copy_archiveresult_data_to_process,
|
||||
reverse_code=migrations.RunPython.noop,
|
||||
),
|
||||
|
||||
# Now safe to remove old fields (moved from 0025)
|
||||
migrations.RemoveField(
|
||||
model_name='archiveresult',
|
||||
name='cmd',
|
||||
model_name="archiveresult",
|
||||
name="cmd",
|
||||
),
|
||||
migrations.RemoveField(
|
||||
model_name='archiveresult',
|
||||
name='cmd_version',
|
||||
model_name="archiveresult",
|
||||
name="cmd_version",
|
||||
),
|
||||
migrations.RemoveField(
|
||||
model_name='archiveresult',
|
||||
name='pwd',
|
||||
model_name="archiveresult",
|
||||
name="pwd",
|
||||
),
|
||||
]
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user