This commit is contained in:
Nick Sweeting
2026-03-23 03:58:32 -07:00
parent 268856bcfb
commit b749b26c5d
286 changed files with 21704 additions and 13480 deletions

View File

@@ -9,7 +9,7 @@
# in a universe that seems indifferent to us."
# --Norbert Wiener
__package__ = 'archivebox'
__package__ = "archivebox"
import os
import sys
@@ -22,11 +22,12 @@ from abx_plugins import get_plugins_dir
class _ReconfigurableStream(Protocol):
def reconfigure(self, *, line_buffering: bool) -> object: ...
# Force unbuffered output for real-time logs
if hasattr(sys.stdout, 'reconfigure'):
if hasattr(sys.stdout, "reconfigure"):
cast(_ReconfigurableStream, sys.stdout).reconfigure(line_buffering=True)
cast(_ReconfigurableStream, sys.stderr).reconfigure(line_buffering=True)
os.environ['PYTHONUNBUFFERED'] = '1'
os.environ["PYTHONUNBUFFERED"] = "1"
ASCII_LOGO = """
█████╗ ██████╗ ██████╗██╗ ██╗██╗██╗ ██╗███████╗ ██████╗ ██████╗ ██╗ ██╗
@@ -44,48 +45,51 @@ PACKAGE_DIR = Path(__file__).resolve().parent
# if str(PACKAGE_DIR) not in sys.path:
# sys.path.append(str(PACKAGE_DIR))
os.environ['DJANGO_SETTINGS_MODULE'] = 'archivebox.core.settings'
os.environ['TZ'] = 'UTC'
os.environ["DJANGO_SETTINGS_MODULE"] = "archivebox.core.settings"
os.environ["TZ"] = "UTC"
# detect ArchiveBox user's UID/GID based on data dir ownership
from .config.permissions import drop_privileges # noqa
from .config.permissions import drop_privileges # noqa
drop_privileges()
from .misc.checks import check_not_root, check_not_inside_source_dir, check_io_encoding # noqa
from .misc.checks import check_not_root, check_not_inside_source_dir, check_io_encoding # noqa
check_not_root()
check_not_inside_source_dir()
check_io_encoding()
# Install monkey patches for third-party libraries
from .misc.monkey_patches import * # noqa
from .misc.monkey_patches import * # noqa
# Plugin directories
BUILTIN_PLUGINS_DIR = Path(get_plugins_dir()).resolve()
USER_PLUGINS_DIR = Path(
os.environ.get('ARCHIVEBOX_USER_PLUGINS_DIR')
or os.environ.get('USER_PLUGINS_DIR')
or os.environ.get('DATA_DIR', os.getcwd())
) / 'custom_plugins'
USER_PLUGINS_DIR = (
Path(
os.environ.get("ARCHIVEBOX_USER_PLUGINS_DIR") or os.environ.get("USER_PLUGINS_DIR") or os.environ.get("DATA_DIR", os.getcwd()),
)
/ "custom_plugins"
)
# These are kept for backwards compatibility with existing code
# that checks for plugins. The new hook system uses discover_hooks()
ALL_PLUGINS = {
'builtin': BUILTIN_PLUGINS_DIR,
'user': USER_PLUGINS_DIR,
"builtin": BUILTIN_PLUGINS_DIR,
"user": USER_PLUGINS_DIR,
}
LOADED_PLUGINS = ALL_PLUGINS
# Setup basic config, constants, paths, and version
from .config.constants import CONSTANTS # noqa
from .config.paths import PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR # noqa
from .config.version import VERSION # noqa
from .config.constants import CONSTANTS # noqa
from .config.paths import PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR # noqa
from .config.version import VERSION # noqa
# Set MACHINE_ID env var so hook scripts can use it
os.environ.setdefault('MACHINE_ID', CONSTANTS.MACHINE_ID)
os.environ.setdefault("MACHINE_ID", CONSTANTS.MACHINE_ID)
__version__ = VERSION
__author__ = 'ArchiveBox'
__license__ = 'MIT'
__author__ = "ArchiveBox"
__license__ = "MIT"
ASCII_ICON = """
██████████████████████████████████████████████████████████████████████████████████████████████████

View File

@@ -1,8 +1,9 @@
#!/usr/bin/env python3
"""This is the entrypoint for python -m archivebox ..."""
__package__ = 'archivebox'
import archivebox # noqa # make sure monkey patches are applied before anything else
__package__ = "archivebox"
import archivebox # noqa # make sure monkey patches are applied before anything else
import sys
from .cli import main
@@ -15,5 +16,5 @@ ASCII_LOGO_MINI = r"""
/_/ \_\_| \___|_| |_|_| \_/ \___|____/ \___/_/\_\
"""
if __name__ == '__main__':
if __name__ == "__main__":
main(args=sys.argv[1:], stdin=sys.stdin)

View File

@@ -1 +1 @@
__package__ = 'archivebox.api'
__package__ = "archivebox.api"

View File

@@ -1,4 +1,4 @@
__package__ = 'archivebox.api'
__package__ = "archivebox.api"
from django.contrib import admin
from django.http import HttpRequest
@@ -11,57 +11,81 @@ from archivebox.api.models import APIToken
class APITokenAdmin(BaseModelAdmin):
list_display = ('created_at', 'id', 'created_by', 'token_redacted', 'expires')
sort_fields = ('id', 'created_at', 'created_by', 'expires')
readonly_fields = ('created_at', 'modified_at')
search_fields = ('id', 'created_by__username', 'token')
list_display = ("created_at", "id", "created_by", "token_redacted", "expires")
sort_fields = ("id", "created_at", "created_by", "expires")
readonly_fields = ("created_at", "modified_at")
search_fields = ("id", "created_by__username", "token")
fieldsets = (
('Token', {
'fields': ('token', 'expires'),
'classes': ('card',),
}),
('Owner', {
'fields': ('created_by',),
'classes': ('card',),
}),
('Timestamps', {
'fields': ('created_at', 'modified_at'),
'classes': ('card',),
}),
(
"Token",
{
"fields": ("token", "expires"),
"classes": ("card",),
},
),
(
"Owner",
{
"fields": ("created_by",),
"classes": ("card",),
},
),
(
"Timestamps",
{
"fields": ("created_at", "modified_at"),
"classes": ("card",),
},
),
)
list_filter = ('created_by',)
ordering = ['-created_at']
list_filter = ("created_by",)
ordering = ["-created_at"]
list_per_page = 100
class CustomWebhookAdmin(WebhookAdmin, BaseModelAdmin):
list_display = ('created_at', 'created_by', 'id', *WebhookAdmin.list_display)
sort_fields = ('created_at', 'created_by', 'id', 'referenced_model', 'endpoint', 'last_success', 'last_error')
readonly_fields = ('created_at', 'modified_at', *WebhookAdmin.readonly_fields)
list_display = ("created_at", "created_by", "id", *WebhookAdmin.list_display)
sort_fields = ("created_at", "created_by", "id", "referenced_model", "endpoint", "last_success", "last_error")
readonly_fields = ("created_at", "modified_at", *WebhookAdmin.readonly_fields)
fieldsets = (
('Webhook', {
'fields': ('name', 'signal', 'referenced_model', 'endpoint'),
'classes': ('card', 'wide'),
}),
('Authentication', {
'fields': ('auth_token',),
'classes': ('card',),
}),
('Status', {
'fields': ('enabled', 'last_success', 'last_error'),
'classes': ('card',),
}),
('Owner', {
'fields': ('created_by',),
'classes': ('card',),
}),
('Timestamps', {
'fields': ('created_at', 'modified_at'),
'classes': ('card',),
}),
(
"Webhook",
{
"fields": ("name", "signal", "referenced_model", "endpoint"),
"classes": ("card", "wide"),
},
),
(
"Authentication",
{
"fields": ("auth_token",),
"classes": ("card",),
},
),
(
"Status",
{
"fields": ("enabled", "last_success", "last_error"),
"classes": ("card",),
},
),
(
"Owner",
{
"fields": ("created_by",),
"classes": ("card",),
},
),
(
"Timestamps",
{
"fields": ("created_at", "modified_at"),
"classes": ("card",),
},
),
)
def lookup_allowed(self, lookup: str, value: str, request: HttpRequest | None = None) -> bool:

View File

@@ -1,13 +1,14 @@
__package__ = 'archivebox.api'
__package__ = "archivebox.api"
from django.apps import AppConfig
class APIConfig(AppConfig):
name = 'archivebox.api'
label = 'api'
name = "archivebox.api"
label = "api"
def register_admin(admin_site):
from archivebox.api.admin import register_admin
register_admin(admin_site)

View File

@@ -1,6 +1,5 @@
__package__ = 'archivebox.api'
__package__ = "archivebox.api"
from typing import Optional
from datetime import timedelta
from django.utils import timezone
@@ -14,7 +13,7 @@ from ninja.errors import HttpError
def get_or_create_api_token(user: User | None):
from archivebox.api.models import APIToken
if user and user.is_superuser:
api_tokens = APIToken.objects.filter(created_by_id=user.pk, expires__gt=timezone.now())
if api_tokens.exists():
@@ -34,18 +33,18 @@ def get_or_create_api_token(user: User | None):
def auth_using_token(token: str | None, request: HttpRequest | None = None) -> User | None:
"""Given an API token string, check if a corresponding non-expired APIToken exists, and return its user"""
from archivebox.api.models import APIToken # lazy import model to avoid loading it at urls.py import time
from archivebox.api.models import APIToken # lazy import model to avoid loading it at urls.py import time
user: User | None = None
submitted_empty_form = str(token).strip() in ('string', '', 'None', 'null')
submitted_empty_form = str(token).strip() in ("string", "", "None", "null")
if not submitted_empty_form:
try:
api_token = APIToken.objects.get(token=token)
if api_token.is_valid() and isinstance(api_token.created_by, User):
user = api_token.created_by
if request is not None:
setattr(request, '_api_token', api_token)
setattr(request, "_api_token", api_token)
except APIToken.DoesNotExist:
pass
@@ -55,8 +54,8 @@ def auth_using_token(token: str | None, request: HttpRequest | None = None) -> U
def auth_using_password(username: str | None, password: str | None, request: HttpRequest | None = None) -> User | None:
"""Given a username and password, check if they are valid and return the corresponding user"""
user: User | None = None
submitted_empty_form = (username, password) in (('string', 'string'), ('', ''), (None, None))
submitted_empty_form = (username, password) in (("string", "string"), ("", ""), (None, None))
if not submitted_empty_form:
authenticated_user = authenticate(
username=username,
@@ -73,34 +72,40 @@ def auth_using_password(username: str | None, password: str | None, request: Htt
def _require_superuser(user: User | None, request: HttpRequest, auth_method: str) -> User | None:
if user and user.pk:
request.user = user
setattr(request, '_api_auth_method', auth_method)
setattr(request, "_api_auth_method", auth_method)
if not user.is_superuser:
raise HttpError(403, 'Valid credentials but User does not have permission (make sure user.is_superuser=True)')
raise HttpError(403, "Valid credentials but User does not have permission (make sure user.is_superuser=True)")
return user
### Django-Ninja-Provided Auth Methods
class HeaderTokenAuth(APIKeyHeader):
"""Allow authenticating by passing X-API-Key=xyz as a request header"""
param_name = "X-ArchiveBox-API-Key"
def authenticate(self, request: HttpRequest, key: Optional[str]) -> User | None:
def authenticate(self, request: HttpRequest, key: str | None) -> User | None:
return _require_superuser(auth_using_token(token=key, request=request), request, self.__class__.__name__)
class BearerTokenAuth(HttpBearer):
"""Allow authenticating by passing Bearer=xyz as a request header"""
def authenticate(self, request: HttpRequest, token: str) -> User | None:
return _require_superuser(auth_using_token(token=token, request=request), request, self.__class__.__name__)
class QueryParamTokenAuth(APIKeyQuery):
"""Allow authenticating by passing api_key=xyz as a GET/POST query parameter"""
param_name = "api_key"
def authenticate(self, request: HttpRequest, key: Optional[str]) -> User | None:
def authenticate(self, request: HttpRequest, key: str | None) -> User | None:
return _require_superuser(auth_using_token(token=key, request=request), request, self.__class__.__name__)
class UsernameAndPasswordAuth(HttpBasicAuth):
"""Allow authenticating by passing username & password via HTTP Basic Authentication (not recommended)"""
@@ -111,25 +116,28 @@ class UsernameAndPasswordAuth(HttpBasicAuth):
self.__class__.__name__,
)
class DjangoSessionAuth:
"""Allow authenticating with existing Django session cookies (same-origin only)."""
def __call__(self, request: HttpRequest) -> User | None:
return self.authenticate(request)
def authenticate(self, request: HttpRequest, **kwargs) -> User | None:
user = getattr(request, 'user', None)
user = getattr(request, "user", None)
if isinstance(user, User) and user.is_authenticated:
setattr(request, '_api_auth_method', self.__class__.__name__)
setattr(request, "_api_auth_method", self.__class__.__name__)
if not user.is_superuser:
raise HttpError(403, 'Valid session but User does not have permission (make sure user.is_superuser=True)')
raise HttpError(403, "Valid session but User does not have permission (make sure user.is_superuser=True)")
return user
return None
### Enabled Auth Methods
API_AUTH_METHODS = [
HeaderTokenAuth(),
BearerTokenAuth(),
QueryParamTokenAuth(),
QueryParamTokenAuth(),
# django_auth_superuser, # django admin cookie auth, not secure to use with csrf=False
]

View File

@@ -1,4 +1,4 @@
__package__ = 'archivebox.api'
__package__ = "archivebox.api"
from django.http import HttpResponse
@@ -10,8 +10,8 @@ class ApiCorsMiddleware:
self.get_response = get_response
def __call__(self, request):
if request.path.startswith('/api/'):
if request.method == 'OPTIONS' and request.META.get('HTTP_ACCESS_CONTROL_REQUEST_METHOD'):
if request.path.startswith("/api/"):
if request.method == "OPTIONS" and request.META.get("HTTP_ACCESS_CONTROL_REQUEST_METHOD"):
response = HttpResponse(status=204)
return self._add_cors_headers(request, response)
@@ -21,14 +21,12 @@ class ApiCorsMiddleware:
return self.get_response(request)
def _add_cors_headers(self, request, response):
origin = request.META.get('HTTP_ORIGIN')
origin = request.META.get("HTTP_ORIGIN")
if not origin:
return response
response['Access-Control-Allow-Origin'] = '*'
response['Access-Control-Allow-Methods'] = 'GET, POST, PUT, PATCH, DELETE, OPTIONS'
response['Access-Control-Allow-Headers'] = (
'Authorization, X-ArchiveBox-API-Key, Content-Type, X-CSRFToken'
)
response['Access-Control-Max-Age'] = '600'
response["Access-Control-Allow-Origin"] = "*"
response["Access-Control-Allow-Methods"] = "GET, POST, PUT, PATCH, DELETE, OPTIONS"
response["Access-Control-Allow-Headers"] = "Authorization, X-ArchiveBox-API-Key, Content-Type, X-CSRFToken"
response["Access-Control-Max-Age"] = "600"
return response

View File

@@ -13,11 +13,10 @@ import signal_webhooks.utils
class Migration(migrations.Migration):
initial = True
dependencies = [
('auth', '0012_alter_user_first_name_max_length'),
("auth", "0012_alter_user_first_name_max_length"),
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
]
@@ -75,55 +74,165 @@ class Migration(migrations.Migration):
reverse_sql="""
DROP TABLE IF EXISTS api_outboundwebhook;
DROP TABLE IF EXISTS api_apitoken;
"""
""",
),
],
state_operations=[
migrations.CreateModel(
name='APIToken',
name="APIToken",
fields=[
('id', models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)),
('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
('modified_at', models.DateTimeField(auto_now=True)),
('token', models.CharField(default=archivebox.api.models.generate_secret_token, max_length=32, unique=True)),
('expires', models.DateTimeField(blank=True, null=True)),
('created_by', models.ForeignKey(default=get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)),
("id", models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)),
("created_at", models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
("modified_at", models.DateTimeField(auto_now=True)),
("token", models.CharField(default=archivebox.api.models.generate_secret_token, max_length=32, unique=True)),
("expires", models.DateTimeField(blank=True, null=True)),
(
"created_by",
models.ForeignKey(
default=get_or_create_system_user_pk,
on_delete=django.db.models.deletion.CASCADE,
to=settings.AUTH_USER_MODEL,
),
),
],
options={
'verbose_name': 'API Key',
'verbose_name_plural': 'API Keys',
'app_label': 'api',
"verbose_name": "API Key",
"verbose_name_plural": "API Keys",
"app_label": "api",
},
),
migrations.CreateModel(
name='OutboundWebhook',
name="OutboundWebhook",
fields=[
('id', models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)),
('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
('modified_at', models.DateTimeField(auto_now=True)),
('name', models.CharField(db_index=True, help_text='Webhook name.', max_length=255, unique=True, verbose_name='name')),
('signal', models.CharField(choices=[('CREATE', 'Create'), ('UPDATE', 'Update'), ('DELETE', 'Delete'), ('M2M', 'M2M changed'), ('CREATE_OR_UPDATE', 'Create or Update'), ('CREATE_OR_DELETE', 'Create or Delete'), ('CREATE_OR_M2M', 'Create or M2M changed'), ('UPDATE_OR_DELETE', 'Update or Delete'), ('UPDATE_OR_M2M', 'Update or M2M changed'), ('DELETE_OR_M2M', 'Delete or M2M changed'), ('CREATE_UPDATE_OR_DELETE', 'Create, Update or Delete'), ('CREATE_UPDATE_OR_M2M', 'Create, Update or M2M changed'), ('CREATE_DELETE_OR_M2M', 'Create, Delete or M2M changed'), ('UPDATE_DELETE_OR_M2M', 'Update, Delete or M2M changed'), ('CREATE_UPDATE_DELETE_OR_M2M', 'Create, Update or Delete, or M2M changed')], help_text='Signal the webhook fires to.', max_length=255, verbose_name='signal')),
('ref', models.CharField(db_index=True, help_text='Dot import notation to the model the webhook is for.', max_length=1023, validators=[signal_webhooks.utils.model_from_reference], verbose_name='referenced model')),
('endpoint', models.URLField(help_text='Target endpoint for this webhook.', max_length=2047, verbose_name='endpoint')),
('headers', models.JSONField(blank=True, default=dict, help_text='Headers to send with the webhook request.', validators=[signal_webhooks.utils.is_dict], verbose_name='headers')),
('auth_token', signal_webhooks.fields.TokenField(blank=True, default='', help_text='Authentication token to use in an Authorization header.', max_length=8000, validators=[signal_webhooks.utils.decode_cipher_key], verbose_name='authentication token')),
('enabled', models.BooleanField(default=True, help_text='Is this webhook enabled?', verbose_name='enabled')),
('keep_last_response', models.BooleanField(default=False, help_text='Should the webhook keep a log of the latest response it got?', verbose_name='keep last response')),
('created', models.DateTimeField(auto_now_add=True, help_text='When the webhook was created.', verbose_name='created')),
('updated', models.DateTimeField(auto_now=True, help_text='When the webhook was last updated.', verbose_name='updated')),
('last_response', models.CharField(blank=True, default='', help_text='Latest response to this webhook.', max_length=8000, verbose_name='last response')),
('last_success', models.DateTimeField(default=None, help_text='When the webhook last succeeded.', null=True, verbose_name='last success')),
('last_failure', models.DateTimeField(default=None, help_text='When the webhook last failed.', null=True, verbose_name='last failure')),
('created_by', models.ForeignKey(default=get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)),
("id", models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)),
("created_at", models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
("modified_at", models.DateTimeField(auto_now=True)),
(
"name",
models.CharField(db_index=True, help_text="Webhook name.", max_length=255, unique=True, verbose_name="name"),
),
(
"signal",
models.CharField(
choices=[
("CREATE", "Create"),
("UPDATE", "Update"),
("DELETE", "Delete"),
("M2M", "M2M changed"),
("CREATE_OR_UPDATE", "Create or Update"),
("CREATE_OR_DELETE", "Create or Delete"),
("CREATE_OR_M2M", "Create or M2M changed"),
("UPDATE_OR_DELETE", "Update or Delete"),
("UPDATE_OR_M2M", "Update or M2M changed"),
("DELETE_OR_M2M", "Delete or M2M changed"),
("CREATE_UPDATE_OR_DELETE", "Create, Update or Delete"),
("CREATE_UPDATE_OR_M2M", "Create, Update or M2M changed"),
("CREATE_DELETE_OR_M2M", "Create, Delete or M2M changed"),
("UPDATE_DELETE_OR_M2M", "Update, Delete or M2M changed"),
("CREATE_UPDATE_DELETE_OR_M2M", "Create, Update or Delete, or M2M changed"),
],
help_text="Signal the webhook fires to.",
max_length=255,
verbose_name="signal",
),
),
(
"ref",
models.CharField(
db_index=True,
help_text="Dot import notation to the model the webhook is for.",
max_length=1023,
validators=[signal_webhooks.utils.model_from_reference],
verbose_name="referenced model",
),
),
(
"endpoint",
models.URLField(help_text="Target endpoint for this webhook.", max_length=2047, verbose_name="endpoint"),
),
(
"headers",
models.JSONField(
blank=True,
default=dict,
help_text="Headers to send with the webhook request.",
validators=[signal_webhooks.utils.is_dict],
verbose_name="headers",
),
),
(
"auth_token",
signal_webhooks.fields.TokenField(
blank=True,
default="",
help_text="Authentication token to use in an Authorization header.",
max_length=8000,
validators=[signal_webhooks.utils.decode_cipher_key],
verbose_name="authentication token",
),
),
("enabled", models.BooleanField(default=True, help_text="Is this webhook enabled?", verbose_name="enabled")),
(
"keep_last_response",
models.BooleanField(
default=False,
help_text="Should the webhook keep a log of the latest response it got?",
verbose_name="keep last response",
),
),
(
"created",
models.DateTimeField(auto_now_add=True, help_text="When the webhook was created.", verbose_name="created"),
),
(
"updated",
models.DateTimeField(auto_now=True, help_text="When the webhook was last updated.", verbose_name="updated"),
),
(
"last_response",
models.CharField(
blank=True,
default="",
help_text="Latest response to this webhook.",
max_length=8000,
verbose_name="last response",
),
),
(
"last_success",
models.DateTimeField(
default=None,
help_text="When the webhook last succeeded.",
null=True,
verbose_name="last success",
),
),
(
"last_failure",
models.DateTimeField(
default=None,
help_text="When the webhook last failed.",
null=True,
verbose_name="last failure",
),
),
(
"created_by",
models.ForeignKey(
default=get_or_create_system_user_pk,
on_delete=django.db.models.deletion.CASCADE,
to=settings.AUTH_USER_MODEL,
),
),
],
options={
'verbose_name': 'API Outbound Webhook',
'app_label': 'api',
"verbose_name": "API Outbound Webhook",
"app_label": "api",
},
),
migrations.AddConstraint(
model_name='outboundwebhook',
constraint=models.UniqueConstraint(fields=['ref', 'endpoint'], name='prevent_duplicate_hooks_api_outboundwebhook'),
model_name="outboundwebhook",
constraint=models.UniqueConstraint(fields=["ref", "endpoint"], name="prevent_duplicate_hooks_api_outboundwebhook"),
),
],
),

View File

@@ -1,4 +1,4 @@
__package__ = 'archivebox.api'
__package__ = "archivebox.api"
import secrets
from archivebox.uuid_compat import uuid7
@@ -25,7 +25,7 @@ class APIToken(models.Model):
expires = models.DateTimeField(null=True, blank=True)
class Meta(TypedModelMeta):
app_label = 'api'
app_label = "api"
verbose_name = "API Key"
verbose_name_plural = "API Keys"
@@ -34,7 +34,7 @@ class APIToken(models.Model):
@property
def token_redacted(self):
return f'************{self.token[-4:]}'
return f"************{self.token[-4:]}"
def is_valid(self, for_date=None):
return not self.expires or self.expires >= (for_date or timezone.now())
@@ -47,8 +47,8 @@ class OutboundWebhook(WebhookBase):
modified_at = models.DateTimeField(auto_now=True)
class Meta(WebhookBase.Meta):
app_label = 'api'
verbose_name = 'API Outbound Webhook'
app_label = "api"
verbose_name = "API Outbound Webhook"
def __str__(self) -> str:
return f'[{self.id}] {self.ref} -> {self.endpoint}'
return f"[{self.id}] {self.ref} -> {self.endpoint}"

View File

@@ -1,4 +1,4 @@
__package__ = 'archivebox.api'
__package__ = "archivebox.api"
from django.urls import path
from django.views.generic.base import RedirectView
@@ -6,12 +6,10 @@ from django.views.generic.base import RedirectView
from .v1_api import urls as v1_api_urls
urlpatterns = [
path("", RedirectView.as_view(url='/api/v1/docs')),
path("v1/", RedirectView.as_view(url='/api/v1/docs')),
path("v1/", v1_api_urls),
path("v1", RedirectView.as_view(url='/api/v1/docs')),
path("", RedirectView.as_view(url="/api/v1/docs")),
path("v1/", RedirectView.as_view(url="/api/v1/docs")),
path("v1/", v1_api_urls),
path("v1", RedirectView.as_view(url="/api/v1/docs")),
# ... v2 can be added here ...
# path("v2/", v2_api_urls),
# path("v2", RedirectView.as_view(url='/api/v2/docs')),

View File

@@ -1,4 +1,4 @@
__package__ = 'archivebox.api'
__package__ = "archivebox.api"
from io import StringIO
@@ -20,9 +20,9 @@ from archivebox.api.auth import API_AUTH_METHODS
from archivebox.api.models import APIToken
COMMIT_HASH = get_COMMIT_HASH() or 'unknown'
COMMIT_HASH = get_COMMIT_HASH() or "unknown"
html_description=f'''
html_description = f"""
<h3>Welcome to your ArchiveBox server's REST API <code>[v1 ALPHA]</code> homepage!</h3>
<br/>
<i><b>WARNING: This API is still in an early development stage and may change!</b></i>
@@ -35,47 +35,47 @@ html_description=f'''
<li>📜 See the API source code: <a href="https://github.com/ArchiveBox/ArchiveBox/blob/dev/archivebox/api"><code>archivebox/api/</code></a></li>
</ul>
<small>Served by ArchiveBox v{VERSION} (<a href="https://github.com/ArchiveBox/ArchiveBox/commit/{COMMIT_HASH}"><code>{COMMIT_HASH[:8]}</code></a>), API powered by <a href="https://django-ninja.dev/"><code>django-ninja</code></a>.</small>
'''
"""
def register_urls(api: NinjaAPI) -> NinjaAPI:
api.add_router('/auth/', 'archivebox.api.v1_auth.router')
api.add_router('/core/', 'archivebox.api.v1_core.router')
api.add_router('/crawls/', 'archivebox.api.v1_crawls.router')
api.add_router('/cli/', 'archivebox.api.v1_cli.router')
api.add_router('/machine/', 'archivebox.api.v1_machine.router')
api.add_router("/auth/", "archivebox.api.v1_auth.router")
api.add_router("/core/", "archivebox.api.v1_core.router")
api.add_router("/crawls/", "archivebox.api.v1_crawls.router")
api.add_router("/cli/", "archivebox.api.v1_cli.router")
api.add_router("/machine/", "archivebox.api.v1_machine.router")
return api
class NinjaAPIWithIOCapture(NinjaAPI):
class NinjaAPIWithIOCapture(NinjaAPI):
def create_temporal_response(self, request: HttpRequest) -> HttpResponse:
stdout, stderr = StringIO(), StringIO()
with redirect_stderr(stderr):
with redirect_stdout(stdout):
setattr(request, 'stdout', stdout)
setattr(request, 'stderr', stderr)
setattr(request, "stdout", stdout)
setattr(request, "stderr", stderr)
response = super().create_temporal_response(request)
# Diable caching of API responses entirely
response['Cache-Control'] = 'no-store'
# Disable caching of API responses entirely
response["Cache-Control"] = "no-store"
# Add debug stdout and stderr headers to response
response['X-ArchiveBox-Stdout'] = stdout.getvalue().replace('\n', '\\n')[:200]
response['X-ArchiveBox-Stderr'] = stderr.getvalue().replace('\n', '\\n')[:200]
response["X-ArchiveBox-Stdout"] = stdout.getvalue().replace("\n", "\\n")[:200]
response["X-ArchiveBox-Stderr"] = stderr.getvalue().replace("\n", "\\n")[:200]
# response['X-ArchiveBox-View'] = self.get_openapi_operation_id(request) or 'Unknown'
# Add Auth Headers to response
api_token_attr = getattr(request, '_api_token', None)
api_token_attr = getattr(request, "_api_token", None)
api_token = api_token_attr if isinstance(api_token_attr, APIToken) else None
token_expiry = api_token.expires.isoformat() if api_token and api_token.expires else 'Never'
token_expiry = api_token.expires.isoformat() if api_token and api_token.expires else "Never"
response['X-ArchiveBox-Auth-Method'] = str(getattr(request, '_api_auth_method', 'None'))
response['X-ArchiveBox-Auth-Expires'] = token_expiry
response['X-ArchiveBox-Auth-Token-Id'] = str(api_token.id) if api_token else 'None'
response['X-ArchiveBox-Auth-User-Id'] = str(request.user.pk) if getattr(request.user, 'pk', None) else 'None'
response['X-ArchiveBox-Auth-User-Username'] = request.user.username if isinstance(request.user, User) else 'None'
response["X-ArchiveBox-Auth-Method"] = str(getattr(request, "_api_auth_method", "None"))
response["X-ArchiveBox-Auth-Expires"] = token_expiry
response["X-ArchiveBox-Auth-Token-Id"] = str(api_token.id) if api_token else "None"
response["X-ArchiveBox-Auth-User-Id"] = str(request.user.pk) if getattr(request.user, "pk", None) else "None"
response["X-ArchiveBox-Auth-User-Username"] = request.user.username if isinstance(request.user, User) else "None"
# import ipdb; ipdb.set_trace()
# print('RESPONDING NOW', response)
@@ -84,7 +84,7 @@ class NinjaAPIWithIOCapture(NinjaAPI):
api = NinjaAPIWithIOCapture(
title='ArchiveBox API',
title="ArchiveBox API",
description=html_description,
version=VERSION,
auth=API_AUTH_METHODS,
@@ -103,15 +103,15 @@ def generic_exception_handler(request, err):
if isinstance(err, (ObjectDoesNotExist, EmptyResultSet, PermissionDenied)):
status = 404
print(''.join(format_exception(err)))
print("".join(format_exception(err)))
return api.create_response(
request,
{
"succeeded": False,
"message": f'{err.__class__.__name__}: {err}',
"message": f"{err.__class__.__name__}: {err}",
"errors": [
''.join(format_exception(err)),
"".join(format_exception(err)),
# or send simpler parent-only traceback:
# *([str(err.__context__)] if getattr(err, '__context__', None) else []),
],
@@ -120,7 +120,6 @@ def generic_exception_handler(request, err):
)
# import orjson
# from ninja.renderers import BaseRenderer
# class ORJSONRenderer(BaseRenderer):

View File

@@ -1,6 +1,5 @@
__package__ = 'archivebox.api'
__package__ = "archivebox.api"
from typing import Optional
from django.http import HttpRequest
from ninja import Router, Schema
@@ -8,16 +7,21 @@ from ninja import Router, Schema
from archivebox.api.auth import auth_using_token, auth_using_password, get_or_create_api_token
router = Router(tags=['Authentication'], auth=None)
router = Router(tags=["Authentication"], auth=None)
class PasswordAuthSchema(Schema):
"""Schema for a /get_api_token request"""
username: Optional[str] = None
password: Optional[str] = None
username: str | None = None
password: str | None = None
@router.post("/get_api_token", auth=None, summary='Generate an API token for a given username & password (or currently logged-in user)') # auth=None because they are not authed yet
@router.post(
"/get_api_token",
auth=None,
summary="Generate an API token for a given username & password (or currently logged-in user)",
) # auth=None because they are not authed yet
def get_api_token(request: HttpRequest, auth_data: PasswordAuthSchema):
user = auth_using_password(
username=auth_data.username,
@@ -35,17 +39,21 @@ def get_api_token(request: HttpRequest, auth_data: PasswordAuthSchema):
"token": api_token.token,
"expires": api_token.expires.isoformat() if api_token.expires else None,
}
return {"success": False, "errors": ["Invalid credentials"]}
return {"success": False, "errors": ["Invalid credentials"]}
class TokenAuthSchema(Schema):
"""Schema for a /check_api_token request"""
token: str
@router.post("/check_api_token", auth=None, summary='Validate an API token to make sure its valid and non-expired') # auth=None because they are not authed yet
@router.post(
"/check_api_token",
auth=None,
summary="Validate an API token to make sure its valid and non-expired",
) # auth=None because they are not authed yet
def check_api_token(request: HttpRequest, token_data: TokenAuthSchema):
user = auth_using_token(
token=token_data.token,
@@ -53,5 +61,5 @@ def check_api_token(request: HttpRequest, token_data: TokenAuthSchema):
)
if user:
return {"success": True, "user_id": str(user.pk)}
return {"success": False, "user_id": None}

View File

@@ -1,8 +1,8 @@
__package__ = 'archivebox.api'
__package__ = "archivebox.api"
import json
from io import StringIO
from typing import List, Dict, Any, Optional
from typing import Any
from enum import Enum
from django.http import HttpRequest
@@ -16,44 +16,47 @@ from archivebox.config.common import ARCHIVING_CONFIG
# from .auth import API_AUTH_METHODS
# router for API that exposes archivebox cli subcommands as REST endpoints
router = Router(tags=['ArchiveBox CLI Sub-Commands'])
router = Router(tags=["ArchiveBox CLI Sub-Commands"])
# Schemas
JSONType = List[Any] | Dict[str, Any] | bool | int | str | None
JSONType = list[Any] | dict[str, Any] | bool | int | str | None
class CLICommandResponseSchema(Schema):
success: bool
errors: List[str]
errors: list[str]
result: JSONType
result_format: str = 'str'
result_format: str = "str"
stdout: str
stderr: str
class FilterTypeChoices(str, Enum):
exact = 'exact'
substring = 'substring'
regex = 'regex'
domain = 'domain'
tag = 'tag'
timestamp = 'timestamp'
exact = "exact"
substring = "substring"
regex = "regex"
domain = "domain"
tag = "tag"
timestamp = "timestamp"
class StatusChoices(str, Enum):
indexed = 'indexed'
archived = 'archived'
unarchived = 'unarchived'
present = 'present'
valid = 'valid'
invalid = 'invalid'
duplicate = 'duplicate'
orphaned = 'orphaned'
corrupted = 'corrupted'
unrecognized = 'unrecognized'
indexed = "indexed"
archived = "archived"
unarchived = "unarchived"
present = "present"
valid = "valid"
invalid = "invalid"
duplicate = "duplicate"
orphaned = "orphaned"
corrupted = "corrupted"
unrecognized = "unrecognized"
class AddCommandSchema(Schema):
urls: List[str]
urls: list[str]
tag: str = ""
depth: int = 0
parser: str = "auto"
@@ -62,53 +65,54 @@ class AddCommandSchema(Schema):
overwrite: bool = False
index_only: bool = False
class UpdateCommandSchema(Schema):
resume: Optional[str] = None
after: Optional[float] = 0
before: Optional[float] = 999999999999999
filter_type: Optional[str] = FilterTypeChoices.substring
filter_patterns: Optional[List[str]] = ['https://example.com']
resume: str | None = None
after: float | None = 0
before: float | None = 999999999999999
filter_type: str | None = FilterTypeChoices.substring
filter_patterns: list[str] | None = ["https://example.com"]
batch_size: int = 100
continuous: bool = False
class ScheduleCommandSchema(Schema):
import_path: Optional[str] = None
import_path: str | None = None
add: bool = False
show: bool = False
foreground: bool = False
run_all: bool = False
quiet: bool = False
every: Optional[str] = None
tag: str = ''
every: str | None = None
tag: str = ""
depth: int = 0
overwrite: bool = False
update: bool = not ARCHIVING_CONFIG.ONLY_NEW
clear: bool = False
class ListCommandSchema(Schema):
filter_patterns: Optional[List[str]] = ['https://example.com']
filter_patterns: list[str] | None = ["https://example.com"]
filter_type: str = FilterTypeChoices.substring
status: StatusChoices = StatusChoices.indexed
after: Optional[float] = 0
before: Optional[float] = 999999999999999
sort: str = 'bookmarked_at'
after: float | None = 0
before: float | None = 999999999999999
sort: str = "bookmarked_at"
as_json: bool = True
as_html: bool = False
as_csv: str | None = 'timestamp,url'
as_csv: str | None = "timestamp,url"
with_headers: bool = False
class RemoveCommandSchema(Schema):
delete: bool = True
after: Optional[float] = 0
before: Optional[float] = 999999999999999
after: float | None = 0
before: float | None = 999999999999999
filter_type: str = FilterTypeChoices.exact
filter_patterns: Optional[List[str]] = ['https://example.com']
filter_patterns: list[str] | None = ["https://example.com"]
@router.post("/add", response=CLICommandResponseSchema, summary='archivebox add [args] [urls]')
@router.post("/add", response=CLICommandResponseSchema, summary="archivebox add [args] [urls]")
def cli_add(request: HttpRequest, args: AddCommandSchema):
from archivebox.cli.archivebox_add import add
@@ -125,30 +129,30 @@ def cli_add(request: HttpRequest, args: AddCommandSchema):
created_by_id=request.user.pk,
)
snapshot_ids = [str(snapshot_id) for snapshot_id in snapshots.values_list('id', flat=True)]
snapshot_ids = [str(snapshot_id) for snapshot_id in snapshots.values_list("id", flat=True)]
result_payload = {
"crawl_id": str(crawl.id),
"num_snapshots": len(snapshot_ids),
"snapshot_ids": snapshot_ids,
"queued_urls": args.urls,
}
stdout = getattr(request, 'stdout', None)
stderr = getattr(request, 'stderr', None)
stdout = getattr(request, "stdout", None)
stderr = getattr(request, "stderr", None)
return {
"success": True,
"errors": [],
"result": result_payload,
"result_format": "json",
"stdout": ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else '',
"stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else '',
"stdout": ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else "",
"stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else "",
}
@router.post("/update", response=CLICommandResponseSchema, summary='archivebox update [args] [filter_patterns]')
@router.post("/update", response=CLICommandResponseSchema, summary="archivebox update [args] [filter_patterns]")
def cli_update(request: HttpRequest, args: UpdateCommandSchema):
from archivebox.cli.archivebox_update import update
result = update(
filter_patterns=args.filter_patterns or [],
filter_type=args.filter_type or FilterTypeChoices.substring,
@@ -158,21 +162,21 @@ def cli_update(request: HttpRequest, args: UpdateCommandSchema):
batch_size=args.batch_size,
continuous=args.continuous,
)
stdout = getattr(request, 'stdout', None)
stderr = getattr(request, 'stderr', None)
stdout = getattr(request, "stdout", None)
stderr = getattr(request, "stderr", None)
return {
"success": True,
"errors": [],
"result": result,
"stdout": ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else '',
"stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else '',
"stdout": ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else "",
"stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else "",
}
@router.post("/schedule", response=CLICommandResponseSchema, summary='archivebox schedule [args] [import_path]')
@router.post("/schedule", response=CLICommandResponseSchema, summary="archivebox schedule [args] [import_path]")
def cli_schedule(request: HttpRequest, args: ScheduleCommandSchema):
from archivebox.cli.archivebox_schedule import schedule
result = schedule(
import_path=args.import_path,
add=args.add,
@@ -188,23 +192,22 @@ def cli_schedule(request: HttpRequest, args: ScheduleCommandSchema):
update=args.update,
)
stdout = getattr(request, 'stdout', None)
stderr = getattr(request, 'stderr', None)
stdout = getattr(request, "stdout", None)
stderr = getattr(request, "stderr", None)
return {
"success": True,
"errors": [],
"result": result,
"result_format": "json",
"stdout": ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else '',
"stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else '',
"stdout": ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else "",
"stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else "",
}
@router.post("/search", response=CLICommandResponseSchema, summary='archivebox search [args] [filter_patterns]')
@router.post("/search", response=CLICommandResponseSchema, summary="archivebox search [args] [filter_patterns]")
def cli_search(request: HttpRequest, args: ListCommandSchema):
from archivebox.cli.archivebox_search import search
result = search(
filter_patterns=args.filter_patterns,
filter_type=args.filter_type,
@@ -218,7 +221,7 @@ def cli_search(request: HttpRequest, args: ListCommandSchema):
with_headers=args.with_headers,
)
result_format = 'txt'
result_format = "txt"
if args.as_json:
result_format = "json"
result = json.loads(result)
@@ -227,20 +230,19 @@ def cli_search(request: HttpRequest, args: ListCommandSchema):
elif args.as_csv:
result_format = "csv"
stdout = getattr(request, 'stdout', None)
stderr = getattr(request, 'stderr', None)
stdout = getattr(request, "stdout", None)
stderr = getattr(request, "stderr", None)
return {
"success": True,
"errors": [],
"result": result,
"result_format": result_format,
"stdout": ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else '',
"stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else '',
"stdout": ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else "",
"stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else "",
}
@router.post("/remove", response=CLICommandResponseSchema, summary='archivebox remove [args] [filter_patterns]')
@router.post("/remove", response=CLICommandResponseSchema, summary="archivebox remove [args] [filter_patterns]")
def cli_remove(request: HttpRequest, args: RemoveCommandSchema):
from archivebox.cli.archivebox_remove import remove
from archivebox.cli.archivebox_search import get_snapshots
@@ -253,10 +255,10 @@ def cli_remove(request: HttpRequest, args: RemoveCommandSchema):
after=args.after,
before=args.before,
)
removed_snapshot_ids = [str(snapshot_id) for snapshot_id in snapshots_to_remove.values_list('id', flat=True)]
removed_snapshot_ids = [str(snapshot_id) for snapshot_id in snapshots_to_remove.values_list("id", flat=True)]
remove(
yes=True, # no way to interactively ask for confirmation via API, so we force yes
yes=True, # no way to interactively ask for confirmation via API, so we force yes
delete=args.delete,
snapshots=snapshots_to_remove,
before=args.before,
@@ -270,14 +272,13 @@ def cli_remove(request: HttpRequest, args: RemoveCommandSchema):
"removed_snapshot_ids": removed_snapshot_ids,
"remaining_snapshots": Snapshot.objects.count(),
}
stdout = getattr(request, 'stdout', None)
stderr = getattr(request, 'stderr', None)
stdout = getattr(request, "stdout", None)
stderr = getattr(request, "stderr", None)
return {
"success": True,
"errors": [],
"result": result,
"result_format": "json",
"stdout": ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else '',
"stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else '',
"stdout": ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else "",
"stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else "",
}

View File

@@ -1,11 +1,13 @@
__package__ = 'archivebox.api'
__package__ = "archivebox.api"
import math
from collections import defaultdict
from uuid import UUID
from typing import List, Optional, Union, Any, Annotated
from typing import Union, Any, Annotated
from datetime import datetime
from django.db.models import Model, Q
from django.db.models import Model, Q, Sum
from django.db.models.functions import Coalesce
from django.conf import settings
from django.http import HttpRequest, HttpResponse
from django.core.exceptions import ValidationError
@@ -39,7 +41,7 @@ from archivebox.crawls.models import Crawl
from archivebox.api.v1_crawls import CrawlSchema
router = Router(tags=['Core Models'])
router = Router(tags=["Core Models"])
class CustomPagination(PaginationBase):
@@ -49,13 +51,14 @@ class CustomPagination(PaginationBase):
page: int = 0
class Output(PaginationBase.Output):
count: int
total_items: int
total_pages: int
page: int
limit: int
offset: int
num_items: int
items: List[Any]
items: list[Any]
def paginate_queryset(self, queryset, pagination: Input, request: HttpRequest, **params):
limit = min(pagination.limit, 500)
@@ -65,27 +68,29 @@ class CustomPagination(PaginationBase):
current_page = math.ceil(offset / (limit + 1))
items = queryset[offset : offset + limit]
return {
'total_items': total,
'total_pages': total_pages,
'page': current_page,
'limit': limit,
'offset': offset,
'num_items': len(items),
'items': items,
"count": total,
"total_items": total,
"total_pages": total_pages,
"page": current_page,
"limit": limit,
"offset": offset,
"num_items": len(items),
"items": items,
}
### ArchiveResult #########################################################################
class MinimalArchiveResultSchema(Schema):
TYPE: str = 'core.models.ArchiveResult'
TYPE: str = "core.models.ArchiveResult"
id: UUID
created_at: datetime | None
modified_at: datetime | None
created_by_id: str
created_by_username: str
status: str
retry_at: datetime | None
retry_at: datetime | None = None
plugin: str
hook_name: str
process_id: UUID | None
@@ -93,8 +98,8 @@ class MinimalArchiveResultSchema(Schema):
cmd: list[str] | None
pwd: str | None
output_str: str
output_json: dict | None
output_files: dict | None
output_json: dict[str, Any] | None
output_files: dict[str, dict[str, Any]] | None
output_size: int
output_mimetypes: str
start_ts: datetime | None
@@ -108,13 +113,34 @@ class MinimalArchiveResultSchema(Schema):
def resolve_created_by_username(obj) -> str:
return obj.created_by.username
@staticmethod
def resolve_output_files(obj):
return obj.output_file_map()
@staticmethod
def resolve_output_mimetypes(obj) -> str:
mime_sizes: dict[str, int] = defaultdict(int)
for metadata in obj.output_file_map().values():
if not isinstance(metadata, dict):
continue
mimetype = str(metadata.get("mimetype") or "").strip()
try:
size = max(int(metadata.get("size") or 0), 0)
except (TypeError, ValueError):
size = 0
if mimetype and size:
mime_sizes[mimetype] += size
if mime_sizes:
return ",".join(mime for mime, _size in sorted(mime_sizes.items(), key=lambda item: item[1], reverse=True))
return obj.output_mimetypes or ""
class ArchiveResultSchema(MinimalArchiveResultSchema):
TYPE: str = 'core.models.ArchiveResult'
TYPE: str = "core.models.ArchiveResult"
snapshot_id: UUID
snapshot_timestamp: str
snapshot_url: str
snapshot_tags: List[str]
snapshot_tags: list[str]
@staticmethod
def resolve_snapshot_timestamp(obj):
@@ -134,25 +160,39 @@ class ArchiveResultSchema(MinimalArchiveResultSchema):
class ArchiveResultFilterSchema(FilterSchema):
id: Annotated[Optional[str], FilterLookup(['id__startswith', 'snapshot__id__startswith', 'snapshot__timestamp__startswith'])] = None
search: Annotated[Optional[str], FilterLookup(['snapshot__url__icontains', 'snapshot__title__icontains', 'snapshot__tags__name__icontains', 'plugin', 'output_str__icontains', 'id__startswith', 'snapshot__id__startswith', 'snapshot__timestamp__startswith'])] = None
snapshot_id: Annotated[Optional[str], FilterLookup(['snapshot__id__startswith', 'snapshot__timestamp__startswith'])] = None
snapshot_url: Annotated[Optional[str], FilterLookup('snapshot__url__icontains')] = None
snapshot_tag: Annotated[Optional[str], FilterLookup('snapshot__tags__name__icontains')] = None
status: Annotated[Optional[str], FilterLookup('status')] = None
output_str: Annotated[Optional[str], FilterLookup('output_str__icontains')] = None
plugin: Annotated[Optional[str], FilterLookup('plugin__icontains')] = None
hook_name: Annotated[Optional[str], FilterLookup('hook_name__icontains')] = None
process_id: Annotated[Optional[str], FilterLookup('process__id__startswith')] = None
cmd: Annotated[Optional[str], FilterLookup('cmd__0__icontains')] = None
pwd: Annotated[Optional[str], FilterLookup('pwd__icontains')] = None
cmd_version: Annotated[Optional[str], FilterLookup('cmd_version')] = None
created_at: Annotated[Optional[datetime], FilterLookup('created_at')] = None
created_at__gte: Annotated[Optional[datetime], FilterLookup('created_at__gte')] = None
created_at__lt: Annotated[Optional[datetime], FilterLookup('created_at__lt')] = None
id: Annotated[str | None, FilterLookup(["id__startswith", "snapshot__id__startswith", "snapshot__timestamp__startswith"])] = None
search: Annotated[
str | None,
FilterLookup(
[
"snapshot__url__icontains",
"snapshot__title__icontains",
"snapshot__tags__name__icontains",
"plugin",
"output_str__icontains",
"id__startswith",
"snapshot__id__startswith",
"snapshot__timestamp__startswith",
],
),
] = None
snapshot_id: Annotated[str | None, FilterLookup(["snapshot__id__startswith", "snapshot__timestamp__startswith"])] = None
snapshot_url: Annotated[str | None, FilterLookup("snapshot__url__icontains")] = None
snapshot_tag: Annotated[str | None, FilterLookup("snapshot__tags__name__icontains")] = None
status: Annotated[str | None, FilterLookup("status")] = None
output_str: Annotated[str | None, FilterLookup("output_str__icontains")] = None
plugin: Annotated[str | None, FilterLookup("plugin__icontains")] = None
hook_name: Annotated[str | None, FilterLookup("hook_name__icontains")] = None
process_id: Annotated[str | None, FilterLookup("process__id__startswith")] = None
cmd: Annotated[str | None, FilterLookup("cmd__0__icontains")] = None
pwd: Annotated[str | None, FilterLookup("pwd__icontains")] = None
cmd_version: Annotated[str | None, FilterLookup("cmd_version")] = None
created_at: Annotated[datetime | None, FilterLookup("created_at")] = None
created_at__gte: Annotated[datetime | None, FilterLookup("created_at__gte")] = None
created_at__lt: Annotated[datetime | None, FilterLookup("created_at__lt")] = None
@router.get("/archiveresults", response=List[ArchiveResultSchema], url_name="get_archiveresult")
@router.get("/archiveresults", response=list[ArchiveResultSchema], url_name="get_archiveresult")
@paginate(CustomPagination)
def get_archiveresults(request: HttpRequest, filters: Query[ArchiveResultFilterSchema]):
"""List all ArchiveResult entries matching these filters."""
@@ -167,8 +207,9 @@ def get_archiveresult(request: HttpRequest, archiveresult_id: str):
### Snapshot #########################################################################
class SnapshotSchema(Schema):
TYPE: str = 'core.models.Snapshot'
TYPE: str = "core.models.Snapshot"
id: UUID
created_by_id: str
created_by_username: str
@@ -177,14 +218,16 @@ class SnapshotSchema(Schema):
status: str
retry_at: datetime | None
bookmarked_at: datetime
downloaded_at: Optional[datetime]
downloaded_at: datetime | None
url: str
tags: List[str]
title: Optional[str]
tags: list[str]
title: str | None
timestamp: str
archive_path: str
archive_size: int
output_size: int
num_archiveresults: int
archiveresults: List[MinimalArchiveResultSchema]
archiveresults: list[MinimalArchiveResultSchema]
@staticmethod
def resolve_created_by_id(obj):
@@ -198,13 +241,21 @@ class SnapshotSchema(Schema):
def resolve_tags(obj):
return sorted(tag.name for tag in obj.tags.all())
@staticmethod
def resolve_archive_size(obj):
return int(getattr(obj, "output_size_sum", obj.archive_size) or 0)
@staticmethod
def resolve_output_size(obj):
return SnapshotSchema.resolve_archive_size(obj)
@staticmethod
def resolve_num_archiveresults(obj, context):
return obj.archiveresult_set.all().distinct().count()
@staticmethod
def resolve_archiveresults(obj, context):
if bool(getattr(context['request'], 'with_archiveresults', False)):
if bool(getattr(context["request"], "with_archiveresults", False)):
return obj.archiveresult_set.all().distinct()
return ArchiveResult.objects.none()
@@ -212,16 +263,16 @@ class SnapshotSchema(Schema):
class SnapshotUpdateSchema(Schema):
status: str | None = None
retry_at: datetime | None = None
tags: Optional[List[str]] = None
tags: list[str] | None = None
class SnapshotCreateSchema(Schema):
url: str
crawl_id: Optional[str] = None
crawl_id: str | None = None
depth: int = 0
title: Optional[str] = None
tags: Optional[List[str]] = None
status: Optional[str] = None
title: str | None = None
tags: list[str] | None = None
status: str | None = None
class SnapshotDeleteResponseSchema(Schema):
@@ -231,77 +282,82 @@ class SnapshotDeleteResponseSchema(Schema):
deleted_count: int
def normalize_tag_list(tags: Optional[List[str]] = None) -> List[str]:
def normalize_tag_list(tags: list[str] | None = None) -> list[str]:
return [tag.strip() for tag in (tags or []) if tag and tag.strip()]
class SnapshotFilterSchema(FilterSchema):
id: Annotated[Optional[str], FilterLookup(['id__icontains', 'timestamp__startswith'])] = None
created_by_id: Annotated[Optional[str], FilterLookup('crawl__created_by_id')] = None
created_by_username: Annotated[Optional[str], FilterLookup('crawl__created_by__username__icontains')] = None
created_at__gte: Annotated[Optional[datetime], FilterLookup('created_at__gte')] = None
created_at__lt: Annotated[Optional[datetime], FilterLookup('created_at__lt')] = None
created_at: Annotated[Optional[datetime], FilterLookup('created_at')] = None
modified_at: Annotated[Optional[datetime], FilterLookup('modified_at')] = None
modified_at__gte: Annotated[Optional[datetime], FilterLookup('modified_at__gte')] = None
modified_at__lt: Annotated[Optional[datetime], FilterLookup('modified_at__lt')] = None
search: Annotated[Optional[str], FilterLookup(['url__icontains', 'title__icontains', 'tags__name__icontains', 'id__icontains', 'timestamp__startswith'])] = None
url: Annotated[Optional[str], FilterLookup('url')] = None
tag: Annotated[Optional[str], FilterLookup('tags__name')] = None
title: Annotated[Optional[str], FilterLookup('title__icontains')] = None
timestamp: Annotated[Optional[str], FilterLookup('timestamp__startswith')] = None
bookmarked_at__gte: Annotated[Optional[datetime], FilterLookup('bookmarked_at__gte')] = None
bookmarked_at__lt: Annotated[Optional[datetime], FilterLookup('bookmarked_at__lt')] = None
id: Annotated[str | None, FilterLookup(["id__icontains", "timestamp__startswith"])] = None
created_by_id: Annotated[str | None, FilterLookup("crawl__created_by_id")] = None
created_by_username: Annotated[str | None, FilterLookup("crawl__created_by__username__icontains")] = None
created_at__gte: Annotated[datetime | None, FilterLookup("created_at__gte")] = None
created_at__lt: Annotated[datetime | None, FilterLookup("created_at__lt")] = None
created_at: Annotated[datetime | None, FilterLookup("created_at")] = None
modified_at: Annotated[datetime | None, FilterLookup("modified_at")] = None
modified_at__gte: Annotated[datetime | None, FilterLookup("modified_at__gte")] = None
modified_at__lt: Annotated[datetime | None, FilterLookup("modified_at__lt")] = None
search: Annotated[
str | None,
FilterLookup(["url__icontains", "title__icontains", "tags__name__icontains", "id__icontains", "timestamp__startswith"]),
] = None
url: Annotated[str | None, FilterLookup("url")] = None
tag: Annotated[str | None, FilterLookup("tags__name")] = None
title: Annotated[str | None, FilterLookup("title__icontains")] = None
timestamp: Annotated[str | None, FilterLookup("timestamp__startswith")] = None
bookmarked_at__gte: Annotated[datetime | None, FilterLookup("bookmarked_at__gte")] = None
bookmarked_at__lt: Annotated[datetime | None, FilterLookup("bookmarked_at__lt")] = None
@router.get("/snapshots", response=List[SnapshotSchema], url_name="get_snapshots")
@router.get("/snapshots", response=list[SnapshotSchema], url_name="get_snapshots")
@paginate(CustomPagination)
def get_snapshots(request: HttpRequest, filters: Query[SnapshotFilterSchema], with_archiveresults: bool = False):
"""List all Snapshot entries matching these filters."""
setattr(request, 'with_archiveresults', with_archiveresults)
return filters.filter(Snapshot.objects.all()).distinct()
setattr(request, "with_archiveresults", with_archiveresults)
queryset = Snapshot.objects.annotate(output_size_sum=Coalesce(Sum("archiveresult__output_size"), 0))
return filters.filter(queryset).distinct()
@router.get("/snapshot/{snapshot_id}", response=SnapshotSchema, url_name="get_snapshot")
def get_snapshot(request: HttpRequest, snapshot_id: str, with_archiveresults: bool = True):
"""Get a specific Snapshot by id."""
setattr(request, 'with_archiveresults', with_archiveresults)
setattr(request, "with_archiveresults", with_archiveresults)
queryset = Snapshot.objects.annotate(output_size_sum=Coalesce(Sum("archiveresult__output_size"), 0))
try:
return Snapshot.objects.get(Q(id__startswith=snapshot_id) | Q(timestamp__startswith=snapshot_id))
return queryset.get(Q(id__startswith=snapshot_id) | Q(timestamp__startswith=snapshot_id))
except Snapshot.DoesNotExist:
return Snapshot.objects.get(Q(id__icontains=snapshot_id))
return queryset.get(Q(id__icontains=snapshot_id))
@router.post("/snapshots", response=SnapshotSchema, url_name="create_snapshot")
def create_snapshot(request: HttpRequest, data: SnapshotCreateSchema):
tags = normalize_tag_list(data.tags)
if data.status is not None and data.status not in Snapshot.StatusChoices.values:
raise HttpError(400, f'Invalid status: {data.status}')
raise HttpError(400, f"Invalid status: {data.status}")
if not data.url.strip():
raise HttpError(400, 'URL is required')
raise HttpError(400, "URL is required")
if data.depth not in (0, 1, 2, 3, 4):
raise HttpError(400, 'depth must be between 0 and 4')
raise HttpError(400, "depth must be between 0 and 4")
if data.crawl_id:
crawl = Crawl.objects.get(id__icontains=data.crawl_id)
crawl_tags = normalize_tag_list(crawl.tags_str.split(','))
crawl_tags = normalize_tag_list(crawl.tags_str.split(","))
tags = tags or crawl_tags
else:
crawl = Crawl.objects.create(
urls=data.url,
max_depth=max(data.depth, 0),
tags_str=','.join(tags),
tags_str=",".join(tags),
status=Crawl.StatusChoices.QUEUED,
retry_at=timezone.now(),
created_by=request.user if isinstance(request.user, User) else None,
)
snapshot_defaults = {
'depth': data.depth,
'title': data.title,
'timestamp': str(timezone.now().timestamp()),
'status': data.status or Snapshot.StatusChoices.QUEUED,
'retry_at': timezone.now(),
"depth": data.depth,
"title": data.title,
"timestamp": str(timezone.now().timestamp()),
"status": data.status or Snapshot.StatusChoices.QUEUED,
"retry_at": timezone.now(),
}
snapshot, _ = Snapshot.objects.get_or_create(
url=data.url,
@@ -309,17 +365,17 @@ def create_snapshot(request: HttpRequest, data: SnapshotCreateSchema):
defaults=snapshot_defaults,
)
update_fields: List[str] = []
update_fields: list[str] = []
if data.title is not None and snapshot.title != data.title:
snapshot.title = data.title
update_fields.append('title')
update_fields.append("title")
if data.status is not None and snapshot.status != data.status:
if data.status not in Snapshot.StatusChoices.values:
raise HttpError(400, f'Invalid status: {data.status}')
raise HttpError(400, f"Invalid status: {data.status}")
snapshot.status = data.status
update_fields.append('status')
update_fields.append("status")
if update_fields:
update_fields.append('modified_at')
update_fields.append("modified_at")
snapshot.save(update_fields=update_fields)
if tags:
@@ -330,7 +386,7 @@ def create_snapshot(request: HttpRequest, data: SnapshotCreateSchema):
except Exception:
pass
setattr(request, 'with_archiveresults', False)
setattr(request, "with_archiveresults", False)
return snapshot
@@ -343,26 +399,26 @@ def patch_snapshot(request: HttpRequest, snapshot_id: str, data: SnapshotUpdateS
snapshot = Snapshot.objects.get(Q(id__icontains=snapshot_id))
payload = data.dict(exclude_unset=True)
update_fields = ['modified_at']
tags = payload.pop('tags', None)
update_fields = ["modified_at"]
tags = payload.pop("tags", None)
if 'status' in payload:
if payload['status'] not in Snapshot.StatusChoices.values:
raise HttpError(400, f'Invalid status: {payload["status"]}')
snapshot.status = payload['status']
if snapshot.status == Snapshot.StatusChoices.SEALED and 'retry_at' not in payload:
if "status" in payload:
if payload["status"] not in Snapshot.StatusChoices.values:
raise HttpError(400, f"Invalid status: {payload['status']}")
snapshot.status = payload["status"]
if snapshot.status == Snapshot.StatusChoices.SEALED and "retry_at" not in payload:
snapshot.retry_at = None
update_fields.append('status')
update_fields.append("status")
if 'retry_at' in payload:
snapshot.retry_at = payload['retry_at']
update_fields.append('retry_at')
if "retry_at" in payload:
snapshot.retry_at = payload["retry_at"]
update_fields.append("retry_at")
if tags is not None:
snapshot.save_tags(normalize_tag_list(tags))
snapshot.save(update_fields=update_fields)
setattr(request, 'with_archiveresults', False)
setattr(request, "with_archiveresults", False)
return snapshot
@@ -373,17 +429,18 @@ def delete_snapshot(request: HttpRequest, snapshot_id: str):
crawl_id_str = str(snapshot.crawl.pk)
deleted_count, _ = snapshot.delete()
return {
'success': True,
'snapshot_id': snapshot_id_str,
'crawl_id': crawl_id_str,
'deleted_count': deleted_count,
"success": True,
"snapshot_id": snapshot_id_str,
"crawl_id": crawl_id_str,
"deleted_count": deleted_count,
}
### Tag #########################################################################
class TagSchema(Schema):
TYPE: str = 'core.models.Tag'
TYPE: str = "core.models.Tag"
id: int
modified_at: datetime
created_at: datetime
@@ -392,7 +449,7 @@ class TagSchema(Schema):
name: str
slug: str
num_snapshots: int
snapshots: List[SnapshotSchema]
snapshots: list[SnapshotSchema]
@staticmethod
def resolve_created_by_id(obj):
@@ -402,7 +459,7 @@ class TagSchema(Schema):
def resolve_created_by_username(obj):
user_model = get_user_model()
user = user_model.objects.get(id=obj.created_by_id)
username = getattr(user, 'username', None)
username = getattr(user, "username", None)
return username if isinstance(username, str) else str(user)
@staticmethod
@@ -411,58 +468,67 @@ class TagSchema(Schema):
@staticmethod
def resolve_snapshots(obj, context):
if bool(getattr(context['request'], 'with_snapshots', False)):
if bool(getattr(context["request"], "with_snapshots", False)):
return obj.snapshot_set.all().distinct()
return Snapshot.objects.none()
@router.get("/tags", response=List[TagSchema], url_name="get_tags")
@router.get("/tags", response=list[TagSchema], url_name="get_tags")
@paginate(CustomPagination)
def get_tags(request: HttpRequest):
setattr(request, 'with_snapshots', False)
setattr(request, 'with_archiveresults', False)
setattr(request, "with_snapshots", False)
setattr(request, "with_archiveresults", False)
return get_matching_tags()
@router.get("/tag/{tag_id}", response=TagSchema, url_name="get_tag")
def get_tag(request: HttpRequest, tag_id: str, with_snapshots: bool = True):
setattr(request, 'with_snapshots', with_snapshots)
setattr(request, 'with_archiveresults', False)
setattr(request, "with_snapshots", with_snapshots)
setattr(request, "with_archiveresults", False)
try:
return get_tag_by_ref(tag_id)
except (Tag.DoesNotExist, ValidationError):
raise HttpError(404, 'Tag not found')
raise HttpError(404, "Tag not found")
@router.get("/any/{id}", response=Union[SnapshotSchema, ArchiveResultSchema, TagSchema, CrawlSchema], url_name="get_any", summary="Get any object by its ID")
@router.get(
"/any/{id}",
response=Union[SnapshotSchema, ArchiveResultSchema, TagSchema, CrawlSchema],
url_name="get_any",
summary="Get any object by its ID",
)
def get_any(request: HttpRequest, id: str):
"""Get any object by its ID (e.g. snapshot, archiveresult, tag, crawl, etc.)."""
setattr(request, 'with_snapshots', False)
setattr(request, 'with_archiveresults', False)
setattr(request, "with_snapshots", False)
setattr(request, "with_archiveresults", False)
for getter in [get_snapshot, get_archiveresult, get_tag]:
try:
response = getter(request, id)
if isinstance(response, Model):
return redirect(f"/api/v1/{response._meta.app_label}/{response._meta.model_name}/{response.pk}?{request.META['QUERY_STRING']}")
return redirect(
f"/api/v1/{response._meta.app_label}/{response._meta.model_name}/{response.pk}?{request.META['QUERY_STRING']}",
)
except Exception:
pass
try:
from archivebox.api.v1_crawls import get_crawl
response = get_crawl(request, id)
if isinstance(response, Model):
return redirect(f"/api/v1/{response._meta.app_label}/{response._meta.model_name}/{response.pk}?{request.META['QUERY_STRING']}")
except Exception:
pass
raise HttpError(404, 'Object with given ID not found')
raise HttpError(404, "Object with given ID not found")
### Tag Editor API Endpoints #########################################################################
class TagAutocompleteSchema(Schema):
tags: List[dict]
tags: list[dict]
class TagCreateSchema(Schema):
@@ -483,7 +549,7 @@ class TagSearchSnapshotSchema(Schema):
favicon_url: str
admin_url: str
archive_url: str
downloaded_at: Optional[str] = None
downloaded_at: str | None = None
class TagSearchCardSchema(Schema):
@@ -497,11 +563,11 @@ class TagSearchCardSchema(Schema):
export_jsonl_url: str
rename_url: str
delete_url: str
snapshots: List[TagSearchSnapshotSchema]
snapshots: list[TagSearchSnapshotSchema]
class TagSearchResponseSchema(Schema):
tags: List[TagSearchCardSchema]
tags: list[TagSearchCardSchema]
sort: str
created_by: str
year: str
@@ -527,8 +593,8 @@ class TagDeleteResponseSchema(Schema):
class TagSnapshotRequestSchema(Schema):
snapshot_id: str
tag_name: Optional[str] = None
tag_id: Optional[int] = None
tag_name: str | None = None
tag_id: int | None = None
class TagSnapshotResponseSchema(Schema):
@@ -541,10 +607,10 @@ class TagSnapshotResponseSchema(Schema):
def search_tags(
request: HttpRequest,
q: str = "",
sort: str = 'created_desc',
created_by: str = '',
year: str = '',
has_snapshots: str = 'all',
sort: str = "created_desc",
created_by: str = "",
year: str = "",
has_snapshots: str = "all",
):
"""Return detailed tag cards for admin/live-search UIs."""
normalized_sort = normalize_tag_sort(sort)
@@ -552,7 +618,7 @@ def search_tags(
normalized_year = normalize_created_year_filter(year)
normalized_has_snapshots = normalize_has_snapshots_filter(has_snapshots)
return {
'tags': build_tag_cards(
"tags": build_tag_cards(
query=q,
request=request,
sort=normalized_sort,
@@ -560,28 +626,28 @@ def search_tags(
year=normalized_year,
has_snapshots=normalized_has_snapshots,
),
'sort': normalized_sort,
'created_by': normalized_created_by,
'year': normalized_year,
'has_snapshots': normalized_has_snapshots,
"sort": normalized_sort,
"created_by": normalized_created_by,
"year": normalized_year,
"has_snapshots": normalized_has_snapshots,
}
def _public_tag_listing_enabled() -> bool:
    """Return True if unauthenticated users may browse the public tag listing.

    An explicit PUBLIC_SNAPSHOTS_LIST setting wins when set; otherwise fall
    back to PUBLIC_INDEX (defaulting to the server config's value).
    """
    explicit = getattr(settings, "PUBLIC_SNAPSHOTS_LIST", None)
    if explicit is not None:
        return bool(explicit)
    return bool(getattr(settings, "PUBLIC_INDEX", SERVER_CONFIG.PUBLIC_INDEX))
def _request_has_tag_autocomplete_access(request: HttpRequest) -> bool:
user = getattr(request, 'user', None)
if getattr(user, 'is_authenticated', False):
user = getattr(request, "user", None)
if getattr(user, "is_authenticated", False):
return True
token = request.GET.get('api_key') or request.headers.get('X-ArchiveBox-API-Key')
auth_header = request.headers.get('Authorization', '')
if not token and auth_header.lower().startswith('bearer '):
token = request.GET.get("api_key") or request.headers.get("X-ArchiveBox-API-Key")
auth_header = request.headers.get("Authorization", "")
if not token and auth_header.lower().startswith("bearer "):
token = auth_header.split(None, 1)[1].strip()
if token and auth_using_token(token=token, request=request):
@@ -594,12 +660,12 @@ def _request_has_tag_autocomplete_access(request: HttpRequest) -> bool:
def tags_autocomplete(request: HttpRequest, q: str = ""):
    """Return tags matching the query for autocomplete.

    Requires an authenticated session or a valid API token.
    Returns up to 50 tags for an empty query, otherwise up to 20.
    """
    if not _request_has_tag_autocomplete_access(request):
        raise HttpError(401, "Authentication required")
    limit = 50 if not q else 20
    matches = get_matching_tags(q)[:limit]
    return {
        "tags": [
            {
                "id": tag.pk,
                "name": tag.name,
                "slug": tag.slug,
                "num_snapshots": getattr(tag, "num_snapshots", 0),
            }
            for tag in matches
        ],
    }
@@ -615,10 +681,10 @@ def tags_create(request: HttpRequest, data: TagCreateSchema):
raise HttpError(400, str(err)) from err
return {
'success': True,
'tag_id': tag.pk,
'tag_name': tag.name,
'created': created,
"success": True,
"tag_id": tag.pk,
"tag_name": tag.name,
"created": created,
}
@@ -627,15 +693,15 @@ def rename_tag(request: HttpRequest, tag_id: int, data: TagUpdateSchema):
try:
tag = rename_tag_record(get_tag_by_ref(tag_id), data.name)
except Tag.DoesNotExist as err:
raise HttpError(404, 'Tag not found') from err
raise HttpError(404, "Tag not found") from err
except ValueError as err:
raise HttpError(400, str(err)) from err
return {
'success': True,
'tag_id': tag.pk,
'tag_name': tag.name,
'slug': tag.slug,
"success": True,
"tag_id": tag.pk,
"tag_name": tag.name,
"slug": tag.slug,
}
@@ -644,13 +710,13 @@ def delete_tag(request: HttpRequest, tag_id: int):
try:
tag = get_tag_by_ref(tag_id)
except Tag.DoesNotExist as err:
raise HttpError(404, 'Tag not found') from err
raise HttpError(404, "Tag not found") from err
deleted_count, _ = delete_tag_record(tag)
return {
'success': True,
'tag_id': int(tag_id),
'deleted_count': deleted_count,
"success": True,
"tag_id": int(tag_id),
"deleted_count": deleted_count,
}
@@ -659,10 +725,10 @@ def tag_urls_export(request: HttpRequest, tag_id: int):
try:
tag = get_tag_by_ref(tag_id)
except Tag.DoesNotExist as err:
raise HttpError(404, 'Tag not found') from err
raise HttpError(404, "Tag not found") from err
response = HttpResponse(export_tag_urls(tag), content_type='text/plain; charset=utf-8')
response['Content-Disposition'] = f'attachment; filename="tag-{tag.slug}-urls.txt"'
response = HttpResponse(export_tag_urls(tag), content_type="text/plain; charset=utf-8")
response["Content-Disposition"] = f'attachment; filename="tag-{tag.slug}-urls.txt"'
return response
@@ -671,10 +737,10 @@ def tag_snapshots_export(request: HttpRequest, tag_id: int):
try:
tag = get_tag_by_ref(tag_id)
except Tag.DoesNotExist as err:
raise HttpError(404, 'Tag not found') from err
raise HttpError(404, "Tag not found") from err
response = HttpResponse(export_tag_snapshots_jsonl(tag), content_type='application/x-ndjson; charset=utf-8')
response['Content-Disposition'] = f'attachment; filename="tag-{tag.slug}-snapshots.jsonl"'
response = HttpResponse(export_tag_snapshots_jsonl(tag), content_type="application/x-ndjson; charset=utf-8")
response["Content-Disposition"] = f'attachment; filename="tag-{tag.slug}-snapshots.jsonl"'
return response
@@ -684,16 +750,16 @@ def tags_add_to_snapshot(request: HttpRequest, data: TagSnapshotRequestSchema):
# Get the snapshot
try:
snapshot = Snapshot.objects.get(
Q(id__startswith=data.snapshot_id) | Q(timestamp__startswith=data.snapshot_id)
Q(id__startswith=data.snapshot_id) | Q(timestamp__startswith=data.snapshot_id),
)
except Snapshot.DoesNotExist:
raise HttpError(404, 'Snapshot not found')
raise HttpError(404, "Snapshot not found")
except Snapshot.MultipleObjectsReturned:
snapshot = Snapshot.objects.filter(
Q(id__startswith=data.snapshot_id) | Q(timestamp__startswith=data.snapshot_id)
Q(id__startswith=data.snapshot_id) | Q(timestamp__startswith=data.snapshot_id),
).first()
if snapshot is None:
raise HttpError(404, 'Snapshot not found')
raise HttpError(404, "Snapshot not found")
# Get or create the tag
if data.tag_name:
@@ -708,17 +774,17 @@ def tags_add_to_snapshot(request: HttpRequest, data: TagSnapshotRequestSchema):
try:
tag = get_tag_by_ref(data.tag_id)
except Tag.DoesNotExist:
raise HttpError(404, 'Tag not found')
raise HttpError(404, "Tag not found")
else:
raise HttpError(400, 'Either tag_name or tag_id is required')
raise HttpError(400, "Either tag_name or tag_id is required")
# Add the tag to the snapshot
snapshot.tags.add(tag.pk)
return {
'success': True,
'tag_id': tag.pk,
'tag_name': tag.name,
"success": True,
"tag_id": tag.pk,
"tag_name": tag.name,
}
@@ -728,36 +794,36 @@ def tags_remove_from_snapshot(request: HttpRequest, data: TagSnapshotRequestSche
# Get the snapshot
try:
snapshot = Snapshot.objects.get(
Q(id__startswith=data.snapshot_id) | Q(timestamp__startswith=data.snapshot_id)
Q(id__startswith=data.snapshot_id) | Q(timestamp__startswith=data.snapshot_id),
)
except Snapshot.DoesNotExist:
raise HttpError(404, 'Snapshot not found')
raise HttpError(404, "Snapshot not found")
except Snapshot.MultipleObjectsReturned:
snapshot = Snapshot.objects.filter(
Q(id__startswith=data.snapshot_id) | Q(timestamp__startswith=data.snapshot_id)
Q(id__startswith=data.snapshot_id) | Q(timestamp__startswith=data.snapshot_id),
).first()
if snapshot is None:
raise HttpError(404, 'Snapshot not found')
raise HttpError(404, "Snapshot not found")
# Get the tag
if data.tag_id:
try:
tag = Tag.objects.get(pk=data.tag_id)
except Tag.DoesNotExist:
raise HttpError(404, 'Tag not found')
raise HttpError(404, "Tag not found")
elif data.tag_name:
try:
tag = Tag.objects.get(name__iexact=data.tag_name.strip())
except Tag.DoesNotExist:
raise HttpError(404, 'Tag not found')
raise HttpError(404, "Tag not found")
else:
raise HttpError(400, 'Either tag_name or tag_id is required')
raise HttpError(400, "Either tag_name or tag_id is required")
# Remove the tag from the snapshot
snapshot.tags.remove(tag.pk)
return {
'success': True,
'tag_id': tag.pk,
'tag_name': tag.name,
"success": True,
"tag_id": tag.pk,
"tag_name": tag.name,
}

View File

@@ -1,7 +1,6 @@
__package__ = 'archivebox.api'
__package__ = "archivebox.api"
from uuid import UUID
from typing import List, Optional
from datetime import datetime
from django.http import HttpRequest
from django.utils import timezone
@@ -17,11 +16,11 @@ from archivebox.crawls.models import Crawl
from .auth import API_AUTH_METHODS
router = Router(tags=['Crawl Models'], auth=API_AUTH_METHODS)
router = Router(tags=["Crawl Models"], auth=API_AUTH_METHODS)
class CrawlSchema(Schema):
TYPE: str = 'crawls.models.Crawl'
TYPE: str = "crawls.models.Crawl"
id: UUID
@@ -35,6 +34,8 @@ class CrawlSchema(Schema):
urls: str
max_depth: int
max_urls: int
max_size: int
tags_str: str
config: dict
@@ -48,12 +49,12 @@ class CrawlSchema(Schema):
def resolve_created_by_username(obj):
user_model = get_user_model()
user = user_model.objects.get(id=obj.created_by_id)
username = getattr(user, 'username', None)
username = getattr(user, "username", None)
return username if isinstance(username, str) else str(user)
@staticmethod
def resolve_snapshots(obj, context):
if bool(getattr(context['request'], 'with_snapshots', False)):
if bool(getattr(context["request"], "with_snapshots", False)):
return obj.snapshot_set.all().distinct()
return Snapshot.objects.none()
@@ -61,17 +62,19 @@ class CrawlSchema(Schema):
class CrawlUpdateSchema(Schema):
    """Fields a PATCH /crawl/{id} request may update (all optional)."""

    status: str | None = None
    retry_at: datetime | None = None
    tags: list[str] | None = None
    tags_str: str | None = None
class CrawlCreateSchema(Schema):
    """Request body for creating a new Crawl."""

    urls: list[str]
    max_depth: int = 0
    max_urls: int = 0
    max_size: int = 0
    tags: list[str] | None = None
    tags_str: str = ""
    label: str = ""
    notes: str = ""
    # NOTE: mutable class-level default is safe here — pydantic/ninja Schema
    # deep-copies field defaults per instance.
    config: dict = {}
@@ -82,13 +85,13 @@ class CrawlDeleteResponseSchema(Schema):
deleted_snapshots: int
def normalize_tag_list(tags: Optional[List[str]] = None, tags_str: str = '') -> List[str]:
def normalize_tag_list(tags: list[str] | None = None, tags_str: str = "") -> list[str]:
if tags is not None:
return [tag.strip() for tag in tags if tag and tag.strip()]
return [tag.strip() for tag in tags_str.split(',') if tag.strip()]
return [tag.strip() for tag in tags_str.split(",") if tag.strip()]
@router.get("/crawls", response=List[CrawlSchema], url_name="get_crawls")
@router.get("/crawls", response=list[CrawlSchema], url_name="get_crawls")
def get_crawls(request: HttpRequest):
return Crawl.objects.all().distinct()
@@ -97,15 +100,21 @@ def get_crawls(request: HttpRequest):
def create_crawl(request: HttpRequest, data: CrawlCreateSchema):
urls = [url.strip() for url in data.urls if url and url.strip()]
if not urls:
raise HttpError(400, 'At least one URL is required')
raise HttpError(400, "At least one URL is required")
if data.max_depth not in (0, 1, 2, 3, 4):
raise HttpError(400, 'max_depth must be between 0 and 4')
raise HttpError(400, "max_depth must be between 0 and 4")
if data.max_urls < 0:
raise HttpError(400, "max_urls must be >= 0")
if data.max_size < 0:
raise HttpError(400, "max_size must be >= 0")
tags = normalize_tag_list(data.tags, data.tags_str)
crawl = Crawl.objects.create(
urls='\n'.join(urls),
urls="\n".join(urls),
max_depth=data.max_depth,
tags_str=','.join(tags),
max_urls=data.max_urls,
max_size=data.max_size,
tags_str=",".join(tags),
label=data.label,
notes=data.notes,
config=data.config,
@@ -116,25 +125,26 @@ def create_crawl(request: HttpRequest, data: CrawlCreateSchema):
crawl.create_snapshots_from_urls()
return crawl
@router.get("/crawl/{crawl_id}", response=CrawlSchema | str, url_name="get_crawl")
def get_crawl(request: HttpRequest, crawl_id: str, as_rss: bool=False, with_snapshots: bool=False, with_archiveresults: bool=False):
def get_crawl(request: HttpRequest, crawl_id: str, as_rss: bool = False, with_snapshots: bool = False, with_archiveresults: bool = False):
"""Get a specific Crawl by id."""
setattr(request, 'with_snapshots', with_snapshots)
setattr(request, 'with_archiveresults', with_archiveresults)
setattr(request, "with_snapshots", with_snapshots)
setattr(request, "with_archiveresults", with_archiveresults)
crawl = Crawl.objects.get(id__icontains=crawl_id)
if crawl and as_rss:
# return snapshots as XML rss feed
urls = [
{'url': snapshot.url, 'title': snapshot.title, 'bookmarked_at': snapshot.bookmarked_at, 'tags': snapshot.tags_str}
{"url": snapshot.url, "title": snapshot.title, "bookmarked_at": snapshot.bookmarked_at, "tags": snapshot.tags_str}
for snapshot in crawl.snapshot_set.all()
]
xml = '<rss version="2.0"><channel>'
for url in urls:
xml += f'<item><url>{url["url"]}</url><title>{url["title"]}</title><bookmarked_at>{url["bookmarked_at"]}</bookmarked_at><tags>{url["tags"]}</tags></item>'
xml += '</channel></rss>'
xml += f"<item><url>{url['url']}</url><title>{url['title']}</title><bookmarked_at>{url['bookmarked_at']}</bookmarked_at><tags>{url['tags']}</tags></item>"
xml += "</channel></rss>"
return xml
return crawl
@@ -143,29 +153,29 @@ def patch_crawl(request: HttpRequest, crawl_id: str, data: CrawlUpdateSchema):
"""Update a crawl (e.g., set status=sealed to cancel queued work)."""
crawl = Crawl.objects.get(id__icontains=crawl_id)
payload = data.dict(exclude_unset=True)
update_fields = ['modified_at']
update_fields = ["modified_at"]
tags = payload.pop('tags', None)
tags_str = payload.pop('tags_str', None)
tags = payload.pop("tags", None)
tags_str = payload.pop("tags_str", None)
if tags is not None or tags_str is not None:
crawl.tags_str = ','.join(normalize_tag_list(tags, tags_str or ''))
update_fields.append('tags_str')
crawl.tags_str = ",".join(normalize_tag_list(tags, tags_str or ""))
update_fields.append("tags_str")
if 'status' in payload:
if payload['status'] not in Crawl.StatusChoices.values:
raise HttpError(400, f'Invalid status: {payload["status"]}')
crawl.status = payload['status']
if crawl.status == Crawl.StatusChoices.SEALED and 'retry_at' not in payload:
if "status" in payload:
if payload["status"] not in Crawl.StatusChoices.values:
raise HttpError(400, f"Invalid status: {payload['status']}")
crawl.status = payload["status"]
if crawl.status == Crawl.StatusChoices.SEALED and "retry_at" not in payload:
crawl.retry_at = None
update_fields.append('status')
update_fields.append("status")
if 'retry_at' in payload:
crawl.retry_at = payload['retry_at']
update_fields.append('retry_at')
if "retry_at" in payload:
crawl.retry_at = payload["retry_at"]
update_fields.append("retry_at")
crawl.save(update_fields=update_fields)
if payload.get('status') == Crawl.StatusChoices.SEALED:
if payload.get("status") == Crawl.StatusChoices.SEALED:
Snapshot.objects.filter(
crawl=crawl,
status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED],
@@ -184,8 +194,8 @@ def delete_crawl(request: HttpRequest, crawl_id: str):
snapshot_count = crawl.snapshot_set.count()
deleted_count, _ = crawl.delete()
return {
'success': True,
'crawl_id': crawl_id_str,
'deleted_count': deleted_count,
'deleted_snapshots': snapshot_count,
"success": True,
"crawl_id": crawl_id_str,
"deleted_count": deleted_count,
"deleted_snapshots": snapshot_count,
}

View File

@@ -1,7 +1,7 @@
__package__ = 'archivebox.api'
__package__ = "archivebox.api"
from uuid import UUID
from typing import Annotated, List, Optional
from typing import Annotated
from datetime import datetime
from django.http import HttpRequest
@@ -12,16 +12,18 @@ from ninja.pagination import paginate
from archivebox.api.v1_core import CustomPagination
router = Router(tags=['Machine and Dependencies'])
router = Router(tags=["Machine and Dependencies"])
# ============================================================================
# Machine Schemas
# ============================================================================
class MachineSchema(Schema):
"""Schema for Machine model."""
TYPE: str = 'machine.Machine'
TYPE: str = "machine.Machine"
id: UUID
created_at: datetime
modified_at: datetime
@@ -43,22 +45,24 @@ class MachineSchema(Schema):
class MachineFilterSchema(FilterSchema):
id: Annotated[Optional[str], FilterLookup('id__startswith')] = None
hostname: Annotated[Optional[str], FilterLookup('hostname__icontains')] = None
os_platform: Annotated[Optional[str], FilterLookup('os_platform__icontains')] = None
os_arch: Annotated[Optional[str], FilterLookup('os_arch')] = None
hw_in_docker: Annotated[Optional[bool], FilterLookup('hw_in_docker')] = None
hw_in_vm: Annotated[Optional[bool], FilterLookup('hw_in_vm')] = None
bin_providers: Annotated[Optional[str], FilterLookup('bin_providers__icontains')] = None
id: Annotated[str | None, FilterLookup("id__startswith")] = None
hostname: Annotated[str | None, FilterLookup("hostname__icontains")] = None
os_platform: Annotated[str | None, FilterLookup("os_platform__icontains")] = None
os_arch: Annotated[str | None, FilterLookup("os_arch")] = None
hw_in_docker: Annotated[bool | None, FilterLookup("hw_in_docker")] = None
hw_in_vm: Annotated[bool | None, FilterLookup("hw_in_vm")] = None
bin_providers: Annotated[str | None, FilterLookup("bin_providers__icontains")] = None
# ============================================================================
# Binary Schemas
# ============================================================================
class BinarySchema(Schema):
"""Schema for Binary model."""
TYPE: str = 'machine.Binary'
TYPE: str = "machine.Binary"
id: UUID
created_at: datetime
modified_at: datetime
@@ -85,23 +89,25 @@ class BinarySchema(Schema):
class BinaryFilterSchema(FilterSchema):
id: Annotated[Optional[str], FilterLookup('id__startswith')] = None
name: Annotated[Optional[str], FilterLookup('name__icontains')] = None
binprovider: Annotated[Optional[str], FilterLookup('binprovider')] = None
status: Annotated[Optional[str], FilterLookup('status')] = None
machine_id: Annotated[Optional[str], FilterLookup('machine_id__startswith')] = None
version: Annotated[Optional[str], FilterLookup('version__icontains')] = None
id: Annotated[str | None, FilterLookup("id__startswith")] = None
name: Annotated[str | None, FilterLookup("name__icontains")] = None
binprovider: Annotated[str | None, FilterLookup("binprovider")] = None
status: Annotated[str | None, FilterLookup("status")] = None
machine_id: Annotated[str | None, FilterLookup("machine_id__startswith")] = None
version: Annotated[str | None, FilterLookup("version__icontains")] = None
# ============================================================================
# Machine Endpoints
# ============================================================================
@router.get("/machines", response=List[MachineSchema], url_name="get_machines")
@router.get("/machines", response=list[MachineSchema], url_name="get_machines")
@paginate(CustomPagination)
def get_machines(request: HttpRequest, filters: Query[MachineFilterSchema]):
"""List all machines."""
from archivebox.machine.models import Machine
return filters.filter(Machine.objects.all()).distinct()
@@ -109,6 +115,7 @@ def get_machines(request: HttpRequest, filters: Query[MachineFilterSchema]):
def get_current_machine(request: HttpRequest):
"""Get the current machine."""
from archivebox.machine.models import Machine
return Machine.current()
@@ -117,6 +124,7 @@ def get_machine(request: HttpRequest, machine_id: str):
"""Get a specific machine by ID."""
from archivebox.machine.models import Machine
from django.db.models import Q
return Machine.objects.get(Q(id__startswith=machine_id) | Q(hostname__iexact=machine_id))
@@ -127,23 +135,27 @@ def get_machine(request: HttpRequest, machine_id: str):
# Binary Endpoints
# ============================================================================
@router.get("/binaries", response=List[BinarySchema], url_name="get_binaries")
@router.get("/binaries", response=list[BinarySchema], url_name="get_binaries")
@paginate(CustomPagination)
def get_binaries(request: HttpRequest, filters: Query[BinaryFilterSchema]):
"""List all binaries."""
from archivebox.machine.models import Binary
return filters.filter(Binary.objects.all().select_related('machine')).distinct()
return filters.filter(Binary.objects.all().select_related("machine")).distinct()
@router.get("/binary/{binary_id}", response=BinarySchema, url_name="get_binary")
def get_binary(request: HttpRequest, binary_id: str):
"""Get a specific binary by ID."""
from archivebox.machine.models import Binary
return Binary.objects.select_related('machine').get(id__startswith=binary_id)
return Binary.objects.select_related("machine").get(id__startswith=binary_id)
@router.get("/binary/by-name/{name}", response=List[BinarySchema], url_name="get_binaries_by_name")
@router.get("/binary/by-name/{name}", response=list[BinarySchema], url_name="get_binaries_by_name")
def get_binaries_by_name(request: HttpRequest, name: str):
"""Get all binaries with the given name."""
from archivebox.machine.models import Binary
return list(Binary.objects.filter(name__iexact=name).select_related('machine'))
return list(Binary.objects.filter(name__iexact=name).select_related("machine"))

View File

@@ -1 +1 @@
__package__ = 'archivebox.base_models'
__package__ = "archivebox.base_models"

View File

@@ -1,6 +1,6 @@
"""Base admin classes for models using UUIDv7."""
__package__ = 'archivebox.base_models'
__package__ = "archivebox.base_models"
import json
from collections.abc import Mapping
@@ -32,11 +32,12 @@ class KeyValueWidget(forms.Widget):
with + and - buttons to add/remove rows.
Includes autocomplete for available config keys from the plugin system.
"""
template_name = "" # We render manually
class Media:
css = {
'all': []
"all": [],
}
js = []
@@ -44,17 +45,18 @@ class KeyValueWidget(forms.Widget):
"""Get available config options from plugins."""
try:
from archivebox.hooks import discover_plugin_configs
plugin_configs = discover_plugin_configs()
options: dict[str, ConfigOption] = {}
for plugin_name, schema in plugin_configs.items():
for key, prop in schema.get('properties', {}).items():
for key, prop in schema.get("properties", {}).items():
option: ConfigOption = {
'plugin': plugin_name,
'type': prop.get('type', 'string'),
'default': prop.get('default', ''),
'description': prop.get('description', ''),
"plugin": plugin_name,
"type": prop.get("type", "string"),
"default": prop.get("default", ""),
"description": prop.get("description", ""),
}
for schema_key in ('enum', 'pattern', 'minimum', 'maximum'):
for schema_key in ("enum", "pattern", "minimum", "maximum"):
if schema_key in prop:
option[schema_key] = prop[schema_key]
options[key] = option
@@ -85,11 +87,11 @@ class KeyValueWidget(forms.Widget):
) -> SafeString:
data = self._parse_value(value)
widget_id = attrs.get('id', name) if attrs else name
widget_id = attrs.get("id", name) if attrs else name
config_options = self._get_config_options()
# Build datalist options
datalist_options = '\n'.join(
datalist_options = "\n".join(
f'<option value="{self._escape(key)}">{self._escape(opt["description"][:60] or opt["type"])}</option>'
for key, opt in sorted(config_options.items())
)
@@ -111,7 +113,7 @@ class KeyValueWidget(forms.Widget):
html += self._render_row(widget_id, key, val_str)
# Always add one empty row for new entries
html += self._render_row(widget_id, '', '')
html += self._render_row(widget_id, "", "")
html += f'''
</div>
@@ -669,8 +671,8 @@ class KeyValueWidget(forms.Widget):
def _escape(self, s: object) -> str:
"""Escape HTML special chars in attribute values."""
if not s:
return ''
return str(s).replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;')
return ""
return str(s).replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;").replace('"', "&quot;")
def value_from_datadict(
self,
@@ -678,8 +680,8 @@ class KeyValueWidget(forms.Widget):
files: object,
name: str,
) -> str:
value = data.get(name, '{}')
return value if isinstance(value, str) else '{}'
value = data.get(name, "{}")
return value if isinstance(value, str) else "{}"
class ConfigEditorMixin(admin.ModelAdmin):
@@ -696,14 +698,20 @@ class ConfigEditorMixin(admin.ModelAdmin):
**kwargs: object,
) -> forms.Field | None:
"""Use KeyValueWidget for the config JSON field."""
if db_field.name == 'config':
kwargs['widget'] = KeyValueWidget()
if db_field.name == "config":
kwargs["widget"] = KeyValueWidget()
return super().formfield_for_dbfield(db_field, request, **kwargs)
class BaseModelAdmin(DjangoObjectActions, admin.ModelAdmin):
list_display = ('id', 'created_at', 'created_by')
readonly_fields = ('id', 'created_at', 'modified_at')
list_display = ("id", "created_at", "created_by")
readonly_fields = ("id", "created_at", "modified_at")
show_search_mode_selector = False
def get_default_search_mode(self) -> str:
# The shared changelist template always asks every admin for a default
# search mode, even when the search-mode toggle is hidden.
return "meta"
def get_form(
self,
@@ -713,6 +721,6 @@ class BaseModelAdmin(DjangoObjectActions, admin.ModelAdmin):
**kwargs: object,
):
form = super().get_form(request, obj, change=change, **kwargs)
if 'created_by' in form.base_fields:
form.base_fields['created_by'].initial = request.user
if "created_by" in form.base_fields:
form.base_fields["created_by"].initial = request.user
return form

View File

@@ -1,6 +1,6 @@
"""Base models using UUIDv7 for all id fields."""
__package__ = 'archivebox.base_models'
__package__ = "archivebox.base_models"
from archivebox.uuid_compat import uuid7
from pathlib import Path
@@ -15,22 +15,22 @@ from django.conf import settings
from django_stubs_ext.db.models import TypedModelMeta
def get_or_create_system_user_pk(username="system"):
    """Return the pk of the user to attribute system-created records to.

    If there is exactly one superuser, reuse it for all system operations;
    otherwise get or create a dedicated system superuser with an unusable
    password ("!").
    """
    User = get_user_model()

    superusers = User.objects.filter(is_superuser=True)
    if superusers.count() == 1:
        return superusers.values_list("pk", flat=True)[0]

    user, _ = User.objects.get_or_create(
        username=username,
        defaults={"is_staff": True, "is_superuser": True, "email": "", "password": "!"},
    )
    return user.pk
class AutoDateTimeField(models.DateTimeField):
"""DateTimeField that automatically updates on save (legacy compatibility)."""
def pre_save(self, model_instance, add):
if add or not getattr(model_instance, self.attname):
value = timezone.now()
@@ -43,13 +43,19 @@ class ModelWithUUID(models.Model):
id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
created_at = models.DateTimeField(default=timezone.now, db_index=True)
modified_at = models.DateTimeField(auto_now=True)
created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False, db_index=True)
created_by = models.ForeignKey(
settings.AUTH_USER_MODEL,
on_delete=models.CASCADE,
default=get_or_create_system_user_pk,
null=False,
db_index=True,
)
class Meta(TypedModelMeta):
abstract = True
def __str__(self) -> str:
return f'[{self.id}] {self.__class__.__name__}'
return f"[{self.id}] {self.__class__.__name__}"
@property
def admin_change_url(self) -> str:
@@ -57,17 +63,17 @@ class ModelWithUUID(models.Model):
@property
def api_url(self) -> str:
return str(reverse_lazy('api-1:get_any', args=[self.id]))
return str(reverse_lazy("api-1:get_any", args=[self.id]))
@property
def api_docs_url(self) -> str:
return f'/api/v1/docs#/{self._meta.app_label.title()}%20Models/api_v1_{self._meta.app_label}_get_{self._meta.db_table}'
return f"/api/v1/docs#/{self._meta.app_label.title()}%20Models/api_v1_{self._meta.app_label}_get_{self._meta.db_table}"
class ModelWithNotes(models.Model):
"""Mixin for models with a notes field."""
notes = models.TextField(blank=True, null=False, default='')
notes = models.TextField(blank=True, null=False, default="")
class Meta(TypedModelMeta):
abstract = True
@@ -75,6 +81,7 @@ class ModelWithNotes(models.Model):
class ModelWithHealthStats(models.Model):
"""Mixin for models with health tracking fields."""
num_uses_failed = models.PositiveIntegerField(default=0)
num_uses_succeeded = models.PositiveIntegerField(default=0)
@@ -88,12 +95,13 @@ class ModelWithHealthStats(models.Model):
def increment_health_stats(self, success: bool):
"""Atomically increment success or failure counter using F() expression."""
field = 'num_uses_succeeded' if success else 'num_uses_failed'
field = "num_uses_succeeded" if success else "num_uses_failed"
type(self).objects.filter(pk=self.pk).update(**{field: F(field) + 1})
class ModelWithConfig(models.Model):
"""Mixin for models with a JSON config field."""
config = models.JSONField(default=dict, null=True, blank=True, editable=True)
class Meta(TypedModelMeta):
@@ -111,7 +119,7 @@ class ModelWithOutputDir(ModelWithUUID):
@property
def output_dir_parent(self) -> str:
return f'{self._meta.model_name}s'
return f"{self._meta.model_name}s"
@property
def output_dir_name(self) -> str:
@@ -119,7 +127,7 @@ class ModelWithOutputDir(ModelWithUUID):
@property
def output_dir_str(self) -> str:
return f'{self.output_dir_parent}/{self.output_dir_name}'
return f"{self.output_dir_parent}/{self.output_dir_name}"
@property
def output_dir(self) -> Path:

View File

@@ -1,5 +1,5 @@
__package__ = 'archivebox.cli'
__command__ = 'archivebox'
__package__ = "archivebox.cli"
__command__ = "archivebox"
import os
import sys
from importlib import import_module
@@ -10,55 +10,55 @@ from rich import print
from archivebox.config.version import VERSION
if '--debug' in sys.argv:
os.environ['DEBUG'] = 'True'
sys.argv.remove('--debug')
if "--debug" in sys.argv:
os.environ["DEBUG"] = "True"
sys.argv.remove("--debug")
class ArchiveBoxGroup(click.Group):
"""lazy loading click group for archivebox commands"""
meta_commands = {
'help': 'archivebox.cli.archivebox_help.main',
'version': 'archivebox.cli.archivebox_version.main',
'mcp': 'archivebox.cli.archivebox_mcp.main',
"help": "archivebox.cli.archivebox_help.main",
"version": "archivebox.cli.archivebox_version.main",
"mcp": "archivebox.cli.archivebox_mcp.main",
}
setup_commands = {
'init': 'archivebox.cli.archivebox_init.main',
'install': 'archivebox.cli.archivebox_install.main',
"init": "archivebox.cli.archivebox_init.main",
"install": "archivebox.cli.archivebox_install.main",
}
# Model commands (CRUD operations via subcommands)
model_commands = {
'crawl': 'archivebox.cli.archivebox_crawl.main',
'snapshot': 'archivebox.cli.archivebox_snapshot.main',
'archiveresult': 'archivebox.cli.archivebox_archiveresult.main',
'tag': 'archivebox.cli.archivebox_tag.main',
'binary': 'archivebox.cli.archivebox_binary.main',
'process': 'archivebox.cli.archivebox_process.main',
'machine': 'archivebox.cli.archivebox_machine.main',
'persona': 'archivebox.cli.archivebox_persona.main',
"crawl": "archivebox.cli.archivebox_crawl.main",
"snapshot": "archivebox.cli.archivebox_snapshot.main",
"archiveresult": "archivebox.cli.archivebox_archiveresult.main",
"tag": "archivebox.cli.archivebox_tag.main",
"binary": "archivebox.cli.archivebox_binary.main",
"process": "archivebox.cli.archivebox_process.main",
"machine": "archivebox.cli.archivebox_machine.main",
"persona": "archivebox.cli.archivebox_persona.main",
}
archive_commands = {
# High-level commands
'add': 'archivebox.cli.archivebox_add.main',
'extract': 'archivebox.cli.archivebox_extract.main',
'list': 'archivebox.cli.archivebox_list.main',
'remove': 'archivebox.cli.archivebox_remove.main',
'run': 'archivebox.cli.archivebox_run.main',
'update': 'archivebox.cli.archivebox_update.main',
'status': 'archivebox.cli.archivebox_status.main',
'search': 'archivebox.cli.archivebox_search.main',
'config': 'archivebox.cli.archivebox_config.main',
'schedule': 'archivebox.cli.archivebox_schedule.main',
'server': 'archivebox.cli.archivebox_server.main',
'shell': 'archivebox.cli.archivebox_shell.main',
'manage': 'archivebox.cli.archivebox_manage.main',
"add": "archivebox.cli.archivebox_add.main",
"extract": "archivebox.cli.archivebox_extract.main",
"list": "archivebox.cli.archivebox_list.main",
"remove": "archivebox.cli.archivebox_remove.main",
"run": "archivebox.cli.archivebox_run.main",
"update": "archivebox.cli.archivebox_update.main",
"status": "archivebox.cli.archivebox_status.main",
"search": "archivebox.cli.archivebox_search.main",
"config": "archivebox.cli.archivebox_config.main",
"schedule": "archivebox.cli.archivebox_schedule.main",
"server": "archivebox.cli.archivebox_server.main",
"shell": "archivebox.cli.archivebox_shell.main",
"manage": "archivebox.cli.archivebox_manage.main",
# Introspection commands
'pluginmap': 'archivebox.cli.archivebox_pluginmap.main',
"pluginmap": "archivebox.cli.archivebox_pluginmap.main",
}
legacy_model_commands = {
'crawl': 'archivebox.cli.archivebox_crawl_compat.main',
'snapshot': 'archivebox.cli.archivebox_snapshot_compat.main',
"crawl": "archivebox.cli.archivebox_crawl_compat.main",
"snapshot": "archivebox.cli.archivebox_snapshot_compat.main",
}
all_subcommands = {
**meta_commands,
@@ -67,15 +67,15 @@ class ArchiveBoxGroup(click.Group):
**archive_commands,
}
renamed_commands = {
'setup': 'install',
'import': 'add',
'archive': 'add',
"setup": "install",
"import": "add",
"archive": "add",
}
legacy_model_subcommands = {
'crawl': {'create', 'list', 'update', 'delete'},
'snapshot': {'create', 'list', 'update', 'delete'},
"crawl": {"create", "list", "update", "delete"},
"snapshot": {"create", "list", "update", "delete"},
}
@classmethod
def get_canonical_name(cls, cmd_name):
return cls.renamed_commands.get(cmd_name, cmd_name)
@@ -90,23 +90,22 @@ class ArchiveBoxGroup(click.Group):
except ValueError:
return False
remaining_args = sys.argv[arg_idx + 1:]
remaining_args = sys.argv[arg_idx + 1 :]
if not remaining_args:
return False
first_arg = remaining_args[0]
if first_arg in ('-h', '--help'):
if first_arg in ("-h", "--help"):
return False
return first_arg not in cls.legacy_model_subcommands[cmd_name]
def get_command(self, ctx, cmd_name):
# handle renamed commands
if cmd_name in self.renamed_commands:
new_name = self.renamed_commands[cmd_name]
print(
f' [violet]Hint:[/violet] `archivebox {cmd_name}` has been renamed to `archivebox {new_name}`',
f" [violet]Hint:[/violet] `archivebox {cmd_name}` has been renamed to `archivebox {new_name}`",
file=sys.stderr,
)
cmd_name = new_name
@@ -114,11 +113,11 @@ class ArchiveBoxGroup(click.Group):
if self._should_use_legacy_model_command(cmd_name):
return self._lazy_load(self.legacy_model_commands[cmd_name])
# handle lazy loading of commands
if cmd_name in self.all_subcommands:
return self._lazy_load(cmd_name)
# fall-back to using click's default command lookup
return super().get_command(ctx, cmd_name)
@@ -127,72 +126,74 @@ class ArchiveBoxGroup(click.Group):
import_path = cls.all_subcommands.get(cmd_name_or_path)
if import_path is None:
import_path = cmd_name_or_path
modname, funcname = import_path.rsplit('.', 1)
modname, funcname = import_path.rsplit(".", 1)
# print(f'LAZY LOADING {import_path}')
mod = import_module(modname)
func = getattr(mod, funcname)
if not hasattr(func, '__doc__'):
raise ValueError(f'lazy loading of {import_path} failed - no docstring found on method')
if not hasattr(func, "__doc__"):
raise ValueError(f"lazy loading of {import_path} failed - no docstring found on method")
# if not isinstance(cmd, click.BaseCommand):
# raise ValueError(f'lazy loading of {import_path} failed - not a click command')
# raise ValueError(f'lazy loading of {import_path} failed - not a click command')
return func
@click.group(cls=ArchiveBoxGroup, invoke_without_command=True)
@click.option('--help', '-h', is_flag=True, help='Show help')
@click.version_option(VERSION, '-v', '--version', package_name='archivebox', message='%(version)s')
@click.option("--help", "-h", is_flag=True, help="Show help")
@click.version_option(VERSION, "-v", "--version", package_name="archivebox", message="%(version)s")
@click.pass_context
def cli(ctx, help=False):
"""ArchiveBox: The self-hosted internet archive"""
subcommand = ArchiveBoxGroup.get_canonical_name(ctx.invoked_subcommand)
# if --help is passed or no subcommand is given, show custom help message
if help or ctx.invoked_subcommand is None:
ctx.invoke(ctx.command.get_command(ctx, 'help'))
ctx.invoke(ctx.command.get_command(ctx, "help"))
# if the subcommand is in archive_commands or model_commands,
# then we need to set up the django environment and check that we're in a valid data folder
if subcommand in ArchiveBoxGroup.archive_commands or subcommand in ArchiveBoxGroup.model_commands:
# print('SETUP DJANGO AND CHECK DATA FOLDER')
try:
if subcommand == 'server':
run_in_debug = '--reload' in sys.argv or os.environ.get('DEBUG') in ('1', 'true', 'True', 'TRUE', 'yes')
if subcommand == "server":
run_in_debug = "--reload" in sys.argv or os.environ.get("DEBUG") in ("1", "true", "True", "TRUE", "yes")
if run_in_debug:
os.environ['ARCHIVEBOX_RUNSERVER'] = '1'
if '--reload' in sys.argv:
os.environ['ARCHIVEBOX_AUTORELOAD'] = '1'
os.environ["ARCHIVEBOX_RUNSERVER"] = "1"
if "--reload" in sys.argv:
os.environ["ARCHIVEBOX_AUTORELOAD"] = "1"
from archivebox.config.common import STORAGE_CONFIG
os.environ['ARCHIVEBOX_RUNSERVER_PIDFILE'] = str(STORAGE_CONFIG.TMP_DIR / 'runserver.pid')
os.environ["ARCHIVEBOX_RUNSERVER_PIDFILE"] = str(STORAGE_CONFIG.TMP_DIR / "runserver.pid")
from archivebox.config.django import setup_django
from archivebox.misc.checks import check_data_folder
setup_django()
check_data_folder()
except Exception as e:
print(f'[red][X] Error setting up Django or checking data folder: {e}[/red]', file=sys.stderr)
if subcommand not in ('manage', 'shell'): # not all management commands need django to be setup beforehand
print(f"[red][X] Error setting up Django or checking data folder: {e}[/red]", file=sys.stderr)
if subcommand not in ("manage", "shell"): # not all management commands need django to be setup beforehand
raise
def main(args=None, prog_name=None, stdin=None):
# show `docker run archivebox xyz` in help messages if running in docker
IN_DOCKER = os.environ.get('IN_DOCKER', False) in ('1', 'true', 'True', 'TRUE', 'yes')
IN_DOCKER = os.environ.get("IN_DOCKER", False) in ("1", "true", "True", "TRUE", "yes")
IS_TTY = sys.stdin.isatty()
prog_name = prog_name or (f'docker compose run{"" if IS_TTY else " -T"} archivebox' if IN_DOCKER else 'archivebox')
prog_name = prog_name or (f"docker compose run{'' if IS_TTY else ' -T'} archivebox" if IN_DOCKER else "archivebox")
# stdin param allows passing input data from caller (used by __main__.py)
# currently not used by click-based CLI, but kept for backwards compatibility
try:
cli(args=args, prog_name=prog_name)
except KeyboardInterrupt:
print('\n\n[red][X] Got CTRL+C. Exiting...[/red]')
print("\n\n[red][X] Got CTRL+C. Exiting...[/red]")
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -1,7 +1,7 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__command__ = 'archivebox add'
__package__ = "archivebox.cli"
__command__ = "archivebox add"
import sys
from pathlib import Path
@@ -14,6 +14,7 @@ from django.utils import timezone
from django.db.models import QuerySet
from archivebox.misc.util import enforce_types, docstring
from archivebox.misc.util import parse_filesize_to_bytes
from archivebox import CONSTANTS
from archivebox.config.common import ARCHIVING_CONFIG, SERVER_CONFIG
from archivebox.config.permissions import USER, HOSTNAME
@@ -29,34 +30,38 @@ def _collect_input_urls(args: tuple[str, ...]) -> list[str]:
urls: list[str] = []
for record in read_args_or_stdin(args):
url = record.get('url')
url = record.get("url")
if isinstance(url, str) and url:
urls.append(url)
urls_field = record.get('urls')
urls_field = record.get("urls")
if isinstance(urls_field, str):
for line in urls_field.splitlines():
line = line.strip()
if line and not line.startswith('#'):
if line and not line.startswith("#"):
urls.append(line)
return urls
@enforce_types
def add(urls: str | list[str],
depth: int | str=0,
tag: str='',
url_allowlist: str='',
url_denylist: str='',
parser: str="auto",
plugins: str="",
persona: str='Default',
overwrite: bool=False,
update: bool | None=None,
index_only: bool=False,
bg: bool=False,
created_by_id: int | None=None) -> tuple['Crawl', QuerySet['Snapshot']]:
def add(
urls: str | list[str],
depth: int | str = 0,
max_urls: int = 0,
max_size: int | str = 0,
tag: str = "",
url_allowlist: str = "",
url_denylist: str = "",
parser: str = "auto",
plugins: str = "",
persona: str = "Default",
overwrite: bool = False,
update: bool | None = None,
index_only: bool = False,
bg: bool = False,
created_by_id: int | None = None,
) -> tuple["Crawl", QuerySet["Snapshot"]]:
"""Add a new URL or list of URLs to your archive.
The flow is:
@@ -72,8 +77,15 @@ def add(urls: str | list[str],
from rich import print
depth = int(depth)
max_urls = int(max_urls or 0)
max_size = parse_filesize_to_bytes(max_size)
assert depth in (0, 1, 2, 3, 4), 'Depth must be 0-4'
if depth not in (0, 1, 2, 3, 4):
raise ValueError("Depth must be 0-4")
if max_urls < 0:
raise ValueError("max_urls must be >= 0")
if max_size < 0:
raise ValueError("max_size must be >= 0")
# import models once django is set up
from archivebox.core.models import Snapshot
@@ -91,47 +103,49 @@ def add(urls: str | list[str],
update = not ARCHIVING_CONFIG.ONLY_NEW
# 1. Save the provided URLs to sources/2024-11-05__23-59-59__cli_add.txt
sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__cli_add.txt'
sources_file = CONSTANTS.SOURCES_DIR / f"{timezone.now().strftime('%Y-%m-%d__%H-%M-%S')}__cli_add.txt"
sources_file.parent.mkdir(parents=True, exist_ok=True)
sources_file.write_text(urls if isinstance(urls, str) else '\n'.join(urls))
sources_file.write_text(urls if isinstance(urls, str) else "\n".join(urls))
# 2. Create a new Crawl with inline URLs
cli_args = [*sys.argv]
if cli_args[0].lower().endswith('archivebox'):
cli_args[0] = 'archivebox'
cmd_str = ' '.join(cli_args)
if cli_args[0].lower().endswith("archivebox"):
cli_args[0] = "archivebox"
cmd_str = " ".join(cli_args)
timestamp = timezone.now().strftime("%Y-%m-%d__%H-%M-%S")
# Read URLs directly into crawl
urls_content = sources_file.read_text()
persona_name = (persona or 'Default').strip() or 'Default'
plugins = plugins or str(get_config().get('PLUGINS') or '')
persona_name = (persona or "Default").strip() or "Default"
plugins = plugins or str(get_config().get("PLUGINS") or "")
persona_obj, _ = Persona.objects.get_or_create(name=persona_name)
persona_obj.ensure_dirs()
crawl = Crawl.objects.create(
urls=urls_content,
max_depth=depth,
max_urls=max_urls,
max_size=max_size,
tags_str=tag,
persona_id=persona_obj.id,
label=f'{USER}@{HOSTNAME} $ {cmd_str} [{timestamp}]',
label=f"{USER}@{HOSTNAME} $ {cmd_str} [{timestamp}]",
created_by_id=created_by_id,
config={
'ONLY_NEW': not update,
'INDEX_ONLY': index_only,
'OVERWRITE': overwrite,
'PLUGINS': plugins,
'DEFAULT_PERSONA': persona_name,
'PARSER': parser,
**({'URL_ALLOWLIST': url_allowlist} if url_allowlist else {}),
**({'URL_DENYLIST': url_denylist} if url_denylist else {}),
}
"ONLY_NEW": not update,
"INDEX_ONLY": index_only,
"OVERWRITE": overwrite,
"PLUGINS": plugins,
"DEFAULT_PERSONA": persona_name,
"PARSER": parser,
**({"URL_ALLOWLIST": url_allowlist} if url_allowlist else {}),
**({"URL_DENYLIST": url_denylist} if url_denylist else {}),
},
)
print(f'[green]\\[+] Created Crawl {crawl.id} with max_depth={depth}[/green]')
first_url = crawl.get_urls_list()[0] if crawl.get_urls_list() else ''
print(f' [dim]First URL: {first_url}[/dim]')
print(f"[green]\\[+] Created Crawl {crawl.id} with max_depth={depth}[/green]")
first_url = crawl.get_urls_list()[0] if crawl.get_urls_list() else ""
print(f" [dim]First URL: {first_url}[/dim]")
# 3. The CrawlMachine will create Snapshots from all URLs when started
# Parser extractors run on snapshots and discover more URLs
@@ -139,20 +153,21 @@ def add(urls: str | list[str],
if index_only:
# Just create the crawl but don't start processing
print('[yellow]\\[*] Index-only mode - crawl created but not started[/yellow]')
print("[yellow]\\[*] Index-only mode - crawl created but not started[/yellow]")
# Create snapshots for all URLs in the crawl
for url in crawl.get_urls_list():
snapshot, _ = Snapshot.objects.update_or_create(
crawl=crawl, url=url,
crawl=crawl,
url=url,
defaults={
'status': Snapshot.INITIAL_STATE,
'retry_at': timezone.now(),
'timestamp': str(timezone.now().timestamp()),
'depth': 0,
"status": Snapshot.INITIAL_STATE,
"retry_at": timezone.now(),
"timestamp": str(timezone.now().timestamp()),
"depth": 0,
},
)
if tag:
snapshot.save_tags(tag.split(','))
snapshot.save_tags(tag.split(","))
snapshot.ensure_crawl_symlink()
return crawl, crawl.snapshot_set.all()
@@ -168,10 +183,12 @@ def add(urls: str | list[str],
if bg:
# Background mode: just queue work and return (background runner via server will pick it up)
print('[yellow]\\[*] URLs queued. The background runner will process them (run `archivebox server` or `archivebox run --daemon` if not already running).[/yellow]')
print(
"[yellow]\\[*] URLs queued. The background runner will process them (run `archivebox server` or `archivebox run --daemon` if not already running).[/yellow]",
)
else:
# Foreground mode: run full crawl runner until all work is done
print('[green]\\[*] Starting crawl runner to process crawl...[/green]')
print("[green]\\[*] Starting crawl runner to process crawl...[/green]")
run_crawl(str(crawl.id))
# Print summary for foreground runs
@@ -179,7 +196,10 @@ def add(urls: str | list[str],
crawl.refresh_from_db()
snapshots_count = crawl.snapshot_set.count()
try:
total_bytes = sum(s.archive_size for s in crawl.snapshot_set.all())
from django.db.models import Count, Sum
totals = crawl.snapshot_set.aggregate(snapshot_count=Count("id"), total_bytes=Sum("archiveresult__output_size"))
total_bytes = int(totals["total_bytes"] or 0) if totals["snapshot_count"] else 0
except Exception:
total_bytes, _, _ = get_dir_size(crawl.output_dir)
total_size = printable_filesize(total_bytes)
@@ -197,23 +217,23 @@ def add(urls: str | list[str],
# Output dir relative to DATA_DIR
try:
rel_output = Path(crawl.output_dir).relative_to(CONSTANTS.DATA_DIR)
rel_output_str = f'./{rel_output}'
rel_output_str = f"./{rel_output}"
except Exception:
rel_output_str = str(crawl.output_dir)
bind_addr = SERVER_CONFIG.BIND_ADDR or '127.0.0.1:8000'
if bind_addr.startswith('http://') or bind_addr.startswith('https://'):
bind_addr = SERVER_CONFIG.BIND_ADDR or "127.0.0.1:8000"
if bind_addr.startswith("http://") or bind_addr.startswith("https://"):
base_url = bind_addr
else:
base_url = f'http://{bind_addr}'
admin_url = f'{base_url}/admin/crawls/crawl/{crawl.id}/change/'
base_url = f"http://{bind_addr}"
admin_url = f"{base_url}/admin/crawls/crawl/{crawl.id}/change/"
print('\n[bold]crawl output saved to:[/bold]')
print(f' {rel_output_str}')
print(f' {admin_url}')
print(f'\n[bold]total urls snapshotted:[/bold] {snapshots_count}')
print(f'[bold]total size:[/bold] {total_size}')
print(f'[bold]total time:[/bold] {duration_str}')
print("\n[bold]crawl output saved to:[/bold]")
print(f" {rel_output_str}")
print(f" {admin_url}")
print(f"\n[bold]total urls snapshotted:[/bold] {snapshots_count}")
print(f"[bold]total size:[/bold] {total_size}")
print(f"[bold]total time:[/bold] {duration_str}")
except Exception:
# Summary is best-effort; avoid failing the command if something goes wrong
pass
@@ -224,29 +244,43 @@ def add(urls: str | list[str],
@click.command()
@click.option('--depth', '-d', type=click.Choice([str(i) for i in range(5)]), default='0', help='Recursively archive linked pages up to N hops away')
@click.option('--tag', '-t', default='', help='Comma-separated list of tags to add to each snapshot e.g. tag1,tag2,tag3')
@click.option('--url-allowlist', '--domain-allowlist', default='', help='Comma-separated URL/domain allowlist for this crawl')
@click.option('--url-denylist', '--domain-denylist', default='', help='Comma-separated URL/domain denylist for this crawl')
@click.option('--parser', default='auto', help='Parser for reading input URLs (auto, txt, html, rss, json, jsonl, netscape, ...)')
@click.option('--plugins', '-p', default='', help='Comma-separated list of plugins to run e.g. title,favicon,screenshot,singlefile,...')
@click.option('--persona', default='Default', help='Authentication profile to use when archiving')
@click.option('--overwrite', '-F', is_flag=True, help='Overwrite existing data if URLs have been archived previously')
@click.option('--update', is_flag=True, default=None, help='Retry any previously skipped/failed URLs when re-adding them')
@click.option('--index-only', is_flag=True, help='Just add the URLs to the index without archiving them now')
@click.option('--bg', is_flag=True, help='Run archiving in background (queue work and return immediately)')
@click.argument('urls', nargs=-1, type=click.Path())
@click.option(
"--depth",
"-d",
type=click.Choice([str(i) for i in range(5)]),
default="0",
help="Recursively archive linked pages up to N hops away",
)
@click.option("--max-urls", type=int, default=0, help="Maximum number of URLs to snapshot for this crawl (0 = unlimited)")
@click.option("--max-size", default="0", help="Maximum total crawl size in bytes or units like 45mb / 1gb (0 = unlimited)")
@click.option("--tag", "-t", default="", help="Comma-separated list of tags to add to each snapshot e.g. tag1,tag2,tag3")
@click.option("--url-allowlist", "--domain-allowlist", default="", help="Comma-separated URL/domain allowlist for this crawl")
@click.option("--url-denylist", "--domain-denylist", default="", help="Comma-separated URL/domain denylist for this crawl")
@click.option("--parser", default="auto", help="Parser for reading input URLs (auto, txt, html, rss, json, jsonl, netscape, ...)")
@click.option("--plugins", "-p", default="", help="Comma-separated list of plugins to run e.g. title,favicon,screenshot,singlefile,...")
@click.option("--persona", default="Default", help="Authentication profile to use when archiving")
@click.option("--overwrite", "-F", is_flag=True, help="Overwrite existing data if URLs have been archived previously")
@click.option("--update", is_flag=True, default=None, help="Retry any previously skipped/failed URLs when re-adding them")
@click.option("--index-only", is_flag=True, help="Just add the URLs to the index without archiving them now")
@click.option("--bg", is_flag=True, help="Run archiving in background (queue work and return immediately)")
@click.argument("urls", nargs=-1, type=click.Path())
@docstring(add.__doc__)
def main(**kwargs):
"""Add a new URL or list of URLs to your archive"""
raw_urls = kwargs.pop('urls')
raw_urls = kwargs.pop("urls")
urls = _collect_input_urls(raw_urls)
if not urls:
raise click.UsageError('No URLs provided. Pass URLs as arguments or via stdin.')
raise click.UsageError("No URLs provided. Pass URLs as arguments or via stdin.")
if int(kwargs.get("max_urls") or 0) < 0:
raise click.BadParameter("max_urls must be 0 or a positive integer.", param_hint="--max-urls")
try:
kwargs["max_size"] = parse_filesize_to_bytes(kwargs.get("max_size"))
except ValueError as err:
raise click.BadParameter(str(err), param_hint="--max-size") from err
add(urls=urls, **kwargs)
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -30,11 +30,10 @@ Examples:
archivebox archiveresult list --status=failed | archivebox run
"""
__package__ = 'archivebox.cli'
__command__ = 'archivebox archiveresult'
__package__ = "archivebox.cli"
__command__ = "archivebox archiveresult"
import sys
from typing import Optional
import rich_click as click
from rich import print as rprint
@@ -42,13 +41,13 @@ from rich import print as rprint
from archivebox.cli.cli_utils import apply_filters
def build_archiveresult_request(snapshot_id: str, plugin: str, hook_name: str = '', status: str = 'queued') -> dict:
def build_archiveresult_request(snapshot_id: str, plugin: str, hook_name: str = "", status: str = "queued") -> dict:
return {
'type': 'ArchiveResult',
'snapshot_id': str(snapshot_id),
'plugin': plugin,
'hook_name': hook_name,
'status': status,
"type": "ArchiveResult",
"snapshot_id": str(snapshot_id),
"plugin": plugin,
"hook_name": hook_name,
"status": status,
}
@@ -56,10 +55,11 @@ def build_archiveresult_request(snapshot_id: str, plugin: str, hook_name: str =
# CREATE
# =============================================================================
def create_archiveresults(
snapshot_id: Optional[str] = None,
plugin: Optional[str] = None,
status: str = 'queued',
snapshot_id: str | None = None,
plugin: str | None = None,
status: str = "queued",
) -> int:
"""
Create ArchiveResult request records for Snapshots.
@@ -86,13 +86,13 @@ def create_archiveresults(
snapshots = [Snapshot.objects.get(id=snapshot_id)]
pass_through_records = []
except Snapshot.DoesNotExist:
rprint(f'[red]Snapshot not found: {snapshot_id}[/red]', file=sys.stderr)
rprint(f"[red]Snapshot not found: {snapshot_id}[/red]", file=sys.stderr)
return 1
else:
# Read from stdin
records = list(read_stdin())
if not records:
rprint('[yellow]No Snapshot records provided via stdin[/yellow]', file=sys.stderr)
rprint("[yellow]No Snapshot records provided via stdin[/yellow]", file=sys.stderr)
return 1
# Separate snapshot records from pass-through records
@@ -100,17 +100,17 @@ def create_archiveresults(
pass_through_records = []
for record in records:
record_type = record.get('type', '')
record_type = record.get("type", "")
if record_type == TYPE_SNAPSHOT:
# Pass through the Snapshot record itself
pass_through_records.append(record)
if record.get('id'):
snapshot_ids.append(record['id'])
if record.get("id"):
snapshot_ids.append(record["id"])
elif record_type == TYPE_ARCHIVERESULT:
# ArchiveResult records: pass through if they have an id
if record.get('id'):
if record.get("id"):
pass_through_records.append(record)
# If no id, we could create it, but for now just pass through
else:
@@ -120,9 +120,9 @@ def create_archiveresults(
# Other typed records (Crawl, Tag, etc): pass through
pass_through_records.append(record)
elif record.get('id'):
elif record.get("id"):
# Untyped record with id - assume it's a snapshot ID
snapshot_ids.append(record['id'])
snapshot_ids.append(record["id"])
# Output pass-through records first
if not is_tty:
@@ -131,15 +131,15 @@ def create_archiveresults(
if not snapshot_ids:
if pass_through_records:
rprint(f'[dim]Passed through {len(pass_through_records)} records, no new snapshots to process[/dim]', file=sys.stderr)
rprint(f"[dim]Passed through {len(pass_through_records)} records, no new snapshots to process[/dim]", file=sys.stderr)
return 0
rprint('[yellow]No valid Snapshot IDs in input[/yellow]', file=sys.stderr)
rprint("[yellow]No valid Snapshot IDs in input[/yellow]", file=sys.stderr)
return 1
snapshots = list(Snapshot.objects.filter(id__in=snapshot_ids))
if not snapshots:
rprint('[yellow]No matching snapshots found[/yellow]', file=sys.stderr)
rprint("[yellow]No matching snapshots found[/yellow]", file=sys.stderr)
return 0 if pass_through_records else 1
created_count = 0
@@ -150,7 +150,7 @@ def create_archiveresults(
created_count += 1
else:
config = get_config(crawl=snapshot.crawl, snapshot=snapshot)
hooks = discover_hooks('Snapshot', config=config)
hooks = discover_hooks("Snapshot", config=config)
for hook_path in hooks:
hook_name = hook_path.name
plugin_name = hook_path.parent.name
@@ -158,7 +158,7 @@ def create_archiveresults(
write_record(build_archiveresult_request(snapshot.id, plugin_name, hook_name=hook_name, status=status))
created_count += 1
rprint(f'[green]Created {created_count} archive result request records[/green]', file=sys.stderr)
rprint(f"[green]Created {created_count} archive result request records[/green]", file=sys.stderr)
return 0
@@ -166,11 +166,12 @@ def create_archiveresults(
# LIST
# =============================================================================
def list_archiveresults(
status: Optional[str] = None,
plugin: Optional[str] = None,
snapshot_id: Optional[str] = None,
limit: Optional[int] = None,
status: str | None = None,
plugin: str | None = None,
snapshot_id: str | None = None,
limit: int | None = None,
) -> int:
"""
List ArchiveResults as JSONL with optional filters.
@@ -183,13 +184,13 @@ def list_archiveresults(
is_tty = sys.stdout.isatty()
queryset = ArchiveResult.objects.all().order_by('-start_ts')
queryset = ArchiveResult.objects.all().order_by("-start_ts")
# Apply filters
filter_kwargs = {
'status': status,
'plugin': plugin,
'snapshot_id': snapshot_id,
"status": status,
"plugin": plugin,
"snapshot_id": snapshot_id,
}
queryset = apply_filters(queryset, filter_kwargs, limit=limit)
@@ -197,20 +198,22 @@ def list_archiveresults(
for result in queryset:
if is_tty:
status_color = {
'queued': 'yellow',
'started': 'blue',
'succeeded': 'green',
'failed': 'red',
'skipped': 'dim',
'noresults': 'dim',
'backoff': 'magenta',
}.get(result.status, 'dim')
rprint(f'[{status_color}]{result.status:10}[/{status_color}] {result.plugin:15} [dim]{result.id}[/dim] {result.snapshot.url[:40]}')
"queued": "yellow",
"started": "blue",
"succeeded": "green",
"failed": "red",
"skipped": "dim",
"noresults": "dim",
"backoff": "magenta",
}.get(result.status, "dim")
rprint(
f"[{status_color}]{result.status:10}[/{status_color}] {result.plugin:15} [dim]{result.id}[/dim] {result.snapshot.url[:40]}",
)
else:
write_record(result.to_json())
count += 1
rprint(f'[dim]Listed {count} archive results[/dim]', file=sys.stderr)
rprint(f"[dim]Listed {count} archive results[/dim]", file=sys.stderr)
return 0
@@ -218,8 +221,9 @@ def list_archiveresults(
# UPDATE
# =============================================================================
def update_archiveresults(
status: Optional[str] = None,
status: str | None = None,
) -> int:
"""
Update ArchiveResults from stdin JSONL.
@@ -238,12 +242,12 @@ def update_archiveresults(
records = list(read_stdin())
if not records:
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
return 1
updated_count = 0
for record in records:
result_id = record.get('id')
result_id = record.get("id")
if not result_id:
continue
@@ -261,10 +265,10 @@ def update_archiveresults(
write_record(result.to_json())
except ArchiveResult.DoesNotExist:
rprint(f'[yellow]ArchiveResult not found: {result_id}[/yellow]', file=sys.stderr)
rprint(f"[yellow]ArchiveResult not found: {result_id}[/yellow]", file=sys.stderr)
continue
rprint(f'[green]Updated {updated_count} archive results[/green]', file=sys.stderr)
rprint(f"[green]Updated {updated_count} archive results[/green]", file=sys.stderr)
return 0
@@ -272,6 +276,7 @@ def update_archiveresults(
# DELETE
# =============================================================================
def delete_archiveresults(yes: bool = False, dry_run: bool = False) -> int:
"""
Delete ArchiveResults from stdin JSONL.
@@ -287,37 +292,37 @@ def delete_archiveresults(yes: bool = False, dry_run: bool = False) -> int:
records = list(read_stdin())
if not records:
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
return 1
result_ids = [r.get('id') for r in records if r.get('id')]
result_ids = [r.get("id") for r in records if r.get("id")]
if not result_ids:
rprint('[yellow]No valid archive result IDs in input[/yellow]', file=sys.stderr)
rprint("[yellow]No valid archive result IDs in input[/yellow]", file=sys.stderr)
return 1
results = ArchiveResult.objects.filter(id__in=result_ids)
count = results.count()
if count == 0:
rprint('[yellow]No matching archive results found[/yellow]', file=sys.stderr)
rprint("[yellow]No matching archive results found[/yellow]", file=sys.stderr)
return 0
if dry_run:
rprint(f'[yellow]Would delete {count} archive results (dry run)[/yellow]', file=sys.stderr)
rprint(f"[yellow]Would delete {count} archive results (dry run)[/yellow]", file=sys.stderr)
for result in results[:10]:
rprint(f' [dim]{result.id}[/dim] {result.plugin} {result.snapshot.url[:40]}', file=sys.stderr)
rprint(f" [dim]{result.id}[/dim] {result.plugin} {result.snapshot.url[:40]}", file=sys.stderr)
if count > 10:
rprint(f' ... and {count - 10} more', file=sys.stderr)
rprint(f" ... and {count - 10} more", file=sys.stderr)
return 0
if not yes:
rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr)
rprint("[red]Use --yes to confirm deletion[/red]", file=sys.stderr)
return 1
# Perform deletion
deleted_count, _ = results.delete()
rprint(f'[green]Deleted {deleted_count} archive results[/green]', file=sys.stderr)
rprint(f"[green]Deleted {deleted_count} archive results[/green]", file=sys.stderr)
return 0
@@ -325,51 +330,58 @@ def delete_archiveresults(yes: bool = False, dry_run: bool = False) -> int:
# CLI Commands
# =============================================================================
@click.group()
def main():
"""Manage ArchiveResult records (plugin extraction results)."""
pass
@main.command('create')
@click.option('--snapshot-id', help='Snapshot ID to create results for')
@click.option('--plugin', '-p', help='Plugin name (e.g., screenshot, singlefile)')
@click.option('--status', '-s', default='queued', help='Initial status (default: queued)')
def create_cmd(snapshot_id: Optional[str], plugin: Optional[str], status: str):
@main.command("create")
@click.option("--snapshot-id", help="Snapshot ID to create results for")
@click.option("--plugin", "-p", help="Plugin name (e.g., screenshot, singlefile)")
@click.option("--status", "-s", default="queued", help="Initial status (default: queued)")
def create_cmd(snapshot_id: str | None, plugin: str | None, status: str):
"""Create ArchiveResults for Snapshots from stdin JSONL."""
sys.exit(create_archiveresults(snapshot_id=snapshot_id, plugin=plugin, status=status))
@main.command('list')
@click.option('--status', '-s', help='Filter by status (queued, started, succeeded, failed, skipped)')
@click.option('--plugin', '-p', help='Filter by plugin name')
@click.option('--snapshot-id', help='Filter by snapshot ID')
@click.option('--limit', '-n', type=int, help='Limit number of results')
def list_cmd(status: Optional[str], plugin: Optional[str],
snapshot_id: Optional[str], limit: Optional[int]):
@main.command("list")
@click.option("--status", "-s", help="Filter by status (queued, started, succeeded, failed, skipped)")
@click.option("--plugin", "-p", help="Filter by plugin name")
@click.option("--snapshot-id", help="Filter by snapshot ID")
@click.option("--limit", "-n", type=int, help="Limit number of results")
def list_cmd(
status: str | None,
plugin: str | None,
snapshot_id: str | None,
limit: int | None,
):
"""List ArchiveResults as JSONL."""
sys.exit(list_archiveresults(
status=status,
plugin=plugin,
snapshot_id=snapshot_id,
limit=limit,
))
sys.exit(
list_archiveresults(
status=status,
plugin=plugin,
snapshot_id=snapshot_id,
limit=limit,
),
)
@main.command('update')
@click.option('--status', '-s', help='Set status')
def update_cmd(status: Optional[str]):
@main.command("update")
@click.option("--status", "-s", help="Set status")
def update_cmd(status: str | None):
"""Update ArchiveResults from stdin JSONL."""
sys.exit(update_archiveresults(status=status))
@main.command('delete')
@click.option('--yes', '-y', is_flag=True, help='Confirm deletion')
@click.option('--dry-run', is_flag=True, help='Show what would be deleted')
@main.command("delete")
@click.option("--yes", "-y", is_flag=True, help="Confirm deletion")
@click.option("--dry-run", is_flag=True, help="Show what would be deleted")
def delete_cmd(yes: bool, dry_run: bool):
"""Delete ArchiveResults from stdin JSONL."""
sys.exit(delete_archiveresults(yes=yes, dry_run=dry_run))
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -25,11 +25,10 @@ Examples:
archivebox binary list --name=chrome | archivebox binary delete --yes
"""
__package__ = 'archivebox.cli'
__command__ = 'archivebox binary'
__package__ = "archivebox.cli"
__command__ = "archivebox binary"
import sys
from typing import Optional
import rich_click as click
from rich import print as rprint
@@ -41,10 +40,11 @@ from archivebox.cli.cli_utils import apply_filters
# CREATE
# =============================================================================
def create_binary(
name: str,
abspath: str,
version: str = '',
version: str = "",
) -> int:
"""
Create/register a Binary.
@@ -59,7 +59,7 @@ def create_binary(
is_tty = sys.stdout.isatty()
if not name or not abspath:
rprint('[red]Both --name and --abspath are required[/red]', file=sys.stderr)
rprint("[red]Both --name and --abspath are required[/red]", file=sys.stderr)
return 1
try:
@@ -76,28 +76,30 @@ def create_binary(
# Mirror the Binary model lifecycle used elsewhere in the system so CLI
# records are owned by the current machine and can be safely piped into
# `archivebox run` without creating invalid rows missing machine_id.
binary = Binary.from_json({
'name': name,
'abspath': abspath,
'version': version,
'binproviders': 'env',
'binprovider': 'env',
})
binary = Binary.from_json(
{
"name": name,
"abspath": abspath,
"version": version,
"binproviders": "env",
"binprovider": "env",
},
)
if binary is None:
raise ValueError('failed to create binary record')
raise ValueError("failed to create binary record")
if not is_tty:
write_record(binary.to_json())
if created:
rprint(f'[green]Created binary: {name} at {abspath}[/green]', file=sys.stderr)
rprint(f"[green]Created binary: {name} at {abspath}[/green]", file=sys.stderr)
else:
rprint(f'[dim]Binary already exists: {name} at {abspath}[/dim]', file=sys.stderr)
rprint(f"[dim]Binary already exists: {name} at {abspath}[/dim]", file=sys.stderr)
return 0
except Exception as e:
rprint(f'[red]Error creating binary: {e}[/red]', file=sys.stderr)
rprint(f"[red]Error creating binary: {e}[/red]", file=sys.stderr)
return 1
@@ -105,11 +107,12 @@ def create_binary(
# LIST
# =============================================================================
def list_binaries(
name: Optional[str] = None,
abspath__icontains: Optional[str] = None,
version__icontains: Optional[str] = None,
limit: Optional[int] = None,
name: str | None = None,
abspath__icontains: str | None = None,
version__icontains: str | None = None,
limit: int | None = None,
) -> int:
"""
List Binaries as JSONL with optional filters.
@@ -122,25 +125,25 @@ def list_binaries(
is_tty = sys.stdout.isatty()
queryset = Binary.objects.all().order_by('name', '-modified_at', '-created_at')
queryset = Binary.objects.all().order_by("name", "-modified_at", "-created_at")
# Apply filters
filter_kwargs = {
'name': name,
'abspath__icontains': abspath__icontains,
'version__icontains': version__icontains,
"name": name,
"abspath__icontains": abspath__icontains,
"version__icontains": version__icontains,
}
queryset = apply_filters(queryset, filter_kwargs, limit=limit)
count = 0
for binary in queryset:
if is_tty:
rprint(f'[cyan]{binary.name:20}[/cyan] [dim]{binary.version:15}[/dim] {binary.abspath}')
rprint(f"[cyan]{binary.name:20}[/cyan] [dim]{binary.version:15}[/dim] {binary.abspath}")
else:
write_record(binary.to_json())
count += 1
rprint(f'[dim]Listed {count} binaries[/dim]', file=sys.stderr)
rprint(f"[dim]Listed {count} binaries[/dim]", file=sys.stderr)
return 0
@@ -148,9 +151,10 @@ def list_binaries(
# UPDATE
# =============================================================================
def update_binaries(
version: Optional[str] = None,
abspath: Optional[str] = None,
version: str | None = None,
abspath: str | None = None,
) -> int:
"""
Update Binaries from stdin JSONL.
@@ -169,12 +173,12 @@ def update_binaries(
records = list(read_stdin())
if not records:
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
return 1
updated_count = 0
for record in records:
binary_id = record.get('id')
binary_id = record.get("id")
if not binary_id:
continue
@@ -194,10 +198,10 @@ def update_binaries(
write_record(binary.to_json())
except Binary.DoesNotExist:
rprint(f'[yellow]Binary not found: {binary_id}[/yellow]', file=sys.stderr)
rprint(f"[yellow]Binary not found: {binary_id}[/yellow]", file=sys.stderr)
continue
rprint(f'[green]Updated {updated_count} binaries[/green]', file=sys.stderr)
rprint(f"[green]Updated {updated_count} binaries[/green]", file=sys.stderr)
return 0
@@ -205,6 +209,7 @@ def update_binaries(
# DELETE
# =============================================================================
def delete_binaries(yes: bool = False, dry_run: bool = False) -> int:
"""
Delete Binaries from stdin JSONL.
@@ -220,35 +225,35 @@ def delete_binaries(yes: bool = False, dry_run: bool = False) -> int:
records = list(read_stdin())
if not records:
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
return 1
binary_ids = [r.get('id') for r in records if r.get('id')]
binary_ids = [r.get("id") for r in records if r.get("id")]
if not binary_ids:
rprint('[yellow]No valid binary IDs in input[/yellow]', file=sys.stderr)
rprint("[yellow]No valid binary IDs in input[/yellow]", file=sys.stderr)
return 1
binaries = Binary.objects.filter(id__in=binary_ids)
count = binaries.count()
if count == 0:
rprint('[yellow]No matching binaries found[/yellow]', file=sys.stderr)
rprint("[yellow]No matching binaries found[/yellow]", file=sys.stderr)
return 0
if dry_run:
rprint(f'[yellow]Would delete {count} binaries (dry run)[/yellow]', file=sys.stderr)
rprint(f"[yellow]Would delete {count} binaries (dry run)[/yellow]", file=sys.stderr)
for binary in binaries:
rprint(f' {binary.name} {binary.abspath}', file=sys.stderr)
rprint(f" {binary.name} {binary.abspath}", file=sys.stderr)
return 0
if not yes:
rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr)
rprint("[red]Use --yes to confirm deletion[/red]", file=sys.stderr)
return 1
# Perform deletion
deleted_count, _ = binaries.delete()
rprint(f'[green]Deleted {deleted_count} binaries[/green]', file=sys.stderr)
rprint(f"[green]Deleted {deleted_count} binaries[/green]", file=sys.stderr)
return 0
@@ -256,52 +261,59 @@ def delete_binaries(yes: bool = False, dry_run: bool = False) -> int:
# CLI Commands
# =============================================================================
@click.group()
def main():
"""Manage Binary records (detected executables)."""
pass
@main.command('create')
@click.option('--name', '-n', required=True, help='Binary name (e.g., chrome, wget)')
@click.option('--abspath', '-p', required=True, help='Absolute path to binary')
@click.option('--version', '-v', default='', help='Binary version')
@main.command("create")
@click.option("--name", "-n", required=True, help="Binary name (e.g., chrome, wget)")
@click.option("--abspath", "-p", required=True, help="Absolute path to binary")
@click.option("--version", "-v", default="", help="Binary version")
def create_cmd(name: str, abspath: str, version: str):
"""Create/register a Binary."""
sys.exit(create_binary(name=name, abspath=abspath, version=version))
@main.command('list')
@click.option('--name', '-n', help='Filter by name')
@click.option('--abspath__icontains', help='Filter by path contains')
@click.option('--version__icontains', help='Filter by version contains')
@click.option('--limit', type=int, help='Limit number of results')
def list_cmd(name: Optional[str], abspath__icontains: Optional[str],
version__icontains: Optional[str], limit: Optional[int]):
@main.command("list")
@click.option("--name", "-n", help="Filter by name")
@click.option("--abspath__icontains", help="Filter by path contains")
@click.option("--version__icontains", help="Filter by version contains")
@click.option("--limit", type=int, help="Limit number of results")
def list_cmd(
name: str | None,
abspath__icontains: str | None,
version__icontains: str | None,
limit: int | None,
):
"""List Binaries as JSONL."""
sys.exit(list_binaries(
name=name,
abspath__icontains=abspath__icontains,
version__icontains=version__icontains,
limit=limit,
))
sys.exit(
list_binaries(
name=name,
abspath__icontains=abspath__icontains,
version__icontains=version__icontains,
limit=limit,
),
)
@main.command('update')
@click.option('--version', '-v', help='Set version')
@click.option('--abspath', '-p', help='Set path')
def update_cmd(version: Optional[str], abspath: Optional[str]):
@main.command("update")
@click.option("--version", "-v", help="Set version")
@click.option("--abspath", "-p", help="Set path")
def update_cmd(version: str | None, abspath: str | None):
"""Update Binaries from stdin JSONL."""
sys.exit(update_binaries(version=version, abspath=abspath))
@main.command('delete')
@click.option('--yes', '-y', is_flag=True, help='Confirm deletion')
@click.option('--dry-run', is_flag=True, help='Show what would be deleted')
@main.command("delete")
@click.option("--yes", "-y", is_flag=True, help="Confirm deletion")
@click.option("--dry-run", is_flag=True, help="Show what would be deleted")
def delete_cmd(yes: bool, dry_run: bool):
"""Delete Binaries from stdin JSONL."""
sys.exit(delete_binaries(yes=yes, dry_run=dry_run))
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -1,6 +1,6 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__package__ = "archivebox.cli"
import sys
import rich_click as click
@@ -12,12 +12,14 @@ from archivebox.misc.toml_util import CustomTOMLEncoder
@enforce_types
def config(*keys,
get: bool=False,
set: bool=False,
search: bool=False,
reset: bool=False,
**kwargs) -> None:
def config(
*keys,
get: bool = False,
set: bool = False,
search: bool = False,
reset: bool = False,
**kwargs,
) -> None:
"""Get and set your ArchiveBox project configuration values"""
from archivebox.misc.checks import check_data_folder
@@ -29,8 +31,8 @@ def config(*keys,
FLAT_CONFIG = get_flat_config()
CONFIGS = get_all_configs()
config_options: list[str] = list(kwargs.pop('key=value', []) or keys or [f'{key}={val}' for key, val in kwargs.items()])
config_options: list[str] = list(kwargs.pop("key=value", []) or keys or [f"{key}={val}" for key, val in kwargs.items()])
no_args = not (get or set or reset or config_options)
matching_config = {}
@@ -39,19 +41,19 @@ def config(*keys,
config_options = [get_real_name(key) for key in config_options]
matching_config = {key: FLAT_CONFIG[key] for key in config_options if key in FLAT_CONFIG}
for config_section in CONFIGS.values():
aliases = getattr(config_section, 'aliases', {})
aliases = getattr(config_section, "aliases", {})
for search_key in config_options:
# search all aliases in the section
for alias_key, key in aliases.items():
if search_key.lower() in alias_key.lower():
matching_config[key] = dict(config_section)[key]
# search all keys and values in the section
for existing_key, value in dict(config_section).items():
if search_key.lower() in existing_key.lower() or search_key.lower() in str(value).lower():
matching_config[existing_key] = value
print(printable_config(matching_config))
raise SystemExit(not matching_config)
@@ -61,23 +63,23 @@ def config(*keys,
matching_config = {key: FLAT_CONFIG[key] for key in config_options if key in FLAT_CONFIG}
failed_config = [key for key in config_options if key not in FLAT_CONFIG]
if failed_config:
print('\n[red][X] These options failed to get[/red]')
print(' {}'.format('\n '.join(config_options)))
print("\n[red][X] These options failed to get[/red]")
print(" {}".format("\n ".join(config_options)))
raise SystemExit(1)
else:
matching_config = FLAT_CONFIG
# Display core config sections
for config_section in CONFIGS.values():
section_header = getattr(config_section, 'toml_section_header', '')
section_header = getattr(config_section, "toml_section_header", "")
if isinstance(section_header, str) and section_header:
print(f'[grey53]\\[{section_header}][/grey53]')
print(f"[grey53]\\[{section_header}][/grey53]")
else:
print('[grey53]\\[CONSTANTS] # (read-only)[/grey53]')
print("[grey53]\\[CONSTANTS] # (read-only)[/grey53]")
kv_in_section = {key: val for key, val in dict(config_section).items() if key in matching_config}
print(benedict(kv_in_section).to_toml(encoder=CustomTOMLEncoder()).strip().replace('\n\n', '\n'))
print('[grey53]################################################################[/grey53]')
print(benedict(kv_in_section).to_toml(encoder=CustomTOMLEncoder()).strip().replace("\n\n", "\n"))
print("[grey53]################################################################[/grey53]")
# Display plugin config section
from archivebox.hooks import discover_plugin_configs
@@ -87,17 +89,17 @@ def config(*keys,
# Collect all plugin config keys
for plugin_name, schema in plugin_configs.items():
if 'properties' not in schema:
if "properties" not in schema:
continue
for key in schema['properties'].keys():
for key in schema["properties"].keys():
if key in matching_config:
plugin_keys[key] = matching_config[key]
# Display all plugin config in single [PLUGINS] section
if plugin_keys:
print('[grey53]\\[PLUGINS][/grey53]')
print(benedict(plugin_keys).to_toml(encoder=CustomTOMLEncoder()).strip().replace('\n\n', '\n'))
print('[grey53]################################################################[/grey53]')
print("[grey53]\\[PLUGINS][/grey53]")
print(benedict(plugin_keys).to_toml(encoder=CustomTOMLEncoder()).strip().replace("\n\n", "\n"))
print("[grey53]################################################################[/grey53]")
raise SystemExit(not matching_config)
@@ -105,18 +107,20 @@ def config(*keys,
new_config = {}
failed_options = []
for line in config_options:
if line.startswith('#') or not line.strip():
if line.startswith("#") or not line.strip():
continue
if '=' not in line:
print('[red][X] Config KEY=VALUE must have an = sign in it[/red]')
print(f' {line}')
if "=" not in line:
print("[red][X] Config KEY=VALUE must have an = sign in it[/red]")
print(f" {line}")
raise SystemExit(2)
raw_key, val = line.split('=', 1)
raw_key, val = line.split("=", 1)
raw_key = raw_key.upper().strip()
key = get_real_name(raw_key)
if key != raw_key:
print(f'[yellow][i] Note: The config option {raw_key} has been renamed to {key}, please use the new name going forwards.[/yellow]')
print(
f"[yellow][i] Note: The config option {raw_key} has been renamed to {key}, please use the new name going forwards.[/yellow]",
)
if key in FLAT_CONFIG:
new_config[key] = val.strip()
@@ -136,38 +140,38 @@ def config(*keys,
if side_effect_changes:
print(file=sys.stderr)
print('[yellow][i] Note: This change also affected these other options that depended on it:[/yellow]', file=sys.stderr)
print(' {}'.format(printable_config(side_effect_changes, prefix=' ')), file=sys.stderr)
print("[yellow][i] Note: This change also affected these other options that depended on it:[/yellow]", file=sys.stderr)
print(" {}".format(printable_config(side_effect_changes, prefix=" ")), file=sys.stderr)
if failed_options:
print()
print('[red][X] These options failed to set (check for typos):[/red]')
print(' {}'.format('\n '.join(failed_options)))
print("[red][X] These options failed to set (check for typos):[/red]")
print(" {}".format("\n ".join(failed_options)))
raise SystemExit(1)
elif reset:
print('[red][X] This command is not implemented yet.[/red]')
print(' Please manually remove the relevant lines from your config file:')
print("[red][X] This command is not implemented yet.[/red]")
print(" Please manually remove the relevant lines from your config file:")
raise SystemExit(2)
else:
print('[red][X] You must pass either --get or --set, or no arguments to get the whole config.[/red]')
print(' archivebox config')
print(' archivebox config --get SOME_KEY')
print(' archivebox config --set SOME_KEY=SOME_VALUE')
print("[red][X] You must pass either --get or --set, or no arguments to get the whole config.[/red]")
print(" archivebox config")
print(" archivebox config --get SOME_KEY")
print(" archivebox config --set SOME_KEY=SOME_VALUE")
raise SystemExit(2)
@click.command()
@click.option('--search', is_flag=True, help='Search config KEYs, VALUEs, and ALIASES for the given term')
@click.option('--get', is_flag=True, help='Get the value for the given config KEYs')
@click.option('--set', is_flag=True, help='Set the given KEY=VALUE config values')
@click.option('--reset', is_flag=True, help='Reset the given KEY config values to their defaults')
@click.argument('KEY=VALUE', nargs=-1, type=str)
@click.option("--search", is_flag=True, help="Search config KEYs, VALUEs, and ALIASES for the given term")
@click.option("--get", is_flag=True, help="Get the value for the given config KEYs")
@click.option("--set", is_flag=True, help="Set the given KEY=VALUE config values")
@click.option("--reset", is_flag=True, help="Reset the given KEY config values to their defaults")
@click.argument("KEY=VALUE", nargs=-1, type=str)
@docstring(config.__doc__)
def main(**kwargs) -> None:
config(**kwargs)
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -30,11 +30,11 @@ Examples:
archivebox crawl create https://example.com | archivebox snapshot create | archivebox run
"""
__package__ = 'archivebox.cli'
__command__ = 'archivebox crawl'
__package__ = "archivebox.cli"
__command__ = "archivebox crawl"
import sys
from typing import Optional, Iterable
from collections.abc import Iterable
import rich_click as click
from rich import print as rprint
@@ -46,12 +46,13 @@ from archivebox.cli.cli_utils import apply_filters
# CREATE
# =============================================================================
def create_crawl(
urls: Iterable[str],
depth: int = 0,
tag: str = '',
status: str = 'queued',
created_by_id: Optional[int] = None,
tag: str = "",
status: str = "queued",
created_by_id: int | None = None,
) -> int:
"""
Create a Crawl job from URLs.
@@ -74,7 +75,7 @@ def create_crawl(
records = list(read_args_or_stdin(urls))
if not records:
rprint('[yellow]No URLs provided. Pass URLs as arguments or via stdin.[/yellow]', file=sys.stderr)
rprint("[yellow]No URLs provided. Pass URLs as arguments or via stdin.[/yellow]", file=sys.stderr)
return 1
# Separate pass-through records from URL records
@@ -82,29 +83,29 @@ def create_crawl(
pass_through_records = []
for record in records:
record_type = record.get('type', '')
record_type = record.get("type", "")
# Pass-through: output records that aren't URL/Crawl types
if record_type and record_type != TYPE_CRAWL and not record.get('url') and not record.get('urls'):
if record_type and record_type != TYPE_CRAWL and not record.get("url") and not record.get("urls"):
pass_through_records.append(record)
continue
# Handle existing Crawl records (just pass through with id)
if record_type == TYPE_CRAWL and record.get('id'):
if record_type == TYPE_CRAWL and record.get("id"):
pass_through_records.append(record)
continue
# Collect URLs
url = record.get('url')
url = record.get("url")
if url:
url_list.append(url)
# Handle 'urls' field (newline-separated)
urls_field = record.get('urls')
urls_field = record.get("urls")
if urls_field:
for line in urls_field.split('\n'):
for line in urls_field.split("\n"):
line = line.strip()
if line and not line.startswith('#'):
if line and not line.startswith("#"):
url_list.append(line)
# Output pass-through records first
@@ -115,44 +116,44 @@ def create_crawl(
if not url_list:
if pass_through_records:
# If we had pass-through records but no URLs, that's OK
rprint(f'[dim]Passed through {len(pass_through_records)} records, no new URLs[/dim]', file=sys.stderr)
rprint(f"[dim]Passed through {len(pass_through_records)} records, no new URLs[/dim]", file=sys.stderr)
return 0
rprint('[red]No valid URLs found[/red]', file=sys.stderr)
rprint("[red]No valid URLs found[/red]", file=sys.stderr)
return 1
try:
# Build crawl record with all URLs as newline-separated string
crawl_record = {
'urls': '\n'.join(url_list),
'max_depth': depth,
'tags_str': tag,
'status': status,
'label': '',
"urls": "\n".join(url_list),
"max_depth": depth,
"tags_str": tag,
"status": status,
"label": "",
}
crawl = Crawl.from_json(crawl_record, overrides={'created_by_id': created_by_id})
crawl = Crawl.from_json(crawl_record, overrides={"created_by_id": created_by_id})
if not crawl:
rprint('[red]Failed to create crawl[/red]', file=sys.stderr)
rprint("[red]Failed to create crawl[/red]", file=sys.stderr)
return 1
# Output JSONL record (only when piped)
if not is_tty:
write_record(crawl.to_json())
rprint(f'[green]Created crawl with {len(url_list)} URLs[/green]', file=sys.stderr)
rprint(f"[green]Created crawl with {len(url_list)} URLs[/green]", file=sys.stderr)
# If TTY, show human-readable output
if is_tty:
rprint(f' [dim]{crawl.id}[/dim]', file=sys.stderr)
rprint(f" [dim]{crawl.id}[/dim]", file=sys.stderr)
for url in url_list[:5]: # Show first 5 URLs
rprint(f' {url[:70]}', file=sys.stderr)
rprint(f" {url[:70]}", file=sys.stderr)
if len(url_list) > 5:
rprint(f' ... and {len(url_list) - 5} more', file=sys.stderr)
rprint(f" ... and {len(url_list) - 5} more", file=sys.stderr)
return 0
except Exception as e:
rprint(f'[red]Error creating crawl: {e}[/red]', file=sys.stderr)
rprint(f"[red]Error creating crawl: {e}[/red]", file=sys.stderr)
return 1
@@ -160,11 +161,12 @@ def create_crawl(
# LIST
# =============================================================================
def list_crawls(
status: Optional[str] = None,
urls__icontains: Optional[str] = None,
max_depth: Optional[int] = None,
limit: Optional[int] = None,
status: str | None = None,
urls__icontains: str | None = None,
max_depth: int | None = None,
limit: int | None = None,
) -> int:
"""
List Crawls as JSONL with optional filters.
@@ -177,13 +179,13 @@ def list_crawls(
is_tty = sys.stdout.isatty()
queryset = Crawl.objects.all().order_by('-created_at')
queryset = Crawl.objects.all().order_by("-created_at")
# Apply filters
filter_kwargs = {
'status': status,
'urls__icontains': urls__icontains,
'max_depth': max_depth,
"status": status,
"urls__icontains": urls__icontains,
"max_depth": max_depth,
}
queryset = apply_filters(queryset, filter_kwargs, limit=limit)
@@ -191,17 +193,17 @@ def list_crawls(
for crawl in queryset:
if is_tty:
status_color = {
'queued': 'yellow',
'started': 'blue',
'sealed': 'green',
}.get(crawl.status, 'dim')
url_preview = crawl.urls[:50].replace('\n', ' ')
rprint(f'[{status_color}]{crawl.status:8}[/{status_color}] [dim]{crawl.id}[/dim] {url_preview}...')
"queued": "yellow",
"started": "blue",
"sealed": "green",
}.get(crawl.status, "dim")
url_preview = crawl.urls[:50].replace("\n", " ")
rprint(f"[{status_color}]{crawl.status:8}[/{status_color}] [dim]{crawl.id}[/dim] {url_preview}...")
else:
write_record(crawl.to_json())
count += 1
rprint(f'[dim]Listed {count} crawls[/dim]', file=sys.stderr)
rprint(f"[dim]Listed {count} crawls[/dim]", file=sys.stderr)
return 0
@@ -209,9 +211,10 @@ def list_crawls(
# UPDATE
# =============================================================================
def update_crawls(
status: Optional[str] = None,
max_depth: Optional[int] = None,
status: str | None = None,
max_depth: int | None = None,
) -> int:
"""
Update Crawls from stdin JSONL.
@@ -232,12 +235,12 @@ def update_crawls(
records = list(read_stdin())
if not records:
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
return 1
updated_count = 0
for record in records:
crawl_id = record.get('id')
crawl_id = record.get("id")
if not crawl_id:
continue
@@ -258,10 +261,10 @@ def update_crawls(
write_record(crawl.to_json())
except Crawl.DoesNotExist:
rprint(f'[yellow]Crawl not found: {crawl_id}[/yellow]', file=sys.stderr)
rprint(f"[yellow]Crawl not found: {crawl_id}[/yellow]", file=sys.stderr)
continue
rprint(f'[green]Updated {updated_count} crawls[/green]', file=sys.stderr)
rprint(f"[green]Updated {updated_count} crawls[/green]", file=sys.stderr)
return 0
@@ -269,6 +272,7 @@ def update_crawls(
# DELETE
# =============================================================================
def delete_crawls(yes: bool = False, dry_run: bool = False) -> int:
"""
Delete Crawls from stdin JSONL.
@@ -284,36 +288,36 @@ def delete_crawls(yes: bool = False, dry_run: bool = False) -> int:
records = list(read_stdin())
if not records:
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
return 1
crawl_ids = [r.get('id') for r in records if r.get('id')]
crawl_ids = [r.get("id") for r in records if r.get("id")]
if not crawl_ids:
rprint('[yellow]No valid crawl IDs in input[/yellow]', file=sys.stderr)
rprint("[yellow]No valid crawl IDs in input[/yellow]", file=sys.stderr)
return 1
crawls = Crawl.objects.filter(id__in=crawl_ids)
count = crawls.count()
if count == 0:
rprint('[yellow]No matching crawls found[/yellow]', file=sys.stderr)
rprint("[yellow]No matching crawls found[/yellow]", file=sys.stderr)
return 0
if dry_run:
rprint(f'[yellow]Would delete {count} crawls (dry run)[/yellow]', file=sys.stderr)
rprint(f"[yellow]Would delete {count} crawls (dry run)[/yellow]", file=sys.stderr)
for crawl in crawls:
url_preview = crawl.urls[:50].replace('\n', ' ')
rprint(f' [dim]{crawl.id}[/dim] {url_preview}...', file=sys.stderr)
url_preview = crawl.urls[:50].replace("\n", " ")
rprint(f" [dim]{crawl.id}[/dim] {url_preview}...", file=sys.stderr)
return 0
if not yes:
rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr)
rprint("[red]Use --yes to confirm deletion[/red]", file=sys.stderr)
return 1
# Perform deletion
deleted_count, _ = crawls.delete()
rprint(f'[green]Deleted {deleted_count} crawls[/green]', file=sys.stderr)
rprint(f"[green]Deleted {deleted_count} crawls[/green]", file=sys.stderr)
return 0
@@ -321,53 +325,60 @@ def delete_crawls(yes: bool = False, dry_run: bool = False) -> int:
# CLI Commands
# =============================================================================
@click.group()
def main():
"""Manage Crawl records."""
pass
@main.command('create')
@click.argument('urls', nargs=-1)
@click.option('--depth', '-d', type=int, default=0, help='Max crawl depth (default: 0)')
@click.option('--tag', '-t', default='', help='Comma-separated tags to add')
@click.option('--status', '-s', default='queued', help='Initial status (default: queued)')
@main.command("create")
@click.argument("urls", nargs=-1)
@click.option("--depth", "-d", type=int, default=0, help="Max crawl depth (default: 0)")
@click.option("--tag", "-t", default="", help="Comma-separated tags to add")
@click.option("--status", "-s", default="queued", help="Initial status (default: queued)")
def create_cmd(urls: tuple, depth: int, tag: str, status: str):
"""Create a Crawl job from URLs or stdin."""
sys.exit(create_crawl(urls, depth=depth, tag=tag, status=status))
@main.command('list')
@click.option('--status', '-s', help='Filter by status (queued, started, sealed)')
@click.option('--urls__icontains', help='Filter by URLs contains')
@click.option('--max-depth', type=int, help='Filter by max depth')
@click.option('--limit', '-n', type=int, help='Limit number of results')
def list_cmd(status: Optional[str], urls__icontains: Optional[str],
max_depth: Optional[int], limit: Optional[int]):
@main.command("list")
@click.option("--status", "-s", help="Filter by status (queued, started, sealed)")
@click.option("--urls__icontains", help="Filter by URLs contains")
@click.option("--max-depth", type=int, help="Filter by max depth")
@click.option("--limit", "-n", type=int, help="Limit number of results")
def list_cmd(
status: str | None,
urls__icontains: str | None,
max_depth: int | None,
limit: int | None,
):
"""List Crawls as JSONL."""
sys.exit(list_crawls(
status=status,
urls__icontains=urls__icontains,
max_depth=max_depth,
limit=limit,
))
sys.exit(
list_crawls(
status=status,
urls__icontains=urls__icontains,
max_depth=max_depth,
limit=limit,
),
)
@main.command('update')
@click.option('--status', '-s', help='Set status')
@click.option('--max-depth', type=int, help='Set max depth')
def update_cmd(status: Optional[str], max_depth: Optional[int]):
@main.command("update")
@click.option("--status", "-s", help="Set status")
@click.option("--max-depth", type=int, help="Set max depth")
def update_cmd(status: str | None, max_depth: int | None):
"""Update Crawls from stdin JSONL."""
sys.exit(update_crawls(status=status, max_depth=max_depth))
@main.command('delete')
@click.option('--yes', '-y', is_flag=True, help='Confirm deletion')
@click.option('--dry-run', is_flag=True, help='Show what would be deleted')
@main.command("delete")
@click.option("--yes", "-y", is_flag=True, help="Confirm deletion")
@click.option("--dry-run", is_flag=True, help="Show what would be deleted")
def delete_cmd(yes: bool, dry_run: bool):
"""Delete Crawls from stdin JSONL."""
sys.exit(delete_crawls(yes=yes, dry_run=dry_run))
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -1,7 +1,7 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__command__ = 'archivebox crawl'
__package__ = "archivebox.cli"
__command__ = "archivebox crawl"
import sys
@@ -10,12 +10,12 @@ import rich_click as click
from archivebox.cli.archivebox_add import add
@click.command(context_settings={'ignore_unknown_options': True})
@click.option('--depth', '-d', type=int, default=0, help='Max crawl depth (default: 0)')
@click.option('--tag', '-t', default='', help='Comma-separated tags to add')
@click.option('--status', '-s', default='queued', help='Initial status (default: queued)')
@click.option('--wait/--no-wait', 'wait', default=True, help='Accepted for backwards compatibility')
@click.argument('urls', nargs=-1)
@click.command(context_settings={"ignore_unknown_options": True})
@click.option("--depth", "-d", type=int, default=0, help="Max crawl depth (default: 0)")
@click.option("--tag", "-t", default="", help="Comma-separated tags to add")
@click.option("--status", "-s", default="queued", help="Initial status (default: queued)")
@click.option("--wait/--no-wait", "wait", default=True, help="Accepted for backwards compatibility")
@click.argument("urls", nargs=-1)
def main(depth: int, tag: str, status: str, wait: bool, urls: tuple[str, ...]):
"""Backwards-compatible `archivebox crawl URL...` entrypoint."""
del status, wait
@@ -23,5 +23,5 @@ def main(depth: int, tag: str, status: str, wait: bool, urls: tuple[str, ...]):
sys.exit(0)
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -27,8 +27,8 @@ Examples:
archivebox crawl https://example.com | archivebox snapshot | archivebox extract
"""
__package__ = 'archivebox.cli'
__command__ = 'archivebox extract'
__package__ = "archivebox.cli"
__command__ = "archivebox extract"
import sys
from collections import defaultdict
@@ -52,51 +52,52 @@ def process_archiveresult_by_id(archiveresult_id: str) -> int:
try:
archiveresult = ArchiveResult.objects.get(id=archiveresult_id)
except ArchiveResult.DoesNotExist:
rprint(f'[red]ArchiveResult {archiveresult_id} not found[/red]', file=sys.stderr)
rprint(f"[red]ArchiveResult {archiveresult_id} not found[/red]", file=sys.stderr)
return 1
rprint(f'[blue]Extracting {archiveresult.plugin} for {archiveresult.snapshot.url}[/blue]', file=sys.stderr)
rprint(f"[blue]Extracting {archiveresult.plugin} for {archiveresult.snapshot.url}[/blue]", file=sys.stderr)
try:
archiveresult.reset_for_retry()
snapshot = archiveresult.snapshot
snapshot.status = snapshot.StatusChoices.QUEUED
snapshot.retry_at = timezone.now()
snapshot.save(update_fields=['status', 'retry_at', 'modified_at'])
snapshot.save(update_fields=["status", "retry_at", "modified_at"])
crawl = snapshot.crawl
if crawl.status != crawl.StatusChoices.STARTED:
crawl.status = crawl.StatusChoices.QUEUED
crawl.retry_at = timezone.now()
crawl.save(update_fields=['status', 'retry_at', 'modified_at'])
crawl.save(update_fields=["status", "retry_at", "modified_at"])
run_crawl(str(crawl.id), snapshot_ids=[str(snapshot.id)], selected_plugins=[archiveresult.plugin])
archiveresult.refresh_from_db()
if archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED:
print(f'[green]Extraction succeeded: {archiveresult.output_str}[/green]')
print(f"[green]Extraction succeeded: {archiveresult.output_str}[/green]")
return 0
elif archiveresult.status == ArchiveResult.StatusChoices.NORESULTS:
print(f'[dim]Extraction completed with no results: {archiveresult.output_str}[/dim]')
print(f"[dim]Extraction completed with no results: {archiveresult.output_str}[/dim]")
return 0
elif archiveresult.status == ArchiveResult.StatusChoices.FAILED:
print(f'[red]Extraction failed: {archiveresult.output_str}[/red]', file=sys.stderr)
print(f"[red]Extraction failed: {archiveresult.output_str}[/red]", file=sys.stderr)
return 1
else:
# Still in progress or backoff - not a failure
print(f'[yellow]Extraction status: {archiveresult.status}[/yellow]')
print(f"[yellow]Extraction status: {archiveresult.status}[/yellow]")
return 0
except Exception as e:
print(f'[red]Extraction error: {type(e).__name__}: {e}[/red]', file=sys.stderr)
print(f"[red]Extraction error: {type(e).__name__}: {e}[/red]", file=sys.stderr)
return 1
def run_plugins(
args: tuple,
records: list[dict] | None = None,
plugins: str = '',
plugins: str = "",
wait: bool = True,
emit_results: bool = True,
) -> int:
"""
Run plugins on Snapshots from input.
@@ -111,16 +112,18 @@ def run_plugins(
from django.utils import timezone
from archivebox.misc.jsonl import (
read_args_or_stdin, write_record,
TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
read_args_or_stdin,
write_record,
TYPE_SNAPSHOT,
TYPE_ARCHIVERESULT,
)
from archivebox.core.models import Snapshot, ArchiveResult
from archivebox.core.models import Snapshot
from archivebox.services.runner import run_crawl
is_tty = sys.stdout.isatty()
# Parse comma-separated plugins list once (reused in creation and filtering)
plugins_list = [p.strip() for p in plugins.split(',') if p.strip()] if plugins else []
plugins_list = [p.strip() for p in plugins.split(",") if p.strip()] if plugins else []
# Parse stdin/args exactly once per CLI invocation.
# `main()` may already have consumed stdin to distinguish Snapshot input from
@@ -130,41 +133,41 @@ def run_plugins(
records = list(read_args_or_stdin(args))
if not records:
rprint('[yellow]No snapshots provided. Pass snapshot IDs as arguments or via stdin.[/yellow]', file=sys.stderr)
rprint("[yellow]No snapshots provided. Pass snapshot IDs as arguments or via stdin.[/yellow]", file=sys.stderr)
return 1
# Gather snapshot IDs and optional plugin constraints to process
snapshot_ids = set()
requested_plugins_by_snapshot: dict[str, set[str]] = defaultdict(set)
for record in records:
record_type = record.get('type')
record_type = record.get("type")
if record_type == TYPE_SNAPSHOT:
snapshot_id = record.get('id')
snapshot_id = record.get("id")
if snapshot_id:
snapshot_ids.add(snapshot_id)
elif record.get('url'):
elif record.get("url"):
# Look up by URL (get most recent if multiple exist)
snap = Snapshot.objects.filter(url=record['url']).order_by('-created_at').first()
snap = Snapshot.objects.filter(url=record["url"]).order_by("-created_at").first()
if snap:
snapshot_ids.add(str(snap.id))
else:
rprint(f'[yellow]Snapshot not found for URL: {record["url"]}[/yellow]', file=sys.stderr)
rprint(f"[yellow]Snapshot not found for URL: {record['url']}[/yellow]", file=sys.stderr)
elif record_type == TYPE_ARCHIVERESULT:
snapshot_id = record.get('snapshot_id')
snapshot_id = record.get("snapshot_id")
if snapshot_id:
snapshot_ids.add(snapshot_id)
plugin_name = record.get('plugin')
plugin_name = record.get("plugin")
if plugin_name and not plugins_list:
requested_plugins_by_snapshot[str(snapshot_id)].add(str(plugin_name))
elif 'id' in record:
elif "id" in record:
# Assume it's a snapshot ID
snapshot_ids.add(record['id'])
snapshot_ids.add(record["id"])
if not snapshot_ids:
rprint('[red]No valid snapshot IDs found in input[/red]', file=sys.stderr)
rprint("[red]No valid snapshot IDs found in input[/red]", file=sys.stderr)
return 1
# Get snapshots and ensure they have pending ArchiveResults
@@ -173,17 +176,13 @@ def run_plugins(
try:
snapshot = Snapshot.objects.get(id=snapshot_id)
except Snapshot.DoesNotExist:
rprint(f'[yellow]Snapshot {snapshot_id} not found[/yellow]', file=sys.stderr)
rprint(f"[yellow]Snapshot {snapshot_id} not found[/yellow]", file=sys.stderr)
continue
for plugin_name in requested_plugins_by_snapshot.get(str(snapshot.id), set()):
existing_result = snapshot.archiveresult_set.filter(plugin=plugin_name).order_by('-created_at').first()
if existing_result and existing_result.status in [
ArchiveResult.StatusChoices.FAILED,
ArchiveResult.StatusChoices.SKIPPED,
ArchiveResult.StatusChoices.NORESULTS,
ArchiveResult.StatusChoices.BACKOFF,
]:
requested_plugin_names = set(plugins_list) | requested_plugins_by_snapshot.get(str(snapshot.id), set())
for plugin_name in requested_plugin_names:
existing_result = snapshot.archiveresult_set.filter(plugin=plugin_name).order_by("-created_at").first()
if existing_result:
existing_result.reset_for_retry()
# Reset snapshot status to allow processing
@@ -195,34 +194,39 @@ def run_plugins(
processed_count += 1
if processed_count == 0:
rprint('[red]No snapshots to process[/red]', file=sys.stderr)
rprint("[red]No snapshots to process[/red]", file=sys.stderr)
return 1
rprint(f'[blue]Queued {processed_count} snapshots for extraction[/blue]', file=sys.stderr)
rprint(f"[blue]Queued {processed_count} snapshots for extraction[/blue]", file=sys.stderr)
# Run orchestrator if --wait (default)
if wait:
rprint('[blue]Running plugins...[/blue]', file=sys.stderr)
rprint("[blue]Running plugins...[/blue]", file=sys.stderr)
snapshot_ids_by_crawl: dict[str, set[str]] = defaultdict(set)
for snapshot_id in snapshot_ids:
try:
snapshot = Snapshot.objects.only('id', 'crawl_id').get(id=snapshot_id)
snapshot = Snapshot.objects.only("id", "crawl_id").get(id=snapshot_id)
except Snapshot.DoesNotExist:
continue
snapshot_ids_by_crawl[str(snapshot.crawl_id)].add(str(snapshot.id))
for crawl_id, crawl_snapshot_ids in snapshot_ids_by_crawl.items():
selected_plugins = plugins_list or sorted({
plugin
for snapshot_id in crawl_snapshot_ids
for plugin in requested_plugins_by_snapshot.get(str(snapshot_id), set())
}) or None
selected_plugins = (
plugins_list
or sorted(
{plugin for snapshot_id in crawl_snapshot_ids for plugin in requested_plugins_by_snapshot.get(str(snapshot_id), set())},
)
or None
)
run_crawl(
crawl_id,
snapshot_ids=sorted(crawl_snapshot_ids),
selected_plugins=selected_plugins,
)
if not emit_results:
return 0
# Output results as JSONL (when piped) or human-readable (when TTY)
for snapshot_id in snapshot_ids:
try:
@@ -234,11 +238,14 @@ def run_plugins(
for result in results:
if is_tty:
status_color = {
'succeeded': 'green',
'failed': 'red',
'skipped': 'yellow',
}.get(result.status, 'dim')
rprint(f' [{status_color}]{result.status}[/{status_color}] {result.plugin}{result.output_str or ""}', file=sys.stderr)
"succeeded": "green",
"failed": "red",
"skipped": "yellow",
}.get(result.status, "dim")
rprint(
f" [{status_color}]{result.status}[/{status_color}] {result.plugin}{result.output_str or ''}",
file=sys.stderr,
)
else:
write_record(result.to_json())
except Snapshot.DoesNotExist:
@@ -250,18 +257,20 @@ def run_plugins(
def is_archiveresult_id(value: str) -> bool:
"""Check if value looks like an ArchiveResult UUID."""
import re
uuid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.I)
uuid_pattern = re.compile(r"^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$", re.I)
if not uuid_pattern.match(value):
return False
# Verify it's actually an ArchiveResult (not a Snapshot or other object)
from archivebox.core.models import ArchiveResult
return ArchiveResult.objects.filter(id=value).exists()
@click.command()
@click.option('--plugins', '--plugin', '-p', default='', help='Comma-separated list of plugins to run (e.g., screenshot,singlefile)')
@click.option('--wait/--no-wait', default=True, help='Wait for plugins to complete (default: wait)')
@click.argument('args', nargs=-1)
@click.option("--plugins", "--plugin", "-p", default="", help="Comma-separated list of plugins to run (e.g., screenshot,singlefile)")
@click.option("--wait/--no-wait", default=True, help="Wait for plugins to complete (default: wait)")
@click.argument("args", nargs=-1)
def main(plugins: str, wait: bool, args: tuple):
"""Run plugins on Snapshots, or process existing ArchiveResults by ID"""
from archivebox.misc.jsonl import read_args_or_stdin
@@ -271,14 +280,12 @@ def main(plugins: str, wait: bool, args: tuple):
if not records:
from rich import print as rprint
rprint('[yellow]No Snapshot IDs or ArchiveResult IDs provided. Pass as arguments or via stdin.[/yellow]', file=sys.stderr)
rprint("[yellow]No Snapshot IDs or ArchiveResult IDs provided. Pass as arguments or via stdin.[/yellow]", file=sys.stderr)
sys.exit(1)
# Check if input looks like existing ArchiveResult IDs to process
all_are_archiveresult_ids = all(
is_archiveresult_id(r.get('id') or r.get('url', ''))
for r in records
)
all_are_archiveresult_ids = all(is_archiveresult_id(r.get("id") or r.get("url", "")) for r in records)
if all_are_archiveresult_ids:
# Process existing ArchiveResults by ID
@@ -286,9 +293,9 @@ def main(plugins: str, wait: bool, args: tuple):
exit_code = 0
for record in records:
archiveresult_id = record.get('id') or record.get('url')
archiveresult_id = record.get("id") or record.get("url")
if not isinstance(archiveresult_id, str):
rprint(f'[red]Invalid ArchiveResult input: {record}[/red]', file=sys.stderr)
rprint(f"[red]Invalid ArchiveResult input: {record}[/red]", file=sys.stderr)
exit_code = 1
continue
result = process_archiveresult_by_id(archiveresult_id)
@@ -300,5 +307,5 @@ def main(plugins: str, wait: bool, args: tuple):
sys.exit(run_plugins(args, records=records, plugins=plugins, wait=wait))
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -1,8 +1,8 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__command__ = 'archivebox help'
__package__ = "archivebox.cli"
__command__ = "archivebox help"
import os
import os
from pathlib import Path
import click
@@ -17,33 +17,44 @@ def help() -> None:
from archivebox.config import CONSTANTS
from archivebox.config.permissions import IN_DOCKER
from archivebox.misc.logging_util import log_cli_command
log_cli_command('help', [], None, '.')
COMMANDS_HELP_TEXT = '\n '.join(
f'[green]{cmd.ljust(20)}[/green] {ArchiveBoxGroup._lazy_load(cmd).__doc__}'
for cmd in ArchiveBoxGroup.meta_commands.keys()
) + '\n\n ' + '\n '.join(
f'[green]{cmd.ljust(20)}[/green] {ArchiveBoxGroup._lazy_load(cmd).__doc__}'
for cmd in ArchiveBoxGroup.setup_commands.keys()
) + '\n\n ' + '\n '.join(
f'[green]{cmd.ljust(20)}[/green] {ArchiveBoxGroup._lazy_load(cmd).__doc__}'
for cmd in ArchiveBoxGroup.archive_commands.keys()
log_cli_command("help", [], None, ".")
COMMANDS_HELP_TEXT = (
"\n ".join(
f"[green]{cmd.ljust(20)}[/green] {ArchiveBoxGroup._lazy_load(cmd).__doc__}" for cmd in ArchiveBoxGroup.meta_commands.keys()
)
+ "\n\n "
+ "\n ".join(
f"[green]{cmd.ljust(20)}[/green] {ArchiveBoxGroup._lazy_load(cmd).__doc__}" for cmd in ArchiveBoxGroup.setup_commands.keys()
)
+ "\n\n "
+ "\n ".join(
f"[green]{cmd.ljust(20)}[/green] {ArchiveBoxGroup._lazy_load(cmd).__doc__}" for cmd in ArchiveBoxGroup.archive_commands.keys()
)
)
DOCKER_USAGE = '''
DOCKER_USAGE = (
"""
[dodger_blue3]Docker Usage:[/dodger_blue3]
[grey53]# using Docker Compose:[/grey53]
[blue]docker compose run[/blue] [dark_green]archivebox[/dark_green] [green]\\[command][/green] [green3][...args][/green3] [violet][--help][/violet] [grey53][--version][/grey53]
[grey53]# using Docker:[/grey53]
[blue]docker run[/blue] -v [light_slate_blue]$PWD:/data[/light_slate_blue] [grey53]-p 8000:8000[/grey53] -it [dark_green]archivebox/archivebox[/dark_green] [green]\\[command][/green] [green3][...args][/green3] [violet][--help][/violet] [grey53][--version][/grey53]
''' if IN_DOCKER else ''
DOCKER_DOCS = '\n [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#usage]https://github.com/ArchiveBox/ArchiveBox/wiki/Docker[/link]' if IN_DOCKER else ''
DOCKER_OUTSIDE_HINT = "\n [grey53]# outside of Docker:[/grey53]" if IN_DOCKER else ''
DOCKER_CMD_PREFIX = "[blue]docker ... [/blue]" if IN_DOCKER else ''
"""
if IN_DOCKER
else ""
)
DOCKER_DOCS = (
"\n [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#usage]https://github.com/ArchiveBox/ArchiveBox/wiki/Docker[/link]"
if IN_DOCKER
else ""
)
DOCKER_OUTSIDE_HINT = "\n [grey53]# outside of Docker:[/grey53]" if IN_DOCKER else ""
DOCKER_CMD_PREFIX = "[blue]docker ... [/blue]" if IN_DOCKER else ""
print(f'''{DOCKER_USAGE}
print(f"""{DOCKER_USAGE}
[deep_sky_blue4]Usage:[/deep_sky_blue4]{DOCKER_OUTSIDE_HINT}
[dark_green]archivebox[/dark_green] [green]\\[command][/green] [green3][...args][/green3] [violet][--help][/violet] [grey53][--version][/grey53]
@@ -54,12 +65,11 @@ def help() -> None:
[link=https://github.com/ArchiveBox/ArchiveBox/wiki]https://github.com/ArchiveBox/ArchiveBox/wiki[/link]{DOCKER_DOCS}
[link=https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#cli-usage]https://github.com/ArchiveBox/ArchiveBox/wiki/Usage[/link]
[link=https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration]https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration[/link]
''')
""")
if os.access(CONSTANTS.ARCHIVE_DIR, os.R_OK) and CONSTANTS.ARCHIVE_DIR.is_dir():
pretty_out_dir = str(CONSTANTS.DATA_DIR).replace(str(Path('~').expanduser()), '~')
EXAMPLE_USAGE = f'''
pretty_out_dir = str(CONSTANTS.DATA_DIR).replace(str(Path("~").expanduser()), "~")
EXAMPLE_USAGE = f"""
[light_slate_blue]DATA DIR[/light_slate_blue]: [yellow]{pretty_out_dir}[/yellow]
[violet]Hint:[/violet] [i]Common maintenance tasks:[/i]
@@ -73,33 +83,49 @@ def help() -> None:
[dark_green]archivebox[/dark_green] [green]list[/green] --sort=timestamp --csv=timestamp,downloaded_at,url,title
[dark_green]archivebox[/dark_green] [green]schedule[/green] --every=day --depth=1 "https://example.com/some/feed.rss"
[dark_green]archivebox[/dark_green] [green]server[/green] [blue]0.0.0.0:8000[/blue] [grey53]# Start the Web UI / API server[/grey53]
'''
print(Panel(EXAMPLE_USAGE, expand=False, border_style='grey53', title='[green3]:white_check_mark: A collection [light_slate_blue]DATA DIR[/light_slate_blue] is currently active[/green3]', subtitle='Commands run inside this dir will only apply to this collection.'))
"""
print(
Panel(
EXAMPLE_USAGE,
expand=False,
border_style="grey53",
title="[green3]:white_check_mark: A collection [light_slate_blue]DATA DIR[/light_slate_blue] is currently active[/green3]",
subtitle="Commands run inside this dir will only apply to this collection.",
),
)
else:
DATA_SETUP_HELP = '\n'
DATA_SETUP_HELP = "\n"
if IN_DOCKER:
DATA_SETUP_HELP += '[violet]Hint:[/violet] When using Docker, you need to mount a volume to use as your data dir:\n'
DATA_SETUP_HELP += ' docker run [violet]-v /some/path/data:/data[/violet] archivebox/archivebox ...\n\n'
DATA_SETUP_HELP += 'To load an [dark_blue]existing[/dark_blue] collection:\n'
DATA_SETUP_HELP += ' 1. [green]cd[/green] ~/archivebox/data [grey53]# go into existing [light_slate_blue]DATA DIR[/light_slate_blue] (can be anywhere)[/grey53]\n'
DATA_SETUP_HELP += f' 2. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]init[/green] [grey53]# migrate to latest version (safe to run multiple times)[/grey53]\n'
DATA_SETUP_HELP += f' 3. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]install[/green] [grey53]# auto-update all plugins (wget, chrome, singlefile, etc.)[/grey53]\n'
DATA_SETUP_HELP += f' 4. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]help[/green] [grey53]# ...get help with next steps... [/grey53]\n\n'
DATA_SETUP_HELP += 'To start a [sea_green1]new[/sea_green1] collection:\n'
DATA_SETUP_HELP += ' 1. [green]mkdir[/green] ~/archivebox/data [grey53]# create a new, empty [light_slate_blue]DATA DIR[/light_slate_blue] (can be anywhere)[/grey53]\n'
DATA_SETUP_HELP += ' 2. [green]cd[/green] ~/archivebox/data [grey53]# cd into the new directory[/grey53]\n'
DATA_SETUP_HELP += f' 3. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]init[/green] [grey53]# initialize ArchiveBox in the new data dir[/grey53]\n'
DATA_SETUP_HELP += f' 4. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]install[/green] [grey53]# auto-install all plugins (wget, chrome, singlefile, etc.)[/grey53]\n'
DATA_SETUP_HELP += f' 5. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]help[/green] [grey53]# ... get help with next steps... [/grey53]\n'
print(Panel(DATA_SETUP_HELP, expand=False, border_style='grey53', title='[red]:cross_mark: No collection is currently active[/red]', subtitle='All archivebox [green]commands[/green] should be run from inside a collection [light_slate_blue]DATA DIR[/light_slate_blue]'))
DATA_SETUP_HELP += "[violet]Hint:[/violet] When using Docker, you need to mount a volume to use as your data dir:\n"
DATA_SETUP_HELP += " docker run [violet]-v /some/path/data:/data[/violet] archivebox/archivebox ...\n\n"
DATA_SETUP_HELP += "To load an [dark_blue]existing[/dark_blue] collection:\n"
DATA_SETUP_HELP += " 1. [green]cd[/green] ~/archivebox/data [grey53]# go into existing [light_slate_blue]DATA DIR[/light_slate_blue] (can be anywhere)[/grey53]\n"
DATA_SETUP_HELP += f" 2. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]init[/green] [grey53]# migrate to latest version (safe to run multiple times)[/grey53]\n"
DATA_SETUP_HELP += f" 3. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]install[/green] [grey53]# auto-update all plugins (wget, chrome, singlefile, etc.)[/grey53]\n"
DATA_SETUP_HELP += f" 4. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]help[/green] [grey53]# ...get help with next steps... [/grey53]\n\n"
DATA_SETUP_HELP += "To start a [sea_green1]new[/sea_green1] collection:\n"
DATA_SETUP_HELP += " 1. [green]mkdir[/green] ~/archivebox/data [grey53]# create a new, empty [light_slate_blue]DATA DIR[/light_slate_blue] (can be anywhere)[/grey53]\n"
DATA_SETUP_HELP += " 2. [green]cd[/green] ~/archivebox/data [grey53]# cd into the new directory[/grey53]\n"
DATA_SETUP_HELP += f" 3. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]init[/green] [grey53]# initialize ArchiveBox in the new data dir[/grey53]\n"
DATA_SETUP_HELP += f" 4. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]install[/green] [grey53]# auto-install all plugins (wget, chrome, singlefile, etc.)[/grey53]\n"
DATA_SETUP_HELP += f" 5. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]help[/green] [grey53]# ... get help with next steps... [/grey53]\n"
print(
Panel(
DATA_SETUP_HELP,
expand=False,
border_style="grey53",
title="[red]:cross_mark: No collection is currently active[/red]",
subtitle="All archivebox [green]commands[/green] should be run from inside a collection [light_slate_blue]DATA DIR[/light_slate_blue]",
),
)
@click.command()
@click.option('--help', '-h', is_flag=True, help='Show help')
@click.option("--help", "-h", is_flag=True, help="Show help")
def main(**kwargs):
"""Print the ArchiveBox help message and usage"""
return help()
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -1,11 +1,11 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__package__ = "archivebox.cli"
import os
import sys
from pathlib import Path
from typing import Mapping
from collections.abc import Mapping
from rich import print
import rich_click as click
@@ -14,12 +14,12 @@ from archivebox.misc.util import docstring, enforce_types
def _normalize_snapshot_record(link_dict: Mapping[str, object]) -> tuple[str, dict[str, object]] | None:
url = link_dict.get('url')
url = link_dict.get("url")
if not isinstance(url, str) or not url:
return None
record: dict[str, object] = {'url': url}
for key in ('timestamp', 'title', 'tags', 'sources'):
record: dict[str, object] = {"url": url}
for key in ("timestamp", "title", "tags", "sources"):
value = link_dict.get(key)
if value is not None:
record[key] = value
@@ -27,15 +27,15 @@ def _normalize_snapshot_record(link_dict: Mapping[str, object]) -> tuple[str, di
@enforce_types
def init(force: bool=False, quick: bool=False, install: bool=False) -> None:
def init(force: bool = False, quick: bool = False, install: bool = False) -> None:
"""Initialize a new ArchiveBox collection in the current directory"""
from archivebox.config import CONSTANTS, VERSION, DATA_DIR
from archivebox.config.common import SERVER_CONFIG
from archivebox.config.collection import write_config_file
from archivebox.misc.legacy import parse_json_main_index, parse_json_links_details
from archivebox.misc.db import apply_migrations
# if os.access(out_dir / CONSTANTS.JSON_INDEX_FILENAME, os.F_OK):
# print("[red]:warning: This folder contains a JSON index. It is deprecated, and will no longer be kept up to date automatically.[/red]", file=sys.stderr)
# print("[red] You can run `archivebox list --json --with-headers > static_index.json` to manually generate it.[/red]", file=sys.stderr)
@@ -43,69 +43,71 @@ def init(force: bool=False, quick: bool=False, install: bool=False) -> None:
is_empty = not len(set(os.listdir(DATA_DIR)) - CONSTANTS.ALLOWED_IN_DATA_DIR)
existing_index = os.path.isfile(CONSTANTS.DATABASE_FILE)
if is_empty and not existing_index:
print(f'[turquoise4][+] Initializing a new ArchiveBox v{VERSION} collection...[/turquoise4]')
print('[green]----------------------------------------------------------------------[/green]')
print(f"[turquoise4][+] Initializing a new ArchiveBox v{VERSION} collection...[/turquoise4]")
print("[green]----------------------------------------------------------------------[/green]")
elif existing_index:
# TODO: properly detect and print the existing version in current index as well
print(f'[green][*] Verifying and updating existing ArchiveBox collection to v{VERSION}...[/green]')
print('[green]----------------------------------------------------------------------[/green]')
print(f"[green][*] Verifying and updating existing ArchiveBox collection to v{VERSION}...[/green]")
print("[green]----------------------------------------------------------------------[/green]")
else:
if force:
print('[red][!] This folder appears to already have files in it, but no index.sqlite3 is present.[/red]')
print('[red] Because --force was passed, ArchiveBox will initialize anyway (which may overwrite existing files).[/red]')
print("[red][!] This folder appears to already have files in it, but no index.sqlite3 is present.[/red]")
print("[red] Because --force was passed, ArchiveBox will initialize anyway (which may overwrite existing files).[/red]")
else:
print(
("[red][X] This folder appears to already have files in it, but no index.sqlite3 present.[/red]\n\n"
"[red][X] This folder appears to already have files in it, but no index.sqlite3 present.[/red]\n\n"
" You must run init in a completely empty directory, or an existing data folder.\n\n"
" [violet]Hint:[/violet] To import an existing data folder make sure to cd into the folder first, \n"
" then run and run 'archivebox init' to pick up where you left off.\n\n"
" (Always make sure your data folder is backed up first before updating ArchiveBox)"
)
" (Always make sure your data folder is backed up first before updating ArchiveBox)",
)
raise SystemExit(2)
if existing_index:
print('\n[green][*] Verifying archive folder structure...[/green]')
print("\n[green][*] Verifying archive folder structure...[/green]")
else:
print('\n[green][+] Building archive folder structure...[/green]')
print(f' + ./{CONSTANTS.ARCHIVE_DIR.relative_to(DATA_DIR)}, ./{CONSTANTS.SOURCES_DIR.relative_to(DATA_DIR)}, ./{CONSTANTS.LOGS_DIR.relative_to(DATA_DIR)}...')
print("\n[green][+] Building archive folder structure...[/green]")
print(
f" + ./{CONSTANTS.ARCHIVE_DIR.relative_to(DATA_DIR)}, ./{CONSTANTS.SOURCES_DIR.relative_to(DATA_DIR)}, ./{CONSTANTS.LOGS_DIR.relative_to(DATA_DIR)}...",
)
Path(CONSTANTS.SOURCES_DIR).mkdir(exist_ok=True)
Path(CONSTANTS.ARCHIVE_DIR).mkdir(exist_ok=True)
Path(CONSTANTS.LOGS_DIR).mkdir(exist_ok=True)
print(f' + ./{CONSTANTS.CONFIG_FILE.relative_to(DATA_DIR)}...')
print(f" + ./{CONSTANTS.CONFIG_FILE.relative_to(DATA_DIR)}...")
# create the .archivebox_id file with a unique ID for this collection
from archivebox.config.paths import _get_collection_id
_get_collection_id(DATA_DIR, force_create=True)
# create the ArchiveBox.conf file
write_config_file({'SECRET_KEY': SERVER_CONFIG.SECRET_KEY})
_get_collection_id(DATA_DIR, force_create=True)
# create the ArchiveBox.conf file
write_config_file({"SECRET_KEY": SERVER_CONFIG.SECRET_KEY})
if os.access(CONSTANTS.DATABASE_FILE, os.F_OK):
print('\n[green][*] Verifying main SQL index and running any migrations needed...[/green]')
print("\n[green][*] Verifying main SQL index and running any migrations needed...[/green]")
else:
print('\n[green][+] Building main SQL index and running initial migrations...[/green]')
print("\n[green][+] Building main SQL index and running initial migrations...[/green]")
from archivebox.config.django import setup_django
setup_django()
for migration_line in apply_migrations(DATA_DIR):
sys.stdout.write(f' {migration_line}\n')
sys.stdout.write(f" {migration_line}\n")
assert os.path.isfile(CONSTANTS.DATABASE_FILE) and os.access(CONSTANTS.DATABASE_FILE, os.R_OK)
print()
print(f' √ ./{CONSTANTS.DATABASE_FILE.relative_to(DATA_DIR)}')
print(f" √ ./{CONSTANTS.DATABASE_FILE.relative_to(DATA_DIR)}")
# from django.contrib.auth.models import User
# if SHELL_CONFIG.IS_TTY and not User.objects.filter(is_superuser=True).exclude(username='system').exists():
# print('{green}[+] Creating admin user account...{reset}'.format(**SHELL_CONFIG.ANSI))
# call_command("createsuperuser", interactive=True)
print()
print('[dodger_blue3][*] Checking links from indexes and archive folders (safe to Ctrl+C)...[/dodger_blue3]')
print("[dodger_blue3][*] Checking links from indexes and archive folders (safe to Ctrl+C)...[/dodger_blue3]")
from archivebox.core.models import Snapshot
@@ -114,10 +116,10 @@ def init(force: bool=False, quick: bool=False, install: bool=False) -> None:
if existing_index:
all_links = Snapshot.objects.all()
print(f' √ Loaded {all_links.count()} links from existing main index.')
print(f" √ Loaded {all_links.count()} links from existing main index.")
if quick:
print(' > Skipping orphan snapshot import (quick mode)')
print(" > Skipping orphan snapshot import (quick mode)")
else:
try:
# Import orphaned links from legacy JSON indexes
@@ -131,7 +133,7 @@ def init(force: bool=False, quick: bool=False, install: bool=False) -> None:
orphaned_json_links[url] = record
if orphaned_json_links:
pending_links.update(orphaned_json_links)
print(f' [yellow]√ Added {len(orphaned_json_links)} orphaned links from existing JSON index...[/yellow]')
print(f" [yellow]√ Added {len(orphaned_json_links)} orphaned links from existing JSON index...[/yellow]")
orphaned_data_dir_links: dict[str, dict[str, object]] = {}
for link_dict in parse_json_links_details(DATA_DIR):
@@ -143,7 +145,7 @@ def init(force: bool=False, quick: bool=False, install: bool=False) -> None:
orphaned_data_dir_links[url] = record
if orphaned_data_dir_links:
pending_links.update(orphaned_data_dir_links)
print(f' [yellow]√ Added {len(orphaned_data_dir_links)} orphaned links from existing archive directories.[/yellow]')
print(f" [yellow]√ Added {len(orphaned_data_dir_links)} orphaned links from existing archive directories.[/yellow]")
if pending_links:
for link_dict in pending_links.values():
@@ -151,42 +153,44 @@ def init(force: bool=False, quick: bool=False, install: bool=False) -> None:
# Hint for orphaned snapshot directories
print()
print(' [violet]Hint:[/violet] To import orphaned snapshot directories and reconcile filesystem state, run:')
print(' archivebox update')
print(" [violet]Hint:[/violet] To import orphaned snapshot directories and reconcile filesystem state, run:")
print(" archivebox update")
except (KeyboardInterrupt, SystemExit):
print(file=sys.stderr)
print('[yellow]:stop_sign: Stopped checking archive directories due to Ctrl-C/SIGTERM[/yellow]', file=sys.stderr)
print(' Your archive data is safe, but you should re-run `archivebox init` to finish the process later.', file=sys.stderr)
print("[yellow]:stop_sign: Stopped checking archive directories due to Ctrl-C/SIGTERM[/yellow]", file=sys.stderr)
print(" Your archive data is safe, but you should re-run `archivebox init` to finish the process later.", file=sys.stderr)
print(file=sys.stderr)
print(' [violet]Hint:[/violet] In the future you can run a quick init without checking dirs like so:', file=sys.stderr)
print(' archivebox init --quick', file=sys.stderr)
print(" [violet]Hint:[/violet] In the future you can run a quick init without checking dirs like so:", file=sys.stderr)
print(" archivebox init --quick", file=sys.stderr)
raise SystemExit(1)
print('\n[green]----------------------------------------------------------------------[/green]')
print("\n[green]----------------------------------------------------------------------[/green]")
from django.contrib.auth.models import User
if (SERVER_CONFIG.ADMIN_USERNAME and SERVER_CONFIG.ADMIN_PASSWORD) and not User.objects.filter(username=SERVER_CONFIG.ADMIN_USERNAME).exists():
print('[green][+] Found ADMIN_USERNAME and ADMIN_PASSWORD configuration options, creating new admin user.[/green]')
if (SERVER_CONFIG.ADMIN_USERNAME and SERVER_CONFIG.ADMIN_PASSWORD) and not User.objects.filter(
username=SERVER_CONFIG.ADMIN_USERNAME,
).exists():
print("[green][+] Found ADMIN_USERNAME and ADMIN_PASSWORD configuration options, creating new admin user.[/green]")
User.objects.create_superuser(username=SERVER_CONFIG.ADMIN_USERNAME, password=SERVER_CONFIG.ADMIN_PASSWORD)
if existing_index:
print('[green][√] Done. Verified and updated the existing ArchiveBox collection.[/green]')
print("[green][√] Done. Verified and updated the existing ArchiveBox collection.[/green]")
else:
print(f'[green][√] Done. A new ArchiveBox collection was initialized ({len(all_links) + len(pending_links)} links).[/green]')
print(f"[green][√] Done. A new ArchiveBox collection was initialized ({len(all_links) + len(pending_links)} links).[/green]")
CONSTANTS.PERSONAS_DIR.mkdir(parents=True, exist_ok=True)
CONSTANTS.DEFAULT_TMP_DIR.mkdir(parents=True, exist_ok=True)
CONSTANTS.DEFAULT_LIB_DIR.mkdir(parents=True, exist_ok=True)
(CONSTANTS.DEFAULT_LIB_DIR / 'bin').mkdir(parents=True, exist_ok=True)
(CONSTANTS.DEFAULT_LIB_DIR / "bin").mkdir(parents=True, exist_ok=True)
from archivebox.config.common import STORAGE_CONFIG
from archivebox.config.paths import get_or_create_working_tmp_dir, get_or_create_working_lib_dir
STORAGE_CONFIG.TMP_DIR.mkdir(parents=True, exist_ok=True)
STORAGE_CONFIG.LIB_DIR.mkdir(parents=True, exist_ok=True)
(STORAGE_CONFIG.LIB_DIR / 'bin').mkdir(parents=True, exist_ok=True)
(STORAGE_CONFIG.LIB_DIR / "bin").mkdir(parents=True, exist_ok=True)
working_tmp_dir = get_or_create_working_tmp_dir(autofix=True, quiet=True)
if working_tmp_dir:
@@ -195,33 +199,35 @@ def init(force: bool=False, quick: bool=False, install: bool=False) -> None:
working_lib_dir = get_or_create_working_lib_dir(autofix=True, quiet=True)
if working_lib_dir:
working_lib_dir.mkdir(parents=True, exist_ok=True)
(working_lib_dir / 'bin').mkdir(parents=True, exist_ok=True)
(working_lib_dir / "bin").mkdir(parents=True, exist_ok=True)
if install:
from archivebox.cli.archivebox_install import install as install_method
install_method()
if Snapshot.objects.count() < 25: # hide the hints for experienced users
if Snapshot.objects.count() < 25: # hide the hints for experienced users
print()
print(' [violet]Hint:[/violet] To view your archive index, run:')
print(' archivebox server # then visit [deep_sky_blue4][link=http://127.0.0.1:8000]http://127.0.0.1:8000[/link][/deep_sky_blue4]')
print(" [violet]Hint:[/violet] To view your archive index, run:")
print(
" archivebox server # then visit [deep_sky_blue4][link=http://127.0.0.1:8000]http://127.0.0.1:8000[/link][/deep_sky_blue4]",
)
print()
print(' To add new links, you can run:')
print(" To add new links, you can run:")
print(" archivebox add < ~/some/path/to/list_of_links.txt")
print()
print(' For more usage and examples, run:')
print(' archivebox help')
print(" For more usage and examples, run:")
print(" archivebox help")
@click.command()
@click.option('--force', '-f', is_flag=True, help='Ignore unrecognized files in current directory and initialize anyway')
@click.option('--quick', '-q', is_flag=True, help='Run any updates or migrations without rechecking all snapshot dirs')
@click.option('--install', '-s', is_flag=True, help='Automatically install dependencies and extras used for archiving')
@click.option("--force", "-f", is_flag=True, help="Ignore unrecognized files in current directory and initialize anyway")
@click.option("--quick", "-q", is_flag=True, help="Run any updates or migrations without rechecking all snapshot dirs")
@click.option("--install", "-s", is_flag=True, help="Automatically install dependencies and extras used for archiving")
@docstring(init.__doc__)
def main(**kwargs) -> None:
init(**kwargs)
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -1,6 +1,6 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__package__ = "archivebox.cli"
import os
@@ -11,7 +11,7 @@ from archivebox.misc.util import docstring, enforce_types
@enforce_types
def install(binaries: tuple[str, ...] = (), binproviders: str = '*', dry_run: bool = False) -> None:
def install(binaries: tuple[str, ...] = (), binproviders: str = "*", dry_run: bool = False) -> None:
"""Detect and install ArchiveBox dependencies by running the abx-dl install flow
Examples:
@@ -31,33 +31,34 @@ def install(binaries: tuple[str, ...] = (), binproviders: str = '*', dry_run: bo
# Show what we're installing
if binaries:
print(f'\n[green][+] Installing specific binaries: {", ".join(binaries)}[/green]')
print(f"\n[green][+] Installing specific binaries: {', '.join(binaries)}[/green]")
else:
print('\n[green][+] Detecting and installing all ArchiveBox dependencies...[/green]')
print("\n[green][+] Detecting and installing all ArchiveBox dependencies...[/green]")
if binproviders != '*':
print(f'[green][+] Using providers: {binproviders}[/green]')
if binproviders != "*":
print(f"[green][+] Using providers: {binproviders}[/green]")
if IS_ROOT:
EUID = os.geteuid()
print()
print(f'[yellow]:warning: Running as UID=[blue]{EUID}[/blue].[/yellow]')
print(f' DATA_DIR will be owned by [blue]{ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/blue].')
print(f"[yellow]:warning: Running as UID=[blue]{EUID}[/blue].[/yellow]")
print(f" DATA_DIR will be owned by [blue]{ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/blue].")
print()
if dry_run:
print('[dim]Dry run - would run the abx-dl install flow[/dim]')
print("[dim]Dry run - would run the abx-dl install flow[/dim]")
return
# Set up Django
from archivebox.config.django import setup_django
setup_django()
plugin_names = list(binaries)
if binproviders != '*':
plugin_names.extend(provider.strip() for provider in binproviders.split(',') if provider.strip())
if binproviders != "*":
plugin_names.extend(provider.strip() for provider in binproviders.split(",") if provider.strip())
print('[+] Running installer via abx-dl bus...')
print("[+] Running installer via abx-dl bus...")
print()
from archivebox.services.runner import run_install
@@ -68,28 +69,36 @@ def install(binaries: tuple[str, ...] = (), binproviders: str = '*', dry_run: bo
# Check for superuser
from django.contrib.auth import get_user_model
User = get_user_model()
if not User.objects.filter(is_superuser=True).exclude(username='system').exists():
stderr('\n[+] Don\'t forget to create a new admin user for the Web UI...', color='green')
stderr(' archivebox manage createsuperuser')
if not User.objects.filter(is_superuser=True).exclude(username="system").exists():
stderr("\n[+] Don't forget to create a new admin user for the Web UI...", color="green")
stderr(" archivebox manage createsuperuser")
print()
# Show version to display full status including installed binaries
# Django is already loaded, so just import and call the function directly
from archivebox.cli.archivebox_version import version as show_version
show_version(quiet=False)
@click.command()
@click.argument('binaries', nargs=-1, type=str, required=False)
@click.option('--binproviders', '-p', default='*', help='Comma-separated list of providers to use (pip,npm,brew,apt,env,custom) or * for all', show_default=True)
@click.option('--dry-run', '-d', is_flag=True, help='Show what would happen without actually running', default=False)
@click.argument("binaries", nargs=-1, type=str, required=False)
@click.option(
"--binproviders",
"-p",
default="*",
help="Comma-separated list of providers to use (pip,npm,brew,apt,env,custom) or * for all",
show_default=True,
)
@click.option("--dry-run", "-d", is_flag=True, help="Show what would happen without actually running", default=False)
@docstring(install.__doc__)
def main(**kwargs) -> None:
install(**kwargs)
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -1,10 +1,9 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__command__ = 'archivebox list'
__package__ = "archivebox.cli"
__command__ = "archivebox list"
import sys
from typing import Optional
import rich_click as click
@@ -12,31 +11,47 @@ from archivebox.cli.archivebox_snapshot import list_snapshots
@click.command()
@click.option('--status', '-s', help='Filter by status (queued, started, sealed)')
@click.option('--url__icontains', help='Filter by URL contains')
@click.option('--url__istartswith', help='Filter by URL starts with')
@click.option('--tag', '-t', help='Filter by tag name')
@click.option('--crawl-id', help='Filter by crawl ID')
@click.option('--limit', '-n', type=int, help='Limit number of results')
@click.option('--sort', '-o', type=str, help='Field to sort by, e.g. url, created_at, bookmarked_at, downloaded_at')
@click.option('--csv', '-C', type=str, help='Print output as CSV with the provided fields, e.g.: timestamp,url,title')
@click.option('--with-headers', is_flag=True, help='Include column headers in structured output')
def main(status: Optional[str], url__icontains: Optional[str], url__istartswith: Optional[str],
tag: Optional[str], crawl_id: Optional[str], limit: Optional[int],
sort: Optional[str], csv: Optional[str], with_headers: bool) -> None:
@click.option("--status", "-s", help="Filter by status (queued, started, sealed)")
@click.option("--url__icontains", help="Filter by URL contains")
@click.option("--url__istartswith", help="Filter by URL starts with")
@click.option("--tag", "-t", help="Filter by tag name")
@click.option("--crawl-id", help="Filter by crawl ID")
@click.option("--limit", "-n", type=int, help="Limit number of results")
@click.option("--sort", "-o", type=str, help="Field to sort by, e.g. url, created_at, bookmarked_at, downloaded_at")
@click.option("--csv", "-C", type=str, help="Print output as CSV with the provided fields, e.g.: timestamp,url,title")
@click.option("--with-headers", is_flag=True, help="Include column headers in structured output")
@click.option("--search", type=click.Choice(["meta", "content", "contents", "deep"]), help="Search mode to use for the query")
@click.argument("query", nargs=-1)
def main(
status: str | None,
url__icontains: str | None,
url__istartswith: str | None,
tag: str | None,
crawl_id: str | None,
limit: int | None,
sort: str | None,
csv: str | None,
with_headers: bool,
search: str | None,
query: tuple[str, ...],
) -> None:
"""List Snapshots."""
sys.exit(list_snapshots(
status=status,
url__icontains=url__icontains,
url__istartswith=url__istartswith,
tag=tag,
crawl_id=crawl_id,
limit=limit,
sort=sort,
csv=csv,
with_headers=with_headers,
))
sys.exit(
list_snapshots(
status=status,
url__icontains=url__icontains,
url__istartswith=url__istartswith,
tag=tag,
crawl_id=crawl_id,
limit=limit,
sort=sort,
csv=csv,
with_headers=with_headers,
search=search,
query=" ".join(query),
),
)
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -19,11 +19,10 @@ Examples:
archivebox machine list --hostname__icontains=myserver
"""
__package__ = 'archivebox.cli'
__command__ = 'archivebox machine'
__package__ = "archivebox.cli"
__command__ = "archivebox machine"
import sys
from typing import Optional
import rich_click as click
from rich import print as rprint
@@ -35,10 +34,11 @@ from archivebox.cli.cli_utils import apply_filters
# LIST
# =============================================================================
def list_machines(
hostname__icontains: Optional[str] = None,
os_platform: Optional[str] = None,
limit: Optional[int] = None,
hostname__icontains: str | None = None,
os_platform: str | None = None,
limit: int | None = None,
) -> int:
"""
List Machines as JSONL with optional filters.
@@ -51,24 +51,24 @@ def list_machines(
is_tty = sys.stdout.isatty()
queryset = Machine.objects.all().order_by('-created_at')
queryset = Machine.objects.all().order_by("-created_at")
# Apply filters
filter_kwargs = {
'hostname__icontains': hostname__icontains,
'os_platform': os_platform,
"hostname__icontains": hostname__icontains,
"os_platform": os_platform,
}
queryset = apply_filters(queryset, filter_kwargs, limit=limit)
count = 0
for machine in queryset:
if is_tty:
rprint(f'[cyan]{machine.hostname:30}[/cyan] [dim]{machine.os_platform:10}[/dim] {machine.id}')
rprint(f"[cyan]{machine.hostname:30}[/cyan] [dim]{machine.os_platform:10}[/dim] {machine.id}")
else:
write_record(machine.to_json())
count += 1
rprint(f'[dim]Listed {count} machines[/dim]', file=sys.stderr)
rprint(f"[dim]Listed {count} machines[/dim]", file=sys.stderr)
return 0
@@ -76,24 +76,27 @@ def list_machines(
# CLI Commands
# =============================================================================
@click.group()
def main():
"""Manage Machine records (read-only, system-managed)."""
pass
@main.command('list')
@click.option('--hostname__icontains', help='Filter by hostname contains')
@click.option('--os-platform', help='Filter by OS platform')
@click.option('--limit', '-n', type=int, help='Limit number of results')
def list_cmd(hostname__icontains: Optional[str], os_platform: Optional[str], limit: Optional[int]):
@main.command("list")
@click.option("--hostname__icontains", help="Filter by hostname contains")
@click.option("--os-platform", help="Filter by OS platform")
@click.option("--limit", "-n", type=int, help="Limit number of results")
def list_cmd(hostname__icontains: str | None, os_platform: str | None, limit: int | None):
"""List Machines as JSONL."""
sys.exit(list_machines(
hostname__icontains=hostname__icontains,
os_platform=os_platform,
limit=limit,
))
sys.exit(
list_machines(
hostname__icontains=hostname__icontains,
os_platform=os_platform,
limit=limit,
),
)
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -1,33 +1,34 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__package__ = "archivebox.cli"
import rich_click as click
from archivebox.misc.util import docstring, enforce_types
@enforce_types
def manage(args: list[str] | None=None) -> None:
def manage(args: list[str] | None = None) -> None:
"""Run an ArchiveBox Django management command"""
from archivebox.config.common import SHELL_CONFIG
from archivebox.misc.logging import stderr
if (args and "createsuperuser" in args) and (SHELL_CONFIG.IN_DOCKER and not SHELL_CONFIG.IS_TTY):
stderr('[!] Warning: you need to pass -it to use interactive commands in docker', color='lightyellow')
stderr(' docker run -it archivebox manage {}'.format(' '.join(args or ['...'])), color='lightyellow')
stderr('')
stderr("[!] Warning: you need to pass -it to use interactive commands in docker", color="lightyellow")
stderr(" docker run -it archivebox manage {}".format(" ".join(args or ["..."])), color="lightyellow")
stderr("")
from django.core.management import execute_from_command_line
execute_from_command_line(['manage.py', *(args or ['help'])])
execute_from_command_line(["manage.py", *(args or ["help"])])
@click.command(add_help_option=False, context_settings=dict(ignore_unknown_options=True))
@click.argument('args', nargs=-1)
@click.argument("args", nargs=-1)
@docstring(manage.__doc__)
def main(args: list[str] | None=None) -> None:
def main(args: list[str] | None = None) -> None:
manage(args=args)
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -6,8 +6,8 @@ Start the Model Context Protocol (MCP) server in stdio mode.
Exposes all ArchiveBox CLI commands as MCP tools for AI agents.
"""
__package__ = 'archivebox.cli'
__command__ = 'archivebox mcp'
__package__ = "archivebox.cli"
__command__ = "archivebox mcp"
import rich_click as click
@@ -45,5 +45,5 @@ def main(**kwargs):
mcp()
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -24,8 +24,8 @@ Examples:
archivebox persona list --name=old | archivebox persona delete --yes
"""
__package__ = 'archivebox.cli'
__command__ = 'archivebox persona'
__package__ = "archivebox.cli"
__command__ = "archivebox persona"
import os
import sys
@@ -35,7 +35,7 @@ import subprocess
import tempfile
import json
from pathlib import Path
from typing import Optional, Iterable
from collections.abc import Iterable
from collections import OrderedDict
import rich_click as click
@@ -49,134 +49,145 @@ from archivebox.personas import importers as persona_importers
# Browser Profile Locations
# =============================================================================
def get_chrome_user_data_dir() -> Optional[Path]:
def get_chrome_user_data_dir() -> Path | None:
"""Get the default Chrome user data directory for the current platform."""
system = platform.system()
home = Path.home()
if system == 'Darwin': # macOS
if system == "Darwin": # macOS
candidates = [
home / 'Library' / 'Application Support' / 'Google' / 'Chrome',
home / 'Library' / 'Application Support' / 'Chromium',
home / "Library" / "Application Support" / "Google" / "Chrome",
home / "Library" / "Application Support" / "Chromium",
]
elif system == 'Linux':
elif system == "Linux":
candidates = [
home / '.config' / 'google-chrome',
home / '.config' / 'chromium',
home / '.config' / 'chrome',
home / 'snap' / 'chromium' / 'common' / 'chromium',
home / ".config" / "google-chrome",
home / ".config" / "chromium",
home / ".config" / "chrome",
home / "snap" / "chromium" / "common" / "chromium",
]
elif system == 'Windows':
local_app_data = Path(os.environ.get('LOCALAPPDATA', home / 'AppData' / 'Local'))
elif system == "Windows":
local_app_data = Path(os.environ.get("LOCALAPPDATA", home / "AppData" / "Local"))
candidates = [
local_app_data / 'Google' / 'Chrome' / 'User Data',
local_app_data / 'Chromium' / 'User Data',
local_app_data / "Google" / "Chrome" / "User Data",
local_app_data / "Chromium" / "User Data",
]
else:
candidates = []
for candidate in candidates:
if candidate.exists() and (candidate / 'Default').exists():
if candidate.exists() and (candidate / "Default").exists():
return candidate
return None
def get_brave_user_data_dir() -> Optional[Path]:
def get_brave_user_data_dir() -> Path | None:
"""Get the default Brave user data directory for the current platform."""
system = platform.system()
home = Path.home()
if system == 'Darwin':
if system == "Darwin":
candidates = [
home / 'Library' / 'Application Support' / 'BraveSoftware' / 'Brave-Browser',
home / "Library" / "Application Support" / "BraveSoftware" / "Brave-Browser",
]
elif system == 'Linux':
elif system == "Linux":
candidates = [
home / '.config' / 'BraveSoftware' / 'Brave-Browser',
home / ".config" / "BraveSoftware" / "Brave-Browser",
]
elif system == 'Windows':
local_app_data = Path(os.environ.get('LOCALAPPDATA', home / 'AppData' / 'Local'))
elif system == "Windows":
local_app_data = Path(os.environ.get("LOCALAPPDATA", home / "AppData" / "Local"))
candidates = [
local_app_data / 'BraveSoftware' / 'Brave-Browser' / 'User Data',
local_app_data / "BraveSoftware" / "Brave-Browser" / "User Data",
]
else:
candidates = []
for candidate in candidates:
if candidate.exists() and (candidate / 'Default').exists():
if candidate.exists() and (candidate / "Default").exists():
return candidate
return None
def get_edge_user_data_dir() -> Optional[Path]:
def get_edge_user_data_dir() -> Path | None:
"""Get the default Edge user data directory for the current platform."""
system = platform.system()
home = Path.home()
if system == 'Darwin':
if system == "Darwin":
candidates = [
home / 'Library' / 'Application Support' / 'Microsoft Edge',
home / "Library" / "Application Support" / "Microsoft Edge",
]
elif system == 'Linux':
elif system == "Linux":
candidates = [
home / '.config' / 'microsoft-edge',
home / '.config' / 'microsoft-edge-beta',
home / '.config' / 'microsoft-edge-dev',
home / ".config" / "microsoft-edge",
home / ".config" / "microsoft-edge-beta",
home / ".config" / "microsoft-edge-dev",
]
elif system == 'Windows':
local_app_data = Path(os.environ.get('LOCALAPPDATA', home / 'AppData' / 'Local'))
elif system == "Windows":
local_app_data = Path(os.environ.get("LOCALAPPDATA", home / "AppData" / "Local"))
candidates = [
local_app_data / 'Microsoft' / 'Edge' / 'User Data',
local_app_data / "Microsoft" / "Edge" / "User Data",
]
else:
candidates = []
for candidate in candidates:
if candidate.exists() and (candidate / 'Default').exists():
if candidate.exists() and (candidate / "Default").exists():
return candidate
return None
def get_browser_binary(browser: str) -> Optional[str]:
def get_browser_binary(browser: str) -> str | None:
system = platform.system()
home = Path.home()
browser = browser.lower()
if system == 'Darwin':
if system == "Darwin":
candidates = {
'chrome': ['/Applications/Google Chrome.app/Contents/MacOS/Google Chrome'],
'chromium': ['/Applications/Chromium.app/Contents/MacOS/Chromium'],
'brave': ['/Applications/Brave Browser.app/Contents/MacOS/Brave Browser'],
'edge': ['/Applications/Microsoft Edge.app/Contents/MacOS/Microsoft Edge'],
"chrome": ["/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"],
"chromium": ["/Applications/Chromium.app/Contents/MacOS/Chromium"],
"brave": ["/Applications/Brave Browser.app/Contents/MacOS/Brave Browser"],
"edge": ["/Applications/Microsoft Edge.app/Contents/MacOS/Microsoft Edge"],
}.get(browser, [])
elif system == 'Linux':
elif system == "Linux":
candidates = {
'chrome': ['/usr/bin/google-chrome', '/usr/bin/google-chrome-stable', '/usr/bin/google-chrome-beta', '/usr/bin/google-chrome-unstable'],
'chromium': ['/usr/bin/chromium', '/usr/bin/chromium-browser'],
'brave': ['/usr/bin/brave-browser', '/usr/bin/brave-browser-beta', '/usr/bin/brave-browser-nightly'],
'edge': ['/usr/bin/microsoft-edge', '/usr/bin/microsoft-edge-stable', '/usr/bin/microsoft-edge-beta', '/usr/bin/microsoft-edge-dev'],
"chrome": [
"/usr/bin/google-chrome",
"/usr/bin/google-chrome-stable",
"/usr/bin/google-chrome-beta",
"/usr/bin/google-chrome-unstable",
],
"chromium": ["/usr/bin/chromium", "/usr/bin/chromium-browser"],
"brave": ["/usr/bin/brave-browser", "/usr/bin/brave-browser-beta", "/usr/bin/brave-browser-nightly"],
"edge": [
"/usr/bin/microsoft-edge",
"/usr/bin/microsoft-edge-stable",
"/usr/bin/microsoft-edge-beta",
"/usr/bin/microsoft-edge-dev",
],
}.get(browser, [])
elif system == 'Windows':
local_app_data = Path(os.environ.get('LOCALAPPDATA', home / 'AppData' / 'Local'))
elif system == "Windows":
local_app_data = Path(os.environ.get("LOCALAPPDATA", home / "AppData" / "Local"))
candidates = {
'chrome': [
str(local_app_data / 'Google' / 'Chrome' / 'Application' / 'chrome.exe'),
'C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe',
'C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe',
"chrome": [
str(local_app_data / "Google" / "Chrome" / "Application" / "chrome.exe"),
"C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe",
"C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe",
],
'chromium': [str(local_app_data / 'Chromium' / 'Application' / 'chrome.exe')],
'brave': [
str(local_app_data / 'BraveSoftware' / 'Brave-Browser' / 'Application' / 'brave.exe'),
'C:\\Program Files\\BraveSoftware\\Brave-Browser\\Application\\brave.exe',
'C:\\Program Files (x86)\\BraveSoftware\\Brave-Browser\\Application\\brave.exe',
"chromium": [str(local_app_data / "Chromium" / "Application" / "chrome.exe")],
"brave": [
str(local_app_data / "BraveSoftware" / "Brave-Browser" / "Application" / "brave.exe"),
"C:\\Program Files\\BraveSoftware\\Brave-Browser\\Application\\brave.exe",
"C:\\Program Files (x86)\\BraveSoftware\\Brave-Browser\\Application\\brave.exe",
],
'edge': [
str(local_app_data / 'Microsoft' / 'Edge' / 'Application' / 'msedge.exe'),
'C:\\Program Files\\Microsoft\\Edge\\Application\\msedge.exe',
'C:\\Program Files (x86)\\Microsoft\\Edge\\Application\\msedge.exe',
"edge": [
str(local_app_data / "Microsoft" / "Edge" / "Application" / "msedge.exe"),
"C:\\Program Files\\Microsoft\\Edge\\Application\\msedge.exe",
"C:\\Program Files (x86)\\Microsoft\\Edge\\Application\\msedge.exe",
],
}.get(browser, [])
else:
@@ -190,13 +201,13 @@ def get_browser_binary(browser: str) -> Optional[str]:
BROWSER_PROFILE_FINDERS = {
'chrome': get_chrome_user_data_dir,
'chromium': get_chrome_user_data_dir, # Same locations
'brave': get_brave_user_data_dir,
'edge': get_edge_user_data_dir,
"chrome": get_chrome_user_data_dir,
"chromium": get_chrome_user_data_dir, # Same locations
"brave": get_brave_user_data_dir,
"edge": get_edge_user_data_dir,
}
CHROMIUM_BROWSERS = {'chrome', 'chromium', 'brave', 'edge'}
CHROMIUM_BROWSERS = {"chrome", "chromium", "brave", "edge"}
# =============================================================================
@@ -204,12 +215,12 @@ CHROMIUM_BROWSERS = {'chrome', 'chromium', 'brave', 'edge'}
# =============================================================================
NETSCAPE_COOKIE_HEADER = [
'# Netscape HTTP Cookie File',
'# https://curl.se/docs/http-cookies.html',
'# This file was generated by ArchiveBox persona cookie extraction',
'#',
'# Format: domain\\tincludeSubdomains\\tpath\\tsecure\\texpiry\\tname\\tvalue',
'',
"# Netscape HTTP Cookie File",
"# https://curl.se/docs/http-cookies.html",
"# This file was generated by ArchiveBox persona cookie extraction",
"#",
"# Format: domain\\tincludeSubdomains\\tpath\\tsecure\\texpiry\\tname\\tvalue",
"",
]
@@ -219,9 +230,9 @@ def _parse_netscape_cookies(path: Path) -> "OrderedDict[tuple[str, str, str], tu
return cookies
for line in path.read_text().splitlines():
if not line or line.startswith('#'):
if not line or line.startswith("#"):
continue
parts = line.split('\t')
parts = line.split("\t")
if len(parts) < 7:
continue
domain, include_subdomains, cookie_path, secure, expiry, name, value = parts[:7]
@@ -233,8 +244,8 @@ def _parse_netscape_cookies(path: Path) -> "OrderedDict[tuple[str, str, str], tu
def _write_netscape_cookies(path: Path, cookies: "OrderedDict[tuple[str, str, str], tuple[str, str, str, str, str, str, str]]") -> None:
lines = list(NETSCAPE_COOKIE_HEADER)
for cookie in cookies.values():
lines.append('\t'.join(cookie))
path.write_text('\n'.join(lines) + '\n')
lines.append("\t".join(cookie))
path.write_text("\n".join(lines) + "\n")
def _merge_netscape_cookies(existing_file: Path, new_file: Path) -> None:
@@ -259,52 +270,52 @@ def extract_cookies_via_cdp(
from archivebox.config.common import STORAGE_CONFIG
# Find the cookie extraction script
chrome_plugin_dir = Path(__file__).parent.parent / 'plugins' / 'chrome'
extract_script = chrome_plugin_dir / 'extract_cookies.js'
chrome_plugin_dir = Path(__file__).parent.parent / "plugins" / "chrome"
extract_script = chrome_plugin_dir / "extract_cookies.js"
if not extract_script.exists():
rprint(f'[yellow]Cookie extraction script not found at {extract_script}[/yellow]', file=sys.stderr)
rprint(f"[yellow]Cookie extraction script not found at {extract_script}[/yellow]", file=sys.stderr)
return False
# Get node modules dir
node_modules_dir = STORAGE_CONFIG.LIB_DIR / 'npm' / 'node_modules'
node_modules_dir = STORAGE_CONFIG.LIB_DIR / "npm" / "node_modules"
# Set up environment
env = os.environ.copy()
env['NODE_MODULES_DIR'] = str(node_modules_dir)
env['CHROME_USER_DATA_DIR'] = str(user_data_dir)
env['CHROME_HEADLESS'] = 'true'
env["NODE_MODULES_DIR"] = str(node_modules_dir)
env["CHROME_USER_DATA_DIR"] = str(user_data_dir)
env["CHROME_HEADLESS"] = "true"
if chrome_binary:
env['CHROME_BINARY'] = str(chrome_binary)
env["CHROME_BINARY"] = str(chrome_binary)
output_path = output_file
temp_output = None
temp_dir = None
if output_file.exists():
temp_dir = Path(tempfile.mkdtemp(prefix='ab_cookies_'))
temp_output = temp_dir / 'cookies.txt'
temp_dir = Path(tempfile.mkdtemp(prefix="ab_cookies_"))
temp_output = temp_dir / "cookies.txt"
output_path = temp_output
if profile_dir:
extra_arg = f'--profile-directory={profile_dir}'
existing_extra = env.get('CHROME_ARGS_EXTRA', '').strip()
extra_arg = f"--profile-directory={profile_dir}"
existing_extra = env.get("CHROME_ARGS_EXTRA", "").strip()
args_list = []
if existing_extra:
if existing_extra.startswith('['):
if existing_extra.startswith("["):
try:
parsed = json.loads(existing_extra)
if isinstance(parsed, list):
args_list.extend(str(x) for x in parsed)
except Exception:
args_list.extend([s.strip() for s in existing_extra.split(',') if s.strip()])
args_list.extend([s.strip() for s in existing_extra.split(",") if s.strip()])
else:
args_list.extend([s.strip() for s in existing_extra.split(',') if s.strip()])
args_list.extend([s.strip() for s in existing_extra.split(",") if s.strip()])
args_list.append(extra_arg)
env['CHROME_ARGS_EXTRA'] = json.dumps(args_list)
env["CHROME_ARGS_EXTRA"] = json.dumps(args_list)
env['COOKIES_OUTPUT_FILE'] = str(output_path)
env["COOKIES_OUTPUT_FILE"] = str(output_path)
try:
result = subprocess.run(
['node', str(extract_script)],
["node", str(extract_script)],
env=env,
capture_output=True,
text=True,
@@ -316,17 +327,17 @@ def extract_cookies_via_cdp(
_merge_netscape_cookies(output_file, temp_output)
return True
else:
rprint(f'[yellow]Cookie extraction failed: {result.stderr}[/yellow]', file=sys.stderr)
rprint(f"[yellow]Cookie extraction failed: {result.stderr}[/yellow]", file=sys.stderr)
return False
except subprocess.TimeoutExpired:
rprint('[yellow]Cookie extraction timed out[/yellow]', file=sys.stderr)
rprint("[yellow]Cookie extraction timed out[/yellow]", file=sys.stderr)
return False
except FileNotFoundError:
rprint('[yellow]Node.js not found. Cannot extract cookies.[/yellow]', file=sys.stderr)
rprint("[yellow]Node.js not found. Cannot extract cookies.[/yellow]", file=sys.stderr)
return False
except Exception as e:
rprint(f'[yellow]Cookie extraction error: {e}[/yellow]', file=sys.stderr)
rprint(f"[yellow]Cookie extraction error: {e}[/yellow]", file=sys.stderr)
return False
finally:
if temp_dir and temp_dir.exists():
@@ -337,6 +348,7 @@ def extract_cookies_via_cdp(
# Validation Helpers
# =============================================================================
def validate_persona_name(name: str) -> tuple[bool, str]:
"""
Validate persona name to prevent path traversal attacks.
@@ -348,19 +360,19 @@ def validate_persona_name(name: str) -> tuple[bool, str]:
return False, "Persona name cannot be empty"
# Check for path separators
if '/' in name or '\\' in name:
if "/" in name or "\\" in name:
return False, "Persona name cannot contain path separators (/ or \\)"
# Check for parent directory references
if '..' in name:
if ".." in name:
return False, "Persona name cannot contain parent directory references (..)"
# Check for hidden files/directories
if name.startswith('.'):
if name.startswith("."):
return False, "Persona name cannot start with a dot (.)"
# Ensure name doesn't contain null bytes or other dangerous chars
if '\x00' in name or '\n' in name or '\r' in name:
if "\x00" in name or "\n" in name or "\r" in name:
return False, "Persona name contains invalid characters"
return True, ""
@@ -394,10 +406,11 @@ def ensure_path_within_personas_dir(persona_path: Path) -> bool:
# CREATE
# =============================================================================
def create_personas(
names: Iterable[str],
import_from: Optional[str] = None,
profile: Optional[str] = None,
import_from: str | None = None,
profile: str | None = None,
) -> int:
"""
Create Personas from names.
@@ -416,7 +429,7 @@ def create_personas(
name_list = list(names) if names else []
if not name_list:
rprint('[yellow]No persona names provided. Pass names as arguments.[/yellow]', file=sys.stderr)
rprint("[yellow]No persona names provided. Pass names as arguments.[/yellow]", file=sys.stderr)
return 1
# Validate import source if specified
@@ -424,23 +437,23 @@ def create_personas(
if import_from:
import_from = import_from.lower()
if import_from not in BROWSER_PROFILE_FINDERS:
rprint(f'[red]Unknown browser: {import_from}[/red]', file=sys.stderr)
rprint(f'[dim]Supported browsers: {", ".join(BROWSER_PROFILE_FINDERS.keys())}[/dim]', file=sys.stderr)
rprint(f"[red]Unknown browser: {import_from}[/red]", file=sys.stderr)
rprint(f"[dim]Supported browsers: {', '.join(BROWSER_PROFILE_FINDERS.keys())}[/dim]", file=sys.stderr)
return 1
source_profile_dir = BROWSER_PROFILE_FINDERS[import_from]()
if not source_profile_dir:
rprint(f'[red]Could not find {import_from} profile directory[/red]', file=sys.stderr)
rprint(f"[red]Could not find {import_from} profile directory[/red]", file=sys.stderr)
return 1
rprint(f'[dim]Found {import_from} profile: {source_profile_dir}[/dim]', file=sys.stderr)
rprint(f"[dim]Found {import_from} profile: {source_profile_dir}[/dim]", file=sys.stderr)
if profile is None and (source_profile_dir / 'Default').exists():
profile = 'Default'
if profile is None and (source_profile_dir / "Default").exists():
profile = "Default"
browser_binary = get_browser_binary(import_from)
if browser_binary:
rprint(f'[dim]Using {import_from} binary: {browser_binary}[/dim]', file=sys.stderr)
rprint(f"[dim]Using {import_from} binary: {browser_binary}[/dim]", file=sys.stderr)
created_count = 0
for name in name_list:
@@ -459,11 +472,11 @@ def create_personas(
if created:
persona.ensure_dirs()
created_count += 1
rprint(f'[green]Created persona: {name}[/green]', file=sys.stderr)
rprint(f"[green]Created persona: {name}[/green]", file=sys.stderr)
else:
rprint(f'[dim]Persona already exists: {name}[/dim]', file=sys.stderr)
rprint(f"[dim]Persona already exists: {name}[/dim]", file=sys.stderr)
cookies_file = Path(persona.path) / 'cookies.txt'
cookies_file = Path(persona.path) / "cookies.txt"
# Import browser profile if requested
if import_from in CHROMIUM_BROWSERS and source_profile_dir is not None:
@@ -477,29 +490,31 @@ def create_personas(
capture_storage=False,
)
except Exception as e:
rprint(f'[red]Failed to import browser profile: {e}[/red]', file=sys.stderr)
rprint(f"[red]Failed to import browser profile: {e}[/red]", file=sys.stderr)
return 1
if import_result.profile_copied:
rprint('[green]Copied browser profile to persona[/green]', file=sys.stderr)
rprint("[green]Copied browser profile to persona[/green]", file=sys.stderr)
if import_result.cookies_imported:
rprint(f'[green]Extracted cookies to {cookies_file}[/green]', file=sys.stderr)
rprint(f"[green]Extracted cookies to {cookies_file}[/green]", file=sys.stderr)
elif not import_result.profile_copied:
rprint('[yellow]Could not import cookies automatically.[/yellow]', file=sys.stderr)
rprint("[yellow]Could not import cookies automatically.[/yellow]", file=sys.stderr)
for warning in import_result.warnings:
rprint(f'[yellow]{warning}[/yellow]', file=sys.stderr)
rprint(f"[yellow]{warning}[/yellow]", file=sys.stderr)
if not is_tty:
write_record({
'id': str(persona.id) if hasattr(persona, 'id') else None,
'name': persona.name,
'path': str(persona.path),
'CHROME_USER_DATA_DIR': persona.CHROME_USER_DATA_DIR,
'COOKIES_FILE': persona.COOKIES_FILE,
})
write_record(
{
"id": str(persona.id) if hasattr(persona, "id") else None,
"name": persona.name,
"path": str(persona.path),
"CHROME_USER_DATA_DIR": persona.CHROME_USER_DATA_DIR,
"COOKIES_FILE": persona.COOKIES_FILE,
},
)
rprint(f'[green]Created {created_count} new persona(s)[/green]', file=sys.stderr)
rprint(f"[green]Created {created_count} new persona(s)[/green]", file=sys.stderr)
return 0
@@ -507,10 +522,11 @@ def create_personas(
# LIST
# =============================================================================
def list_personas(
name: Optional[str] = None,
name__icontains: Optional[str] = None,
limit: Optional[int] = None,
name: str | None = None,
name__icontains: str | None = None,
limit: int | None = None,
) -> int:
"""
List Personas as JSONL with optional filters.
@@ -523,33 +539,35 @@ def list_personas(
is_tty = sys.stdout.isatty()
queryset = Persona.objects.all().order_by('name')
queryset = Persona.objects.all().order_by("name")
# Apply filters
filter_kwargs = {
'name': name,
'name__icontains': name__icontains,
"name": name,
"name__icontains": name__icontains,
}
queryset = apply_filters(queryset, filter_kwargs, limit=limit)
count = 0
for persona in queryset:
cookies_status = '[green]✓[/green]' if persona.COOKIES_FILE else '[dim]✗[/dim]'
chrome_status = '[green]✓[/green]' if Path(persona.CHROME_USER_DATA_DIR).exists() else '[dim]✗[/dim]'
cookies_status = "[green]✓[/green]" if persona.COOKIES_FILE else "[dim]✗[/dim]"
chrome_status = "[green]✓[/green]" if Path(persona.CHROME_USER_DATA_DIR).exists() else "[dim]✗[/dim]"
if is_tty:
rprint(f'[cyan]{persona.name:20}[/cyan] cookies:{cookies_status} chrome:{chrome_status} [dim]{persona.path}[/dim]')
rprint(f"[cyan]{persona.name:20}[/cyan] cookies:{cookies_status} chrome:{chrome_status} [dim]{persona.path}[/dim]")
else:
write_record({
'id': str(persona.id) if hasattr(persona, 'id') else None,
'name': persona.name,
'path': str(persona.path),
'CHROME_USER_DATA_DIR': persona.CHROME_USER_DATA_DIR,
'COOKIES_FILE': persona.COOKIES_FILE,
})
write_record(
{
"id": str(persona.id) if hasattr(persona, "id") else None,
"name": persona.name,
"path": str(persona.path),
"CHROME_USER_DATA_DIR": persona.CHROME_USER_DATA_DIR,
"COOKIES_FILE": persona.COOKIES_FILE,
},
)
count += 1
rprint(f'[dim]Listed {count} persona(s)[/dim]', file=sys.stderr)
rprint(f"[dim]Listed {count} persona(s)[/dim]", file=sys.stderr)
return 0
@@ -557,7 +575,8 @@ def list_personas(
# UPDATE
# =============================================================================
def update_personas(name: Optional[str] = None) -> int:
def update_personas(name: str | None = None) -> int:
"""
Update Personas from stdin JSONL.
@@ -575,13 +594,13 @@ def update_personas(name: Optional[str] = None) -> int:
records = list(read_stdin())
if not records:
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
return 1
updated_count = 0
for record in records:
persona_id = record.get('id')
old_name = record.get('name')
persona_id = record.get("id")
old_name = record.get("name")
if not persona_id and not old_name:
continue
@@ -613,17 +632,19 @@ def update_personas(name: Optional[str] = None) -> int:
updated_count += 1
if not is_tty:
write_record({
'id': str(persona.id) if hasattr(persona, 'id') else None,
'name': persona.name,
'path': str(persona.path),
})
write_record(
{
"id": str(persona.id) if hasattr(persona, "id") else None,
"name": persona.name,
"path": str(persona.path),
},
)
except Persona.DoesNotExist:
rprint(f'[yellow]Persona not found: {persona_id or old_name}[/yellow]', file=sys.stderr)
rprint(f"[yellow]Persona not found: {persona_id or old_name}[/yellow]", file=sys.stderr)
continue
rprint(f'[green]Updated {updated_count} persona(s)[/green]', file=sys.stderr)
rprint(f"[green]Updated {updated_count} persona(s)[/green]", file=sys.stderr)
return 0
@@ -631,6 +652,7 @@ def update_personas(name: Optional[str] = None) -> int:
# DELETE
# =============================================================================
def delete_personas(yes: bool = False, dry_run: bool = False) -> int:
"""
Delete Personas from stdin JSONL.
@@ -646,23 +668,24 @@ def delete_personas(yes: bool = False, dry_run: bool = False) -> int:
records = list(read_stdin())
if not records:
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
return 1
# Collect persona IDs or names
persona_ids = []
persona_names = []
for r in records:
if r.get('id'):
persona_ids.append(r['id'])
elif r.get('name'):
persona_names.append(r['name'])
if r.get("id"):
persona_ids.append(r["id"])
elif r.get("name"):
persona_names.append(r["name"])
if not persona_ids and not persona_names:
rprint('[yellow]No valid persona IDs or names in input[/yellow]', file=sys.stderr)
rprint("[yellow]No valid persona IDs or names in input[/yellow]", file=sys.stderr)
return 1
from django.db.models import Q
query = Q()
if persona_ids:
query |= Q(id__in=persona_ids)
@@ -673,17 +696,17 @@ def delete_personas(yes: bool = False, dry_run: bool = False) -> int:
count = personas.count()
if count == 0:
rprint('[yellow]No matching personas found[/yellow]', file=sys.stderr)
rprint("[yellow]No matching personas found[/yellow]", file=sys.stderr)
return 0
if dry_run:
rprint(f'[yellow]Would delete {count} persona(s) (dry run)[/yellow]', file=sys.stderr)
rprint(f"[yellow]Would delete {count} persona(s) (dry run)[/yellow]", file=sys.stderr)
for persona in personas:
rprint(f' {persona.name} ({persona.path})', file=sys.stderr)
rprint(f" {persona.name} ({persona.path})", file=sys.stderr)
return 0
if not yes:
rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr)
rprint("[red]Use --yes to confirm deletion[/red]", file=sys.stderr)
return 1
# Delete persona directories and database records
@@ -701,7 +724,7 @@ def delete_personas(yes: bool = False, dry_run: bool = False) -> int:
persona.delete()
deleted_count += 1
rprint(f'[green]Deleted {deleted_count} persona(s)[/green]', file=sys.stderr)
rprint(f"[green]Deleted {deleted_count} persona(s)[/green]", file=sys.stderr)
return 0
@@ -709,44 +732,45 @@ def delete_personas(yes: bool = False, dry_run: bool = False) -> int:
# CLI Commands
# =============================================================================
@click.group()
def main():
"""Manage Persona records (browser profiles)."""
pass
@main.command('create')
@click.argument('names', nargs=-1)
@click.option('--import', 'import_from', help='Import profile from browser (chrome, chromium, brave, edge)')
@click.option('--profile', help='Profile directory name under the user data dir (e.g. Default, Profile 1)')
def create_cmd(names: tuple, import_from: Optional[str], profile: Optional[str]):
@main.command("create")
@click.argument("names", nargs=-1)
@click.option("--import", "import_from", help="Import profile from browser (chrome, chromium, brave, edge)")
@click.option("--profile", help="Profile directory name under the user data dir (e.g. Default, Profile 1)")
def create_cmd(names: tuple, import_from: str | None, profile: str | None):
"""Create Personas, optionally importing from a browser profile."""
sys.exit(create_personas(names, import_from=import_from, profile=profile))
@main.command('list')
@click.option('--name', help='Filter by exact name')
@click.option('--name__icontains', help='Filter by name contains')
@click.option('--limit', '-n', type=int, help='Limit number of results')
def list_cmd(name: Optional[str], name__icontains: Optional[str], limit: Optional[int]):
@main.command("list")
@click.option("--name", help="Filter by exact name")
@click.option("--name__icontains", help="Filter by name contains")
@click.option("--limit", "-n", type=int, help="Limit number of results")
def list_cmd(name: str | None, name__icontains: str | None, limit: int | None):
"""List Personas as JSONL."""
sys.exit(list_personas(name=name, name__icontains=name__icontains, limit=limit))
@main.command('update')
@click.option('--name', '-n', help='Set new name')
def update_cmd(name: Optional[str]):
@main.command("update")
@click.option("--name", "-n", help="Set new name")
def update_cmd(name: str | None):
"""Update Personas from stdin JSONL."""
sys.exit(update_personas(name=name))
@main.command('delete')
@click.option('--yes', '-y', is_flag=True, help='Confirm deletion')
@click.option('--dry-run', is_flag=True, help='Show what would be deleted')
@main.command("delete")
@click.option("--yes", "-y", is_flag=True, help="Confirm deletion")
@click.option("--dry-run", is_flag=True, help="Show what would be deleted")
def delete_cmd(yes: bool, dry_run: bool):
"""Delete Personas from stdin JSONL."""
sys.exit(delete_personas(yes=yes, dry_run=dry_run))
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -1,8 +1,7 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__package__ = "archivebox.cli"
from typing import Optional
import rich_click as click
@@ -137,7 +136,7 @@ BINARY_MACHINE_DIAGRAM = """
@enforce_types
def pluginmap(
show_disabled: bool = False,
model: Optional[str] = None,
model: str | None = None,
quiet: bool = False,
) -> dict:
"""
@@ -164,25 +163,25 @@ def pluginmap(
# Model event types that can have hooks
model_events = {
'Crawl': {
'description': 'Hooks run when a Crawl starts (QUEUED→STARTED)',
'machine': 'CrawlMachine',
'diagram': CRAWL_MACHINE_DIAGRAM,
"Crawl": {
"description": "Hooks run when a Crawl starts (QUEUED→STARTED)",
"machine": "CrawlMachine",
"diagram": CRAWL_MACHINE_DIAGRAM,
},
'CrawlEnd': {
'description': 'Hooks run when a Crawl finishes (STARTED→SEALED)',
'machine': 'CrawlMachine',
'diagram': None, # Part of CrawlMachine
"CrawlEnd": {
"description": "Hooks run when a Crawl finishes (STARTED→SEALED)",
"machine": "CrawlMachine",
"diagram": None, # Part of CrawlMachine
},
'Snapshot': {
'description': 'Hooks run for each Snapshot (creates ArchiveResults)',
'machine': 'SnapshotMachine',
'diagram': SNAPSHOT_MACHINE_DIAGRAM,
"Snapshot": {
"description": "Hooks run for each Snapshot (creates ArchiveResults)",
"machine": "SnapshotMachine",
"diagram": SNAPSHOT_MACHINE_DIAGRAM,
},
'Binary': {
'description': 'Hooks for installing binary dependencies (providers)',
'machine': 'BinaryMachine',
'diagram': BINARY_MACHINE_DIAGRAM,
"Binary": {
"description": "Hooks for installing binary dependencies (providers)",
"machine": "BinaryMachine",
"diagram": BINARY_MACHINE_DIAGRAM,
},
}
@@ -195,16 +194,16 @@ def pluginmap(
model_events = {model: model_events[model]}
result = {
'models': {},
'plugins_dir': str(BUILTIN_PLUGINS_DIR),
'user_plugins_dir': str(USER_PLUGINS_DIR),
"models": {},
"plugins_dir": str(BUILTIN_PLUGINS_DIR),
"user_plugins_dir": str(USER_PLUGINS_DIR),
}
if not quiet:
prnt()
prnt('[bold cyan]ArchiveBox Plugin Map[/bold cyan]')
prnt(f'[dim]Built-in plugins: {BUILTIN_PLUGINS_DIR}[/dim]')
prnt(f'[dim]User plugins: {USER_PLUGINS_DIR}[/dim]')
prnt("[bold cyan]ArchiveBox Plugin Map[/bold cyan]")
prnt(f"[dim]Built-in plugins: {BUILTIN_PLUGINS_DIR}[/dim]")
prnt(f"[dim]User plugins: {USER_PLUGINS_DIR}[/dim]")
prnt()
for event_name, info in model_events.items():
@@ -218,88 +217,93 @@ def pluginmap(
plugin_name = hook_path.parent.name
is_bg = is_background_hook(hook_path.name)
hook_infos.append({
'path': str(hook_path),
'name': hook_path.name,
'plugin': plugin_name,
'is_background': is_bg,
'extension': hook_path.suffix,
})
hook_infos.append(
{
"path": str(hook_path),
"name": hook_path.name,
"plugin": plugin_name,
"is_background": is_bg,
"extension": hook_path.suffix,
},
)
result['models'][event_name] = {
'description': info['description'],
'machine': info['machine'],
'hooks': hook_infos,
'hook_count': len(hook_infos),
result["models"][event_name] = {
"description": info["description"],
"machine": info["machine"],
"hooks": hook_infos,
"hook_count": len(hook_infos),
}
if not quiet:
# Show diagram if this model has one
if info.get('diagram'):
assert info['diagram'] is not None
prnt(Panel(
info['diagram'],
title=f'[bold green]{info["machine"]}[/bold green]',
border_style='green',
expand=False,
))
if info.get("diagram"):
assert info["diagram"] is not None
prnt(
Panel(
info["diagram"],
title=f"[bold green]{info['machine']}[/bold green]",
border_style="green",
expand=False,
),
)
prnt()
# Create hooks table
table = Table(
title=f'[bold yellow]on_{event_name}__* Hooks[/bold yellow] ({len(hooks)} found)',
title=f"[bold yellow]on_{event_name}__* Hooks[/bold yellow] ({len(hooks)} found)",
box=box.ROUNDED,
show_header=True,
header_style='bold magenta',
header_style="bold magenta",
)
table.add_column('Plugin', style='cyan', width=20)
table.add_column('Hook Name', style='green')
table.add_column('BG', justify='center', width=4)
table.add_column('Type', justify='center', width=5)
table.add_column("Plugin", style="cyan", width=20)
table.add_column("Hook Name", style="green")
table.add_column("BG", justify="center", width=4)
table.add_column("Type", justify="center", width=5)
# Sort lexicographically by hook name
sorted_hooks = sorted(hook_infos, key=lambda h: h['name'])
sorted_hooks = sorted(hook_infos, key=lambda h: h["name"])
for hook in sorted_hooks:
bg_marker = '[yellow]bg[/yellow]' if hook['is_background'] else ''
ext = hook['extension'].lstrip('.')
bg_marker = "[yellow]bg[/yellow]" if hook["is_background"] else ""
ext = hook["extension"].lstrip(".")
table.add_row(
hook['plugin'],
hook['name'],
hook["plugin"],
hook["name"],
bg_marker,
ext,
)
prnt(table)
prnt()
prnt(f'[dim]{info["description"]}[/dim]')
prnt(f"[dim]{info['description']}[/dim]")
prnt()
# Summary
if not quiet:
total_hooks = sum(m['hook_count'] for m in result['models'].values())
prnt(f'[bold]Total hooks discovered: {total_hooks}[/bold]')
total_hooks = sum(m["hook_count"] for m in result["models"].values())
prnt(f"[bold]Total hooks discovered: {total_hooks}[/bold]")
prnt()
prnt('[dim]Hook naming convention: on_{Model}__{XX}_{description}[.bg].{ext}[/dim]')
prnt('[dim] - XX: Two-digit lexicographic order (00-99)[/dim]')
prnt('[dim] - .bg: Background hook (non-blocking)[/dim]')
prnt('[dim] - ext: py, sh, or js[/dim]')
prnt("[dim]Hook naming convention: on_{Model}__{XX}_{description}[.bg].{ext}[/dim]")
prnt("[dim] - XX: Two-digit lexicographic order (00-99)[/dim]")
prnt("[dim] - .bg: Background hook (non-blocking)[/dim]")
prnt("[dim] - ext: py, sh, or js[/dim]")
prnt()
return result
@click.command()
@click.option('--show-disabled', '-a', is_flag=True, help='Show hooks from disabled plugins too')
@click.option('--model', '-m', type=str, default=None, help='Filter to specific model (Crawl, Snapshot, Binary, CrawlEnd)')
@click.option('--quiet', '-q', is_flag=True, help='Output JSON only, no ASCII diagrams')
@click.option("--show-disabled", "-a", is_flag=True, help="Show hooks from disabled plugins too")
@click.option("--model", "-m", type=str, default=None, help="Filter to specific model (Crawl, Snapshot, Binary, CrawlEnd)")
@click.option("--quiet", "-q", is_flag=True, help="Output JSON only, no ASCII diagrams")
@docstring(pluginmap.__doc__)
def main(**kwargs):
import json
result = pluginmap(**kwargs)
if kwargs.get('quiet'):
if kwargs.get("quiet"):
print(json.dumps(result, indent=2))
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -22,11 +22,10 @@ Examples:
archivebox process list --limit=10
"""
__package__ = 'archivebox.cli'
__command__ = 'archivebox process'
__package__ = "archivebox.cli"
__command__ = "archivebox process"
import sys
from typing import Optional
import rich_click as click
from rich import print as rprint
@@ -38,10 +37,11 @@ from archivebox.cli.cli_utils import apply_filters
# LIST
# =============================================================================
def list_processes(
binary_name: Optional[str] = None,
machine_id: Optional[str] = None,
limit: Optional[int] = None,
binary_name: str | None = None,
machine_id: str | None = None,
limit: int | None = None,
) -> int:
"""
List Processes as JSONL with optional filters.
@@ -54,29 +54,29 @@ def list_processes(
is_tty = sys.stdout.isatty()
queryset = Process.objects.all().select_related('binary', 'machine').order_by('-start_ts')
queryset = Process.objects.all().select_related("binary", "machine").order_by("-start_ts")
# Apply filters
filter_kwargs = {}
if binary_name:
filter_kwargs['binary__name'] = binary_name
filter_kwargs["binary__name"] = binary_name
if machine_id:
filter_kwargs['machine_id'] = machine_id
filter_kwargs["machine_id"] = machine_id
queryset = apply_filters(queryset, filter_kwargs, limit=limit)
count = 0
for process in queryset:
if is_tty:
binary_name_str = process.binary.name if process.binary else 'unknown'
exit_code = process.exit_code if process.exit_code is not None else '?'
status_color = 'green' if process.exit_code == 0 else 'red' if process.exit_code else 'yellow'
rprint(f'[{status_color}]exit={exit_code:3}[/{status_color}] [cyan]{binary_name_str:15}[/cyan] [dim]{process.id}[/dim]')
binary_name_str = process.binary.name if process.binary else "unknown"
exit_code = process.exit_code if process.exit_code is not None else "?"
status_color = "green" if process.exit_code == 0 else "red" if process.exit_code else "yellow"
rprint(f"[{status_color}]exit={exit_code:3}[/{status_color}] [cyan]{binary_name_str:15}[/cyan] [dim]{process.id}[/dim]")
else:
write_record(process.to_json())
count += 1
rprint(f'[dim]Listed {count} processes[/dim]', file=sys.stderr)
rprint(f"[dim]Listed {count} processes[/dim]", file=sys.stderr)
return 0
@@ -84,24 +84,27 @@ def list_processes(
# CLI Commands
# =============================================================================
@click.group()
def main():
"""Manage Process records (read-only, system-managed)."""
pass
@main.command('list')
@click.option('--binary-name', '-b', help='Filter by binary name')
@click.option('--machine-id', '-m', help='Filter by machine ID')
@click.option('--limit', '-n', type=int, help='Limit number of results')
def list_cmd(binary_name: Optional[str], machine_id: Optional[str], limit: Optional[int]):
@main.command("list")
@click.option("--binary-name", "-b", help="Filter by binary name")
@click.option("--machine-id", "-m", help="Filter by machine ID")
@click.option("--limit", "-n", type=int, help="Limit number of results")
def list_cmd(binary_name: str | None, machine_id: str | None, limit: int | None):
"""List Processes as JSONL."""
sys.exit(list_processes(
binary_name=binary_name,
machine_id=machine_id,
limit=limit,
))
sys.exit(
list_processes(
binary_name=binary_name,
machine_id=machine_id,
limit=limit,
),
)
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -1,11 +1,11 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__command__ = 'archivebox remove'
__package__ = "archivebox.cli"
__command__ = "archivebox remove"
import shutil
from pathlib import Path
from typing import Iterable
from collections.abc import Iterable
import rich_click as click
@@ -26,25 +26,27 @@ from archivebox.misc.logging_util import (
@enforce_types
def remove(filter_patterns: Iterable[str]=(),
filter_type: str='exact',
snapshots: QuerySet | None=None,
after: float | None=None,
before: float | None=None,
yes: bool=False,
delete: bool=False,
out_dir: Path=DATA_DIR) -> QuerySet:
def remove(
filter_patterns: Iterable[str] = (),
filter_type: str = "exact",
snapshots: QuerySet | None = None,
after: float | None = None,
before: float | None = None,
yes: bool = False,
delete: bool = False,
out_dir: Path = DATA_DIR,
) -> QuerySet:
"""Remove the specified URLs from the archive"""
setup_django()
check_data_folder()
from archivebox.cli.archivebox_search import get_snapshots
pattern_list = list(filter_patterns)
log_list_started(pattern_list or None, filter_type)
timer = TimedProgress(360, prefix=' ')
timer = TimedProgress(360, prefix=" ")
try:
snapshots = get_snapshots(
snapshots=snapshots,
@@ -63,7 +65,7 @@ def remove(filter_patterns: Iterable[str]=(),
log_list_finished(snapshots)
log_removal_started(snapshots, yes=yes, delete=delete)
timer = TimedProgress(360, prefix=' ')
timer = TimedProgress(360, prefix=" ")
try:
for snapshot in snapshots:
if delete:
@@ -88,17 +90,23 @@ def remove(filter_patterns: Iterable[str]=(),
@click.command()
@click.option('--yes', is_flag=True, help='Remove links instantly without prompting to confirm')
@click.option('--delete', is_flag=True, help='Delete the archived content and metadata folder in addition to removing from index')
@click.option('--before', type=float, help='Remove only URLs bookmarked before timestamp')
@click.option('--after', type=float, help='Remove only URLs bookmarked after timestamp')
@click.option('--filter-type', '-f', type=click.Choice(('exact', 'substring', 'domain', 'regex', 'tag')), default='exact', help='Type of pattern matching to use when filtering URLs')
@click.argument('filter_patterns', nargs=-1)
@click.option("--yes", is_flag=True, help="Remove links instantly without prompting to confirm")
@click.option("--delete", is_flag=True, help="Delete the archived content and metadata folder in addition to removing from index")
@click.option("--before", type=float, help="Remove only URLs bookmarked before timestamp")
@click.option("--after", type=float, help="Remove only URLs bookmarked after timestamp")
@click.option(
"--filter-type",
"-f",
type=click.Choice(("exact", "substring", "domain", "regex", "tag")),
default="exact",
help="Type of pattern matching to use when filtering URLs",
)
@click.argument("filter_patterns", nargs=-1)
@docstring(remove.__doc__)
def main(**kwargs):
"""Remove the specified URLs from the archive"""
remove(**kwargs)
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -37,8 +37,8 @@ Examples:
archivebox run --binary-id=019b7e90-5a8e-712c-9877-2c70eebe80ad
"""
__package__ = 'archivebox.cli'
__command__ = 'archivebox run'
__package__ = "archivebox.cli"
__command__ = "archivebox run"
import sys
from collections import defaultdict
@@ -87,8 +87,8 @@ def process_stdin_records() -> int:
binary_ids: list[str] = []
for record in records:
record_type = record.get('type', '')
record_id = record.get('id')
record_type = record.get("type", "")
record_id = record.get("id")
try:
if record_type == TYPE_CRAWL:
@@ -97,10 +97,10 @@ def process_stdin_records() -> int:
try:
crawl = Crawl.objects.get(id=record_id)
except Crawl.DoesNotExist:
crawl = Crawl.from_json(record, overrides={'created_by_id': created_by_id})
crawl = Crawl.from_json(record, overrides={"created_by_id": created_by_id})
else:
# New crawl - create it
crawl = Crawl.from_json(record, overrides={'created_by_id': created_by_id})
crawl = Crawl.from_json(record, overrides={"created_by_id": created_by_id})
if crawl:
crawl.retry_at = timezone.now()
@@ -112,16 +112,16 @@ def process_stdin_records() -> int:
output_records.append(crawl.to_json())
queued_count += 1
elif record_type == TYPE_SNAPSHOT or (record.get('url') and not record_type):
elif record_type == TYPE_SNAPSHOT or (record.get("url") and not record_type):
if record_id:
# Existing snapshot - re-queue
try:
snapshot = Snapshot.objects.get(id=record_id)
except Snapshot.DoesNotExist:
snapshot = Snapshot.from_json(record, overrides={'created_by_id': created_by_id})
snapshot = Snapshot.from_json(record, overrides={"created_by_id": created_by_id})
else:
# New snapshot - create it
snapshot = Snapshot.from_json(record, overrides={'created_by_id': created_by_id})
snapshot = Snapshot.from_json(record, overrides={"created_by_id": created_by_id})
if snapshot:
snapshot.retry_at = timezone.now()
@@ -132,7 +132,7 @@ def process_stdin_records() -> int:
crawl.retry_at = timezone.now()
if crawl.status != Crawl.StatusChoices.STARTED:
crawl.status = Crawl.StatusChoices.QUEUED
crawl.save(update_fields=['status', 'retry_at', 'modified_at'])
crawl.save(update_fields=["status", "retry_at", "modified_at"])
crawl_id = str(snapshot.crawl_id)
snapshot_ids_by_crawl[crawl_id].add(str(snapshot.id))
run_all_plugins_for_crawl.add(crawl_id)
@@ -149,11 +149,16 @@ def process_stdin_records() -> int:
else:
archiveresult = None
snapshot_id = record.get('snapshot_id')
plugin_name = record.get('plugin')
snapshot_id = record.get("snapshot_id")
plugin_name = record.get("plugin")
snapshot = None
if archiveresult:
if archiveresult.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED, ArchiveResult.StatusChoices.NORESULTS, ArchiveResult.StatusChoices.BACKOFF]:
if archiveresult.status in [
ArchiveResult.StatusChoices.FAILED,
ArchiveResult.StatusChoices.SKIPPED,
ArchiveResult.StatusChoices.NORESULTS,
ArchiveResult.StatusChoices.BACKOFF,
]:
archiveresult.reset_for_retry()
snapshot = archiveresult.snapshot
plugin_name = plugin_name or archiveresult.plugin
@@ -167,12 +172,12 @@ def process_stdin_records() -> int:
snapshot.retry_at = timezone.now()
if snapshot.status != Snapshot.StatusChoices.STARTED:
snapshot.status = Snapshot.StatusChoices.QUEUED
snapshot.save(update_fields=['status', 'retry_at', 'modified_at'])
snapshot.save(update_fields=["status", "retry_at", "modified_at"])
crawl = snapshot.crawl
crawl.retry_at = timezone.now()
if crawl.status != Crawl.StatusChoices.STARTED:
crawl.status = Crawl.StatusChoices.QUEUED
crawl.save(update_fields=['status', 'retry_at', 'modified_at'])
crawl.save(update_fields=["status", "retry_at", "modified_at"])
crawl_id = str(snapshot.crawl_id)
snapshot_ids_by_crawl[crawl_id].add(str(snapshot.id))
if plugin_name:
@@ -203,7 +208,7 @@ def process_stdin_records() -> int:
output_records.append(record)
except Exception as e:
rprint(f'[yellow]Error processing record: {e}[/yellow]', file=sys.stderr)
rprint(f"[yellow]Error processing record: {e}[/yellow]", file=sys.stderr)
continue
# Output all processed records (for chaining)
@@ -212,10 +217,10 @@ def process_stdin_records() -> int:
write_record(rec)
if queued_count == 0:
rprint('[yellow]No records to process[/yellow]', file=sys.stderr)
rprint("[yellow]No records to process[/yellow]", file=sys.stderr)
return 0
rprint(f'[blue]Processing {queued_count} records...[/blue]', file=sys.stderr)
rprint(f"[blue]Processing {queued_count} records...[/blue]", file=sys.stderr)
for binary_id in binary_ids:
run_binary(binary_id)
@@ -245,13 +250,14 @@ def run_runner(daemon: bool = False) -> int:
from archivebox.services.runner import recover_orphaned_crawls, recover_orphaned_snapshots, run_pending_crawls
Process.cleanup_stale_running()
Process.cleanup_orphaned_workers()
recover_orphaned_snapshots()
recover_orphaned_crawls()
Machine.current()
current = Process.current()
if current.process_type != Process.TypeChoices.ORCHESTRATOR:
current.process_type = Process.TypeChoices.ORCHESTRATOR
current.save(update_fields=['process_type', 'modified_at'])
current.save(update_fields=["process_type", "modified_at"])
try:
run_pending_crawls(daemon=daemon)
@@ -259,21 +265,21 @@ def run_runner(daemon: bool = False) -> int:
except KeyboardInterrupt:
return 0
except Exception as e:
rprint(f'[red]Runner error: {type(e).__name__}: {e}[/red]', file=sys.stderr)
rprint(f"[red]Runner error: {type(e).__name__}: {e}[/red]", file=sys.stderr)
return 1
finally:
current.refresh_from_db()
if current.status != Process.StatusChoices.EXITED:
current.status = Process.StatusChoices.EXITED
current.ended_at = current.ended_at or timezone.now()
current.save(update_fields=['status', 'ended_at', 'modified_at'])
current.save(update_fields=["status", "ended_at", "modified_at"])
@click.command()
@click.option('--daemon', '-d', is_flag=True, help="Run forever (don't exit on idle)")
@click.option('--crawl-id', help="Run the crawl runner for a specific crawl only")
@click.option('--snapshot-id', help="Run one snapshot through its crawl")
@click.option('--binary-id', help="Run one queued binary install directly on the bus")
@click.option("--daemon", "-d", is_flag=True, help="Run forever (don't exit on idle)")
@click.option("--crawl-id", help="Run the crawl runner for a specific crawl only")
@click.option("--snapshot-id", help="Run one snapshot through its crawl")
@click.option("--binary-id", help="Run one queued binary install directly on the bus")
def main(daemon: bool, crawl_id: str, snapshot_id: str, binary_id: str):
"""
Process queued work.
@@ -297,21 +303,24 @@ def main(daemon: bool, crawl_id: str, snapshot_id: str, binary_id: str):
except KeyboardInterrupt:
sys.exit(0)
except Exception as e:
rprint(f'[red]Runner error: {type(e).__name__}: {e}[/red]', file=sys.stderr)
rprint(f"[red]Runner error: {type(e).__name__}: {e}[/red]", file=sys.stderr)
import traceback
traceback.print_exc()
sys.exit(1)
if crawl_id:
try:
from archivebox.services.runner import run_crawl
run_crawl(crawl_id)
sys.exit(0)
except KeyboardInterrupt:
sys.exit(0)
except Exception as e:
rprint(f'[red]Runner error: {type(e).__name__}: {e}[/red]', file=sys.stderr)
rprint(f"[red]Runner error: {type(e).__name__}: {e}[/red]", file=sys.stderr)
import traceback
traceback.print_exc()
sys.exit(1)
@@ -333,17 +342,18 @@ def run_snapshot_worker(snapshot_id: str) -> int:
from archivebox.services.runner import run_crawl
try:
snapshot = Snapshot.objects.select_related('crawl').get(id=snapshot_id)
snapshot = Snapshot.objects.select_related("crawl").get(id=snapshot_id)
run_crawl(str(snapshot.crawl_id), snapshot_ids=[str(snapshot.id)])
return 0
except KeyboardInterrupt:
return 0
except Exception as e:
rprint(f'[red]Runner error: {type(e).__name__}: {e}[/red]', file=sys.stderr)
rprint(f"[red]Runner error: {type(e).__name__}: {e}[/red]", file=sys.stderr)
import traceback
traceback.print_exc()
return 1
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -1,6 +1,6 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__package__ = "archivebox.cli"
import rich_click as click
from rich import print
@@ -10,18 +10,20 @@ from archivebox.config.common import ARCHIVING_CONFIG
@enforce_types
def schedule(add: bool = False,
show: bool = False,
clear: bool = False,
foreground: bool = False,
run_all: bool = False,
quiet: bool = False,
every: str | None = None,
tag: str = '',
depth: int | str = 0,
overwrite: bool = False,
update: bool = not ARCHIVING_CONFIG.ONLY_NEW,
import_path: str | None = None):
def schedule(
add: bool = False,
show: bool = False,
clear: bool = False,
foreground: bool = False,
run_all: bool = False,
quiet: bool = False,
every: str | None = None,
tag: str = "",
depth: int | str = 0,
overwrite: bool = False,
update: bool = not ARCHIVING_CONFIG.ONLY_NEW,
import_path: str | None = None,
):
"""Manage database-backed scheduled crawls processed by the crawl runner."""
from django.utils import timezone
@@ -33,55 +35,51 @@ def schedule(add: bool = False,
depth = int(depth)
result: dict[str, object] = {
'created_schedule_ids': [],
'disabled_count': 0,
'run_all_enqueued': 0,
'active_schedule_ids': [],
"created_schedule_ids": [],
"disabled_count": 0,
"run_all_enqueued": 0,
"active_schedule_ids": [],
}
def _active_schedules():
return CrawlSchedule.objects.filter(is_enabled=True).select_related('template').order_by('created_at')
return CrawlSchedule.objects.filter(is_enabled=True).select_related("template").order_by("created_at")
if clear:
disabled_count = CrawlSchedule.objects.filter(is_enabled=True).update(
is_enabled=False,
modified_at=timezone.now(),
)
result['disabled_count'] = disabled_count
print(f'[green]\\[√] Disabled {disabled_count} scheduled crawl(s).[/green]')
result["disabled_count"] = disabled_count
print(f"[green]\\[√] Disabled {disabled_count} scheduled crawl(s).[/green]")
if every or add:
schedule_str = (every or 'day').strip()
schedule_str = (every or "day").strip()
validate_schedule(schedule_str)
created_by_id = get_or_create_system_user_pk()
is_update_schedule = not import_path
template_urls = import_path or 'archivebox://update'
template_label = (
f'Scheduled import: {template_urls}'
if import_path else
'Scheduled ArchiveBox update'
)[:64]
template_urls = import_path or "archivebox://update"
template_label = (f"Scheduled import: {template_urls}" if import_path else "Scheduled ArchiveBox update")[:64]
template_notes = (
f'Created by archivebox schedule for {template_urls}'
if import_path else
'Created by archivebox schedule to queue recurring archivebox://update maintenance crawls.'
f"Created by archivebox schedule for {template_urls}"
if import_path
else "Created by archivebox schedule to queue recurring archivebox://update maintenance crawls."
)
template = Crawl.objects.create(
urls=template_urls,
max_depth=0 if is_update_schedule else depth,
tags_str='' if is_update_schedule else tag,
tags_str="" if is_update_schedule else tag,
label=template_label,
notes=template_notes,
created_by_id=created_by_id,
status=Crawl.StatusChoices.SEALED,
retry_at=None,
config={
'ONLY_NEW': not update,
'OVERWRITE': overwrite,
'DEPTH': 0 if is_update_schedule else depth,
'SCHEDULE_KIND': 'update' if is_update_schedule else 'crawl',
"ONLY_NEW": not update,
"OVERWRITE": overwrite,
"DEPTH": 0 if is_update_schedule else depth,
"SCHEDULE_KIND": "update" if is_update_schedule else "crawl",
},
)
crawl_schedule = CrawlSchedule.objects.create(
@@ -92,31 +90,31 @@ def schedule(add: bool = False,
notes=template_notes,
created_by_id=created_by_id,
)
result['created_schedule_ids'] = [str(crawl_schedule.id)]
result["created_schedule_ids"] = [str(crawl_schedule.id)]
schedule_type = 'maintenance update' if is_update_schedule else 'crawl'
print(f'[green]\\[√] Created scheduled {schedule_type}.[/green]')
print(f' id={crawl_schedule.id}')
print(f' every={crawl_schedule.schedule}')
print(f' next_run={crawl_schedule.next_run_at.isoformat()}')
schedule_type = "maintenance update" if is_update_schedule else "crawl"
print(f"[green]\\[√] Created scheduled {schedule_type}.[/green]")
print(f" id={crawl_schedule.id}")
print(f" every={crawl_schedule.schedule}")
print(f" next_run={crawl_schedule.next_run_at.isoformat()}")
if import_path:
print(f' source={import_path}')
print(f" source={import_path}")
schedules = list(_active_schedules())
result['active_schedule_ids'] = [str(schedule.id) for schedule in schedules]
result["active_schedule_ids"] = [str(schedule.id) for schedule in schedules]
if show:
if schedules:
print(f'[green]\\[*] Active scheduled crawls: {len(schedules)}[/green]')
print(f"[green]\\[*] Active scheduled crawls: {len(schedules)}[/green]")
for scheduled_crawl in schedules:
template = scheduled_crawl.template
print(
f' - id={scheduled_crawl.id} every={scheduled_crawl.schedule} '
f'next_run={scheduled_crawl.next_run_at.isoformat()} '
f'source={template.urls.splitlines()[0] if template.urls else ""}'
f" - id={scheduled_crawl.id} every={scheduled_crawl.schedule} "
f"next_run={scheduled_crawl.next_run_at.isoformat()} "
f"source={template.urls.splitlines()[0] if template.urls else ''}",
)
else:
print('[yellow]\\[*] No scheduled crawls are enabled.[/yellow]')
print("[yellow]\\[*] No scheduled crawls are enabled.[/yellow]")
if run_all:
enqueued = 0
@@ -124,13 +122,17 @@ def schedule(add: bool = False,
for scheduled_crawl in schedules:
scheduled_crawl.enqueue(queued_at=now)
enqueued += 1
result['run_all_enqueued'] = enqueued
print(f'[green]\\[*] Enqueued {enqueued} scheduled crawl(s) immediately.[/green]')
result["run_all_enqueued"] = enqueued
print(f"[green]\\[*] Enqueued {enqueued} scheduled crawl(s) immediately.[/green]")
if enqueued:
print('[yellow]\\[*] Start `archivebox server`, `archivebox run --daemon`, or `archivebox schedule --foreground` to process the queued crawls.[/yellow]')
print(
"[yellow]\\[*] Start `archivebox server`, `archivebox run --daemon`, or `archivebox schedule --foreground` to process the queued crawls.[/yellow]",
)
if foreground:
print('[green]\\[*] Starting global crawl runner in foreground mode. It will materialize scheduled crawls and process queued work.[/green]')
print(
"[green]\\[*] Starting global crawl runner in foreground mode. It will materialize scheduled crawls and process queued work.[/green]",
)
run_pending_crawls(daemon=True)
if quiet:
@@ -138,33 +140,38 @@ def schedule(add: bool = False,
if not any((every, add, show, clear, foreground, run_all)):
if schedules:
print('[green]\\[*] Active scheduled crawls:[/green]')
print("[green]\\[*] Active scheduled crawls:[/green]")
for scheduled_crawl in schedules:
print(f' - {scheduled_crawl.id} every={scheduled_crawl.schedule} next_run={scheduled_crawl.next_run_at.isoformat()}')
print(f" - {scheduled_crawl.id} every={scheduled_crawl.schedule} next_run={scheduled_crawl.next_run_at.isoformat()}")
else:
print('[yellow]\\[*] No scheduled crawls are enabled.[/yellow]')
print("[yellow]\\[*] No scheduled crawls are enabled.[/yellow]")
return result
@click.command()
@click.option('--quiet', '-q', is_flag=True, help="Return structured results without extra summary output")
@click.option('--add', is_flag=True, help='Create a new scheduled crawl')
@click.option('--every', type=str, help='Run on an alias like daily/weekly/monthly or a cron expression such as "0 */6 * * *"')
@click.option('--tag', '-t', default='', help='Comma-separated tags to apply to scheduled crawl snapshots')
@click.option('--depth', type=click.Choice([str(i) for i in range(5)]), default='0', help='Recursively archive linked pages up to N hops away')
@click.option('--overwrite', is_flag=True, help='Overwrite existing data if URLs have been archived previously')
@click.option('--update', is_flag=True, help='Retry previously failed/skipped URLs when scheduled crawls run')
@click.option('--clear', is_flag=True, help='Disable all currently enabled schedules')
@click.option('--show', is_flag=True, help='Print all currently enabled schedules')
@click.option('--foreground', '-f', is_flag=True, help='Run the global crawl runner in the foreground (no crontab required)')
@click.option('--run-all', is_flag=True, help='Enqueue all enabled schedules immediately and process them once')
@click.argument('import_path', required=False)
@click.option("--quiet", "-q", is_flag=True, help="Return structured results without extra summary output")
@click.option("--add", is_flag=True, help="Create a new scheduled crawl")
@click.option("--every", type=str, help='Run on an alias like daily/weekly/monthly or a cron expression such as "0 */6 * * *"')
@click.option("--tag", "-t", default="", help="Comma-separated tags to apply to scheduled crawl snapshots")
@click.option(
"--depth",
type=click.Choice([str(i) for i in range(5)]),
default="0",
help="Recursively archive linked pages up to N hops away",
)
@click.option("--overwrite", is_flag=True, help="Overwrite existing data if URLs have been archived previously")
@click.option("--update", is_flag=True, help="Retry previously failed/skipped URLs when scheduled crawls run")
@click.option("--clear", is_flag=True, help="Disable all currently enabled schedules")
@click.option("--show", is_flag=True, help="Print all currently enabled schedules")
@click.option("--foreground", "-f", is_flag=True, help="Run the global crawl runner in the foreground (no crontab required)")
@click.option("--run-all", is_flag=True, help="Enqueue all enabled schedules immediately and process them once")
@click.argument("import_path", required=False)
@docstring(schedule.__doc__)
def main(**kwargs):
"""Manage database-backed scheduled crawls processed by the crawl runner."""
schedule(**kwargs)
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -1,11 +1,12 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__command__ = 'archivebox search'
__package__ = "archivebox.cli"
__command__ = "archivebox search"
import sys
from pathlib import Path
from typing import TYPE_CHECKING, Callable
from typing import TYPE_CHECKING
from collections.abc import Callable
import rich_click as click
@@ -20,30 +21,28 @@ if TYPE_CHECKING:
# Filter types for URL matching
LINK_FILTERS: dict[str, Callable[[str], Q]] = {
'exact': lambda pattern: Q(url=pattern),
'substring': lambda pattern: Q(url__icontains=pattern),
'regex': lambda pattern: Q(url__iregex=pattern),
'domain': lambda pattern: (
Q(url__istartswith=f'http://{pattern}')
| Q(url__istartswith=f'https://{pattern}')
| Q(url__istartswith=f'ftp://{pattern}')
"exact": lambda pattern: Q(url=pattern),
"substring": lambda pattern: Q(url__icontains=pattern),
"regex": lambda pattern: Q(url__iregex=pattern),
"domain": lambda pattern: (
Q(url__istartswith=f"http://{pattern}") | Q(url__istartswith=f"https://{pattern}") | Q(url__istartswith=f"ftp://{pattern}")
),
'tag': lambda pattern: Q(tags__name=pattern),
'timestamp': lambda pattern: Q(timestamp=pattern),
"tag": lambda pattern: Q(tags__name=pattern),
"timestamp": lambda pattern: Q(timestamp=pattern),
}
STATUS_CHOICES = ['indexed', 'archived', 'unarchived']
STATUS_CHOICES = ["indexed", "archived", "unarchived"]
def _apply_pattern_filters(
snapshots: QuerySet['Snapshot', 'Snapshot'],
snapshots: QuerySet["Snapshot", "Snapshot"],
filter_patterns: list[str],
filter_type: str,
) -> QuerySet['Snapshot', 'Snapshot']:
) -> QuerySet["Snapshot", "Snapshot"]:
filter_builder = LINK_FILTERS.get(filter_type)
if filter_builder is None:
stderr()
stderr(f'[X] Got invalid pattern for --filter-type={filter_type}', color='red')
stderr(f"[X] Got invalid pattern for --filter-type={filter_type}", color="red")
raise SystemExit(2)
query = Q()
@@ -53,7 +52,7 @@ def _apply_pattern_filters(
def _snapshots_to_json(
snapshots: QuerySet['Snapshot', 'Snapshot'],
snapshots: QuerySet["Snapshot", "Snapshot"],
*,
with_headers: bool,
) -> str:
@@ -63,31 +62,35 @@ def _snapshots_to_json(
from archivebox.config.common import SERVER_CONFIG
from archivebox.misc.util import to_json
main_index_header = {
'info': 'This is an index of site data archived by ArchiveBox: The self-hosted web archive.',
'schema': 'archivebox.index.json',
'copyright_info': SERVER_CONFIG.FOOTER_INFO,
'meta': {
'project': 'ArchiveBox',
'version': VERSION,
'git_sha': VERSION,
'website': 'https://ArchiveBox.io',
'docs': 'https://github.com/ArchiveBox/ArchiveBox/wiki',
'source': 'https://github.com/ArchiveBox/ArchiveBox',
'issues': 'https://github.com/ArchiveBox/ArchiveBox/issues',
'dependencies': {},
},
} if with_headers else {}
main_index_header = (
{
"info": "This is an index of site data archived by ArchiveBox: The self-hosted web archive.",
"schema": "archivebox.index.json",
"copyright_info": SERVER_CONFIG.FOOTER_INFO,
"meta": {
"project": "ArchiveBox",
"version": VERSION,
"git_sha": VERSION,
"website": "https://ArchiveBox.io",
"docs": "https://github.com/ArchiveBox/ArchiveBox/wiki",
"source": "https://github.com/ArchiveBox/ArchiveBox",
"issues": "https://github.com/ArchiveBox/ArchiveBox/issues",
"dependencies": {},
},
}
if with_headers
else {}
)
snapshot_dicts = [snapshot.to_dict(extended=True) for snapshot in snapshots.iterator(chunk_size=500)]
output: dict[str, object] | list[dict[str, object]]
if with_headers:
output = {
**main_index_header,
'num_links': len(snapshot_dicts),
'updated': datetime.now(tz.utc),
'last_run_cmd': sys.argv,
'links': snapshot_dicts,
"num_links": len(snapshot_dicts),
"updated": datetime.now(tz.utc),
"last_run_cmd": sys.argv,
"links": snapshot_dicts,
}
else:
output = snapshot_dicts
@@ -96,18 +99,18 @@ def _snapshots_to_json(
def _snapshots_to_csv(
snapshots: QuerySet['Snapshot', 'Snapshot'],
snapshots: QuerySet["Snapshot", "Snapshot"],
*,
cols: list[str],
with_headers: bool,
) -> str:
header = ','.join(cols) if with_headers else ''
rows = [snapshot.to_csv(cols=cols, separator=',') for snapshot in snapshots.iterator(chunk_size=500)]
return '\n'.join((header, *rows))
header = ",".join(cols) if with_headers else ""
rows = [snapshot.to_csv(cols=cols, separator=",") for snapshot in snapshots.iterator(chunk_size=500)]
return "\n".join((header, *rows))
def _snapshots_to_html(
snapshots: QuerySet['Snapshot', 'Snapshot'],
snapshots: QuerySet["Snapshot", "Snapshot"],
*,
with_headers: bool,
) -> str:
@@ -119,26 +122,31 @@ def _snapshots_to_html(
from archivebox.config.common import SERVER_CONFIG
from archivebox.config.version import get_COMMIT_HASH
template = 'static_index.html' if with_headers else 'minimal_index.html'
template = "static_index.html" if with_headers else "minimal_index.html"
snapshot_list = list(snapshots.iterator(chunk_size=500))
return render_to_string(template, {
'version': VERSION,
'git_sha': get_COMMIT_HASH() or VERSION,
'num_links': str(len(snapshot_list)),
'date_updated': datetime.now(tz.utc).strftime('%Y-%m-%d'),
'time_updated': datetime.now(tz.utc).strftime('%Y-%m-%d %H:%M'),
'links': snapshot_list,
'FOOTER_INFO': SERVER_CONFIG.FOOTER_INFO,
})
return render_to_string(
template,
{
"version": VERSION,
"git_sha": get_COMMIT_HASH() or VERSION,
"num_links": str(len(snapshot_list)),
"date_updated": datetime.now(tz.utc).strftime("%Y-%m-%d"),
"time_updated": datetime.now(tz.utc).strftime("%Y-%m-%d %H:%M"),
"links": snapshot_list,
"FOOTER_INFO": SERVER_CONFIG.FOOTER_INFO,
},
)
def get_snapshots(snapshots: QuerySet['Snapshot', 'Snapshot'] | None=None,
filter_patterns: list[str] | None=None,
filter_type: str='substring',
after: float | None=None,
before: float | None=None,
out_dir: Path=DATA_DIR) -> QuerySet['Snapshot', 'Snapshot']:
def get_snapshots(
snapshots: QuerySet["Snapshot", "Snapshot"] | None = None,
filter_patterns: list[str] | None = None,
filter_type: str = "substring",
after: float | None = None,
before: float | None = None,
out_dir: Path = DATA_DIR,
) -> QuerySet["Snapshot", "Snapshot"]:
"""Filter and return Snapshots matching the given criteria."""
from archivebox.core.models import Snapshot
@@ -155,29 +163,31 @@ def get_snapshots(snapshots: QuerySet['Snapshot', 'Snapshot'] | None=None,
result = _apply_pattern_filters(result, filter_patterns, filter_type)
# Prefetch crawl relationship to avoid N+1 queries when accessing output_dir
result = result.select_related('crawl', 'crawl__created_by')
result = result.select_related("crawl", "crawl__created_by")
if not result.exists():
stderr('[!] No Snapshots matched your filters:', filter_patterns, f'({filter_type})', color='lightyellow')
stderr("[!] No Snapshots matched your filters:", filter_patterns, f"({filter_type})", color="lightyellow")
return result
@enforce_types
def search(filter_patterns: list[str] | None=None,
filter_type: str='substring',
status: str='indexed',
before: float | None=None,
after: float | None=None,
sort: str | None=None,
json: bool=False,
html: bool=False,
csv: str | None=None,
with_headers: bool=False):
def search(
filter_patterns: list[str] | None = None,
filter_type: str = "substring",
status: str = "indexed",
before: float | None = None,
after: float | None = None,
sort: str | None = None,
json: bool = False,
html: bool = False,
csv: str | None = None,
with_headers: bool = False,
):
"""List, filter, and export information about archive entries"""
if with_headers and not (json or html or csv):
stderr('[X] --with-headers requires --json, --html or --csv\n', color='red')
stderr("[X] --with-headers requires --json, --html or --csv\n", color="red")
raise SystemExit(2)
# Query DB directly - no filesystem scanning
@@ -189,9 +199,9 @@ def search(filter_patterns: list[str] | None=None,
)
# Apply status filter
if status == 'archived':
if status == "archived":
snapshots = snapshots.filter(downloaded_at__isnull=False)
elif status == 'unarchived':
elif status == "unarchived":
snapshots = snapshots.filter(downloaded_at__isnull=True)
# 'indexed' = all snapshots (no filter)
@@ -204,9 +214,10 @@ def search(filter_patterns: list[str] | None=None,
elif html:
output = _snapshots_to_html(snapshots, with_headers=with_headers)
elif csv:
output = _snapshots_to_csv(snapshots, cols=csv.split(','), with_headers=with_headers)
output = _snapshots_to_csv(snapshots, cols=csv.split(","), with_headers=with_headers)
else:
from archivebox.misc.logging_util import printable_folders
# Convert to dict for printable_folders
folders: dict[str, Snapshot | None] = {str(snapshot.output_dir): snapshot for snapshot in snapshots}
output = printable_folders(folders, with_headers)
@@ -214,28 +225,33 @@ def search(filter_patterns: list[str] | None=None,
# Structured exports must be written directly to stdout.
# rich.print() reflows long lines to console width, which corrupts JSON/CSV/HTML output.
sys.stdout.write(output)
if not output.endswith('\n'):
sys.stdout.write('\n')
if not output.endswith("\n"):
sys.stdout.write("\n")
return output
@click.command()
@click.option('--filter-type', '-f', type=click.Choice(['search', *LINK_FILTERS.keys()]), default='substring', help='Pattern matching type for filtering URLs')
@click.option('--status', '-s', type=click.Choice(STATUS_CHOICES), default='indexed', help='List snapshots with the given status')
@click.option('--before', '-b', type=float, help='List snapshots bookmarked before the given UNIX timestamp')
@click.option('--after', '-a', type=float, help='List snapshots bookmarked after the given UNIX timestamp')
@click.option('--sort', '-o', type=str, help='Field to sort by, e.g. url, created_at, bookmarked_at, downloaded_at')
@click.option('--json', '-J', is_flag=True, help='Print output in JSON format')
@click.option('--html', '-M', is_flag=True, help='Print output in HTML format (suitable for viewing statically without a server)')
@click.option('--csv', '-C', type=str, help='Print output as CSV with the provided fields, e.g.: created_at,url,title')
@click.option('--with-headers', '-H', is_flag=True, help='Include extra CSV/HTML headers in the output')
@click.help_option('--help', '-h')
@click.argument('filter_patterns', nargs=-1)
@click.option(
"--filter-type",
"-f",
type=click.Choice(["search", *LINK_FILTERS.keys()]),
default="substring",
help="Pattern matching type for filtering URLs",
)
@click.option("--status", "-s", type=click.Choice(STATUS_CHOICES), default="indexed", help="List snapshots with the given status")
@click.option("--before", "-b", type=float, help="List snapshots bookmarked before the given UNIX timestamp")
@click.option("--after", "-a", type=float, help="List snapshots bookmarked after the given UNIX timestamp")
@click.option("--sort", "-o", type=str, help="Field to sort by, e.g. url, created_at, bookmarked_at, downloaded_at")
@click.option("--json", "-J", is_flag=True, help="Print output in JSON format")
@click.option("--html", "-M", is_flag=True, help="Print output in HTML format (suitable for viewing statically without a server)")
@click.option("--csv", "-C", type=str, help="Print output as CSV with the provided fields, e.g.: created_at,url,title")
@click.option("--with-headers", "-H", is_flag=True, help="Include extra CSV/HTML headers in the output")
@click.help_option("--help", "-h")
@click.argument("filter_patterns", nargs=-1)
@docstring(search.__doc__)
def main(**kwargs):
return search(**kwargs)
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -1,8 +1,8 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__package__ = "archivebox.cli"
from typing import Iterable
from collections.abc import Iterable
import sys
import rich_click as click
@@ -15,20 +15,23 @@ from archivebox.config.common import SERVER_CONFIG
def stop_existing_background_runner(*, machine, process_model, supervisor=None, stop_worker_fn=None, log=print) -> int:
"""Stop any existing orchestrator process so the server can take ownership."""
process_model.cleanup_stale_running(machine=machine)
process_model.cleanup_orphaned_workers()
running_runners = list(process_model.objects.filter(
machine=machine,
status=process_model.StatusChoices.RUNNING,
process_type=process_model.TypeChoices.ORCHESTRATOR,
).order_by('created_at'))
running_runners = list(
process_model.objects.filter(
machine=machine,
status=process_model.StatusChoices.RUNNING,
process_type=process_model.TypeChoices.ORCHESTRATOR,
).order_by("created_at"),
)
if not running_runners:
return 0
log('[yellow][*] Stopping existing ArchiveBox background runner...[/yellow]')
log("[yellow][*] Stopping existing ArchiveBox background runner...[/yellow]")
if supervisor is not None and stop_worker_fn is not None:
for worker_name in ('worker_runner', 'worker_runner_watch'):
for worker_name in ("worker_runner", "worker_runner_watch"):
try:
stop_worker_fn(supervisor, worker_name)
except Exception:
@@ -47,23 +50,70 @@ def stop_existing_background_runner(*, machine, process_model, supervisor=None,
return len(running_runners)
def _read_supervisor_worker_command(worker_name: str) -> str:
from archivebox.workers.supervisord_util import WORKERS_DIR_NAME, get_sock_file
worker_conf = get_sock_file().parent / WORKERS_DIR_NAME / f"{worker_name}.conf"
if not worker_conf.exists():
return ""
for line in worker_conf.read_text().splitlines():
if line.startswith("command="):
return line.removeprefix("command=").strip()
return ""
def _worker_command_matches_bind(command: str, host: str, port: str) -> bool:
if not command:
return False
return f"{host}:{port}" in command or (f"--bind={host}" in command and f"--port={port}" in command)
def stop_existing_server_workers(*, supervisor, stop_worker_fn, host: str, port: str, log=print) -> int:
"""Stop existing ArchiveBox web workers if they already own the requested bind."""
stopped = 0
for worker_name in ("worker_runserver", "worker_daphne"):
try:
proc = supervisor.getProcessInfo(worker_name) if supervisor else None
except Exception:
proc = None
if not isinstance(proc, dict) or proc.get("statename") != "RUNNING":
continue
command = _read_supervisor_worker_command(worker_name)
if not _worker_command_matches_bind(command, host, port):
continue
if stopped == 0:
log("[yellow][*] Taking over existing ArchiveBox web server on same port...[/yellow]")
stop_worker_fn(supervisor, worker_name)
stopped += 1
return stopped
@enforce_types
def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
reload: bool=False,
init: bool=False,
debug: bool=False,
daemonize: bool=False,
nothreading: bool=False) -> None:
def server(
runserver_args: Iterable[str] = (SERVER_CONFIG.BIND_ADDR,),
reload: bool = False,
init: bool = False,
debug: bool = False,
daemonize: bool = False,
nothreading: bool = False,
) -> None:
"""Run the ArchiveBox HTTP server"""
runserver_args = list(runserver_args)
if init:
from archivebox.cli.archivebox_init import init as archivebox_init
archivebox_init(quick=True)
print()
from archivebox.misc.checks import check_data_folder
check_data_folder()
from archivebox.config.common import SHELL_CONFIG
@@ -73,22 +123,24 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
SHELL_CONFIG.DEBUG = True
from django.contrib.auth.models import User
if not User.objects.filter(is_superuser=True).exclude(username='system').exists():
if not User.objects.filter(is_superuser=True).exclude(username="system").exists():
print()
print('[violet]Hint:[/violet] To create an [bold]admin username & password[/bold] for the [deep_sky_blue3][underline][link=http://{host}:{port}/admin]Admin UI[/link][/underline][/deep_sky_blue3], run:')
print(' [green]archivebox manage createsuperuser[/green]')
print(
"[violet]Hint:[/violet] To create an [bold]admin username & password[/bold] for the [deep_sky_blue3][underline][link=http://{host}:{port}/admin]Admin UI[/link][/underline][/deep_sky_blue3], run:",
)
print(" [green]archivebox manage createsuperuser[/green]")
print()
host = '127.0.0.1'
port = '8000'
host = "127.0.0.1"
port = "8000"
try:
host_and_port = [arg for arg in runserver_args if arg.replace('.', '').replace(':', '').isdigit()][0]
if ':' in host_and_port:
host, port = host_and_port.split(':')
host_and_port = [arg for arg in runserver_args if arg.replace(".", "").replace(":", "").isdigit()][0]
if ":" in host_and_port:
host, port = host_and_port.split(":")
else:
if '.' in host_and_port:
if "." in host_and_port:
host = host_and_port
else:
port = host_and_port
@@ -104,66 +156,80 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
)
from archivebox.machine.models import Machine, Process
# Check if port is already in use
if is_port_in_use(host, int(port)):
print(f'[red][X] Error: Port {port} is already in use[/red]')
print(f' Another process (possibly daphne or runserver) is already listening on {host}:{port}')
print(' Stop the conflicting process or choose a different port')
sys.exit(1)
machine = Machine.current()
supervisor = get_existing_supervisord_process()
stop_existing_background_runner(
machine=machine,
process_model=Process,
supervisor=get_existing_supervisord_process(),
supervisor=supervisor,
stop_worker_fn=stop_worker,
)
if supervisor:
stop_existing_server_workers(
supervisor=supervisor,
stop_worker_fn=stop_worker,
host=host,
port=port,
)
# Check if port is already in use
if is_port_in_use(host, int(port)):
print(f"[red][X] Error: Port {port} is already in use[/red]")
print(f" Another process (possibly daphne or runserver) is already listening on {host}:{port}")
print(" Stop the conflicting process or choose a different port")
sys.exit(1)
supervisor = get_existing_supervisord_process()
if supervisor:
server_worker_name = 'worker_runserver' if run_in_debug else 'worker_daphne'
server_worker_name = "worker_runserver" if run_in_debug else "worker_daphne"
server_proc = get_worker(supervisor, server_worker_name)
server_state = server_proc.get('statename') if isinstance(server_proc, dict) else None
if server_state == 'RUNNING':
runner_proc = get_worker(supervisor, 'worker_runner')
runner_watch_proc = get_worker(supervisor, 'worker_runner_watch')
runner_state = runner_proc.get('statename') if isinstance(runner_proc, dict) else None
runner_watch_state = runner_watch_proc.get('statename') if isinstance(runner_watch_proc, dict) else None
print('[red][X] Error: ArchiveBox server is already running[/red]')
print(f' [green]√[/green] Web server ({server_worker_name}) is RUNNING on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
if runner_state == 'RUNNING':
print(' [green]√[/green] Background runner (worker_runner) is RUNNING')
if runner_watch_state == 'RUNNING':
print(' [green]√[/green] Reload watcher (worker_runner_watch) is RUNNING')
server_state = server_proc.get("statename") if isinstance(server_proc, dict) else None
if server_state == "RUNNING":
runner_proc = get_worker(supervisor, "worker_runner")
runner_watch_proc = get_worker(supervisor, "worker_runner_watch")
runner_state = runner_proc.get("statename") if isinstance(runner_proc, dict) else None
runner_watch_state = runner_watch_proc.get("statename") if isinstance(runner_watch_proc, dict) else None
print("[red][X] Error: ArchiveBox server is already running[/red]")
print(
f" [green]√[/green] Web server ({server_worker_name}) is RUNNING on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]",
)
if runner_state == "RUNNING":
print(" [green]√[/green] Background runner (worker_runner) is RUNNING")
if runner_watch_state == "RUNNING":
print(" [green]√[/green] Reload watcher (worker_runner_watch) is RUNNING")
print()
print('[yellow]To stop the existing server, run:[/yellow]')
print("[yellow]To stop the existing server, run:[/yellow]")
print(' pkill -f "archivebox server"')
print(' pkill -f supervisord')
print(" pkill -f supervisord")
sys.exit(1)
if run_in_debug:
print('[green][+] Starting ArchiveBox webserver in DEBUG mode...[/green]')
print("[green][+] Starting ArchiveBox webserver in DEBUG mode...[/green]")
else:
print('[green][+] Starting ArchiveBox webserver...[/green]')
print(f' [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
print(f' [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]')
print(' > Writing ArchiveBox error log to ./logs/errors.log')
print("[green][+] Starting ArchiveBox webserver...[/green]")
print(
f" [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]",
)
print(
f" [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]",
)
print(" > Writing ArchiveBox error log to ./logs/errors.log")
print()
start_server_workers(host=host, port=port, daemonize=daemonize, debug=run_in_debug, reload=reload, nothreading=nothreading)
print("\n[i][green][🟩] ArchiveBox server shut down gracefully.[/green][/i]")
@click.command()
@click.argument('runserver_args', nargs=-1)
@click.option('--reload', is_flag=True, help='Enable auto-reloading when code or templates change')
@click.option('--debug', is_flag=True, help='Enable DEBUG=True mode with more verbose errors')
@click.option('--nothreading', is_flag=True, help='Force runserver to run in single-threaded mode')
@click.option('--init', is_flag=True, help='Run a full archivebox init/upgrade before starting the server')
@click.option('--daemonize', is_flag=True, help='Run the server in the background as a daemon')
@click.argument("runserver_args", nargs=-1)
@click.option("--reload", is_flag=True, help="Enable auto-reloading when code or templates change")
@click.option("--debug", is_flag=True, help="Enable DEBUG=True mode with more verbose errors")
@click.option("--nothreading", is_flag=True, help="Force runserver to run in single-threaded mode")
@click.option("--init", is_flag=True, help="Run a full archivebox init/upgrade before starting the server")
@click.option("--daemonize", is_flag=True, help="Run the server in the background as a daemon")
@docstring(server.__doc__)
def main(**kwargs):
server(**kwargs)
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -1,27 +1,28 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__package__ = "archivebox.cli"
from typing import Iterable
from collections.abc import Iterable
import rich_click as click
from archivebox.misc.util import docstring
def shell(args: Iterable[str]=()) -> None:
def shell(args: Iterable[str] = ()) -> None:
"""Enter an interactive ArchiveBox Django shell"""
from django.core.management import call_command
call_command("shell_plus", *args)
@click.command(add_help_option=False, context_settings=dict(ignore_unknown_options=True))
@click.argument('args', nargs=-1)
@click.argument("args", nargs=-1)
@docstring(shell.__doc__)
def main(args: Iterable[str]=()) -> None:
def main(args: Iterable[str] = ()) -> None:
shell(args=args)
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -27,14 +27,16 @@ Examples:
archivebox snapshot list --url__icontains=spam.com | archivebox snapshot delete --yes
"""
__package__ = 'archivebox.cli'
__command__ = 'archivebox snapshot'
__package__ = "archivebox.cli"
__command__ = "archivebox snapshot"
import sys
from typing import Optional, Iterable
from collections.abc import Iterable
import rich_click as click
from rich import print as rprint
from django.db.models import Q, Sum
from django.db.models.functions import Coalesce
from archivebox.cli.cli_utils import apply_filters
@@ -43,12 +45,13 @@ from archivebox.cli.cli_utils import apply_filters
# CREATE
# =============================================================================
def create_snapshots(
urls: Iterable[str],
tag: str = '',
status: str = 'queued',
tag: str = "",
status: str = "queued",
depth: int = 0,
created_by_id: Optional[int] = None,
created_by_id: int | None = None,
) -> int:
"""
Create Snapshots from URLs or stdin JSONL (Crawl or Snapshot records).
@@ -59,8 +62,10 @@ def create_snapshots(
1: Failure
"""
from archivebox.misc.jsonl import (
read_args_or_stdin, write_record,
TYPE_SNAPSHOT, TYPE_CRAWL
read_args_or_stdin,
write_record,
TYPE_SNAPSHOT,
TYPE_CRAWL,
)
from archivebox.base_models.models import get_or_create_system_user_pk
from archivebox.core.models import Snapshot
@@ -73,7 +78,7 @@ def create_snapshots(
records = list(read_args_or_stdin(urls))
if not records:
rprint('[yellow]No URLs or Crawls provided. Pass URLs as arguments or via stdin.[/yellow]', file=sys.stderr)
rprint("[yellow]No URLs or Crawls provided. Pass URLs as arguments or via stdin.[/yellow]", file=sys.stderr)
return 1
# Process each record - handle Crawls and plain URLs/Snapshots
@@ -81,7 +86,7 @@ def create_snapshots(
pass_through_count = 0
for record in records:
record_type = record.get('type', '')
record_type = record.get("type", "")
try:
if record_type == TYPE_CRAWL:
@@ -91,14 +96,14 @@ def create_snapshots(
# Input is a Crawl - get or create it, then create Snapshots for its URLs
crawl = None
crawl_id = record.get('id')
crawl_id = record.get("id")
if crawl_id:
try:
crawl = Crawl.objects.get(id=crawl_id)
except Crawl.DoesNotExist:
crawl = Crawl.from_json(record, overrides={'created_by_id': created_by_id})
crawl = Crawl.from_json(record, overrides={"created_by_id": created_by_id})
else:
crawl = Crawl.from_json(record, overrides={'created_by_id': created_by_id})
crawl = Crawl.from_json(record, overrides={"created_by_id": created_by_id})
if not crawl:
continue
@@ -109,27 +114,27 @@ def create_snapshots(
if tag:
merged_tags = f"{merged_tags},{tag}" if merged_tags else tag
snapshot_record = {
'url': url,
'tags': merged_tags,
'crawl_id': str(crawl.id),
'depth': depth,
'status': status,
"url": url,
"tags": merged_tags,
"crawl_id": str(crawl.id),
"depth": depth,
"status": status,
}
snapshot = Snapshot.from_json(snapshot_record, overrides={'created_by_id': created_by_id})
snapshot = Snapshot.from_json(snapshot_record, overrides={"created_by_id": created_by_id})
if snapshot:
created_snapshots.append(snapshot)
if not is_tty:
write_record(snapshot.to_json())
elif record_type == TYPE_SNAPSHOT or record.get('url'):
elif record_type == TYPE_SNAPSHOT or record.get("url"):
# Input is a Snapshot or plain URL
if tag and not record.get('tags'):
record['tags'] = tag
if tag and not record.get("tags"):
record["tags"] = tag
if status:
record['status'] = status
record['depth'] = record.get('depth', depth)
record["status"] = status
record["depth"] = record.get("depth", depth)
snapshot = Snapshot.from_json(record, overrides={'created_by_id': created_by_id})
snapshot = Snapshot.from_json(record, overrides={"created_by_id": created_by_id})
if snapshot:
created_snapshots.append(snapshot)
if not is_tty:
@@ -142,21 +147,21 @@ def create_snapshots(
pass_through_count += 1
except Exception as e:
rprint(f'[red]Error creating snapshot: {e}[/red]', file=sys.stderr)
rprint(f"[red]Error creating snapshot: {e}[/red]", file=sys.stderr)
continue
if not created_snapshots:
if pass_through_count > 0:
rprint(f'[dim]Passed through {pass_through_count} records, no new snapshots[/dim]', file=sys.stderr)
rprint(f"[dim]Passed through {pass_through_count} records, no new snapshots[/dim]", file=sys.stderr)
return 0
rprint('[red]No snapshots created[/red]', file=sys.stderr)
rprint("[red]No snapshots created[/red]", file=sys.stderr)
return 1
rprint(f'[green]Created {len(created_snapshots)} snapshots[/green]', file=sys.stderr)
rprint(f"[green]Created {len(created_snapshots)} snapshots[/green]", file=sys.stderr)
if is_tty:
for snapshot in created_snapshots:
rprint(f' [dim]{snapshot.id}[/dim] {snapshot.url[:60]}', file=sys.stderr)
rprint(f" [dim]{snapshot.id}[/dim] {snapshot.url[:60]}", file=sys.stderr)
return 0
@@ -165,16 +170,19 @@ def create_snapshots(
# LIST
# =============================================================================
def list_snapshots(
status: Optional[str] = None,
url__icontains: Optional[str] = None,
url__istartswith: Optional[str] = None,
tag: Optional[str] = None,
crawl_id: Optional[str] = None,
limit: Optional[int] = None,
sort: Optional[str] = None,
csv: Optional[str] = None,
status: str | None = None,
url__icontains: str | None = None,
url__istartswith: str | None = None,
tag: str | None = None,
crawl_id: str | None = None,
limit: int | None = None,
sort: str | None = None,
csv: str | None = None,
with_headers: bool = False,
search: str | None = None,
query: str | None = None,
) -> int:
"""
List Snapshots as JSONL with optional filters.
@@ -184,64 +192,106 @@ def list_snapshots(
"""
from archivebox.misc.jsonl import write_record
from archivebox.core.models import Snapshot
from archivebox.search import (
get_default_search_mode,
get_search_mode,
prioritize_metadata_matches,
query_search_index,
)
if with_headers and not csv:
rprint('[red]--with-headers requires --csv[/red]', file=sys.stderr)
rprint("[red]--with-headers requires --csv[/red]", file=sys.stderr)
return 2
is_tty = sys.stdout.isatty() and not csv
queryset = Snapshot.objects.all().order_by('-created_at')
queryset = Snapshot.objects.annotate(output_size_sum=Coalesce(Sum("archiveresult__output_size"), 0)).order_by("-created_at")
# Apply filters
filter_kwargs = {
'status': status,
'url__icontains': url__icontains,
'url__istartswith': url__istartswith,
'crawl_id': crawl_id,
"status": status,
"url__icontains": url__icontains,
"url__istartswith": url__istartswith,
"crawl_id": crawl_id,
}
queryset = apply_filters(queryset, filter_kwargs, limit=limit)
queryset = apply_filters(queryset, filter_kwargs)
# Tag filter requires special handling (M2M)
if tag:
queryset = queryset.filter(tags__name__iexact=tag)
query = (query or "").strip()
if query:
metadata_qs = queryset.filter(
Q(title__icontains=query) | Q(url__icontains=query) | Q(timestamp__icontains=query) | Q(tags__name__icontains=query),
)
requested_search_mode = (search or "").strip().lower()
if requested_search_mode == "content":
requested_search_mode = "contents"
search_mode = get_default_search_mode() if not requested_search_mode else get_search_mode(requested_search_mode)
if search_mode == "meta":
queryset = metadata_qs
else:
try:
deep_qsearch = None
if search_mode == "deep":
qsearch = query_search_index(query, search_mode="contents")
deep_qsearch = query_search_index(query, search_mode="deep")
else:
qsearch = query_search_index(query, search_mode=search_mode)
queryset = prioritize_metadata_matches(
queryset,
metadata_qs,
qsearch,
deep_queryset=deep_qsearch,
ordering=("-created_at",) if not sort else None,
)
except Exception as err:
rprint(
f"[yellow]Search backend error, falling back to metadata search: {err}[/yellow]",
file=sys.stderr,
)
queryset = metadata_qs
if sort:
queryset = queryset.order_by(sort)
if limit:
queryset = queryset[:limit]
count = 0
if csv:
cols = [col.strip() for col in csv.split(',') if col.strip()]
cols = [col.strip() for col in csv.split(",") if col.strip()]
if not cols:
rprint('[red]No CSV columns provided[/red]', file=sys.stderr)
rprint("[red]No CSV columns provided[/red]", file=sys.stderr)
return 2
rows: list[str] = []
if with_headers:
rows.append(','.join(cols))
rows.append(",".join(cols))
for snapshot in queryset.iterator(chunk_size=500):
rows.append(snapshot.to_csv(cols=cols, separator=','))
rows.append(snapshot.to_csv(cols=cols, separator=","))
count += 1
output = '\n'.join(rows)
output = "\n".join(rows)
if output:
sys.stdout.write(output)
if not output.endswith('\n'):
sys.stdout.write('\n')
rprint(f'[dim]Listed {count} snapshots[/dim]', file=sys.stderr)
if not output.endswith("\n"):
sys.stdout.write("\n")
rprint(f"[dim]Listed {count} snapshots[/dim]", file=sys.stderr)
return 0
for snapshot in queryset:
if is_tty:
status_color = {
'queued': 'yellow',
'started': 'blue',
'sealed': 'green',
}.get(snapshot.status, 'dim')
rprint(f'[{status_color}]{snapshot.status:8}[/{status_color}] [dim]{snapshot.id}[/dim] {snapshot.url[:60]}')
"queued": "yellow",
"started": "blue",
"sealed": "green",
}.get(snapshot.status, "dim")
rprint(f"[{status_color}]{snapshot.status:8}[/{status_color}] [dim]{snapshot.id}[/dim] {snapshot.url[:60]}")
else:
write_record(snapshot.to_json())
count += 1
rprint(f'[dim]Listed {count} snapshots[/dim]', file=sys.stderr)
rprint(f"[dim]Listed {count} snapshots[/dim]", file=sys.stderr)
return 0
@@ -249,9 +299,10 @@ def list_snapshots(
# UPDATE
# =============================================================================
def update_snapshots(
status: Optional[str] = None,
tag: Optional[str] = None,
status: str | None = None,
tag: str | None = None,
) -> int:
"""
Update Snapshots from stdin JSONL.
@@ -272,12 +323,12 @@ def update_snapshots(
records = list(read_stdin())
if not records:
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
return 1
updated_count = 0
for record in records:
snapshot_id = record.get('id')
snapshot_id = record.get("id")
if not snapshot_id:
continue
@@ -292,6 +343,7 @@ def update_snapshots(
# Add tag to existing tags
snapshot.save() # Ensure saved before M2M
from archivebox.core.models import Tag
tag_obj, _ = Tag.objects.get_or_create(name=tag)
snapshot.tags.add(tag_obj)
@@ -302,10 +354,10 @@ def update_snapshots(
write_record(snapshot.to_json())
except Snapshot.DoesNotExist:
rprint(f'[yellow]Snapshot not found: {snapshot_id}[/yellow]', file=sys.stderr)
rprint(f"[yellow]Snapshot not found: {snapshot_id}[/yellow]", file=sys.stderr)
continue
rprint(f'[green]Updated {updated_count} snapshots[/green]', file=sys.stderr)
rprint(f"[green]Updated {updated_count} snapshots[/green]", file=sys.stderr)
return 0
@@ -313,6 +365,7 @@ def update_snapshots(
# DELETE
# =============================================================================
def delete_snapshots(yes: bool = False, dry_run: bool = False) -> int:
"""
Delete Snapshots from stdin JSONL.
@@ -328,35 +381,35 @@ def delete_snapshots(yes: bool = False, dry_run: bool = False) -> int:
records = list(read_stdin())
if not records:
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
return 1
snapshot_ids = [r.get('id') for r in records if r.get('id')]
snapshot_ids = [r.get("id") for r in records if r.get("id")]
if not snapshot_ids:
rprint('[yellow]No valid snapshot IDs in input[/yellow]', file=sys.stderr)
rprint("[yellow]No valid snapshot IDs in input[/yellow]", file=sys.stderr)
return 1
snapshots = Snapshot.objects.filter(id__in=snapshot_ids)
count = snapshots.count()
if count == 0:
rprint('[yellow]No matching snapshots found[/yellow]', file=sys.stderr)
rprint("[yellow]No matching snapshots found[/yellow]", file=sys.stderr)
return 0
if dry_run:
rprint(f'[yellow]Would delete {count} snapshots (dry run)[/yellow]', file=sys.stderr)
rprint(f"[yellow]Would delete {count} snapshots (dry run)[/yellow]", file=sys.stderr)
for snapshot in snapshots:
rprint(f' [dim]{snapshot.id}[/dim] {snapshot.url[:60]}', file=sys.stderr)
rprint(f" [dim]{snapshot.id}[/dim] {snapshot.url[:60]}", file=sys.stderr)
return 0
if not yes:
rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr)
rprint("[red]Use --yes to confirm deletion[/red]", file=sys.stderr)
return 1
# Perform deletion
deleted_count, _ = snapshots.delete()
rprint(f'[green]Deleted {deleted_count} snapshots[/green]', file=sys.stderr)
rprint(f"[green]Deleted {deleted_count} snapshots[/green]", file=sys.stderr)
return 0
@@ -364,57 +417,81 @@ def delete_snapshots(yes: bool = False, dry_run: bool = False) -> int:
# CLI Commands
# =============================================================================
@click.group()
def main():
"""Manage Snapshot records."""
pass
@main.command('create')
@click.argument('urls', nargs=-1)
@click.option('--tag', '-t', default='', help='Comma-separated tags to add')
@click.option('--status', '-s', default='queued', help='Initial status (default: queued)')
@click.option('--depth', '-d', type=int, default=0, help='Crawl depth (default: 0)')
@main.command("create")
@click.argument("urls", nargs=-1)
@click.option("--tag", "-t", default="", help="Comma-separated tags to add")
@click.option("--status", "-s", default="queued", help="Initial status (default: queued)")
@click.option("--depth", "-d", type=int, default=0, help="Crawl depth (default: 0)")
def create_cmd(urls: tuple, tag: str, status: str, depth: int):
"""Create Snapshots from URLs or stdin JSONL."""
sys.exit(create_snapshots(urls, tag=tag, status=status, depth=depth))
@main.command('list')
@click.option('--status', '-s', help='Filter by status (queued, started, sealed)')
@click.option('--url__icontains', help='Filter by URL contains')
@click.option('--url__istartswith', help='Filter by URL starts with')
@click.option('--tag', '-t', help='Filter by tag name')
@click.option('--crawl-id', help='Filter by crawl ID')
@click.option('--limit', '-n', type=int, help='Limit number of results')
def list_cmd(status: Optional[str], url__icontains: Optional[str], url__istartswith: Optional[str],
tag: Optional[str], crawl_id: Optional[str], limit: Optional[int]):
@main.command("list")
@click.option("--status", "-s", help="Filter by status (queued, started, sealed)")
@click.option("--url__icontains", help="Filter by URL contains")
@click.option("--url__istartswith", help="Filter by URL starts with")
@click.option("--tag", "-t", help="Filter by tag name")
@click.option("--crawl-id", help="Filter by crawl ID")
@click.option("--limit", "-n", type=int, help="Limit number of results")
@click.option("--sort", "-o", type=str, help="Field to sort by, e.g. url, created_at, bookmarked_at, downloaded_at")
@click.option("--csv", "-C", type=str, help="Print output as CSV with the provided fields, e.g.: timestamp,url,title")
@click.option("--with-headers", is_flag=True, help="Include column headers in structured output")
@click.option("--search", type=click.Choice(["meta", "content", "contents", "deep"]), help="Search mode to use for the query")
@click.argument("query", nargs=-1)
def list_cmd(
status: str | None,
url__icontains: str | None,
url__istartswith: str | None,
tag: str | None,
crawl_id: str | None,
limit: int | None,
sort: str | None,
csv: str | None,
with_headers: bool,
search: str | None,
query: tuple[str, ...],
):
"""List Snapshots as JSONL."""
sys.exit(list_snapshots(
status=status,
url__icontains=url__icontains,
url__istartswith=url__istartswith,
tag=tag,
crawl_id=crawl_id,
limit=limit,
))
sys.exit(
list_snapshots(
status=status,
url__icontains=url__icontains,
url__istartswith=url__istartswith,
tag=tag,
crawl_id=crawl_id,
limit=limit,
sort=sort,
csv=csv,
with_headers=with_headers,
search=search,
query=" ".join(query),
),
)
@main.command('update')
@click.option('--status', '-s', help='Set status')
@click.option('--tag', '-t', help='Add tag')
def update_cmd(status: Optional[str], tag: Optional[str]):
@main.command("update")
@click.option("--status", "-s", help="Set status")
@click.option("--tag", "-t", help="Add tag")
def update_cmd(status: str | None, tag: str | None):
"""Update Snapshots from stdin JSONL."""
sys.exit(update_snapshots(status=status, tag=tag))
@main.command('delete')
@click.option('--yes', '-y', is_flag=True, help='Confirm deletion')
@click.option('--dry-run', is_flag=True, help='Show what would be deleted')
@main.command("delete")
@click.option("--yes", "-y", is_flag=True, help="Confirm deletion")
@click.option("--dry-run", is_flag=True, help="Show what would be deleted")
def delete_cmd(yes: bool, dry_run: bool):
"""Delete Snapshots from stdin JSONL."""
sys.exit(delete_snapshots(yes=yes, dry_run=dry_run))
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -1,7 +1,7 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__command__ = 'archivebox snapshot'
__package__ = "archivebox.cli"
__command__ = "archivebox snapshot"
import sys
@@ -10,15 +10,15 @@ import rich_click as click
from archivebox.cli.archivebox_snapshot import create_snapshots
@click.command(context_settings={'ignore_unknown_options': True})
@click.option('--tag', '-t', default='', help='Comma-separated tags to add')
@click.option('--status', '-s', default='queued', help='Initial status (default: queued)')
@click.option('--depth', '-d', type=int, default=0, help='Crawl depth (default: 0)')
@click.argument('urls', nargs=-1)
@click.command(context_settings={"ignore_unknown_options": True})
@click.option("--tag", "-t", default="", help="Comma-separated tags to add")
@click.option("--status", "-s", default="queued", help="Initial status (default: queued)")
@click.option("--depth", "-d", type=int, default=0, help="Crawl depth (default: 0)")
@click.argument("urls", nargs=-1)
def main(tag: str, status: str, depth: int, urls: tuple[str, ...]):
"""Backwards-compatible `archivebox snapshot URL...` entrypoint."""
sys.exit(create_snapshots(urls, tag=tag, status=status, depth=depth))
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -1,6 +1,6 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__package__ = "archivebox.cli"
from pathlib import Path
@@ -16,31 +16,34 @@ from archivebox.misc.logging_util import printable_filesize
@enforce_types
def status(out_dir: Path=DATA_DIR) -> None:
def status(out_dir: Path = DATA_DIR) -> None:
"""Print out some info and statistics about the archive collection"""
from django.contrib.auth import get_user_model
from django.db.models import Sum
from django.db.models.functions import Coalesce
from archivebox.core.models import Snapshot
User = get_user_model()
print('[green]\\[*] Scanning archive main index...[/green]')
print(f'[yellow] {out_dir}/*[/yellow]')
num_bytes, num_dirs, num_files = get_dir_size(out_dir, recursive=False, pattern='index.')
print("[green]\\[*] Scanning archive main index...[/green]")
print(f"[yellow] {out_dir}/*[/yellow]")
num_bytes, num_dirs, num_files = get_dir_size(out_dir, recursive=False, pattern="index.")
size = printable_filesize(num_bytes)
print(f' Index size: {size} across {num_files} files')
print(f" Index size: {size} across {num_files} files")
print()
links = list(Snapshot.objects.all())
links = list(Snapshot.objects.annotate(output_size_sum=Coalesce(Sum("archiveresult__output_size"), 0)))
num_sql_links = len(links)
num_link_details = sum(1 for link in parse_json_links_details(out_dir=out_dir))
print(f' > SQL Main Index: {num_sql_links} links'.ljust(36), f'(found in {CONSTANTS.SQL_INDEX_FILENAME})')
print(f' > JSON Link Details: {num_link_details} links'.ljust(36), f'(found in {ARCHIVE_DIR.name}/*/index.json)')
print(f" > SQL Main Index: {num_sql_links} links".ljust(36), f"(found in {CONSTANTS.SQL_INDEX_FILENAME})")
print(f" > JSON Link Details: {num_link_details} links".ljust(36), f"(found in {ARCHIVE_DIR.name}/*/index.json)")
print()
print('[green]\\[*] Scanning archive data directories...[/green]')
users_dir = out_dir / 'users'
print("[green]\\[*] Scanning archive data directories...[/green]")
users_dir = out_dir / "users"
scan_roots = [root for root in (ARCHIVE_DIR, users_dir) if root.exists()]
scan_roots_display = ', '.join(str(root) for root in scan_roots) if scan_roots else str(ARCHIVE_DIR)
print(f'[yellow] {scan_roots_display}[/yellow]')
scan_roots_display = ", ".join(str(root) for root in scan_roots) if scan_roots else str(ARCHIVE_DIR)
print(f"[yellow] {scan_roots_display}[/yellow]")
num_bytes = num_dirs = num_files = 0
for root in scan_roots:
root_bytes, root_dirs, root_files = get_dir_size(root)
@@ -48,80 +51,66 @@ def status(out_dir: Path=DATA_DIR) -> None:
num_dirs += root_dirs
num_files += root_files
size = printable_filesize(num_bytes)
print(f' Size: {size} across {num_files} files in {num_dirs} directories')
print(f" Size: {size} across {num_files} files in {num_dirs} directories")
# Use DB as source of truth for snapshot status
num_indexed = len(links)
num_archived = sum(1 for snapshot in links if snapshot.is_archived)
num_unarchived = max(num_indexed - num_archived, 0)
print(f' > indexed: {num_indexed}'.ljust(36), '(total snapshots in DB)')
print(f' > archived: {num_archived}'.ljust(36), '(snapshots with archived content)')
print(f' > unarchived: {num_unarchived}'.ljust(36), '(snapshots pending archiving)')
print(f" > indexed: {num_indexed}".ljust(36), "(total snapshots in DB)")
print(f" > archived: {num_archived}".ljust(36), "(snapshots with archived content)")
print(f" > unarchived: {num_unarchived}".ljust(36), "(snapshots pending archiving)")
# Count snapshot directories on filesystem across both legacy and current layouts.
expected_snapshot_dirs = {
str(Path(snapshot.output_dir).resolve())
for snapshot in links
if Path(snapshot.output_dir).exists()
}
expected_snapshot_dirs = {str(Path(snapshot.output_dir).resolve()) for snapshot in links if Path(snapshot.output_dir).exists()}
discovered_snapshot_dirs = set()
if ARCHIVE_DIR.exists():
discovered_snapshot_dirs.update(
str(entry.resolve())
for entry in ARCHIVE_DIR.iterdir()
if entry.is_dir()
)
discovered_snapshot_dirs.update(str(entry.resolve()) for entry in ARCHIVE_DIR.iterdir() if entry.is_dir())
if users_dir.exists():
discovered_snapshot_dirs.update(
str(entry.resolve())
for entry in users_dir.glob('*/snapshots/*/*/*')
if entry.is_dir()
)
discovered_snapshot_dirs.update(str(entry.resolve()) for entry in users_dir.glob("*/snapshots/*/*/*") if entry.is_dir())
orphaned_dirs = sorted(discovered_snapshot_dirs - expected_snapshot_dirs)
num_present = len(discovered_snapshot_dirs)
num_valid = len(discovered_snapshot_dirs & expected_snapshot_dirs)
print()
print(f' > present: {num_present}'.ljust(36), '(snapshot directories on disk)')
print(f' > [green]valid:[/green] {num_valid}'.ljust(36), ' (directories with matching DB entry)')
print(f" > present: {num_present}".ljust(36), "(snapshot directories on disk)")
print(f" > [green]valid:[/green] {num_valid}".ljust(36), " (directories with matching DB entry)")
num_orphaned = len(orphaned_dirs)
print(f' > [red]orphaned:[/red] {num_orphaned}'.ljust(36), ' (directories without matching DB entry)')
print(f" > [red]orphaned:[/red] {num_orphaned}".ljust(36), " (directories without matching DB entry)")
if num_indexed:
print(' [violet]Hint:[/violet] You can list snapshots by status like so:')
print(' [green]archivebox list --status=<status> (e.g. archived, queued, etc.)[/green]')
print(" [violet]Hint:[/violet] You can list snapshots by status like so:")
print(" [green]archivebox list --status=<status> (e.g. archived, queued, etc.)[/green]")
if orphaned_dirs:
print(' [violet]Hint:[/violet] To automatically import orphaned data directories into the main index, run:')
print(' [green]archivebox init[/green]')
print(" [violet]Hint:[/violet] To automatically import orphaned data directories into the main index, run:")
print(" [green]archivebox init[/green]")
print()
print('[green]\\[*] Scanning recent archive changes and user logins:[/green]')
print(f'[yellow] {CONSTANTS.LOGS_DIR}/*[/yellow]')
admin_users = User.objects.filter(is_superuser=True).exclude(username='system')
print("[green]\\[*] Scanning recent archive changes and user logins:[/green]")
print(f"[yellow] {CONSTANTS.LOGS_DIR}/*[/yellow]")
admin_users = User.objects.filter(is_superuser=True).exclude(username="system")
users = [user.get_username() for user in admin_users]
print(f' UI users {len(users)}: {", ".join(users)}')
last_login = admin_users.order_by('last_login').last()
print(f" UI users {len(users)}: {', '.join(users)}")
last_login = admin_users.order_by("last_login").last()
if last_login:
print(f' Last UI login: {last_login.get_username()} @ {str(last_login.last_login)[:16]}')
last_downloaded = Snapshot.objects.order_by('downloaded_at').last()
print(f" Last UI login: {last_login.get_username()} @ {str(last_login.last_login)[:16]}")
last_downloaded = Snapshot.objects.order_by("downloaded_at").last()
if last_downloaded:
print(f' Last changes: {str(last_downloaded.downloaded_at)[:16]}')
print(f" Last changes: {str(last_downloaded.downloaded_at)[:16]}")
if not users:
print()
print(' [violet]Hint:[/violet] You can create an admin user by running:')
print(' [green]archivebox manage createsuperuser[/green]')
print(" [violet]Hint:[/violet] You can create an admin user by running:")
print(" [green]archivebox manage createsuperuser[/green]")
print()
recent_snapshots = sorted(
links,
key=lambda snapshot: (
snapshot.downloaded_at or snapshot.modified_at or snapshot.created_at
),
key=lambda snapshot: snapshot.downloaded_at or snapshot.modified_at or snapshot.created_at,
reverse=True,
)[:10]
for snapshot in recent_snapshots:
@@ -129,14 +118,14 @@ def status(out_dir: Path=DATA_DIR) -> None:
continue
print(
(
'[grey53] '
f' > {str(snapshot.downloaded_at)[:16]} '
f'[{snapshot.num_outputs} {("X", "")[snapshot.is_archived]} {printable_filesize(snapshot.archive_size)}] '
"[grey53] "
f" > {str(snapshot.downloaded_at)[:16]} "
f"[{snapshot.num_outputs} {('X', '')[snapshot.is_archived]} {printable_filesize(snapshot.archive_size)}] "
f'"{snapshot.title}": {snapshot.url}'
'[/grey53]'
)[:SHELL_CONFIG.TERM_WIDTH],
"[/grey53]"
)[: SHELL_CONFIG.TERM_WIDTH],
)
print('[grey53] ...')
print("[grey53] ...")
@click.command()
@@ -146,5 +135,5 @@ def main(**kwargs):
status(**kwargs)
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -27,11 +27,11 @@ Examples:
archivebox tag list --name=unused | archivebox tag delete --yes
"""
__package__ = 'archivebox.cli'
__command__ = 'archivebox tag'
__package__ = "archivebox.cli"
__command__ = "archivebox tag"
import sys
from typing import Optional, Iterable
from collections.abc import Iterable
import rich_click as click
from rich import print as rprint
@@ -43,6 +43,7 @@ from archivebox.cli.cli_utils import apply_filters
# CREATE
# =============================================================================
def create_tags(names: Iterable[str]) -> int:
"""
Create Tags from names.
@@ -60,7 +61,7 @@ def create_tags(names: Iterable[str]) -> int:
name_list = list(names) if names else []
if not name_list:
rprint('[yellow]No tag names provided. Pass names as arguments.[/yellow]', file=sys.stderr)
rprint("[yellow]No tag names provided. Pass names as arguments.[/yellow]", file=sys.stderr)
return 1
created_count = 0
@@ -76,11 +77,11 @@ def create_tags(names: Iterable[str]) -> int:
if created:
created_count += 1
rprint(f'[green]Created tag: {name}[/green]', file=sys.stderr)
rprint(f"[green]Created tag: {name}[/green]", file=sys.stderr)
else:
rprint(f'[dim]Tag already exists: {name}[/dim]', file=sys.stderr)
rprint(f"[dim]Tag already exists: {name}[/dim]", file=sys.stderr)
rprint(f'[green]Created {created_count} new tags[/green]', file=sys.stderr)
rprint(f"[green]Created {created_count} new tags[/green]", file=sys.stderr)
return 0
@@ -88,10 +89,11 @@ def create_tags(names: Iterable[str]) -> int:
# LIST
# =============================================================================
def list_tags(
name: Optional[str] = None,
name__icontains: Optional[str] = None,
limit: Optional[int] = None,
name: str | None = None,
name__icontains: str | None = None,
limit: int | None = None,
) -> int:
"""
List Tags as JSONL with optional filters.
@@ -104,12 +106,12 @@ def list_tags(
is_tty = sys.stdout.isatty()
queryset = Tag.objects.all().order_by('name')
queryset = Tag.objects.all().order_by("name")
# Apply filters
filter_kwargs = {
'name': name,
'name__icontains': name__icontains,
"name": name,
"name__icontains": name__icontains,
}
queryset = apply_filters(queryset, filter_kwargs, limit=limit)
@@ -117,12 +119,12 @@ def list_tags(
for tag in queryset:
snapshot_count = tag.snapshot_set.count()
if is_tty:
rprint(f'[cyan]{tag.name:30}[/cyan] [dim]({snapshot_count} snapshots)[/dim]')
rprint(f"[cyan]{tag.name:30}[/cyan] [dim]({snapshot_count} snapshots)[/dim]")
else:
write_record(tag.to_json())
count += 1
rprint(f'[dim]Listed {count} tags[/dim]', file=sys.stderr)
rprint(f"[dim]Listed {count} tags[/dim]", file=sys.stderr)
return 0
@@ -130,7 +132,8 @@ def list_tags(
# UPDATE
# =============================================================================
def update_tags(name: Optional[str] = None) -> int:
def update_tags(name: str | None = None) -> int:
"""
Update Tags from stdin JSONL.
@@ -148,13 +151,13 @@ def update_tags(name: Optional[str] = None) -> int:
records = list(read_stdin())
if not records:
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
return 1
updated_count = 0
for record in records:
tag_id = record.get('id')
old_name = record.get('name')
tag_id = record.get("id")
old_name = record.get("name")
if not tag_id and not old_name:
continue
@@ -176,10 +179,10 @@ def update_tags(name: Optional[str] = None) -> int:
write_record(tag.to_json())
except Tag.DoesNotExist:
rprint(f'[yellow]Tag not found: {tag_id or old_name}[/yellow]', file=sys.stderr)
rprint(f"[yellow]Tag not found: {tag_id or old_name}[/yellow]", file=sys.stderr)
continue
rprint(f'[green]Updated {updated_count} tags[/green]', file=sys.stderr)
rprint(f"[green]Updated {updated_count} tags[/green]", file=sys.stderr)
return 0
@@ -187,6 +190,7 @@ def update_tags(name: Optional[str] = None) -> int:
# DELETE
# =============================================================================
def delete_tags(yes: bool = False, dry_run: bool = False) -> int:
"""
Delete Tags from stdin JSONL.
@@ -202,23 +206,24 @@ def delete_tags(yes: bool = False, dry_run: bool = False) -> int:
records = list(read_stdin())
if not records:
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
return 1
# Collect tag IDs or names
tag_ids = []
tag_names = []
for r in records:
if r.get('id'):
tag_ids.append(r['id'])
elif r.get('name'):
tag_names.append(r['name'])
if r.get("id"):
tag_ids.append(r["id"])
elif r.get("name"):
tag_names.append(r["name"])
if not tag_ids and not tag_names:
rprint('[yellow]No valid tag IDs or names in input[/yellow]', file=sys.stderr)
rprint("[yellow]No valid tag IDs or names in input[/yellow]", file=sys.stderr)
return 1
from django.db.models import Q
query = Q()
if tag_ids:
query |= Q(id__in=tag_ids)
@@ -229,22 +234,22 @@ def delete_tags(yes: bool = False, dry_run: bool = False) -> int:
count = tags.count()
if count == 0:
rprint('[yellow]No matching tags found[/yellow]', file=sys.stderr)
rprint("[yellow]No matching tags found[/yellow]", file=sys.stderr)
return 0
if dry_run:
rprint(f'[yellow]Would delete {count} tags (dry run)[/yellow]', file=sys.stderr)
rprint(f"[yellow]Would delete {count} tags (dry run)[/yellow]", file=sys.stderr)
for tag in tags:
rprint(f' {tag.name}', file=sys.stderr)
rprint(f" {tag.name}", file=sys.stderr)
return 0
if not yes:
rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr)
rprint("[red]Use --yes to confirm deletion[/red]", file=sys.stderr)
return 1
# Perform deletion
deleted_count, _ = tags.delete()
rprint(f'[green]Deleted {deleted_count} tags[/green]', file=sys.stderr)
rprint(f"[green]Deleted {deleted_count} tags[/green]", file=sys.stderr)
return 0
@@ -252,42 +257,43 @@ def delete_tags(yes: bool = False, dry_run: bool = False) -> int:
# CLI Commands
# =============================================================================
@click.group()
def main():
"""Manage Tag records."""
pass
@main.command('create')
@click.argument('names', nargs=-1)
@main.command("create")
@click.argument("names", nargs=-1)
def create_cmd(names: tuple):
"""Create Tags from names."""
sys.exit(create_tags(names))
@main.command('list')
@click.option('--name', help='Filter by exact name')
@click.option('--name__icontains', help='Filter by name contains')
@click.option('--limit', '-n', type=int, help='Limit number of results')
def list_cmd(name: Optional[str], name__icontains: Optional[str], limit: Optional[int]):
@main.command("list")
@click.option("--name", help="Filter by exact name")
@click.option("--name__icontains", help="Filter by name contains")
@click.option("--limit", "-n", type=int, help="Limit number of results")
def list_cmd(name: str | None, name__icontains: str | None, limit: int | None):
"""List Tags as JSONL."""
sys.exit(list_tags(name=name, name__icontains=name__icontains, limit=limit))
@main.command('update')
@click.option('--name', '-n', help='Set new name')
def update_cmd(name: Optional[str]):
@main.command("update")
@click.option("--name", "-n", help="Set new name")
def update_cmd(name: str | None):
"""Update Tags from stdin JSONL."""
sys.exit(update_tags(name=name))
@main.command('delete')
@click.option('--yes', '-y', is_flag=True, help='Confirm deletion')
@click.option('--dry-run', is_flag=True, help='Show what would be deleted')
@main.command("delete")
@click.option("--yes", "-y", is_flag=True, help="Confirm deletion")
@click.option("--dry-run", is_flag=True, help="Show what would be deleted")
def delete_cmd(yes: bool, dry_run: bool):
"""Delete Tags from stdin JSONL."""
sys.exit(delete_tags(yes=yes, dry_run=dry_run))
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -1,11 +1,12 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__package__ = "archivebox.cli"
import os
import time
from typing import TYPE_CHECKING, Callable, Iterable
from typing import TYPE_CHECKING, Any
from collections.abc import Callable, Iterable
from pathlib import Path
import rich_click as click
@@ -20,24 +21,22 @@ if TYPE_CHECKING:
LINK_FILTERS: dict[str, Callable[[str], Q]] = {
'exact': lambda pattern: Q(url=pattern),
'substring': lambda pattern: Q(url__icontains=pattern),
'regex': lambda pattern: Q(url__iregex=pattern),
'domain': lambda pattern: (
Q(url__istartswith=f'http://{pattern}')
| Q(url__istartswith=f'https://{pattern}')
| Q(url__istartswith=f'ftp://{pattern}')
"exact": lambda pattern: Q(url=pattern),
"substring": lambda pattern: Q(url__icontains=pattern),
"regex": lambda pattern: Q(url__iregex=pattern),
"domain": lambda pattern: (
Q(url__istartswith=f"http://{pattern}") | Q(url__istartswith=f"https://{pattern}") | Q(url__istartswith=f"ftp://{pattern}")
),
'tag': lambda pattern: Q(tags__name=pattern),
'timestamp': lambda pattern: Q(timestamp=pattern),
"tag": lambda pattern: Q(tags__name=pattern),
"timestamp": lambda pattern: Q(timestamp=pattern),
}
def _apply_pattern_filters(
snapshots: QuerySet['Snapshot', 'Snapshot'],
snapshots: QuerySet["Snapshot", "Snapshot"],
filter_patterns: list[str],
filter_type: str,
) -> QuerySet['Snapshot', 'Snapshot']:
) -> QuerySet["Snapshot", "Snapshot"]:
filter_builder = LINK_FILTERS.get(filter_type)
if filter_builder is None:
raise SystemExit(2)
@@ -48,21 +47,120 @@ def _apply_pattern_filters(
return snapshots.filter(query)
def _get_snapshot_crawl(snapshot: 'Snapshot') -> 'Crawl | None':
def _get_snapshot_crawl(snapshot: "Snapshot") -> "Crawl | None":
try:
return snapshot.crawl
except ObjectDoesNotExist:
return None
def _get_search_indexing_plugins() -> list[str]:
from abx_dl.models import discover_plugins
from archivebox.hooks import get_search_backends
available_backends = set(get_search_backends())
plugins = discover_plugins()
return sorted(
plugin_name
for plugin_name, plugin in plugins.items()
if plugin_name.startswith("search_backend_")
and plugin_name.removeprefix("search_backend_") in available_backends
and any("Snapshot" in hook.name and "index" in hook.name.lower() for hook in plugin.hooks)
)
def _build_filtered_snapshots_queryset(
*,
filter_patterns: Iterable[str],
filter_type: str,
before: float | None,
after: float | None,
resume: str | None = None,
):
from archivebox.core.models import Snapshot
from datetime import datetime
snapshots = Snapshot.objects.all()
if filter_patterns:
snapshots = _apply_pattern_filters(snapshots, list(filter_patterns), filter_type)
if before:
snapshots = snapshots.filter(bookmarked_at__lt=datetime.fromtimestamp(before))
if after:
snapshots = snapshots.filter(bookmarked_at__gt=datetime.fromtimestamp(after))
if resume:
snapshots = snapshots.filter(timestamp__lte=resume)
return snapshots.select_related("crawl").order_by("-bookmarked_at")
def reindex_snapshots(
snapshots: QuerySet["Snapshot", "Snapshot"],
*,
search_plugins: list[str],
batch_size: int,
) -> dict[str, int]:
from archivebox.cli.archivebox_extract import run_plugins
stats = {"processed": 0, "reconciled": 0, "queued": 0, "reindexed": 0}
records: list[dict[str, str]] = []
total = snapshots.count()
print(f"[*] Reindexing {total} snapshots with search plugins: {', '.join(search_plugins)}")
for snapshot in snapshots.iterator(chunk_size=batch_size):
stats["processed"] += 1
if _get_snapshot_crawl(snapshot) is None:
continue
output_dir = Path(snapshot.output_dir)
has_directory = output_dir.exists() and output_dir.is_dir()
if has_directory:
snapshot.reconcile_with_index_json()
stats["reconciled"] += 1
for plugin_name in search_plugins:
existing_result = snapshot.archiveresult_set.filter(plugin=plugin_name).order_by("-created_at").first()
if existing_result:
existing_result.reset_for_retry()
records.append(
{
"type": "ArchiveResult",
"snapshot_id": str(snapshot.id),
"plugin": plugin_name,
},
)
stats["queued"] += 1
if not records:
return stats
exit_code = run_plugins(
args=(),
records=records,
wait=True,
emit_results=False,
)
if exit_code != 0:
raise SystemExit(exit_code)
stats["reindexed"] = len(records)
return stats
@enforce_types
def update(filter_patterns: Iterable[str] = (),
filter_type: str = 'exact',
before: float | None = None,
after: float | None = None,
resume: str | None = None,
batch_size: int = 100,
continuous: bool = False) -> None:
def update(
filter_patterns: Iterable[str] = (),
filter_type: str = "exact",
before: float | None = None,
after: float | None = None,
resume: str | None = None,
batch_size: int = 100,
continuous: bool = False,
index_only: bool = False,
) -> None:
"""
Update snapshots: migrate old dirs, reconcile DB, and re-queue for archiving.
@@ -77,41 +175,69 @@ def update(filter_patterns: Iterable[str] = (),
from rich import print
from archivebox.config.django import setup_django
setup_django()
from django.core.management import call_command
# Run migrations first to ensure DB schema is up-to-date
print('[*] Checking for pending migrations...')
print("[*] Checking for pending migrations...")
try:
call_command('migrate', '--no-input', verbosity=0)
call_command("migrate", "--no-input", verbosity=0)
except Exception as e:
print(f'[!] Warning: Migration check failed: {e}')
print(f"[!] Warning: Migration check failed: {e}")
while True:
if filter_patterns or before or after:
if index_only:
search_plugins = _get_search_indexing_plugins()
if not search_plugins:
print("[*] No search indexing plugins are available, nothing to backfill.")
break
if not (filter_patterns or before or after):
print("[*] Phase 1: Draining old archive/ directories (0.8.x → 0.9.x migration)...")
drain_old_archive_dirs(
resume_from=resume,
batch_size=batch_size,
)
snapshots = _build_filtered_snapshots_queryset(
filter_patterns=filter_patterns,
filter_type=filter_type,
before=before,
after=after,
resume=resume,
)
stats = reindex_snapshots(
snapshots,
search_plugins=search_plugins,
batch_size=batch_size,
)
print_index_stats(stats)
elif filter_patterns or before or after:
# Filtered mode: query DB only
print('[*] Processing filtered snapshots from database...')
print("[*] Processing filtered snapshots from database...")
stats = process_filtered_snapshots(
filter_patterns=filter_patterns,
filter_type=filter_type,
before=before,
after=after,
batch_size=batch_size
resume=resume,
batch_size=batch_size,
)
print_stats(stats)
else:
# Full mode: drain old dirs + process DB
stats_combined = {'phase1': {}, 'phase2': {}}
stats_combined = {"phase1": {}, "phase2": {}}
print('[*] Phase 1: Draining old archive/ directories (0.8.x → 0.9.x migration)...')
stats_combined['phase1'] = drain_old_archive_dirs(
print("[*] Phase 1: Draining old archive/ directories (0.8.x → 0.9.x migration)...")
stats_combined["phase1"] = drain_old_archive_dirs(
resume_from=resume,
batch_size=batch_size
batch_size=batch_size,
)
print('[*] Phase 2: Processing all database snapshots (most recent first)...')
stats_combined['phase2'] = process_all_db_snapshots(batch_size=batch_size)
print("[*] Phase 2: Processing all database snapshots (most recent first)...")
stats_combined["phase2"] = process_all_db_snapshots(batch_size=batch_size, resume=resume)
# Phase 3: Deduplication (disabled for now)
# print('[*] Phase 3: Deduplicating...')
@@ -122,7 +248,7 @@ def update(filter_patterns: Iterable[str] = (),
if not continuous:
break
print('[yellow]Sleeping 60s before next pass...[/yellow]')
print("[yellow]Sleeping 60s before next pass...[/yellow]")
time.sleep(60)
resume = None
@@ -144,34 +270,34 @@ def drain_old_archive_dirs(resume_from: str | None = None, batch_size: int = 100
from archivebox.config import CONSTANTS
from django.db import transaction
stats = {'processed': 0, 'migrated': 0, 'skipped': 0, 'invalid': 0}
stats = {"processed": 0, "migrated": 0, "skipped": 0, "invalid": 0}
archive_dir = CONSTANTS.ARCHIVE_DIR
if not archive_dir.exists():
return stats
print('[DEBUG Phase1] Scanning for old directories in archive/...')
print("[DEBUG Phase1] Scanning for old directories in archive/...")
# Scan for real directories only (skip symlinks - they're already migrated)
all_entries = list(os.scandir(archive_dir))
print(f'[DEBUG Phase1] Total entries in archive/: {len(all_entries)}')
print(f"[DEBUG Phase1] Total entries in archive/: {len(all_entries)}")
entries = [
(e.stat().st_mtime, e.path)
for e in all_entries
if e.is_dir(follow_symlinks=False) # Skip symlinks
]
entries.sort(reverse=True) # Newest first
print(f'[DEBUG Phase1] Real directories (not symlinks): {len(entries)}')
print(f'[*] Found {len(entries)} old directories to drain')
print(f"[DEBUG Phase1] Real directories (not symlinks): {len(entries)}")
print(f"[*] Found {len(entries)} old directories to drain")
for mtime, entry_path in entries:
entry_path = Path(entry_path)
# Resume from timestamp if specified
if resume_from and entry_path.name < resume_from:
if resume_from and entry_path.name > resume_from:
continue
stats['processed'] += 1
stats["processed"] += 1
# Try to load existing snapshot from DB
snapshot = Snapshot.load_from_directory(entry_path)
@@ -182,16 +308,16 @@ def drain_old_archive_dirs(resume_from: str | None = None, batch_size: int = 100
if not snapshot:
# Invalid directory - move to invalid/
Snapshot.move_directory_to_invalid(entry_path)
stats['invalid'] += 1
stats["invalid"] += 1
print(f" [{stats['processed']}] Invalid: {entry_path.name}")
continue
try:
snapshot.save()
stats['migrated'] += 1
stats["migrated"] += 1
print(f" [{stats['processed']}] Imported orphaned snapshot: {entry_path.name}")
except Exception as e:
stats['skipped'] += 1
stats["skipped"] += 1
print(f" [{stats['processed']}] Skipped (error: {e}): {entry_path.name}")
continue
@@ -201,30 +327,35 @@ def drain_old_archive_dirs(resume_from: str | None = None, batch_size: int = 100
if not has_valid_crawl:
# Create a new crawl (created_by will default to system user)
from archivebox.crawls.models import Crawl
crawl = Crawl.objects.create(urls=snapshot.url)
# Use queryset update to avoid triggering save() hooks
from archivebox.core.models import Snapshot as SnapshotModel
SnapshotModel.objects.filter(pk=snapshot.pk).update(crawl=crawl)
# Refresh the instance
snapshot.crawl = crawl
print(f"[DEBUG Phase1] Created missing crawl for snapshot {str(snapshot.id)[:8]}")
# Check if needs migration (0.8.x → 0.9.x)
print(f"[DEBUG Phase1] Snapshot {str(snapshot.id)[:8]}: fs_version={snapshot.fs_version}, needs_migration={snapshot.fs_migration_needed}")
print(
f"[DEBUG Phase1] Snapshot {str(snapshot.id)[:8]}: fs_version={snapshot.fs_version}, needs_migration={snapshot.fs_migration_needed}",
)
if snapshot.fs_migration_needed:
try:
# Calculate paths using actual directory (entry_path), not snapshot.timestamp
# because snapshot.timestamp might be truncated
old_dir = entry_path
new_dir = snapshot.get_storage_path_for_version('0.9.0')
new_dir = snapshot.get_storage_path_for_version("0.9.0")
print(f"[DEBUG Phase1] Migrating {old_dir.name}{new_dir}")
# Manually migrate files
if not new_dir.exists() and old_dir.exists():
new_dir.mkdir(parents=True, exist_ok=True)
import shutil
file_count = 0
for old_file in old_dir.rglob('*'):
for old_file in old_dir.rglob("*"):
if old_file.is_file():
rel_path = old_file.relative_to(old_dir)
new_file = new_dir / rel_path
@@ -236,7 +367,8 @@ def drain_old_archive_dirs(resume_from: str | None = None, batch_size: int = 100
# Update only fs_version field using queryset update (bypasses validation)
from archivebox.core.models import Snapshot as SnapshotModel
SnapshotModel.objects.filter(pk=snapshot.pk).update(fs_version='0.9.0')
SnapshotModel.objects.filter(pk=snapshot.pk).update(fs_version="0.9.0")
# Commit the transaction
transaction.commit()
@@ -245,22 +377,22 @@ def drain_old_archive_dirs(resume_from: str | None = None, batch_size: int = 100
if old_dir.exists() and old_dir != new_dir:
snapshot._cleanup_old_migration_dir(old_dir, new_dir)
stats['migrated'] += 1
stats["migrated"] += 1
print(f" [{stats['processed']}] Migrated: {entry_path.name}")
except Exception as e:
stats['skipped'] += 1
stats["skipped"] += 1
print(f" [{stats['processed']}] Skipped (error: {e}): {entry_path.name}")
else:
stats['skipped'] += 1
stats["skipped"] += 1
if stats['processed'] % batch_size == 0:
if stats["processed"] % batch_size == 0:
transaction.commit()
transaction.commit()
return stats
def process_all_db_snapshots(batch_size: int = 100) -> dict[str, int]:
def process_all_db_snapshots(batch_size: int = 100, resume: str | None = None) -> dict[str, int]:
"""
O(n) scan over entire DB from most recent to least recent.
@@ -275,24 +407,30 @@ def process_all_db_snapshots(batch_size: int = 100) -> dict[str, int]:
from django.db import transaction
from django.utils import timezone
stats = {'processed': 0, 'reconciled': 0, 'queued': 0}
stats = {"processed": 0, "reconciled": 0, "queued": 0}
total = Snapshot.objects.count()
print(f'[*] Processing {total} snapshots from database (most recent first)...')
queryset = Snapshot.objects.all()
if resume:
queryset = queryset.filter(timestamp__lte=resume)
total = queryset.count()
print(f"[*] Processing {total} snapshots from database (most recent first)...")
# Process from most recent to least recent
for snapshot in Snapshot.objects.select_related('crawl').order_by('-bookmarked_at').iterator(chunk_size=batch_size):
stats['processed'] += 1
for snapshot in queryset.select_related("crawl").order_by("-bookmarked_at").iterator(chunk_size=batch_size):
stats["processed"] += 1
# Skip snapshots with missing crawl references (orphaned by migration errors)
if _get_snapshot_crawl(snapshot) is None:
continue
try:
print(f"[DEBUG Phase2] Snapshot {str(snapshot.id)[:8]}: fs_version={snapshot.fs_version}, needs_migration={snapshot.fs_migration_needed}")
print(
f"[DEBUG Phase2] Snapshot {str(snapshot.id)[:8]}: fs_version={snapshot.fs_version}, needs_migration={snapshot.fs_migration_needed}",
)
# Check if snapshot has a directory on disk
from pathlib import Path
output_dir = Path(snapshot.output_dir)
has_directory = output_dir.exists() and output_dir.is_dir()
@@ -313,22 +451,23 @@ def process_all_db_snapshots(batch_size: int = 100) -> dict[str, int]:
print(f"[DEBUG Phase2] Orphan snapshot {str(snapshot.id)[:8]} - marking as migrated without filesystem operation")
# Use queryset update to set fs_version without triggering save() hooks
from archivebox.core.models import Snapshot as SnapshotModel
SnapshotModel.objects.filter(pk=snapshot.pk).update(fs_version='0.9.0')
snapshot.fs_version = '0.9.0'
SnapshotModel.objects.filter(pk=snapshot.pk).update(fs_version="0.9.0")
snapshot.fs_version = "0.9.0"
# Queue for archiving (state machine will handle it)
snapshot.status = Snapshot.StatusChoices.QUEUED
snapshot.retry_at = timezone.now()
snapshot.save()
stats['reconciled'] += 1 if has_directory else 0
stats['queued'] += 1
stats["reconciled"] += 1 if has_directory else 0
stats["queued"] += 1
except Exception as e:
# Skip snapshots that can't be processed (e.g., missing crawl)
print(f" [!] Skipping snapshot {snapshot.id}: {e}")
continue
if stats['processed'] % batch_size == 0:
if stats["processed"] % batch_size == 0:
transaction.commit()
print(f" [{stats['processed']}/{total}] Processed...")
@@ -341,31 +480,28 @@ def process_filtered_snapshots(
filter_type: str,
before: float | None,
after: float | None,
batch_size: int
resume: str | None,
batch_size: int,
) -> dict[str, int]:
"""Process snapshots matching filters (DB query only)."""
from archivebox.core.models import Snapshot
from django.db import transaction
from django.utils import timezone
from datetime import datetime
stats = {'processed': 0, 'reconciled': 0, 'queued': 0}
stats = {"processed": 0, "reconciled": 0, "queued": 0}
snapshots = Snapshot.objects.all()
if filter_patterns:
snapshots = _apply_pattern_filters(snapshots, list(filter_patterns), filter_type)
if before:
snapshots = snapshots.filter(bookmarked_at__lt=datetime.fromtimestamp(before))
if after:
snapshots = snapshots.filter(bookmarked_at__gt=datetime.fromtimestamp(after))
snapshots = _build_filtered_snapshots_queryset(
filter_patterns=filter_patterns,
filter_type=filter_type,
before=before,
after=after,
resume=resume,
)
total = snapshots.count()
print(f'[*] Found {total} matching snapshots')
print(f"[*] Found {total} matching snapshots")
for snapshot in snapshots.select_related('crawl').iterator(chunk_size=batch_size):
stats['processed'] += 1
for snapshot in snapshots.select_related("crawl").iterator(chunk_size=batch_size):
stats["processed"] += 1
# Skip snapshots with missing crawl references
if _get_snapshot_crawl(snapshot) is None:
@@ -384,14 +520,14 @@ def process_filtered_snapshots(
snapshot.retry_at = timezone.now()
snapshot.save()
stats['reconciled'] += 1
stats['queued'] += 1
stats["reconciled"] += 1
stats["queued"] += 1
except Exception as e:
# Skip snapshots that can't be processed
print(f" [!] Skipping snapshot {snapshot.id}: {e}")
continue
if stats['processed'] % batch_size == 0:
if stats["processed"] % batch_size == 0:
transaction.commit()
print(f" [{stats['processed']}/{total}] Processed...")
@@ -405,9 +541,9 @@ def print_stats(stats: dict):
print(f"""
[green]Update Complete[/green]
Processed: {stats['processed']}
Reconciled: {stats['reconciled']}
Queued: {stats['queued']}
Processed: {stats["processed"]}
Reconciled: {stats["reconciled"]}
Queued: {stats["queued"]}
""")
@@ -415,37 +551,50 @@ def print_combined_stats(stats_combined: dict):
"""Print statistics for full mode."""
from rich import print
s1 = stats_combined['phase1']
s2 = stats_combined['phase2']
s1 = stats_combined["phase1"]
s2 = stats_combined["phase2"]
print(f"""
[green]Archive Update Complete[/green]
Phase 1 (Drain Old Dirs):
Checked: {s1.get('processed', 0)}
Migrated: {s1.get('migrated', 0)}
Skipped: {s1.get('skipped', 0)}
Invalid: {s1.get('invalid', 0)}
Checked: {s1.get("processed", 0)}
Migrated: {s1.get("migrated", 0)}
Skipped: {s1.get("skipped", 0)}
Invalid: {s1.get("invalid", 0)}
Phase 2 (Process DB):
Processed: {s2.get('processed', 0)}
Reconciled: {s2.get('reconciled', 0)}
Queued: {s2.get('queued', 0)}
Processed: {s2.get("processed", 0)}
Reconciled: {s2.get("reconciled", 0)}
Queued: {s2.get("queued", 0)}
""")
def print_index_stats(stats: dict[str, Any]) -> None:
from rich import print
print(f"""
[green]Search Reindex Complete[/green]
Processed: {stats["processed"]}
Reconciled: {stats["reconciled"]}
Queued: {stats["queued"]}
Reindexed: {stats["reindexed"]}
""")
@click.command()
@click.option('--resume', type=str, help='Resume from timestamp')
@click.option('--before', type=float, help='Only snapshots before timestamp')
@click.option('--after', type=float, help='Only snapshots after timestamp')
@click.option('--filter-type', '-t', type=click.Choice(['exact', 'substring', 'regex', 'domain', 'tag', 'timestamp']), default='exact')
@click.option('--batch-size', type=int, default=100, help='Commit every N snapshots')
@click.option('--continuous', is_flag=True, help='Run continuously as background worker')
@click.argument('filter_patterns', nargs=-1)
@click.option("--resume", type=str, help="Resume from timestamp")
@click.option("--before", type=float, help="Only snapshots before timestamp")
@click.option("--after", type=float, help="Only snapshots after timestamp")
@click.option("--filter-type", "-t", type=click.Choice(["exact", "substring", "regex", "domain", "tag", "timestamp"]), default="exact")
@click.option("--batch-size", type=int, default=100, help="Commit every N snapshots")
@click.option("--continuous", is_flag=True, help="Run continuously as background worker")
@click.option("--index-only", is_flag=True, help="Backfill available search indexes from existing archived content")
@click.argument("filter_patterns", nargs=-1)
@docstring(update.__doc__)
def main(**kwargs):
update(**kwargs)
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -1,12 +1,12 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__package__ = "archivebox.cli"
import sys
import os
import platform
from pathlib import Path
from typing import Iterable
from collections.abc import Iterable
import rich_click as click
@@ -14,19 +14,22 @@ from archivebox.misc.util import docstring, enforce_types
@enforce_types
def version(quiet: bool=False,
binaries: Iterable[str]=()) -> list[str]:
def version(
quiet: bool = False,
binaries: Iterable[str] = (),
) -> list[str]:
"""Print the ArchiveBox version, debug metadata, and installed dependency versions"""
# fast path for just getting the version and exiting, dont do any slower imports
from archivebox.config.version import VERSION
print(VERSION)
if quiet or '--version' in sys.argv:
if quiet or "--version" in sys.argv:
return []
from rich.panel import Panel
from rich.console import Console
from archivebox.config import CONSTANTS, DATA_DIR
from archivebox.config.version import get_COMMIT_HASH, get_BUILD_TIME
from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, RUNNING_AS_UID, RUNNING_AS_GID, IN_DOCKER
@@ -34,78 +37,89 @@ def version(quiet: bool=False,
from archivebox.config.common import SHELL_CONFIG, STORAGE_CONFIG, SEARCH_BACKEND_CONFIG
from archivebox.misc.logging_util import printable_folder_status
from archivebox.config.configset import get_config
console = Console()
prnt = console.print
# Check if LDAP is enabled (simple config lookup)
config = get_config()
LDAP_ENABLED = config.get('LDAP_ENABLED', False)
LDAP_ENABLED = config.get("LDAP_ENABLED", False)
p = platform.uname()
COMMIT_HASH = get_COMMIT_HASH()
prnt(
'[dark_green]ArchiveBox[/dark_green] [dark_goldenrod]v{}[/dark_goldenrod]'.format(CONSTANTS.VERSION),
f'COMMIT_HASH={COMMIT_HASH[:7] if COMMIT_HASH else "unknown"}',
f'BUILD_TIME={get_BUILD_TIME()}',
f"[dark_green]ArchiveBox[/dark_green] [dark_goldenrod]v{CONSTANTS.VERSION}[/dark_goldenrod]",
f"COMMIT_HASH={COMMIT_HASH[:7] if COMMIT_HASH else 'unknown'}",
f"BUILD_TIME={get_BUILD_TIME()}",
)
prnt(
f'IN_DOCKER={IN_DOCKER}',
f'IN_QEMU={SHELL_CONFIG.IN_QEMU}',
f'ARCH={p.machine}',
f'OS={p.system}',
f'PLATFORM={platform.platform()}',
f'PYTHON={sys.implementation.name.title()}' + (' (venv)' if CONSTANTS.IS_INSIDE_VENV else ''),
f"IN_DOCKER={IN_DOCKER}",
f"IN_QEMU={SHELL_CONFIG.IN_QEMU}",
f"ARCH={p.machine}",
f"OS={p.system}",
f"PLATFORM={platform.platform()}",
f"PYTHON={sys.implementation.name.title()}" + (" (venv)" if CONSTANTS.IS_INSIDE_VENV else ""),
)
try:
OUTPUT_IS_REMOTE_FS = get_data_locations().DATA_DIR.is_mount or get_data_locations().ARCHIVE_DIR.is_mount
except Exception:
OUTPUT_IS_REMOTE_FS = False
try:
DATA_DIR_STAT = CONSTANTS.DATA_DIR.stat()
prnt(
f'EUID={os.geteuid()}:{os.getegid()} UID={RUNNING_AS_UID}:{RUNNING_AS_GID} PUID={ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}',
f'FS_UID={DATA_DIR_STAT.st_uid}:{DATA_DIR_STAT.st_gid}',
f'FS_PERMS={STORAGE_CONFIG.OUTPUT_PERMISSIONS}',
f'FS_ATOMIC={STORAGE_CONFIG.ENFORCE_ATOMIC_WRITES}',
f'FS_REMOTE={OUTPUT_IS_REMOTE_FS}',
f"EUID={os.geteuid()}:{os.getegid()} UID={RUNNING_AS_UID}:{RUNNING_AS_GID} PUID={ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}",
f"FS_UID={DATA_DIR_STAT.st_uid}:{DATA_DIR_STAT.st_gid}",
f"FS_PERMS={STORAGE_CONFIG.OUTPUT_PERMISSIONS}",
f"FS_ATOMIC={STORAGE_CONFIG.ENFORCE_ATOMIC_WRITES}",
f"FS_REMOTE={OUTPUT_IS_REMOTE_FS}",
)
except Exception:
prnt(
f'EUID={os.geteuid()}:{os.getegid()} UID={RUNNING_AS_UID}:{RUNNING_AS_GID} PUID={ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}',
f"EUID={os.geteuid()}:{os.getegid()} UID={RUNNING_AS_UID}:{RUNNING_AS_GID} PUID={ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}",
)
prnt(
f'DEBUG={SHELL_CONFIG.DEBUG}',
f'IS_TTY={SHELL_CONFIG.IS_TTY}',
f'SUDO={CONSTANTS.IS_ROOT}',
f'ID={CONSTANTS.MACHINE_ID}:{CONSTANTS.COLLECTION_ID}',
f'SEARCH_BACKEND={SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE}',
f'LDAP={LDAP_ENABLED}',
f"DEBUG={SHELL_CONFIG.DEBUG}",
f"IS_TTY={SHELL_CONFIG.IS_TTY}",
f"SUDO={CONSTANTS.IS_ROOT}",
f"ID={CONSTANTS.MACHINE_ID}:{CONSTANTS.COLLECTION_ID}",
f"SEARCH_BACKEND={SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE}",
f"LDAP={LDAP_ENABLED}",
)
prnt()
if not (os.access(CONSTANTS.ARCHIVE_DIR, os.R_OK) and os.access(CONSTANTS.CONFIG_FILE, os.R_OK)):
PANEL_TEXT = '\n'.join((
'',
'[violet]Hint:[/violet] [green]cd[/green] into a collection [blue]DATA_DIR[/blue] and run [green]archivebox version[/green] again...',
' [grey53]OR[/grey53] run [green]archivebox init[/green] to create a new collection in the current dir.',
'',
' [i][grey53](this is [red]REQUIRED[/red] if you are opening a Github Issue to get help)[/grey53][/i]',
'',
))
prnt(Panel(PANEL_TEXT, expand=False, border_style='grey53', title='[red]:exclamation: No collection [blue]DATA_DIR[/blue] is currently active[/red]', subtitle='Full version info is only available when inside a collection [light_slate_blue]DATA DIR[/light_slate_blue]'))
PANEL_TEXT = "\n".join(
(
"",
"[violet]Hint:[/violet] [green]cd[/green] into a collection [blue]DATA_DIR[/blue] and run [green]archivebox version[/green] again...",
" [grey53]OR[/grey53] run [green]archivebox init[/green] to create a new collection in the current dir.",
"",
" [i][grey53](this is [red]REQUIRED[/red] if you are opening a Github Issue to get help)[/grey53][/i]",
"",
),
)
prnt(
Panel(
PANEL_TEXT,
expand=False,
border_style="grey53",
title="[red]:exclamation: No collection [blue]DATA_DIR[/blue] is currently active[/red]",
subtitle="Full version info is only available when inside a collection [light_slate_blue]DATA DIR[/light_slate_blue]",
),
)
prnt()
return []
prnt('[pale_green1][i] Binary Dependencies:[/pale_green1]')
prnt("[pale_green1][i] Binary Dependencies:[/pale_green1]")
failures = []
# Setup Django before importing models
try:
from archivebox.config.django import setup_django
setup_django()
from archivebox.machine.models import Machine, Binary
@@ -113,12 +127,17 @@ def version(quiet: bool=False,
machine = Machine.current()
# Get all binaries from the database with timeout protection
all_installed = Binary.objects.filter(
machine=machine
).exclude(abspath='').exclude(abspath__isnull=True).order_by('name')
all_installed = (
Binary.objects.filter(
machine=machine,
)
.exclude(abspath="")
.exclude(abspath__isnull=True)
.order_by("name")
)
if not all_installed.exists():
prnt('', '[grey53]No binaries detected. Run [green]archivebox install[/green] to detect dependencies.[/grey53]')
prnt("", "[grey53]No binaries detected. Run [green]archivebox install[/green] to detect dependencies.[/grey53]")
else:
for installed in all_installed:
# Skip if user specified specific binaries and this isn't one
@@ -126,71 +145,91 @@ def version(quiet: bool=False,
continue
if installed.is_valid:
display_path = installed.abspath.replace(str(DATA_DIR), '.').replace(str(Path('~').expanduser()), '~')
version_str = (installed.version or 'unknown')[:15]
provider = (installed.binprovider or 'env')[:8]
prnt('', '[green]√[/green]', '', installed.name.ljust(18), version_str.ljust(16), provider.ljust(8), display_path, overflow='ignore', crop=False)
display_path = installed.abspath.replace(str(DATA_DIR), ".").replace(str(Path("~").expanduser()), "~")
version_str = (installed.version or "unknown")[:15]
provider = (installed.binprovider or "env")[:8]
prnt(
"",
"[green]√[/green]",
"",
installed.name.ljust(18),
version_str.ljust(16),
provider.ljust(8),
display_path,
overflow="ignore",
crop=False,
)
else:
prnt('', '[red]X[/red]', '', installed.name.ljust(18), '[grey53]not installed[/grey53]', overflow='ignore', crop=False)
prnt("", "[red]X[/red]", "", installed.name.ljust(18), "[grey53]not installed[/grey53]", overflow="ignore", crop=False)
failures.append(installed.name)
# Show hint if no binaries are installed yet
has_any_installed = Binary.objects.filter(machine=machine).exclude(abspath='').exists()
has_any_installed = Binary.objects.filter(machine=machine).exclude(abspath="").exists()
if not has_any_installed:
prnt()
prnt('', '[grey53]Run [green]archivebox install[/green] to detect and install dependencies.[/grey53]')
prnt("", "[grey53]Run [green]archivebox install[/green] to detect and install dependencies.[/grey53]")
except Exception as e:
# Handle database errors gracefully (locked, missing, etc.)
prnt()
prnt('', f'[yellow]Warning: Could not query binaries from database: {e}[/yellow]')
prnt('', '[grey53]Run [green]archivebox init[/green] and [green]archivebox install[/green] to set up dependencies.[/grey53]')
prnt("", f"[yellow]Warning: Could not query binaries from database: {e}[/yellow]")
prnt("", "[grey53]Run [green]archivebox init[/green] and [green]archivebox install[/green] to set up dependencies.[/grey53]")
if not binaries:
# Show code and data locations
prnt()
prnt('[deep_sky_blue3][i] Code locations:[/deep_sky_blue3]')
prnt("[deep_sky_blue3][i] Code locations:[/deep_sky_blue3]")
try:
for name, path in get_code_locations().items():
if isinstance(name, str) and isinstance(path, dict):
prnt(printable_folder_status(name, path), overflow='ignore', crop=False)
prnt(printable_folder_status(name, path), overflow="ignore", crop=False)
except Exception as e:
prnt(f' [red]Error getting code locations: {e}[/red]')
prnt(f" [red]Error getting code locations: {e}[/red]")
prnt()
if os.access(CONSTANTS.ARCHIVE_DIR, os.R_OK) or os.access(CONSTANTS.CONFIG_FILE, os.R_OK):
prnt('[bright_yellow][i] Data locations:[/bright_yellow]')
prnt("[bright_yellow][i] Data locations:[/bright_yellow]")
try:
for name, path in get_data_locations().items():
if isinstance(name, str) and isinstance(path, dict):
prnt(printable_folder_status(name, path), overflow='ignore', crop=False)
prnt(printable_folder_status(name, path), overflow="ignore", crop=False)
except Exception as e:
prnt(f' [red]Error getting data locations: {e}[/red]')
prnt(f" [red]Error getting data locations: {e}[/red]")
try:
from archivebox.misc.checks import check_data_dir_permissions
check_data_dir_permissions()
except Exception:
pass
else:
prnt()
prnt('[red][i] Data locations:[/red] (not in a data directory)')
prnt("[red][i] Data locations:[/red] (not in a data directory)")
prnt()
if failures:
prnt('[red]Error:[/red] [yellow]Failed to detect the following binaries:[/yellow]')
prnt(f' [red]{", ".join(failures)}[/red]')
prnt("[red]Error:[/red] [yellow]Failed to detect the following binaries:[/yellow]")
prnt(f" [red]{', '.join(failures)}[/red]")
prnt()
prnt('[violet]Hint:[/violet] To install missing binaries automatically, run:')
prnt(' [green]archivebox install[/green]')
prnt("[violet]Hint:[/violet] To install missing binaries automatically, run:")
prnt(" [green]archivebox install[/green]")
prnt()
return failures
@click.command()
@click.option('--quiet', '-q', is_flag=True, help='Only print ArchiveBox version number and nothing else. (equivalent to archivebox --version)')
@click.option('--binaries', '-b', help='Select binaries to detect DEFAULT=curl,wget,git,yt-dlp,chrome,single-file,readability-extractor,postlight-parser,... (all)')
@click.option(
"--quiet",
"-q",
is_flag=True,
help="Only print ArchiveBox version number and nothing else. (equivalent to archivebox --version)",
)
@click.option(
"--binaries",
"-b",
help="Select binaries to detect DEFAULT=curl,wget,git,yt-dlp,chrome,single-file,readability-extractor,postlight-parser,... (all)",
)
@docstring(version.__doc__)
def main(**kwargs):
failures = version(**kwargs)
@@ -198,5 +237,5 @@ def main(**kwargs):
raise SystemExit(1)
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -5,12 +5,10 @@ This module contains common utilities used across multiple CLI commands,
extracted to avoid code duplication.
"""
__package__ = 'archivebox.cli'
from typing import Optional
__package__ = "archivebox.cli"
def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None):
def apply_filters(queryset, filter_kwargs: dict, limit: int | None = None):
"""
Apply Django-style filters from CLI kwargs to a QuerySet.
@@ -31,11 +29,11 @@ def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None):
"""
filters = {}
for key, value in filter_kwargs.items():
if value is None or key in ('limit', 'offset'):
if value is None or key in ("limit", "offset"):
continue
# Handle CSV lists for __in filters
if key.endswith('__in') and isinstance(value, str):
value = [v.strip() for v in value.split(',')]
if key.endswith("__in") and isinstance(value, str):
value = [v.strip() for v in value.split(",")]
filters[key] = value
if filters:

View File

@@ -5,16 +5,16 @@ This module provides backwards-compatible config exports for extractors
and other modules that expect to import config values directly.
"""
__package__ = 'archivebox.config'
__package__ = "archivebox.config"
__order__ = 200
from .paths import (
PACKAGE_DIR, # noqa
DATA_DIR, # noqa
ARCHIVE_DIR, # noqa
PACKAGE_DIR,
DATA_DIR,
ARCHIVE_DIR,
)
from .constants import CONSTANTS, CONSTANTS_CONFIG, PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR # noqa
from .version import VERSION # noqa
from .constants import CONSTANTS, CONSTANTS_CONFIG, PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR # noqa
from .version import VERSION # noqa
###############################################################################
@@ -22,15 +22,18 @@ from .version import VERSION # noqa
# These provide backwards compatibility with extractors that import from ..config
###############################################################################
def _get_config():
"""Lazy import to avoid circular imports."""
from .common import ARCHIVING_CONFIG, STORAGE_CONFIG
return ARCHIVING_CONFIG, STORAGE_CONFIG
# Direct exports (evaluated at import time for backwards compat)
# These are recalculated each time the module attribute is accessed
def __getattr__(name: str):
"""
Module-level __getattr__ for lazy config loading.
@@ -40,38 +43,38 @@ def __getattr__(name: str):
"""
# Generic timeout settings (used by multiple plugins)
if name == 'TIMEOUT':
if name == "TIMEOUT":
cfg, _ = _get_config()
return cfg.TIMEOUT
# Generic SSL/Security settings (used by multiple plugins)
if name == 'CHECK_SSL_VALIDITY':
if name == "CHECK_SSL_VALIDITY":
cfg, _ = _get_config()
return cfg.CHECK_SSL_VALIDITY
# Generic storage settings (used by multiple plugins)
if name == 'RESTRICT_FILE_NAMES':
if name == "RESTRICT_FILE_NAMES":
_, storage = _get_config()
return storage.RESTRICT_FILE_NAMES
# Generic user agent / cookies (used by multiple plugins)
if name == 'COOKIES_FILE':
if name == "COOKIES_FILE":
cfg, _ = _get_config()
return cfg.COOKIES_FILE
if name == 'USER_AGENT':
if name == "USER_AGENT":
cfg, _ = _get_config()
return cfg.USER_AGENT
# Generic resolution settings (used by multiple plugins)
if name == 'RESOLUTION':
if name == "RESOLUTION":
cfg, _ = _get_config()
return cfg.RESOLUTION
# Allowlist/Denylist patterns (compiled regexes)
if name == 'SAVE_ALLOWLIST_PTN':
if name == "SAVE_ALLOWLIST_PTN":
cfg, _ = _get_config()
return cfg.SAVE_ALLOWLIST_PTNS
if name == 'SAVE_DENYLIST_PTN':
if name == "SAVE_DENYLIST_PTN":
cfg, _ = _get_config()
return cfg.SAVE_DENYLIST_PTNS
@@ -90,12 +93,13 @@ def get_CONFIG():
SEARCH_BACKEND_CONFIG,
)
from .ldap import LDAP_CONFIG
return {
'SHELL_CONFIG': SHELL_CONFIG,
'STORAGE_CONFIG': STORAGE_CONFIG,
'GENERAL_CONFIG': GENERAL_CONFIG,
'SERVER_CONFIG': SERVER_CONFIG,
'ARCHIVING_CONFIG': ARCHIVING_CONFIG,
'SEARCHBACKEND_CONFIG': SEARCH_BACKEND_CONFIG,
'LDAP_CONFIG': LDAP_CONFIG,
"SHELL_CONFIG": SHELL_CONFIG,
"STORAGE_CONFIG": STORAGE_CONFIG,
"GENERAL_CONFIG": GENERAL_CONFIG,
"SERVER_CONFIG": SERVER_CONFIG,
"ARCHIVING_CONFIG": ARCHIVING_CONFIG,
"SEARCHBACKEND_CONFIG": SEARCH_BACKEND_CONFIG,
"LDAP_CONFIG": LDAP_CONFIG,
}

View File

@@ -1,8 +1,8 @@
__package__ = 'archivebox.config'
__package__ = "archivebox.config"
import os
import json
from typing import Any, Optional, Type, Tuple, Dict
from typing import Any
from pathlib import Path
from configparser import ConfigParser
@@ -27,13 +27,15 @@ def get_real_name(key: str) -> str:
return key
def load_config_val(key: str,
default: Any=None,
type: Optional[Type]=None,
aliases: Optional[Tuple[str, ...]]=None,
config: Optional[benedict]=None,
env_vars: Optional[os._Environ]=None,
config_file_vars: Optional[Dict[str, str]]=None) -> Any:
def load_config_val(
key: str,
default: Any = None,
type: type | None = None,
aliases: tuple[str, ...] | None = None,
config: benedict | None = None,
env_vars: os._Environ | None = None,
config_file_vars: dict[str, str] | None = None,
) -> Any:
"""parse bool, int, and str key=value pairs from env"""
assert isinstance(config, dict)
@@ -67,8 +69,8 @@ def load_config_val(key: str,
assert isinstance(val, str)
# calculate value based on expected type
BOOL_TRUEIES = ('true', 'yes', '1')
BOOL_FALSEIES = ('false', 'no', '0')
BOOL_TRUEIES = ("true", "yes", "1")
BOOL_FALSEIES = ("false", "no", "0")
if type is bool:
if val.lower() in BOOL_TRUEIES:
@@ -76,28 +78,28 @@ def load_config_val(key: str,
elif val.lower() in BOOL_FALSEIES:
return False
else:
raise ValueError(f'Invalid configuration option {key}={val} (expected a boolean: True/False)')
raise ValueError(f"Invalid configuration option {key}={val} (expected a boolean: True/False)")
elif type is str:
if val.lower() in (*BOOL_TRUEIES, *BOOL_FALSEIES):
raise ValueError(f'Invalid configuration option {key}={val} (expected a string, but value looks like a boolean)')
raise ValueError(f"Invalid configuration option {key}={val} (expected a string, but value looks like a boolean)")
return val.strip()
elif type is int:
if not val.strip().isdigit():
raise ValueError(f'Invalid configuration option {key}={val} (expected an integer)')
raise ValueError(f"Invalid configuration option {key}={val} (expected an integer)")
return int(val.strip())
elif type is list or type is dict:
return json.loads(val)
elif type is Path:
return Path(val)
raise Exception('Config values can only be str, bool, int, or json')
raise Exception("Config values can only be str, bool, int, or json")
def load_config_file() -> Optional[benedict]:
def load_config_file() -> benedict | None:
"""load the ini-formatted config file from DATA_DIR/Archivebox.conf"""
config_path = CONSTANTS.CONFIG_FILE
@@ -105,17 +107,16 @@ def load_config_file() -> Optional[benedict]:
config_file = CaseConfigParser()
config_file.read(config_path)
# flatten into one namespace
config_file_vars = benedict({
key.upper(): val
for section, options in config_file.items()
for key, val in options.items()
})
config_file_vars = benedict({key.upper(): val for section, options in config_file.items() for key, val in options.items()})
# print('[i] Loaded config file', os.path.abspath(config_path))
# print(config_file_vars)
return config_file_vars
return None
class PluginConfigSection:
"""Pseudo-section for all plugin config keys written to [PLUGINS] section in ArchiveBox.conf"""
toml_section_header = "PLUGINS"
def __init__(self, key: str):
@@ -144,8 +145,14 @@ def section_for_key(key: str) -> Any:
)
# First check core config sections
for section in [SHELL_CONFIG, STORAGE_CONFIG, GENERAL_CONFIG,
SERVER_CONFIG, ARCHIVING_CONFIG, SEARCH_BACKEND_CONFIG]:
for section in [
SHELL_CONFIG,
STORAGE_CONFIG,
GENERAL_CONFIG,
SERVER_CONFIG,
ARCHIVING_CONFIG,
SEARCH_BACKEND_CONFIG,
]:
if hasattr(section, key):
return section
@@ -154,20 +161,19 @@ def section_for_key(key: str) -> Any:
plugin_configs = discover_plugin_configs()
for plugin_name, schema in plugin_configs.items():
if 'properties' in schema and key in schema['properties']:
if "properties" in schema and key in schema["properties"]:
# All plugin config goes to [PLUGINS] section
return PluginConfigSection(key)
raise ValueError(f'No config section found for key: {key}')
raise ValueError(f"No config section found for key: {key}")
def write_config_file(config: Dict[str, str]) -> benedict:
def write_config_file(config: dict[str, str]) -> benedict:
"""load the ini-formatted config file from DATA_DIR/Archivebox.conf"""
from archivebox.misc.system import atomic_write
CONFIG_HEADER = (
"""# This is the config file for your ArchiveBox collection.
CONFIG_HEADER = """# This is the config file for your ArchiveBox collection.
#
# You can add options here manually in INI format, or automatically by running:
# archivebox config --set KEY=VALUE
@@ -178,7 +184,7 @@ def write_config_file(config: Dict[str, str]) -> benedict:
# A list of all possible config with documentation and examples can be found here:
# https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration
""")
"""
config_path = CONSTANTS.CONFIG_FILE
@@ -188,57 +194,56 @@ def write_config_file(config: Dict[str, str]) -> benedict:
config_file = CaseConfigParser()
config_file.read(config_path)
with open(config_path, 'r', encoding='utf-8') as old:
atomic_write(f'{config_path}.bak', old.read())
with open(config_path, encoding="utf-8") as old:
atomic_write(f"{config_path}.bak", old.read())
# Set up sections in empty config file
for key, val in config.items():
section = section_for_key(key)
assert section is not None
if not hasattr(section, 'toml_section_header'):
raise ValueError(f'{key} is read-only (defined in {type(section).__module__}.{type(section).__name__}). Refusing to set.')
if not hasattr(section, "toml_section_header"):
raise ValueError(f"{key} is read-only (defined in {type(section).__module__}.{type(section).__name__}). Refusing to set.")
section_name = section.toml_section_header
if section_name in config_file:
existing_config = dict(config_file[section_name])
else:
existing_config = {}
config_file[section_name] = benedict({**existing_config, key: val})
section.update_in_place(warn=False, persist=False, **{key: val})
with open(config_path, 'w+', encoding='utf-8') as new:
with open(config_path, "w+", encoding="utf-8") as new:
config_file.write(new)
updated_config = {}
try:
# validate the updated_config by attempting to re-parse it
from archivebox.config.configset import get_flat_config
updated_config = {**load_all_config(), **get_flat_config()}
except BaseException: # lgtm [py/catch-base-exception]
except BaseException: # lgtm [py/catch-base-exception]
# something went horribly wrong, revert to the previous version
with open(f'{config_path}.bak', 'r', encoding='utf-8') as old:
with open(f"{config_path}.bak", encoding="utf-8") as old:
atomic_write(config_path, old.read())
raise
if os.access(f'{config_path}.bak', os.F_OK):
os.remove(f'{config_path}.bak')
if os.access(f"{config_path}.bak", os.F_OK):
os.remove(f"{config_path}.bak")
return benedict({
key.upper(): updated_config.get(key.upper())
for key in config.keys()
})
return benedict({key.upper(): updated_config.get(key.upper()) for key in config.keys()})
def load_config(defaults: Dict[str, Any],
config: Optional[benedict]=None,
out_dir: Optional[str]=None,
env_vars: Optional[os._Environ]=None,
config_file_vars: Optional[Dict[str, str]]=None) -> benedict:
def load_config(
defaults: dict[str, Any],
config: benedict | None = None,
out_dir: str | None = None,
env_vars: os._Environ | None = None,
config_file_vars: dict[str, str] | None = None,
) -> benedict:
env_vars = env_vars or os.environ
config_file_vars = config_file_vars or load_config_file()
@@ -249,9 +254,9 @@ def load_config(defaults: Dict[str, Any],
# print('LOADING CONFIG KEY:', key, 'DEFAULT=', default)
extended_config[key] = load_config_val(
key,
default=default['default'],
type=default.get('type'),
aliases=default.get('aliases'),
default=default["default"],
type=default.get("type"),
aliases=default.get("aliases"),
config=extended_config,
env_vars=env_vars,
config_file_vars=config_file_vars,
@@ -260,19 +265,20 @@ def load_config(defaults: Dict[str, Any],
raise SystemExit(0)
except Exception as e:
stderr()
stderr(f'[X] Error while loading configuration value: {key}', color='red', config=extended_config)
stderr(' {}: {}'.format(e.__class__.__name__, e))
stderr(f"[X] Error while loading configuration value: {key}", color="red", config=extended_config)
stderr(f" {e.__class__.__name__}: {e}")
stderr()
stderr(' Check your config for mistakes and try again (your archive data is unaffected).')
stderr(" Check your config for mistakes and try again (your archive data is unaffected).")
stderr()
stderr(' For config documentation and examples see:')
stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration')
stderr(" For config documentation and examples see:")
stderr(" https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration")
stderr()
# raise
# raise SystemExit(2)
return benedict(extended_config)
def load_all_config():
"""Load all config sections and return as a flat dict."""
from archivebox.config.common import (
@@ -283,11 +289,17 @@ def load_all_config():
ARCHIVING_CONFIG,
SEARCH_BACKEND_CONFIG,
)
flat_config = benedict()
for config_section in [SHELL_CONFIG, STORAGE_CONFIG, GENERAL_CONFIG,
SERVER_CONFIG, ARCHIVING_CONFIG, SEARCH_BACKEND_CONFIG]:
for config_section in [
SHELL_CONFIG,
STORAGE_CONFIG,
GENERAL_CONFIG,
SERVER_CONFIG,
ARCHIVING_CONFIG,
SEARCH_BACKEND_CONFIG,
]:
flat_config.update(dict(config_section))
return flat_config

View File

@@ -4,7 +4,7 @@ import re
import secrets
import sys
import shutil
from typing import ClassVar, Dict, Optional, List
from typing import ClassVar
from pathlib import Path
from rich.console import Console
@@ -39,8 +39,8 @@ class ShellConfig(BaseConfigSet):
IN_DOCKER: bool = Field(default=IN_DOCKER)
IN_QEMU: bool = Field(default=False)
ANSI: Dict[str, str] = Field(
default_factory=lambda: CONSTANTS.DEFAULT_CLI_COLORS if sys.stdout.isatty() else CONSTANTS.DISABLED_CLI_COLORS
ANSI: dict[str, str] = Field(
default_factory=lambda: CONSTANTS.DEFAULT_CLI_COLORS if sys.stdout.isatty() else CONSTANTS.DISABLED_CLI_COLORS,
)
@property
@@ -50,7 +50,7 @@ class ShellConfig(BaseConfigSet):
return shutil.get_terminal_size((140, 10)).columns
@property
def COMMIT_HASH(self) -> Optional[str]:
def COMMIT_HASH(self) -> str | None:
return get_COMMIT_HASH()
@property
@@ -112,7 +112,7 @@ class ServerConfig(BaseConfigSet):
"danger-onedomain-fullreplay",
)
SECRET_KEY: str = Field(default_factory=lambda: ''.join(secrets.choice("abcdefghijklmnopqrstuvwxyz0123456789_") for _ in range(50)))
SECRET_KEY: str = Field(default_factory=lambda: "".join(secrets.choice("abcdefghijklmnopqrstuvwxyz0123456789_") for _ in range(50)))
BIND_ADDR: str = Field(default="127.0.0.1:8000")
LISTEN_HOST: str = Field(default="archivebox.localhost:8000")
ADMIN_BASE_URL: str = Field(default="")
@@ -124,7 +124,7 @@ class ServerConfig(BaseConfigSet):
SNAPSHOTS_PER_PAGE: int = Field(default=40)
PREVIEW_ORIGINALS: bool = Field(default=True)
FOOTER_INFO: str = Field(
default="Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests."
default="Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.",
)
# CUSTOM_TEMPLATES_DIR: Path = Field(default=None) # this is now a constant
@@ -132,8 +132,8 @@ class ServerConfig(BaseConfigSet):
PUBLIC_SNAPSHOTS: bool = Field(default=True)
PUBLIC_ADD_VIEW: bool = Field(default=False)
ADMIN_USERNAME: Optional[str] = Field(default=None)
ADMIN_PASSWORD: Optional[str] = Field(default=None)
ADMIN_USERNAME: str | None = Field(default=None)
ADMIN_PASSWORD: str | None = Field(default=None)
REVERSE_PROXY_USER_HEADER: str = Field(default="Remote-User")
REVERSE_PROXY_WHITELIST: str = Field(default="")
@@ -234,22 +234,22 @@ class ArchivingConfig(BaseConfigSet):
RESOLUTION: str = Field(default="1440,2000")
CHECK_SSL_VALIDITY: bool = Field(default=True)
USER_AGENT: str = Field(
default=f"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)"
default=f"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)",
)
COOKIES_FILE: Path | None = Field(default=None)
URL_DENYLIST: str = Field(default=r"\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$", alias="URL_BLACKLIST")
URL_ALLOWLIST: str | None = Field(default=None, alias="URL_WHITELIST")
SAVE_ALLOWLIST: Dict[str, List[str]] = Field(default={}) # mapping of regex patterns to list of archive methods
SAVE_DENYLIST: Dict[str, List[str]] = Field(default={})
SAVE_ALLOWLIST: dict[str, list[str]] = Field(default={}) # mapping of regex patterns to list of archive methods
SAVE_DENYLIST: dict[str, list[str]] = Field(default={})
DEFAULT_PERSONA: str = Field(default="Default")
def warn_if_invalid(self) -> None:
if int(self.TIMEOUT) < 5:
rprint(f"[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.TIMEOUT} seconds)[/red]", file=sys.stderr)
rprint(" You must allow *at least* 5 seconds for indexing and archive methods to run succesfully.", file=sys.stderr)
rprint(" You must allow *at least* 5 seconds for indexing and archive methods to run successfully.", file=sys.stderr)
rprint(" (Setting it to somewhere between 30 and 3000 seconds is recommended)", file=sys.stderr)
rprint(file=sys.stderr)
rprint(" If you want to make ArchiveBox run faster, disable specific archive methods instead:", file=sys.stderr)
@@ -274,7 +274,7 @@ class ArchivingConfig(BaseConfigSet):
return re.compile(self.URL_DENYLIST, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS)
@property
def SAVE_ALLOWLIST_PTNS(self) -> Dict[re.Pattern, List[str]]:
def SAVE_ALLOWLIST_PTNS(self) -> dict[re.Pattern, list[str]]:
return (
{
# regexp: methods list
@@ -286,7 +286,7 @@ class ArchivingConfig(BaseConfigSet):
)
@property
def SAVE_DENYLIST_PTNS(self) -> Dict[re.Pattern, List[str]]:
def SAVE_DENYLIST_PTNS(self) -> dict[re.Pattern, list[str]]:
return (
{
# regexp: methods list

View File

@@ -11,7 +11,7 @@ __package__ = "archivebox.config"
import os
import json
from pathlib import Path
from typing import Any, Dict, Optional, Type, Tuple
from typing import Any
from configparser import ConfigParser
from pydantic_settings import BaseSettings, PydanticBaseSettingsSource, SettingsConfigDict
@@ -28,17 +28,18 @@ class IniConfigSettingsSource(PydanticBaseSettingsSource):
Flattens all sections into a single namespace.
"""
def get_field_value(self, field: Any, field_name: str) -> Tuple[Any, str, bool]:
def get_field_value(self, field: Any, field_name: str) -> tuple[Any, str, bool]:
config_vals = self._load_config_file()
field_value = config_vals.get(field_name.upper())
return field_value, field_name, False
def __call__(self) -> Dict[str, Any]:
def __call__(self) -> dict[str, Any]:
return self._load_config_file()
def _load_config_file(self) -> Dict[str, Any]:
def _load_config_file(self) -> dict[str, Any]:
try:
from archivebox.config.constants import CONSTANTS
config_path = CONSTANTS.CONFIG_FILE
except ImportError:
return {}
@@ -78,25 +79,25 @@ class BaseConfigSet(BaseSettings):
@classmethod
def settings_customise_sources(
cls,
settings_cls: Type[BaseSettings],
settings_cls: type[BaseSettings],
init_settings: PydanticBaseSettingsSource,
env_settings: PydanticBaseSettingsSource,
dotenv_settings: PydanticBaseSettingsSource,
file_secret_settings: PydanticBaseSettingsSource,
) -> Tuple[PydanticBaseSettingsSource, ...]:
) -> tuple[PydanticBaseSettingsSource, ...]:
"""
Define the order of settings sources (first = highest priority).
"""
return (
init_settings, # 1. Passed to __init__
env_settings, # 2. Environment variables
init_settings, # 1. Passed to __init__
env_settings, # 2. Environment variables
IniConfigSettingsSource(settings_cls), # 3. ArchiveBox.conf file
# dotenv_settings, # Skip .env files
# file_secret_settings, # Skip secrets files
)
@classmethod
def load_from_file(cls, config_path: Path) -> Dict[str, str]:
def load_from_file(cls, config_path: Path) -> dict[str, str]:
"""Load config values from INI file."""
if not config_path.exists():
return {}
@@ -120,14 +121,14 @@ class BaseConfigSet(BaseSettings):
def get_config(
defaults: Optional[Dict] = None,
defaults: dict | None = None,
persona: Any = None,
user: Any = None,
crawl: Any = None,
snapshot: Any = None,
archiveresult: Any = None,
machine: Any = None,
) -> Dict[str, Any]:
) -> dict[str, Any]:
"""
Get merged config from all sources.
@@ -176,7 +177,7 @@ def get_config(
if persona_id:
persona = Persona.objects.filter(id=persona_id).first()
if persona is None:
raise Persona.DoesNotExist(f'Crawl {getattr(crawl, "id", None)} references missing Persona {persona_id}')
raise Persona.DoesNotExist(f"Crawl {getattr(crawl, 'id', None)} references missing Persona {persona_id}")
if persona is None:
crawl_config = getattr(crawl, "config", None) or {}
@@ -200,6 +201,7 @@ def get_config(
# Add plugin config defaults from JSONSchema config.json files
try:
from archivebox.hooks import get_config_defaults_from_plugins
plugin_defaults = get_config_defaults_from_plugins()
config.update(plugin_defaults)
except ImportError:
@@ -224,6 +226,7 @@ def get_config(
# Default to current machine if not provided
try:
from archivebox.machine.models import Machine
machine = Machine.current()
except Exception:
pass # Machine might not be available during early init
@@ -246,16 +249,17 @@ def get_config(
# Also check plugin config aliases in environment
try:
from archivebox.hooks import discover_plugin_configs
plugin_configs = discover_plugin_configs()
for plugin_name, schema in plugin_configs.items():
for key, prop_schema in schema.get('properties', {}).items():
for key, prop_schema in schema.get("properties", {}).items():
# Check x-aliases
for alias in prop_schema.get('x-aliases', []):
for alias in prop_schema.get("x-aliases", []):
if alias in os.environ and key not in os.environ:
config[key] = _parse_env_value(os.environ[alias], config.get(key))
break
# Check x-fallback
fallback = prop_schema.get('x-fallback')
fallback = prop_schema.get("x-fallback")
if fallback and fallback in config and key not in config:
config[key] = config[fallback]
except ImportError:
@@ -275,33 +279,34 @@ def get_config(
# Add crawl path aliases for hooks that need shared crawl state.
if crawl and hasattr(crawl, "output_dir"):
config['CRAWL_OUTPUT_DIR'] = str(crawl.output_dir)
config['CRAWL_DIR'] = str(crawl.output_dir)
config['CRAWL_ID'] = str(getattr(crawl, "id", "")) if getattr(crawl, "id", None) else config.get('CRAWL_ID')
config["CRAWL_OUTPUT_DIR"] = str(crawl.output_dir)
config["CRAWL_DIR"] = str(crawl.output_dir)
config["CRAWL_ID"] = str(getattr(crawl, "id", "")) if getattr(crawl, "id", None) else config.get("CRAWL_ID")
# Apply snapshot config overrides (highest priority)
if snapshot and hasattr(snapshot, "config") and snapshot.config:
config.update(snapshot.config)
if snapshot:
config['SNAPSHOT_ID'] = str(getattr(snapshot, "id", "")) if getattr(snapshot, "id", None) else config.get('SNAPSHOT_ID')
config['SNAPSHOT_DEPTH'] = int(getattr(snapshot, "depth", 0) or 0)
config["SNAPSHOT_ID"] = str(getattr(snapshot, "id", "")) if getattr(snapshot, "id", None) else config.get("SNAPSHOT_ID")
config["SNAPSHOT_DEPTH"] = int(getattr(snapshot, "depth", 0) or 0)
if hasattr(snapshot, "output_dir"):
config['SNAP_DIR'] = str(snapshot.output_dir)
config["SNAP_DIR"] = str(snapshot.output_dir)
if getattr(snapshot, "crawl_id", None):
config['CRAWL_ID'] = str(snapshot.crawl_id)
config["CRAWL_ID"] = str(snapshot.crawl_id)
# Normalize all aliases to canonical names (after all sources merged)
# This handles aliases that came from user/crawl/snapshot configs, not just env
try:
from archivebox.hooks import discover_plugin_configs
plugin_configs = discover_plugin_configs()
aliases_to_normalize = {} # {alias_key: canonical_key}
# Build alias mapping from all plugin schemas
for plugin_name, schema in plugin_configs.items():
for canonical_key, prop_schema in schema.get('properties', {}).items():
for alias in prop_schema.get('x-aliases', []):
for canonical_key, prop_schema in schema.get("properties", {}).items():
for alias in prop_schema.get("x-aliases", []):
aliases_to_normalize[alias] = canonical_key
# Normalize: copy alias values to canonical keys (aliases take precedence)
@@ -314,10 +319,14 @@ def get_config(
except ImportError:
pass
if not config.get("DATA_DIR"):
config["DATA_DIR"] = str(CONSTANTS.DATA_DIR)
config["ABX_RUNTIME"] = "archivebox"
return config
def get_flat_config() -> Dict[str, Any]:
def get_flat_config() -> dict[str, Any]:
"""
Get a flat dictionary of all config values.
@@ -326,20 +335,24 @@ def get_flat_config() -> Dict[str, Any]:
return get_config()
def get_all_configs() -> Dict[str, BaseConfigSet]:
def get_all_configs() -> dict[str, BaseConfigSet]:
"""
Get all config section objects as a dictionary.
Replaces abx.pm.hook.get_CONFIGS()
"""
from archivebox.config.common import (
SHELL_CONFIG, SERVER_CONFIG, ARCHIVING_CONFIG, SEARCH_BACKEND_CONFIG
SHELL_CONFIG,
SERVER_CONFIG,
ARCHIVING_CONFIG,
SEARCH_BACKEND_CONFIG,
)
return {
'SHELL_CONFIG': SHELL_CONFIG,
'SERVER_CONFIG': SERVER_CONFIG,
'ARCHIVING_CONFIG': ARCHIVING_CONFIG,
'SEARCH_BACKEND_CONFIG': SEARCH_BACKEND_CONFIG,
"SHELL_CONFIG": SHELL_CONFIG,
"SERVER_CONFIG": SERVER_CONFIG,
"ARCHIVING_CONFIG": ARCHIVING_CONFIG,
"SEARCH_BACKEND_CONFIG": SEARCH_BACKEND_CONFIG,
}
@@ -394,7 +407,7 @@ DEFAULT_WORKER_CONCURRENCY = {
}
def get_worker_concurrency() -> Dict[str, int]:
def get_worker_concurrency() -> dict[str, int]:
"""
Get worker concurrency settings.

View File

@@ -5,17 +5,16 @@ Constants are for things that never change at runtime.
DATA_DIR will never change at runtime, but you can run
archivebox from inside a different DATA_DIR on the same machine.
This is loaded very early in the archivebox startup flow, so nothing in this file
or imported from this file should import anything from archivebox.config.common,
This is loaded very early in the archivebox startup flow, so nothing in this file
or imported from this file should import anything from archivebox.config.common,
django, other INSTALLED_APPS, or anything else that is not in a standard library.
"""
__package__ = 'archivebox.config'
__package__ = "archivebox.config"
import re
import sys
from typing import Dict
from pathlib import Path
from benedict import benedict
@@ -46,184 +45,235 @@ from .version import detect_installed_version
class ConstantsDict:
PACKAGE_DIR: Path = PACKAGE_DIR
DATA_DIR: Path = DATA_DIR
ARCHIVE_DIR: Path = ARCHIVE_DIR
MACHINE_TYPE: str = get_machine_type()
MACHINE_ID: str = get_machine_id()
COLLECTION_ID: str = get_collection_id(DATA_DIR)
PACKAGE_DIR: Path = PACKAGE_DIR
DATA_DIR: Path = DATA_DIR
ARCHIVE_DIR: Path = ARCHIVE_DIR
MACHINE_TYPE: str = get_machine_type()
MACHINE_ID: str = get_machine_id()
COLLECTION_ID: str = get_collection_id(DATA_DIR)
# Host system
VERSION: str = detect_installed_version(PACKAGE_DIR)
IN_DOCKER: bool = IN_DOCKER
VERSION: str = detect_installed_version(PACKAGE_DIR)
IN_DOCKER: bool = IN_DOCKER
# Permissions
IS_ROOT: bool = IS_ROOT
ARCHIVEBOX_USER: int = ARCHIVEBOX_USER
ARCHIVEBOX_GROUP: int = ARCHIVEBOX_GROUP
RUNNING_AS_UID: int = RUNNING_AS_UID
RUNNING_AS_GID: int = RUNNING_AS_GID
DEFAULT_PUID: int = DEFAULT_PUID
DEFAULT_PGID: int = DEFAULT_PGID
IS_INSIDE_VENV: bool = sys.prefix != sys.base_prefix
IS_ROOT: bool = IS_ROOT
ARCHIVEBOX_USER: int = ARCHIVEBOX_USER
ARCHIVEBOX_GROUP: int = ARCHIVEBOX_GROUP
RUNNING_AS_UID: int = RUNNING_AS_UID
RUNNING_AS_GID: int = RUNNING_AS_GID
DEFAULT_PUID: int = DEFAULT_PUID
DEFAULT_PGID: int = DEFAULT_PGID
IS_INSIDE_VENV: bool = sys.prefix != sys.base_prefix
# Source code dirs
PACKAGE_DIR_NAME: str = PACKAGE_DIR.name
TEMPLATES_DIR_NAME: str = 'templates'
TEMPLATES_DIR: Path = PACKAGE_DIR / TEMPLATES_DIR_NAME
STATIC_DIR_NAME: str = 'static'
STATIC_DIR: Path = TEMPLATES_DIR / STATIC_DIR_NAME
PACKAGE_DIR_NAME: str = PACKAGE_DIR.name
TEMPLATES_DIR_NAME: str = "templates"
TEMPLATES_DIR: Path = PACKAGE_DIR / TEMPLATES_DIR_NAME
STATIC_DIR_NAME: str = "static"
STATIC_DIR: Path = TEMPLATES_DIR / STATIC_DIR_NAME
# Data dirs
ARCHIVE_DIR_NAME: str = 'archive'
SOURCES_DIR_NAME: str = 'sources'
PERSONAS_DIR_NAME: str = 'personas'
CACHE_DIR_NAME: str = 'cache'
LOGS_DIR_NAME: str = 'logs'
CUSTOM_PLUGINS_DIR_NAME: str = 'custom_plugins'
CUSTOM_TEMPLATES_DIR_NAME: str = 'custom_templates'
ARCHIVE_DIR: Path = DATA_DIR / ARCHIVE_DIR_NAME
SOURCES_DIR: Path = DATA_DIR / SOURCES_DIR_NAME
PERSONAS_DIR: Path = DATA_DIR / PERSONAS_DIR_NAME
LOGS_DIR: Path = DATA_DIR / LOGS_DIR_NAME
CACHE_DIR: Path = DATA_DIR / CACHE_DIR_NAME
CUSTOM_TEMPLATES_DIR: Path = DATA_DIR / CUSTOM_TEMPLATES_DIR_NAME
USER_PLUGINS_DIR: Path = DATA_DIR / CUSTOM_PLUGINS_DIR_NAME
ARCHIVE_DIR_NAME: str = "archive"
SOURCES_DIR_NAME: str = "sources"
PERSONAS_DIR_NAME: str = "personas"
CACHE_DIR_NAME: str = "cache"
LOGS_DIR_NAME: str = "logs"
CUSTOM_PLUGINS_DIR_NAME: str = "custom_plugins"
CUSTOM_TEMPLATES_DIR_NAME: str = "custom_templates"
ARCHIVE_DIR: Path = DATA_DIR / ARCHIVE_DIR_NAME
SOURCES_DIR: Path = DATA_DIR / SOURCES_DIR_NAME
PERSONAS_DIR: Path = DATA_DIR / PERSONAS_DIR_NAME
LOGS_DIR: Path = DATA_DIR / LOGS_DIR_NAME
CACHE_DIR: Path = DATA_DIR / CACHE_DIR_NAME
CUSTOM_TEMPLATES_DIR: Path = DATA_DIR / CUSTOM_TEMPLATES_DIR_NAME
USER_PLUGINS_DIR: Path = DATA_DIR / CUSTOM_PLUGINS_DIR_NAME
# Data dir files
CONFIG_FILENAME: str = 'ArchiveBox.conf'
SQL_INDEX_FILENAME: str = 'index.sqlite3'
CONFIG_FILE: Path = DATA_DIR / CONFIG_FILENAME
DATABASE_FILE: Path = DATA_DIR / SQL_INDEX_FILENAME
JSON_INDEX_FILENAME: str = 'index.json'
JSONL_INDEX_FILENAME: str = 'index.jsonl'
HTML_INDEX_FILENAME: str = 'index.html'
ROBOTS_TXT_FILENAME: str = 'robots.txt'
FAVICON_FILENAME: str = 'favicon.ico'
# Runtime dirs
TMP_DIR_NAME: str = 'tmp'
DEFAULT_TMP_DIR: Path = DATA_DIR / TMP_DIR_NAME / MACHINE_ID # ./data/tmp/abc3244323
CONFIG_FILENAME: str = "ArchiveBox.conf"
SQL_INDEX_FILENAME: str = "index.sqlite3"
CONFIG_FILE: Path = DATA_DIR / CONFIG_FILENAME
DATABASE_FILE: Path = DATA_DIR / SQL_INDEX_FILENAME
LIB_DIR_NAME: str = 'lib'
DEFAULT_LIB_DIR: Path = DATA_DIR / LIB_DIR_NAME / MACHINE_TYPE # ./data/lib/arm64-linux-docker
DEFAULT_LIB_BIN_DIR: Path = DEFAULT_LIB_DIR / 'bin' # ./data/lib/arm64-linux-docker/bin
JSON_INDEX_FILENAME: str = "index.json"
JSONL_INDEX_FILENAME: str = "index.jsonl"
HTML_INDEX_FILENAME: str = "index.html"
ROBOTS_TXT_FILENAME: str = "robots.txt"
FAVICON_FILENAME: str = "favicon.ico"
# Runtime dirs
TMP_DIR_NAME: str = "tmp"
DEFAULT_TMP_DIR: Path = DATA_DIR / TMP_DIR_NAME / MACHINE_ID # ./data/tmp/abc3244323
LIB_DIR_NAME: str = "lib"
DEFAULT_LIB_DIR: Path = DATA_DIR / LIB_DIR_NAME / MACHINE_TYPE # ./data/lib/arm64-linux-docker
DEFAULT_LIB_BIN_DIR: Path = DEFAULT_LIB_DIR / "bin" # ./data/lib/arm64-linux-docker/bin
# Config constants
TIMEZONE: str = 'UTC'
DEFAULT_CLI_COLORS: Dict[str, str] = DEFAULT_CLI_COLORS
DISABLED_CLI_COLORS: Dict[str, str] = benedict({k: '' for k in DEFAULT_CLI_COLORS})
TIMEZONE: str = "UTC"
DEFAULT_CLI_COLORS: dict[str, str] = DEFAULT_CLI_COLORS
DISABLED_CLI_COLORS: dict[str, str] = benedict({k: "" for k in DEFAULT_CLI_COLORS})
# Hard safety limits (seconds)
MAX_HOOK_RUNTIME_SECONDS: int = 60 * 60 * 12 # 12 hours
MAX_SNAPSHOT_RUNTIME_SECONDS: int = 60 * 60 * 12 # 12 hours
MAX_HOOK_RUNTIME_SECONDS: int = 60 * 60 * 12 # 12 hours
MAX_SNAPSHOT_RUNTIME_SECONDS: int = 60 * 60 * 12 # 12 hours
ALLOWDENYLIST_REGEX_FLAGS: int = re.IGNORECASE | re.UNICODE | re.MULTILINE
ALLOWDENYLIST_REGEX_FLAGS: int = re.IGNORECASE | re.UNICODE | re.MULTILINE
STATICFILE_EXTENSIONS: frozenset[str] = frozenset((
# 99.999% of the time, URLs ending in these extensions are static files
# that can be downloaded as-is, not html pages that need to be rendered
'gif', 'jpeg', 'jpg', 'png', 'tif', 'tiff', 'wbmp', 'ico', 'jng', 'bmp',
'svg', 'svgz', 'webp', 'ps', 'eps', 'ai',
'mp3', 'mp4', 'm4a', 'mpeg', 'mpg', 'mkv', 'mov', 'webm', 'm4v',
'flv', 'wmv', 'avi', 'ogg', 'ts', 'm3u8',
'pdf', 'txt', 'rtf', 'rtfd', 'doc', 'docx', 'ppt', 'pptx', 'xls', 'xlsx',
'atom', 'rss', 'css', 'js', 'json',
'dmg', 'iso', 'img',
'rar', 'war', 'hqx', 'zip', 'gz', 'bz2', '7z',
STATICFILE_EXTENSIONS: frozenset[str] = frozenset(
(
# 99.999% of the time, URLs ending in these extensions are static files
# that can be downloaded as-is, not html pages that need to be rendered
"gif",
"jpeg",
"jpg",
"png",
"tif",
"tiff",
"wbmp",
"ico",
"jng",
"bmp",
"svg",
"svgz",
"webp",
"ps",
"eps",
"ai",
"mp3",
"mp4",
"m4a",
"mpeg",
"mpg",
"mkv",
"mov",
"webm",
"m4v",
"flv",
"wmv",
"avi",
"ogg",
"ts",
"m3u8",
"pdf",
"txt",
"rtf",
"rtfd",
"doc",
"docx",
"ppt",
"pptx",
"xls",
"xlsx",
"atom",
"rss",
"css",
"js",
"json",
"dmg",
"iso",
"img",
"rar",
"war",
"hqx",
"zip",
"gz",
"bz2",
"7z",
# Less common extensions to consider adding later
# jar, swf, bin, com, exe, dll, deb
# ear, hqx, eot, wmlc, kml, kmz, cco, jardiff, jnlp, run, msi, msp, msm,
# pl pm, prc pdb, rar, rpm, sea, sit, tcl tk, der, pem, crt, xpi, xspf,
# ra, mng, asx, asf, 3gpp, 3gp, mid, midi, kar, jad, wml, htc, mml
# These are always treated as pages, not as static files, never add them:
# html, htm, shtml, xhtml, xml, aspx, php, cgi
),
)
# Less common extensions to consider adding later
# jar, swf, bin, com, exe, dll, deb
# ear, hqx, eot, wmlc, kml, kmz, cco, jardiff, jnlp, run, msi, msp, msm,
# pl pm, prc pdb, rar, rpm, sea, sit, tcl tk, der, pem, crt, xpi, xspf,
# ra, mng, asx, asf, 3gpp, 3gp, mid, midi, kar, jad, wml, htc, mml
# These are always treated as pages, not as static files, never add them:
# html, htm, shtml, xhtml, xml, aspx, php, cgi
))
PIP_RELATED_NAMES: frozenset[str] = frozenset((
".venv",
"venv",
"virtualenv",
".virtualenv",
))
NPM_RELATED_NAMES: frozenset[str] = frozenset((
"node_modules",
"package.json",
"package-lock.json",
"yarn.lock",
))
PIP_RELATED_NAMES: frozenset[str] = frozenset(
(
".venv",
"venv",
"virtualenv",
".virtualenv",
),
)
NPM_RELATED_NAMES: frozenset[str] = frozenset(
(
"node_modules",
"package.json",
"package-lock.json",
"yarn.lock",
),
)
# When initializing archivebox in a new directory, we check to make sure the dir is
# actually empty so that we dont clobber someone's home directory or desktop by accident.
# These files are exceptions to the is_empty check when we're trying to init a new dir,
# as they could be from a previous archivebox version, system artifacts, dependencies, etc.
ALLOWED_IN_DATA_DIR: frozenset[str] = frozenset((
*PIP_RELATED_NAMES,
*NPM_RELATED_NAMES,
### Dirs:
ARCHIVE_DIR_NAME,
SOURCES_DIR_NAME,
LOGS_DIR_NAME,
CACHE_DIR_NAME,
LIB_DIR_NAME,
TMP_DIR_NAME,
PERSONAS_DIR_NAME,
CUSTOM_TEMPLATES_DIR_NAME,
CUSTOM_PLUGINS_DIR_NAME,
"invalid",
"users",
"machine",
# Backwards compatibility with old directory names
"user_plugins", # old name for USER_PLUGINS_DIR (now 'plugins')
"user_templates", # old name for CUSTOM_TEMPLATES_DIR (now 'templates')
"static", # created by old static exports <v0.6.0
"sonic", # created by docker bind mount / sonic FTS process
".git",
".svn",
### Files:
CONFIG_FILENAME,
SQL_INDEX_FILENAME,
f"{SQL_INDEX_FILENAME}-wal",
f"{SQL_INDEX_FILENAME}-shm",
"search.sqlite3",
"queue.sqlite3",
"queue.sqlite3-wal",
"queue.sqlite3-shm",
JSON_INDEX_FILENAME,
JSONL_INDEX_FILENAME,
HTML_INDEX_FILENAME,
ROBOTS_TXT_FILENAME,
FAVICON_FILENAME,
CONFIG_FILENAME,
f"{CONFIG_FILENAME}.bak",
f".{CONFIG_FILENAME}.bak",
"static_index.json",
".DS_Store",
".gitignore",
"lost+found",
".DS_Store",
".env",
".collection_id",
".archivebox_id",
"Dockerfile",
))
ALLOWED_IN_DATA_DIR: frozenset[str] = frozenset(
(
*PIP_RELATED_NAMES,
*NPM_RELATED_NAMES,
### Dirs:
ARCHIVE_DIR_NAME,
SOURCES_DIR_NAME,
LOGS_DIR_NAME,
CACHE_DIR_NAME,
LIB_DIR_NAME,
TMP_DIR_NAME,
PERSONAS_DIR_NAME,
CUSTOM_TEMPLATES_DIR_NAME,
CUSTOM_PLUGINS_DIR_NAME,
"invalid",
"users",
"machine",
# Backwards compatibility with old directory names
"user_plugins", # old name for USER_PLUGINS_DIR (now 'plugins')
"user_templates", # old name for CUSTOM_TEMPLATES_DIR (now 'templates')
"static", # created by old static exports <v0.6.0
"sonic", # created by docker bind mount / sonic FTS process
".git",
".svn",
### Files:
CONFIG_FILENAME,
SQL_INDEX_FILENAME,
f"{SQL_INDEX_FILENAME}-wal",
f"{SQL_INDEX_FILENAME}-shm",
"search.sqlite3",
"queue.sqlite3",
"queue.sqlite3-wal",
"queue.sqlite3-shm",
JSON_INDEX_FILENAME,
JSONL_INDEX_FILENAME,
HTML_INDEX_FILENAME,
ROBOTS_TXT_FILENAME,
FAVICON_FILENAME,
CONFIG_FILENAME,
f"{CONFIG_FILENAME}.bak",
f".{CONFIG_FILENAME}.bak",
"static_index.json",
".DS_Store",
".gitignore",
"lost+found",
".DS_Store",
".env",
".collection_id",
".archivebox_id",
"Dockerfile",
),
)
@classmethod
def __getitem__(cls, key: str):
# so it behaves like a dict[key] == dict.key or object attr
return getattr(cls, key)
@classmethod
def __benedict__(cls):
# when casting to benedict, only include uppercase keys that don't start with an underscore
return benedict({key: value for key, value in cls.__dict__.items() if key.isupper() and not key.startswith('_')})
return benedict({key: value for key, value in cls.__dict__.items() if key.isupper() and not key.startswith("_")})
CONSTANTS = ConstantsDict

View File

@@ -1,4 +1,4 @@
__package__ = 'archivebox.config'
__package__ = "archivebox.config"
import os
import sys
@@ -17,9 +17,9 @@ from .common import SHELL_CONFIG
if not SHELL_CONFIG.USE_COLOR:
os.environ['NO_COLOR'] = '1'
os.environ["NO_COLOR"] = "1"
if not SHELL_CONFIG.SHOW_PROGRESS:
os.environ['TERM'] = 'dumb'
os.environ["TERM"] = "dumb"
# recreate rich console obj based on new config values
STDOUT = CONSOLE = Console()
@@ -32,7 +32,8 @@ def setup_django_minimal():
# os.environ.setdefault('ARCHIVEBOX_DATA_DIR', str(CONSTANTS.DATA_DIR))
# os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings')
# django.setup()
raise Exception('dont use this anymore')
raise Exception("dont use this anymore")
DJANGO_SET_UP = False
@@ -61,15 +62,18 @@ def setup_django(check_db=False, in_memory_db=False) -> None:
# This warning can be triggered during django.setup() but is safe to ignore
# since we're doing intentional setup operations
import warnings
warnings.filterwarnings('ignore',
message='.*Accessing the database during app initialization.*',
category=RuntimeWarning)
warnings.filterwarnings(
"ignore",
message=".*Accessing the database during app initialization.*",
category=RuntimeWarning,
)
try:
from django.core.management import call_command
if in_memory_db:
raise Exception('dont use this anymore')
raise Exception("dont use this anymore")
# some commands dont store a long-lived sqlite3 db file on disk.
# in those cases we create a temporary in-memory db and run the migrations
@@ -84,19 +88,22 @@ def setup_django(check_db=False, in_memory_db=False) -> None:
try:
django.setup()
except Exception as e:
is_using_meta_cmd = any(ignored_subcommand in sys.argv for ignored_subcommand in ('help', 'version', '--help', '--version'))
is_using_meta_cmd = any(ignored_subcommand in sys.argv for ignored_subcommand in ("help", "version", "--help", "--version"))
if not is_using_meta_cmd:
# show error message to user only if they're not running a meta command / just trying to get help
STDERR.print()
STDERR.print(Panel(
f'\n[red]{e.__class__.__name__}[/red]: [yellow]{e}[/yellow]\nPlease check your config and [blue]DATA_DIR[/blue] permissions.\n',
title='\n\n[red][X] Error while trying to load database![/red]',
subtitle='[grey53]NO WRITES CAN BE PERFORMED[/grey53]',
expand=False,
style='bold red',
))
STDERR.print(
Panel(
f"\n[red]{e.__class__.__name__}[/red]: [yellow]{e}[/yellow]\nPlease check your config and [blue]DATA_DIR[/blue] permissions.\n",
title="\n\n[red][X] Error while trying to load database![/red]",
subtitle="[grey53]NO WRITES CAN BE PERFORMED[/grey53]",
expand=False,
style="bold red",
),
)
STDERR.print()
import traceback
traceback.print_exc()
return
@@ -104,28 +111,29 @@ def setup_django(check_db=False, in_memory_db=False) -> None:
from archivebox.core.settings_logging import ERROR_LOG as DEFAULT_ERROR_LOG
# log startup message to the error log
error_log = getattr(settings, 'ERROR_LOG', DEFAULT_ERROR_LOG)
with open(error_log, "a", encoding='utf-8') as f:
command = ' '.join(sys.argv)
ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S')
error_log = getattr(settings, "ERROR_LOG", DEFAULT_ERROR_LOG)
with open(error_log, "a", encoding="utf-8") as f:
command = " ".join(sys.argv)
ts = datetime.now(timezone.utc).strftime("%Y-%m-%d__%H:%M:%S")
f.write(f"\n> {command}; TS={ts} VERSION={CONSTANTS.VERSION} IN_DOCKER={SHELL_CONFIG.IN_DOCKER} IS_TTY={SHELL_CONFIG.IS_TTY}\n")
if check_db:
# make sure the data dir is owned by a non-root user
if CONSTANTS.DATA_DIR.stat().st_uid == 0:
STDERR.print('[red][X] Error: ArchiveBox DATA_DIR cannot be owned by root![/red]')
STDERR.print(f' {CONSTANTS.DATA_DIR}')
STDERR.print("[red][X] Error: ArchiveBox DATA_DIR cannot be owned by root![/red]")
STDERR.print(f" {CONSTANTS.DATA_DIR}")
STDERR.print()
STDERR.print('[violet]Hint:[/violet] Are you running archivebox in the right folder? (and as a non-root user?)')
STDERR.print(' cd path/to/your/archive/data')
STDERR.print(' archivebox [command]')
STDERR.print("[violet]Hint:[/violet] Are you running archivebox in the right folder? (and as a non-root user?)")
STDERR.print(" cd path/to/your/archive/data")
STDERR.print(" archivebox [command]")
STDERR.print()
raise SystemExit(9)
# Create cache table in DB if needed
try:
from django.core.cache import cache
cache.get('test', None)
cache.get("test", None)
except django.db.utils.OperationalError:
call_command("createcachetable", verbosity=0)
@@ -133,12 +141,14 @@ def setup_django(check_db=False, in_memory_db=False) -> None:
# the sqlite3 whenever we init from scratch to avoid multiple threads
# sharing the same connection by accident
from django.db import connections
for conn in connections.all():
conn.close_if_unusable_or_obsolete()
sql_index_path = CONSTANTS.DATABASE_FILE
assert os.access(sql_index_path, os.F_OK), (
f'No database file {sql_index_path} found in: {CONSTANTS.DATA_DIR} (Are you in an ArchiveBox collection directory?)')
f"No database file {sql_index_path} found in: {CONSTANTS.DATA_DIR} (Are you in an ArchiveBox collection directory?)"
)
# https://docs.pydantic.dev/logfire/integrations/django/ Logfire Debugging
# if settings.DEBUG_LOGFIRE:

View File

@@ -1,6 +1,5 @@
__package__ = "archivebox.config"
from typing import Optional
from pydantic import Field
from archivebox.config.configset import BaseConfigSet
@@ -13,13 +12,14 @@ class LDAPConfig(BaseConfigSet):
Only loads and validates if django-auth-ldap is installed.
These settings integrate with Django's LDAP authentication backend.
"""
toml_section_header: str = "LDAP_CONFIG"
LDAP_ENABLED: bool = Field(default=False)
LDAP_SERVER_URI: Optional[str] = Field(default=None)
LDAP_BIND_DN: Optional[str] = Field(default=None)
LDAP_BIND_PASSWORD: Optional[str] = Field(default=None)
LDAP_USER_BASE: Optional[str] = Field(default=None)
LDAP_SERVER_URI: str | None = Field(default=None)
LDAP_BIND_DN: str | None = Field(default=None)
LDAP_BIND_PASSWORD: str | None = Field(default=None)
LDAP_USER_BASE: str | None = Field(default=None)
LDAP_USER_FILTER: str = Field(default="(uid=%(user)s)")
LDAP_USERNAME_ATTR: str = Field(default="username")
LDAP_FIRSTNAME_ATTR: str = Field(default="givenName")

View File

@@ -1,4 +1,4 @@
__package__ = 'archivebox.config'
__package__ = "archivebox.config"
import os
import socket
@@ -15,24 +15,25 @@ from .permissions import SudoPermission, IS_ROOT, ARCHIVEBOX_USER
#############################################################################################
PACKAGE_DIR: Path = Path(__file__).resolve().parent.parent # archivebox source code dir
DATA_DIR: Path = Path(os.environ.get('DATA_DIR', os.getcwd())).resolve() # archivebox user data dir
ARCHIVE_DIR: Path = DATA_DIR / 'archive' # archivebox snapshot data dir
PACKAGE_DIR: Path = Path(__file__).resolve().parent.parent # archivebox source code dir
DATA_DIR: Path = Path(os.environ.get("DATA_DIR", os.getcwd())).resolve() # archivebox user data dir
ARCHIVE_DIR: Path = DATA_DIR / "archive" # archivebox snapshot data dir
IN_DOCKER = os.environ.get('IN_DOCKER', False) in ('1', 'true', 'True', 'TRUE', 'yes')
IN_DOCKER = os.environ.get("IN_DOCKER", False) in ("1", "true", "True", "TRUE", "yes")
DATABASE_FILE = DATA_DIR / 'index.sqlite3'
DATABASE_FILE = DATA_DIR / "index.sqlite3"
#############################################################################################
def _get_collection_id(DATA_DIR=DATA_DIR, force_create=False) -> str:
collection_id_file = DATA_DIR / '.archivebox_id'
collection_id_file = DATA_DIR / ".archivebox_id"
try:
return collection_id_file.read_text().strip()
except (OSError, FileNotFoundError, PermissionError):
pass
# hash the machine_id + collection dir path + creation time to get a unique collection_id
machine_id = get_machine_id()
collection_path = DATA_DIR.resolve()
@@ -40,55 +41,60 @@ def _get_collection_id(DATA_DIR=DATA_DIR, force_create=False) -> str:
creation_date = DATA_DIR.stat().st_ctime
except Exception:
creation_date = datetime.now().isoformat()
collection_id = hashlib.sha256(f'{machine_id}:{collection_path}@{creation_date}'.encode()).hexdigest()[:8]
collection_id = hashlib.sha256(f"{machine_id}:{collection_path}@{creation_date}".encode()).hexdigest()[:8]
try:
# only persist collection_id file if we already have an index.sqlite3 file present
# otherwise we might be running in a directory that is not a collection, no point creating cruft files
collection_is_active = os.path.isfile(DATABASE_FILE) and os.path.isdir(ARCHIVE_DIR) and os.access(DATA_DIR, os.W_OK)
if collection_is_active or force_create:
collection_id_file.write_text(collection_id)
# if we're running as root right now, make sure the collection_id file is owned by the archivebox user
if IS_ROOT:
with SudoPermission(uid=0):
if ARCHIVEBOX_USER == 0:
os.system(f'chmod 777 "{collection_id_file}"')
else:
else:
os.system(f'chown {ARCHIVEBOX_USER} "{collection_id_file}"')
except (OSError, FileNotFoundError, PermissionError):
pass
return collection_id
@cache
def get_collection_id(DATA_DIR=DATA_DIR) -> str:
"""Get a short, stable, unique ID for the current collection (e.g. abc45678)"""
return _get_collection_id(DATA_DIR=DATA_DIR)
@cache
def get_machine_id() -> str:
"""Get a short, stable, unique ID for the current machine (e.g. abc45678)"""
MACHINE_ID = 'unknown'
MACHINE_ID = "unknown"
try:
import machineid
MACHINE_ID = machineid.hashed_id('archivebox')[:8]
MACHINE_ID = machineid.hashed_id("archivebox")[:8]
except Exception:
try:
import uuid
import hashlib
MACHINE_ID = hashlib.sha256(str(uuid.getnode()).encode()).hexdigest()[:8]
except Exception:
pass
return MACHINE_ID
@cache
def get_machine_type() -> str:
"""Get a short, stable, unique type identifier for the current machine (e.g. linux-x86_64-docker)"""
OS: str = platform.system().lower() # darwin, linux, etc.
ARCH: str = platform.machine().lower() # arm64, x86_64, aarch64, etc.
LIB_DIR_SCOPE: str = f'{ARCH}-{OS}-docker' if IN_DOCKER else f'{ARCH}-{OS}'
OS: str = platform.system().lower() # darwin, linux, etc.
ARCH: str = platform.machine().lower() # arm64, x86_64, aarch64, etc.
LIB_DIR_SCOPE: str = f"{ARCH}-{OS}-docker" if IN_DOCKER else f"{ARCH}-{OS}"
return LIB_DIR_SCOPE
@@ -97,27 +103,28 @@ def dir_is_writable(dir_path: Path, uid: int | None = None, gid: int | None = No
current_uid, current_gid = os.geteuid(), os.getegid()
uid, gid = uid or current_uid, gid or current_gid
test_file = dir_path / '.permissions_test'
test_file = dir_path / ".permissions_test"
try:
with SudoPermission(uid=uid, fallback=fallback):
test_file.exists()
test_file.write_text(f'Checking if PUID={uid} PGID={gid} can write to dir')
test_file.write_text(f"Checking if PUID={uid} PGID={gid} can write to dir")
test_file.unlink()
return True
except (IOError, OSError, PermissionError):
if chown:
except (OSError, PermissionError):
if chown:
# try fixing it using sudo permissions
with SudoPermission(uid=uid, fallback=fallback):
os.system(f'chown {uid}:{gid} "{dir_path}" 2>/dev/null')
return dir_is_writable(dir_path, uid=uid, gid=gid, fallback=fallback, chown=False)
return False
def assert_dir_can_contain_unix_sockets(dir_path: Path) -> bool:
"""Check if a given directory can contain unix sockets (e.g. /tmp/supervisord.sock)"""
from archivebox.misc.logging_util import pretty_path
try:
socket_path = str(dir_path / '.test_socket.sock')
socket_path = str(dir_path / ".test_socket.sock")
s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
try:
os.remove(socket_path)
@@ -130,8 +137,8 @@ def assert_dir_can_contain_unix_sockets(dir_path: Path) -> bool:
except OSError:
pass
except Exception as e:
raise Exception(f'ArchiveBox failed to create a test UNIX socket file in {pretty_path(dir_path, color=False)}') from e
raise Exception(f"ArchiveBox failed to create a test UNIX socket file in {pretty_path(dir_path, color=False)}") from e
return True
@@ -143,8 +150,9 @@ def create_and_chown_dir(dir_path: Path) -> None:
def tmp_dir_socket_path_is_short_enough(dir_path: Path) -> bool:
socket_file = dir_path.absolute().resolve() / 'supervisord.sock'
return len(f'file://{socket_file}') <= 96
socket_file = dir_path.absolute().resolve() / "supervisord.sock"
return len(f"file://{socket_file}") <= 96
@cache
def get_or_create_working_tmp_dir(autofix=True, quiet=True):
@@ -154,14 +162,18 @@ def get_or_create_working_tmp_dir(autofix=True, quiet=True):
# try a few potential directories in order of preference
CANDIDATES = [
STORAGE_CONFIG.TMP_DIR, # <user-specified>
CONSTANTS.DEFAULT_TMP_DIR, # ./data/tmp/<machine_id>
Path('/var/run/archivebox') / get_collection_id(), # /var/run/archivebox/abc5d8512
Path('/tmp') / 'archivebox' / get_collection_id(), # /tmp/archivebox/abc5d8512
Path('~/.tmp/archivebox').expanduser() / get_collection_id(), # ~/.tmp/archivebox/abc5d8512
Path(tempfile.gettempdir()) / 'archivebox' / get_collection_id(), # /var/folders/qy/6tpfrpx100j1t4l312nz683m0000gn/T/archivebox/abc5d8512
Path(tempfile.gettempdir()) / 'archivebox' / get_collection_id()[:4], # /var/folders/qy/6tpfrpx100j1t4l312nz683m0000gn/T/archivebox/abc5d
Path(tempfile.gettempdir()) / 'abx' / get_collection_id()[:4], # /var/folders/qy/6tpfrpx100j1t4l312nz683m0000gn/T/abx/abc5
STORAGE_CONFIG.TMP_DIR, # <user-specified>
CONSTANTS.DEFAULT_TMP_DIR, # ./data/tmp/<machine_id>
Path("/var/run/archivebox") / get_collection_id(), # /var/run/archivebox/abc5d8512
Path("/tmp") / "archivebox" / get_collection_id(), # /tmp/archivebox/abc5d8512
Path("~/.tmp/archivebox").expanduser() / get_collection_id(), # ~/.tmp/archivebox/abc5d8512
Path(tempfile.gettempdir())
/ "archivebox"
/ get_collection_id(), # /var/folders/qy/6tpfrpx100j1t4l312nz683m0000gn/T/archivebox/abc5d8512
Path(tempfile.gettempdir())
/ "archivebox"
/ get_collection_id()[:4], # /var/folders/qy/6tpfrpx100j1t4l312nz683m0000gn/T/archivebox/abc5d
Path(tempfile.gettempdir()) / "abx" / get_collection_id()[:4], # /var/folders/qy/6tpfrpx100j1t4l312nz683m0000gn/T/abx/abc5
]
fallback_candidate = None
for candidate in CANDIDATES:
@@ -174,7 +186,12 @@ def get_or_create_working_tmp_dir(autofix=True, quiet=True):
STORAGE_CONFIG.update_in_place(TMP_DIR=candidate)
return candidate
try:
if fallback_candidate is None and candidate.exists() and dir_is_writable(candidate) and tmp_dir_socket_path_is_short_enough(candidate):
if (
fallback_candidate is None
and candidate.exists()
and dir_is_writable(candidate)
and tmp_dir_socket_path_is_short_enough(candidate)
):
fallback_candidate = candidate
except Exception:
pass
@@ -186,25 +203,28 @@ def get_or_create_working_tmp_dir(autofix=True, quiet=True):
if autofix and STORAGE_CONFIG.TMP_DIR != fallback_candidate:
STORAGE_CONFIG.update_in_place(TMP_DIR=fallback_candidate)
return fallback_candidate
if not quiet:
raise OSError(f'ArchiveBox is unable to find a writable TMP_DIR, tried {CANDIDATES}!')
raise OSError(f"ArchiveBox is unable to find a writable TMP_DIR, tried {CANDIDATES}!")
@cache
def get_or_create_working_lib_dir(autofix=True, quiet=False):
from archivebox import CONSTANTS
from archivebox.config.common import STORAGE_CONFIG
from archivebox.misc.checks import check_lib_dir
# try a few potential directories in order of preference
CANDIDATES = [
STORAGE_CONFIG.LIB_DIR, # <user-specified>
CONSTANTS.DEFAULT_LIB_DIR, # ./data/lib/arm64-linux-docker
Path('/usr/local/share/archivebox') / get_collection_id(), # /usr/local/share/archivebox/abc5
*([Path('/opt/homebrew/share/archivebox') / get_collection_id()] if os.path.isfile('/opt/homebrew/bin/archivebox') else []), # /opt/homebrew/share/archivebox/abc5
Path('~/.local/share/archivebox').expanduser() / get_collection_id(), # ~/.local/share/archivebox/abc5
STORAGE_CONFIG.LIB_DIR, # <user-specified>
CONSTANTS.DEFAULT_LIB_DIR, # ./data/lib/arm64-linux-docker
Path("/usr/local/share/archivebox") / get_collection_id(), # /usr/local/share/archivebox/abc5
*(
[Path("/opt/homebrew/share/archivebox") / get_collection_id()] if os.path.isfile("/opt/homebrew/bin/archivebox") else []
), # /opt/homebrew/share/archivebox/abc5
Path("~/.local/share/archivebox").expanduser() / get_collection_id(), # ~/.local/share/archivebox/abc5
]
for candidate in CANDIDATES:
try:
create_and_chown_dir(candidate)
@@ -214,10 +234,9 @@ def get_or_create_working_lib_dir(autofix=True, quiet=False):
if autofix and STORAGE_CONFIG.LIB_DIR != candidate:
STORAGE_CONFIG.update_in_place(LIB_DIR=candidate)
return candidate
if not quiet:
raise OSError(f'ArchiveBox is unable to find a writable LIB_DIR, tried {CANDIDATES}!')
if not quiet:
raise OSError(f"ArchiveBox is unable to find a writable LIB_DIR, tried {CANDIDATES}!")
@cache
@@ -229,57 +248,68 @@ def get_data_locations():
tmp_dir = get_or_create_working_tmp_dir(autofix=True, quiet=True) or STORAGE_CONFIG.TMP_DIR
except Exception:
tmp_dir = STORAGE_CONFIG.TMP_DIR
return benedict({
"DATA_DIR": {
"path": DATA_DIR.resolve(),
"enabled": True,
"is_valid": os.path.isdir(DATA_DIR) and os.access(DATA_DIR, os.R_OK) and os.access(DATA_DIR, os.W_OK),
"is_mount": os.path.ismount(DATA_DIR.resolve()),
return benedict(
{
"DATA_DIR": {
"path": DATA_DIR.resolve(),
"enabled": True,
"is_valid": os.path.isdir(DATA_DIR) and os.access(DATA_DIR, os.R_OK) and os.access(DATA_DIR, os.W_OK),
"is_mount": os.path.ismount(DATA_DIR.resolve()),
},
"CONFIG_FILE": {
"path": CONSTANTS.CONFIG_FILE.resolve(),
"enabled": True,
"is_valid": os.path.isfile(CONSTANTS.CONFIG_FILE)
and os.access(CONSTANTS.CONFIG_FILE, os.R_OK)
and os.access(CONSTANTS.CONFIG_FILE, os.W_OK),
},
"SQL_INDEX": {
"path": DATABASE_FILE.resolve(),
"enabled": True,
"is_valid": os.path.isfile(DATABASE_FILE) and os.access(DATABASE_FILE, os.R_OK) and os.access(DATABASE_FILE, os.W_OK),
"is_mount": os.path.ismount(DATABASE_FILE.resolve()),
},
"ARCHIVE_DIR": {
"path": ARCHIVE_DIR.resolve(),
"enabled": True,
"is_valid": os.path.isdir(ARCHIVE_DIR) and os.access(ARCHIVE_DIR, os.R_OK) and os.access(ARCHIVE_DIR, os.W_OK),
"is_mount": os.path.ismount(ARCHIVE_DIR.resolve()),
},
"SOURCES_DIR": {
"path": CONSTANTS.SOURCES_DIR.resolve(),
"enabled": True,
"is_valid": os.path.isdir(CONSTANTS.SOURCES_DIR)
and os.access(CONSTANTS.SOURCES_DIR, os.R_OK)
and os.access(CONSTANTS.SOURCES_DIR, os.W_OK),
},
"PERSONAS_DIR": {
"path": CONSTANTS.PERSONAS_DIR.resolve(),
"enabled": os.path.isdir(CONSTANTS.PERSONAS_DIR),
"is_valid": os.path.isdir(CONSTANTS.PERSONAS_DIR)
and os.access(CONSTANTS.PERSONAS_DIR, os.R_OK)
and os.access(CONSTANTS.PERSONAS_DIR, os.W_OK), # read + write
},
"LOGS_DIR": {
"path": CONSTANTS.LOGS_DIR.resolve(),
"enabled": True,
"is_valid": os.path.isdir(CONSTANTS.LOGS_DIR)
and os.access(CONSTANTS.LOGS_DIR, os.R_OK)
and os.access(CONSTANTS.LOGS_DIR, os.W_OK), # read + write
},
"TMP_DIR": {
"path": tmp_dir.resolve(),
"enabled": True,
"is_valid": os.path.isdir(tmp_dir) and os.access(tmp_dir, os.R_OK) and os.access(tmp_dir, os.W_OK), # read + write
},
# "CACHE_DIR": {
# "path": CACHE_DIR.resolve(),
# "enabled": True,
# "is_valid": os.access(CACHE_DIR, os.R_OK) and os.access(CACHE_DIR, os.W_OK), # read + write
# },
},
"CONFIG_FILE": {
"path": CONSTANTS.CONFIG_FILE.resolve(),
"enabled": True,
"is_valid": os.path.isfile(CONSTANTS.CONFIG_FILE) and os.access(CONSTANTS.CONFIG_FILE, os.R_OK) and os.access(CONSTANTS.CONFIG_FILE, os.W_OK),
},
"SQL_INDEX": {
"path": DATABASE_FILE.resolve(),
"enabled": True,
"is_valid": os.path.isfile(DATABASE_FILE) and os.access(DATABASE_FILE, os.R_OK) and os.access(DATABASE_FILE, os.W_OK),
"is_mount": os.path.ismount(DATABASE_FILE.resolve()),
},
"ARCHIVE_DIR": {
"path": ARCHIVE_DIR.resolve(),
"enabled": True,
"is_valid": os.path.isdir(ARCHIVE_DIR) and os.access(ARCHIVE_DIR, os.R_OK) and os.access(ARCHIVE_DIR, os.W_OK),
"is_mount": os.path.ismount(ARCHIVE_DIR.resolve()),
},
"SOURCES_DIR": {
"path": CONSTANTS.SOURCES_DIR.resolve(),
"enabled": True,
"is_valid": os.path.isdir(CONSTANTS.SOURCES_DIR) and os.access(CONSTANTS.SOURCES_DIR, os.R_OK) and os.access(CONSTANTS.SOURCES_DIR, os.W_OK),
},
"PERSONAS_DIR": {
"path": CONSTANTS.PERSONAS_DIR.resolve(),
"enabled": os.path.isdir(CONSTANTS.PERSONAS_DIR),
"is_valid": os.path.isdir(CONSTANTS.PERSONAS_DIR) and os.access(CONSTANTS.PERSONAS_DIR, os.R_OK) and os.access(CONSTANTS.PERSONAS_DIR, os.W_OK), # read + write
},
"LOGS_DIR": {
"path": CONSTANTS.LOGS_DIR.resolve(),
"enabled": True,
"is_valid": os.path.isdir(CONSTANTS.LOGS_DIR) and os.access(CONSTANTS.LOGS_DIR, os.R_OK) and os.access(CONSTANTS.LOGS_DIR, os.W_OK), # read + write
},
'TMP_DIR': {
'path': tmp_dir.resolve(),
'enabled': True,
'is_valid': os.path.isdir(tmp_dir) and os.access(tmp_dir, os.R_OK) and os.access(tmp_dir, os.W_OK), # read + write
},
# "CACHE_DIR": {
# "path": CACHE_DIR.resolve(),
# "enabled": True,
# "is_valid": os.access(CACHE_DIR, os.R_OK) and os.access(CACHE_DIR, os.W_OK), # read + write
# },
})
)
@cache
def get_code_locations():
@@ -291,41 +321,45 @@ def get_code_locations():
except Exception:
lib_dir = STORAGE_CONFIG.LIB_DIR
lib_bin_dir = lib_dir / 'bin'
return benedict({
'PACKAGE_DIR': {
'path': (PACKAGE_DIR).resolve(),
'enabled': True,
'is_valid': os.access(PACKAGE_DIR / '__main__.py', os.X_OK), # executable
},
'TEMPLATES_DIR': {
'path': CONSTANTS.TEMPLATES_DIR.resolve(),
'enabled': True,
'is_valid': os.access(CONSTANTS.STATIC_DIR, os.R_OK) and os.access(CONSTANTS.STATIC_DIR, os.X_OK), # read + list
},
'CUSTOM_TEMPLATES_DIR': {
'path': STORAGE_CONFIG.CUSTOM_TEMPLATES_DIR.resolve(),
'enabled': os.path.isdir(STORAGE_CONFIG.CUSTOM_TEMPLATES_DIR),
'is_valid': os.path.isdir(STORAGE_CONFIG.CUSTOM_TEMPLATES_DIR) and os.access(STORAGE_CONFIG.CUSTOM_TEMPLATES_DIR, os.R_OK), # read
},
'USER_PLUGINS_DIR': {
'path': CONSTANTS.USER_PLUGINS_DIR.resolve(),
'enabled': os.path.isdir(CONSTANTS.USER_PLUGINS_DIR),
'is_valid': os.path.isdir(CONSTANTS.USER_PLUGINS_DIR) and os.access(CONSTANTS.USER_PLUGINS_DIR, os.R_OK), # read
},
'LIB_DIR': {
'path': lib_dir.resolve(),
'enabled': True,
'is_valid': os.path.isdir(lib_dir) and os.access(lib_dir, os.R_OK) and os.access(lib_dir, os.W_OK), # read + write
},
'LIB_BIN_DIR': {
'path': lib_bin_dir.resolve(),
'enabled': True,
'is_valid': os.path.isdir(lib_bin_dir) and os.access(lib_bin_dir, os.R_OK) and os.access(lib_bin_dir, os.W_OK), # read + write
},
})
lib_bin_dir = lib_dir / "bin"
return benedict(
{
"PACKAGE_DIR": {
"path": (PACKAGE_DIR).resolve(),
"enabled": True,
"is_valid": os.access(PACKAGE_DIR / "__main__.py", os.X_OK), # executable
},
"TEMPLATES_DIR": {
"path": CONSTANTS.TEMPLATES_DIR.resolve(),
"enabled": True,
"is_valid": os.access(CONSTANTS.STATIC_DIR, os.R_OK) and os.access(CONSTANTS.STATIC_DIR, os.X_OK), # read + list
},
"CUSTOM_TEMPLATES_DIR": {
"path": STORAGE_CONFIG.CUSTOM_TEMPLATES_DIR.resolve(),
"enabled": os.path.isdir(STORAGE_CONFIG.CUSTOM_TEMPLATES_DIR),
"is_valid": os.path.isdir(STORAGE_CONFIG.CUSTOM_TEMPLATES_DIR)
and os.access(STORAGE_CONFIG.CUSTOM_TEMPLATES_DIR, os.R_OK), # read
},
"USER_PLUGINS_DIR": {
"path": CONSTANTS.USER_PLUGINS_DIR.resolve(),
"enabled": os.path.isdir(CONSTANTS.USER_PLUGINS_DIR),
"is_valid": os.path.isdir(CONSTANTS.USER_PLUGINS_DIR) and os.access(CONSTANTS.USER_PLUGINS_DIR, os.R_OK), # read
},
"LIB_DIR": {
"path": lib_dir.resolve(),
"enabled": True,
"is_valid": os.path.isdir(lib_dir) and os.access(lib_dir, os.R_OK) and os.access(lib_dir, os.W_OK), # read + write
},
"LIB_BIN_DIR": {
"path": lib_bin_dir.resolve(),
"enabled": True,
"is_valid": os.path.isdir(lib_bin_dir)
and os.access(lib_bin_dir, os.R_OK)
and os.access(lib_bin_dir, os.W_OK), # read + write
},
},
)
# @cache
@@ -340,9 +374,9 @@ def get_code_locations():
# - ok to have a long path (doesnt contain SOCKETS)
# """
# from .version import detect_installed_version
# HOST_DIRS = PlatformDirs(appname='archivebox', appauthor='ArchiveBox', version=detect_installed_version(), opinion=True, ensure_exists=False)
# lib_dir = tempfile.gettempdir()
# try:
# if 'SYSTEM_LIB_DIR' in os.environ:
@@ -350,7 +384,7 @@ def get_code_locations():
# else:
# with SudoPermission(uid=ARCHIVEBOX_USER, fallback=True):
# lib_dir = HOST_DIRS.site_data_path
# # Docker: /usr/local/share/archivebox/0.8.5
# # Ubuntu: /usr/local/share/archivebox/0.8.5
# # macOS: /Library/Application Support/archivebox
@@ -358,16 +392,16 @@ def get_code_locations():
# with SudoPermission(uid=0, fallback=True):
# lib_dir.mkdir(parents=True, exist_ok=True)
# except PermissionError:
# # our user cannot
# # our user cannot
# lib_dir = HOST_DIRS.user_data_path
# lib_dir.mkdir(parents=True, exist_ok=True)
# if IS_ROOT or not dir_is_writable(lib_dir, uid=ARCHIVEBOX_USER):
# if IS_ROOT:
# # make sure lib dir is owned by the archivebox user, not root
# with SudoPermission(uid=0):
# if ARCHIVEBOX_USER == 0:
# # print(f'[yellow]:warning: Waring: Creating SYSTEM_LIB_DIR {lib_dir} with mode 777 so that non-root archivebox users can share it.[/yellow] (caches shared libs used by archivebox for performance)', file=sys.stderr)
# # print(f'[yellow]:warning: Warning: Creating SYSTEM_LIB_DIR {lib_dir} with mode 777 so that non-root archivebox users can share it.[/yellow] (caches shared libs used by archivebox for performance)', file=sys.stderr)
# os.system(f'chmod -R 777 "{lib_dir}"')
# else:
# os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{lib_dir}"')
@@ -376,9 +410,9 @@ def get_code_locations():
# except (PermissionError, AssertionError):
# # raise PermissionError(f'SYSTEM_LIB_DIR {lib_dir} is not writable by archivebox user {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}')
# print(f'[red]:cross_mark: ERROR: SYSTEM_LIB_DIR {lib_dir} is not writable by archivebox user {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/red]', file=sys.stderr)
# return lib_dir
# @cache
# def get_TMP_DIR():
# """
@@ -390,9 +424,9 @@ def get_code_locations():
# - must be cleared on every archivebox version upgrade
# """
# from .version import detect_installed_version
# HOST_DIRS = PlatformDirs(appname='archivebox', appauthor='ArchiveBox', version=detect_installed_version(), opinion=True, ensure_exists=False)
# # print('DATA_DIR OWNED BY:', ARCHIVEBOX_USER, ARCHIVEBOX_GROUP)
# # print('RUNNING AS:', self.PUID, self.PGID)
# run_dir = tempfile.gettempdir()
@@ -405,7 +439,7 @@ def get_code_locations():
# if IS_ROOT:
# with SudoPermission(uid=0, fallback=False):
# if ARCHIVEBOX_USER == 0:
# # print(f'[yellow]:warning: Waring: Creating SYSTEM_TMP_DIR {run_dir} with mode 777 so that non-root archivebox users can access it.[/yellow]', file=sys.stderr)
# # print(f'[yellow]:warning: Warning: Creating SYSTEM_TMP_DIR {run_dir} with mode 777 so that non-root archivebox users can access it.[/yellow]', file=sys.stderr)
# os.system(f'chmod -R 777 "{run_dir}"')
# else:
# os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{run_dir}"')
@@ -413,30 +447,30 @@ def get_code_locations():
# raise PermissionError()
# assert len(str(run_dir / 'supervisord.conf')) < 95, 'SYSTEM_TMP_DIR path is too long, please set SYSTEM_TMP_DIR env variable to a shorter path (unfortunately unix requires socket paths be < 108 chars)'
# return run_dir
# run_dir = (HOST_DIRS.site_runtime_path / get_collection_id(DATA_DIR=DATA_DIR)).resolve()
# try:
# assert len(str(run_dir)) + len('/supervisord.sock') < 95
# except AssertionError:
# run_dir = Path(tempfile.gettempdir()).resolve() / 'archivebox' / get_collection_id(DATA_DIR=DATA_DIR)
# assert len(str(run_dir)) + len('/supervisord.sock') < 95, 'SYSTEM_TMP_DIR path is too long, please set SYSTEM_TMP_DIR env variable to a shorter path (unfortunately unix requires socket paths be < 108 chars)'
# with SudoPermission(uid=0, fallback=True):
# run_dir.mkdir(parents=True, exist_ok=True)
# if IS_ROOT or not dir_is_writable(run_dir, uid=ARCHIVEBOX_USER):
# if IS_ROOT:
# with SudoPermission(uid=0):
# if ARCHIVEBOX_USER == 0:
# # print(f'[yellow]:warning: Waring: Creating SYSTEM_TMP_DIR {run_dir} with mode 777 so that non-root archivebox users can access it.[/yellow]', file=sys.stderr)
# # print(f'[yellow]:warning: Warning: Creating SYSTEM_TMP_DIR {run_dir} with mode 777 so that non-root archivebox users can access it.[/yellow]', file=sys.stderr)
# os.system(f'chmod -R 777 "{run_dir}"')
# else:
# os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{run_dir}"')
# else:
# raise PermissionError()
# except (PermissionError, AssertionError):
# # raise PermissionError(f'SYSTEM_TMP_DIR {run_dir} is not writable by archivebox user {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}')
# print(f'[red]:cross_mark: ERROR: SYSTEM_TMP_DIR {run_dir} is not writable by archivebox user {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/red]', file=sys.stderr)
# return run_dir

View File

@@ -1,4 +1,4 @@
__package__ = 'archivebox.config'
__package__ = "archivebox.config"
import os
import pwd
@@ -17,26 +17,26 @@ from contextlib import contextmanager
DATA_DIR = Path(os.getcwd())
try:
DATA_DIR_STAT = DATA_DIR.stat()
DATA_DIR_UID = DATA_DIR_STAT.st_uid
DATA_DIR_GID = DATA_DIR_STAT.st_gid
DATA_DIR_STAT = DATA_DIR.stat()
DATA_DIR_UID = DATA_DIR_STAT.st_uid
DATA_DIR_GID = DATA_DIR_STAT.st_gid
except PermissionError:
DATA_DIR_UID = 0
DATA_DIR_GID = 0
DATA_DIR_UID = 0
DATA_DIR_GID = 0
DEFAULT_PUID = 911
DEFAULT_PGID = 911
RUNNING_AS_UID = os.getuid()
RUNNING_AS_GID = os.getgid()
EUID = os.geteuid()
EGID = os.getegid()
SUDO_UID = int(os.environ.get('SUDO_UID', 0))
SUDO_GID = int(os.environ.get('SUDO_GID', 0))
USER: str = Path('~').expanduser().resolve().name
HOSTNAME: str = cast(str, max([socket.gethostname(), platform.node()], key=len))
DEFAULT_PUID = 911
DEFAULT_PGID = 911
RUNNING_AS_UID = os.getuid()
RUNNING_AS_GID = os.getgid()
EUID = os.geteuid()
EGID = os.getegid()
SUDO_UID = int(os.environ.get("SUDO_UID", 0))
SUDO_GID = int(os.environ.get("SUDO_GID", 0))
USER: str = Path("~").expanduser().resolve().name
HOSTNAME: str = cast(str, max([socket.gethostname(), platform.node()], key=len))
IS_ROOT = RUNNING_AS_UID == 0
IN_DOCKER = os.environ.get('IN_DOCKER', False) in ('1', 'true', 'True', 'TRUE', 'yes')
IN_DOCKER = os.environ.get("IN_DOCKER", False) in ("1", "true", "True", "TRUE", "yes")
# IN_DOCKER_COMPOSE = # TODO: figure out a way to detect if running in docker compose
@@ -47,74 +47,79 @@ if RUNNING_AS_UID == 0:
# if we are running as root it's really hard to figure out what the correct archivebox user should be
# as a last resort instead of setting DATA_DIR ownership to 0:0 (which breaks it for non-root users)
# check if 911:911 archivebox user exists on host system, and use it instead of 0
if pwd.getpwuid(DEFAULT_PUID).pw_name == 'archivebox':
if pwd.getpwuid(DEFAULT_PUID).pw_name == "archivebox":
FALLBACK_UID = DEFAULT_PUID
FALLBACK_GID = DEFAULT_PGID
except Exception:
pass
os.environ.setdefault('PUID', str(DATA_DIR_UID or EUID or RUNNING_AS_UID or FALLBACK_UID))
os.environ.setdefault('PGID', str(DATA_DIR_GID or EGID or RUNNING_AS_GID or FALLBACK_GID))
os.environ.setdefault("PUID", str(DATA_DIR_UID or EUID or RUNNING_AS_UID or FALLBACK_UID))
os.environ.setdefault("PGID", str(DATA_DIR_GID or EGID or RUNNING_AS_GID or FALLBACK_GID))
ARCHIVEBOX_USER = int(os.environ['PUID'])
ARCHIVEBOX_GROUP = int(os.environ['PGID'])
ARCHIVEBOX_USER = int(os.environ["PUID"])
ARCHIVEBOX_GROUP = int(os.environ["PGID"])
if not USER:
try:
# alternative method 1 to get username
USER = pwd.getpwuid(ARCHIVEBOX_USER).pw_name
except Exception:
pass
if not USER:
try:
# alternative method 2 to get username
import getpass
USER = getpass.getuser()
except Exception:
pass
if not USER:
try:
# alternative method 3 to get username
USER = os.getlogin() or 'archivebox'
USER = os.getlogin() or "archivebox"
except Exception:
USER = 'archivebox'
USER = "archivebox"
ARCHIVEBOX_USER_EXISTS = False
try:
pwd.getpwuid(ARCHIVEBOX_USER)
ARCHIVEBOX_USER_EXISTS = True
except Exception:
ARCHIVEBOX_USER_EXISTS = False
#############################################################################################
def drop_privileges():
"""If running as root, drop privileges to the user that owns the data dir (or PUID)"""
# always run archivebox as the user that owns the data dir, never as root
if os.getuid() == 0:
# drop permissions to the user that owns the data dir / provided PUID
if os.geteuid() != ARCHIVEBOX_USER and ARCHIVEBOX_USER != 0 and ARCHIVEBOX_USER_EXISTS:
# drop our effective UID to the archivebox user's UID
os.seteuid(ARCHIVEBOX_USER)
# update environment variables so that subprocesses dont try to write to /root
pw_record = pwd.getpwuid(ARCHIVEBOX_USER)
os.environ['HOME'] = pw_record.pw_dir
os.environ['LOGNAME'] = pw_record.pw_name
os.environ['USER'] = pw_record.pw_name
os.environ["HOME"] = pw_record.pw_dir
os.environ["LOGNAME"] = pw_record.pw_name
os.environ["USER"] = pw_record.pw_name
if ARCHIVEBOX_USER == 0 or not ARCHIVEBOX_USER_EXISTS:
print('[yellow]:warning: Running as [red]root[/red] is not recommended and may make your [blue]DATA_DIR[/blue] inaccessible to other users on your system.[/yellow] (use [blue]sudo[/blue] instead)', file=sys.stderr)
print(
"[yellow]:warning: Running as [red]root[/red] is not recommended and may make your [blue]DATA_DIR[/blue] inaccessible to other users on your system.[/yellow] (use [blue]sudo[/blue] instead)",
file=sys.stderr,
)
@contextmanager
def SudoPermission(uid=0, fallback=False):
"""Attempt to run code with sudo permissions for a given user (or root)"""
if os.geteuid() == uid:
# no need to change effective UID, we are already that user
yield
@@ -125,7 +130,7 @@ def SudoPermission(uid=0, fallback=False):
os.seteuid(uid)
except PermissionError as err:
if not fallback:
raise PermissionError(f'Not enough permissions to run code as uid={uid}, please retry with sudo') from err
raise PermissionError(f"Not enough permissions to run code as uid={uid}, please retry with sudo") from err
try:
# yield back to the caller so they can run code inside context as root
yield
@@ -135,4 +140,4 @@ def SudoPermission(uid=0, fallback=False):
os.seteuid(ARCHIVEBOX_USER)
except PermissionError as err:
if not fallback:
raise PermissionError(f'Failed to revert uid={uid} back to {ARCHIVEBOX_USER} after running code with sudo') from err
raise PermissionError(f"Failed to revert uid={uid} back to {ARCHIVEBOX_USER} after running code with sudo") from err

View File

@@ -1,4 +1,4 @@
__package__ = 'archivebox.config'
__package__ = "archivebox.config"
import os
import importlib.metadata
@@ -6,71 +6,71 @@ import importlib.metadata
from pathlib import Path
from functools import cache
from datetime import datetime
from typing import Optional
#############################################################################################
IN_DOCKER = os.environ.get('IN_DOCKER', False) in ('1', 'true', 'True', 'TRUE', 'yes')
IN_DOCKER = os.environ.get("IN_DOCKER", False) in ("1", "true", "True", "TRUE", "yes")
PACKAGE_DIR: Path = Path(__file__).resolve().parent.parent # archivebox source code dir
DATA_DIR: Path = Path(os.environ.get('DATA_DIR', os.getcwd())).resolve() # archivebox user data dir
ARCHIVE_DIR: Path = DATA_DIR / 'archive' # archivebox snapshot data dir
PACKAGE_DIR: Path = Path(__file__).resolve().parent.parent # archivebox source code dir
DATA_DIR: Path = Path(os.environ.get("DATA_DIR", os.getcwd())).resolve() # archivebox user data dir
ARCHIVE_DIR: Path = DATA_DIR / "archive" # archivebox snapshot data dir
#############################################################################################
@cache
def detect_installed_version(PACKAGE_DIR: Path=PACKAGE_DIR):
def detect_installed_version(PACKAGE_DIR: Path = PACKAGE_DIR):
"""Autodetect the installed archivebox version by using pip package metadata, pyproject.toml file, or package.json file"""
try:
# if in production install, use pip-installed package metadata
return importlib.metadata.version('archivebox').strip()
return importlib.metadata.version("archivebox").strip()
except importlib.metadata.PackageNotFoundError:
pass
try:
# if in dev Git repo dir, use pyproject.toml file
pyproject_config = (PACKAGE_DIR.parent / 'pyproject.toml').read_text().split('\n')
pyproject_config = (PACKAGE_DIR.parent / "pyproject.toml").read_text().split("\n")
for line in pyproject_config:
if line.startswith('version = '):
return line.split(' = ', 1)[-1].strip('"').strip()
if line.startswith("version = "):
return line.split(" = ", 1)[-1].strip('"').strip()
except FileNotFoundError:
# building docs, pyproject.toml is not available
pass
# raise Exception('Failed to detect installed archivebox version!')
return 'dev'
return "dev"
@cache
def get_COMMIT_HASH() -> Optional[str]:
def get_COMMIT_HASH() -> str | None:
try:
git_dir = PACKAGE_DIR.parent / '.git'
ref = (git_dir / 'HEAD').read_text().strip().split(' ')[-1]
git_dir = PACKAGE_DIR.parent / ".git"
ref = (git_dir / "HEAD").read_text().strip().split(" ")[-1]
commit_hash = git_dir.joinpath(ref).read_text().strip()
return commit_hash
except Exception:
pass
try:
return list((PACKAGE_DIR.parent / '.git/refs/heads/').glob('*'))[0].read_text().strip()
return list((PACKAGE_DIR.parent / ".git/refs/heads/").glob("*"))[0].read_text().strip()
except Exception:
pass
return None
@cache
def get_BUILD_TIME() -> str:
if IN_DOCKER:
try:
# if we're in the archivebox official docker image, /VERSION.txt will contain the build time
docker_build_end_time = Path('/VERSION.txt').read_text().rsplit('BUILD_END_TIME=')[-1].split('\n', 1)[0]
docker_build_end_time = Path("/VERSION.txt").read_text().rsplit("BUILD_END_TIME=")[-1].split("\n", 1)[0]
return docker_build_end_time
except Exception:
pass
src_last_modified_unix_timestamp = (PACKAGE_DIR / 'README.md').stat().st_mtime
return datetime.fromtimestamp(src_last_modified_unix_timestamp).strftime('%Y-%m-%d %H:%M:%S %s')
src_last_modified_unix_timestamp = (PACKAGE_DIR / "README.md").stat().st_mtime
return datetime.fromtimestamp(src_last_modified_unix_timestamp).strftime("%Y-%m-%d %H:%M:%S %s")
# def get_versions_available_on_github(config):
@@ -78,14 +78,14 @@ def get_BUILD_TIME() -> str:
# returns a dictionary containing the ArchiveBox GitHub release info for
# the recommended upgrade version and the currently installed version
# """
# # we only want to perform the (relatively expensive) check for new versions
# # when its most relevant, e.g. when the user runs a long-running command
# subcommand_run_by_user = sys.argv[3] if len(sys.argv) > 3 else 'help'
# long_running_commands = ('add', 'schedule', 'update', 'status', 'server')
# if subcommand_run_by_user not in long_running_commands:
# return None
# github_releases_api = "https://api.github.com/repos/ArchiveBox/ArchiveBox/releases"
# response = requests.get(github_releases_api)
# if response.status_code != 200:
@@ -104,7 +104,7 @@ def get_BUILD_TIME() -> str:
# break
# current_version = current_version or all_releases[-1]
# # recommended version is whatever comes after current_version in the release list
# # (perhaps too conservative to only recommend upgrading one version at a time, but it's safest)
# try:

View File

@@ -1,4 +1,4 @@
__package__ = 'archivebox.config'
__package__ = "archivebox.config"
import html
import json
@@ -6,7 +6,8 @@ import os
import inspect
import re
from pathlib import Path
from typing import Any, Callable, Dict
from typing import Any
from collections.abc import Callable
from urllib.parse import quote, urlencode
from django.http import HttpRequest
from django.utils import timezone
@@ -21,30 +22,48 @@ from archivebox.misc.util import parse_date
from archivebox.machine.models import Binary
ABX_PLUGINS_DOCS_BASE_URL = 'https://archivebox.github.io/abx-plugins/'
ABX_PLUGINS_GITHUB_BASE_URL = 'https://github.com/ArchiveBox/abx-plugins/tree/main/abx_plugins/plugins/'
LIVE_CONFIG_BASE_URL = '/admin/environment/config/'
ENVIRONMENT_BINARIES_BASE_URL = '/admin/environment/binaries/'
INSTALLED_BINARIES_BASE_URL = '/admin/machine/binary/'
ABX_PLUGINS_DOCS_BASE_URL = "https://archivebox.github.io/abx-plugins/"
ABX_PLUGINS_GITHUB_BASE_URL = "https://github.com/ArchiveBox/abx-plugins/tree/main/abx_plugins/plugins/"
LIVE_CONFIG_BASE_URL = "/admin/environment/config/"
ENVIRONMENT_BINARIES_BASE_URL = "/admin/environment/binaries/"
INSTALLED_BINARIES_BASE_URL = "/admin/machine/binary/"
# Common binaries to check for
KNOWN_BINARIES = [
'wget', 'curl', 'chromium', 'chrome', 'google-chrome', 'google-chrome-stable',
'node', 'npm', 'npx', 'yt-dlp',
'git', 'singlefile', 'readability-extractor', 'mercury-parser',
'python3', 'python', 'bash', 'zsh',
'ffmpeg', 'ripgrep', 'rg', 'sonic', 'archivebox',
"wget",
"curl",
"chromium",
"chrome",
"google-chrome",
"google-chrome-stable",
"node",
"npm",
"npx",
"yt-dlp",
"git",
"singlefile",
"readability-extractor",
"mercury-parser",
"python3",
"python",
"bash",
"zsh",
"ffmpeg",
"ripgrep",
"rg",
"sonic",
"archivebox",
]
CANONICAL_BINARY_ALIASES = {
'youtube-dl': 'yt-dlp',
'ytdlp': 'yt-dlp',
"youtube-dl": "yt-dlp",
"ytdlp": "yt-dlp",
}
def is_superuser(request: HttpRequest) -> bool:
return bool(getattr(request.user, 'is_superuser', False))
return bool(getattr(request.user, "is_superuser", False))
def format_parsed_datetime(value: object) -> str:
@@ -55,9 +74,9 @@ def format_parsed_datetime(value: object) -> str:
JSON_TOKEN_RE = re.compile(
r'(?P<key>"(?:\\u[a-fA-F0-9]{4}|\\[^u]|[^\\"])*")(?=\s*:)'
r'|(?P<string>"(?:\\u[a-fA-F0-9]{4}|\\[^u]|[^\\"])*")'
r'|(?P<boolean>\btrue\b|\bfalse\b)'
r'|(?P<null>\bnull\b)'
r'|(?P<number>-?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?)'
r"|(?P<boolean>\btrue\b|\bfalse\b)"
r"|(?P<null>\bnull\b)"
r"|(?P<number>-?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?)",
)
@@ -65,13 +84,14 @@ def render_code_block(text: str, *, highlighted: bool = False) -> str:
code = html.escape(text, quote=False)
if highlighted:
def _wrap_token(match: re.Match[str]) -> str:
styles = {
'key': 'color: #0550ae;',
'string': 'color: #0a7f45;',
'boolean': 'color: #8250df; font-weight: 600;',
'null': 'color: #6e7781; font-style: italic;',
'number': 'color: #b35900;',
"key": "color: #0550ae;",
"string": "color: #0a7f45;",
"boolean": "color: #8250df; font-weight: 600;",
"null": "color: #6e7781; font-style: italic;",
"number": "color: #b35900;",
}
token_type = next(name for name, value in match.groupdict().items() if value is not None)
return f'<span style="{styles[token_type]}">{match.group(0)}</span>'
@@ -82,9 +102,9 @@ def render_code_block(text: str, *, highlighted: bool = False) -> str:
'<pre style="max-height: 600px; overflow: auto; background: #f6f8fa; '
'border: 1px solid #d0d7de; border-radius: 6px; padding: 12px; margin: 0;">'
'<code style="font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, '
'\'Liberation Mono\', monospace; white-space: pre; line-height: 1.5;">'
f'{code}'
'</code></pre>'
"'Liberation Mono', monospace; white-space: pre; line-height: 1.5;\">"
f"{code}"
"</code></pre>"
)
@@ -93,34 +113,35 @@ def render_highlighted_json_block(value: Any) -> str:
def get_plugin_docs_url(plugin_name: str) -> str:
return f'{ABX_PLUGINS_DOCS_BASE_URL}#{plugin_name}'
return f"{ABX_PLUGINS_DOCS_BASE_URL}#{plugin_name}"
def get_plugin_hook_source_url(plugin_name: str, hook_name: str) -> str:
return f'{ABX_PLUGINS_GITHUB_BASE_URL}{quote(plugin_name)}/{quote(hook_name)}'
return f"{ABX_PLUGINS_GITHUB_BASE_URL}{quote(plugin_name)}/{quote(hook_name)}"
def get_live_config_url(key: str) -> str:
return f'{LIVE_CONFIG_BASE_URL}{quote(key)}/'
return f"{LIVE_CONFIG_BASE_URL}{quote(key)}/"
def get_environment_binary_url(name: str) -> str:
return f'{ENVIRONMENT_BINARIES_BASE_URL}{quote(name)}/'
return f"{ENVIRONMENT_BINARIES_BASE_URL}{quote(name)}/"
def get_installed_binary_change_url(name: str, binary: Any) -> str | None:
binary_id = getattr(binary, 'id', None)
binary_id = getattr(binary, "id", None)
if not binary_id:
return None
base_url = getattr(binary, 'admin_change_url', None) or f'{INSTALLED_BINARIES_BASE_URL}{binary_id}/change/'
changelist_filters = urlencode({'q': canonical_binary_name(name)})
return f'{base_url}?{urlencode({"_changelist_filters": changelist_filters})}'
base_url = getattr(binary, "admin_change_url", None) or f"{INSTALLED_BINARIES_BASE_URL}{binary_id}/change/"
changelist_filters = urlencode({"q": canonical_binary_name(name)})
return f"{base_url}?{urlencode({'_changelist_filters': changelist_filters})}"
def get_machine_admin_url() -> str | None:
try:
from archivebox.machine.models import Machine
return Machine.current().admin_change_url
except Exception:
return None
@@ -130,12 +151,14 @@ def render_code_tag_list(values: list[str]) -> str:
if not values:
return '<span style="color: #6e7781;">(none)</span>'
tags = ''.join(
str(format_html(
'<code style="display: inline-block; margin: 0 6px 6px 0; padding: 2px 6px; '
'background: #f6f8fa; border: 1px solid #d0d7de; border-radius: 999px;">{}</code>',
value,
))
tags = "".join(
str(
format_html(
'<code style="display: inline-block; margin: 0 6px 6px 0; padding: 2px 6px; '
'background: #f6f8fa; border: 1px solid #d0d7de; border-radius: 999px;">{}</code>',
value,
),
)
for value in values
)
return f'<div style="display: flex; flex-wrap: wrap;">{tags}</div>'
@@ -143,22 +166,21 @@ def render_code_tag_list(values: list[str]) -> str:
def render_plugin_metadata_html(config: dict[str, Any]) -> str:
rows = (
('Title', config.get('title') or '(none)'),
('Description', config.get('description') or '(none)'),
('Required Plugins', mark_safe(render_link_tag_list(config.get('required_plugins') or [], get_plugin_docs_url))),
('Required Binaries', mark_safe(render_link_tag_list(config.get('required_binaries') or [], get_environment_binary_url))),
('Output MIME Types', mark_safe(render_code_tag_list(config.get('output_mimetypes') or []))),
("Title", config.get("title") or "(none)"),
("Description", config.get("description") or "(none)"),
("Required Plugins", mark_safe(render_link_tag_list(config.get("required_plugins") or [], get_plugin_docs_url))),
("Required Binaries", mark_safe(render_link_tag_list(config.get("required_binaries") or [], get_environment_binary_url))),
("Output MIME Types", mark_safe(render_code_tag_list(config.get("output_mimetypes") or []))),
)
rendered_rows = ''.join(
str(format_html(
'<div style="margin: 0 0 14px 0;">'
'<div style="font-weight: 600; margin-bottom: 4px;">{}</div>'
'<div>{}</div>'
'</div>',
label,
value,
))
rendered_rows = "".join(
str(
format_html(
'<div style="margin: 0 0 14px 0;"><div style="font-weight: 600; margin-bottom: 4px;">{}</div><div>{}</div></div>',
label,
value,
),
)
for label, value in rows
)
return f'<div style="margin: 4px 0 0 0;">{rendered_rows}</div>'
@@ -171,20 +193,28 @@ def render_link_tag_list(values: list[str], url_resolver: Callable[[str], str] |
tags = []
for value in values:
if url_resolver is None:
tags.append(str(format_html(
'<code style="display: inline-block; margin: 0 6px 6px 0; padding: 2px 6px; '
'background: #f6f8fa; border: 1px solid #d0d7de; border-radius: 999px;">{}</code>',
value,
)))
tags.append(
str(
format_html(
'<code style="display: inline-block; margin: 0 6px 6px 0; padding: 2px 6px; '
'background: #f6f8fa; border: 1px solid #d0d7de; border-radius: 999px;">{}</code>',
value,
),
),
)
else:
tags.append(str(format_html(
'<a href="{}" style="text-decoration: none;">'
'<code style="display: inline-block; margin: 0 6px 6px 0; padding: 2px 6px; '
'background: #f6f8fa; border: 1px solid #d0d7de; border-radius: 999px;">{}</code>'
'</a>',
url_resolver(value),
value,
)))
tags.append(
str(
format_html(
'<a href="{}" style="text-decoration: none;">'
'<code style="display: inline-block; margin: 0 6px 6px 0; padding: 2px 6px; '
'background: #f6f8fa; border: 1px solid #d0d7de; border-radius: 999px;">{}</code>'
"</a>",
url_resolver(value),
value,
),
),
)
return f'<div style="display: flex; flex-wrap: wrap;">{"".join(tags)}</div>'
@@ -195,21 +225,21 @@ def render_property_links(prop_name: str, prop_info: dict[str, Any], machine_adm
if machine_admin_url:
links.append(str(format_html('<a href="{}">Edit override</a>', machine_admin_url)))
fallback = prop_info.get('x-fallback')
fallback = prop_info.get("x-fallback")
if isinstance(fallback, str) and fallback:
links.append(str(format_html('<a href="{}">Fallback: <code>{}</code></a>', get_live_config_url(fallback), fallback)))
aliases = prop_info.get('x-aliases') or []
aliases = prop_info.get("x-aliases") or []
if isinstance(aliases, list):
for alias in aliases:
if isinstance(alias, str) and alias:
links.append(str(format_html('<a href="{}">Alias: <code>{}</code></a>', get_live_config_url(alias), alias)))
default = prop_info.get('default')
if prop_name.endswith('_BINARY') and isinstance(default, str) and default:
default = prop_info.get("default")
if prop_name.endswith("_BINARY") and isinstance(default, str) and default:
links.append(str(format_html('<a href="{}">Binary: <code>{}</code></a>', get_environment_binary_url(default), default)))
return ' &nbsp; '.join(links)
return " &nbsp; ".join(links)
def render_config_properties_html(properties: dict[str, Any], machine_admin_url: str | None) -> str:
@@ -221,42 +251,48 @@ def render_config_properties_html(properties: dict[str, Any], machine_admin_url:
header_links.insert(0, str(format_html('<a href="{}">Machine Config Editor</a>', machine_admin_url)))
cards = [
f'<div style="margin: 0 0 16px 0;">{" &nbsp; | &nbsp; ".join(header_links)}</div>'
f'<div style="margin: 0 0 16px 0;">{" &nbsp; | &nbsp; ".join(header_links)}</div>',
]
for prop_name, prop_info in properties.items():
prop_type = prop_info.get('type', 'unknown')
prop_type = prop_info.get("type", "unknown")
if isinstance(prop_type, list):
prop_type = ' | '.join(str(type_name) for type_name in prop_type)
prop_desc = prop_info.get('description', '')
prop_type = " | ".join(str(type_name) for type_name in prop_type)
prop_desc = prop_info.get("description", "")
default_html = ''
if 'default' in prop_info:
default_html = str(format_html(
'<div style="margin-top: 6px;"><b>Default:</b> <code>{}</code></div>',
prop_info['default'],
))
default_html = ""
if "default" in prop_info:
default_html = str(
format_html(
'<div style="margin-top: 6px;"><b>Default:</b> <code>{}</code></div>',
prop_info["default"],
),
)
description_html = prop_desc or mark_safe('<span style="color: #6e7781;">(no description)</span>')
cards.append(str(format_html(
'<div style="margin: 0 0 14px 0; padding: 12px; background: #f6f8fa; border: 1px solid #d0d7de; border-radius: 6px;">'
'<div style="margin-bottom: 6px;">'
'<a href="{}" style="font-weight: 600;"><code>{}</code></a>'
' <span style="color: #6e7781;">({})</span>'
'</div>'
'<div style="margin-bottom: 6px;">{}</div>'
'<div style="font-size: 0.95em;">{}</div>'
'{}'
'</div>',
get_live_config_url(prop_name),
prop_name,
prop_type,
description_html,
mark_safe(render_property_links(prop_name, prop_info, machine_admin_url)),
mark_safe(default_html),
)))
cards.append(
str(
format_html(
'<div style="margin: 0 0 14px 0; padding: 12px; background: #f6f8fa; border: 1px solid #d0d7de; border-radius: 6px;">'
'<div style="margin-bottom: 6px;">'
'<a href="{}" style="font-weight: 600;"><code>{}</code></a>'
' <span style="color: #6e7781;">({})</span>'
"</div>"
'<div style="margin-bottom: 6px;">{}</div>'
'<div style="font-size: 0.95em;">{}</div>'
"{}"
"</div>",
get_live_config_url(prop_name),
prop_name,
prop_type,
description_html,
mark_safe(render_property_links(prop_name, prop_info, machine_admin_url)),
mark_safe(default_html),
),
),
)
return ''.join(cards)
return "".join(cards)
def render_hook_links_html(plugin_name: str, hooks: list[str], source: str) -> str:
@@ -265,40 +301,47 @@ def render_hook_links_html(plugin_name: str, hooks: list[str], source: str) -> s
items = []
for hook_name in hooks:
if source == 'builtin':
items.append(str(format_html(
'<div style="margin: 0 0 8px 0;">'
'<a href="{}" target="_blank" rel="noopener noreferrer"><code>{}</code></a>'
'</div>',
get_plugin_hook_source_url(plugin_name, hook_name),
hook_name,
)))
if source == "builtin":
items.append(
str(
format_html(
'<div style="margin: 0 0 8px 0;"><a href="{}" target="_blank" rel="noopener noreferrer"><code>{}</code></a></div>',
get_plugin_hook_source_url(plugin_name, hook_name),
hook_name,
),
),
)
else:
items.append(str(format_html(
'<div style="margin: 0 0 8px 0;"><code>{}</code></div>',
hook_name,
)))
return ''.join(items)
items.append(
str(
format_html(
'<div style="margin: 0 0 8px 0;"><code>{}</code></div>',
hook_name,
),
),
)
return "".join(items)
def render_binary_detail_description(name: str, merged: dict[str, Any], db_binary: Any) -> str:
installed_binary_url = get_installed_binary_change_url(name, db_binary)
if installed_binary_url:
return str(format_html(
'<code>{}</code><br/>'
'<a href="{}">View Installed Binary Record</a>',
merged['abspath'],
installed_binary_url,
))
return str(
format_html(
'<code>{}</code><br/><a href="{}">View Installed Binary Record</a>',
merged["abspath"],
installed_binary_url,
),
)
return str(format_html('<code>{}</code>', merged['abspath']))
return str(format_html("<code>{}</code>", merged["abspath"]))
def obj_to_yaml(obj: Any, indent: int = 0) -> str:
indent_str = " " * indent
if indent == 0:
indent_str = '\n' # put extra newline between top-level entries
indent_str = "\n" # put extra newline between top-level entries
if isinstance(obj, dict):
if not obj:
@@ -326,11 +369,11 @@ def obj_to_yaml(obj: Any, indent: int = 0) -> str:
return f" {str(obj)}"
elif callable(obj):
source = '\n'.join(
'' if 'def ' in line else line
for line in inspect.getsource(obj).split('\n')
if line.strip()
).split('lambda: ')[-1].rstrip(',')
source = (
"\n".join("" if "def " in line else line for line in inspect.getsource(obj).split("\n") if line.strip())
.split("lambda: ")[-1]
.rstrip(",")
)
return f" {indent_str} " + source.replace("\n", f"\n{indent_str} ")
else:
@@ -350,67 +393,64 @@ def _binary_sort_key(binary: Binary) -> tuple[int, int, int, Any]:
)
def get_db_binaries_by_name() -> Dict[str, Binary]:
grouped: Dict[str, list[Binary]] = {}
def get_db_binaries_by_name() -> dict[str, Binary]:
grouped: dict[str, list[Binary]] = {}
for binary in Binary.objects.all():
grouped.setdefault(canonical_binary_name(binary.name), []).append(binary)
return {
name: max(records, key=_binary_sort_key)
for name, records in grouped.items()
}
return {name: max(records, key=_binary_sort_key) for name, records in grouped.items()}
def serialize_binary_record(name: str, binary: Binary | None) -> Dict[str, Any]:
def serialize_binary_record(name: str, binary: Binary | None) -> dict[str, Any]:
is_installed = bool(binary and binary.status == Binary.StatusChoices.INSTALLED)
return {
'name': canonical_binary_name(name),
'version': str(getattr(binary, 'version', '') or ''),
'binprovider': str(getattr(binary, 'binprovider', '') or ''),
'abspath': str(getattr(binary, 'abspath', '') or ''),
'sha256': str(getattr(binary, 'sha256', '') or ''),
'status': str(getattr(binary, 'status', '') or ''),
'is_available': is_installed and bool(getattr(binary, 'abspath', '') or ''),
"name": canonical_binary_name(name),
"version": str(getattr(binary, "version", "") or ""),
"binprovider": str(getattr(binary, "binprovider", "") or ""),
"abspath": str(getattr(binary, "abspath", "") or ""),
"sha256": str(getattr(binary, "sha256", "") or ""),
"status": str(getattr(binary, "status", "") or ""),
"is_available": is_installed and bool(getattr(binary, "abspath", "") or ""),
}
def get_filesystem_plugins() -> Dict[str, Dict[str, Any]]:
def get_filesystem_plugins() -> dict[str, dict[str, Any]]:
"""Discover plugins from filesystem directories."""
import json
from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR
plugins = {}
for base_dir, source in [(BUILTIN_PLUGINS_DIR, 'builtin'), (USER_PLUGINS_DIR, 'user')]:
for base_dir, source in [(BUILTIN_PLUGINS_DIR, "builtin"), (USER_PLUGINS_DIR, "user")]:
if not base_dir.exists():
continue
for plugin_dir in base_dir.iterdir():
if plugin_dir.is_dir() and not plugin_dir.name.startswith('_'):
plugin_id = f'{source}.{plugin_dir.name}'
if plugin_dir.is_dir() and not plugin_dir.name.startswith("_"):
plugin_id = f"{source}.{plugin_dir.name}"
# Find hook scripts
hooks = []
for ext in ('sh', 'py', 'js'):
hooks.extend(plugin_dir.glob(f'on_*__*.{ext}'))
for ext in ("sh", "py", "js"):
hooks.extend(plugin_dir.glob(f"on_*__*.{ext}"))
# Load config.json if it exists
config_file = plugin_dir / 'config.json'
config_file = plugin_dir / "config.json"
config_data = None
if config_file.exists():
try:
with open(config_file, 'r') as f:
with open(config_file) as f:
config_data = json.load(f)
except (json.JSONDecodeError, IOError):
except (json.JSONDecodeError, OSError):
config_data = None
plugins[plugin_id] = {
'id': plugin_id,
'name': plugin_dir.name,
'path': str(plugin_dir),
'source': source,
'hooks': [str(h.name) for h in hooks],
'config': config_data,
"id": plugin_id,
"name": plugin_dir.name,
"path": str(plugin_dir),
"source": source,
"hooks": [str(h.name) for h in hooks],
"config": config_data,
}
return plugins
@@ -418,7 +458,7 @@ def get_filesystem_plugins() -> Dict[str, Dict[str, Any]]:
@render_with_table_view
def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
assert is_superuser(request), 'Must be a superuser to view configuration settings.'
assert is_superuser(request), "Must be a superuser to view configuration settings."
rows = {
"Binary Name": [],
@@ -433,16 +473,16 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
for name in all_binary_names:
merged = serialize_binary_record(name, db_binaries.get(name))
rows['Binary Name'].append(ItemLink(name, key=name))
rows["Binary Name"].append(ItemLink(name, key=name))
if merged['is_available']:
rows['Found Version'].append(f"{merged['version']}" if merged['version'] else '✅ found')
rows['Provided By'].append(merged['binprovider'] or '-')
rows['Found Abspath'].append(merged['abspath'] or '-')
if merged["is_available"]:
rows["Found Version"].append(f"{merged['version']}" if merged["version"] else "✅ found")
rows["Provided By"].append(merged["binprovider"] or "-")
rows["Found Abspath"].append(merged["abspath"] or "-")
else:
rows['Found Version'].append('❌ missing')
rows['Provided By'].append('-')
rows['Found Abspath'].append('-')
rows["Found Version"].append("❌ missing")
rows["Provided By"].append("-")
rows["Found Abspath"].append("-")
return TableContext(
title="Binaries",
@@ -452,23 +492,23 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
@render_with_item_view
def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
assert is_superuser(request), 'Must be a superuser to view configuration settings.'
assert is_superuser(request), "Must be a superuser to view configuration settings."
key = canonical_binary_name(key)
db_binary = get_db_binaries_by_name().get(key)
merged = serialize_binary_record(key, db_binary)
if merged['is_available']:
if merged["is_available"]:
section: SectionData = {
"name": key,
"description": mark_safe(render_binary_detail_description(key, merged, db_binary)),
"fields": {
'name': key,
'binprovider': merged['binprovider'] or '-',
'abspath': merged['abspath'] or 'not found',
'version': merged['version'] or 'unknown',
'sha256': merged['sha256'],
'status': merged['status'],
"name": key,
"binprovider": merged["binprovider"] or "-",
"abspath": merged["abspath"] or "not found",
"version": merged["version"] or "unknown",
"sha256": merged["sha256"],
"status": merged["status"],
},
"help_texts": {},
}
@@ -482,11 +522,11 @@ def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
"name": key,
"description": "No persisted Binary record found",
"fields": {
'name': key,
'binprovider': merged['binprovider'] or 'not recorded',
'abspath': merged['abspath'] or 'not recorded',
'version': merged['version'] or 'N/A',
'status': merged['status'] or 'unrecorded',
"name": key,
"binprovider": merged["binprovider"] or "not recorded",
"abspath": merged["abspath"] or "not recorded",
"version": merged["version"] or "N/A",
"status": merged["status"] or "unrecorded",
},
"help_texts": {},
}
@@ -499,7 +539,7 @@ def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
@render_with_table_view
def plugins_list_view(request: HttpRequest, **kwargs) -> TableContext:
assert is_superuser(request), 'Must be a superuser to view configuration settings.'
assert is_superuser(request), "Must be a superuser to view configuration settings."
rows = {
"Name": [],
@@ -512,26 +552,26 @@ def plugins_list_view(request: HttpRequest, **kwargs) -> TableContext:
plugins = get_filesystem_plugins()
for plugin_id, plugin in plugins.items():
rows['Name'].append(ItemLink(plugin['name'], key=plugin_id))
rows['Source'].append(plugin['source'])
rows['Path'].append(format_html('<code>{}</code>', plugin['path']))
rows['Hooks'].append(', '.join(plugin['hooks']) or '(none)')
rows["Name"].append(ItemLink(plugin["name"], key=plugin_id))
rows["Source"].append(plugin["source"])
rows["Path"].append(format_html("<code>{}</code>", plugin["path"]))
rows["Hooks"].append(", ".join(plugin["hooks"]) or "(none)")
# Show config status
if plugin.get('config'):
config_properties = plugin['config'].get('properties', {})
if plugin.get("config"):
config_properties = plugin["config"].get("properties", {})
config_count = len(config_properties)
rows['Config'].append(f'{config_count} properties' if config_count > 0 else '✅ present')
rows["Config"].append(f"{config_count} properties" if config_count > 0 else "✅ present")
else:
rows['Config'].append('❌ none')
rows["Config"].append("❌ none")
if not plugins:
# Show a helpful message when no plugins found
rows['Name'].append('(no plugins found)')
rows['Source'].append('-')
rows['Path'].append(mark_safe('<code>abx_plugins/plugins/</code> or <code>data/custom_plugins/</code>'))
rows['Hooks'].append('-')
rows['Config'].append('-')
rows["Name"].append("(no plugins found)")
rows["Source"].append("-")
rows["Path"].append(mark_safe("<code>abx_plugins/plugins/</code> or <code>data/custom_plugins/</code>"))
rows["Hooks"].append("-")
rows["Config"].append("-")
return TableContext(
title="Installed plugins",
@@ -541,7 +581,7 @@ def plugins_list_view(request: HttpRequest, **kwargs) -> TableContext:
@render_with_item_view
def plugin_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
assert is_superuser(request), 'Must be a superuser to view configuration settings.'
assert is_superuser(request), "Must be a superuser to view configuration settings."
plugins = get_filesystem_plugins()
@@ -549,65 +589,75 @@ def plugin_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
if not plugin:
return ItemContext(
slug=key,
title=f'Plugin not found: {key}',
title=f"Plugin not found: {key}",
data=[],
)
# Base fields that all plugins have
docs_url = get_plugin_docs_url(plugin['name'])
docs_url = get_plugin_docs_url(plugin["name"])
machine_admin_url = get_machine_admin_url()
fields = {
"id": plugin['id'],
"name": plugin['name'],
"source": plugin['source'],
"id": plugin["id"],
"name": plugin["name"],
"source": plugin["source"],
}
sections: list[SectionData] = [{
"name": plugin['name'],
"description": format_html(
'<code>{}</code><br/><a href="{}" target="_blank" rel="noopener noreferrer">ABX Plugin Docs</a>',
plugin['path'],
docs_url,
),
"fields": fields,
"help_texts": {},
}]
if plugin['hooks']:
sections.append({
"name": "Hooks",
"description": mark_safe(render_hook_links_html(plugin['name'], plugin['hooks'], plugin['source'])),
"fields": {},
sections: list[SectionData] = [
{
"name": plugin["name"],
"description": format_html(
'<code>{}</code><br/><a href="{}" target="_blank" rel="noopener noreferrer">ABX Plugin Docs</a>',
plugin["path"],
docs_url,
),
"fields": fields,
"help_texts": {},
})
},
]
if plugin.get('config'):
sections.append({
"name": "Plugin Metadata",
"description": mark_safe(render_plugin_metadata_html(plugin['config'])),
"fields": {},
"help_texts": {},
})
sections.append({
"name": "config.json",
"description": mark_safe(render_highlighted_json_block(plugin['config'])),
"fields": {},
"help_texts": {},
})
config_properties = plugin['config'].get('properties', {})
if config_properties:
sections.append({
"name": "Config Properties",
"description": mark_safe(render_config_properties_html(config_properties, machine_admin_url)),
if plugin["hooks"]:
sections.append(
{
"name": "Hooks",
"description": mark_safe(render_hook_links_html(plugin["name"], plugin["hooks"], plugin["source"])),
"fields": {},
"help_texts": {},
})
},
)
if plugin.get("config"):
sections.append(
{
"name": "Plugin Metadata",
"description": mark_safe(render_plugin_metadata_html(plugin["config"])),
"fields": {},
"help_texts": {},
},
)
sections.append(
{
"name": "config.json",
"description": mark_safe(render_highlighted_json_block(plugin["config"])),
"fields": {},
"help_texts": {},
},
)
config_properties = plugin["config"].get("properties", {})
if config_properties:
sections.append(
{
"name": "Config Properties",
"description": mark_safe(render_config_properties_html(config_properties, machine_admin_url)),
"fields": {},
"help_texts": {},
},
)
return ItemContext(
slug=key,
title=plugin['name'],
title=plugin["name"],
data=sections,
)
@@ -648,20 +698,20 @@ def worker_list_view(request: HttpRequest, **kwargs) -> TableContext:
all_config[config_name] = config_data
# Add top row for supervisord process manager
rows["Name"].append(ItemLink('supervisord', key='supervisord'))
rows["Name"].append(ItemLink("supervisord", key="supervisord"))
supervisor_state = supervisor.getState()
rows["State"].append(str(supervisor_state.get('statename') if isinstance(supervisor_state, dict) else ''))
rows['PID'].append(str(supervisor.getPID()))
rows["Started"].append('-')
rows["Command"].append('supervisord --configuration=tmp/supervisord.conf')
rows["State"].append(str(supervisor_state.get("statename") if isinstance(supervisor_state, dict) else ""))
rows["PID"].append(str(supervisor.getPID()))
rows["Started"].append("-")
rows["Command"].append("supervisord --configuration=tmp/supervisord.conf")
rows["Logfile"].append(
format_html(
'<a href="/admin/environment/logs/{}/">{}</a>',
'supervisord',
'logs/supervisord.log',
)
"supervisord",
"logs/supervisord.log",
),
)
rows['Exit Status'].append('0')
rows["Exit Status"].append("0")
# Add a row for each worker process managed by supervisord
process_items = supervisor.getAllProcessInfo()
@@ -678,15 +728,15 @@ def worker_list_view(request: HttpRequest, **kwargs) -> TableContext:
rows["Name"].append(ItemLink(proc_name, key=proc_name))
rows["State"].append(str(proc_data.get("statename") or ""))
rows['PID'].append(proc_description.replace('pid ', ''))
rows["PID"].append(proc_description.replace("pid ", ""))
rows["Started"].append(format_parsed_datetime(proc_start))
rows["Command"].append(str(proc_config.get("command") or ""))
rows["Logfile"].append(
format_html(
'<a href="/admin/environment/logs/{}/">{}</a>',
proc_logfile.split("/")[-1].split('.')[0],
proc_logfile.split("/")[-1].split(".")[0],
proc_logfile,
)
),
)
rows["Exit Status"].append(str(proc_data.get("exitstatus") or ""))
@@ -708,8 +758,8 @@ def worker_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
supervisor = get_existing_supervisord_process()
if supervisor is None:
return ItemContext(
slug='none',
title='error: No running supervisord process.',
slug="none",
title="error: No running supervisord process.",
data=[],
)
@@ -721,7 +771,7 @@ def worker_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
if isinstance(config_data, dict):
all_config.append(config_data)
if key == 'supervisord':
if key == "supervisord":
relevant_config = CONFIG_FILE.read_text()
relevant_logs = str(supervisor.readLog(0, 10_000_000))
start_ts = [line for line in relevant_logs.split("\n") if "RPC interface 'supervisor' initialized" in line][-1].split(",", 1)[0]
@@ -729,7 +779,7 @@ def worker_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
uptime = str(timezone.now() - start_dt).split(".")[0] if start_dt else ""
supervisor_state = supervisor.getState()
proc: Dict[str, object] = {
proc: dict[str, object] = {
"name": "supervisord",
"pid": supervisor.getPID(),
"statename": str(supervisor_state.get("statename") if isinstance(supervisor_state, dict) else ""),
@@ -737,12 +787,12 @@ def worker_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
"stop": None,
"exitstatus": "",
"stdout_logfile": "logs/supervisord.log",
"description": f'pid 000, uptime {uptime}',
"description": f"pid 000, uptime {uptime}",
}
else:
worker_data = get_worker(supervisor, key)
proc = worker_data if isinstance(worker_data, dict) else {}
relevant_config = next((config for config in all_config if config.get('name') == key), {})
relevant_config = next((config for config in all_config if config.get("name") == key), {})
log_result = supervisor.tailProcessStdoutLog(key, 0, 10_000_000)
relevant_logs = str(log_result[0] if isinstance(log_result, tuple) else log_result)
@@ -775,7 +825,6 @@ def worker_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
def log_list_view(request: HttpRequest, **kwargs) -> TableContext:
assert is_superuser(request), "Must be a superuser to view configuration settings."
log_files: list[Path] = []
for logfile in sorted(CONSTANTS.LOGS_DIR.glob("*.log"), key=os.path.getmtime)[::-1]:
if isinstance(logfile, Path):
@@ -793,14 +842,14 @@ def log_list_view(request: HttpRequest, **kwargs) -> TableContext:
st = logfile.stat()
rows["Name"].append(ItemLink("logs" + str(logfile).rsplit("/logs", 1)[-1], key=logfile.name))
rows["Last Updated"].append(format_parsed_datetime(st.st_mtime))
rows["Size"].append(f'{st.st_size//1000} kb')
rows["Size"].append(f"{st.st_size // 1000} kb")
with open(logfile, 'rb') as f:
with open(logfile, "rb") as f:
try:
f.seek(-1024, os.SEEK_END)
except OSError:
f.seek(0)
last_lines = f.read().decode('utf-8', errors='replace').split("\n")
last_lines = f.read().decode("utf-8", errors="replace").split("\n")
non_empty_lines = [line for line in last_lines if line.strip()]
rows["Most Recent Lines"].append(non_empty_lines[-1])
@@ -814,7 +863,7 @@ def log_list_view(request: HttpRequest, **kwargs) -> TableContext:
def log_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
assert is_superuser(request), "Must be a superuser to view configuration settings."
log_file = [logfile for logfile in CONSTANTS.LOGS_DIR.glob('*.log') if key in logfile.name][0]
log_file = [logfile for logfile in CONSTANTS.LOGS_DIR.glob("*.log") if key in logfile.name][0]
log_text = log_file.read_text()
log_stat = log_file.stat()
@@ -824,7 +873,7 @@ def log_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
"description": key,
"fields": {
"Path": str(log_file),
"Size": f"{log_stat.st_size//1000} kb",
"Size": f"{log_stat.st_size // 1000} kb",
"Last Updated": format_parsed_datetime(log_stat.st_mtime),
"Tail": "\n".join(log_text[-10_000:].split("\n")[-20:]),
"Full Log": log_text,

View File

@@ -1,10 +1,11 @@
__package__ = 'archivebox.core'
__package__ = "archivebox.core"
__order__ = 100
def register_admin(admin_site):
"""Register the core.models views (Snapshot, ArchiveResult, Tag, etc.) with the admin site"""
from archivebox.core.admin import register_admin as do_register
do_register(admin_site)
@@ -17,11 +18,12 @@ def get_CONFIG():
ARCHIVING_CONFIG,
SEARCH_BACKEND_CONFIG,
)
return {
'SHELL_CONFIG': SHELL_CONFIG,
'STORAGE_CONFIG': STORAGE_CONFIG,
'GENERAL_CONFIG': GENERAL_CONFIG,
'SERVER_CONFIG': SERVER_CONFIG,
'ARCHIVING_CONFIG': ARCHIVING_CONFIG,
'SEARCHBACKEND_CONFIG': SEARCH_BACKEND_CONFIG,
"SHELL_CONFIG": SHELL_CONFIG,
"STORAGE_CONFIG": STORAGE_CONFIG,
"GENERAL_CONFIG": GENERAL_CONFIG,
"SERVER_CONFIG": SERVER_CONFIG,
"ARCHIVING_CONFIG": ARCHIVING_CONFIG,
"SEARCHBACKEND_CONFIG": SEARCH_BACKEND_CONFIG,
}

View File

@@ -1,4 +1,4 @@
__package__ = 'archivebox.core'
__package__ = "archivebox.core"
from django.contrib.auth import get_user_model

View File

@@ -1,4 +1,4 @@
__package__ = 'archivebox.core'
__package__ = "archivebox.core"
import html
import json
@@ -21,57 +21,45 @@ from django.utils.text import smart_split
from archivebox.config import DATA_DIR
from archivebox.config.common import SERVER_CONFIG
from archivebox.misc.paginators import AccelleratedPaginator
from archivebox.misc.paginators import AcceleratedPaginator
from archivebox.base_models.admin import BaseModelAdmin
from archivebox.hooks import get_plugin_icon
from archivebox.core.host_utils import build_snapshot_url
from archivebox.core.widgets import InlineTagEditorWidget
from archivebox.core.views import LIVE_PLUGIN_BASE_URL
from archivebox.machine.env_utils import env_to_shell_exports
from archivebox.core.models import ArchiveResult, Snapshot
def _stringify_env_value(value) -> str:
if value is None:
return ''
if isinstance(value, str):
return value
return json.dumps(value, separators=(',', ':'))
def _quote_shell_string(value: str) -> str:
return "'" + str(value).replace("'", "'\"'\"'") + "'"
def _get_replay_source_url(result: ArchiveResult) -> str:
process_env = getattr(getattr(result, 'process', None), 'env', None) or {}
return str(process_env.get('SOURCE_URL') or result.snapshot.url or '')
process_env = getattr(getattr(result, "process", None), "env", None) or {}
return str(process_env.get("SOURCE_URL") or result.snapshot.url or "")
def build_abx_dl_display_command(result: ArchiveResult) -> str:
source_url = _get_replay_source_url(result)
plugin_name = str(result.plugin or '').strip()
plugin_name = str(result.plugin or "").strip()
if not plugin_name and not source_url:
return 'abx-dl'
return "abx-dl"
if not source_url:
return f'abx-dl --plugins={plugin_name}'
return f'abx-dl --plugins={plugin_name} {_quote_shell_string(source_url)}'
return f"abx-dl --plugins={plugin_name}"
return f"abx-dl --plugins={plugin_name} {_quote_shell_string(source_url)}"
def build_abx_dl_replay_command(result: ArchiveResult) -> str:
display_command = build_abx_dl_display_command(result)
process = getattr(result, 'process', None)
env = getattr(process, 'env', None) or {}
env_items = ' '.join(
f'{key}={shlex.quote(_stringify_env_value(value))}'
for key, value in sorted(env.items())
if value is not None
)
process = getattr(result, "process", None)
env_items = env_to_shell_exports(getattr(process, "env", None) or {})
snapshot_dir = shlex.quote(str(result.snapshot_dir))
if env_items:
return f'cd {snapshot_dir}; env {env_items} {display_command}'
return f'cd {snapshot_dir}; {display_command}'
return f"cd {snapshot_dir}; env {env_items} {display_command}"
return f"cd {snapshot_dir}; {display_command}"
def get_plugin_admin_url(plugin_name: str) -> str:
@@ -81,50 +69,87 @@ def get_plugin_admin_url(plugin_name: str) -> str:
if plugin_dir:
builtin_root = BUILTIN_PLUGINS_DIR.resolve()
if plugin_dir.is_relative_to(builtin_root):
return f'{LIVE_PLUGIN_BASE_URL}builtin.{quote(plugin_name)}/'
return f"{LIVE_PLUGIN_BASE_URL}builtin.{quote(plugin_name)}/"
user_root = USER_PLUGINS_DIR.resolve()
if plugin_dir.is_relative_to(user_root):
return f'{LIVE_PLUGIN_BASE_URL}user.{quote(plugin_name)}/'
return f"{LIVE_PLUGIN_BASE_URL}user.{quote(plugin_name)}/"
return f'{LIVE_PLUGIN_BASE_URL}builtin.{quote(plugin_name)}/'
return f"{LIVE_PLUGIN_BASE_URL}builtin.{quote(plugin_name)}/"
def render_archiveresults_list(archiveresults_qs, limit=50):
"""Render a nice inline list view of archive results with status, plugin, output, and actions."""
results = list(archiveresults_qs.order_by('plugin').select_related('snapshot')[:limit])
result_ids = list(archiveresults_qs.order_by("plugin").values_list("pk", flat=True)[:limit])
if not result_ids:
return mark_safe('<div style="color: #64748b; font-style: italic; padding: 16px 0;">No Archive Results yet...</div>')
results_by_id = {
result.pk: result
for result in ArchiveResult.objects.filter(pk__in=result_ids).select_related("snapshot", "process", "process__machine")
}
results = [results_by_id[result_id] for result_id in result_ids if result_id in results_by_id]
if not results:
return mark_safe('<div style="color: #64748b; font-style: italic; padding: 16px 0;">No Archive Results yet...</div>')
# Status colors
status_colors = {
'succeeded': ('#166534', '#dcfce7'), # green
'failed': ('#991b1b', '#fee2e2'), # red
'queued': ('#6b7280', '#f3f4f6'), # gray
'started': ('#92400e', '#fef3c7'), # amber
'backoff': ('#92400e', '#fef3c7'),
'skipped': ('#475569', '#f1f5f9'),
'noresults': ('#475569', '#f1f5f9'),
"succeeded": ("#166534", "#dcfce7"), # green
"failed": ("#991b1b", "#fee2e2"), # red
"queued": ("#6b7280", "#f3f4f6"), # gray
"started": ("#92400e", "#fef3c7"), # amber
"backoff": ("#92400e", "#fef3c7"),
"skipped": ("#475569", "#f1f5f9"),
"noresults": ("#475569", "#f1f5f9"),
}
rows = []
for idx, result in enumerate(results):
status = result.status or 'queued'
color, bg = status_colors.get(status, ('#6b7280', '#f3f4f6'))
status = result.status or "queued"
color, bg = status_colors.get(status, ("#6b7280", "#f3f4f6"))
output_files = result.output_files or {}
if isinstance(output_files, dict):
output_file_count = len(output_files)
elif isinstance(output_files, (list, tuple, set)):
output_file_count = len(output_files)
elif isinstance(output_files, str):
try:
parsed = json.loads(output_files)
output_file_count = len(parsed) if isinstance(parsed, (dict, list, tuple, set)) else 0
except Exception:
output_file_count = 0
else:
output_file_count = 0
# Get plugin icon
icon = get_plugin_icon(result.plugin)
# Format timestamp
end_time = result.end_ts.strftime('%Y-%m-%d %H:%M:%S') if result.end_ts else '-'
end_time = result.end_ts.strftime("%Y-%m-%d %H:%M:%S") if result.end_ts else "-"
process_display = "-"
if result.process_id and result.process:
process_display = f'''
<a href="{reverse("admin:machine_process_change", args=[result.process_id])}"
style="color: #2563eb; text-decoration: none; font-family: ui-monospace, monospace; font-size: 12px;"
title="View process">{result.process.pid or "-"}</a>
'''
machine_display = "-"
if result.process_id and result.process and result.process.machine_id:
machine_display = f'''
<a href="{reverse("admin:machine_machine_change", args=[result.process.machine_id])}"
style="color: #2563eb; text-decoration: none; font-size: 12px;"
title="View machine">{result.process.machine.hostname}</a>
'''
# Truncate output for display
full_output = result.output_str or '-'
full_output = result.output_str or "-"
output_display = full_output[:60]
if len(full_output) > 60:
output_display += '...'
output_display += "..."
display_cmd = build_abx_dl_display_command(result)
replay_cmd = build_abx_dl_replay_command(result)
@@ -132,23 +157,23 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
cmd_attr = html.escape(replay_cmd, quote=True)
# Build output link - use embed_path() which checks output_files first
embed_path = result.embed_path() if hasattr(result, 'embed_path') else None
snapshot_id = str(getattr(result, 'snapshot_id', ''))
if embed_path and result.status == 'succeeded':
embed_path = result.embed_path() if hasattr(result, "embed_path") else None
snapshot_id = str(getattr(result, "snapshot_id", ""))
if embed_path and result.status == "succeeded":
output_link = build_snapshot_url(snapshot_id, embed_path)
else:
output_link = build_snapshot_url(snapshot_id, '')
output_link = build_snapshot_url(snapshot_id, "")
# Get version - try cmd_version field
version = result.cmd_version if result.cmd_version else '-'
version = result.cmd_version if result.cmd_version else "-"
# Unique ID for this row's expandable output
row_id = f'output_{idx}_{str(result.id)[:8]}'
row_id = f"output_{idx}_{str(result.id)[:8]}"
rows.append(f'''
<tr style="border-bottom: 1px solid #f1f5f9; transition: background 0.15s;" onmouseover="this.style.background='#f8fafc'" onmouseout="this.style.background='transparent'">
<td style="padding: 10px 12px; white-space: nowrap;">
<a href="{reverse('admin:core_archiveresult_change', args=[result.id])}"
<a href="{reverse("admin:core_archiveresult_change", args=[result.id])}"
style="color: #2563eb; text-decoration: none; font-family: ui-monospace, monospace; font-size: 11px;"
title="View/edit archive result">
<code>{str(result.id)[-8:]}</code>
@@ -178,9 +203,18 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
{output_display}
</span>
</td>
<td style="padding: 10px 12px; white-space: nowrap; color: #64748b; font-size: 12px; text-align: right;">
{output_file_count}
</td>
<td style="padding: 10px 12px; white-space: nowrap; color: #64748b; font-size: 12px;">
{end_time}
</td>
<td style="padding: 10px 12px; white-space: nowrap;">
{process_display}
</td>
<td style="padding: 10px 12px; white-space: nowrap;">
{machine_display}
</td>
<td style="padding: 10px 12px; white-space: nowrap; font-family: ui-monospace, monospace; font-size: 11px; color: #64748b;">
{version}
</td>
@@ -189,14 +223,14 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
<a href="{output_link}" target="_blank"
style="padding: 4px 8px; background: #f1f5f9; border-radius: 4px; color: #475569; text-decoration: none; font-size: 11px;"
title="View output">📄</a>
<a href="{reverse('admin:core_archiveresult_change', args=[result.id])}"
<a href="{reverse("admin:core_archiveresult_change", args=[result.id])}"
style="padding: 4px 8px; background: #f1f5f9; border-radius: 4px; color: #475569; text-decoration: none; font-size: 11px;"
title="Edit">✏️</a>
</div>
</td>
</tr>
<tr style="border-bottom: 1px solid #e2e8f0;">
<td colspan="8" style="padding: 0 12px 10px 12px;">
<td colspan="11" style="padding: 0 12px 10px 12px;">
<details id="{row_id}" style="margin: 0;">
<summary style="cursor: pointer; font-size: 11px; color: #94a3b8; user-select: none;">
Details &amp; Output
@@ -205,7 +239,7 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
<div style="font-size: 11px; color: #64748b; margin-bottom: 8px;">
<span style="margin-right: 16px;"><b>ID:</b> <code>{str(result.id)}</code></span>
<span style="margin-right: 16px;"><b>Version:</b> <code>{version}</code></span>
<span style="margin-right: 16px;"><b>PWD:</b> <code>{result.pwd or '-'}</code></span>
<span style="margin-right: 16px;"><b>PWD:</b> <code>{result.pwd or "-"}</code></span>
</div>
<div style="font-size: 11px; color: #64748b; margin-bottom: 8px;">
<b>Output:</b>
@@ -230,19 +264,19 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
''')
total_count = archiveresults_qs.count()
footer = ''
footer = ""
if total_count > limit:
footer = f'''
footer = f"""
<tr>
<td colspan="8" style="padding: 12px; text-align: center; color: #64748b; font-size: 13px; background: #f8fafc;">
<td colspan="11" style="padding: 12px; text-align: center; color: #64748b; font-size: 13px; background: #f8fafc;">
Showing {limit} of {total_count} results &nbsp;
<a href="/admin/core/archiveresult/?snapshot__id__exact={results[0].snapshot_id if results else ''}"
<a href="/admin/core/archiveresult/?snapshot__id__exact={results[0].snapshot_id if results else ""}"
style="color: #2563eb;">View all →</a>
</td>
</tr>
'''
"""
return mark_safe(f'''
return mark_safe(f"""
<div style="border: 1px solid #e2e8f0; border-radius: 8px; overflow: hidden; background: #fff; width: 100%;">
<table style="width: 100%; border-collapse: collapse; font-size: 14px;">
<thead>
@@ -252,86 +286,92 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
<th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; width: 32px;"></th>
<th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Plugin</th>
<th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Output</th>
<th style="padding: 10px 12px; text-align: right; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Files</th>
<th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Completed</th>
<th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Process</th>
<th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Machine</th>
<th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Version</th>
<th style="padding: 10px 8px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Actions</th>
</tr>
</thead>
<tbody>
{''.join(rows)}
{"".join(rows)}
{footer}
</tbody>
</table>
</div>
''')
""")
class ArchiveResultInline(admin.TabularInline):
name = 'Archive Results Log'
name = "Archive Results Log"
model = ArchiveResult
parent_model = Snapshot
# fk_name = 'snapshot'
extra = 0
sort_fields = ('end_ts', 'plugin', 'output_str', 'status', 'cmd_version')
readonly_fields = ('id', 'result_id', 'completed', 'command', 'version')
fields = ('start_ts', 'end_ts', *readonly_fields, 'plugin', 'cmd', 'cmd_version', 'pwd', 'status', 'output_str')
sort_fields = ("end_ts", "plugin", "output_str", "status", "cmd_version")
readonly_fields = ("id", "result_id", "completed", "command", "version")
fields = ("start_ts", "end_ts", *readonly_fields, "plugin", "cmd", "cmd_version", "pwd", "status", "output_str")
# exclude = ('id',)
ordering = ('end_ts',)
ordering = ("end_ts",)
show_change_link = True
# # classes = ['collapse']
def get_parent_object_from_request(self, request):
resolved = resolve(request.path_info)
try:
return self.parent_model.objects.get(pk=resolved.kwargs['object_id'])
return self.parent_model.objects.get(pk=resolved.kwargs["object_id"])
except (self.parent_model.DoesNotExist, ValidationError):
return None
@admin.display(
description='Completed',
ordering='end_ts',
description="Completed",
ordering="end_ts",
)
def completed(self, obj):
return format_html('<p style="white-space: nowrap">{}</p>', obj.end_ts.strftime('%Y-%m-%d %H:%M:%S'))
return format_html('<p style="white-space: nowrap">{}</p>', obj.end_ts.strftime("%Y-%m-%d %H:%M:%S"))
def result_id(self, obj):
return format_html('<a href="{}"><code style="font-size: 10px">[{}]</code></a>', reverse('admin:core_archiveresult_change', args=(obj.id,)), str(obj.id)[:8])
return format_html(
'<a href="{}"><code style="font-size: 10px">[{}]</code></a>',
reverse("admin:core_archiveresult_change", args=(obj.id,)),
str(obj.id)[:8],
)
def command(self, obj):
return format_html('<small><code>{}</code></small>', " ".join(obj.cmd or []))
return format_html("<small><code>{}</code></small>", " ".join(obj.cmd or []))
def version(self, obj):
return format_html('<small><code>{}</code></small>', obj.cmd_version or '-')
return format_html("<small><code>{}</code></small>", obj.cmd_version or "-")
def get_formset(self, request, obj=None, **kwargs):
formset = super().get_formset(request, obj, **kwargs)
snapshot = self.get_parent_object_from_request(request)
form_class = getattr(formset, 'form', None)
base_fields = getattr(form_class, 'base_fields', {})
snapshot_output_dir = str(snapshot.output_dir) if snapshot else ''
form_class = getattr(formset, "form", None)
base_fields = getattr(form_class, "base_fields", {})
snapshot_output_dir = str(snapshot.output_dir) if snapshot else ""
# import ipdb; ipdb.set_trace()
# formset.form.base_fields['id'].widget = formset.form.base_fields['id'].hidden_widget()
# default values for new entries
base_fields['status'].initial = 'succeeded'
base_fields['start_ts'].initial = timezone.now()
base_fields['end_ts'].initial = timezone.now()
base_fields['cmd_version'].initial = '-'
base_fields['pwd'].initial = snapshot_output_dir
base_fields['cmd'].initial = '["-"]'
base_fields['output_str'].initial = 'Manually recorded cmd output...'
base_fields["status"].initial = "succeeded"
base_fields["start_ts"].initial = timezone.now()
base_fields["end_ts"].initial = timezone.now()
base_fields["cmd_version"].initial = "-"
base_fields["pwd"].initial = snapshot_output_dir
base_fields["cmd"].initial = '["-"]'
base_fields["output_str"].initial = "Manually recorded cmd output..."
if obj is not None:
# hidden values for existing entries and new entries
base_fields['start_ts'].widget = base_fields['start_ts'].hidden_widget()
base_fields['end_ts'].widget = base_fields['end_ts'].hidden_widget()
base_fields['cmd'].widget = base_fields['cmd'].hidden_widget()
base_fields['pwd'].widget = base_fields['pwd'].hidden_widget()
base_fields['cmd_version'].widget = base_fields['cmd_version'].hidden_widget()
base_fields["start_ts"].widget = base_fields["start_ts"].hidden_widget()
base_fields["end_ts"].widget = base_fields["end_ts"].hidden_widget()
base_fields["cmd"].widget = base_fields["cmd"].hidden_widget()
base_fields["pwd"].widget = base_fields["pwd"].hidden_widget()
base_fields["cmd_version"].widget = base_fields["cmd_version"].hidden_widget()
return formset
def get_readonly_fields(self, request, obj=None):
if obj is not None:
return self.readonly_fields
@@ -339,62 +379,122 @@ class ArchiveResultInline(admin.TabularInline):
return []
class ArchiveResultAdmin(BaseModelAdmin):
list_display = ('details_link', 'created_at', 'snapshot_info', 'tags_inline', 'status_badge', 'plugin_with_icon', 'process_link', 'machine_link', 'cmd_str', 'output_str_display')
list_display = (
"details_link",
"zip_link",
"created_at",
"snapshot_info",
"tags_inline",
"status_badge",
"plugin_with_icon",
"process_link",
"machine_link",
"cmd_str",
"output_str_display",
)
list_display_links = None
sort_fields = ('id', 'created_at', 'plugin', 'status')
readonly_fields = ('cmd', 'cmd_version', 'pwd', 'cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'plugin_with_icon', 'process_link')
search_fields = ()
autocomplete_fields = ['snapshot']
sort_fields = ("id", "created_at", "plugin", "status")
readonly_fields = (
"admin_actions",
"cmd",
"cmd_version",
"pwd",
"cmd_str",
"snapshot_info",
"tags_str",
"created_at",
"modified_at",
"output_summary",
"plugin_with_icon",
"process_link",
)
search_fields = (
"snapshot__id",
"snapshot__url",
"snapshot__tags__name",
"snapshot__crawl_id",
"plugin",
"hook_name",
"output_str",
"output_json",
"process__cmd",
)
autocomplete_fields = ["snapshot"]
fieldsets = (
('Snapshot', {
'fields': ('snapshot', 'snapshot_info', 'tags_str'),
'classes': ('card', 'wide'),
}),
('Plugin', {
'fields': ('plugin_with_icon', 'process_link', 'status'),
'classes': ('card',),
}),
('Timing', {
'fields': ('start_ts', 'end_ts', 'created_at', 'modified_at'),
'classes': ('card',),
}),
('Command', {
'fields': ('cmd', 'cmd_str', 'cmd_version', 'pwd'),
'classes': ('card',),
}),
('Output', {
'fields': ('output_str', 'output_json', 'output_files', 'output_size', 'output_mimetypes', 'output_summary'),
'classes': ('card', 'wide'),
}),
(
"Actions",
{
"fields": ("admin_actions",),
"classes": ("card", "wide"),
},
),
(
"Snapshot",
{
"fields": ("snapshot", "snapshot_info", "tags_str"),
"classes": ("card", "wide"),
},
),
(
"Plugin",
{
"fields": ("plugin_with_icon", "process_link", "status"),
"classes": ("card",),
},
),
(
"Timing",
{
"fields": ("start_ts", "end_ts", "created_at", "modified_at"),
"classes": ("card",),
},
),
(
"Command",
{
"fields": ("cmd", "cmd_str", "cmd_version", "pwd"),
"classes": ("card",),
},
),
(
"Output",
{
"fields": ("output_str", "output_json", "output_files", "output_size", "output_mimetypes", "output_summary"),
"classes": ("card", "wide"),
},
),
)
list_filter = ('status', 'plugin', 'start_ts')
ordering = ['-start_ts']
list_filter = ("status", "plugin", "start_ts")
ordering = ["-start_ts"]
list_per_page = SERVER_CONFIG.SNAPSHOTS_PER_PAGE
paginator = AccelleratedPaginator
paginator = AcceleratedPaginator
save_on_top = True
actions = ['delete_selected']
actions = ["delete_selected"]
class Meta:
verbose_name = 'Archive Result'
verbose_name_plural = 'Archive Results'
verbose_name = "Archive Result"
verbose_name_plural = "Archive Results"
def change_view(self, request, object_id, form_url="", extra_context=None):
self.request = request
return super().change_view(request, object_id, form_url, extra_context)
def changelist_view(self, request, extra_context=None):
self.request = request
return super().changelist_view(request, extra_context)
def get_queryset(self, request):
return (
super()
.get_queryset(request)
.select_related('snapshot', 'process')
.prefetch_related('snapshot__tags')
.annotate(snapshot_first_tag=Min('snapshot__tags__name'))
.select_related("snapshot", "process")
.prefetch_related("snapshot__tags")
.annotate(snapshot_first_tag=Min("snapshot__tags__name"))
)
def get_search_results(self, request, queryset, search_term):
@@ -402,15 +502,14 @@ class ArchiveResultAdmin(BaseModelAdmin):
return queryset, False
queryset = queryset.annotate(
snapshot_id_text=Cast('snapshot__id', output_field=TextField()),
snapshot_crawl_id_text=Cast('snapshot__crawl_id', output_field=TextField()),
output_json_text=Cast('output_json', output_field=TextField()),
cmd_text=Cast('process__cmd', output_field=TextField()),
snapshot_id_text=Cast("snapshot__id", output_field=TextField()),
snapshot_crawl_id_text=Cast("snapshot__crawl_id", output_field=TextField()),
output_json_text=Cast("output_json", output_field=TextField()),
cmd_text=Cast("process__cmd", output_field=TextField()),
)
search_bits = [
bit[1:-1] if len(bit) >= 2 and bit[0] == bit[-1] and bit[0] in {'"', "'"} else bit
for bit in smart_split(search_term)
bit[1:-1] if len(bit) >= 2 and bit[0] == bit[-1] and bit[0] in {'"', "'"} else bit for bit in smart_split(search_term)
]
search_bits = [bit.strip() for bit in search_bits if bit.strip()]
if not search_bits:
@@ -427,22 +526,44 @@ class ArchiveResultAdmin(BaseModelAdmin):
| Q(hook_name__icontains=bit)
| Q(output_str__icontains=bit)
| Q(output_json_text__icontains=bit)
| Q(cmd_text__icontains=bit)
| Q(cmd_text__icontains=bit),
)
return queryset.filter(reduce(and_, filters)).distinct(), True
@admin.display(description='Details', ordering='id')
def get_snapshot_view_url(self, result: ArchiveResult) -> str:
return build_snapshot_url(str(result.snapshot_id), request=getattr(self, "request", None))
def get_output_view_url(self, result: ArchiveResult) -> str:
output_path = result.embed_path() if hasattr(result, "embed_path") else None
if not output_path:
output_path = result.plugin or ""
return build_snapshot_url(str(result.snapshot_id), output_path, request=getattr(self, "request", None))
def get_output_files_url(self, result: ArchiveResult) -> str:
return f"{build_snapshot_url(str(result.snapshot_id), result.plugin, request=getattr(self, 'request', None))}/?files=1"
def get_output_zip_url(self, result: ArchiveResult) -> str:
return f"{self.get_output_files_url(result)}&download=zip"
@admin.display(description="Details", ordering="id")
def details_link(self, result):
return format_html(
'<a href="{}"><code>{}</code></a>',
reverse('admin:core_archiveresult_change', args=[result.id]),
reverse("admin:core_archiveresult_change", args=[result.id]),
str(result.id)[-8:],
)
@admin.display(description="Zip")
def zip_link(self, result):
return format_html(
'<a href="{}" class="archivebox-zip-button" data-loading-mode="spinner-only" onclick="return window.archiveboxHandleZipClick(this, event);" style="display:inline-flex; align-items:center; justify-content:center; gap:4px; width:48px; min-width:48px; height:24px; padding:0; box-sizing:border-box; border-radius:999px; border:1px solid #bfdbfe; background:#eff6ff; color:#1d4ed8; font-size:11px; font-weight:600; line-height:1; text-decoration:none;"><span class="archivebox-zip-spinner" aria-hidden="true"></span><span class="archivebox-zip-label">⬇ ZIP</span></a>',
self.get_output_zip_url(result),
)
@admin.display(
description='Snapshot',
ordering='snapshot__url',
description="Snapshot",
ordering="snapshot__url",
)
def snapshot_info(self, result):
snapshot_id = str(result.snapshot_id)
@@ -450,29 +571,28 @@ class ArchiveResultAdmin(BaseModelAdmin):
'<a href="{}"><b><code>[{}]</code></b> &nbsp; {} &nbsp; {}</a><br/>',
build_snapshot_url(snapshot_id, "index.html"),
snapshot_id[:8],
result.snapshot.bookmarked_at.strftime('%Y-%m-%d %H:%M'),
result.snapshot.bookmarked_at.strftime("%Y-%m-%d %H:%M"),
result.snapshot.url[:128],
)
@admin.display(
description='Snapshot Tags'
description="Snapshot Tags",
)
def tags_str(self, result):
return result.snapshot.tags_str()
@admin.display(description='Tags', ordering='snapshot_first_tag')
@admin.display(description="Tags", ordering="snapshot_first_tag")
def tags_inline(self, result):
widget = InlineTagEditorWidget(snapshot_id=str(result.snapshot_id), editable=False)
tags_html = widget.render(
name=f'tags_{result.snapshot_id}',
name=f"tags_{result.snapshot_id}",
value=result.snapshot.tags.all(),
attrs={'id': f'tags_{result.snapshot_id}'},
attrs={"id": f"tags_{result.snapshot_id}"},
snapshot_id=str(result.snapshot_id),
)
return mark_safe(f'<span class="tags-inline-editor">{tags_html}</span>')
@admin.display(description='Status', ordering='status')
@admin.display(description="Status", ordering="status")
def status_badge(self, result):
status = result.status or ArchiveResult.StatusChoices.QUEUED
return format_html(
@@ -482,7 +602,7 @@ class ArchiveResultAdmin(BaseModelAdmin):
result.get_status_display() or status,
)
@admin.display(description='Plugin', ordering='plugin')
@admin.display(description="Plugin", ordering="plugin")
def plugin_with_icon(self, result):
icon = get_plugin_icon(result.plugin)
return format_html(
@@ -494,36 +614,36 @@ class ArchiveResultAdmin(BaseModelAdmin):
result.plugin,
)
@admin.display(description='Process', ordering='process__pid')
@admin.display(description="Process", ordering="process__pid")
def process_link(self, result):
if not result.process_id:
return '-'
process_label = result.process.pid if result.process and result.process.pid else '-'
return "-"
process_label = result.process.pid if result.process and result.process.pid else "-"
return format_html(
'<a href="{}"><code>{}</code></a>',
reverse('admin:machine_process_change', args=[result.process_id]),
reverse("admin:machine_process_change", args=[result.process_id]),
process_label,
)
@admin.display(description='Machine', ordering='process__machine__hostname')
@admin.display(description="Machine", ordering="process__machine__hostname")
def machine_link(self, result):
if not result.process_id or not result.process or not result.process.machine_id:
return '-'
return "-"
machine = result.process.machine
return format_html(
'<a href="{}"><code>{}</code> {}</a>',
reverse('admin:machine_machine_change', args=[machine.id]),
reverse("admin:machine_machine_change", args=[machine.id]),
str(machine.id)[:8],
machine.hostname,
)
@admin.display(description='Command')
@admin.display(description="Command")
def cmd_str(self, result):
display_cmd = build_abx_dl_display_command(result)
replay_cmd = build_abx_dl_replay_command(result)
return format_html(
'''
<div style="position: relative; width: 300px; min-width: 300px; max-width: 300px; overflow: hidden; box-sizing: border-box;">
"""
<div style="position: relative; width: 100%; max-width: 100%; overflow: hidden; box-sizing: border-box;">
<button type="button"
data-command="{}"
onclick="(function(btn){{var text=btn.dataset.command||''; if(navigator.clipboard&&navigator.clipboard.writeText){{navigator.clipboard.writeText(text);}} else {{var ta=document.createElement('textarea'); ta.value=text; document.body.appendChild(ta); ta.select(); document.execCommand('copy'); document.body.removeChild(ta);}}}})(this); return false;"
@@ -534,7 +654,7 @@ class ArchiveResultAdmin(BaseModelAdmin):
{}
</code>
</div>
''',
""",
replay_cmd,
replay_cmd,
display_cmd,
@@ -542,8 +662,8 @@ class ArchiveResultAdmin(BaseModelAdmin):
def output_display(self, result):
# Determine output link path - use embed_path() which checks output_files
embed_path = result.embed_path() if hasattr(result, 'embed_path') else None
output_path = embed_path if (result.status == 'succeeded' and embed_path) else 'index.html'
embed_path = result.embed_path() if hasattr(result, "embed_path") else None
output_path = embed_path if (result.status == "succeeded" and embed_path) else "index.html"
snapshot_id = str(result.snapshot_id)
return format_html(
'<a href="{}" class="output-link">↗️</a><pre>{}</pre>',
@@ -551,13 +671,13 @@ class ArchiveResultAdmin(BaseModelAdmin):
result.output_str,
)
@admin.display(description='Output', ordering='output_str')
@admin.display(description="Output", ordering="output_str")
def output_str_display(self, result):
output_text = str(result.output_str or '').strip()
output_text = str(result.output_str or "").strip()
if not output_text:
return '-'
return "-"
live_path = result.embed_path() if hasattr(result, 'embed_path') else None
live_path = result.embed_path() if hasattr(result, "embed_path") else None
if live_path:
return format_html(
'<a href="{}" title="{}"><code>{}</code></a>',
@@ -572,8 +692,48 @@ class ArchiveResultAdmin(BaseModelAdmin):
output_text,
)
@admin.display(description="")
def admin_actions(self, result):
return format_html(
"""
<div style="display:flex; flex-wrap:wrap; gap:12px; align-items:center;">
<a class="btn" style="display:inline-flex; align-items:center; gap:6px; padding:10px 16px; background:#f8fafc; border:1px solid #e2e8f0; border-radius:8px; color:#334155; text-decoration:none; font-size:14px; font-weight:500; transition:all 0.15s;"
href="{}"
onmouseover="this.style.background='#f1f5f9'; this.style.borderColor='#cbd5e1';"
onmouseout="this.style.background='#f8fafc'; this.style.borderColor='#e2e8f0';">
📄 View Output
</a>
<a class="btn" style="display:inline-flex; align-items:center; gap:6px; padding:10px 16px; background:#f8fafc; border:1px solid #e2e8f0; border-radius:8px; color:#334155; text-decoration:none; font-size:14px; font-weight:500; transition:all 0.15s;"
href="{}"
onmouseover="this.style.background='#f1f5f9'; this.style.borderColor='#cbd5e1';"
onmouseout="this.style.background='#f8fafc'; this.style.borderColor='#e2e8f0';">
📁 Output files
</a>
<a class="btn archivebox-zip-button" style="display:inline-flex; align-items:center; gap:6px; padding:10px 16px; background:#eff6ff; border:1px solid #bfdbfe; border-radius:8px; color:#1d4ed8; text-decoration:none; font-size:14px; font-weight:500; transition:all 0.15s;"
href="{}"
data-loading-label="Preparing..."
onclick="return window.archiveboxHandleZipClick(this, event);"
onmouseover="this.style.background='#dbeafe'; this.style.borderColor='#93c5fd';"
onmouseout="this.style.background='#eff6ff'; this.style.borderColor='#bfdbfe';">
<span class="archivebox-zip-spinner" aria-hidden="true"></span>
<span class="archivebox-zip-label">⬇ Download Zip</span>
</a>
<a class="btn" style="display:inline-flex; align-items:center; gap:6px; padding:10px 16px; background:#f8fafc; border:1px solid #e2e8f0; border-radius:8px; color:#334155; text-decoration:none; font-size:14px; font-weight:500; transition:all 0.15s;"
href="{}"
onmouseover="this.style.background='#f1f5f9'; this.style.borderColor='#cbd5e1';"
onmouseout="this.style.background='#f8fafc'; this.style.borderColor='#e2e8f0';">
🗂 Snapshot
</a>
</div>
""",
self.get_output_view_url(result),
self.get_output_files_url(result),
self.get_output_zip_url(result),
self.get_snapshot_view_url(result),
)
def output_summary(self, result):
snapshot_dir = Path(DATA_DIR) / str(result.pwd).split('data/', 1)[-1]
snapshot_dir = Path(DATA_DIR) / str(result.pwd).split("data/", 1)[-1]
output_html = format_html(
'<pre style="display: inline-block">{}</pre><br/>',
result.output_str,
@@ -583,9 +743,13 @@ class ArchiveResultAdmin(BaseModelAdmin):
'<a href="{}#all">See result files ...</a><br/><pre><code>',
build_snapshot_url(snapshot_id, "index.html"),
)
embed_path = result.embed_path() if hasattr(result, 'embed_path') else ''
path_from_embed = (snapshot_dir / (embed_path or ''))
output_html += format_html('<i style="padding: 1px">{}</i><b style="padding-right: 20px">/</b><i>{}</i><br/><hr/>', str(snapshot_dir), str(embed_path))
embed_path = result.embed_path() if hasattr(result, "embed_path") else ""
path_from_embed = snapshot_dir / (embed_path or "")
output_html += format_html(
'<i style="padding: 1px">{}</i><b style="padding-right: 20px">/</b><i>{}</i><br/><hr/>',
str(snapshot_dir),
str(embed_path),
)
if os.access(path_from_embed, os.R_OK):
root_dir = str(path_from_embed)
else:
@@ -594,19 +758,22 @@ class ArchiveResultAdmin(BaseModelAdmin):
# print(root_dir, str(list(os.walk(root_dir))))
for root, dirs, files in os.walk(root_dir):
depth = root.replace(root_dir, '').count(os.sep) + 1
depth = root.replace(root_dir, "").count(os.sep) + 1
if depth > 2:
continue
indent = ' ' * 4 * (depth)
indent = " " * 4 * (depth)
output_html += format_html('<b style="padding: 1px">{}{}/</b><br/>', indent, os.path.basename(root))
indentation_str = ' ' * 4 * (depth + 1)
indentation_str = " " * 4 * (depth + 1)
for filename in sorted(files):
is_hidden = filename.startswith('.')
output_html += format_html('<span style="opacity: {}.2">{}{}</span><br/>', int(not is_hidden), indentation_str, filename.strip())
return output_html + mark_safe('</code></pre>')
is_hidden = filename.startswith(".")
output_html += format_html(
'<span style="opacity: {}.2">{}{}</span><br/>',
int(not is_hidden),
indentation_str,
filename.strip(),
)
return output_html + mark_safe("</code></pre>")
def register_admin(admin_site):

View File

@@ -1,4 +1,4 @@
__package__ = 'archivebox.core'
__package__ = "archivebox.core"
from typing import TYPE_CHECKING, Any
@@ -18,23 +18,23 @@ if TYPE_CHECKING:
class ArchiveBoxAdmin(admin.AdminSite):
site_header = 'ArchiveBox'
index_title = 'Admin Views'
site_title = 'Admin'
namespace = 'admin'
site_header = "ArchiveBox"
index_title = "Admin Views"
site_title = "Admin"
namespace = "admin"
def get_app_list(self, request: 'HttpRequest', app_label: str | None = None) -> list['AppDict']:
def get_app_list(self, request: "HttpRequest", app_label: str | None = None) -> list["AppDict"]:
if app_label is None:
return adv_get_app_list(self, request)
return adv_get_app_list(self, request, app_label)
def admin_data_index_view(self, request: 'HttpRequest', **kwargs: Any) -> 'TemplateResponse':
def admin_data_index_view(self, request: "HttpRequest", **kwargs: Any) -> "TemplateResponse":
return adv_admin_data_index_view(self, request, **kwargs)
def get_admin_data_urls(self) -> list['URLResolver | URLPattern']:
def get_admin_data_urls(self) -> list["URLResolver | URLPattern"]:
return adv_get_admin_data_urls(self)
def get_urls(self) -> list['URLResolver | URLPattern']:
def get_urls(self) -> list["URLResolver | URLPattern"]:
return self.get_admin_data_urls() + super().get_urls()
@@ -43,7 +43,6 @@ archivebox_admin = ArchiveBoxAdmin()
# TODO: https://stackoverflow.com/questions/40760880/add-custom-button-to-django-admin-panel
############### Admin Data View sections are defined in settings.ADMIN_DATA_VIEWS #########

File diff suppressed because it is too large Load Diff

View File

@@ -1,4 +1,4 @@
__package__ = 'archivebox.core'
__package__ = "archivebox.core"
from urllib.parse import quote
@@ -28,92 +28,107 @@ from archivebox.core.host_utils import build_snapshot_url
class TagInline(admin.TabularInline):
model = SnapshotTag
fields = ('id', 'tag')
fields = ("id", "tag")
extra = 1
max_num = 1000
autocomplete_fields = (
'tag',
)
autocomplete_fields = ("tag",)
class TagAdminForm(forms.ModelForm):
class Meta:
model = Tag
fields = '__all__'
fields = "__all__"
widgets = {
'name': forms.TextInput(attrs={
'placeholder': 'research, receipts, product-design...',
'autocomplete': 'off',
'spellcheck': 'false',
'data-tag-name-input': '1',
}),
"name": forms.TextInput(
attrs={
"placeholder": "research, receipts, product-design...",
"autocomplete": "off",
"spellcheck": "false",
"data-tag-name-input": "1",
},
),
}
def clean_name(self):
name = (self.cleaned_data.get('name') or '').strip()
name = (self.cleaned_data.get("name") or "").strip()
if not name:
raise forms.ValidationError('Tag name is required.')
raise forms.ValidationError("Tag name is required.")
return name
class TagAdmin(BaseModelAdmin):
form = TagAdminForm
change_list_template = 'admin/core/tag/change_list.html'
change_form_template = 'admin/core/tag/change_form.html'
list_display = ('name', 'num_snapshots', 'created_at', 'created_by')
list_filter = ('created_at', 'created_by')
search_fields = ('id', 'name', 'slug')
readonly_fields = ('slug', 'id', 'created_at', 'modified_at', 'snapshots')
actions = ['delete_selected']
ordering = ['name', 'id']
change_list_template = "admin/core/tag/change_list.html"
change_form_template = "admin/core/tag/change_form.html"
list_display = ("name", "num_snapshots", "created_at", "created_by")
list_filter = ("created_at", "created_by")
search_fields = ("id", "name", "slug")
readonly_fields = ("slug", "id", "created_at", "modified_at", "snapshots")
actions = ["delete_selected"]
ordering = ["name", "id"]
fieldsets = (
('Tag', {
'fields': ('name', 'slug'),
'classes': ('card',),
}),
('Metadata', {
'fields': ('id', 'created_by', 'created_at', 'modified_at'),
'classes': ('card',),
}),
('Recent Snapshots', {
'fields': ('snapshots',),
'classes': ('card', 'wide'),
}),
(
"Tag",
{
"fields": ("name", "slug"),
"classes": ("card",),
},
),
(
"Metadata",
{
"fields": ("id", "created_by", "created_at", "modified_at"),
"classes": ("card",),
},
),
(
"Recent Snapshots",
{
"fields": ("snapshots",),
"classes": ("card", "wide"),
},
),
)
add_fieldsets = (
('Tag', {
'fields': ('name',),
'classes': ('card', 'wide'),
}),
('Metadata', {
'fields': ('created_by',),
'classes': ('card',),
}),
(
"Tag",
{
"fields": ("name",),
"classes": ("card", "wide"),
},
),
(
"Metadata",
{
"fields": ("created_by",),
"classes": ("card",),
},
),
)
def get_fieldsets(self, request: HttpRequest, obj: Tag | None = None):
return self.fieldsets if obj else self.add_fieldsets
def changelist_view(self, request: HttpRequest, extra_context=None):
query = (request.GET.get('q') or '').strip()
sort = normalize_tag_sort((request.GET.get('sort') or 'created_desc').strip())
created_by = normalize_created_by_filter((request.GET.get('created_by') or '').strip())
year = normalize_created_year_filter((request.GET.get('year') or '').strip())
has_snapshots = normalize_has_snapshots_filter((request.GET.get('has_snapshots') or 'all').strip())
query = (request.GET.get("q") or "").strip()
sort = normalize_tag_sort((request.GET.get("sort") or "created_desc").strip())
created_by = normalize_created_by_filter((request.GET.get("created_by") or "").strip())
year = normalize_created_year_filter((request.GET.get("year") or "").strip())
has_snapshots = normalize_has_snapshots_filter((request.GET.get("has_snapshots") or "all").strip())
extra_context = {
**(extra_context or {}),
'initial_query': query,
'initial_sort': sort,
'initial_created_by': created_by,
'initial_year': year,
'initial_has_snapshots': has_snapshots,
'tag_sort_choices': TAG_SORT_CHOICES,
'tag_has_snapshots_choices': TAG_HAS_SNAPSHOTS_CHOICES,
'tag_created_by_choices': get_tag_creator_choices(),
'tag_year_choices': get_tag_year_choices(),
'initial_tag_cards': build_tag_cards(
"initial_query": query,
"initial_sort": sort,
"initial_created_by": created_by,
"initial_year": year,
"initial_has_snapshots": has_snapshots,
"tag_sort_choices": TAG_SORT_CHOICES,
"tag_has_snapshots_choices": TAG_HAS_SNAPSHOTS_CHOICES,
"tag_created_by_choices": get_tag_creator_choices(),
"tag_year_choices": get_tag_year_choices(),
"initial_tag_cards": build_tag_cards(
query=query,
request=request,
sort=sort,
@@ -121,62 +136,67 @@ class TagAdmin(BaseModelAdmin):
year=year,
has_snapshots=has_snapshots,
),
'tag_search_api_url': reverse('api-1:search_tags'),
'tag_create_api_url': reverse('api-1:tags_create'),
"tag_search_api_url": reverse("api-1:search_tags"),
"tag_create_api_url": reverse("api-1:tags_create"),
}
return super().changelist_view(request, extra_context=extra_context)
def render_change_form(self, request, context, add=False, change=False, form_url='', obj=None):
current_name = (request.POST.get('name') or '').strip()
def render_change_form(self, request, context, add=False, change=False, form_url="", obj=None):
current_name = (request.POST.get("name") or "").strip()
if not current_name and obj:
current_name = obj.name
similar_tag_cards = build_tag_cards(query=current_name, request=request, limit=12) if current_name else build_tag_cards(request=request, limit=12)
similar_tag_cards = (
build_tag_cards(query=current_name, request=request, limit=12) if current_name else build_tag_cards(request=request, limit=12)
)
if obj:
similar_tag_cards = [card for card in similar_tag_cards if card['id'] != obj.pk]
similar_tag_cards = [card for card in similar_tag_cards if card["id"] != obj.pk]
context.update({
'tag_search_api_url': reverse('api-1:search_tags'),
'tag_similar_cards': similar_tag_cards,
'tag_similar_query': current_name,
})
context.update(
{
"tag_search_api_url": reverse("api-1:search_tags"),
"tag_similar_cards": similar_tag_cards,
"tag_similar_query": current_name,
},
)
return super().render_change_form(request, context, add=add, change=change, form_url=form_url, obj=obj)
def response_add(self, request: HttpRequest, obj: Tag, post_url_continue=None):
if IS_POPUP_VAR in request.POST or '_continue' in request.POST or '_addanother' in request.POST:
if IS_POPUP_VAR in request.POST or "_continue" in request.POST or "_addanother" in request.POST:
return super().response_add(request, obj, post_url_continue=post_url_continue)
self.message_user(request, f'Tag "{obj.name}" saved.', level=messages.SUCCESS)
return self._redirect_to_changelist(obj.name)
def response_change(self, request: HttpRequest, obj: Tag):
if IS_POPUP_VAR in request.POST or '_continue' in request.POST or '_addanother' in request.POST or '_saveasnew' in request.POST:
if IS_POPUP_VAR in request.POST or "_continue" in request.POST or "_addanother" in request.POST or "_saveasnew" in request.POST:
return super().response_change(request, obj)
self.message_user(request, f'Tag "{obj.name}" updated.', level=messages.SUCCESS)
return self._redirect_to_changelist(obj.name)
def _redirect_to_changelist(self, query: str = '') -> HttpResponseRedirect:
changelist_url = reverse('admin:core_tag_changelist')
def _redirect_to_changelist(self, query: str = "") -> HttpResponseRedirect:
changelist_url = reverse("admin:core_tag_changelist")
if query:
changelist_url = f'{changelist_url}?q={quote(query)}'
changelist_url = f"{changelist_url}?q={quote(query)}"
return HttpResponseRedirect(changelist_url)
@admin.display(description='Snapshots')
@admin.display(description="Snapshots")
def snapshots(self, tag: Tag):
snapshots = tag.snapshot_set.select_related('crawl__created_by').order_by('-downloaded_at', '-created_at', '-pk')[:10]
snapshots = tag.snapshot_set.select_related("crawl__created_by").order_by("-downloaded_at", "-created_at", "-pk")[:10]
total_count = tag.snapshot_set.count()
if not snapshots:
return mark_safe(
f'<p style="margin:0;color:#64748b;">No snapshots use this tag yet. '
f'<a href="/admin/core/snapshot/?tags__id__exact={tag.id}">Open filtered snapshot list</a>.</p>'
f'<a href="/admin/core/snapshot/?tags__id__exact={tag.id}">Open filtered snapshot list</a>.</p>',
)
cards = []
for snapshot in snapshots:
title = (snapshot.title or '').strip() or snapshot.url
cards.append(format_html(
'''
title = (snapshot.title or "").strip() or snapshot.url
cards.append(
format_html(
"""
<a href="{}" style="display:flex;align-items:center;gap:10px;padding:10px 12px;border:1px solid #e2e8f0;border-radius:12px;background:#fff;text-decoration:none;color:#0f172a;">
<img src="{}" alt="" style="width:18px;height:18px;border-radius:4px;flex:0 0 auto;" onerror="this.style.display='none'">
<span style="min-width:0;">
@@ -184,23 +204,26 @@ class TagAdmin(BaseModelAdmin):
<code style="display:block;color:#64748b;white-space:nowrap;overflow:hidden;text-overflow:ellipsis;">{}</code>
</span>
</a>
''',
reverse('admin:core_snapshot_change', args=[snapshot.pk]),
build_snapshot_url(str(snapshot.pk), 'favicon.ico'),
title[:120],
snapshot.url[:120],
))
""",
reverse("admin:core_snapshot_change", args=[snapshot.pk]),
build_snapshot_url(str(snapshot.pk), "favicon.ico"),
title[:120],
snapshot.url[:120],
),
)
cards.append(format_html(
'<a href="/admin/core/snapshot/?tags__id__exact={}" style="display:inline-flex;margin-top:10px;font-weight:600;">View all {} tagged snapshots</a>',
tag.id,
total_count,
))
return mark_safe('<div style="display:grid;gap:10px;">' + ''.join(cards) + '</div>')
cards.append(
format_html(
'<a href="/admin/core/snapshot/?tags__id__exact={}" style="display:inline-flex;margin-top:10px;font-weight:600;">View all {} tagged snapshots</a>',
tag.id,
total_count,
),
)
return mark_safe('<div style="display:grid;gap:10px;">' + "".join(cards) + "</div>")
@admin.display(description='Snapshots', ordering='num_snapshots')
@admin.display(description="Snapshots", ordering="num_snapshots")
def num_snapshots(self, tag: Tag):
count = getattr(tag, 'num_snapshots', tag.snapshot_set.count())
count = getattr(tag, "num_snapshots", tag.snapshot_set.count())
return format_html(
'<a href="/admin/core/snapshot/?tags__id__exact={}">{} total</a>',
tag.id,

View File

@@ -1,4 +1,4 @@
__package__ = 'archivebox.core'
__package__ = "archivebox.core"
from django.contrib import admin
from django.contrib.auth.admin import UserAdmin
@@ -8,87 +8,100 @@ from django.utils.safestring import mark_safe
class CustomUserAdmin(UserAdmin):
sort_fields = ['id', 'email', 'username', 'is_superuser', 'last_login', 'date_joined']
list_display = ['username', 'id', 'email', 'is_superuser', 'last_login', 'date_joined']
readonly_fields = ('snapshot_set', 'archiveresult_set', 'tag_set', 'apitoken_set', 'outboundwebhook_set')
sort_fields = ["id", "email", "username", "is_superuser", "last_login", "date_joined"]
list_display = ["username", "id", "email", "is_superuser", "last_login", "date_joined"]
readonly_fields = ("snapshot_set", "archiveresult_set", "tag_set", "apitoken_set", "outboundwebhook_set")
# Preserve Django's default user creation form and fieldsets
# This ensures passwords are properly hashed and permissions are set correctly
add_fieldsets = UserAdmin.add_fieldsets
# Extend fieldsets for change form only (not user creation)
fieldsets = [*(UserAdmin.fieldsets or ()), ('Data', {'fields': readonly_fields})]
fieldsets = [*(UserAdmin.fieldsets or ()), ("Data", {"fields": readonly_fields})]
@admin.display(description='Snapshots')
@admin.display(description="Snapshots")
def snapshot_set(self, obj):
total_count = obj.snapshot_set.count()
return mark_safe('<br/>'.join(
format_html(
'<code><a href="/admin/core/snapshot/{}/change"><b>[{}]</b></a></code> <b>📅 {}</b> {}',
snap.pk,
str(snap.id)[:8],
snap.downloaded_at.strftime('%Y-%m-%d %H:%M') if snap.downloaded_at else 'pending...',
snap.url[:64],
return mark_safe(
"<br/>".join(
format_html(
'<code><a href="/admin/core/snapshot/{}/change"><b>[{}]</b></a></code> <b>📅 {}</b> {}',
snap.pk,
str(snap.id)[:8],
snap.downloaded_at.strftime("%Y-%m-%d %H:%M") if snap.downloaded_at else "pending...",
snap.url[:64],
)
for snap in obj.snapshot_set.order_by("-modified_at")[:10]
)
for snap in obj.snapshot_set.order_by('-modified_at')[:10]
) + f'<br/><a href="/admin/core/snapshot/?created_by__id__exact={obj.pk}">{total_count} total records...<a>')
+ f'<br/><a href="/admin/core/snapshot/?created_by__id__exact={obj.pk}">{total_count} total records...<a>',
)
@admin.display(description='Archive Result Logs')
@admin.display(description="Archive Result Logs")
def archiveresult_set(self, obj):
total_count = obj.archiveresult_set.count()
return mark_safe('<br/>'.join(
format_html(
'<code><a href="/admin/core/archiveresult/{}/change"><b>[{}]</b></a></code> <b>📅 {}</b> <b>📄 {}</b> {}',
result.pk,
str(result.id)[:8],
result.snapshot.downloaded_at.strftime('%Y-%m-%d %H:%M') if result.snapshot.downloaded_at else 'pending...',
result.extractor,
result.snapshot.url[:64],
return mark_safe(
"<br/>".join(
format_html(
'<code><a href="/admin/core/archiveresult/{}/change"><b>[{}]</b></a></code> <b>📅 {}</b> <b>📄 {}</b> {}',
result.pk,
str(result.id)[:8],
result.snapshot.downloaded_at.strftime("%Y-%m-%d %H:%M") if result.snapshot.downloaded_at else "pending...",
result.extractor,
result.snapshot.url[:64],
)
for result in obj.archiveresult_set.order_by("-modified_at")[:10]
)
for result in obj.archiveresult_set.order_by('-modified_at')[:10]
) + f'<br/><a href="/admin/core/archiveresult/?created_by__id__exact={obj.pk}">{total_count} total records...<a>')
+ f'<br/><a href="/admin/core/archiveresult/?created_by__id__exact={obj.pk}">{total_count} total records...<a>',
)
@admin.display(description='Tags')
@admin.display(description="Tags")
def tag_set(self, obj):
total_count = obj.tag_set.count()
return mark_safe(', '.join(
format_html(
'<code><a href="/admin/core/tag/{}/change"><b>{}</b></a></code>',
tag.pk,
tag.name,
return mark_safe(
", ".join(
format_html(
'<code><a href="/admin/core/tag/{}/change"><b>{}</b></a></code>',
tag.pk,
tag.name,
)
for tag in obj.tag_set.order_by("-modified_at")[:10]
)
for tag in obj.tag_set.order_by('-modified_at')[:10]
) + f'<br/><a href="/admin/core/tag/?created_by__id__exact={obj.pk}">{total_count} total records...<a>')
+ f'<br/><a href="/admin/core/tag/?created_by__id__exact={obj.pk}">{total_count} total records...<a>',
)
@admin.display(description='API Tokens')
@admin.display(description="API Tokens")
def apitoken_set(self, obj):
total_count = obj.apitoken_set.count()
return mark_safe('<br/>'.join(
format_html(
'<code><a href="/admin/api/apitoken/{}/change"><b>[{}]</b></a></code> {} (expires {})',
apitoken.pk,
str(apitoken.id)[:8],
apitoken.token_redacted[:64],
apitoken.expires,
return mark_safe(
"<br/>".join(
format_html(
'<code><a href="/admin/api/apitoken/{}/change"><b>[{}]</b></a></code> {} (expires {})',
apitoken.pk,
str(apitoken.id)[:8],
apitoken.token_redacted[:64],
apitoken.expires,
)
for apitoken in obj.apitoken_set.order_by("-modified_at")[:10]
)
for apitoken in obj.apitoken_set.order_by('-modified_at')[:10]
) + f'<br/><a href="/admin/api/apitoken/?created_by__id__exact={obj.pk}">{total_count} total records...<a>')
+ f'<br/><a href="/admin/api/apitoken/?created_by__id__exact={obj.pk}">{total_count} total records...<a>',
)
@admin.display(description='API Outbound Webhooks')
@admin.display(description="API Outbound Webhooks")
def outboundwebhook_set(self, obj):
total_count = obj.outboundwebhook_set.count()
return mark_safe('<br/>'.join(
format_html(
'<code><a href="/admin/api/outboundwebhook/{}/change"><b>[{}]</b></a></code> {} -> {}',
outboundwebhook.pk,
str(outboundwebhook.id)[:8],
outboundwebhook.referenced_model,
outboundwebhook.endpoint,
return mark_safe(
"<br/>".join(
format_html(
'<code><a href="/admin/api/outboundwebhook/{}/change"><b>[{}]</b></a></code> {} -> {}',
outboundwebhook.pk,
str(outboundwebhook.id)[:8],
outboundwebhook.referenced_model,
outboundwebhook.endpoint,
)
for outboundwebhook in obj.outboundwebhook_set.order_by("-modified_at")[:10]
)
for outboundwebhook in obj.outboundwebhook_set.order_by('-modified_at')[:10]
) + f'<br/><a href="/admin/api/outboundwebhook/?created_by__id__exact={obj.pk}">{total_count} total records...<a>')
+ f'<br/><a href="/admin/api/outboundwebhook/?created_by__id__exact={obj.pk}">{total_count} total records...<a>',
)
def register_admin(admin_site):

View File

@@ -1,12 +1,12 @@
__package__ = 'archivebox.core'
__package__ = "archivebox.core"
from django.apps import AppConfig
import os
class CoreConfig(AppConfig):
name = 'archivebox.core'
label = 'core'
name = "archivebox.core"
label = "core"
def ready(self):
"""Register the archivebox.core.admin_site as the main django admin site"""
@@ -14,29 +14,30 @@ class CoreConfig(AppConfig):
from django.utils.autoreload import DJANGO_AUTORELOAD_ENV
from archivebox.core.admin_site import register_admin_site
register_admin_site()
# Import models to register state machines with the registry
# Skip during makemigrations to avoid premature state machine access
if 'makemigrations' not in sys.argv:
if "makemigrations" not in sys.argv:
from archivebox.core import models # noqa: F401
pidfile = os.environ.get('ARCHIVEBOX_RUNSERVER_PIDFILE')
pidfile = os.environ.get("ARCHIVEBOX_RUNSERVER_PIDFILE")
if pidfile:
should_write_pid = True
if os.environ.get('ARCHIVEBOX_AUTORELOAD') == '1':
should_write_pid = os.environ.get(DJANGO_AUTORELOAD_ENV) == 'true'
if os.environ.get("ARCHIVEBOX_AUTORELOAD") == "1":
should_write_pid = os.environ.get(DJANGO_AUTORELOAD_ENV) == "true"
if should_write_pid:
try:
with open(pidfile, 'w') as handle:
with open(pidfile, "w") as handle:
handle.write(str(os.getpid()))
except Exception:
pass
def _should_prepare_runtime() -> bool:
if os.environ.get('ARCHIVEBOX_RUNSERVER') == '1':
if os.environ.get('ARCHIVEBOX_AUTORELOAD') == '1':
return os.environ.get(DJANGO_AUTORELOAD_ENV) == 'true'
if os.environ.get("ARCHIVEBOX_RUNSERVER") == "1":
if os.environ.get("ARCHIVEBOX_AUTORELOAD") == "1":
return os.environ.get(DJANGO_AUTORELOAD_ENV) == "true"
return True
return False
@@ -44,4 +45,5 @@ class CoreConfig(AppConfig):
from archivebox.machine.models import Process, Machine
Process.cleanup_stale_running()
Process.cleanup_orphaned_workers()
Machine.current()

View File

@@ -1,9 +1,9 @@
__package__ = 'archivebox.core'
__package__ = "archivebox.core"
from django import forms
from django.utils.html import format_html
from archivebox.misc.util import URL_REGEX, find_all_urls
from archivebox.misc.util import URL_REGEX, find_all_urls, parse_filesize_to_bytes
from taggit.utils import edit_string_for_tags, parse_tags
from archivebox.base_models.admin import KeyValueWidget
from archivebox.crawls.schedule_utils import validate_schedule
@@ -13,11 +13,11 @@ from archivebox.hooks import get_plugins, discover_plugin_configs, get_plugin_ic
from archivebox.personas.models import Persona
DEPTH_CHOICES = (
('0', 'depth = 0 (archive just these URLs)'),
('1', 'depth = 1 (+ URLs one hop away)'),
('2', 'depth = 2 (+ URLs two hops away)'),
('3', 'depth = 3 (+ URLs three hops away)'),
('4', 'depth = 4 (+ URLs four hops away)'),
("0", "depth = 0 (archive just these URLs)"),
("1", "depth = 1 (+ URLs one hop away)"),
("2", "depth = 2 (+ URLs two hops away)"),
("3", "depth = 3 (+ URLs three hops away)"),
("4", "depth = 4 (+ URLs four hops away)"),
)
@@ -28,7 +28,7 @@ def get_plugin_choices():
def get_plugin_choice_label(plugin_name: str, plugin_configs: dict[str, dict]) -> str:
schema = plugin_configs.get(plugin_name, {})
description = str(schema.get('description') or '').strip()
description = str(schema.get("description") or "").strip()
if not description:
return plugin_name
icon_html = get_plugin_icon(plugin_name)
@@ -45,7 +45,7 @@ def get_plugin_choice_label(plugin_name: str, plugin_configs: dict[str, dict]) -
def get_choice_field(form: forms.Form, name: str) -> forms.ChoiceField:
field = form.fields[name]
if not isinstance(field, forms.ChoiceField):
raise TypeError(f'{name} must be a ChoiceField')
raise TypeError(f"{name} must be a ChoiceField")
return field
@@ -54,10 +54,12 @@ class AddLinkForm(forms.Form):
url = forms.CharField(
label="URLs",
strip=True,
widget=forms.Textarea(attrs={
'data-url-regex': URL_REGEX.pattern,
}),
required=True
widget=forms.Textarea(
attrs={
"data-url-regex": URL_REGEX.pattern,
},
),
required=True,
)
tag = forms.CharField(
label="Tags",
@@ -68,16 +70,41 @@ class AddLinkForm(forms.Form):
depth = forms.ChoiceField(
label="Archive depth",
choices=DEPTH_CHOICES,
initial='0',
widget=forms.RadioSelect(attrs={"class": "depth-selection"})
initial="0",
widget=forms.RadioSelect(attrs={"class": "depth-selection"}),
)
max_urls = forms.IntegerField(
label="Max URLs",
required=False,
min_value=0,
initial=0,
widget=forms.NumberInput(
attrs={
"min": 0,
"step": 1,
"placeholder": "0 = unlimited",
},
),
)
max_size = forms.CharField(
label="Max size",
required=False,
initial="0",
widget=forms.TextInput(
attrs={
"placeholder": "0 = unlimited, or e.g. 45mb / 1gb",
},
),
)
notes = forms.CharField(
label="Notes",
strip=True,
required=False,
widget=forms.TextInput(attrs={
'placeholder': 'Optional notes about this crawl',
})
widget=forms.TextInput(
attrs={
"placeholder": "Optional notes about this crawl",
},
),
)
url_filters = forms.Field(
label="URL allowlist / denylist",
@@ -128,16 +155,18 @@ class AddLinkForm(forms.Form):
label="Repeat schedule",
max_length=64,
required=False,
widget=forms.TextInput(attrs={
'placeholder': 'e.g., daily, weekly, 0 */6 * * * (every 6 hours)',
})
widget=forms.TextInput(
attrs={
"placeholder": "e.g., daily, weekly, 0 */6 * * * (every 6 hours)",
},
),
)
persona = forms.ModelChoiceField(
label="Persona (authentication profile)",
required=False,
queryset=Persona.objects.none(),
empty_label=None,
to_field_name='name',
to_field_name="name",
)
index_only = forms.BooleanField(
label="Index only dry run (add crawl but don't archive yet)",
@@ -155,8 +184,8 @@ class AddLinkForm(forms.Form):
super().__init__(*args, **kwargs)
default_persona = Persona.get_or_create_default()
self.fields['persona'].queryset = Persona.objects.order_by('name')
self.fields['persona'].initial = default_persona.name
self.fields["persona"].queryset = Persona.objects.order_by("name")
self.fields["persona"].initial = default_persona.name
# Get all plugins
all_plugins = get_plugins()
@@ -164,86 +193,136 @@ class AddLinkForm(forms.Form):
# Define plugin groups
chrome_dependent = {
'accessibility', 'chrome', 'consolelog', 'dom', 'headers',
'parse_dom_outlinks', 'pdf', 'redirects', 'responses',
'screenshot', 'seo', 'singlefile', 'ssl', 'staticfile', 'title'
"accessibility",
"chrome",
"consolelog",
"dom",
"headers",
"parse_dom_outlinks",
"pdf",
"redirects",
"responses",
"screenshot",
"seo",
"singlefile",
"ssl",
"staticfile",
"title",
}
archiving = {
'archivedotorg', 'defuddle', 'favicon', 'forumdl', 'gallerydl', 'git',
'htmltotext', 'mercury', 'papersdl', 'readability', 'trafilatura', 'wget', 'ytdlp'
"archivedotorg",
"defuddle",
"favicon",
"forumdl",
"gallerydl",
"git",
"htmltotext",
"mercury",
"papersdl",
"readability",
"trafilatura",
"wget",
"ytdlp",
}
parsing = {
'parse_html_urls', 'parse_jsonl_urls',
'parse_netscape_urls', 'parse_rss_urls', 'parse_txt_urls'
"parse_html_urls",
"parse_jsonl_urls",
"parse_netscape_urls",
"parse_rss_urls",
"parse_txt_urls",
}
search = {
'search_backend_ripgrep', 'search_backend_sonic', 'search_backend_sqlite'
"search_backend_ripgrep",
"search_backend_sonic",
"search_backend_sqlite",
}
binary = {'apt', 'brew', 'custom', 'env', 'npm', 'pip'}
extensions = {'twocaptcha', 'istilldontcareaboutcookies', 'ublock'}
binary = {"apt", "brew", "custom", "env", "npm", "pip"}
extensions = {"twocaptcha", "istilldontcareaboutcookies", "ublock"}
# Populate plugin field choices
get_choice_field(self, 'chrome_plugins').choices = [
get_choice_field(self, "chrome_plugins").choices = [
(p, get_plugin_choice_label(p, plugin_configs)) for p in sorted(all_plugins) if p in chrome_dependent
]
get_choice_field(self, 'archiving_plugins').choices = [
get_choice_field(self, "archiving_plugins").choices = [
(p, get_plugin_choice_label(p, plugin_configs)) for p in sorted(all_plugins) if p in archiving
]
get_choice_field(self, 'parsing_plugins').choices = [
get_choice_field(self, "parsing_plugins").choices = [
(p, get_plugin_choice_label(p, plugin_configs)) for p in sorted(all_plugins) if p in parsing
]
get_choice_field(self, 'search_plugins').choices = [
get_choice_field(self, "search_plugins").choices = [
(p, get_plugin_choice_label(p, plugin_configs)) for p in sorted(all_plugins) if p in search
]
get_choice_field(self, 'binary_plugins').choices = [
get_choice_field(self, "binary_plugins").choices = [
(p, get_plugin_choice_label(p, plugin_configs)) for p in sorted(all_plugins) if p in binary
]
get_choice_field(self, 'extension_plugins').choices = [
get_choice_field(self, "extension_plugins").choices = [
(p, get_plugin_choice_label(p, plugin_configs)) for p in sorted(all_plugins) if p in extensions
]
required_search_plugin = f'search_backend_{SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE}'.strip()
search_choices = [choice[0] for choice in get_choice_field(self, 'search_plugins').choices]
required_search_plugin = f"search_backend_{SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE}".strip()
search_choices = [choice[0] for choice in get_choice_field(self, "search_plugins").choices]
if required_search_plugin in search_choices:
get_choice_field(self, 'search_plugins').initial = [required_search_plugin]
get_choice_field(self, "search_plugins").initial = [required_search_plugin]
def clean(self):
cleaned_data = super().clean() or {}
# Combine all plugin groups into single list
all_selected_plugins = []
for field in ['chrome_plugins', 'archiving_plugins', 'parsing_plugins',
'search_plugins', 'binary_plugins', 'extension_plugins']:
for field in [
"chrome_plugins",
"archiving_plugins",
"parsing_plugins",
"search_plugins",
"binary_plugins",
"extension_plugins",
]:
selected = cleaned_data.get(field)
if isinstance(selected, list):
all_selected_plugins.extend(selected)
# Store combined list for easy access
cleaned_data['plugins'] = all_selected_plugins
cleaned_data["plugins"] = all_selected_plugins
return cleaned_data
def clean_url(self):
value = self.cleaned_data.get('url') or ''
urls = '\n'.join(find_all_urls(value))
value = self.cleaned_data.get("url") or ""
urls = "\n".join(find_all_urls(value))
if not urls:
raise forms.ValidationError('Enter at least one valid URL.')
raise forms.ValidationError("Enter at least one valid URL.")
return urls
def clean_url_filters(self):
from archivebox.crawls.models import Crawl
value = self.cleaned_data.get('url_filters') or {}
value = self.cleaned_data.get("url_filters") or {}
return {
'allowlist': '\n'.join(Crawl.split_filter_patterns(value.get('allowlist', ''))),
'denylist': '\n'.join(Crawl.split_filter_patterns(value.get('denylist', ''))),
'same_domain_only': bool(value.get('same_domain_only')),
"allowlist": "\n".join(Crawl.split_filter_patterns(value.get("allowlist", ""))),
"denylist": "\n".join(Crawl.split_filter_patterns(value.get("denylist", ""))),
"same_domain_only": bool(value.get("same_domain_only")),
}
def clean_max_urls(self):
value = self.cleaned_data.get("max_urls")
return int(value or 0)
def clean_max_size(self):
raw_value = str(self.cleaned_data.get("max_size") or "").strip()
if not raw_value:
return 0
try:
value = parse_filesize_to_bytes(raw_value)
except ValueError as err:
raise forms.ValidationError(str(err))
if value < 0:
raise forms.ValidationError("Max size must be 0 or a positive number of bytes.")
return value
def clean_schedule(self):
schedule = (self.cleaned_data.get('schedule') or '').strip()
schedule = (self.cleaned_data.get("schedule") or "").strip()
if not schedule:
return ''
return ""
try:
validate_schedule(schedule)
@@ -269,7 +348,7 @@ class TagField(forms.CharField):
return parse_tags(value)
except ValueError:
raise forms.ValidationError(
"Please provide a comma-separated list of tags."
"Please provide a comma-separated list of tags.",
)
def has_changed(self, initial, data):

View File

@@ -1,7 +1,5 @@
from __future__ import annotations
from __future__ import annotations
import re
from urllib.parse import urlparse
@@ -9,6 +7,7 @@ from archivebox.config.common import SERVER_CONFIG
_SNAPSHOT_ID_RE = re.compile(r"^[0-9a-fA-F-]{8,36}$")
_SNAPSHOT_SUBDOMAIN_RE = re.compile(r"^snap-(?P<suffix>[0-9a-fA-F]{12})$")
def split_host_port(host: str) -> tuple[str, str | None]:
@@ -71,21 +70,29 @@ def get_web_host() -> str:
return urlparse(override).netloc.lower()
return _build_listen_host("web")
def get_api_host() -> str:
if not SERVER_CONFIG.USES_SUBDOMAIN_ROUTING:
return get_listen_host().lower()
return _build_listen_host("api")
def get_public_host() -> str:
if not SERVER_CONFIG.USES_SUBDOMAIN_ROUTING:
return get_listen_host().lower()
return _build_listen_host("public")
def get_snapshot_subdomain(snapshot_id: str) -> str:
normalized = re.sub(r"[^0-9a-fA-F]", "", snapshot_id or "")
suffix = (normalized[-12:] if len(normalized) >= 12 else normalized).lower()
return f"snap-{suffix}"
def get_snapshot_host(snapshot_id: str) -> str:
if not SERVER_CONFIG.USES_SUBDOMAIN_ROUTING:
return get_listen_host().lower()
return _build_listen_host(snapshot_id)
return _build_listen_host(get_snapshot_subdomain(snapshot_id))
def get_original_host(domain: str) -> str:
@@ -95,7 +102,16 @@ def get_original_host(domain: str) -> str:
def is_snapshot_subdomain(subdomain: str) -> bool:
return bool(_SNAPSHOT_ID_RE.match(subdomain or ""))
value = (subdomain or "").strip()
return bool(_SNAPSHOT_SUBDOMAIN_RE.match(value) or _SNAPSHOT_ID_RE.match(value))
def get_snapshot_lookup_key(snapshot_ref: str) -> str:
value = (snapshot_ref or "").strip().lower()
match = _SNAPSHOT_SUBDOMAIN_RE.match(value)
if match:
return match.group("suffix")
return value
def get_listen_subdomain(request_host: str) -> str:
@@ -141,22 +157,23 @@ def _build_base_url_for_host(host: str, request=None) -> str:
def get_admin_base_url(request=None) -> str:
if not SERVER_CONFIG.USES_SUBDOMAIN_ROUTING:
return _build_base_url_for_host(get_listen_host(), request=request)
override = _normalize_base_url(SERVER_CONFIG.ADMIN_BASE_URL)
if override:
return override
if not SERVER_CONFIG.USES_SUBDOMAIN_ROUTING:
return _build_base_url_for_host(get_listen_host(), request=request)
return _build_base_url_for_host(get_admin_host(), request=request)
def get_web_base_url(request=None) -> str:
if not SERVER_CONFIG.USES_SUBDOMAIN_ROUTING:
return _build_base_url_for_host(get_listen_host(), request=request)
override = _normalize_base_url(SERVER_CONFIG.ARCHIVE_BASE_URL)
if override:
return override
if not SERVER_CONFIG.USES_SUBDOMAIN_ROUTING:
return _build_base_url_for_host(get_listen_host(), request=request)
return _build_base_url_for_host(get_web_host(), request=request)
def get_api_base_url(request=None) -> str:
if not SERVER_CONFIG.USES_SUBDOMAIN_ROUTING:
return _build_base_url_for_host(get_listen_host(), request=request)
@@ -191,6 +208,7 @@ def build_admin_url(path: str = "", request=None) -> str:
def build_web_url(path: str = "", request=None) -> str:
return _build_url(get_web_base_url(request), path)
def build_api_url(path: str = "", request=None) -> str:
return _build_url(get_api_base_url(request), path)

View File

@@ -1,4 +1,4 @@
__package__ = 'archivebox'
__package__ = "archivebox"
from django.core.management.base import BaseCommand
@@ -6,13 +6,12 @@ from archivebox.cli import main as run_cli
class Command(BaseCommand):
help = 'Run an ArchiveBox CLI subcommand (e.g. add, remove, list, etc)'
help = "Run an ArchiveBox CLI subcommand (e.g. add, remove, list, etc)"
def add_arguments(self, parser):
parser.add_argument('subcommand', type=str, help='The subcommand you want to run')
parser.add_argument('command_args', nargs='*', help='Arguments to pass to the subcommand')
parser.add_argument("subcommand", type=str, help="The subcommand you want to run")
parser.add_argument("command_args", nargs="*", help="Arguments to pass to the subcommand")
def handle(self, *args, **kwargs):
command_args = [kwargs['subcommand'], *kwargs['command_args']]
command_args = [kwargs["subcommand"], *kwargs["command_args"]]
run_cli(args=command_args)

View File

@@ -1,4 +1,4 @@
__package__ = 'archivebox.core'
__package__ = "archivebox.core"
import ipaddress
import re
@@ -16,6 +16,7 @@ from archivebox.config.common import SERVER_CONFIG
from archivebox.config import VERSION
from archivebox.config.version import get_COMMIT_HASH
from archivebox.core.host_utils import (
build_snapshot_url,
build_admin_url,
build_web_url,
get_api_host,
@@ -31,10 +32,10 @@ from archivebox.core.host_utils import (
from archivebox.core.views import SnapshotHostView, OriginalDomainHostView
def detect_timezone(request, activate: bool=True):
gmt_offset = (request.COOKIES.get('GMT_OFFSET') or '').strip()
def detect_timezone(request, activate: bool = True):
gmt_offset = (request.COOKIES.get("GMT_OFFSET") or "").strip()
tz = None
if gmt_offset.replace('-', '').isdigit():
if gmt_offset.replace("-", "").isdigit():
tz = timezone.get_fixed_timezone(int(gmt_offset))
if activate:
timezone.activate(tz)
@@ -53,11 +54,12 @@ def TimezoneMiddleware(get_response):
def CacheControlMiddleware(get_response):
snapshot_path_re = re.compile(r"^/[^/]+/\\d{8}/[^/]+/[0-9a-fA-F-]{8,36}/")
static_cache_key = (get_COMMIT_HASH() or VERSION or "dev").strip()
def middleware(request):
response = get_response(request)
if request.path.startswith('/static/'):
rel_path = request.path[len('/static/'):]
if request.path.startswith("/static/"):
rel_path = request.path[len("/static/") :]
static_path = finders.find(rel_path)
if static_path:
try:
@@ -81,10 +83,10 @@ def CacheControlMiddleware(get_response):
response.headers["Last-Modified"] = http_date(mtime)
return response
if '/archive/' in request.path or '/static/' in request.path or snapshot_path_re.match(request.path):
if not response.get('Cache-Control'):
policy = 'public' if SERVER_CONFIG.PUBLIC_SNAPSHOTS else 'private'
response['Cache-Control'] = f'{policy}, max-age=60, stale-while-revalidate=300'
if "/archive/" in request.path or "/static/" in request.path or snapshot_path_re.match(request.path):
if not response.get("Cache-Control"):
policy = "public" if SERVER_CONFIG.PUBLIC_SNAPSHOTS else "private"
response["Cache-Control"] = f"{policy}, max-age=60, stale-while-revalidate=300"
# print('Set Cache-Control header to', response['Cache-Control'])
return response
@@ -115,6 +117,10 @@ def ServerSecurityModeMiddleware(get_response):
def HostRoutingMiddleware(get_response):
snapshot_path_re = re.compile(
r"^/(?P<username>[^/]+)/(?P<date>\d{4}(?:\d{2})?(?:\d{2})?)/(?P<domain>[^/]+)/(?P<snapshot_id>[0-9a-fA-F-]{8,36})(?:/(?P<path>.*))?$",
)
def middleware(request):
request_host = (request.get_host() or "").lower()
admin_host = get_admin_host()
@@ -124,6 +130,23 @@ def HostRoutingMiddleware(get_response):
listen_host = get_listen_host()
subdomain = get_listen_subdomain(request_host)
# Framework-owned assets must bypass snapshot/original-domain replay routing.
# Otherwise pages on snapshot subdomains can receive HTML for JS/CSS requests.
if request.path.startswith("/static/") or request.path in {"/favicon.ico", "/robots.txt"}:
return get_response(request)
if SERVER_CONFIG.USES_SUBDOMAIN_ROUTING and not host_matches(request_host, admin_host):
if (
request.path == "/admin"
or request.path.startswith("/admin/")
or request.path == "/accounts"
or request.path.startswith("/accounts/")
):
target = build_admin_url(request.path, request=request)
if request.META.get("QUERY_STRING"):
target = f"{target}?{request.META['QUERY_STRING']}"
return redirect(target)
if not SERVER_CONFIG.USES_SUBDOMAIN_ROUTING:
if host_matches(request_host, listen_host):
return get_response(request)
@@ -140,6 +163,16 @@ def HostRoutingMiddleware(get_response):
return get_response(request)
if host_matches(request_host, admin_host):
snapshot_match = snapshot_path_re.match(request.path)
if SERVER_CONFIG.USES_SUBDOMAIN_ROUTING and snapshot_match:
snapshot_id = snapshot_match.group("snapshot_id")
replay_path = (snapshot_match.group("path") or "").strip("/")
if replay_path == "index.html":
replay_path = ""
target = build_snapshot_url(snapshot_id, replay_path, request=request)
if request.META.get("QUERY_STRING"):
target = f"{target}?{request.META['QUERY_STRING']}"
return redirect(target)
return get_response(request)
if host_matches(request_host, api_host):
@@ -160,16 +193,9 @@ def HostRoutingMiddleware(get_response):
if host_matches(request_host, web_host):
request.user = AnonymousUser()
request._cached_user = request.user
if request.path.startswith("/admin"):
target = build_admin_url(request.path, request=request)
if request.META.get("QUERY_STRING"):
target = f"{target}?{request.META['QUERY_STRING']}"
return redirect(target)
return get_response(request)
if host_matches(request_host, public_host):
request.user = AnonymousUser()
request._cached_user = request.user
return get_response(request)
if subdomain:
@@ -196,24 +222,26 @@ def HostRoutingMiddleware(get_response):
return middleware
class ReverseProxyAuthMiddleware(RemoteUserMiddleware):
header = 'HTTP_{normalized}'.format(normalized=SERVER_CONFIG.REVERSE_PROXY_USER_HEADER.replace('-', '_').upper())
header = "HTTP_{normalized}".format(normalized=SERVER_CONFIG.REVERSE_PROXY_USER_HEADER.replace("-", "_").upper())
def process_request(self, request):
if SERVER_CONFIG.REVERSE_PROXY_WHITELIST == '':
if SERVER_CONFIG.REVERSE_PROXY_WHITELIST == "":
return
ip = request.META.get('REMOTE_ADDR')
ip = request.META.get("REMOTE_ADDR")
if not isinstance(ip, str):
return
for cidr in SERVER_CONFIG.REVERSE_PROXY_WHITELIST.split(','):
for cidr in SERVER_CONFIG.REVERSE_PROXY_WHITELIST.split(","):
try:
network = ipaddress.ip_network(cidr)
except ValueError:
raise ImproperlyConfigured(
"The REVERSE_PROXY_WHITELIST config paramater is in invalid format, or "
"contains invalid CIDR. Correct format is a coma-separated list of IPv4/IPv6 CIDRs.")
"The REVERSE_PROXY_WHITELIST config parameter is in invalid format, or "
"contains invalid CIDR. Correct format is a coma-separated list of IPv4/IPv6 CIDRs.",
)
if ipaddress.ip_address(ip) in network:
return super().process_request(request)

View File

@@ -5,23 +5,21 @@ import uuid
class Migration(migrations.Migration):
initial = True
dependencies = [
]
dependencies = []
operations = [
migrations.CreateModel(
name='Snapshot',
name="Snapshot",
fields=[
('id', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)),
('url', models.URLField(unique=True)),
('timestamp', models.CharField(default=None, max_length=32, null=True, unique=True)),
('title', models.CharField(default=None, max_length=128, null=True)),
('tags', models.CharField(default=None, max_length=256, null=True)),
('added', models.DateTimeField(auto_now_add=True)),
('updated', models.DateTimeField(default=None, null=True)),
("id", models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)),
("url", models.URLField(unique=True)),
("timestamp", models.CharField(default=None, max_length=32, null=True, unique=True)),
("title", models.CharField(default=None, max_length=128, null=True)),
("tags", models.CharField(default=None, max_length=256, null=True)),
("added", models.DateTimeField(auto_now_add=True)),
("updated", models.DateTimeField(default=None, null=True)),
],
),
]

View File

@@ -4,15 +4,14 @@ from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0001_initial'),
("core", "0001_initial"),
]
operations = [
migrations.AlterField(
model_name='snapshot',
name='timestamp',
model_name="snapshot",
name="timestamp",
field=models.CharField(default=None, max_length=32, null=True),
),
]

View File

@@ -4,35 +4,34 @@ from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0002_auto_20200625_1521'),
("core", "0002_auto_20200625_1521"),
]
operations = [
migrations.AlterField(
model_name='snapshot',
name='added',
model_name="snapshot",
name="added",
field=models.DateTimeField(auto_now_add=True, db_index=True),
),
migrations.AlterField(
model_name='snapshot',
name='tags',
model_name="snapshot",
name="tags",
field=models.CharField(db_index=True, default=None, max_length=256, null=True),
),
migrations.AlterField(
model_name='snapshot',
name='timestamp',
model_name="snapshot",
name="timestamp",
field=models.CharField(db_index=True, default=None, max_length=32, null=True),
),
migrations.AlterField(
model_name='snapshot',
name='title',
model_name="snapshot",
name="title",
field=models.CharField(db_index=True, default=None, max_length=128, null=True),
),
migrations.AlterField(
model_name='snapshot',
name='updated',
model_name="snapshot",
name="updated",
field=models.DateTimeField(db_index=True, default=None, null=True),
),
]

View File

@@ -4,15 +4,14 @@ from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0003_auto_20200630_1034'),
("core", "0003_auto_20200630_1034"),
]
operations = [
migrations.AlterField(
model_name='snapshot',
name='timestamp',
model_name="snapshot",
name="timestamp",
field=models.CharField(db_index=True, default=None, max_length=32, unique=True),
preserve_default=False,
),

View File

@@ -4,25 +4,24 @@ from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0004_auto_20200713_1552'),
("core", "0004_auto_20200713_1552"),
]
operations = [
migrations.AlterField(
model_name='snapshot',
name='tags',
model_name="snapshot",
name="tags",
field=models.CharField(blank=True, db_index=True, max_length=256, null=True),
),
migrations.AlterField(
model_name='snapshot',
name='title',
model_name="snapshot",
name="title",
field=models.CharField(blank=True, db_index=True, max_length=128, null=True),
),
migrations.AlterField(
model_name='snapshot',
name='updated',
model_name="snapshot",
name="updated",
field=models.DateTimeField(blank=True, db_index=True, null=True),
),
]

View File

@@ -3,19 +3,18 @@
from django.db import migrations, models
from django.utils.text import slugify
def forwards_func(apps, schema_editor):
SnapshotModel = apps.get_model("core", "Snapshot")
TagModel = apps.get_model("core", "Tag")
snapshots = SnapshotModel.objects.all()
for snapshot in snapshots:
tag_set = (
set(tag.strip() for tag in (snapshot.tags_old or '').split(','))
)
tag_set = {tag.strip() for tag in (snapshot.tags_old or "").split(",")}
tag_set.discard("")
for tag in tag_set:
to_add, _ = TagModel.objects.get_or_create(name=tag, defaults={'slug': slugify(tag)})
to_add, _ = TagModel.objects.get_or_create(name=tag, defaults={"slug": slugify(tag)})
snapshot.tags.add(to_add)
@@ -30,37 +29,36 @@ def reverse_func(apps, schema_editor):
class Migration(migrations.Migration):
dependencies = [
('core', '0005_auto_20200728_0326'),
("core", "0005_auto_20200728_0326"),
]
operations = [
migrations.RenameField(
model_name='snapshot',
old_name='tags',
new_name='tags_old',
model_name="snapshot",
old_name="tags",
new_name="tags_old",
),
migrations.CreateModel(
name='Tag',
name="Tag",
fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('name', models.CharField(max_length=100, unique=True, verbose_name='name')),
('slug', models.SlugField(max_length=100, unique=True, verbose_name='slug')),
("id", models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")),
("name", models.CharField(max_length=100, unique=True, verbose_name="name")),
("slug", models.SlugField(max_length=100, unique=True, verbose_name="slug")),
],
options={
'verbose_name': 'Tag',
'verbose_name_plural': 'Tags',
"verbose_name": "Tag",
"verbose_name_plural": "Tags",
},
),
migrations.AddField(
model_name='snapshot',
name='tags',
field=models.ManyToManyField(to='core.Tag'),
model_name="snapshot",
name="tags",
field=models.ManyToManyField(to="core.Tag"),
),
migrations.RunPython(forwards_func, reverse_func),
migrations.RemoveField(
model_name='snapshot',
name='tags_old',
model_name="snapshot",
name="tags_old",
),
]

View File

@@ -9,13 +9,15 @@ import django.db.models.deletion
# Handle old vs new import paths
try:
from archivebox.config import CONSTANTS
ARCHIVE_DIR = CONSTANTS.ARCHIVE_DIR
except ImportError:
try:
from archivebox.config import CONFIG
ARCHIVE_DIR = Path(CONFIG.get('ARCHIVE_DIR', './archive'))
ARCHIVE_DIR = Path(CONFIG.get("ARCHIVE_DIR", "./archive"))
except ImportError:
ARCHIVE_DIR = Path('./archive')
ARCHIVE_DIR = Path("./archive")
try:
from archivebox.misc.util import to_json
@@ -29,6 +31,7 @@ try:
JSONField = models.JSONField
except AttributeError:
import jsonfield
JSONField = jsonfield.JSONField
@@ -41,7 +44,7 @@ def forwards_func(apps, schema_editor):
out_dir = Path(ARCHIVE_DIR) / snapshot.timestamp
try:
with open(out_dir / "index.json", "r") as f:
with open(out_dir / "index.json") as f:
fs_index = json.load(f)
except Exception:
continue
@@ -56,37 +59,46 @@ def forwards_func(apps, schema_editor):
snapshot=snapshot,
pwd=result["pwd"],
cmd=result.get("cmd") or [],
cmd_version=result.get("cmd_version") or 'unknown',
cmd_version=result.get("cmd_version") or "unknown",
start_ts=result["start_ts"],
end_ts=result["end_ts"],
status=result["status"],
output=result.get("output") or 'null',
output=result.get("output") or "null",
)
except Exception as e:
print(
' ! Skipping import due to missing/invalid index.json:',
" ! Skipping import due to missing/invalid index.json:",
out_dir,
e,
'(open an issue with this index.json for help)',
"(open an issue with this index.json for help)",
)
def verify_json_index_integrity(snapshot):
results = snapshot.archiveresult_set.all()
out_dir = Path(ARCHIVE_DIR) / snapshot.timestamp
with open(out_dir / "index.json", "r") as f:
with open(out_dir / "index.json") as f:
index = json.load(f)
history = index["history"]
index_results = [result for extractor in history for result in history[extractor]]
flattened_results = [result["start_ts"] for result in index_results]
missing_results = [result for result in results if result.start_ts.isoformat() not in flattened_results]
for missing in missing_results:
index["history"][missing.extractor].append({"cmd": missing.cmd, "cmd_version": missing.cmd_version, "end_ts": missing.end_ts.isoformat(),
"start_ts": missing.start_ts.isoformat(), "pwd": missing.pwd, "output": missing.output,
"schema": "ArchiveResult", "status": missing.status})
index["history"][missing.extractor].append(
{
"cmd": missing.cmd,
"cmd_version": missing.cmd_version,
"end_ts": missing.end_ts.isoformat(),
"start_ts": missing.start_ts.isoformat(),
"pwd": missing.pwd,
"output": missing.output,
"schema": "ArchiveResult",
"status": missing.status,
},
)
json_index = to_json(index)
with open(out_dir / "index.json", "w") as f:
@@ -103,25 +115,47 @@ def reverse_func(apps, schema_editor):
class Migration(migrations.Migration):
dependencies = [
('core', '0006_auto_20201012_1520'),
("core", "0006_auto_20201012_1520"),
]
operations = [
migrations.CreateModel(
name='ArchiveResult',
name="ArchiveResult",
fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('cmd', JSONField()),
('pwd', models.CharField(max_length=256)),
('cmd_version', models.CharField(max_length=32)),
('status', models.CharField(choices=[('succeeded', 'succeeded'), ('failed', 'failed'), ('skipped', 'skipped')], max_length=16)),
('output', models.CharField(max_length=512)),
('start_ts', models.DateTimeField()),
('end_ts', models.DateTimeField()),
('extractor', models.CharField(choices=[('title', 'title'), ('favicon', 'favicon'), ('wget', 'wget'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('headers', 'headers'), ('archivedotorg', 'archivedotorg')], max_length=32)),
('snapshot', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='core.Snapshot')),
("id", models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")),
("cmd", JSONField()),
("pwd", models.CharField(max_length=256)),
("cmd_version", models.CharField(max_length=32)),
(
"status",
models.CharField(choices=[("succeeded", "succeeded"), ("failed", "failed"), ("skipped", "skipped")], max_length=16),
),
("output", models.CharField(max_length=512)),
("start_ts", models.DateTimeField()),
("end_ts", models.DateTimeField()),
(
"extractor",
models.CharField(
choices=[
("title", "title"),
("favicon", "favicon"),
("wget", "wget"),
("singlefile", "singlefile"),
("pdf", "pdf"),
("screenshot", "screenshot"),
("dom", "dom"),
("readability", "readability"),
("mercury", "mercury"),
("git", "git"),
("media", "media"),
("headers", "headers"),
("archivedotorg", "archivedotorg"),
],
max_length=32,
),
),
("snapshot", models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to="core.Snapshot")),
],
),
migrations.RunPython(forwards_func, reverse_func),

View File

@@ -4,15 +4,14 @@ from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0007_archiveresult'),
("core", "0007_archiveresult"),
]
operations = [
migrations.AlterField(
model_name='archiveresult',
name='cmd_version',
model_name="archiveresult",
name="cmd_version",
field=models.CharField(blank=True, default=None, max_length=32, null=True),
),
]

View File

@@ -4,15 +4,14 @@ from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0008_auto_20210105_1421'),
("core", "0008_auto_20210105_1421"),
]
operations = [
migrations.AlterField(
model_name='snapshot',
name='updated',
model_name="snapshot",
name="updated",
field=models.DateTimeField(auto_now=True, db_index=True, null=True),
),
]

View File

@@ -4,15 +4,14 @@ from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0009_auto_20210216_1038'),
("core", "0009_auto_20210216_1038"),
]
operations = [
migrations.AlterField(
model_name='archiveresult',
name='start_ts',
model_name="archiveresult",
name="start_ts",
field=models.DateTimeField(db_index=True),
),
]

View File

@@ -5,20 +5,36 @@ import uuid
class Migration(migrations.Migration):
dependencies = [
('core', '0010_auto_20210216_1055'),
("core", "0010_auto_20210216_1055"),
]
operations = [
migrations.AddField(
model_name='archiveresult',
name='uuid',
model_name="archiveresult",
name="uuid",
field=models.UUIDField(default=uuid.uuid4, editable=False),
),
migrations.AlterField(
model_name='archiveresult',
name='extractor',
field=models.CharField(choices=[('title', 'title'), ('favicon', 'favicon'), ('headers', 'headers'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('wget', 'wget'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('archivedotorg', 'archivedotorg')], max_length=32),
model_name="archiveresult",
name="extractor",
field=models.CharField(
choices=[
("title", "title"),
("favicon", "favicon"),
("headers", "headers"),
("singlefile", "singlefile"),
("pdf", "pdf"),
("screenshot", "screenshot"),
("dom", "dom"),
("wget", "wget"),
("readability", "readability"),
("mercury", "mercury"),
("git", "git"),
("media", "media"),
("archivedotorg", "archivedotorg"),
],
max_length=32,
),
),
]

View File

@@ -4,20 +4,19 @@ from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0011_auto_20210216_1331'),
("core", "0011_auto_20210216_1331"),
]
operations = [
migrations.AlterField(
model_name='archiveresult',
name='cmd_version',
model_name="archiveresult",
name="cmd_version",
field=models.CharField(blank=True, default=None, max_length=128, null=True),
),
migrations.AlterField(
model_name='archiveresult',
name='output',
model_name="archiveresult",
name="output",
field=models.CharField(max_length=1024),
),
]

View File

@@ -4,15 +4,14 @@ from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0012_auto_20210216_1425'),
("core", "0012_auto_20210216_1425"),
]
operations = [
migrations.AlterField(
model_name='snapshot',
name='title',
model_name="snapshot",
name="title",
field=models.CharField(blank=True, db_index=True, max_length=256, null=True),
),
]

View File

@@ -4,15 +4,14 @@ from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0013_auto_20210218_0729'),
("core", "0013_auto_20210218_0729"),
]
operations = [
migrations.AlterField(
model_name='snapshot',
name='title',
model_name="snapshot",
name="title",
field=models.CharField(blank=True, db_index=True, max_length=1024, null=True),
),
]

View File

@@ -4,15 +4,14 @@ from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0014_auto_20210218_0729'),
("core", "0014_auto_20210218_0729"),
]
operations = [
migrations.AlterField(
model_name='snapshot',
name='title',
model_name="snapshot",
name="title",
field=models.CharField(blank=True, db_index=True, max_length=512, null=True),
),
]

View File

@@ -4,15 +4,14 @@ from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0015_auto_20210218_0730'),
("core", "0015_auto_20210218_0730"),
]
operations = [
migrations.AlterField(
model_name='snapshot',
name='tags',
field=models.ManyToManyField(blank=True, to='core.Tag'),
model_name="snapshot",
name="tags",
field=models.ManyToManyField(blank=True, to="core.Tag"),
),
]

View File

@@ -4,15 +4,14 @@ from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0016_auto_20210218_1204'),
("core", "0016_auto_20210218_1204"),
]
operations = [
migrations.AlterField(
model_name='tag',
name='slug',
field=models.SlugField(blank=True, max_length=100, unique=True, verbose_name='slug'),
model_name="tag",
name="slug",
field=models.SlugField(blank=True, max_length=100, unique=True, verbose_name="slug"),
),
]

View File

@@ -4,20 +4,19 @@ from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0017_auto_20210219_0211'),
("core", "0017_auto_20210219_0211"),
]
operations = [
migrations.AlterField(
model_name='tag',
name='name',
model_name="tag",
name="name",
field=models.CharField(max_length=100, unique=True),
),
migrations.AlterField(
model_name='tag',
name='slug',
model_name="tag",
name="slug",
field=models.SlugField(blank=True, max_length=100, unique=True),
),
]

View File

@@ -4,15 +4,14 @@ from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0018_auto_20210327_0952'),
("core", "0018_auto_20210327_0952"),
]
operations = [
migrations.AlterField(
model_name='snapshot',
name='url',
model_name="snapshot",
name="url",
field=models.URLField(db_index=True, unique=True),
),
]

View File

@@ -4,20 +4,19 @@ from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0019_auto_20210401_0654'),
("core", "0019_auto_20210401_0654"),
]
operations = [
migrations.AlterField(
model_name='archiveresult',
name='id',
field=models.AutoField(primary_key=True, serialize=False, verbose_name='ID'),
model_name="archiveresult",
name="id",
field=models.AutoField(primary_key=True, serialize=False, verbose_name="ID"),
),
migrations.AlterField(
model_name='tag',
name='id',
field=models.AutoField(primary_key=True, serialize=False, verbose_name='ID'),
model_name="tag",
name="id",
field=models.AutoField(primary_key=True, serialize=False, verbose_name="ID"),
),
]

View File

@@ -4,15 +4,31 @@ from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0020_auto_20210410_1031'),
("core", "0020_auto_20210410_1031"),
]
operations = [
migrations.AlterField(
model_name='archiveresult',
name='extractor',
field=models.CharField(choices=[('favicon', 'favicon'), ('headers', 'headers'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('wget', 'wget'), ('title', 'title'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('archivedotorg', 'archivedotorg')], max_length=32),
model_name="archiveresult",
name="extractor",
field=models.CharField(
choices=[
("favicon", "favicon"),
("headers", "headers"),
("singlefile", "singlefile"),
("pdf", "pdf"),
("screenshot", "screenshot"),
("dom", "dom"),
("wget", "wget"),
("title", "title"),
("readability", "readability"),
("mercury", "mercury"),
("git", "git"),
("media", "media"),
("archivedotorg", "archivedotorg"),
],
max_length=32,
),
),
]

View File

@@ -4,15 +4,32 @@ from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0021_auto_20220914_0934'),
("core", "0021_auto_20220914_0934"),
]
operations = [
migrations.AlterField(
model_name='archiveresult',
name='extractor',
field=models.CharField(choices=[('favicon', 'favicon'), ('headers', 'headers'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('wget', 'wget'), ('title', 'title'), ('readability', 'readability'), ('mercury', 'mercury'), ('htmltotext', 'htmltotext'), ('git', 'git'), ('media', 'media'), ('archivedotorg', 'archivedotorg')], max_length=32),
model_name="archiveresult",
name="extractor",
field=models.CharField(
choices=[
("favicon", "favicon"),
("headers", "headers"),
("singlefile", "singlefile"),
("pdf", "pdf"),
("screenshot", "screenshot"),
("dom", "dom"),
("wget", "wget"),
("title", "title"),
("readability", "readability"),
("mercury", "mercury"),
("htmltotext", "htmltotext"),
("git", "git"),
("media", "media"),
("archivedotorg", "archivedotorg"),
],
max_length=32,
),
),
]

View File

@@ -16,6 +16,7 @@ def get_table_columns(table_name):
def upgrade_core_tables(apps, schema_editor):
"""Upgrade core tables from v0.7.2 or v0.8.6rc0 to v0.9.0."""
from archivebox.uuid_compat import uuid7
cursor = connection.cursor()
# Check if core_archiveresult table exists
@@ -30,11 +31,11 @@ def upgrade_core_tables(apps, schema_editor):
has_data = row_count > 0
# Detect which version we're migrating from
archiveresult_cols = get_table_columns('core_archiveresult')
has_uuid = 'uuid' in archiveresult_cols
has_abid = 'abid' in archiveresult_cols
archiveresult_cols = get_table_columns("core_archiveresult")
has_uuid = "uuid" in archiveresult_cols
has_abid = "abid" in archiveresult_cols
print(f'DEBUG: ArchiveResult row_count={row_count}, has_data={has_data}, has_uuid={has_uuid}, has_abid={has_abid}')
print(f"DEBUG: ArchiveResult row_count={row_count}, has_data={has_data}, has_uuid={has_uuid}, has_abid={has_abid}")
# ============================================================================
# PART 1: Upgrade core_archiveresult table
@@ -62,7 +63,7 @@ def upgrade_core_tables(apps, schema_editor):
if has_data:
if has_uuid and not has_abid:
# Migrating from v0.7.2+ (has uuid column)
print('Migrating ArchiveResult from v0.7.2+ schema (with uuid)...')
print("Migrating ArchiveResult from v0.7.2+ schema (with uuid)...")
cursor.execute("""
INSERT OR IGNORE INTO core_archiveresult_new (
id, uuid, snapshot_id, cmd, pwd, cmd_version,
@@ -75,7 +76,7 @@ def upgrade_core_tables(apps, schema_editor):
""")
elif has_abid and not has_uuid:
# Migrating from v0.8.6rc0 (has abid instead of uuid)
print('Migrating ArchiveResult from v0.8.6rc0 schema...')
print("Migrating ArchiveResult from v0.8.6rc0 schema...")
cursor.execute("""
INSERT OR IGNORE INTO core_archiveresult_new (
id, uuid, snapshot_id, cmd, pwd, cmd_version,
@@ -88,17 +89,34 @@ def upgrade_core_tables(apps, schema_editor):
""")
else:
# Migrating from v0.7.2 (no uuid or abid column - generate fresh UUIDs)
print('Migrating ArchiveResult from v0.7.2 schema (no uuid - generating UUIDs)...')
cursor.execute("SELECT id, snapshot_id, cmd, pwd, cmd_version, start_ts, end_ts, status, extractor, output FROM core_archiveresult")
print("Migrating ArchiveResult from v0.7.2 schema (no uuid - generating UUIDs)...")
cursor.execute(
"SELECT id, snapshot_id, cmd, pwd, cmd_version, start_ts, end_ts, status, extractor, output FROM core_archiveresult",
)
old_records = cursor.fetchall()
for record in old_records:
new_uuid = uuid7().hex
cursor.execute("""
cursor.execute(
"""
INSERT OR IGNORE INTO core_archiveresult_new (
id, uuid, snapshot_id, cmd, pwd, cmd_version,
start_ts, end_ts, status, extractor, output
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
""", (record[0], new_uuid, record[1], record[2], record[3], record[4], record[5], record[6], record[7], record[8], record[9]))
""",
(
record[0],
new_uuid,
record[1],
record[2],
record[3],
record[4],
record[5],
record[6],
record[7],
record[8],
record[9],
),
)
cursor.execute("DROP TABLE IF EXISTS core_archiveresult;")
cursor.execute("ALTER TABLE core_archiveresult_new RENAME TO core_archiveresult;")
@@ -149,13 +167,13 @@ def upgrade_core_tables(apps, schema_editor):
if snapshot_has_data:
# Detect which version we're migrating from
snapshot_cols = get_table_columns('core_snapshot')
has_added = 'added' in snapshot_cols
has_bookmarked_at = 'bookmarked_at' in snapshot_cols
snapshot_cols = get_table_columns("core_snapshot")
has_added = "added" in snapshot_cols
has_bookmarked_at = "bookmarked_at" in snapshot_cols
if has_added and not has_bookmarked_at:
# Migrating from v0.7.2 (has added/updated fields)
print('Migrating Snapshot from v0.7.2 schema...')
print("Migrating Snapshot from v0.7.2 schema...")
# Transform added→bookmarked_at/created_at and updated→modified_at
cursor.execute("""
INSERT OR IGNORE INTO core_snapshot_new (
@@ -173,28 +191,28 @@ def upgrade_core_tables(apps, schema_editor):
""")
elif has_bookmarked_at and not has_added:
# Migrating from v0.8.6rc0 (already has bookmarked_at/created_at/modified_at)
print('Migrating Snapshot from v0.8.6rc0 schema...')
print("Migrating Snapshot from v0.8.6rc0 schema...")
# Check what fields exist
has_status = 'status' in snapshot_cols
has_retry_at = 'retry_at' in snapshot_cols
has_crawl_id = 'crawl_id' in snapshot_cols
has_status = "status" in snapshot_cols
has_retry_at = "retry_at" in snapshot_cols
has_crawl_id = "crawl_id" in snapshot_cols
# Build column list based on what exists
cols = ['id', 'url', 'timestamp', 'title', 'bookmarked_at', 'created_at', 'modified_at', 'downloaded_at']
cols = ["id", "url", "timestamp", "title", "bookmarked_at", "created_at", "modified_at", "downloaded_at"]
if has_crawl_id:
cols.append('crawl_id')
cols.append("crawl_id")
if has_status:
cols.append('status')
cols.append("status")
if has_retry_at:
cols.append('retry_at')
cols.append("retry_at")
cursor.execute(f"""
INSERT OR IGNORE INTO core_snapshot_new ({', '.join(cols)})
SELECT {', '.join(cols)}
INSERT OR IGNORE INTO core_snapshot_new ({", ".join(cols)})
SELECT {", ".join(cols)}
FROM core_snapshot;
""")
else:
print(f'Warning: Unexpected Snapshot schema - has_added={has_added}, has_bookmarked_at={has_bookmarked_at}')
print(f"Warning: Unexpected Snapshot schema - has_added={has_added}, has_bookmarked_at={has_bookmarked_at}")
cursor.execute("DROP TABLE IF EXISTS core_snapshot;")
cursor.execute("ALTER TABLE core_snapshot_new RENAME TO core_snapshot;")
@@ -237,13 +255,13 @@ def upgrade_core_tables(apps, schema_editor):
cursor.execute("PRAGMA table_info(core_tag)")
tag_id_type = None
for row in cursor.fetchall():
if row[1] == 'id': # row[1] is column name
if row[1] == "id": # row[1] is column name
tag_id_type = row[2] # row[2] is type
break
if tag_id_type and 'char' in tag_id_type.lower():
if tag_id_type and "char" in tag_id_type.lower():
# v0.8.6rc0: Tag IDs are UUIDs, need to convert to INTEGER
print('Converting Tag IDs from UUID to INTEGER...')
print("Converting Tag IDs from UUID to INTEGER...")
# Get all tags with their UUIDs
cursor.execute("SELECT id, name, slug, created_at, modified_at, created_by_id FROM core_tag ORDER BY name")
@@ -255,10 +273,13 @@ def upgrade_core_tables(apps, schema_editor):
old_id, name, slug, created_at, modified_at, created_by_id = tag
uuid_to_int_map[old_id] = i
# Insert with new INTEGER ID
cursor.execute("""
cursor.execute(
"""
INSERT OR IGNORE INTO core_tag_new (id, name, slug, created_at, modified_at, created_by_id)
VALUES (?, ?, ?, ?, ?, ?)
""", (i, name, slug, created_at, modified_at, created_by_id))
""",
(i, name, slug, created_at, modified_at, created_by_id),
)
# Update snapshot_tags to use new INTEGER IDs
cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='core_snapshot_tags'")
@@ -273,13 +294,16 @@ def upgrade_core_tables(apps, schema_editor):
for st_id, snapshot_id, old_tag_id in snapshot_tags:
new_tag_id = uuid_to_int_map.get(old_tag_id)
if new_tag_id:
cursor.execute("""
cursor.execute(
"""
INSERT OR IGNORE INTO core_snapshot_tags (id, snapshot_id, tag_id)
VALUES (?, ?, ?)
""", (st_id, snapshot_id, new_tag_id))
""",
(st_id, snapshot_id, new_tag_id),
)
else:
# v0.7.2: Tag IDs are already INTEGER
print('Migrating Tag from v0.7.2 schema...')
print("Migrating Tag from v0.7.2 schema...")
cursor.execute("""
INSERT OR IGNORE INTO core_tag_new (id, name, slug)
SELECT id, name, slug
@@ -294,15 +318,14 @@ def upgrade_core_tables(apps, schema_editor):
cursor.execute("CREATE INDEX IF NOT EXISTS core_tag_created_by_id_idx ON core_tag(created_by_id);")
if has_data:
print('✓ Core tables upgraded to v0.9.0')
print("✓ Core tables upgraded to v0.9.0")
class Migration(migrations.Migration):
dependencies = [
('core', '0022_auto_20231023_2008'),
('crawls', '0001_initial'),
('auth', '0012_alter_user_first_name_max_length'),
("core", "0022_auto_20231023_2008"),
("crawls", "0001_initial"),
("auth", "0012_alter_user_first_name_max_length"),
]
operations = [
@@ -317,60 +340,58 @@ class Migration(migrations.Migration):
# NOTE: We do NOT remove extractor/output for ArchiveResult!
# They are still in the database and will be removed by migration 0025
# after copying their data to plugin/output_str.
# However, for Snapshot, we DO remove added/updated and ADD the new timestamp fields
# because the SQL above already transformed them.
migrations.RemoveField(model_name='snapshot', name='added'),
migrations.RemoveField(model_name='snapshot', name='updated'),
migrations.RemoveField(model_name="snapshot", name="added"),
migrations.RemoveField(model_name="snapshot", name="updated"),
migrations.AddField(
model_name='snapshot',
name='bookmarked_at',
model_name="snapshot",
name="bookmarked_at",
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
),
migrations.AddField(
model_name='snapshot',
name='created_at',
model_name="snapshot",
name="created_at",
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
),
migrations.AddField(
model_name='snapshot',
name='modified_at',
model_name="snapshot",
name="modified_at",
field=models.DateTimeField(auto_now=True),
),
# Declare fs_version (already created in database with DEFAULT '0.8.0')
migrations.AddField(
model_name='snapshot',
name='fs_version',
model_name="snapshot",
name="fs_version",
field=models.CharField(
max_length=10,
default='0.8.0',
help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().'
default="0.8.0",
help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().',
),
),
# SnapshotTag table already exists from v0.7.2, just declare it in state
migrations.CreateModel(
name='SnapshotTag',
name="SnapshotTag",
fields=[
('id', models.AutoField(primary_key=True, serialize=False)),
('snapshot', models.ForeignKey(to='core.Snapshot', db_column='snapshot_id', on_delete=models.CASCADE)),
('tag', models.ForeignKey(to='core.Tag', db_column='tag_id', on_delete=models.CASCADE)),
("id", models.AutoField(primary_key=True, serialize=False)),
("snapshot", models.ForeignKey(to="core.Snapshot", db_column="snapshot_id", on_delete=models.CASCADE)),
("tag", models.ForeignKey(to="core.Tag", db_column="tag_id", on_delete=models.CASCADE)),
],
options={
'db_table': 'core_snapshot_tags',
'unique_together': {('snapshot', 'tag')},
"db_table": "core_snapshot_tags",
"unique_together": {("snapshot", "tag")},
},
),
# Declare that Snapshot.tags M2M already uses through=SnapshotTag (from v0.7.2)
migrations.AlterField(
model_name='snapshot',
name='tags',
model_name="snapshot",
name="tags",
field=models.ManyToManyField(
'Tag',
"Tag",
blank=True,
related_name='snapshot_set',
through='SnapshotTag',
through_fields=('snapshot', 'tag'),
related_name="snapshot_set",
through="SnapshotTag",
through_fields=("snapshot", "tag"),
),
),
],

View File

@@ -20,23 +20,27 @@ def create_default_crawl_and_assign_snapshots(apps, schema_editor):
snapshots_without_crawl = cursor.fetchone()[0]
if snapshots_without_crawl == 0:
print('✓ Fresh install or all snapshots already have crawls')
print("✓ Fresh install or all snapshots already have crawls")
return
# Get or create system user (pk=1)
cursor.execute("SELECT id FROM auth_user WHERE id = 1")
if not cursor.fetchone():
cursor.execute("""
cursor.execute(
"""
INSERT INTO auth_user (id, password, is_superuser, username, first_name, last_name, email, is_staff, is_active, date_joined)
VALUES (1, '!', 1, 'system', '', '', '', 1, 1, ?)
""", [datetime.now().isoformat()])
""",
[datetime.now().isoformat()],
)
# Create a default crawl for migrated snapshots
# At this point crawls_crawl is guaranteed to have v0.9.0 schema (crawls/0002 ran first)
crawl_id = str(uuid_lib.uuid4())
now = datetime.now().isoformat()
cursor.execute("""
cursor.execute(
"""
INSERT INTO crawls_crawl (
id, created_at, modified_at, num_uses_succeeded, num_uses_failed,
urls, max_depth, tags_str, label, notes, output_dir,
@@ -44,20 +48,21 @@ def create_default_crawl_and_assign_snapshots(apps, schema_editor):
) VALUES (?, ?, ?, 0, 0, '', 0, '', 'Migrated from v0.7.2/v0.8.6',
'Auto-created crawl for migrated snapshots', '',
'sealed', ?, 1, NULL, '{}', NULL)
""", [crawl_id, now, now, now])
""",
[crawl_id, now, now, now],
)
# Assign all snapshots without a crawl to the default crawl
cursor.execute("UPDATE core_snapshot SET crawl_id = ? WHERE crawl_id IS NULL", [crawl_id])
print(f'✓ Assigned {snapshots_without_crawl} snapshots to default crawl {crawl_id}')
print(f"✓ Assigned {snapshots_without_crawl} snapshots to default crawl {crawl_id}")
class Migration(migrations.Migration):
dependencies = [
('core', '0023_upgrade_to_0_9_0'),
('crawls', '0002_upgrade_from_0_8_6'),
('auth', '0012_alter_user_first_name_max_length'),
("core", "0023_upgrade_to_0_9_0"),
("crawls", "0002_upgrade_from_0_8_6"),
("auth", "0012_alter_user_first_name_max_length"),
]
operations = [
@@ -137,12 +142,12 @@ class Migration(migrations.Migration):
],
state_operations=[
migrations.AddField(
model_name='snapshot',
name='crawl',
model_name="snapshot",
name="crawl",
field=models.ForeignKey(
on_delete=models.deletion.CASCADE,
to='crawls.crawl',
help_text='Crawl that created this snapshot'
to="crawls.crawl",
help_text="Crawl that created this snapshot",
),
),
],

View File

@@ -17,20 +17,24 @@ def copy_old_fields_to_new(apps, schema_editor):
cursor.execute("PRAGMA table_info(core_archiveresult)")
cols = {row[1] for row in cursor.fetchall()}
if 'extractor' in cols and 'plugin' in cols:
if "extractor" in cols and "plugin" in cols:
# Copy extractor -> plugin
cursor.execute("UPDATE core_archiveresult SET plugin = COALESCE(extractor, '') WHERE plugin = '' OR plugin IS NULL")
if 'output' in cols and 'output_str' in cols:
if "output" in cols and "output_str" in cols:
# Copy output -> output_str
cursor.execute("UPDATE core_archiveresult SET output_str = COALESCE(output, '') WHERE output_str = '' OR output_str IS NULL")
# Copy timestamps to new timestamp fields if they don't have values yet
if 'start_ts' in cols and 'created_at' in cols:
cursor.execute("UPDATE core_archiveresult SET created_at = COALESCE(start_ts, CURRENT_TIMESTAMP) WHERE created_at IS NULL OR created_at = ''")
if "start_ts" in cols and "created_at" in cols:
cursor.execute(
"UPDATE core_archiveresult SET created_at = COALESCE(start_ts, CURRENT_TIMESTAMP) WHERE created_at IS NULL OR created_at = ''",
)
if 'end_ts' in cols and 'modified_at' in cols:
cursor.execute("UPDATE core_archiveresult SET modified_at = COALESCE(end_ts, start_ts, CURRENT_TIMESTAMP) WHERE modified_at IS NULL OR modified_at = ''")
if "end_ts" in cols and "modified_at" in cols:
cursor.execute(
"UPDATE core_archiveresult SET modified_at = COALESCE(end_ts, start_ts, CURRENT_TIMESTAMP) WHERE modified_at IS NULL OR modified_at = ''",
)
# NOTE: Snapshot timestamps (added→bookmarked_at, updated→modified_at) were already
# transformed by migration 0023, so we don't need to copy them here.
@@ -39,164 +43,191 @@ def copy_old_fields_to_new(apps, schema_editor):
# Debug: Check Snapshot timestamps at end of RunPython
cursor.execute("SELECT id, bookmarked_at, modified_at FROM core_snapshot LIMIT 2")
snap_after = cursor.fetchall()
print(f'DEBUG 0025: Snapshot timestamps at END of RunPython: {snap_after}')
print(f"DEBUG 0025: Snapshot timestamps at END of RunPython: {snap_after}")
class Migration(migrations.Migration):
dependencies = [
('core', '0024_assign_default_crawl'),
('crawls', '0001_initial'),
("core", "0024_assign_default_crawl"),
("crawls", "0001_initial"),
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
]
operations = [
migrations.AlterModelOptions(
name='archiveresult',
options={'verbose_name': 'Archive Result', 'verbose_name_plural': 'Archive Results Log'},
name="archiveresult",
options={"verbose_name": "Archive Result", "verbose_name_plural": "Archive Results Log"},
),
migrations.AlterModelOptions(
name='snapshot',
options={'verbose_name': 'Snapshot', 'verbose_name_plural': 'Snapshots'},
name="snapshot",
options={"verbose_name": "Snapshot", "verbose_name_plural": "Snapshots"},
),
# NOTE: RemoveField for cmd, cmd_version, pwd moved to migration 0027
# to allow data migration to Process records first
migrations.AddField(
model_name='archiveresult',
name='config',
model_name="archiveresult",
name="config",
field=models.JSONField(blank=True, default=dict, null=True),
),
migrations.AddField(
model_name='archiveresult',
name='created_at',
model_name="archiveresult",
name="created_at",
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
),
migrations.AddField(
model_name='archiveresult',
name='hook_name',
field=models.CharField(blank=True, db_index=True, default='', help_text='Full filename of the hook that executed (e.g., on_Snapshot__50_wget.py)', max_length=255),
model_name="archiveresult",
name="hook_name",
field=models.CharField(
blank=True,
db_index=True,
default="",
help_text="Full filename of the hook that executed (e.g., on_Snapshot__50_wget.py)",
max_length=255,
),
),
migrations.AddField(
model_name='archiveresult',
name='modified_at',
model_name="archiveresult",
name="modified_at",
field=models.DateTimeField(auto_now=True),
),
migrations.AddField(
model_name='archiveresult',
name='notes',
field=models.TextField(blank=True, default=''),
model_name="archiveresult",
name="notes",
field=models.TextField(blank=True, default=""),
),
migrations.AddField(
model_name='archiveresult',
name='num_uses_failed',
model_name="archiveresult",
name="num_uses_failed",
field=models.PositiveIntegerField(default=0),
),
migrations.AddField(
model_name='archiveresult',
name='num_uses_succeeded',
model_name="archiveresult",
name="num_uses_succeeded",
field=models.PositiveIntegerField(default=0),
),
migrations.AddField(
model_name='archiveresult',
name='output_files',
field=models.JSONField(default=dict, help_text='Dict of {relative_path: {metadata}}'),
model_name="archiveresult",
name="output_files",
field=models.JSONField(default=dict, help_text="Dict of {relative_path: {metadata}}"),
),
migrations.AddField(
model_name='archiveresult',
name='output_json',
field=models.JSONField(blank=True, default=None, help_text='Structured metadata (headers, redirects, etc.)', null=True),
model_name="archiveresult",
name="output_json",
field=models.JSONField(blank=True, default=None, help_text="Structured metadata (headers, redirects, etc.)", null=True),
),
migrations.AddField(
model_name='archiveresult',
name='output_mimetypes',
field=models.CharField(blank=True, default='', help_text='CSV of mimetypes sorted by size', max_length=512),
model_name="archiveresult",
name="output_mimetypes",
field=models.CharField(blank=True, default="", help_text="CSV of mimetypes sorted by size", max_length=512),
),
migrations.AddField(
model_name='archiveresult',
name='output_size',
field=models.BigIntegerField(default=0, help_text='Total bytes of all output files'),
model_name="archiveresult",
name="output_size",
field=models.BigIntegerField(default=0, help_text="Total bytes of all output files"),
),
migrations.AddField(
model_name='archiveresult',
name='output_str',
field=models.TextField(blank=True, default='', help_text='Human-readable output summary'),
model_name="archiveresult",
name="output_str",
field=models.TextField(blank=True, default="", help_text="Human-readable output summary"),
),
migrations.AddField(
model_name='archiveresult',
name='plugin',
field=models.CharField(db_index=True, default='', max_length=32),
model_name="archiveresult",
name="plugin",
field=models.CharField(db_index=True, default="", max_length=32),
),
migrations.AddField(
model_name='archiveresult',
name='retry_at',
model_name="archiveresult",
name="retry_at",
field=models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, null=True),
),
# NOTE: bookmarked_at and created_at already added by migration 0023
migrations.AddField(
model_name='snapshot',
name='config',
model_name="snapshot",
name="config",
field=models.JSONField(default=dict),
),
migrations.AddField(
model_name='snapshot',
name='current_step',
field=models.PositiveSmallIntegerField(db_index=True, default=0, help_text='Current hook step being executed (0-9). Used for sequential hook execution.'),
model_name="snapshot",
name="current_step",
field=models.PositiveSmallIntegerField(
db_index=True,
default=0,
help_text="Current hook step being executed (0-9). Used for sequential hook execution.",
),
),
migrations.AddField(
model_name='snapshot',
name='depth',
model_name="snapshot",
name="depth",
field=models.PositiveSmallIntegerField(db_index=True, default=0),
),
migrations.AddField(
model_name='snapshot',
name='downloaded_at',
model_name="snapshot",
name="downloaded_at",
field=models.DateTimeField(blank=True, db_index=True, default=None, editable=False, null=True),
),
# NOTE: fs_version already added by migration 0023 with default='0.8.0'
# NOTE: modified_at already added by migration 0023
migrations.AddField(
model_name='snapshot',
name='notes',
field=models.TextField(blank=True, default=''),
model_name="snapshot",
name="notes",
field=models.TextField(blank=True, default=""),
),
migrations.AddField(
model_name='snapshot',
name='num_uses_failed',
model_name="snapshot",
name="num_uses_failed",
field=models.PositiveIntegerField(default=0),
),
migrations.AddField(
model_name='snapshot',
name='num_uses_succeeded',
model_name="snapshot",
name="num_uses_succeeded",
field=models.PositiveIntegerField(default=0),
),
migrations.AddField(
model_name='snapshot',
name='parent_snapshot',
field=models.ForeignKey(blank=True, help_text='Parent snapshot that discovered this URL (for recursive crawling)', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='child_snapshots', to='core.snapshot'),
model_name="snapshot",
name="parent_snapshot",
field=models.ForeignKey(
blank=True,
help_text="Parent snapshot that discovered this URL (for recursive crawling)",
null=True,
on_delete=django.db.models.deletion.SET_NULL,
related_name="child_snapshots",
to="core.snapshot",
),
),
migrations.AddField(
model_name='snapshot',
name='retry_at',
model_name="snapshot",
name="retry_at",
field=models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, null=True),
),
migrations.AddField(
model_name='snapshot',
name='status',
field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('sealed', 'Sealed')], db_index=True, default='queued', max_length=15),
model_name="snapshot",
name="status",
field=models.CharField(
choices=[("queued", "Queued"), ("started", "Started"), ("sealed", "Sealed")],
db_index=True,
default="queued",
max_length=15,
),
),
migrations.AddField(
model_name='tag',
name='created_at',
model_name="tag",
name="created_at",
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now, null=True),
),
migrations.AddField(
model_name='tag',
name='created_by',
field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, null=True, on_delete=django.db.models.deletion.CASCADE, related_name='tag_set', to=settings.AUTH_USER_MODEL),
model_name="tag",
name="created_by",
field=models.ForeignKey(
default=archivebox.base_models.models.get_or_create_system_user_pk,
null=True,
on_delete=django.db.models.deletion.CASCADE,
related_name="tag_set",
to=settings.AUTH_USER_MODEL,
),
),
migrations.AddField(
model_name='tag',
name='modified_at',
model_name="tag",
name="modified_at",
field=models.DateTimeField(auto_now=True),
),
# Copy data from old field names to new field names after AddField operations
@@ -206,75 +237,93 @@ class Migration(migrations.Migration):
),
# Now remove the old ArchiveResult fields after data has been copied
migrations.RemoveField(
model_name='archiveresult',
name='extractor',
model_name="archiveresult",
name="extractor",
),
migrations.RemoveField(
model_name='archiveresult',
name='output',
model_name="archiveresult",
name="output",
),
# NOTE: Snapshot's added/updated were already removed by migration 0023
migrations.AlterField(
model_name='archiveresult',
name='end_ts',
model_name="archiveresult",
name="end_ts",
field=models.DateTimeField(blank=True, default=None, null=True),
),
migrations.AlterField(
model_name='archiveresult',
name='id',
model_name="archiveresult",
name="id",
field=models.AutoField(editable=False, primary_key=True, serialize=False),
),
migrations.AlterField(
model_name='archiveresult',
name='start_ts',
model_name="archiveresult",
name="start_ts",
field=models.DateTimeField(blank=True, default=None, null=True),
),
migrations.AlterField(
model_name='archiveresult',
name='status',
field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('backoff', 'Waiting to retry'), ('succeeded', 'Succeeded'), ('failed', 'Failed'), ('skipped', 'Skipped')], db_index=True, default='queued', max_length=15),
model_name="archiveresult",
name="status",
field=models.CharField(
choices=[
("queued", "Queued"),
("started", "Started"),
("backoff", "Waiting to retry"),
("succeeded", "Succeeded"),
("failed", "Failed"),
("skipped", "Skipped"),
],
db_index=True,
default="queued",
max_length=15,
),
),
migrations.AlterField(
model_name='archiveresult',
name='uuid',
model_name="archiveresult",
name="uuid",
field=models.UUIDField(blank=True, db_index=True, default=uuid7, null=True),
),
migrations.AlterField(
model_name='snapshot',
name='crawl',
field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to='crawls.crawl'),
model_name="snapshot",
name="crawl",
field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name="snapshot_set", to="crawls.crawl"),
),
migrations.AlterField(
model_name='snapshot',
name='id',
model_name="snapshot",
name="id",
field=models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True),
),
migrations.AlterField(
model_name='snapshot',
name='tags',
field=models.ManyToManyField(blank=True, related_name='snapshot_set', through='core.SnapshotTag', through_fields=('snapshot', 'tag'), to='core.tag'),
model_name="snapshot",
name="tags",
field=models.ManyToManyField(
blank=True,
related_name="snapshot_set",
through="core.SnapshotTag",
through_fields=("snapshot", "tag"),
to="core.tag",
),
),
migrations.AlterField(
model_name='snapshot',
name='timestamp',
model_name="snapshot",
name="timestamp",
field=models.CharField(db_index=True, editable=False, max_length=32, unique=True),
),
migrations.AlterField(
model_name='snapshot',
name='url',
model_name="snapshot",
name="url",
field=models.URLField(db_index=True),
),
migrations.AlterField(
model_name='tag',
name='slug',
model_name="tag",
name="slug",
field=models.SlugField(editable=False, max_length=100, unique=True),
),
migrations.AddConstraint(
model_name='snapshot',
constraint=models.UniqueConstraint(fields=('url', 'crawl'), name='unique_url_per_crawl'),
model_name="snapshot",
constraint=models.UniqueConstraint(fields=("url", "crawl"), name="unique_url_per_crawl"),
),
migrations.AddConstraint(
model_name='snapshot',
constraint=models.UniqueConstraint(fields=('timestamp',), name='unique_timestamp'),
model_name="snapshot",
constraint=models.UniqueConstraint(fields=("timestamp",), name="unique_timestamp"),
),
]

View File

@@ -5,24 +5,30 @@ from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0025_alter_archiveresult_options_alter_snapshot_options_and_more'),
('machine', '0007_add_process_type_and_parent'),
("core", "0025_alter_archiveresult_options_alter_snapshot_options_and_more"),
("machine", "0007_add_process_type_and_parent"),
]
operations = [
migrations.RemoveField(
model_name='archiveresult',
name='num_uses_failed',
model_name="archiveresult",
name="num_uses_failed",
),
migrations.RemoveField(
model_name='archiveresult',
name='num_uses_succeeded',
model_name="archiveresult",
name="num_uses_succeeded",
),
migrations.AddField(
model_name='archiveresult',
name='process',
field=models.OneToOneField(blank=True, help_text='Process execution details for this archive result', null=True, on_delete=django.db.models.deletion.PROTECT, related_name='archiveresult', to='machine.process'),
model_name="archiveresult",
name="process",
field=models.OneToOneField(
blank=True,
help_text="Process execution details for this archive result",
null=True,
on_delete=django.db.models.deletion.PROTECT,
related_name="archiveresult",
to="machine.process",
),
),
]

View File

@@ -25,7 +25,7 @@ def parse_cmd_field(cmd_raw):
return []
# Try to parse as JSON first
if cmd_raw.startswith('['):
if cmd_raw.startswith("["):
try:
parsed = json.loads(cmd_raw)
if isinstance(parsed, list):
@@ -45,7 +45,7 @@ def get_or_create_current_machine(cursor):
# Simple machine detection - get hostname as guid
hostname = socket.gethostname()
guid = f'host_{hostname}' # Simple but stable identifier
guid = f"host_{hostname}" # Simple but stable identifier
# Check if machine exists
cursor.execute("SELECT id FROM machine_machine WHERE guid = ?", [guid])
@@ -64,9 +64,10 @@ def get_or_create_current_machine(cursor):
machine_cols = {row[1] for row in cursor.fetchall()}
# Build INSERT statement based on available columns
if 'config' in machine_cols:
if "config" in machine_cols:
# 0.9.x schema with config column
cursor.execute("""
cursor.execute(
"""
INSERT INTO machine_machine (
id, created_at, modified_at, guid, hostname,
hw_in_docker, hw_in_vm, hw_manufacturer, hw_product, hw_uuid,
@@ -74,10 +75,13 @@ def get_or_create_current_machine(cursor):
stats, config, num_uses_failed, num_uses_succeeded
) VALUES (?, ?, ?, ?, ?, 0, 0, '', '', '',
'', '', '', '', '', '{}', '{}', 0, 0)
""", [machine_id, now, now, guid, hostname])
""",
[machine_id, now, now, guid, hostname],
)
else:
# 0.8.x schema without config column
cursor.execute("""
cursor.execute(
"""
INSERT INTO machine_machine (
id, created_at, modified_at, guid, hostname,
hw_in_docker, hw_in_vm, hw_manufacturer, hw_product, hw_uuid,
@@ -85,7 +89,9 @@ def get_or_create_current_machine(cursor):
stats, num_uses_failed, num_uses_succeeded
) VALUES (?, ?, ?, ?, ?, 0, 0, '', '', '',
'', '', '', '', '', '{}', 0, 0)
""", [machine_id, now, now, guid, hostname])
""",
[machine_id, now, now, guid, hostname],
)
return machine_id
@@ -108,15 +114,18 @@ def get_or_create_binary(cursor, machine_id, name, abspath, version):
# If abspath is just a name without slashes, it's not a full path
# Store it in both fields for simplicity
if '/' not in abspath:
if "/" not in abspath:
# Not a full path - store as-is
pass
# Check if binary exists with same machine, name, abspath, version
cursor.execute("""
cursor.execute(
"""
SELECT id FROM machine_binary
WHERE machine_id = ? AND name = ? AND abspath = ? AND version = ?
""", [machine_id, name, abspath, version])
""",
[machine_id, name, abspath, version],
)
row = cursor.fetchone()
if row:
@@ -134,9 +143,10 @@ def get_or_create_binary(cursor, machine_id, name, abspath, version):
# Use only columns that exist in current schema
# 0.8.x schema: id, created_at, modified_at, machine_id, name, binprovider, abspath, version, sha256, num_uses_failed, num_uses_succeeded
# 0.9.x schema adds: binproviders, overrides, status, retry_at, output_dir
if 'binproviders' in binary_cols:
if "binproviders" in binary_cols:
# 0.9.x schema
cursor.execute("""
cursor.execute(
"""
INSERT INTO machine_binary (
id, created_at, modified_at, machine_id,
name, binproviders, overrides, binprovider, abspath, version, sha256,
@@ -144,16 +154,21 @@ def get_or_create_binary(cursor, machine_id, name, abspath, version):
num_uses_failed, num_uses_succeeded
) VALUES (?, ?, ?, ?, ?, 'env', '{}', 'env', ?, ?, '',
'succeeded', NULL, '', 0, 0)
""", [binary_id, now, now, machine_id, name, abspath, version])
""",
[binary_id, now, now, machine_id, name, abspath, version],
)
else:
# 0.8.x schema (simpler)
cursor.execute("""
cursor.execute(
"""
INSERT INTO machine_binary (
id, created_at, modified_at, machine_id,
name, binprovider, abspath, version, sha256,
num_uses_failed, num_uses_succeeded
) VALUES (?, ?, ?, ?, ?, 'env', ?, ?, '', 0, 0)
""", [binary_id, now, now, machine_id, name, abspath, version])
""",
[binary_id, now, now, machine_id, name, abspath, version],
)
return binary_id
@@ -169,15 +184,15 @@ def map_status(old_status):
(process_status, exit_code) tuple
"""
status_map = {
'queued': ('queued', None),
'started': ('running', None),
'backoff': ('queued', None),
'succeeded': ('exited', 0),
'failed': ('exited', 1),
'skipped': ('exited', None), # Skipped = exited without error
"queued": ("queued", None),
"started": ("running", None),
"backoff": ("queued", None),
"succeeded": ("exited", 0),
"failed": ("exited", 1),
"skipped": ("exited", None), # Skipped = exited without error
}
return status_map.get(old_status, ('queued', None))
return status_map.get(old_status, ("queued", None))
def create_process(cursor, machine_id, pwd, cmd, status, exit_code, started_at, ended_at, binary_id):
@@ -197,9 +212,10 @@ def create_process(cursor, machine_id, pwd, cmd, status, exit_code, started_at,
cmd_json = json.dumps(cmd)
# Set retry_at to now for queued processes, NULL otherwise
retry_at = now if status == 'queued' else None
retry_at = now if status == "queued" else None
cursor.execute("""
cursor.execute(
"""
INSERT INTO machine_process (
id, created_at, modified_at, machine_id, parent_id, process_type,
pwd, cmd, env, timeout,
@@ -213,14 +229,22 @@ def create_process(cursor, machine_id, pwd, cmd, status, exit_code, started_at,
?, ?,
?, NULL, NULL,
?, ?)
""", [
process_id, now, now, machine_id,
pwd, cmd_json,
exit_code,
started_at, ended_at,
binary_id,
status, retry_at
])
""",
[
process_id,
now,
now,
machine_id,
pwd,
cmd_json,
exit_code,
started_at,
ended_at,
binary_id,
status,
retry_at,
],
)
return process_id
@@ -250,16 +274,18 @@ def copy_archiveresult_data_to_process(apps, schema_editor):
cursor.execute("PRAGMA table_info(core_archiveresult)")
cols = {row[1] for row in cursor.fetchall()}
print(f'DEBUG 0027: Columns found: {sorted(cols)}')
print(f'DEBUG 0027: Has cmd={("cmd" in cols)}, pwd={("pwd" in cols)}, cmd_version={("cmd_version" in cols)}, process_id={("process_id" in cols)}')
print(f"DEBUG 0027: Columns found: {sorted(cols)}")
print(
f"DEBUG 0027: Has cmd={('cmd' in cols)}, pwd={('pwd' in cols)}, cmd_version={('cmd_version' in cols)}, process_id={('process_id' in cols)}",
)
if 'cmd' not in cols or 'pwd' not in cols or 'cmd_version' not in cols:
print('✓ Fresh install or fields already removed - skipping data copy')
if "cmd" not in cols or "pwd" not in cols or "cmd_version" not in cols:
print("✓ Fresh install or fields already removed - skipping data copy")
return
# Check if process_id field exists (should exist from 0026)
if 'process_id' not in cols:
print('✗ ERROR: process_id field not found. Migration 0026 must run first.')
if "process_id" not in cols:
print("✗ ERROR: process_id field not found. Migration 0026 must run first.")
return
# Get or create Machine.current()
@@ -278,10 +304,10 @@ def copy_archiveresult_data_to_process(apps, schema_editor):
results = cursor.fetchall()
if not results:
print('✓ No ArchiveResults need Process migration')
print("✓ No ArchiveResults need Process migration")
return
print(f'Migrating {len(results)} ArchiveResults to Process records...')
print(f"Migrating {len(results)} ArchiveResults to Process records...")
migrated_count = 0
skipped_count = 0
@@ -291,42 +317,46 @@ def copy_archiveresult_data_to_process(apps, schema_editor):
ar_id, snapshot_id, plugin, cmd_raw, pwd, cmd_version, status, start_ts, end_ts, created_at = row
if i == 0:
print(f'DEBUG 0027: First row: ar_id={ar_id}, plugin={plugin}, cmd={cmd_raw[:50] if cmd_raw else None}, status={status}')
print(f"DEBUG 0027: First row: ar_id={ar_id}, plugin={plugin}, cmd={cmd_raw[:50] if cmd_raw else None}, status={status}")
try:
# Parse cmd field
cmd_array = parse_cmd_field(cmd_raw)
if i == 0:
print(f'DEBUG 0027: Parsed cmd: {cmd_array}')
print(f"DEBUG 0027: Parsed cmd: {cmd_array}")
# Extract binary info from cmd[0] if available
binary_id = None
if cmd_array and cmd_array[0]:
binary_name = Path(cmd_array[0]).name or plugin # Fallback to plugin name
binary_abspath = cmd_array[0]
binary_version = cmd_version or ''
binary_version = cmd_version or ""
# Get or create Binary record
binary_id = get_or_create_binary(
cursor, machine_id, binary_name, binary_abspath, binary_version
cursor,
machine_id,
binary_name,
binary_abspath,
binary_version,
)
if i == 0:
print(f'DEBUG 0027: Created Binary: id={binary_id}, name={binary_name}')
print(f"DEBUG 0027: Created Binary: id={binary_id}, name={binary_name}")
# Map status
process_status, exit_code = map_status(status)
# Set timestamps
started_at = start_ts or created_at
ended_at = end_ts if process_status == 'exited' else None
ended_at = end_ts if process_status == "exited" else None
# Create Process record
process_id = create_process(
cursor=cursor,
machine_id=machine_id,
pwd=pwd or '',
pwd=pwd or "",
cmd=cmd_array,
status=process_status,
exit_code=exit_code,
@@ -336,34 +366,34 @@ def copy_archiveresult_data_to_process(apps, schema_editor):
)
if i == 0:
print(f'DEBUG 0027: Created Process: id={process_id}')
print(f"DEBUG 0027: Created Process: id={process_id}")
# Link ArchiveResult to Process
cursor.execute(
"UPDATE core_archiveresult SET process_id = ? WHERE id = ?",
[process_id, ar_id]
[process_id, ar_id],
)
migrated_count += 1
if i == 0:
print('DEBUG 0027: Linked ArchiveResult to Process')
print("DEBUG 0027: Linked ArchiveResult to Process")
except Exception as e:
print(f'✗ Error migrating ArchiveResult {ar_id}: {e}')
print(f"✗ Error migrating ArchiveResult {ar_id}: {e}")
import traceback
traceback.print_exc()
error_count += 1
continue
print(f'✓ Migration complete: {migrated_count} migrated, {skipped_count} skipped, {error_count} errors')
print(f"✓ Migration complete: {migrated_count} migrated, {skipped_count} skipped, {error_count} errors")
class Migration(migrations.Migration):
dependencies = [
('core', '0026_add_process_to_archiveresult'),
('machine', '0007_add_process_type_and_parent'),
("core", "0026_add_process_to_archiveresult"),
("machine", "0007_add_process_type_and_parent"),
]
operations = [
@@ -372,18 +402,17 @@ class Migration(migrations.Migration):
copy_archiveresult_data_to_process,
reverse_code=migrations.RunPython.noop,
),
# Now safe to remove old fields (moved from 0025)
migrations.RemoveField(
model_name='archiveresult',
name='cmd',
model_name="archiveresult",
name="cmd",
),
migrations.RemoveField(
model_name='archiveresult',
name='cmd_version',
model_name="archiveresult",
name="cmd_version",
),
migrations.RemoveField(
model_name='archiveresult',
name='pwd',
model_name="archiveresult",
name="pwd",
),
]

Some files were not shown because too many files have changed in this diff Show More