mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-05 23:37:58 +10:00
wip
This commit is contained in:
4
.github/ISSUE_TEMPLATE/2-feature_request.yml
vendored
4
.github/ISSUE_TEMPLATE/2-feature_request.yml
vendored
@@ -57,9 +57,9 @@ body:
|
||||
- type: textarea
|
||||
id: version
|
||||
attributes:
|
||||
label: Share the entire output of the `archivebox version` command for the current verison you are using.
|
||||
label: Share the entire output of the `archivebox version` command for the current version you are using.
|
||||
description: |
|
||||
DO NOT JUST ENTER "the latest verion" OR YOUR ISSUE WILL BE CLOSED.
|
||||
DO NOT JUST ENTER "the latest version" OR YOUR ISSUE WILL BE CLOSED.
|
||||
We need to know what version of ArchiveBox and what feature flags you're currently running with in order to contextualize your feature request.
|
||||
Sometimes we've already fixed the issues in newer BETA versions, sometimes features already exist but may not be available in your specific environment.
|
||||
|
||||
|
||||
2
.github/ISSUE_TEMPLATE/config.yml
vendored
2
.github/ISSUE_TEMPLATE/config.yml
vendored
@@ -8,4 +8,4 @@ contact_links:
|
||||
about: "Join us on our Zulip forum to chat with the developers and other users (it's similar to Discord but self-hosted)."
|
||||
- name: 💁♂️ Hire us for professional support with fast response times
|
||||
url: https://docs.monadical.com/s/archivebox-consulting-services
|
||||
about: "We provide hosting, develoment, and support, including on-prem/cloud w/ SSO & storage, CAPTCHA-solving, proxies, etc."
|
||||
about: "We provide hosting, development, and support, including on-prem/cloud w/ SSO & storage, CAPTCHA-solving, proxies, etc."
|
||||
|
||||
1
.github/workflows/claude.yml
vendored
1
.github/workflows/claude.yml
vendored
@@ -47,4 +47,3 @@ jobs:
|
||||
# See https://github.com/anthropics/claude-code-action/blob/main/docs/usage.md
|
||||
# or https://code.claude.com/docs/en/cli-reference for available options
|
||||
claude_args: '--allowed-tools Bash(gh pr:*)'
|
||||
|
||||
|
||||
30
.github/workflows/lint.yml
vendored
30
.github/workflows/lint.yml
vendored
@@ -4,32 +4,28 @@ on:
|
||||
workflow_dispatch:
|
||||
push:
|
||||
|
||||
env:
|
||||
MAX_LINE_LENGTH: 110
|
||||
|
||||
jobs:
|
||||
lint:
|
||||
runs-on: ubuntu-20.04
|
||||
runs-on: ubuntu-22.04
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 1
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v1
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: 3.9
|
||||
python-version: "3.13"
|
||||
architecture: x64
|
||||
|
||||
- name: Install flake8
|
||||
run: |
|
||||
pip install flake8
|
||||
- name: Install uv
|
||||
uses: astral-sh/setup-uv@v6
|
||||
with:
|
||||
enable-cache: true
|
||||
|
||||
- name: Lint with flake8
|
||||
run: |
|
||||
cd archivebox
|
||||
# one pass for show-stopper syntax errors or undefined names
|
||||
flake8 . --count --show-source --statistics
|
||||
# one pass for small stylistic things
|
||||
flake8 . --count --max-line-length="$MAX_LINE_LENGTH" --statistics
|
||||
- name: Install dependencies with uv
|
||||
run: uv sync --all-extras --all-groups --no-sources --no-cache
|
||||
|
||||
- name: Run prek
|
||||
run: uv run prek run --all-files
|
||||
|
||||
1
.github/workflows/pip.yml
vendored
1
.github/workflows/pip.yml
vendored
@@ -63,4 +63,3 @@ jobs:
|
||||
# && uv run archivebox add 'https://example.com' \
|
||||
# && uv run archivebox status \
|
||||
# || (echo "UV Failed to run archivebox!" && exit 1)
|
||||
|
||||
|
||||
67
.pre-commit-config.yaml
Normal file
67
.pre-commit-config.yaml
Normal file
@@ -0,0 +1,67 @@
|
||||
default_language_version:
|
||||
python: python3.13
|
||||
|
||||
repos:
|
||||
- repo: https://github.com/asottile/yesqa
|
||||
rev: v1.5.0
|
||||
hooks:
|
||||
- id: yesqa
|
||||
|
||||
- repo: https://github.com/codespell-project/codespell
|
||||
rev: v2.4.1
|
||||
hooks:
|
||||
- id: codespell
|
||||
additional_dependencies:
|
||||
- tomli
|
||||
|
||||
- repo: https://github.com/asottile/pyupgrade
|
||||
rev: v3.20.0
|
||||
hooks:
|
||||
- id: pyupgrade
|
||||
args: [--py313-plus]
|
||||
|
||||
- repo: https://github.com/asottile/add-trailing-comma
|
||||
rev: v3.1.0
|
||||
hooks:
|
||||
- id: add-trailing-comma
|
||||
|
||||
- repo: local
|
||||
hooks:
|
||||
- id: ruff-format
|
||||
name: ruff-format
|
||||
entry: uv run --active ruff format
|
||||
language: system
|
||||
types_or: [python, pyi]
|
||||
- id: ruff-check
|
||||
name: ruff-check
|
||||
entry: uv run --active ruff check --fix
|
||||
language: system
|
||||
types_or: [python, pyi]
|
||||
|
||||
- repo: https://github.com/pre-commit/pre-commit-hooks
|
||||
rev: v5.0.0
|
||||
hooks:
|
||||
- id: check-ast
|
||||
- id: check-toml
|
||||
- id: check-yaml
|
||||
exclude: ^\.github/workflows/homebrew\.yml$
|
||||
- id: check-json
|
||||
- id: check-merge-conflict
|
||||
- id: check-symlinks
|
||||
- id: destroyed-symlinks
|
||||
- id: check-case-conflict
|
||||
- id: check-illegal-windows-names
|
||||
- id: check-shebang-scripts-are-executable
|
||||
exclude: ^(archivebox/.*\.py|archivebox/tests/.*\.py|archivebox/personas/export_browser_state\.js)$
|
||||
- id: mixed-line-ending
|
||||
- id: fix-byte-order-marker
|
||||
- id: end-of-file-fixer
|
||||
- id: detect-private-key
|
||||
- id: debug-statements
|
||||
- id: forbid-submodules
|
||||
exclude: ^docs$
|
||||
- id: check-added-large-files
|
||||
args: ["--maxkb=600"]
|
||||
- id: name-tests-test
|
||||
args: ["--pytest-test-first"]
|
||||
exclude: ^archivebox/tests/(data/|fixtures\.py$|migrations_helpers\.py$)
|
||||
@@ -196,9 +196,9 @@ with tempfile.TemporaryDirectory() as tmpdir:
|
||||
|
||||
# Run hook in its output directory
|
||||
result = subprocess.run(
|
||||
['node', str(SCREENSHOT_HOOK), '--url=https://example.com', '--snapshot-id=snap-456'],
|
||||
['node', str(SCREENSHOT_HOOK), '--url=https://example.com'],
|
||||
cwd=str(screenshot_dir),
|
||||
env=get_test_env(),
|
||||
env={**get_test_env(), 'EXTRA_CONTEXT': '{"snapshot_id":"snap-456"}'},
|
||||
capture_output=True,
|
||||
timeout=120
|
||||
)
|
||||
|
||||
@@ -56,7 +56,7 @@ ARG TARGETARCH
|
||||
ARG TARGETVARIANT
|
||||
######### Environment Variables #################################
|
||||
|
||||
# Global built-time and runtime environment constants + default pkg manager config
|
||||
# Global build-time and runtime environment constants + default pkg manager config
|
||||
ENV TZ=UTC \
|
||||
LANGUAGE=en_US:en \
|
||||
LC_ALL=C.UTF-8 \
|
||||
@@ -121,7 +121,7 @@ RUN echo "[*] Setting up $ARCHIVEBOX_USER user uid=${DEFAULT_PUID}..." \
|
||||
&& groupmod -g "$DEFAULT_PGID" "$ARCHIVEBOX_USER" \
|
||||
&& echo -e "\nARCHIVEBOX_USER=$ARCHIVEBOX_USER PUID=$(id -u $ARCHIVEBOX_USER) PGID=$(id -g $ARCHIVEBOX_USER)\n\n" \
|
||||
| tee -a /VERSION.txt
|
||||
# DEFAULT_PUID and DEFAULT_PID are overriden by PUID and PGID in /bin/docker_entrypoint.sh at runtime
|
||||
# DEFAULT_PUID and DEFAULT_PGID are overridden by PUID and PGID in /bin/docker_entrypoint.sh at runtime
|
||||
# https://docs.linuxserver.io/general/understanding-puid-and-pgid
|
||||
|
||||
# Install system apt dependencies (adding backports to access more recent apt updates)
|
||||
@@ -139,7 +139,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T
|
||||
# nano iputils-ping dnsutils htop procps jq yq
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Install apt binary dependencies for exractors
|
||||
# Install apt binary dependencies for extractors
|
||||
# COPY --from=selenium/ffmpeg:latest /usr/local/bin/ffmpeg /usr/local/bin/ffmpeg
|
||||
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT \
|
||||
echo "[+] APT Installing extractor dependencies for $TARGETPLATFORM..." \
|
||||
@@ -373,7 +373,7 @@ RUN openssl rand -hex 16 > /etc/machine-id \
|
||||
&& echo -e "\nTMP_DIR=$TMP_DIR\nLIB_DIR=$LIB_DIR\nMACHINE_ID=$(cat /etc/machine-id)\n" | tee -a /VERSION.txt
|
||||
|
||||
# Print version for nice docker finish summary
|
||||
RUN (echo -e "\n\n[√] Finished Docker build succesfully. Saving build summary in: /VERSION.txt" \
|
||||
RUN (echo -e "\n\n[√] Finished Docker build successfully. Saving build summary in: /VERSION.txt" \
|
||||
&& echo -e "PLATFORM=${TARGETPLATFORM} ARCH=$(uname -m) ($(uname -s) ${TARGETARCH} ${TARGETVARIANT})\n" \
|
||||
&& echo -e "BUILD_END_TIME=$(date +"%Y-%m-%d %H:%M:%S %s")\n\n" \
|
||||
) | tee -a /VERSION.txt
|
||||
|
||||
@@ -153,7 +153,7 @@ ArchiveBox is free for everyone to self-host, but we also provide support, secur
|
||||
- **Individuals:**
|
||||
`saving bookmarks`, `preserving portfolio content`, `legacy / memoirs archival`
|
||||
- **Governments:**
|
||||
`snapshoting public service sites`, `recordkeeping compliance`
|
||||
`snapshotting public service sites`, `recordkeeping compliance`
|
||||
|
||||
> ***[Contact us](https://zulip.archivebox.io/#narrow/stream/167-enterprise/topic/welcome/near/1191102)** if your org wants help using ArchiveBox professionally.*
|
||||
> We offer: setup & support, CAPTCHA/ratelimit unblocking, SSO, audit logging/chain-of-custody, and more
|
||||
@@ -423,7 +423,7 @@ See <a href="#%EF%B8%8F-cli-usage">below</a> for usage examples using the CLI, W
|
||||
<ul>
|
||||
<li><a href="https://zulip.archivebox.io/#narrow/stream/167-enterprise/topic/welcome/near/1191102">
|
||||
<img src="https://img.shields.io/badge/Custom_Development-ArchiveBox.io-%231a1a1a.svg?style=flat" height="22px"/>
|
||||
</a> (<a href="https://zulip.archivebox.io/#narrow/stream/167-enterprise/topic/welcome/near/1191102">get hosting, support, and feature customization directy from us</a>)</li>
|
||||
</a> (<a href="https://zulip.archivebox.io/#narrow/stream/167-enterprise/topic/welcome/near/1191102">get hosting, support, and feature customization directly from us</a>)</li>
|
||||
<li><a href="https://monadical.com">
|
||||
<img src="https://img.shields.io/badge/General_Dev_Consulting-Monadical.com-%231a1a1a.svg?style=flat" height="22px"/>
|
||||
</a> (<a href="https://monadical.com/contact-us.html">generalist consultancy that has ArchiveBox experience</a>)</li>
|
||||
@@ -1183,7 +1183,7 @@ ArchiveBox's stance is that duplication of other people's content is only ethica
|
||||
|
||||
In the U.S., <a href="https://guides.library.oregonstate.edu/copyright/libraries">libraries, researchers, and archivists</a> are allowed to duplicate copyrighted materials under <a href="https://libguides.ala.org/copyright/fairuse">"fair use"</a> for <a href="https://guides.cuny.edu/cunyfairuse/librarians#:~:text=One%20of%20these%20specified%20conditions,may%20be%20liable%20for%20copyright">private study, scholarship, or research</a>. Archive.org's non-profit preservation work is <a href="https://blog.archive.org/2024/03/01/fair-use-in-action-at-the-internet-archive/">covered under fair use</a> in the US, and they properly handle <a href="https://cardozoaelj.com/2015/03/20/use-of-copyright-law-to-take-down-revenge-porn/">unethical content</a>/<a href="https://help.archive.org/help/rights/">DMCA</a>/<a href="https://gdpr.eu/right-to-be-forgotten/#:~:text=An%20individual%20has%20the%20right,that%20individual%20withdraws%20their%20consent.">GDPR</a> removal requests to maintain good standing in the eyes of the law.
|
||||
|
||||
As long as you A. don't try to profit off pirating copyrighted content and B. have processes in place to respond to removal requests, many countries allow you to use sofware like ArchiveBox to ethically and responsibly archive any web content you can view. That being said, ArchiveBox is not liable for how you choose to operate the software. You must research your own local laws and regulations, and get proper legal council if you plan to host a public instance (start by putting your DMCA/GDPR contact info in <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#footer_info"><code>FOOTER_INFO</code></a> and changing your instance's branding using <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#custom_templates_dir"><code>CUSTOM_TEMPLATES_DIR</code></a>).
|
||||
As long as you A. don't try to profit off pirating copyrighted content and B. have processes in place to respond to removal requests, many countries allow you to use software like ArchiveBox to ethically and responsibly archive any web content you can view. That being said, ArchiveBox is not liable for how you choose to operate the software. You must research your own local laws and regulations, and get proper legal counsel if you plan to host a public instance (start by putting your DMCA/GDPR contact info in <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#footer_info"><code>FOOTER_INFO</code></a> and changing your instance's branding using <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#custom_templates_dir"><code>CUSTOM_TEMPLATES_DIR</code></a>).
|
||||
|
||||
</details>
|
||||
<br/>
|
||||
@@ -1212,7 +1212,7 @@ ArchiveBox tries to be a robust, set-and-forget archiving solution suitable for
|
||||
|
||||
Not all content is suitable to be archived on a centralized, publicly accessible platform. Archive.org doesn't offer the ability to save things behind login walls for good reason, as the content may not have been intended for a public audience. ArchiveBox exists to fill that gap by letting everyone save what they have access to on an individual basis, and to encourage decentralized archiving that's less susceptible to censorship or natural disasters.
|
||||
|
||||
By having users store their content locally or within their organizations, we can also save much larger portions of the internet than a centralized service has the disk capcity handle. The eventual goal is to work towards federated archiving where users can share portions of their collections with each other, and with central archives on a case-by-case basis.
|
||||
By having users store their content locally or within their organizations, we can also save much larger portions of the internet than a centralized service has the disk capacity to handle. The eventual goal is to work towards federated archiving where users can share portions of their collections with each other, and with central archives on a case-by-case basis.
|
||||
|
||||
<h3>Comparison With Other Self-Hosted Archiving Options</h3>
|
||||
|
||||
|
||||
@@ -9,7 +9,7 @@
|
||||
# in a universe that seems indifferent to us."
|
||||
# --Norbert Wiener
|
||||
|
||||
__package__ = 'archivebox'
|
||||
__package__ = "archivebox"
|
||||
|
||||
import os
|
||||
import sys
|
||||
@@ -22,11 +22,12 @@ from abx_plugins import get_plugins_dir
|
||||
class _ReconfigurableStream(Protocol):
|
||||
def reconfigure(self, *, line_buffering: bool) -> object: ...
|
||||
|
||||
|
||||
# Force unbuffered output for real-time logs
|
||||
if hasattr(sys.stdout, 'reconfigure'):
|
||||
if hasattr(sys.stdout, "reconfigure"):
|
||||
cast(_ReconfigurableStream, sys.stdout).reconfigure(line_buffering=True)
|
||||
cast(_ReconfigurableStream, sys.stderr).reconfigure(line_buffering=True)
|
||||
os.environ['PYTHONUNBUFFERED'] = '1'
|
||||
os.environ["PYTHONUNBUFFERED"] = "1"
|
||||
|
||||
ASCII_LOGO = """
|
||||
█████╗ ██████╗ ██████╗██╗ ██╗██╗██╗ ██╗███████╗ ██████╗ ██████╗ ██╗ ██╗
|
||||
@@ -44,48 +45,51 @@ PACKAGE_DIR = Path(__file__).resolve().parent
|
||||
# if str(PACKAGE_DIR) not in sys.path:
|
||||
# sys.path.append(str(PACKAGE_DIR))
|
||||
|
||||
os.environ['DJANGO_SETTINGS_MODULE'] = 'archivebox.core.settings'
|
||||
os.environ['TZ'] = 'UTC'
|
||||
os.environ["DJANGO_SETTINGS_MODULE"] = "archivebox.core.settings"
|
||||
os.environ["TZ"] = "UTC"
|
||||
|
||||
# detect ArchiveBox user's UID/GID based on data dir ownership
|
||||
from .config.permissions import drop_privileges # noqa
|
||||
from .config.permissions import drop_privileges # noqa
|
||||
|
||||
drop_privileges()
|
||||
|
||||
from .misc.checks import check_not_root, check_not_inside_source_dir, check_io_encoding # noqa
|
||||
from .misc.checks import check_not_root, check_not_inside_source_dir, check_io_encoding # noqa
|
||||
|
||||
check_not_root()
|
||||
check_not_inside_source_dir()
|
||||
check_io_encoding()
|
||||
|
||||
# Install monkey patches for third-party libraries
|
||||
from .misc.monkey_patches import * # noqa
|
||||
from .misc.monkey_patches import * # noqa
|
||||
|
||||
# Plugin directories
|
||||
BUILTIN_PLUGINS_DIR = Path(get_plugins_dir()).resolve()
|
||||
USER_PLUGINS_DIR = Path(
|
||||
os.environ.get('ARCHIVEBOX_USER_PLUGINS_DIR')
|
||||
or os.environ.get('USER_PLUGINS_DIR')
|
||||
or os.environ.get('DATA_DIR', os.getcwd())
|
||||
) / 'custom_plugins'
|
||||
USER_PLUGINS_DIR = (
|
||||
Path(
|
||||
os.environ.get("ARCHIVEBOX_USER_PLUGINS_DIR") or os.environ.get("USER_PLUGINS_DIR") or os.environ.get("DATA_DIR", os.getcwd()),
|
||||
)
|
||||
/ "custom_plugins"
|
||||
)
|
||||
|
||||
# These are kept for backwards compatibility with existing code
|
||||
# that checks for plugins. The new hook system uses discover_hooks()
|
||||
ALL_PLUGINS = {
|
||||
'builtin': BUILTIN_PLUGINS_DIR,
|
||||
'user': USER_PLUGINS_DIR,
|
||||
"builtin": BUILTIN_PLUGINS_DIR,
|
||||
"user": USER_PLUGINS_DIR,
|
||||
}
|
||||
LOADED_PLUGINS = ALL_PLUGINS
|
||||
|
||||
# Setup basic config, constants, paths, and version
|
||||
from .config.constants import CONSTANTS # noqa
|
||||
from .config.paths import PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR # noqa
|
||||
from .config.version import VERSION # noqa
|
||||
from .config.constants import CONSTANTS # noqa
|
||||
from .config.paths import PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR # noqa
|
||||
from .config.version import VERSION # noqa
|
||||
|
||||
# Set MACHINE_ID env var so hook scripts can use it
|
||||
os.environ.setdefault('MACHINE_ID', CONSTANTS.MACHINE_ID)
|
||||
os.environ.setdefault("MACHINE_ID", CONSTANTS.MACHINE_ID)
|
||||
|
||||
__version__ = VERSION
|
||||
__author__ = 'ArchiveBox'
|
||||
__license__ = 'MIT'
|
||||
__author__ = "ArchiveBox"
|
||||
__license__ = "MIT"
|
||||
|
||||
ASCII_ICON = """
|
||||
██████████████████████████████████████████████████████████████████████████████████████████████████
|
||||
|
||||
@@ -1,8 +1,9 @@
|
||||
#!/usr/bin/env python3
|
||||
"""This is the entrypoint for python -m archivebox ..."""
|
||||
__package__ = 'archivebox'
|
||||
|
||||
import archivebox # noqa # make sure monkey patches are applied before anything else
|
||||
__package__ = "archivebox"
|
||||
|
||||
import archivebox # noqa # make sure monkey patches are applied before anything else
|
||||
import sys
|
||||
|
||||
from .cli import main
|
||||
@@ -15,5 +16,5 @@ ASCII_LOGO_MINI = r"""
|
||||
/_/ \_\_| \___|_| |_|_| \_/ \___|____/ \___/_/\_\
|
||||
"""
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main(args=sys.argv[1:], stdin=sys.stdin)
|
||||
|
||||
@@ -1 +1 @@
|
||||
__package__ = 'archivebox.api'
|
||||
__package__ = "archivebox.api"
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
__package__ = 'archivebox.api'
|
||||
__package__ = "archivebox.api"
|
||||
|
||||
from django.contrib import admin
|
||||
from django.http import HttpRequest
|
||||
@@ -11,57 +11,81 @@ from archivebox.api.models import APIToken
|
||||
|
||||
|
||||
class APITokenAdmin(BaseModelAdmin):
|
||||
list_display = ('created_at', 'id', 'created_by', 'token_redacted', 'expires')
|
||||
sort_fields = ('id', 'created_at', 'created_by', 'expires')
|
||||
readonly_fields = ('created_at', 'modified_at')
|
||||
search_fields = ('id', 'created_by__username', 'token')
|
||||
list_display = ("created_at", "id", "created_by", "token_redacted", "expires")
|
||||
sort_fields = ("id", "created_at", "created_by", "expires")
|
||||
readonly_fields = ("created_at", "modified_at")
|
||||
search_fields = ("id", "created_by__username", "token")
|
||||
|
||||
fieldsets = (
|
||||
('Token', {
|
||||
'fields': ('token', 'expires'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Owner', {
|
||||
'fields': ('created_by',),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Timestamps', {
|
||||
'fields': ('created_at', 'modified_at'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
(
|
||||
"Token",
|
||||
{
|
||||
"fields": ("token", "expires"),
|
||||
"classes": ("card",),
|
||||
},
|
||||
),
|
||||
(
|
||||
"Owner",
|
||||
{
|
||||
"fields": ("created_by",),
|
||||
"classes": ("card",),
|
||||
},
|
||||
),
|
||||
(
|
||||
"Timestamps",
|
||||
{
|
||||
"fields": ("created_at", "modified_at"),
|
||||
"classes": ("card",),
|
||||
},
|
||||
),
|
||||
)
|
||||
|
||||
list_filter = ('created_by',)
|
||||
ordering = ['-created_at']
|
||||
list_filter = ("created_by",)
|
||||
ordering = ["-created_at"]
|
||||
list_per_page = 100
|
||||
|
||||
|
||||
class CustomWebhookAdmin(WebhookAdmin, BaseModelAdmin):
|
||||
list_display = ('created_at', 'created_by', 'id', *WebhookAdmin.list_display)
|
||||
sort_fields = ('created_at', 'created_by', 'id', 'referenced_model', 'endpoint', 'last_success', 'last_error')
|
||||
readonly_fields = ('created_at', 'modified_at', *WebhookAdmin.readonly_fields)
|
||||
list_display = ("created_at", "created_by", "id", *WebhookAdmin.list_display)
|
||||
sort_fields = ("created_at", "created_by", "id", "referenced_model", "endpoint", "last_success", "last_error")
|
||||
readonly_fields = ("created_at", "modified_at", *WebhookAdmin.readonly_fields)
|
||||
|
||||
fieldsets = (
|
||||
('Webhook', {
|
||||
'fields': ('name', 'signal', 'referenced_model', 'endpoint'),
|
||||
'classes': ('card', 'wide'),
|
||||
}),
|
||||
('Authentication', {
|
||||
'fields': ('auth_token',),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Status', {
|
||||
'fields': ('enabled', 'last_success', 'last_error'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Owner', {
|
||||
'fields': ('created_by',),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Timestamps', {
|
||||
'fields': ('created_at', 'modified_at'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
(
|
||||
"Webhook",
|
||||
{
|
||||
"fields": ("name", "signal", "referenced_model", "endpoint"),
|
||||
"classes": ("card", "wide"),
|
||||
},
|
||||
),
|
||||
(
|
||||
"Authentication",
|
||||
{
|
||||
"fields": ("auth_token",),
|
||||
"classes": ("card",),
|
||||
},
|
||||
),
|
||||
(
|
||||
"Status",
|
||||
{
|
||||
"fields": ("enabled", "last_success", "last_error"),
|
||||
"classes": ("card",),
|
||||
},
|
||||
),
|
||||
(
|
||||
"Owner",
|
||||
{
|
||||
"fields": ("created_by",),
|
||||
"classes": ("card",),
|
||||
},
|
||||
),
|
||||
(
|
||||
"Timestamps",
|
||||
{
|
||||
"fields": ("created_at", "modified_at"),
|
||||
"classes": ("card",),
|
||||
},
|
||||
),
|
||||
)
|
||||
|
||||
def lookup_allowed(self, lookup: str, value: str, request: HttpRequest | None = None) -> bool:
|
||||
|
||||
@@ -1,13 +1,14 @@
|
||||
__package__ = 'archivebox.api'
|
||||
__package__ = "archivebox.api"
|
||||
|
||||
from django.apps import AppConfig
|
||||
|
||||
|
||||
class APIConfig(AppConfig):
|
||||
name = 'archivebox.api'
|
||||
label = 'api'
|
||||
name = "archivebox.api"
|
||||
label = "api"
|
||||
|
||||
|
||||
def register_admin(admin_site):
|
||||
from archivebox.api.admin import register_admin
|
||||
|
||||
register_admin(admin_site)
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
__package__ = 'archivebox.api'
|
||||
__package__ = "archivebox.api"
|
||||
|
||||
from typing import Optional
|
||||
from datetime import timedelta
|
||||
|
||||
from django.utils import timezone
|
||||
@@ -14,7 +13,7 @@ from ninja.errors import HttpError
|
||||
|
||||
def get_or_create_api_token(user: User | None):
|
||||
from archivebox.api.models import APIToken
|
||||
|
||||
|
||||
if user and user.is_superuser:
|
||||
api_tokens = APIToken.objects.filter(created_by_id=user.pk, expires__gt=timezone.now())
|
||||
if api_tokens.exists():
|
||||
@@ -34,18 +33,18 @@ def get_or_create_api_token(user: User | None):
|
||||
|
||||
def auth_using_token(token: str | None, request: HttpRequest | None = None) -> User | None:
|
||||
"""Given an API token string, check if a corresponding non-expired APIToken exists, and return its user"""
|
||||
from archivebox.api.models import APIToken # lazy import model to avoid loading it at urls.py import time
|
||||
|
||||
from archivebox.api.models import APIToken # lazy import model to avoid loading it at urls.py import time
|
||||
|
||||
user: User | None = None
|
||||
|
||||
submitted_empty_form = str(token).strip() in ('string', '', 'None', 'null')
|
||||
submitted_empty_form = str(token).strip() in ("string", "", "None", "null")
|
||||
if not submitted_empty_form:
|
||||
try:
|
||||
api_token = APIToken.objects.get(token=token)
|
||||
if api_token.is_valid() and isinstance(api_token.created_by, User):
|
||||
user = api_token.created_by
|
||||
if request is not None:
|
||||
setattr(request, '_api_token', api_token)
|
||||
setattr(request, "_api_token", api_token)
|
||||
except APIToken.DoesNotExist:
|
||||
pass
|
||||
|
||||
@@ -55,8 +54,8 @@ def auth_using_token(token: str | None, request: HttpRequest | None = None) -> U
|
||||
def auth_using_password(username: str | None, password: str | None, request: HttpRequest | None = None) -> User | None:
|
||||
"""Given a username and password, check if they are valid and return the corresponding user"""
|
||||
user: User | None = None
|
||||
|
||||
submitted_empty_form = (username, password) in (('string', 'string'), ('', ''), (None, None))
|
||||
|
||||
submitted_empty_form = (username, password) in (("string", "string"), ("", ""), (None, None))
|
||||
if not submitted_empty_form:
|
||||
authenticated_user = authenticate(
|
||||
username=username,
|
||||
@@ -73,34 +72,40 @@ def auth_using_password(username: str | None, password: str | None, request: Htt
|
||||
def _require_superuser(user: User | None, request: HttpRequest, auth_method: str) -> User | None:
|
||||
if user and user.pk:
|
||||
request.user = user
|
||||
setattr(request, '_api_auth_method', auth_method)
|
||||
setattr(request, "_api_auth_method", auth_method)
|
||||
if not user.is_superuser:
|
||||
raise HttpError(403, 'Valid credentials but User does not have permission (make sure user.is_superuser=True)')
|
||||
raise HttpError(403, "Valid credentials but User does not have permission (make sure user.is_superuser=True)")
|
||||
return user
|
||||
|
||||
|
||||
### Django-Ninja-Provided Auth Methods
|
||||
|
||||
|
||||
class HeaderTokenAuth(APIKeyHeader):
|
||||
"""Allow authenticating by passing X-API-Key=xyz as a request header"""
|
||||
|
||||
param_name = "X-ArchiveBox-API-Key"
|
||||
|
||||
def authenticate(self, request: HttpRequest, key: Optional[str]) -> User | None:
|
||||
def authenticate(self, request: HttpRequest, key: str | None) -> User | None:
|
||||
return _require_superuser(auth_using_token(token=key, request=request), request, self.__class__.__name__)
|
||||
|
||||
|
||||
class BearerTokenAuth(HttpBearer):
|
||||
"""Allow authenticating by passing Bearer=xyz as a request header"""
|
||||
|
||||
def authenticate(self, request: HttpRequest, token: str) -> User | None:
|
||||
return _require_superuser(auth_using_token(token=token, request=request), request, self.__class__.__name__)
|
||||
|
||||
|
||||
class QueryParamTokenAuth(APIKeyQuery):
|
||||
"""Allow authenticating by passing api_key=xyz as a GET/POST query parameter"""
|
||||
|
||||
param_name = "api_key"
|
||||
|
||||
def authenticate(self, request: HttpRequest, key: Optional[str]) -> User | None:
|
||||
def authenticate(self, request: HttpRequest, key: str | None) -> User | None:
|
||||
return _require_superuser(auth_using_token(token=key, request=request), request, self.__class__.__name__)
|
||||
|
||||
|
||||
class UsernameAndPasswordAuth(HttpBasicAuth):
|
||||
"""Allow authenticating by passing username & password via HTTP Basic Authentication (not recommended)"""
|
||||
|
||||
@@ -111,25 +116,28 @@ class UsernameAndPasswordAuth(HttpBasicAuth):
|
||||
self.__class__.__name__,
|
||||
)
|
||||
|
||||
|
||||
class DjangoSessionAuth:
|
||||
"""Allow authenticating with existing Django session cookies (same-origin only)."""
|
||||
|
||||
def __call__(self, request: HttpRequest) -> User | None:
|
||||
return self.authenticate(request)
|
||||
|
||||
def authenticate(self, request: HttpRequest, **kwargs) -> User | None:
|
||||
user = getattr(request, 'user', None)
|
||||
user = getattr(request, "user", None)
|
||||
if isinstance(user, User) and user.is_authenticated:
|
||||
setattr(request, '_api_auth_method', self.__class__.__name__)
|
||||
setattr(request, "_api_auth_method", self.__class__.__name__)
|
||||
if not user.is_superuser:
|
||||
raise HttpError(403, 'Valid session but User does not have permission (make sure user.is_superuser=True)')
|
||||
raise HttpError(403, "Valid session but User does not have permission (make sure user.is_superuser=True)")
|
||||
return user
|
||||
return None
|
||||
|
||||
|
||||
### Enabled Auth Methods
|
||||
|
||||
API_AUTH_METHODS = [
|
||||
HeaderTokenAuth(),
|
||||
BearerTokenAuth(),
|
||||
QueryParamTokenAuth(),
|
||||
QueryParamTokenAuth(),
|
||||
# django_auth_superuser, # django admin cookie auth, not secure to use with csrf=False
|
||||
]
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
__package__ = 'archivebox.api'
|
||||
__package__ = "archivebox.api"
|
||||
|
||||
from django.http import HttpResponse
|
||||
|
||||
@@ -10,8 +10,8 @@ class ApiCorsMiddleware:
|
||||
self.get_response = get_response
|
||||
|
||||
def __call__(self, request):
|
||||
if request.path.startswith('/api/'):
|
||||
if request.method == 'OPTIONS' and request.META.get('HTTP_ACCESS_CONTROL_REQUEST_METHOD'):
|
||||
if request.path.startswith("/api/"):
|
||||
if request.method == "OPTIONS" and request.META.get("HTTP_ACCESS_CONTROL_REQUEST_METHOD"):
|
||||
response = HttpResponse(status=204)
|
||||
return self._add_cors_headers(request, response)
|
||||
|
||||
@@ -21,14 +21,12 @@ class ApiCorsMiddleware:
|
||||
return self.get_response(request)
|
||||
|
||||
def _add_cors_headers(self, request, response):
|
||||
origin = request.META.get('HTTP_ORIGIN')
|
||||
origin = request.META.get("HTTP_ORIGIN")
|
||||
if not origin:
|
||||
return response
|
||||
|
||||
response['Access-Control-Allow-Origin'] = '*'
|
||||
response['Access-Control-Allow-Methods'] = 'GET, POST, PUT, PATCH, DELETE, OPTIONS'
|
||||
response['Access-Control-Allow-Headers'] = (
|
||||
'Authorization, X-ArchiveBox-API-Key, Content-Type, X-CSRFToken'
|
||||
)
|
||||
response['Access-Control-Max-Age'] = '600'
|
||||
response["Access-Control-Allow-Origin"] = "*"
|
||||
response["Access-Control-Allow-Methods"] = "GET, POST, PUT, PATCH, DELETE, OPTIONS"
|
||||
response["Access-Control-Allow-Headers"] = "Authorization, X-ArchiveBox-API-Key, Content-Type, X-CSRFToken"
|
||||
response["Access-Control-Max-Age"] = "600"
|
||||
return response
|
||||
|
||||
@@ -13,11 +13,10 @@ import signal_webhooks.utils
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
initial = True
|
||||
|
||||
dependencies = [
|
||||
('auth', '0012_alter_user_first_name_max_length'),
|
||||
("auth", "0012_alter_user_first_name_max_length"),
|
||||
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
|
||||
]
|
||||
|
||||
@@ -75,55 +74,165 @@ class Migration(migrations.Migration):
|
||||
reverse_sql="""
|
||||
DROP TABLE IF EXISTS api_outboundwebhook;
|
||||
DROP TABLE IF EXISTS api_apitoken;
|
||||
"""
|
||||
""",
|
||||
),
|
||||
],
|
||||
state_operations=[
|
||||
migrations.CreateModel(
|
||||
name='APIToken',
|
||||
name="APIToken",
|
||||
fields=[
|
||||
('id', models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)),
|
||||
('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
|
||||
('modified_at', models.DateTimeField(auto_now=True)),
|
||||
('token', models.CharField(default=archivebox.api.models.generate_secret_token, max_length=32, unique=True)),
|
||||
('expires', models.DateTimeField(blank=True, null=True)),
|
||||
('created_by', models.ForeignKey(default=get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)),
|
||||
("id", models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)),
|
||||
("created_at", models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
|
||||
("modified_at", models.DateTimeField(auto_now=True)),
|
||||
("token", models.CharField(default=archivebox.api.models.generate_secret_token, max_length=32, unique=True)),
|
||||
("expires", models.DateTimeField(blank=True, null=True)),
|
||||
(
|
||||
"created_by",
|
||||
models.ForeignKey(
|
||||
default=get_or_create_system_user_pk,
|
||||
on_delete=django.db.models.deletion.CASCADE,
|
||||
to=settings.AUTH_USER_MODEL,
|
||||
),
|
||||
),
|
||||
],
|
||||
options={
|
||||
'verbose_name': 'API Key',
|
||||
'verbose_name_plural': 'API Keys',
|
||||
'app_label': 'api',
|
||||
"verbose_name": "API Key",
|
||||
"verbose_name_plural": "API Keys",
|
||||
"app_label": "api",
|
||||
},
|
||||
),
|
||||
migrations.CreateModel(
|
||||
name='OutboundWebhook',
|
||||
name="OutboundWebhook",
|
||||
fields=[
|
||||
('id', models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)),
|
||||
('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
|
||||
('modified_at', models.DateTimeField(auto_now=True)),
|
||||
('name', models.CharField(db_index=True, help_text='Webhook name.', max_length=255, unique=True, verbose_name='name')),
|
||||
('signal', models.CharField(choices=[('CREATE', 'Create'), ('UPDATE', 'Update'), ('DELETE', 'Delete'), ('M2M', 'M2M changed'), ('CREATE_OR_UPDATE', 'Create or Update'), ('CREATE_OR_DELETE', 'Create or Delete'), ('CREATE_OR_M2M', 'Create or M2M changed'), ('UPDATE_OR_DELETE', 'Update or Delete'), ('UPDATE_OR_M2M', 'Update or M2M changed'), ('DELETE_OR_M2M', 'Delete or M2M changed'), ('CREATE_UPDATE_OR_DELETE', 'Create, Update or Delete'), ('CREATE_UPDATE_OR_M2M', 'Create, Update or M2M changed'), ('CREATE_DELETE_OR_M2M', 'Create, Delete or M2M changed'), ('UPDATE_DELETE_OR_M2M', 'Update, Delete or M2M changed'), ('CREATE_UPDATE_DELETE_OR_M2M', 'Create, Update or Delete, or M2M changed')], help_text='Signal the webhook fires to.', max_length=255, verbose_name='signal')),
|
||||
('ref', models.CharField(db_index=True, help_text='Dot import notation to the model the webhook is for.', max_length=1023, validators=[signal_webhooks.utils.model_from_reference], verbose_name='referenced model')),
|
||||
('endpoint', models.URLField(help_text='Target endpoint for this webhook.', max_length=2047, verbose_name='endpoint')),
|
||||
('headers', models.JSONField(blank=True, default=dict, help_text='Headers to send with the webhook request.', validators=[signal_webhooks.utils.is_dict], verbose_name='headers')),
|
||||
('auth_token', signal_webhooks.fields.TokenField(blank=True, default='', help_text='Authentication token to use in an Authorization header.', max_length=8000, validators=[signal_webhooks.utils.decode_cipher_key], verbose_name='authentication token')),
|
||||
('enabled', models.BooleanField(default=True, help_text='Is this webhook enabled?', verbose_name='enabled')),
|
||||
('keep_last_response', models.BooleanField(default=False, help_text='Should the webhook keep a log of the latest response it got?', verbose_name='keep last response')),
|
||||
('created', models.DateTimeField(auto_now_add=True, help_text='When the webhook was created.', verbose_name='created')),
|
||||
('updated', models.DateTimeField(auto_now=True, help_text='When the webhook was last updated.', verbose_name='updated')),
|
||||
('last_response', models.CharField(blank=True, default='', help_text='Latest response to this webhook.', max_length=8000, verbose_name='last response')),
|
||||
('last_success', models.DateTimeField(default=None, help_text='When the webhook last succeeded.', null=True, verbose_name='last success')),
|
||||
('last_failure', models.DateTimeField(default=None, help_text='When the webhook last failed.', null=True, verbose_name='last failure')),
|
||||
('created_by', models.ForeignKey(default=get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)),
|
||||
("id", models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)),
|
||||
("created_at", models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
|
||||
("modified_at", models.DateTimeField(auto_now=True)),
|
||||
(
|
||||
"name",
|
||||
models.CharField(db_index=True, help_text="Webhook name.", max_length=255, unique=True, verbose_name="name"),
|
||||
),
|
||||
(
|
||||
"signal",
|
||||
models.CharField(
|
||||
choices=[
|
||||
("CREATE", "Create"),
|
||||
("UPDATE", "Update"),
|
||||
("DELETE", "Delete"),
|
||||
("M2M", "M2M changed"),
|
||||
("CREATE_OR_UPDATE", "Create or Update"),
|
||||
("CREATE_OR_DELETE", "Create or Delete"),
|
||||
("CREATE_OR_M2M", "Create or M2M changed"),
|
||||
("UPDATE_OR_DELETE", "Update or Delete"),
|
||||
("UPDATE_OR_M2M", "Update or M2M changed"),
|
||||
("DELETE_OR_M2M", "Delete or M2M changed"),
|
||||
("CREATE_UPDATE_OR_DELETE", "Create, Update or Delete"),
|
||||
("CREATE_UPDATE_OR_M2M", "Create, Update or M2M changed"),
|
||||
("CREATE_DELETE_OR_M2M", "Create, Delete or M2M changed"),
|
||||
("UPDATE_DELETE_OR_M2M", "Update, Delete or M2M changed"),
|
||||
("CREATE_UPDATE_DELETE_OR_M2M", "Create, Update or Delete, or M2M changed"),
|
||||
],
|
||||
help_text="Signal the webhook fires to.",
|
||||
max_length=255,
|
||||
verbose_name="signal",
|
||||
),
|
||||
),
|
||||
(
|
||||
"ref",
|
||||
models.CharField(
|
||||
db_index=True,
|
||||
help_text="Dot import notation to the model the webhook is for.",
|
||||
max_length=1023,
|
||||
validators=[signal_webhooks.utils.model_from_reference],
|
||||
verbose_name="referenced model",
|
||||
),
|
||||
),
|
||||
(
|
||||
"endpoint",
|
||||
models.URLField(help_text="Target endpoint for this webhook.", max_length=2047, verbose_name="endpoint"),
|
||||
),
|
||||
(
|
||||
"headers",
|
||||
models.JSONField(
|
||||
blank=True,
|
||||
default=dict,
|
||||
help_text="Headers to send with the webhook request.",
|
||||
validators=[signal_webhooks.utils.is_dict],
|
||||
verbose_name="headers",
|
||||
),
|
||||
),
|
||||
(
|
||||
"auth_token",
|
||||
signal_webhooks.fields.TokenField(
|
||||
blank=True,
|
||||
default="",
|
||||
help_text="Authentication token to use in an Authorization header.",
|
||||
max_length=8000,
|
||||
validators=[signal_webhooks.utils.decode_cipher_key],
|
||||
verbose_name="authentication token",
|
||||
),
|
||||
),
|
||||
("enabled", models.BooleanField(default=True, help_text="Is this webhook enabled?", verbose_name="enabled")),
|
||||
(
|
||||
"keep_last_response",
|
||||
models.BooleanField(
|
||||
default=False,
|
||||
help_text="Should the webhook keep a log of the latest response it got?",
|
||||
verbose_name="keep last response",
|
||||
),
|
||||
),
|
||||
(
|
||||
"created",
|
||||
models.DateTimeField(auto_now_add=True, help_text="When the webhook was created.", verbose_name="created"),
|
||||
),
|
||||
(
|
||||
"updated",
|
||||
models.DateTimeField(auto_now=True, help_text="When the webhook was last updated.", verbose_name="updated"),
|
||||
),
|
||||
(
|
||||
"last_response",
|
||||
models.CharField(
|
||||
blank=True,
|
||||
default="",
|
||||
help_text="Latest response to this webhook.",
|
||||
max_length=8000,
|
||||
verbose_name="last response",
|
||||
),
|
||||
),
|
||||
(
|
||||
"last_success",
|
||||
models.DateTimeField(
|
||||
default=None,
|
||||
help_text="When the webhook last succeeded.",
|
||||
null=True,
|
||||
verbose_name="last success",
|
||||
),
|
||||
),
|
||||
(
|
||||
"last_failure",
|
||||
models.DateTimeField(
|
||||
default=None,
|
||||
help_text="When the webhook last failed.",
|
||||
null=True,
|
||||
verbose_name="last failure",
|
||||
),
|
||||
),
|
||||
(
|
||||
"created_by",
|
||||
models.ForeignKey(
|
||||
default=get_or_create_system_user_pk,
|
||||
on_delete=django.db.models.deletion.CASCADE,
|
||||
to=settings.AUTH_USER_MODEL,
|
||||
),
|
||||
),
|
||||
],
|
||||
options={
|
||||
'verbose_name': 'API Outbound Webhook',
|
||||
'app_label': 'api',
|
||||
"verbose_name": "API Outbound Webhook",
|
||||
"app_label": "api",
|
||||
},
|
||||
),
|
||||
migrations.AddConstraint(
|
||||
model_name='outboundwebhook',
|
||||
constraint=models.UniqueConstraint(fields=['ref', 'endpoint'], name='prevent_duplicate_hooks_api_outboundwebhook'),
|
||||
model_name="outboundwebhook",
|
||||
constraint=models.UniqueConstraint(fields=["ref", "endpoint"], name="prevent_duplicate_hooks_api_outboundwebhook"),
|
||||
),
|
||||
],
|
||||
),
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
__package__ = 'archivebox.api'
|
||||
__package__ = "archivebox.api"
|
||||
|
||||
import secrets
|
||||
from archivebox.uuid_compat import uuid7
|
||||
@@ -25,7 +25,7 @@ class APIToken(models.Model):
|
||||
expires = models.DateTimeField(null=True, blank=True)
|
||||
|
||||
class Meta(TypedModelMeta):
|
||||
app_label = 'api'
|
||||
app_label = "api"
|
||||
verbose_name = "API Key"
|
||||
verbose_name_plural = "API Keys"
|
||||
|
||||
@@ -34,7 +34,7 @@ class APIToken(models.Model):
|
||||
|
||||
@property
|
||||
def token_redacted(self):
|
||||
return f'************{self.token[-4:]}'
|
||||
return f"************{self.token[-4:]}"
|
||||
|
||||
def is_valid(self, for_date=None):
|
||||
return not self.expires or self.expires >= (for_date or timezone.now())
|
||||
@@ -47,8 +47,8 @@ class OutboundWebhook(WebhookBase):
|
||||
modified_at = models.DateTimeField(auto_now=True)
|
||||
|
||||
class Meta(WebhookBase.Meta):
|
||||
app_label = 'api'
|
||||
verbose_name = 'API Outbound Webhook'
|
||||
app_label = "api"
|
||||
verbose_name = "API Outbound Webhook"
|
||||
|
||||
def __str__(self) -> str:
|
||||
return f'[{self.id}] {self.ref} -> {self.endpoint}'
|
||||
return f"[{self.id}] {self.ref} -> {self.endpoint}"
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
__package__ = 'archivebox.api'
|
||||
__package__ = "archivebox.api"
|
||||
|
||||
from django.urls import path
|
||||
from django.views.generic.base import RedirectView
|
||||
@@ -6,12 +6,10 @@ from django.views.generic.base import RedirectView
|
||||
from .v1_api import urls as v1_api_urls
|
||||
|
||||
urlpatterns = [
|
||||
path("", RedirectView.as_view(url='/api/v1/docs')),
|
||||
|
||||
path("v1/", RedirectView.as_view(url='/api/v1/docs')),
|
||||
path("v1/", v1_api_urls),
|
||||
path("v1", RedirectView.as_view(url='/api/v1/docs')),
|
||||
|
||||
path("", RedirectView.as_view(url="/api/v1/docs")),
|
||||
path("v1/", RedirectView.as_view(url="/api/v1/docs")),
|
||||
path("v1/", v1_api_urls),
|
||||
path("v1", RedirectView.as_view(url="/api/v1/docs")),
|
||||
# ... v2 can be added here ...
|
||||
# path("v2/", v2_api_urls),
|
||||
# path("v2", RedirectView.as_view(url='/api/v2/docs')),
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
__package__ = 'archivebox.api'
|
||||
__package__ = "archivebox.api"
|
||||
|
||||
|
||||
from io import StringIO
|
||||
@@ -20,9 +20,9 @@ from archivebox.api.auth import API_AUTH_METHODS
|
||||
from archivebox.api.models import APIToken
|
||||
|
||||
|
||||
COMMIT_HASH = get_COMMIT_HASH() or 'unknown'
|
||||
COMMIT_HASH = get_COMMIT_HASH() or "unknown"
|
||||
|
||||
html_description=f'''
|
||||
html_description = f"""
|
||||
<h3>Welcome to your ArchiveBox server's REST API <code>[v1 ALPHA]</code> homepage!</h3>
|
||||
<br/>
|
||||
<i><b>WARNING: This API is still in an early development stage and may change!</b></i>
|
||||
@@ -35,47 +35,47 @@ html_description=f'''
|
||||
<li>📜 See the API source code: <a href="https://github.com/ArchiveBox/ArchiveBox/blob/dev/archivebox/api"><code>archivebox/api/</code></a></li>
|
||||
</ul>
|
||||
<small>Served by ArchiveBox v{VERSION} (<a href="https://github.com/ArchiveBox/ArchiveBox/commit/{COMMIT_HASH}"><code>{COMMIT_HASH[:8]}</code></a>), API powered by <a href="https://django-ninja.dev/"><code>django-ninja</code></a>.</small>
|
||||
'''
|
||||
"""
|
||||
|
||||
|
||||
def register_urls(api: NinjaAPI) -> NinjaAPI:
|
||||
api.add_router('/auth/', 'archivebox.api.v1_auth.router')
|
||||
api.add_router('/core/', 'archivebox.api.v1_core.router')
|
||||
api.add_router('/crawls/', 'archivebox.api.v1_crawls.router')
|
||||
api.add_router('/cli/', 'archivebox.api.v1_cli.router')
|
||||
api.add_router('/machine/', 'archivebox.api.v1_machine.router')
|
||||
api.add_router("/auth/", "archivebox.api.v1_auth.router")
|
||||
api.add_router("/core/", "archivebox.api.v1_core.router")
|
||||
api.add_router("/crawls/", "archivebox.api.v1_crawls.router")
|
||||
api.add_router("/cli/", "archivebox.api.v1_cli.router")
|
||||
api.add_router("/machine/", "archivebox.api.v1_machine.router")
|
||||
return api
|
||||
|
||||
|
||||
class NinjaAPIWithIOCapture(NinjaAPI):
|
||||
class NinjaAPIWithIOCapture(NinjaAPI):
|
||||
def create_temporal_response(self, request: HttpRequest) -> HttpResponse:
|
||||
stdout, stderr = StringIO(), StringIO()
|
||||
|
||||
with redirect_stderr(stderr):
|
||||
with redirect_stdout(stdout):
|
||||
setattr(request, 'stdout', stdout)
|
||||
setattr(request, 'stderr', stderr)
|
||||
setattr(request, "stdout", stdout)
|
||||
setattr(request, "stderr", stderr)
|
||||
|
||||
response = super().create_temporal_response(request)
|
||||
|
||||
# Diable caching of API responses entirely
|
||||
response['Cache-Control'] = 'no-store'
|
||||
# Disable caching of API responses entirely
|
||||
response["Cache-Control"] = "no-store"
|
||||
|
||||
# Add debug stdout and stderr headers to response
|
||||
response['X-ArchiveBox-Stdout'] = stdout.getvalue().replace('\n', '\\n')[:200]
|
||||
response['X-ArchiveBox-Stderr'] = stderr.getvalue().replace('\n', '\\n')[:200]
|
||||
response["X-ArchiveBox-Stdout"] = stdout.getvalue().replace("\n", "\\n")[:200]
|
||||
response["X-ArchiveBox-Stderr"] = stderr.getvalue().replace("\n", "\\n")[:200]
|
||||
# response['X-ArchiveBox-View'] = self.get_openapi_operation_id(request) or 'Unknown'
|
||||
|
||||
# Add Auth Headers to response
|
||||
api_token_attr = getattr(request, '_api_token', None)
|
||||
api_token_attr = getattr(request, "_api_token", None)
|
||||
api_token = api_token_attr if isinstance(api_token_attr, APIToken) else None
|
||||
token_expiry = api_token.expires.isoformat() if api_token and api_token.expires else 'Never'
|
||||
token_expiry = api_token.expires.isoformat() if api_token and api_token.expires else "Never"
|
||||
|
||||
response['X-ArchiveBox-Auth-Method'] = str(getattr(request, '_api_auth_method', 'None'))
|
||||
response['X-ArchiveBox-Auth-Expires'] = token_expiry
|
||||
response['X-ArchiveBox-Auth-Token-Id'] = str(api_token.id) if api_token else 'None'
|
||||
response['X-ArchiveBox-Auth-User-Id'] = str(request.user.pk) if getattr(request.user, 'pk', None) else 'None'
|
||||
response['X-ArchiveBox-Auth-User-Username'] = request.user.username if isinstance(request.user, User) else 'None'
|
||||
response["X-ArchiveBox-Auth-Method"] = str(getattr(request, "_api_auth_method", "None"))
|
||||
response["X-ArchiveBox-Auth-Expires"] = token_expiry
|
||||
response["X-ArchiveBox-Auth-Token-Id"] = str(api_token.id) if api_token else "None"
|
||||
response["X-ArchiveBox-Auth-User-Id"] = str(request.user.pk) if getattr(request.user, "pk", None) else "None"
|
||||
response["X-ArchiveBox-Auth-User-Username"] = request.user.username if isinstance(request.user, User) else "None"
|
||||
|
||||
# import ipdb; ipdb.set_trace()
|
||||
# print('RESPONDING NOW', response)
|
||||
@@ -84,7 +84,7 @@ class NinjaAPIWithIOCapture(NinjaAPI):
|
||||
|
||||
|
||||
api = NinjaAPIWithIOCapture(
|
||||
title='ArchiveBox API',
|
||||
title="ArchiveBox API",
|
||||
description=html_description,
|
||||
version=VERSION,
|
||||
auth=API_AUTH_METHODS,
|
||||
@@ -103,15 +103,15 @@ def generic_exception_handler(request, err):
|
||||
if isinstance(err, (ObjectDoesNotExist, EmptyResultSet, PermissionDenied)):
|
||||
status = 404
|
||||
|
||||
print(''.join(format_exception(err)))
|
||||
print("".join(format_exception(err)))
|
||||
|
||||
return api.create_response(
|
||||
request,
|
||||
{
|
||||
"succeeded": False,
|
||||
"message": f'{err.__class__.__name__}: {err}',
|
||||
"message": f"{err.__class__.__name__}: {err}",
|
||||
"errors": [
|
||||
''.join(format_exception(err)),
|
||||
"".join(format_exception(err)),
|
||||
# or send simpler parent-only traceback:
|
||||
# *([str(err.__context__)] if getattr(err, '__context__', None) else []),
|
||||
],
|
||||
@@ -120,7 +120,6 @@ def generic_exception_handler(request, err):
|
||||
)
|
||||
|
||||
|
||||
|
||||
# import orjson
|
||||
# from ninja.renderers import BaseRenderer
|
||||
# class ORJSONRenderer(BaseRenderer):
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
__package__ = 'archivebox.api'
|
||||
__package__ = "archivebox.api"
|
||||
|
||||
from typing import Optional
|
||||
from django.http import HttpRequest
|
||||
|
||||
from ninja import Router, Schema
|
||||
@@ -8,16 +7,21 @@ from ninja import Router, Schema
|
||||
from archivebox.api.auth import auth_using_token, auth_using_password, get_or_create_api_token
|
||||
|
||||
|
||||
router = Router(tags=['Authentication'], auth=None)
|
||||
router = Router(tags=["Authentication"], auth=None)
|
||||
|
||||
|
||||
class PasswordAuthSchema(Schema):
|
||||
"""Schema for a /get_api_token request"""
|
||||
username: Optional[str] = None
|
||||
password: Optional[str] = None
|
||||
|
||||
username: str | None = None
|
||||
password: str | None = None
|
||||
|
||||
|
||||
@router.post("/get_api_token", auth=None, summary='Generate an API token for a given username & password (or currently logged-in user)') # auth=None because they are not authed yet
|
||||
@router.post(
|
||||
"/get_api_token",
|
||||
auth=None,
|
||||
summary="Generate an API token for a given username & password (or currently logged-in user)",
|
||||
) # auth=None because they are not authed yet
|
||||
def get_api_token(request: HttpRequest, auth_data: PasswordAuthSchema):
|
||||
user = auth_using_password(
|
||||
username=auth_data.username,
|
||||
@@ -35,17 +39,21 @@ def get_api_token(request: HttpRequest, auth_data: PasswordAuthSchema):
|
||||
"token": api_token.token,
|
||||
"expires": api_token.expires.isoformat() if api_token.expires else None,
|
||||
}
|
||||
|
||||
return {"success": False, "errors": ["Invalid credentials"]}
|
||||
|
||||
return {"success": False, "errors": ["Invalid credentials"]}
|
||||
|
||||
|
||||
class TokenAuthSchema(Schema):
|
||||
"""Schema for a /check_api_token request"""
|
||||
|
||||
token: str
|
||||
|
||||
|
||||
@router.post("/check_api_token", auth=None, summary='Validate an API token to make sure its valid and non-expired') # auth=None because they are not authed yet
|
||||
@router.post(
|
||||
"/check_api_token",
|
||||
auth=None,
|
||||
summary="Validate an API token to make sure its valid and non-expired",
|
||||
) # auth=None because they are not authed yet
|
||||
def check_api_token(request: HttpRequest, token_data: TokenAuthSchema):
|
||||
user = auth_using_token(
|
||||
token=token_data.token,
|
||||
@@ -53,5 +61,5 @@ def check_api_token(request: HttpRequest, token_data: TokenAuthSchema):
|
||||
)
|
||||
if user:
|
||||
return {"success": True, "user_id": str(user.pk)}
|
||||
|
||||
|
||||
return {"success": False, "user_id": None}
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
__package__ = 'archivebox.api'
|
||||
__package__ = "archivebox.api"
|
||||
|
||||
import json
|
||||
from io import StringIO
|
||||
from typing import List, Dict, Any, Optional
|
||||
from typing import Any
|
||||
from enum import Enum
|
||||
|
||||
from django.http import HttpRequest
|
||||
@@ -16,44 +16,47 @@ from archivebox.config.common import ARCHIVING_CONFIG
|
||||
# from .auth import API_AUTH_METHODS
|
||||
|
||||
# router for API that exposes archivebox cli subcommands as REST endpoints
|
||||
router = Router(tags=['ArchiveBox CLI Sub-Commands'])
|
||||
router = Router(tags=["ArchiveBox CLI Sub-Commands"])
|
||||
|
||||
|
||||
# Schemas
|
||||
|
||||
JSONType = List[Any] | Dict[str, Any] | bool | int | str | None
|
||||
JSONType = list[Any] | dict[str, Any] | bool | int | str | None
|
||||
|
||||
|
||||
class CLICommandResponseSchema(Schema):
|
||||
success: bool
|
||||
errors: List[str]
|
||||
errors: list[str]
|
||||
result: JSONType
|
||||
result_format: str = 'str'
|
||||
result_format: str = "str"
|
||||
stdout: str
|
||||
stderr: str
|
||||
|
||||
|
||||
class FilterTypeChoices(str, Enum):
|
||||
exact = 'exact'
|
||||
substring = 'substring'
|
||||
regex = 'regex'
|
||||
domain = 'domain'
|
||||
tag = 'tag'
|
||||
timestamp = 'timestamp'
|
||||
exact = "exact"
|
||||
substring = "substring"
|
||||
regex = "regex"
|
||||
domain = "domain"
|
||||
tag = "tag"
|
||||
timestamp = "timestamp"
|
||||
|
||||
|
||||
class StatusChoices(str, Enum):
|
||||
indexed = 'indexed'
|
||||
archived = 'archived'
|
||||
unarchived = 'unarchived'
|
||||
present = 'present'
|
||||
valid = 'valid'
|
||||
invalid = 'invalid'
|
||||
duplicate = 'duplicate'
|
||||
orphaned = 'orphaned'
|
||||
corrupted = 'corrupted'
|
||||
unrecognized = 'unrecognized'
|
||||
indexed = "indexed"
|
||||
archived = "archived"
|
||||
unarchived = "unarchived"
|
||||
present = "present"
|
||||
valid = "valid"
|
||||
invalid = "invalid"
|
||||
duplicate = "duplicate"
|
||||
orphaned = "orphaned"
|
||||
corrupted = "corrupted"
|
||||
unrecognized = "unrecognized"
|
||||
|
||||
|
||||
class AddCommandSchema(Schema):
|
||||
urls: List[str]
|
||||
urls: list[str]
|
||||
tag: str = ""
|
||||
depth: int = 0
|
||||
parser: str = "auto"
|
||||
@@ -62,53 +65,54 @@ class AddCommandSchema(Schema):
|
||||
overwrite: bool = False
|
||||
index_only: bool = False
|
||||
|
||||
|
||||
class UpdateCommandSchema(Schema):
|
||||
resume: Optional[str] = None
|
||||
after: Optional[float] = 0
|
||||
before: Optional[float] = 999999999999999
|
||||
filter_type: Optional[str] = FilterTypeChoices.substring
|
||||
filter_patterns: Optional[List[str]] = ['https://example.com']
|
||||
resume: str | None = None
|
||||
after: float | None = 0
|
||||
before: float | None = 999999999999999
|
||||
filter_type: str | None = FilterTypeChoices.substring
|
||||
filter_patterns: list[str] | None = ["https://example.com"]
|
||||
batch_size: int = 100
|
||||
continuous: bool = False
|
||||
|
||||
|
||||
class ScheduleCommandSchema(Schema):
|
||||
import_path: Optional[str] = None
|
||||
import_path: str | None = None
|
||||
add: bool = False
|
||||
show: bool = False
|
||||
foreground: bool = False
|
||||
run_all: bool = False
|
||||
quiet: bool = False
|
||||
every: Optional[str] = None
|
||||
tag: str = ''
|
||||
every: str | None = None
|
||||
tag: str = ""
|
||||
depth: int = 0
|
||||
overwrite: bool = False
|
||||
update: bool = not ARCHIVING_CONFIG.ONLY_NEW
|
||||
clear: bool = False
|
||||
|
||||
|
||||
class ListCommandSchema(Schema):
|
||||
filter_patterns: Optional[List[str]] = ['https://example.com']
|
||||
filter_patterns: list[str] | None = ["https://example.com"]
|
||||
filter_type: str = FilterTypeChoices.substring
|
||||
status: StatusChoices = StatusChoices.indexed
|
||||
after: Optional[float] = 0
|
||||
before: Optional[float] = 999999999999999
|
||||
sort: str = 'bookmarked_at'
|
||||
after: float | None = 0
|
||||
before: float | None = 999999999999999
|
||||
sort: str = "bookmarked_at"
|
||||
as_json: bool = True
|
||||
as_html: bool = False
|
||||
as_csv: str | None = 'timestamp,url'
|
||||
as_csv: str | None = "timestamp,url"
|
||||
with_headers: bool = False
|
||||
|
||||
|
||||
class RemoveCommandSchema(Schema):
|
||||
delete: bool = True
|
||||
after: Optional[float] = 0
|
||||
before: Optional[float] = 999999999999999
|
||||
after: float | None = 0
|
||||
before: float | None = 999999999999999
|
||||
filter_type: str = FilterTypeChoices.exact
|
||||
filter_patterns: Optional[List[str]] = ['https://example.com']
|
||||
filter_patterns: list[str] | None = ["https://example.com"]
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@router.post("/add", response=CLICommandResponseSchema, summary='archivebox add [args] [urls]')
|
||||
@router.post("/add", response=CLICommandResponseSchema, summary="archivebox add [args] [urls]")
|
||||
def cli_add(request: HttpRequest, args: AddCommandSchema):
|
||||
from archivebox.cli.archivebox_add import add
|
||||
|
||||
@@ -125,30 +129,30 @@ def cli_add(request: HttpRequest, args: AddCommandSchema):
|
||||
created_by_id=request.user.pk,
|
||||
)
|
||||
|
||||
snapshot_ids = [str(snapshot_id) for snapshot_id in snapshots.values_list('id', flat=True)]
|
||||
snapshot_ids = [str(snapshot_id) for snapshot_id in snapshots.values_list("id", flat=True)]
|
||||
result_payload = {
|
||||
"crawl_id": str(crawl.id),
|
||||
"num_snapshots": len(snapshot_ids),
|
||||
"snapshot_ids": snapshot_ids,
|
||||
"queued_urls": args.urls,
|
||||
}
|
||||
stdout = getattr(request, 'stdout', None)
|
||||
stderr = getattr(request, 'stderr', None)
|
||||
stdout = getattr(request, "stdout", None)
|
||||
stderr = getattr(request, "stderr", None)
|
||||
|
||||
return {
|
||||
"success": True,
|
||||
"errors": [],
|
||||
"result": result_payload,
|
||||
"result_format": "json",
|
||||
"stdout": ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else '',
|
||||
"stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else '',
|
||||
"stdout": ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else "",
|
||||
"stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else "",
|
||||
}
|
||||
|
||||
|
||||
@router.post("/update", response=CLICommandResponseSchema, summary='archivebox update [args] [filter_patterns]')
|
||||
@router.post("/update", response=CLICommandResponseSchema, summary="archivebox update [args] [filter_patterns]")
|
||||
def cli_update(request: HttpRequest, args: UpdateCommandSchema):
|
||||
from archivebox.cli.archivebox_update import update
|
||||
|
||||
|
||||
result = update(
|
||||
filter_patterns=args.filter_patterns or [],
|
||||
filter_type=args.filter_type or FilterTypeChoices.substring,
|
||||
@@ -158,21 +162,21 @@ def cli_update(request: HttpRequest, args: UpdateCommandSchema):
|
||||
batch_size=args.batch_size,
|
||||
continuous=args.continuous,
|
||||
)
|
||||
stdout = getattr(request, 'stdout', None)
|
||||
stderr = getattr(request, 'stderr', None)
|
||||
stdout = getattr(request, "stdout", None)
|
||||
stderr = getattr(request, "stderr", None)
|
||||
return {
|
||||
"success": True,
|
||||
"errors": [],
|
||||
"result": result,
|
||||
"stdout": ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else '',
|
||||
"stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else '',
|
||||
"stdout": ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else "",
|
||||
"stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else "",
|
||||
}
|
||||
|
||||
|
||||
@router.post("/schedule", response=CLICommandResponseSchema, summary='archivebox schedule [args] [import_path]')
|
||||
@router.post("/schedule", response=CLICommandResponseSchema, summary="archivebox schedule [args] [import_path]")
|
||||
def cli_schedule(request: HttpRequest, args: ScheduleCommandSchema):
|
||||
from archivebox.cli.archivebox_schedule import schedule
|
||||
|
||||
|
||||
result = schedule(
|
||||
import_path=args.import_path,
|
||||
add=args.add,
|
||||
@@ -188,23 +192,22 @@ def cli_schedule(request: HttpRequest, args: ScheduleCommandSchema):
|
||||
update=args.update,
|
||||
)
|
||||
|
||||
stdout = getattr(request, 'stdout', None)
|
||||
stderr = getattr(request, 'stderr', None)
|
||||
stdout = getattr(request, "stdout", None)
|
||||
stderr = getattr(request, "stderr", None)
|
||||
return {
|
||||
"success": True,
|
||||
"errors": [],
|
||||
"result": result,
|
||||
"result_format": "json",
|
||||
"stdout": ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else '',
|
||||
"stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else '',
|
||||
"stdout": ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else "",
|
||||
"stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else "",
|
||||
}
|
||||
|
||||
|
||||
|
||||
@router.post("/search", response=CLICommandResponseSchema, summary='archivebox search [args] [filter_patterns]')
|
||||
@router.post("/search", response=CLICommandResponseSchema, summary="archivebox search [args] [filter_patterns]")
|
||||
def cli_search(request: HttpRequest, args: ListCommandSchema):
|
||||
from archivebox.cli.archivebox_search import search
|
||||
|
||||
|
||||
result = search(
|
||||
filter_patterns=args.filter_patterns,
|
||||
filter_type=args.filter_type,
|
||||
@@ -218,7 +221,7 @@ def cli_search(request: HttpRequest, args: ListCommandSchema):
|
||||
with_headers=args.with_headers,
|
||||
)
|
||||
|
||||
result_format = 'txt'
|
||||
result_format = "txt"
|
||||
if args.as_json:
|
||||
result_format = "json"
|
||||
result = json.loads(result)
|
||||
@@ -227,20 +230,19 @@ def cli_search(request: HttpRequest, args: ListCommandSchema):
|
||||
elif args.as_csv:
|
||||
result_format = "csv"
|
||||
|
||||
stdout = getattr(request, 'stdout', None)
|
||||
stderr = getattr(request, 'stderr', None)
|
||||
stdout = getattr(request, "stdout", None)
|
||||
stderr = getattr(request, "stderr", None)
|
||||
return {
|
||||
"success": True,
|
||||
"errors": [],
|
||||
"result": result,
|
||||
"result_format": result_format,
|
||||
"stdout": ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else '',
|
||||
"stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else '',
|
||||
"stdout": ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else "",
|
||||
"stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else "",
|
||||
}
|
||||
|
||||
|
||||
|
||||
@router.post("/remove", response=CLICommandResponseSchema, summary='archivebox remove [args] [filter_patterns]')
|
||||
@router.post("/remove", response=CLICommandResponseSchema, summary="archivebox remove [args] [filter_patterns]")
|
||||
def cli_remove(request: HttpRequest, args: RemoveCommandSchema):
|
||||
from archivebox.cli.archivebox_remove import remove
|
||||
from archivebox.cli.archivebox_search import get_snapshots
|
||||
@@ -253,10 +255,10 @@ def cli_remove(request: HttpRequest, args: RemoveCommandSchema):
|
||||
after=args.after,
|
||||
before=args.before,
|
||||
)
|
||||
removed_snapshot_ids = [str(snapshot_id) for snapshot_id in snapshots_to_remove.values_list('id', flat=True)]
|
||||
|
||||
removed_snapshot_ids = [str(snapshot_id) for snapshot_id in snapshots_to_remove.values_list("id", flat=True)]
|
||||
|
||||
remove(
|
||||
yes=True, # no way to interactively ask for confirmation via API, so we force yes
|
||||
yes=True, # no way to interactively ask for confirmation via API, so we force yes
|
||||
delete=args.delete,
|
||||
snapshots=snapshots_to_remove,
|
||||
before=args.before,
|
||||
@@ -270,14 +272,13 @@ def cli_remove(request: HttpRequest, args: RemoveCommandSchema):
|
||||
"removed_snapshot_ids": removed_snapshot_ids,
|
||||
"remaining_snapshots": Snapshot.objects.count(),
|
||||
}
|
||||
stdout = getattr(request, 'stdout', None)
|
||||
stderr = getattr(request, 'stderr', None)
|
||||
stdout = getattr(request, "stdout", None)
|
||||
stderr = getattr(request, "stderr", None)
|
||||
return {
|
||||
"success": True,
|
||||
"errors": [],
|
||||
"result": result,
|
||||
"result_format": "json",
|
||||
"stdout": ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else '',
|
||||
"stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else '',
|
||||
"stdout": ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else "",
|
||||
"stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else "",
|
||||
}
|
||||
|
||||
|
||||
@@ -1,11 +1,13 @@
|
||||
__package__ = 'archivebox.api'
|
||||
__package__ = "archivebox.api"
|
||||
|
||||
import math
|
||||
from collections import defaultdict
|
||||
from uuid import UUID
|
||||
from typing import List, Optional, Union, Any, Annotated
|
||||
from typing import Union, Any, Annotated
|
||||
from datetime import datetime
|
||||
|
||||
from django.db.models import Model, Q
|
||||
from django.db.models import Model, Q, Sum
|
||||
from django.db.models.functions import Coalesce
|
||||
from django.conf import settings
|
||||
from django.http import HttpRequest, HttpResponse
|
||||
from django.core.exceptions import ValidationError
|
||||
@@ -39,7 +41,7 @@ from archivebox.crawls.models import Crawl
|
||||
from archivebox.api.v1_crawls import CrawlSchema
|
||||
|
||||
|
||||
router = Router(tags=['Core Models'])
|
||||
router = Router(tags=["Core Models"])
|
||||
|
||||
|
||||
class CustomPagination(PaginationBase):
|
||||
@@ -49,13 +51,14 @@ class CustomPagination(PaginationBase):
|
||||
page: int = 0
|
||||
|
||||
class Output(PaginationBase.Output):
|
||||
count: int
|
||||
total_items: int
|
||||
total_pages: int
|
||||
page: int
|
||||
limit: int
|
||||
offset: int
|
||||
num_items: int
|
||||
items: List[Any]
|
||||
items: list[Any]
|
||||
|
||||
def paginate_queryset(self, queryset, pagination: Input, request: HttpRequest, **params):
|
||||
limit = min(pagination.limit, 500)
|
||||
@@ -65,27 +68,29 @@ class CustomPagination(PaginationBase):
|
||||
current_page = math.ceil(offset / (limit + 1))
|
||||
items = queryset[offset : offset + limit]
|
||||
return {
|
||||
'total_items': total,
|
||||
'total_pages': total_pages,
|
||||
'page': current_page,
|
||||
'limit': limit,
|
||||
'offset': offset,
|
||||
'num_items': len(items),
|
||||
'items': items,
|
||||
"count": total,
|
||||
"total_items": total,
|
||||
"total_pages": total_pages,
|
||||
"page": current_page,
|
||||
"limit": limit,
|
||||
"offset": offset,
|
||||
"num_items": len(items),
|
||||
"items": items,
|
||||
}
|
||||
|
||||
|
||||
### ArchiveResult #########################################################################
|
||||
|
||||
|
||||
class MinimalArchiveResultSchema(Schema):
|
||||
TYPE: str = 'core.models.ArchiveResult'
|
||||
TYPE: str = "core.models.ArchiveResult"
|
||||
id: UUID
|
||||
created_at: datetime | None
|
||||
modified_at: datetime | None
|
||||
created_by_id: str
|
||||
created_by_username: str
|
||||
status: str
|
||||
retry_at: datetime | None
|
||||
retry_at: datetime | None = None
|
||||
plugin: str
|
||||
hook_name: str
|
||||
process_id: UUID | None
|
||||
@@ -93,8 +98,8 @@ class MinimalArchiveResultSchema(Schema):
|
||||
cmd: list[str] | None
|
||||
pwd: str | None
|
||||
output_str: str
|
||||
output_json: dict | None
|
||||
output_files: dict | None
|
||||
output_json: dict[str, Any] | None
|
||||
output_files: dict[str, dict[str, Any]] | None
|
||||
output_size: int
|
||||
output_mimetypes: str
|
||||
start_ts: datetime | None
|
||||
@@ -108,13 +113,34 @@ class MinimalArchiveResultSchema(Schema):
|
||||
def resolve_created_by_username(obj) -> str:
|
||||
return obj.created_by.username
|
||||
|
||||
@staticmethod
|
||||
def resolve_output_files(obj):
|
||||
return obj.output_file_map()
|
||||
|
||||
@staticmethod
|
||||
def resolve_output_mimetypes(obj) -> str:
|
||||
mime_sizes: dict[str, int] = defaultdict(int)
|
||||
for metadata in obj.output_file_map().values():
|
||||
if not isinstance(metadata, dict):
|
||||
continue
|
||||
mimetype = str(metadata.get("mimetype") or "").strip()
|
||||
try:
|
||||
size = max(int(metadata.get("size") or 0), 0)
|
||||
except (TypeError, ValueError):
|
||||
size = 0
|
||||
if mimetype and size:
|
||||
mime_sizes[mimetype] += size
|
||||
if mime_sizes:
|
||||
return ",".join(mime for mime, _size in sorted(mime_sizes.items(), key=lambda item: item[1], reverse=True))
|
||||
return obj.output_mimetypes or ""
|
||||
|
||||
|
||||
class ArchiveResultSchema(MinimalArchiveResultSchema):
|
||||
TYPE: str = 'core.models.ArchiveResult'
|
||||
TYPE: str = "core.models.ArchiveResult"
|
||||
snapshot_id: UUID
|
||||
snapshot_timestamp: str
|
||||
snapshot_url: str
|
||||
snapshot_tags: List[str]
|
||||
snapshot_tags: list[str]
|
||||
|
||||
@staticmethod
|
||||
def resolve_snapshot_timestamp(obj):
|
||||
@@ -134,25 +160,39 @@ class ArchiveResultSchema(MinimalArchiveResultSchema):
|
||||
|
||||
|
||||
class ArchiveResultFilterSchema(FilterSchema):
|
||||
id: Annotated[Optional[str], FilterLookup(['id__startswith', 'snapshot__id__startswith', 'snapshot__timestamp__startswith'])] = None
|
||||
search: Annotated[Optional[str], FilterLookup(['snapshot__url__icontains', 'snapshot__title__icontains', 'snapshot__tags__name__icontains', 'plugin', 'output_str__icontains', 'id__startswith', 'snapshot__id__startswith', 'snapshot__timestamp__startswith'])] = None
|
||||
snapshot_id: Annotated[Optional[str], FilterLookup(['snapshot__id__startswith', 'snapshot__timestamp__startswith'])] = None
|
||||
snapshot_url: Annotated[Optional[str], FilterLookup('snapshot__url__icontains')] = None
|
||||
snapshot_tag: Annotated[Optional[str], FilterLookup('snapshot__tags__name__icontains')] = None
|
||||
status: Annotated[Optional[str], FilterLookup('status')] = None
|
||||
output_str: Annotated[Optional[str], FilterLookup('output_str__icontains')] = None
|
||||
plugin: Annotated[Optional[str], FilterLookup('plugin__icontains')] = None
|
||||
hook_name: Annotated[Optional[str], FilterLookup('hook_name__icontains')] = None
|
||||
process_id: Annotated[Optional[str], FilterLookup('process__id__startswith')] = None
|
||||
cmd: Annotated[Optional[str], FilterLookup('cmd__0__icontains')] = None
|
||||
pwd: Annotated[Optional[str], FilterLookup('pwd__icontains')] = None
|
||||
cmd_version: Annotated[Optional[str], FilterLookup('cmd_version')] = None
|
||||
created_at: Annotated[Optional[datetime], FilterLookup('created_at')] = None
|
||||
created_at__gte: Annotated[Optional[datetime], FilterLookup('created_at__gte')] = None
|
||||
created_at__lt: Annotated[Optional[datetime], FilterLookup('created_at__lt')] = None
|
||||
id: Annotated[str | None, FilterLookup(["id__startswith", "snapshot__id__startswith", "snapshot__timestamp__startswith"])] = None
|
||||
search: Annotated[
|
||||
str | None,
|
||||
FilterLookup(
|
||||
[
|
||||
"snapshot__url__icontains",
|
||||
"snapshot__title__icontains",
|
||||
"snapshot__tags__name__icontains",
|
||||
"plugin",
|
||||
"output_str__icontains",
|
||||
"id__startswith",
|
||||
"snapshot__id__startswith",
|
||||
"snapshot__timestamp__startswith",
|
||||
],
|
||||
),
|
||||
] = None
|
||||
snapshot_id: Annotated[str | None, FilterLookup(["snapshot__id__startswith", "snapshot__timestamp__startswith"])] = None
|
||||
snapshot_url: Annotated[str | None, FilterLookup("snapshot__url__icontains")] = None
|
||||
snapshot_tag: Annotated[str | None, FilterLookup("snapshot__tags__name__icontains")] = None
|
||||
status: Annotated[str | None, FilterLookup("status")] = None
|
||||
output_str: Annotated[str | None, FilterLookup("output_str__icontains")] = None
|
||||
plugin: Annotated[str | None, FilterLookup("plugin__icontains")] = None
|
||||
hook_name: Annotated[str | None, FilterLookup("hook_name__icontains")] = None
|
||||
process_id: Annotated[str | None, FilterLookup("process__id__startswith")] = None
|
||||
cmd: Annotated[str | None, FilterLookup("cmd__0__icontains")] = None
|
||||
pwd: Annotated[str | None, FilterLookup("pwd__icontains")] = None
|
||||
cmd_version: Annotated[str | None, FilterLookup("cmd_version")] = None
|
||||
created_at: Annotated[datetime | None, FilterLookup("created_at")] = None
|
||||
created_at__gte: Annotated[datetime | None, FilterLookup("created_at__gte")] = None
|
||||
created_at__lt: Annotated[datetime | None, FilterLookup("created_at__lt")] = None
|
||||
|
||||
|
||||
@router.get("/archiveresults", response=List[ArchiveResultSchema], url_name="get_archiveresult")
|
||||
@router.get("/archiveresults", response=list[ArchiveResultSchema], url_name="get_archiveresult")
|
||||
@paginate(CustomPagination)
|
||||
def get_archiveresults(request: HttpRequest, filters: Query[ArchiveResultFilterSchema]):
|
||||
"""List all ArchiveResult entries matching these filters."""
|
||||
@@ -167,8 +207,9 @@ def get_archiveresult(request: HttpRequest, archiveresult_id: str):
|
||||
|
||||
### Snapshot #########################################################################
|
||||
|
||||
|
||||
class SnapshotSchema(Schema):
|
||||
TYPE: str = 'core.models.Snapshot'
|
||||
TYPE: str = "core.models.Snapshot"
|
||||
id: UUID
|
||||
created_by_id: str
|
||||
created_by_username: str
|
||||
@@ -177,14 +218,16 @@ class SnapshotSchema(Schema):
|
||||
status: str
|
||||
retry_at: datetime | None
|
||||
bookmarked_at: datetime
|
||||
downloaded_at: Optional[datetime]
|
||||
downloaded_at: datetime | None
|
||||
url: str
|
||||
tags: List[str]
|
||||
title: Optional[str]
|
||||
tags: list[str]
|
||||
title: str | None
|
||||
timestamp: str
|
||||
archive_path: str
|
||||
archive_size: int
|
||||
output_size: int
|
||||
num_archiveresults: int
|
||||
archiveresults: List[MinimalArchiveResultSchema]
|
||||
archiveresults: list[MinimalArchiveResultSchema]
|
||||
|
||||
@staticmethod
|
||||
def resolve_created_by_id(obj):
|
||||
@@ -198,13 +241,21 @@ class SnapshotSchema(Schema):
|
||||
def resolve_tags(obj):
|
||||
return sorted(tag.name for tag in obj.tags.all())
|
||||
|
||||
@staticmethod
|
||||
def resolve_archive_size(obj):
|
||||
return int(getattr(obj, "output_size_sum", obj.archive_size) or 0)
|
||||
|
||||
@staticmethod
|
||||
def resolve_output_size(obj):
|
||||
return SnapshotSchema.resolve_archive_size(obj)
|
||||
|
||||
@staticmethod
|
||||
def resolve_num_archiveresults(obj, context):
|
||||
return obj.archiveresult_set.all().distinct().count()
|
||||
|
||||
@staticmethod
|
||||
def resolve_archiveresults(obj, context):
|
||||
if bool(getattr(context['request'], 'with_archiveresults', False)):
|
||||
if bool(getattr(context["request"], "with_archiveresults", False)):
|
||||
return obj.archiveresult_set.all().distinct()
|
||||
return ArchiveResult.objects.none()
|
||||
|
||||
@@ -212,16 +263,16 @@ class SnapshotSchema(Schema):
|
||||
class SnapshotUpdateSchema(Schema):
|
||||
status: str | None = None
|
||||
retry_at: datetime | None = None
|
||||
tags: Optional[List[str]] = None
|
||||
tags: list[str] | None = None
|
||||
|
||||
|
||||
class SnapshotCreateSchema(Schema):
|
||||
url: str
|
||||
crawl_id: Optional[str] = None
|
||||
crawl_id: str | None = None
|
||||
depth: int = 0
|
||||
title: Optional[str] = None
|
||||
tags: Optional[List[str]] = None
|
||||
status: Optional[str] = None
|
||||
title: str | None = None
|
||||
tags: list[str] | None = None
|
||||
status: str | None = None
|
||||
|
||||
|
||||
class SnapshotDeleteResponseSchema(Schema):
|
||||
@@ -231,77 +282,82 @@ class SnapshotDeleteResponseSchema(Schema):
|
||||
deleted_count: int
|
||||
|
||||
|
||||
def normalize_tag_list(tags: Optional[List[str]] = None) -> List[str]:
|
||||
def normalize_tag_list(tags: list[str] | None = None) -> list[str]:
|
||||
return [tag.strip() for tag in (tags or []) if tag and tag.strip()]
|
||||
|
||||
|
||||
class SnapshotFilterSchema(FilterSchema):
|
||||
id: Annotated[Optional[str], FilterLookup(['id__icontains', 'timestamp__startswith'])] = None
|
||||
created_by_id: Annotated[Optional[str], FilterLookup('crawl__created_by_id')] = None
|
||||
created_by_username: Annotated[Optional[str], FilterLookup('crawl__created_by__username__icontains')] = None
|
||||
created_at__gte: Annotated[Optional[datetime], FilterLookup('created_at__gte')] = None
|
||||
created_at__lt: Annotated[Optional[datetime], FilterLookup('created_at__lt')] = None
|
||||
created_at: Annotated[Optional[datetime], FilterLookup('created_at')] = None
|
||||
modified_at: Annotated[Optional[datetime], FilterLookup('modified_at')] = None
|
||||
modified_at__gte: Annotated[Optional[datetime], FilterLookup('modified_at__gte')] = None
|
||||
modified_at__lt: Annotated[Optional[datetime], FilterLookup('modified_at__lt')] = None
|
||||
search: Annotated[Optional[str], FilterLookup(['url__icontains', 'title__icontains', 'tags__name__icontains', 'id__icontains', 'timestamp__startswith'])] = None
|
||||
url: Annotated[Optional[str], FilterLookup('url')] = None
|
||||
tag: Annotated[Optional[str], FilterLookup('tags__name')] = None
|
||||
title: Annotated[Optional[str], FilterLookup('title__icontains')] = None
|
||||
timestamp: Annotated[Optional[str], FilterLookup('timestamp__startswith')] = None
|
||||
bookmarked_at__gte: Annotated[Optional[datetime], FilterLookup('bookmarked_at__gte')] = None
|
||||
bookmarked_at__lt: Annotated[Optional[datetime], FilterLookup('bookmarked_at__lt')] = None
|
||||
id: Annotated[str | None, FilterLookup(["id__icontains", "timestamp__startswith"])] = None
|
||||
created_by_id: Annotated[str | None, FilterLookup("crawl__created_by_id")] = None
|
||||
created_by_username: Annotated[str | None, FilterLookup("crawl__created_by__username__icontains")] = None
|
||||
created_at__gte: Annotated[datetime | None, FilterLookup("created_at__gte")] = None
|
||||
created_at__lt: Annotated[datetime | None, FilterLookup("created_at__lt")] = None
|
||||
created_at: Annotated[datetime | None, FilterLookup("created_at")] = None
|
||||
modified_at: Annotated[datetime | None, FilterLookup("modified_at")] = None
|
||||
modified_at__gte: Annotated[datetime | None, FilterLookup("modified_at__gte")] = None
|
||||
modified_at__lt: Annotated[datetime | None, FilterLookup("modified_at__lt")] = None
|
||||
search: Annotated[
|
||||
str | None,
|
||||
FilterLookup(["url__icontains", "title__icontains", "tags__name__icontains", "id__icontains", "timestamp__startswith"]),
|
||||
] = None
|
||||
url: Annotated[str | None, FilterLookup("url")] = None
|
||||
tag: Annotated[str | None, FilterLookup("tags__name")] = None
|
||||
title: Annotated[str | None, FilterLookup("title__icontains")] = None
|
||||
timestamp: Annotated[str | None, FilterLookup("timestamp__startswith")] = None
|
||||
bookmarked_at__gte: Annotated[datetime | None, FilterLookup("bookmarked_at__gte")] = None
|
||||
bookmarked_at__lt: Annotated[datetime | None, FilterLookup("bookmarked_at__lt")] = None
|
||||
|
||||
|
||||
@router.get("/snapshots", response=List[SnapshotSchema], url_name="get_snapshots")
|
||||
@router.get("/snapshots", response=list[SnapshotSchema], url_name="get_snapshots")
|
||||
@paginate(CustomPagination)
|
||||
def get_snapshots(request: HttpRequest, filters: Query[SnapshotFilterSchema], with_archiveresults: bool = False):
|
||||
"""List all Snapshot entries matching these filters."""
|
||||
setattr(request, 'with_archiveresults', with_archiveresults)
|
||||
return filters.filter(Snapshot.objects.all()).distinct()
|
||||
setattr(request, "with_archiveresults", with_archiveresults)
|
||||
queryset = Snapshot.objects.annotate(output_size_sum=Coalesce(Sum("archiveresult__output_size"), 0))
|
||||
return filters.filter(queryset).distinct()
|
||||
|
||||
|
||||
@router.get("/snapshot/{snapshot_id}", response=SnapshotSchema, url_name="get_snapshot")
|
||||
def get_snapshot(request: HttpRequest, snapshot_id: str, with_archiveresults: bool = True):
|
||||
"""Get a specific Snapshot by id."""
|
||||
setattr(request, 'with_archiveresults', with_archiveresults)
|
||||
setattr(request, "with_archiveresults", with_archiveresults)
|
||||
queryset = Snapshot.objects.annotate(output_size_sum=Coalesce(Sum("archiveresult__output_size"), 0))
|
||||
try:
|
||||
return Snapshot.objects.get(Q(id__startswith=snapshot_id) | Q(timestamp__startswith=snapshot_id))
|
||||
return queryset.get(Q(id__startswith=snapshot_id) | Q(timestamp__startswith=snapshot_id))
|
||||
except Snapshot.DoesNotExist:
|
||||
return Snapshot.objects.get(Q(id__icontains=snapshot_id))
|
||||
return queryset.get(Q(id__icontains=snapshot_id))
|
||||
|
||||
|
||||
@router.post("/snapshots", response=SnapshotSchema, url_name="create_snapshot")
|
||||
def create_snapshot(request: HttpRequest, data: SnapshotCreateSchema):
|
||||
tags = normalize_tag_list(data.tags)
|
||||
if data.status is not None and data.status not in Snapshot.StatusChoices.values:
|
||||
raise HttpError(400, f'Invalid status: {data.status}')
|
||||
raise HttpError(400, f"Invalid status: {data.status}")
|
||||
if not data.url.strip():
|
||||
raise HttpError(400, 'URL is required')
|
||||
raise HttpError(400, "URL is required")
|
||||
if data.depth not in (0, 1, 2, 3, 4):
|
||||
raise HttpError(400, 'depth must be between 0 and 4')
|
||||
raise HttpError(400, "depth must be between 0 and 4")
|
||||
|
||||
if data.crawl_id:
|
||||
crawl = Crawl.objects.get(id__icontains=data.crawl_id)
|
||||
crawl_tags = normalize_tag_list(crawl.tags_str.split(','))
|
||||
crawl_tags = normalize_tag_list(crawl.tags_str.split(","))
|
||||
tags = tags or crawl_tags
|
||||
else:
|
||||
crawl = Crawl.objects.create(
|
||||
urls=data.url,
|
||||
max_depth=max(data.depth, 0),
|
||||
tags_str=','.join(tags),
|
||||
tags_str=",".join(tags),
|
||||
status=Crawl.StatusChoices.QUEUED,
|
||||
retry_at=timezone.now(),
|
||||
created_by=request.user if isinstance(request.user, User) else None,
|
||||
)
|
||||
|
||||
snapshot_defaults = {
|
||||
'depth': data.depth,
|
||||
'title': data.title,
|
||||
'timestamp': str(timezone.now().timestamp()),
|
||||
'status': data.status or Snapshot.StatusChoices.QUEUED,
|
||||
'retry_at': timezone.now(),
|
||||
"depth": data.depth,
|
||||
"title": data.title,
|
||||
"timestamp": str(timezone.now().timestamp()),
|
||||
"status": data.status or Snapshot.StatusChoices.QUEUED,
|
||||
"retry_at": timezone.now(),
|
||||
}
|
||||
snapshot, _ = Snapshot.objects.get_or_create(
|
||||
url=data.url,
|
||||
@@ -309,17 +365,17 @@ def create_snapshot(request: HttpRequest, data: SnapshotCreateSchema):
|
||||
defaults=snapshot_defaults,
|
||||
)
|
||||
|
||||
update_fields: List[str] = []
|
||||
update_fields: list[str] = []
|
||||
if data.title is not None and snapshot.title != data.title:
|
||||
snapshot.title = data.title
|
||||
update_fields.append('title')
|
||||
update_fields.append("title")
|
||||
if data.status is not None and snapshot.status != data.status:
|
||||
if data.status not in Snapshot.StatusChoices.values:
|
||||
raise HttpError(400, f'Invalid status: {data.status}')
|
||||
raise HttpError(400, f"Invalid status: {data.status}")
|
||||
snapshot.status = data.status
|
||||
update_fields.append('status')
|
||||
update_fields.append("status")
|
||||
if update_fields:
|
||||
update_fields.append('modified_at')
|
||||
update_fields.append("modified_at")
|
||||
snapshot.save(update_fields=update_fields)
|
||||
|
||||
if tags:
|
||||
@@ -330,7 +386,7 @@ def create_snapshot(request: HttpRequest, data: SnapshotCreateSchema):
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
setattr(request, 'with_archiveresults', False)
|
||||
setattr(request, "with_archiveresults", False)
|
||||
return snapshot
|
||||
|
||||
|
||||
@@ -343,26 +399,26 @@ def patch_snapshot(request: HttpRequest, snapshot_id: str, data: SnapshotUpdateS
|
||||
snapshot = Snapshot.objects.get(Q(id__icontains=snapshot_id))
|
||||
|
||||
payload = data.dict(exclude_unset=True)
|
||||
update_fields = ['modified_at']
|
||||
tags = payload.pop('tags', None)
|
||||
update_fields = ["modified_at"]
|
||||
tags = payload.pop("tags", None)
|
||||
|
||||
if 'status' in payload:
|
||||
if payload['status'] not in Snapshot.StatusChoices.values:
|
||||
raise HttpError(400, f'Invalid status: {payload["status"]}')
|
||||
snapshot.status = payload['status']
|
||||
if snapshot.status == Snapshot.StatusChoices.SEALED and 'retry_at' not in payload:
|
||||
if "status" in payload:
|
||||
if payload["status"] not in Snapshot.StatusChoices.values:
|
||||
raise HttpError(400, f"Invalid status: {payload['status']}")
|
||||
snapshot.status = payload["status"]
|
||||
if snapshot.status == Snapshot.StatusChoices.SEALED and "retry_at" not in payload:
|
||||
snapshot.retry_at = None
|
||||
update_fields.append('status')
|
||||
update_fields.append("status")
|
||||
|
||||
if 'retry_at' in payload:
|
||||
snapshot.retry_at = payload['retry_at']
|
||||
update_fields.append('retry_at')
|
||||
if "retry_at" in payload:
|
||||
snapshot.retry_at = payload["retry_at"]
|
||||
update_fields.append("retry_at")
|
||||
|
||||
if tags is not None:
|
||||
snapshot.save_tags(normalize_tag_list(tags))
|
||||
|
||||
snapshot.save(update_fields=update_fields)
|
||||
setattr(request, 'with_archiveresults', False)
|
||||
setattr(request, "with_archiveresults", False)
|
||||
return snapshot
|
||||
|
||||
|
||||
@@ -373,17 +429,18 @@ def delete_snapshot(request: HttpRequest, snapshot_id: str):
|
||||
crawl_id_str = str(snapshot.crawl.pk)
|
||||
deleted_count, _ = snapshot.delete()
|
||||
return {
|
||||
'success': True,
|
||||
'snapshot_id': snapshot_id_str,
|
||||
'crawl_id': crawl_id_str,
|
||||
'deleted_count': deleted_count,
|
||||
"success": True,
|
||||
"snapshot_id": snapshot_id_str,
|
||||
"crawl_id": crawl_id_str,
|
||||
"deleted_count": deleted_count,
|
||||
}
|
||||
|
||||
|
||||
### Tag #########################################################################
|
||||
|
||||
|
||||
class TagSchema(Schema):
|
||||
TYPE: str = 'core.models.Tag'
|
||||
TYPE: str = "core.models.Tag"
|
||||
id: int
|
||||
modified_at: datetime
|
||||
created_at: datetime
|
||||
@@ -392,7 +449,7 @@ class TagSchema(Schema):
|
||||
name: str
|
||||
slug: str
|
||||
num_snapshots: int
|
||||
snapshots: List[SnapshotSchema]
|
||||
snapshots: list[SnapshotSchema]
|
||||
|
||||
@staticmethod
|
||||
def resolve_created_by_id(obj):
|
||||
@@ -402,7 +459,7 @@ class TagSchema(Schema):
|
||||
def resolve_created_by_username(obj):
|
||||
user_model = get_user_model()
|
||||
user = user_model.objects.get(id=obj.created_by_id)
|
||||
username = getattr(user, 'username', None)
|
||||
username = getattr(user, "username", None)
|
||||
return username if isinstance(username, str) else str(user)
|
||||
|
||||
@staticmethod
|
||||
@@ -411,58 +468,67 @@ class TagSchema(Schema):
|
||||
|
||||
@staticmethod
|
||||
def resolve_snapshots(obj, context):
|
||||
if bool(getattr(context['request'], 'with_snapshots', False)):
|
||||
if bool(getattr(context["request"], "with_snapshots", False)):
|
||||
return obj.snapshot_set.all().distinct()
|
||||
return Snapshot.objects.none()
|
||||
|
||||
|
||||
@router.get("/tags", response=List[TagSchema], url_name="get_tags")
|
||||
@router.get("/tags", response=list[TagSchema], url_name="get_tags")
|
||||
@paginate(CustomPagination)
|
||||
def get_tags(request: HttpRequest):
|
||||
setattr(request, 'with_snapshots', False)
|
||||
setattr(request, 'with_archiveresults', False)
|
||||
setattr(request, "with_snapshots", False)
|
||||
setattr(request, "with_archiveresults", False)
|
||||
return get_matching_tags()
|
||||
|
||||
|
||||
@router.get("/tag/{tag_id}", response=TagSchema, url_name="get_tag")
|
||||
def get_tag(request: HttpRequest, tag_id: str, with_snapshots: bool = True):
|
||||
setattr(request, 'with_snapshots', with_snapshots)
|
||||
setattr(request, 'with_archiveresults', False)
|
||||
setattr(request, "with_snapshots", with_snapshots)
|
||||
setattr(request, "with_archiveresults", False)
|
||||
try:
|
||||
return get_tag_by_ref(tag_id)
|
||||
except (Tag.DoesNotExist, ValidationError):
|
||||
raise HttpError(404, 'Tag not found')
|
||||
raise HttpError(404, "Tag not found")
|
||||
|
||||
|
||||
@router.get("/any/{id}", response=Union[SnapshotSchema, ArchiveResultSchema, TagSchema, CrawlSchema], url_name="get_any", summary="Get any object by its ID")
|
||||
@router.get(
|
||||
"/any/{id}",
|
||||
response=Union[SnapshotSchema, ArchiveResultSchema, TagSchema, CrawlSchema],
|
||||
url_name="get_any",
|
||||
summary="Get any object by its ID",
|
||||
)
|
||||
def get_any(request: HttpRequest, id: str):
|
||||
"""Get any object by its ID (e.g. snapshot, archiveresult, tag, crawl, etc.)."""
|
||||
setattr(request, 'with_snapshots', False)
|
||||
setattr(request, 'with_archiveresults', False)
|
||||
setattr(request, "with_snapshots", False)
|
||||
setattr(request, "with_archiveresults", False)
|
||||
|
||||
for getter in [get_snapshot, get_archiveresult, get_tag]:
|
||||
try:
|
||||
response = getter(request, id)
|
||||
if isinstance(response, Model):
|
||||
return redirect(f"/api/v1/{response._meta.app_label}/{response._meta.model_name}/{response.pk}?{request.META['QUERY_STRING']}")
|
||||
return redirect(
|
||||
f"/api/v1/{response._meta.app_label}/{response._meta.model_name}/{response.pk}?{request.META['QUERY_STRING']}",
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
try:
|
||||
from archivebox.api.v1_crawls import get_crawl
|
||||
|
||||
response = get_crawl(request, id)
|
||||
if isinstance(response, Model):
|
||||
return redirect(f"/api/v1/{response._meta.app_label}/{response._meta.model_name}/{response.pk}?{request.META['QUERY_STRING']}")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
raise HttpError(404, 'Object with given ID not found')
|
||||
raise HttpError(404, "Object with given ID not found")
|
||||
|
||||
|
||||
### Tag Editor API Endpoints #########################################################################
|
||||
|
||||
|
||||
class TagAutocompleteSchema(Schema):
|
||||
tags: List[dict]
|
||||
tags: list[dict]
|
||||
|
||||
|
||||
class TagCreateSchema(Schema):
|
||||
@@ -483,7 +549,7 @@ class TagSearchSnapshotSchema(Schema):
|
||||
favicon_url: str
|
||||
admin_url: str
|
||||
archive_url: str
|
||||
downloaded_at: Optional[str] = None
|
||||
downloaded_at: str | None = None
|
||||
|
||||
|
||||
class TagSearchCardSchema(Schema):
|
||||
@@ -497,11 +563,11 @@ class TagSearchCardSchema(Schema):
|
||||
export_jsonl_url: str
|
||||
rename_url: str
|
||||
delete_url: str
|
||||
snapshots: List[TagSearchSnapshotSchema]
|
||||
snapshots: list[TagSearchSnapshotSchema]
|
||||
|
||||
|
||||
class TagSearchResponseSchema(Schema):
|
||||
tags: List[TagSearchCardSchema]
|
||||
tags: list[TagSearchCardSchema]
|
||||
sort: str
|
||||
created_by: str
|
||||
year: str
|
||||
@@ -527,8 +593,8 @@ class TagDeleteResponseSchema(Schema):
|
||||
|
||||
class TagSnapshotRequestSchema(Schema):
|
||||
snapshot_id: str
|
||||
tag_name: Optional[str] = None
|
||||
tag_id: Optional[int] = None
|
||||
tag_name: str | None = None
|
||||
tag_id: int | None = None
|
||||
|
||||
|
||||
class TagSnapshotResponseSchema(Schema):
|
||||
@@ -541,10 +607,10 @@ class TagSnapshotResponseSchema(Schema):
|
||||
def search_tags(
|
||||
request: HttpRequest,
|
||||
q: str = "",
|
||||
sort: str = 'created_desc',
|
||||
created_by: str = '',
|
||||
year: str = '',
|
||||
has_snapshots: str = 'all',
|
||||
sort: str = "created_desc",
|
||||
created_by: str = "",
|
||||
year: str = "",
|
||||
has_snapshots: str = "all",
|
||||
):
|
||||
"""Return detailed tag cards for admin/live-search UIs."""
|
||||
normalized_sort = normalize_tag_sort(sort)
|
||||
@@ -552,7 +618,7 @@ def search_tags(
|
||||
normalized_year = normalize_created_year_filter(year)
|
||||
normalized_has_snapshots = normalize_has_snapshots_filter(has_snapshots)
|
||||
return {
|
||||
'tags': build_tag_cards(
|
||||
"tags": build_tag_cards(
|
||||
query=q,
|
||||
request=request,
|
||||
sort=normalized_sort,
|
||||
@@ -560,28 +626,28 @@ def search_tags(
|
||||
year=normalized_year,
|
||||
has_snapshots=normalized_has_snapshots,
|
||||
),
|
||||
'sort': normalized_sort,
|
||||
'created_by': normalized_created_by,
|
||||
'year': normalized_year,
|
||||
'has_snapshots': normalized_has_snapshots,
|
||||
"sort": normalized_sort,
|
||||
"created_by": normalized_created_by,
|
||||
"year": normalized_year,
|
||||
"has_snapshots": normalized_has_snapshots,
|
||||
}
|
||||
|
||||
|
||||
def _public_tag_listing_enabled() -> bool:
|
||||
explicit = getattr(settings, 'PUBLIC_SNAPSHOTS_LIST', None)
|
||||
explicit = getattr(settings, "PUBLIC_SNAPSHOTS_LIST", None)
|
||||
if explicit is not None:
|
||||
return bool(explicit)
|
||||
return bool(getattr(settings, 'PUBLIC_INDEX', SERVER_CONFIG.PUBLIC_INDEX))
|
||||
return bool(getattr(settings, "PUBLIC_INDEX", SERVER_CONFIG.PUBLIC_INDEX))
|
||||
|
||||
|
||||
def _request_has_tag_autocomplete_access(request: HttpRequest) -> bool:
|
||||
user = getattr(request, 'user', None)
|
||||
if getattr(user, 'is_authenticated', False):
|
||||
user = getattr(request, "user", None)
|
||||
if getattr(user, "is_authenticated", False):
|
||||
return True
|
||||
|
||||
token = request.GET.get('api_key') or request.headers.get('X-ArchiveBox-API-Key')
|
||||
auth_header = request.headers.get('Authorization', '')
|
||||
if not token and auth_header.lower().startswith('bearer '):
|
||||
token = request.GET.get("api_key") or request.headers.get("X-ArchiveBox-API-Key")
|
||||
auth_header = request.headers.get("Authorization", "")
|
||||
if not token and auth_header.lower().startswith("bearer "):
|
||||
token = auth_header.split(None, 1)[1].strip()
|
||||
|
||||
if token and auth_using_token(token=token, request=request):
|
||||
@@ -594,12 +660,12 @@ def _request_has_tag_autocomplete_access(request: HttpRequest) -> bool:
|
||||
def tags_autocomplete(request: HttpRequest, q: str = ""):
|
||||
"""Return tags matching the query for autocomplete."""
|
||||
if not _request_has_tag_autocomplete_access(request):
|
||||
raise HttpError(401, 'Authentication required')
|
||||
raise HttpError(401, "Authentication required")
|
||||
|
||||
tags = get_matching_tags(q)[:50 if not q else 20]
|
||||
tags = get_matching_tags(q)[: 50 if not q else 20]
|
||||
|
||||
return {
|
||||
'tags': [{'id': tag.pk, 'name': tag.name, 'slug': tag.slug, 'num_snapshots': getattr(tag, 'num_snapshots', 0)} for tag in tags]
|
||||
"tags": [{"id": tag.pk, "name": tag.name, "slug": tag.slug, "num_snapshots": getattr(tag, "num_snapshots", 0)} for tag in tags],
|
||||
}
|
||||
|
||||
|
||||
@@ -615,10 +681,10 @@ def tags_create(request: HttpRequest, data: TagCreateSchema):
|
||||
raise HttpError(400, str(err)) from err
|
||||
|
||||
return {
|
||||
'success': True,
|
||||
'tag_id': tag.pk,
|
||||
'tag_name': tag.name,
|
||||
'created': created,
|
||||
"success": True,
|
||||
"tag_id": tag.pk,
|
||||
"tag_name": tag.name,
|
||||
"created": created,
|
||||
}
|
||||
|
||||
|
||||
@@ -627,15 +693,15 @@ def rename_tag(request: HttpRequest, tag_id: int, data: TagUpdateSchema):
|
||||
try:
|
||||
tag = rename_tag_record(get_tag_by_ref(tag_id), data.name)
|
||||
except Tag.DoesNotExist as err:
|
||||
raise HttpError(404, 'Tag not found') from err
|
||||
raise HttpError(404, "Tag not found") from err
|
||||
except ValueError as err:
|
||||
raise HttpError(400, str(err)) from err
|
||||
|
||||
return {
|
||||
'success': True,
|
||||
'tag_id': tag.pk,
|
||||
'tag_name': tag.name,
|
||||
'slug': tag.slug,
|
||||
"success": True,
|
||||
"tag_id": tag.pk,
|
||||
"tag_name": tag.name,
|
||||
"slug": tag.slug,
|
||||
}
|
||||
|
||||
|
||||
@@ -644,13 +710,13 @@ def delete_tag(request: HttpRequest, tag_id: int):
|
||||
try:
|
||||
tag = get_tag_by_ref(tag_id)
|
||||
except Tag.DoesNotExist as err:
|
||||
raise HttpError(404, 'Tag not found') from err
|
||||
raise HttpError(404, "Tag not found") from err
|
||||
|
||||
deleted_count, _ = delete_tag_record(tag)
|
||||
return {
|
||||
'success': True,
|
||||
'tag_id': int(tag_id),
|
||||
'deleted_count': deleted_count,
|
||||
"success": True,
|
||||
"tag_id": int(tag_id),
|
||||
"deleted_count": deleted_count,
|
||||
}
|
||||
|
||||
|
||||
@@ -659,10 +725,10 @@ def tag_urls_export(request: HttpRequest, tag_id: int):
|
||||
try:
|
||||
tag = get_tag_by_ref(tag_id)
|
||||
except Tag.DoesNotExist as err:
|
||||
raise HttpError(404, 'Tag not found') from err
|
||||
raise HttpError(404, "Tag not found") from err
|
||||
|
||||
response = HttpResponse(export_tag_urls(tag), content_type='text/plain; charset=utf-8')
|
||||
response['Content-Disposition'] = f'attachment; filename="tag-{tag.slug}-urls.txt"'
|
||||
response = HttpResponse(export_tag_urls(tag), content_type="text/plain; charset=utf-8")
|
||||
response["Content-Disposition"] = f'attachment; filename="tag-{tag.slug}-urls.txt"'
|
||||
return response
|
||||
|
||||
|
||||
@@ -671,10 +737,10 @@ def tag_snapshots_export(request: HttpRequest, tag_id: int):
|
||||
try:
|
||||
tag = get_tag_by_ref(tag_id)
|
||||
except Tag.DoesNotExist as err:
|
||||
raise HttpError(404, 'Tag not found') from err
|
||||
raise HttpError(404, "Tag not found") from err
|
||||
|
||||
response = HttpResponse(export_tag_snapshots_jsonl(tag), content_type='application/x-ndjson; charset=utf-8')
|
||||
response['Content-Disposition'] = f'attachment; filename="tag-{tag.slug}-snapshots.jsonl"'
|
||||
response = HttpResponse(export_tag_snapshots_jsonl(tag), content_type="application/x-ndjson; charset=utf-8")
|
||||
response["Content-Disposition"] = f'attachment; filename="tag-{tag.slug}-snapshots.jsonl"'
|
||||
return response
|
||||
|
||||
|
||||
@@ -684,16 +750,16 @@ def tags_add_to_snapshot(request: HttpRequest, data: TagSnapshotRequestSchema):
|
||||
# Get the snapshot
|
||||
try:
|
||||
snapshot = Snapshot.objects.get(
|
||||
Q(id__startswith=data.snapshot_id) | Q(timestamp__startswith=data.snapshot_id)
|
||||
Q(id__startswith=data.snapshot_id) | Q(timestamp__startswith=data.snapshot_id),
|
||||
)
|
||||
except Snapshot.DoesNotExist:
|
||||
raise HttpError(404, 'Snapshot not found')
|
||||
raise HttpError(404, "Snapshot not found")
|
||||
except Snapshot.MultipleObjectsReturned:
|
||||
snapshot = Snapshot.objects.filter(
|
||||
Q(id__startswith=data.snapshot_id) | Q(timestamp__startswith=data.snapshot_id)
|
||||
Q(id__startswith=data.snapshot_id) | Q(timestamp__startswith=data.snapshot_id),
|
||||
).first()
|
||||
if snapshot is None:
|
||||
raise HttpError(404, 'Snapshot not found')
|
||||
raise HttpError(404, "Snapshot not found")
|
||||
|
||||
# Get or create the tag
|
||||
if data.tag_name:
|
||||
@@ -708,17 +774,17 @@ def tags_add_to_snapshot(request: HttpRequest, data: TagSnapshotRequestSchema):
|
||||
try:
|
||||
tag = get_tag_by_ref(data.tag_id)
|
||||
except Tag.DoesNotExist:
|
||||
raise HttpError(404, 'Tag not found')
|
||||
raise HttpError(404, "Tag not found")
|
||||
else:
|
||||
raise HttpError(400, 'Either tag_name or tag_id is required')
|
||||
raise HttpError(400, "Either tag_name or tag_id is required")
|
||||
|
||||
# Add the tag to the snapshot
|
||||
snapshot.tags.add(tag.pk)
|
||||
|
||||
return {
|
||||
'success': True,
|
||||
'tag_id': tag.pk,
|
||||
'tag_name': tag.name,
|
||||
"success": True,
|
||||
"tag_id": tag.pk,
|
||||
"tag_name": tag.name,
|
||||
}
|
||||
|
||||
|
||||
@@ -728,36 +794,36 @@ def tags_remove_from_snapshot(request: HttpRequest, data: TagSnapshotRequestSche
|
||||
# Get the snapshot
|
||||
try:
|
||||
snapshot = Snapshot.objects.get(
|
||||
Q(id__startswith=data.snapshot_id) | Q(timestamp__startswith=data.snapshot_id)
|
||||
Q(id__startswith=data.snapshot_id) | Q(timestamp__startswith=data.snapshot_id),
|
||||
)
|
||||
except Snapshot.DoesNotExist:
|
||||
raise HttpError(404, 'Snapshot not found')
|
||||
raise HttpError(404, "Snapshot not found")
|
||||
except Snapshot.MultipleObjectsReturned:
|
||||
snapshot = Snapshot.objects.filter(
|
||||
Q(id__startswith=data.snapshot_id) | Q(timestamp__startswith=data.snapshot_id)
|
||||
Q(id__startswith=data.snapshot_id) | Q(timestamp__startswith=data.snapshot_id),
|
||||
).first()
|
||||
if snapshot is None:
|
||||
raise HttpError(404, 'Snapshot not found')
|
||||
raise HttpError(404, "Snapshot not found")
|
||||
|
||||
# Get the tag
|
||||
if data.tag_id:
|
||||
try:
|
||||
tag = Tag.objects.get(pk=data.tag_id)
|
||||
except Tag.DoesNotExist:
|
||||
raise HttpError(404, 'Tag not found')
|
||||
raise HttpError(404, "Tag not found")
|
||||
elif data.tag_name:
|
||||
try:
|
||||
tag = Tag.objects.get(name__iexact=data.tag_name.strip())
|
||||
except Tag.DoesNotExist:
|
||||
raise HttpError(404, 'Tag not found')
|
||||
raise HttpError(404, "Tag not found")
|
||||
else:
|
||||
raise HttpError(400, 'Either tag_name or tag_id is required')
|
||||
raise HttpError(400, "Either tag_name or tag_id is required")
|
||||
|
||||
# Remove the tag from the snapshot
|
||||
snapshot.tags.remove(tag.pk)
|
||||
|
||||
return {
|
||||
'success': True,
|
||||
'tag_id': tag.pk,
|
||||
'tag_name': tag.name,
|
||||
"success": True,
|
||||
"tag_id": tag.pk,
|
||||
"tag_name": tag.name,
|
||||
}
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
__package__ = 'archivebox.api'
|
||||
__package__ = "archivebox.api"
|
||||
|
||||
from uuid import UUID
|
||||
from typing import List, Optional
|
||||
from datetime import datetime
|
||||
from django.http import HttpRequest
|
||||
from django.utils import timezone
|
||||
@@ -17,11 +16,11 @@ from archivebox.crawls.models import Crawl
|
||||
|
||||
from .auth import API_AUTH_METHODS
|
||||
|
||||
router = Router(tags=['Crawl Models'], auth=API_AUTH_METHODS)
|
||||
router = Router(tags=["Crawl Models"], auth=API_AUTH_METHODS)
|
||||
|
||||
|
||||
class CrawlSchema(Schema):
|
||||
TYPE: str = 'crawls.models.Crawl'
|
||||
TYPE: str = "crawls.models.Crawl"
|
||||
|
||||
id: UUID
|
||||
|
||||
@@ -35,6 +34,8 @@ class CrawlSchema(Schema):
|
||||
|
||||
urls: str
|
||||
max_depth: int
|
||||
max_urls: int
|
||||
max_size: int
|
||||
tags_str: str
|
||||
config: dict
|
||||
|
||||
@@ -48,12 +49,12 @@ class CrawlSchema(Schema):
|
||||
def resolve_created_by_username(obj):
|
||||
user_model = get_user_model()
|
||||
user = user_model.objects.get(id=obj.created_by_id)
|
||||
username = getattr(user, 'username', None)
|
||||
username = getattr(user, "username", None)
|
||||
return username if isinstance(username, str) else str(user)
|
||||
|
||||
@staticmethod
|
||||
def resolve_snapshots(obj, context):
|
||||
if bool(getattr(context['request'], 'with_snapshots', False)):
|
||||
if bool(getattr(context["request"], "with_snapshots", False)):
|
||||
return obj.snapshot_set.all().distinct()
|
||||
return Snapshot.objects.none()
|
||||
|
||||
@@ -61,17 +62,19 @@ class CrawlSchema(Schema):
|
||||
class CrawlUpdateSchema(Schema):
|
||||
status: str | None = None
|
||||
retry_at: datetime | None = None
|
||||
tags: Optional[List[str]] = None
|
||||
tags: list[str] | None = None
|
||||
tags_str: str | None = None
|
||||
|
||||
|
||||
class CrawlCreateSchema(Schema):
|
||||
urls: List[str]
|
||||
urls: list[str]
|
||||
max_depth: int = 0
|
||||
tags: Optional[List[str]] = None
|
||||
tags_str: str = ''
|
||||
label: str = ''
|
||||
notes: str = ''
|
||||
max_urls: int = 0
|
||||
max_size: int = 0
|
||||
tags: list[str] | None = None
|
||||
tags_str: str = ""
|
||||
label: str = ""
|
||||
notes: str = ""
|
||||
config: dict = {}
|
||||
|
||||
|
||||
@@ -82,13 +85,13 @@ class CrawlDeleteResponseSchema(Schema):
|
||||
deleted_snapshots: int
|
||||
|
||||
|
||||
def normalize_tag_list(tags: Optional[List[str]] = None, tags_str: str = '') -> List[str]:
|
||||
def normalize_tag_list(tags: list[str] | None = None, tags_str: str = "") -> list[str]:
|
||||
if tags is not None:
|
||||
return [tag.strip() for tag in tags if tag and tag.strip()]
|
||||
return [tag.strip() for tag in tags_str.split(',') if tag.strip()]
|
||||
return [tag.strip() for tag in tags_str.split(",") if tag.strip()]
|
||||
|
||||
|
||||
@router.get("/crawls", response=List[CrawlSchema], url_name="get_crawls")
|
||||
@router.get("/crawls", response=list[CrawlSchema], url_name="get_crawls")
|
||||
def get_crawls(request: HttpRequest):
|
||||
return Crawl.objects.all().distinct()
|
||||
|
||||
@@ -97,15 +100,21 @@ def get_crawls(request: HttpRequest):
|
||||
def create_crawl(request: HttpRequest, data: CrawlCreateSchema):
|
||||
urls = [url.strip() for url in data.urls if url and url.strip()]
|
||||
if not urls:
|
||||
raise HttpError(400, 'At least one URL is required')
|
||||
raise HttpError(400, "At least one URL is required")
|
||||
if data.max_depth not in (0, 1, 2, 3, 4):
|
||||
raise HttpError(400, 'max_depth must be between 0 and 4')
|
||||
raise HttpError(400, "max_depth must be between 0 and 4")
|
||||
if data.max_urls < 0:
|
||||
raise HttpError(400, "max_urls must be >= 0")
|
||||
if data.max_size < 0:
|
||||
raise HttpError(400, "max_size must be >= 0")
|
||||
|
||||
tags = normalize_tag_list(data.tags, data.tags_str)
|
||||
crawl = Crawl.objects.create(
|
||||
urls='\n'.join(urls),
|
||||
urls="\n".join(urls),
|
||||
max_depth=data.max_depth,
|
||||
tags_str=','.join(tags),
|
||||
max_urls=data.max_urls,
|
||||
max_size=data.max_size,
|
||||
tags_str=",".join(tags),
|
||||
label=data.label,
|
||||
notes=data.notes,
|
||||
config=data.config,
|
||||
@@ -116,25 +125,26 @@ def create_crawl(request: HttpRequest, data: CrawlCreateSchema):
|
||||
crawl.create_snapshots_from_urls()
|
||||
return crawl
|
||||
|
||||
|
||||
@router.get("/crawl/{crawl_id}", response=CrawlSchema | str, url_name="get_crawl")
|
||||
def get_crawl(request: HttpRequest, crawl_id: str, as_rss: bool=False, with_snapshots: bool=False, with_archiveresults: bool=False):
|
||||
def get_crawl(request: HttpRequest, crawl_id: str, as_rss: bool = False, with_snapshots: bool = False, with_archiveresults: bool = False):
|
||||
"""Get a specific Crawl by id."""
|
||||
setattr(request, 'with_snapshots', with_snapshots)
|
||||
setattr(request, 'with_archiveresults', with_archiveresults)
|
||||
setattr(request, "with_snapshots", with_snapshots)
|
||||
setattr(request, "with_archiveresults", with_archiveresults)
|
||||
crawl = Crawl.objects.get(id__icontains=crawl_id)
|
||||
|
||||
|
||||
if crawl and as_rss:
|
||||
# return snapshots as XML rss feed
|
||||
urls = [
|
||||
{'url': snapshot.url, 'title': snapshot.title, 'bookmarked_at': snapshot.bookmarked_at, 'tags': snapshot.tags_str}
|
||||
{"url": snapshot.url, "title": snapshot.title, "bookmarked_at": snapshot.bookmarked_at, "tags": snapshot.tags_str}
|
||||
for snapshot in crawl.snapshot_set.all()
|
||||
]
|
||||
xml = '<rss version="2.0"><channel>'
|
||||
for url in urls:
|
||||
xml += f'<item><url>{url["url"]}</url><title>{url["title"]}</title><bookmarked_at>{url["bookmarked_at"]}</bookmarked_at><tags>{url["tags"]}</tags></item>'
|
||||
xml += '</channel></rss>'
|
||||
xml += f"<item><url>{url['url']}</url><title>{url['title']}</title><bookmarked_at>{url['bookmarked_at']}</bookmarked_at><tags>{url['tags']}</tags></item>"
|
||||
xml += "</channel></rss>"
|
||||
return xml
|
||||
|
||||
|
||||
return crawl
|
||||
|
||||
|
||||
@@ -143,29 +153,29 @@ def patch_crawl(request: HttpRequest, crawl_id: str, data: CrawlUpdateSchema):
|
||||
"""Update a crawl (e.g., set status=sealed to cancel queued work)."""
|
||||
crawl = Crawl.objects.get(id__icontains=crawl_id)
|
||||
payload = data.dict(exclude_unset=True)
|
||||
update_fields = ['modified_at']
|
||||
update_fields = ["modified_at"]
|
||||
|
||||
tags = payload.pop('tags', None)
|
||||
tags_str = payload.pop('tags_str', None)
|
||||
tags = payload.pop("tags", None)
|
||||
tags_str = payload.pop("tags_str", None)
|
||||
if tags is not None or tags_str is not None:
|
||||
crawl.tags_str = ','.join(normalize_tag_list(tags, tags_str or ''))
|
||||
update_fields.append('tags_str')
|
||||
crawl.tags_str = ",".join(normalize_tag_list(tags, tags_str or ""))
|
||||
update_fields.append("tags_str")
|
||||
|
||||
if 'status' in payload:
|
||||
if payload['status'] not in Crawl.StatusChoices.values:
|
||||
raise HttpError(400, f'Invalid status: {payload["status"]}')
|
||||
crawl.status = payload['status']
|
||||
if crawl.status == Crawl.StatusChoices.SEALED and 'retry_at' not in payload:
|
||||
if "status" in payload:
|
||||
if payload["status"] not in Crawl.StatusChoices.values:
|
||||
raise HttpError(400, f"Invalid status: {payload['status']}")
|
||||
crawl.status = payload["status"]
|
||||
if crawl.status == Crawl.StatusChoices.SEALED and "retry_at" not in payload:
|
||||
crawl.retry_at = None
|
||||
update_fields.append('status')
|
||||
update_fields.append("status")
|
||||
|
||||
if 'retry_at' in payload:
|
||||
crawl.retry_at = payload['retry_at']
|
||||
update_fields.append('retry_at')
|
||||
if "retry_at" in payload:
|
||||
crawl.retry_at = payload["retry_at"]
|
||||
update_fields.append("retry_at")
|
||||
|
||||
crawl.save(update_fields=update_fields)
|
||||
|
||||
if payload.get('status') == Crawl.StatusChoices.SEALED:
|
||||
if payload.get("status") == Crawl.StatusChoices.SEALED:
|
||||
Snapshot.objects.filter(
|
||||
crawl=crawl,
|
||||
status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED],
|
||||
@@ -184,8 +194,8 @@ def delete_crawl(request: HttpRequest, crawl_id: str):
|
||||
snapshot_count = crawl.snapshot_set.count()
|
||||
deleted_count, _ = crawl.delete()
|
||||
return {
|
||||
'success': True,
|
||||
'crawl_id': crawl_id_str,
|
||||
'deleted_count': deleted_count,
|
||||
'deleted_snapshots': snapshot_count,
|
||||
"success": True,
|
||||
"crawl_id": crawl_id_str,
|
||||
"deleted_count": deleted_count,
|
||||
"deleted_snapshots": snapshot_count,
|
||||
}
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
__package__ = 'archivebox.api'
|
||||
__package__ = "archivebox.api"
|
||||
|
||||
from uuid import UUID
|
||||
from typing import Annotated, List, Optional
|
||||
from typing import Annotated
|
||||
from datetime import datetime
|
||||
|
||||
from django.http import HttpRequest
|
||||
@@ -12,16 +12,18 @@ from ninja.pagination import paginate
|
||||
from archivebox.api.v1_core import CustomPagination
|
||||
|
||||
|
||||
router = Router(tags=['Machine and Dependencies'])
|
||||
router = Router(tags=["Machine and Dependencies"])
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Machine Schemas
|
||||
# ============================================================================
|
||||
|
||||
|
||||
class MachineSchema(Schema):
|
||||
"""Schema for Machine model."""
|
||||
TYPE: str = 'machine.Machine'
|
||||
|
||||
TYPE: str = "machine.Machine"
|
||||
id: UUID
|
||||
created_at: datetime
|
||||
modified_at: datetime
|
||||
@@ -43,22 +45,24 @@ class MachineSchema(Schema):
|
||||
|
||||
|
||||
class MachineFilterSchema(FilterSchema):
|
||||
id: Annotated[Optional[str], FilterLookup('id__startswith')] = None
|
||||
hostname: Annotated[Optional[str], FilterLookup('hostname__icontains')] = None
|
||||
os_platform: Annotated[Optional[str], FilterLookup('os_platform__icontains')] = None
|
||||
os_arch: Annotated[Optional[str], FilterLookup('os_arch')] = None
|
||||
hw_in_docker: Annotated[Optional[bool], FilterLookup('hw_in_docker')] = None
|
||||
hw_in_vm: Annotated[Optional[bool], FilterLookup('hw_in_vm')] = None
|
||||
bin_providers: Annotated[Optional[str], FilterLookup('bin_providers__icontains')] = None
|
||||
id: Annotated[str | None, FilterLookup("id__startswith")] = None
|
||||
hostname: Annotated[str | None, FilterLookup("hostname__icontains")] = None
|
||||
os_platform: Annotated[str | None, FilterLookup("os_platform__icontains")] = None
|
||||
os_arch: Annotated[str | None, FilterLookup("os_arch")] = None
|
||||
hw_in_docker: Annotated[bool | None, FilterLookup("hw_in_docker")] = None
|
||||
hw_in_vm: Annotated[bool | None, FilterLookup("hw_in_vm")] = None
|
||||
bin_providers: Annotated[str | None, FilterLookup("bin_providers__icontains")] = None
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Binary Schemas
|
||||
# ============================================================================
|
||||
|
||||
|
||||
class BinarySchema(Schema):
|
||||
"""Schema for Binary model."""
|
||||
TYPE: str = 'machine.Binary'
|
||||
|
||||
TYPE: str = "machine.Binary"
|
||||
id: UUID
|
||||
created_at: datetime
|
||||
modified_at: datetime
|
||||
@@ -85,23 +89,25 @@ class BinarySchema(Schema):
|
||||
|
||||
|
||||
class BinaryFilterSchema(FilterSchema):
|
||||
id: Annotated[Optional[str], FilterLookup('id__startswith')] = None
|
||||
name: Annotated[Optional[str], FilterLookup('name__icontains')] = None
|
||||
binprovider: Annotated[Optional[str], FilterLookup('binprovider')] = None
|
||||
status: Annotated[Optional[str], FilterLookup('status')] = None
|
||||
machine_id: Annotated[Optional[str], FilterLookup('machine_id__startswith')] = None
|
||||
version: Annotated[Optional[str], FilterLookup('version__icontains')] = None
|
||||
id: Annotated[str | None, FilterLookup("id__startswith")] = None
|
||||
name: Annotated[str | None, FilterLookup("name__icontains")] = None
|
||||
binprovider: Annotated[str | None, FilterLookup("binprovider")] = None
|
||||
status: Annotated[str | None, FilterLookup("status")] = None
|
||||
machine_id: Annotated[str | None, FilterLookup("machine_id__startswith")] = None
|
||||
version: Annotated[str | None, FilterLookup("version__icontains")] = None
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Machine Endpoints
|
||||
# ============================================================================
|
||||
|
||||
@router.get("/machines", response=List[MachineSchema], url_name="get_machines")
|
||||
|
||||
@router.get("/machines", response=list[MachineSchema], url_name="get_machines")
|
||||
@paginate(CustomPagination)
|
||||
def get_machines(request: HttpRequest, filters: Query[MachineFilterSchema]):
|
||||
"""List all machines."""
|
||||
from archivebox.machine.models import Machine
|
||||
|
||||
return filters.filter(Machine.objects.all()).distinct()
|
||||
|
||||
|
||||
@@ -109,6 +115,7 @@ def get_machines(request: HttpRequest, filters: Query[MachineFilterSchema]):
|
||||
def get_current_machine(request: HttpRequest):
|
||||
"""Get the current machine."""
|
||||
from archivebox.machine.models import Machine
|
||||
|
||||
return Machine.current()
|
||||
|
||||
|
||||
@@ -117,6 +124,7 @@ def get_machine(request: HttpRequest, machine_id: str):
|
||||
"""Get a specific machine by ID."""
|
||||
from archivebox.machine.models import Machine
|
||||
from django.db.models import Q
|
||||
|
||||
return Machine.objects.get(Q(id__startswith=machine_id) | Q(hostname__iexact=machine_id))
|
||||
|
||||
|
||||
@@ -127,23 +135,27 @@ def get_machine(request: HttpRequest, machine_id: str):
|
||||
# Binary Endpoints
|
||||
# ============================================================================
|
||||
|
||||
@router.get("/binaries", response=List[BinarySchema], url_name="get_binaries")
|
||||
|
||||
@router.get("/binaries", response=list[BinarySchema], url_name="get_binaries")
|
||||
@paginate(CustomPagination)
|
||||
def get_binaries(request: HttpRequest, filters: Query[BinaryFilterSchema]):
|
||||
"""List all binaries."""
|
||||
from archivebox.machine.models import Binary
|
||||
return filters.filter(Binary.objects.all().select_related('machine')).distinct()
|
||||
|
||||
return filters.filter(Binary.objects.all().select_related("machine")).distinct()
|
||||
|
||||
|
||||
@router.get("/binary/{binary_id}", response=BinarySchema, url_name="get_binary")
|
||||
def get_binary(request: HttpRequest, binary_id: str):
|
||||
"""Get a specific binary by ID."""
|
||||
from archivebox.machine.models import Binary
|
||||
return Binary.objects.select_related('machine').get(id__startswith=binary_id)
|
||||
|
||||
return Binary.objects.select_related("machine").get(id__startswith=binary_id)
|
||||
|
||||
|
||||
@router.get("/binary/by-name/{name}", response=List[BinarySchema], url_name="get_binaries_by_name")
|
||||
@router.get("/binary/by-name/{name}", response=list[BinarySchema], url_name="get_binaries_by_name")
|
||||
def get_binaries_by_name(request: HttpRequest, name: str):
|
||||
"""Get all binaries with the given name."""
|
||||
from archivebox.machine.models import Binary
|
||||
return list(Binary.objects.filter(name__iexact=name).select_related('machine'))
|
||||
|
||||
return list(Binary.objects.filter(name__iexact=name).select_related("machine"))
|
||||
|
||||
@@ -1 +1 @@
|
||||
__package__ = 'archivebox.base_models'
|
||||
__package__ = "archivebox.base_models"
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
"""Base admin classes for models using UUIDv7."""
|
||||
|
||||
__package__ = 'archivebox.base_models'
|
||||
__package__ = "archivebox.base_models"
|
||||
|
||||
import json
|
||||
from collections.abc import Mapping
|
||||
@@ -32,11 +32,12 @@ class KeyValueWidget(forms.Widget):
|
||||
with + and - buttons to add/remove rows.
|
||||
Includes autocomplete for available config keys from the plugin system.
|
||||
"""
|
||||
|
||||
template_name = "" # We render manually
|
||||
|
||||
class Media:
|
||||
css = {
|
||||
'all': []
|
||||
"all": [],
|
||||
}
|
||||
js = []
|
||||
|
||||
@@ -44,17 +45,18 @@ class KeyValueWidget(forms.Widget):
|
||||
"""Get available config options from plugins."""
|
||||
try:
|
||||
from archivebox.hooks import discover_plugin_configs
|
||||
|
||||
plugin_configs = discover_plugin_configs()
|
||||
options: dict[str, ConfigOption] = {}
|
||||
for plugin_name, schema in plugin_configs.items():
|
||||
for key, prop in schema.get('properties', {}).items():
|
||||
for key, prop in schema.get("properties", {}).items():
|
||||
option: ConfigOption = {
|
||||
'plugin': plugin_name,
|
||||
'type': prop.get('type', 'string'),
|
||||
'default': prop.get('default', ''),
|
||||
'description': prop.get('description', ''),
|
||||
"plugin": plugin_name,
|
||||
"type": prop.get("type", "string"),
|
||||
"default": prop.get("default", ""),
|
||||
"description": prop.get("description", ""),
|
||||
}
|
||||
for schema_key in ('enum', 'pattern', 'minimum', 'maximum'):
|
||||
for schema_key in ("enum", "pattern", "minimum", "maximum"):
|
||||
if schema_key in prop:
|
||||
option[schema_key] = prop[schema_key]
|
||||
options[key] = option
|
||||
@@ -85,11 +87,11 @@ class KeyValueWidget(forms.Widget):
|
||||
) -> SafeString:
|
||||
data = self._parse_value(value)
|
||||
|
||||
widget_id = attrs.get('id', name) if attrs else name
|
||||
widget_id = attrs.get("id", name) if attrs else name
|
||||
config_options = self._get_config_options()
|
||||
|
||||
# Build datalist options
|
||||
datalist_options = '\n'.join(
|
||||
datalist_options = "\n".join(
|
||||
f'<option value="{self._escape(key)}">{self._escape(opt["description"][:60] or opt["type"])}</option>'
|
||||
for key, opt in sorted(config_options.items())
|
||||
)
|
||||
@@ -111,7 +113,7 @@ class KeyValueWidget(forms.Widget):
|
||||
html += self._render_row(widget_id, key, val_str)
|
||||
|
||||
# Always add one empty row for new entries
|
||||
html += self._render_row(widget_id, '', '')
|
||||
html += self._render_row(widget_id, "", "")
|
||||
|
||||
html += f'''
|
||||
</div>
|
||||
@@ -669,8 +671,8 @@ class KeyValueWidget(forms.Widget):
|
||||
def _escape(self, s: object) -> str:
|
||||
"""Escape HTML special chars in attribute values."""
|
||||
if not s:
|
||||
return ''
|
||||
return str(s).replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;')
|
||||
return ""
|
||||
return str(s).replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;").replace('"', "&quot;")
|
||||
|
||||
def value_from_datadict(
|
||||
self,
|
||||
@@ -678,8 +680,8 @@ class KeyValueWidget(forms.Widget):
|
||||
files: object,
|
||||
name: str,
|
||||
) -> str:
|
||||
value = data.get(name, '{}')
|
||||
return value if isinstance(value, str) else '{}'
|
||||
value = data.get(name, "{}")
|
||||
return value if isinstance(value, str) else "{}"
|
||||
|
||||
|
||||
class ConfigEditorMixin(admin.ModelAdmin):
|
||||
@@ -696,14 +698,20 @@ class ConfigEditorMixin(admin.ModelAdmin):
|
||||
**kwargs: object,
|
||||
) -> forms.Field | None:
|
||||
"""Use KeyValueWidget for the config JSON field."""
|
||||
if db_field.name == 'config':
|
||||
kwargs['widget'] = KeyValueWidget()
|
||||
if db_field.name == "config":
|
||||
kwargs["widget"] = KeyValueWidget()
|
||||
return super().formfield_for_dbfield(db_field, request, **kwargs)
|
||||
|
||||
|
||||
class BaseModelAdmin(DjangoObjectActions, admin.ModelAdmin):
|
||||
list_display = ('id', 'created_at', 'created_by')
|
||||
readonly_fields = ('id', 'created_at', 'modified_at')
|
||||
list_display = ("id", "created_at", "created_by")
|
||||
readonly_fields = ("id", "created_at", "modified_at")
|
||||
show_search_mode_selector = False
|
||||
|
||||
def get_default_search_mode(self) -> str:
|
||||
# The shared changelist template always asks every admin for a default
|
||||
# search mode, even when the search-mode toggle is hidden.
|
||||
return "meta"
|
||||
|
||||
def get_form(
|
||||
self,
|
||||
@@ -713,6 +721,6 @@ class BaseModelAdmin(DjangoObjectActions, admin.ModelAdmin):
|
||||
**kwargs: object,
|
||||
):
|
||||
form = super().get_form(request, obj, change=change, **kwargs)
|
||||
if 'created_by' in form.base_fields:
|
||||
form.base_fields['created_by'].initial = request.user
|
||||
if "created_by" in form.base_fields:
|
||||
form.base_fields["created_by"].initial = request.user
|
||||
return form
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
"""Base models using UUIDv7 for all id fields."""
|
||||
|
||||
__package__ = 'archivebox.base_models'
|
||||
__package__ = "archivebox.base_models"
|
||||
|
||||
from archivebox.uuid_compat import uuid7
|
||||
from pathlib import Path
|
||||
@@ -15,22 +15,22 @@ from django.conf import settings
|
||||
from django_stubs_ext.db.models import TypedModelMeta
|
||||
|
||||
|
||||
|
||||
def get_or_create_system_user_pk(username='system'):
|
||||
def get_or_create_system_user_pk(username="system"):
|
||||
User = get_user_model()
|
||||
# If there's exactly one superuser, use that for all system operations
|
||||
if User.objects.filter(is_superuser=True).count() == 1:
|
||||
return User.objects.filter(is_superuser=True).values_list('pk', flat=True)[0]
|
||||
return User.objects.filter(is_superuser=True).values_list("pk", flat=True)[0]
|
||||
# Otherwise get or create the system user
|
||||
user, _ = User.objects.get_or_create(
|
||||
username=username,
|
||||
defaults={'is_staff': True, 'is_superuser': True, 'email': '', 'password': '!'}
|
||||
defaults={"is_staff": True, "is_superuser": True, "email": "", "password": "!"},
|
||||
)
|
||||
return user.pk
|
||||
|
||||
|
||||
class AutoDateTimeField(models.DateTimeField):
|
||||
"""DateTimeField that automatically updates on save (legacy compatibility)."""
|
||||
|
||||
def pre_save(self, model_instance, add):
|
||||
if add or not getattr(model_instance, self.attname):
|
||||
value = timezone.now()
|
||||
@@ -43,13 +43,19 @@ class ModelWithUUID(models.Model):
|
||||
id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
|
||||
created_at = models.DateTimeField(default=timezone.now, db_index=True)
|
||||
modified_at = models.DateTimeField(auto_now=True)
|
||||
created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False, db_index=True)
|
||||
created_by = models.ForeignKey(
|
||||
settings.AUTH_USER_MODEL,
|
||||
on_delete=models.CASCADE,
|
||||
default=get_or_create_system_user_pk,
|
||||
null=False,
|
||||
db_index=True,
|
||||
)
|
||||
|
||||
class Meta(TypedModelMeta):
|
||||
abstract = True
|
||||
|
||||
def __str__(self) -> str:
|
||||
return f'[{self.id}] {self.__class__.__name__}'
|
||||
return f"[{self.id}] {self.__class__.__name__}"
|
||||
|
||||
@property
|
||||
def admin_change_url(self) -> str:
|
||||
@@ -57,17 +63,17 @@ class ModelWithUUID(models.Model):
|
||||
|
||||
@property
|
||||
def api_url(self) -> str:
|
||||
return str(reverse_lazy('api-1:get_any', args=[self.id]))
|
||||
return str(reverse_lazy("api-1:get_any", args=[self.id]))
|
||||
|
||||
@property
|
||||
def api_docs_url(self) -> str:
|
||||
return f'/api/v1/docs#/{self._meta.app_label.title()}%20Models/api_v1_{self._meta.app_label}_get_{self._meta.db_table}'
|
||||
|
||||
return f"/api/v1/docs#/{self._meta.app_label.title()}%20Models/api_v1_{self._meta.app_label}_get_{self._meta.db_table}"
|
||||
|
||||
|
||||
class ModelWithNotes(models.Model):
|
||||
"""Mixin for models with a notes field."""
|
||||
notes = models.TextField(blank=True, null=False, default='')
|
||||
|
||||
notes = models.TextField(blank=True, null=False, default="")
|
||||
|
||||
class Meta(TypedModelMeta):
|
||||
abstract = True
|
||||
@@ -75,6 +81,7 @@ class ModelWithNotes(models.Model):
|
||||
|
||||
class ModelWithHealthStats(models.Model):
|
||||
"""Mixin for models with health tracking fields."""
|
||||
|
||||
num_uses_failed = models.PositiveIntegerField(default=0)
|
||||
num_uses_succeeded = models.PositiveIntegerField(default=0)
|
||||
|
||||
@@ -88,12 +95,13 @@ class ModelWithHealthStats(models.Model):
|
||||
|
||||
def increment_health_stats(self, success: bool):
|
||||
"""Atomically increment success or failure counter using F() expression."""
|
||||
field = 'num_uses_succeeded' if success else 'num_uses_failed'
|
||||
field = "num_uses_succeeded" if success else "num_uses_failed"
|
||||
type(self).objects.filter(pk=self.pk).update(**{field: F(field) + 1})
|
||||
|
||||
|
||||
class ModelWithConfig(models.Model):
|
||||
"""Mixin for models with a JSON config field."""
|
||||
|
||||
config = models.JSONField(default=dict, null=True, blank=True, editable=True)
|
||||
|
||||
class Meta(TypedModelMeta):
|
||||
@@ -111,7 +119,7 @@ class ModelWithOutputDir(ModelWithUUID):
|
||||
|
||||
@property
|
||||
def output_dir_parent(self) -> str:
|
||||
return f'{self._meta.model_name}s'
|
||||
return f"{self._meta.model_name}s"
|
||||
|
||||
@property
|
||||
def output_dir_name(self) -> str:
|
||||
@@ -119,7 +127,7 @@ class ModelWithOutputDir(ModelWithUUID):
|
||||
|
||||
@property
|
||||
def output_dir_str(self) -> str:
|
||||
return f'{self.output_dir_parent}/{self.output_dir_name}'
|
||||
return f"{self.output_dir_parent}/{self.output_dir_name}"
|
||||
|
||||
@property
|
||||
def output_dir(self) -> Path:
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox'
|
||||
__package__ = "archivebox.cli"
|
||||
__command__ = "archivebox"
|
||||
import os
|
||||
import sys
|
||||
from importlib import import_module
|
||||
@@ -10,55 +10,55 @@ from rich import print
|
||||
from archivebox.config.version import VERSION
|
||||
|
||||
|
||||
|
||||
if '--debug' in sys.argv:
|
||||
os.environ['DEBUG'] = 'True'
|
||||
sys.argv.remove('--debug')
|
||||
if "--debug" in sys.argv:
|
||||
os.environ["DEBUG"] = "True"
|
||||
sys.argv.remove("--debug")
|
||||
|
||||
|
||||
class ArchiveBoxGroup(click.Group):
|
||||
"""lazy loading click group for archivebox commands"""
|
||||
|
||||
meta_commands = {
|
||||
'help': 'archivebox.cli.archivebox_help.main',
|
||||
'version': 'archivebox.cli.archivebox_version.main',
|
||||
'mcp': 'archivebox.cli.archivebox_mcp.main',
|
||||
"help": "archivebox.cli.archivebox_help.main",
|
||||
"version": "archivebox.cli.archivebox_version.main",
|
||||
"mcp": "archivebox.cli.archivebox_mcp.main",
|
||||
}
|
||||
setup_commands = {
|
||||
'init': 'archivebox.cli.archivebox_init.main',
|
||||
'install': 'archivebox.cli.archivebox_install.main',
|
||||
"init": "archivebox.cli.archivebox_init.main",
|
||||
"install": "archivebox.cli.archivebox_install.main",
|
||||
}
|
||||
# Model commands (CRUD operations via subcommands)
|
||||
model_commands = {
|
||||
'crawl': 'archivebox.cli.archivebox_crawl.main',
|
||||
'snapshot': 'archivebox.cli.archivebox_snapshot.main',
|
||||
'archiveresult': 'archivebox.cli.archivebox_archiveresult.main',
|
||||
'tag': 'archivebox.cli.archivebox_tag.main',
|
||||
'binary': 'archivebox.cli.archivebox_binary.main',
|
||||
'process': 'archivebox.cli.archivebox_process.main',
|
||||
'machine': 'archivebox.cli.archivebox_machine.main',
|
||||
'persona': 'archivebox.cli.archivebox_persona.main',
|
||||
"crawl": "archivebox.cli.archivebox_crawl.main",
|
||||
"snapshot": "archivebox.cli.archivebox_snapshot.main",
|
||||
"archiveresult": "archivebox.cli.archivebox_archiveresult.main",
|
||||
"tag": "archivebox.cli.archivebox_tag.main",
|
||||
"binary": "archivebox.cli.archivebox_binary.main",
|
||||
"process": "archivebox.cli.archivebox_process.main",
|
||||
"machine": "archivebox.cli.archivebox_machine.main",
|
||||
"persona": "archivebox.cli.archivebox_persona.main",
|
||||
}
|
||||
archive_commands = {
|
||||
# High-level commands
|
||||
'add': 'archivebox.cli.archivebox_add.main',
|
||||
'extract': 'archivebox.cli.archivebox_extract.main',
|
||||
'list': 'archivebox.cli.archivebox_list.main',
|
||||
'remove': 'archivebox.cli.archivebox_remove.main',
|
||||
'run': 'archivebox.cli.archivebox_run.main',
|
||||
'update': 'archivebox.cli.archivebox_update.main',
|
||||
'status': 'archivebox.cli.archivebox_status.main',
|
||||
'search': 'archivebox.cli.archivebox_search.main',
|
||||
'config': 'archivebox.cli.archivebox_config.main',
|
||||
'schedule': 'archivebox.cli.archivebox_schedule.main',
|
||||
'server': 'archivebox.cli.archivebox_server.main',
|
||||
'shell': 'archivebox.cli.archivebox_shell.main',
|
||||
'manage': 'archivebox.cli.archivebox_manage.main',
|
||||
"add": "archivebox.cli.archivebox_add.main",
|
||||
"extract": "archivebox.cli.archivebox_extract.main",
|
||||
"list": "archivebox.cli.archivebox_list.main",
|
||||
"remove": "archivebox.cli.archivebox_remove.main",
|
||||
"run": "archivebox.cli.archivebox_run.main",
|
||||
"update": "archivebox.cli.archivebox_update.main",
|
||||
"status": "archivebox.cli.archivebox_status.main",
|
||||
"search": "archivebox.cli.archivebox_search.main",
|
||||
"config": "archivebox.cli.archivebox_config.main",
|
||||
"schedule": "archivebox.cli.archivebox_schedule.main",
|
||||
"server": "archivebox.cli.archivebox_server.main",
|
||||
"shell": "archivebox.cli.archivebox_shell.main",
|
||||
"manage": "archivebox.cli.archivebox_manage.main",
|
||||
# Introspection commands
|
||||
'pluginmap': 'archivebox.cli.archivebox_pluginmap.main',
|
||||
"pluginmap": "archivebox.cli.archivebox_pluginmap.main",
|
||||
}
|
||||
legacy_model_commands = {
|
||||
'crawl': 'archivebox.cli.archivebox_crawl_compat.main',
|
||||
'snapshot': 'archivebox.cli.archivebox_snapshot_compat.main',
|
||||
"crawl": "archivebox.cli.archivebox_crawl_compat.main",
|
||||
"snapshot": "archivebox.cli.archivebox_snapshot_compat.main",
|
||||
}
|
||||
all_subcommands = {
|
||||
**meta_commands,
|
||||
@@ -67,15 +67,15 @@ class ArchiveBoxGroup(click.Group):
|
||||
**archive_commands,
|
||||
}
|
||||
renamed_commands = {
|
||||
'setup': 'install',
|
||||
'import': 'add',
|
||||
'archive': 'add',
|
||||
"setup": "install",
|
||||
"import": "add",
|
||||
"archive": "add",
|
||||
}
|
||||
legacy_model_subcommands = {
|
||||
'crawl': {'create', 'list', 'update', 'delete'},
|
||||
'snapshot': {'create', 'list', 'update', 'delete'},
|
||||
"crawl": {"create", "list", "update", "delete"},
|
||||
"snapshot": {"create", "list", "update", "delete"},
|
||||
}
|
||||
|
||||
|
||||
@classmethod
|
||||
def get_canonical_name(cls, cmd_name):
|
||||
return cls.renamed_commands.get(cmd_name, cmd_name)
|
||||
@@ -90,23 +90,22 @@ class ArchiveBoxGroup(click.Group):
|
||||
except ValueError:
|
||||
return False
|
||||
|
||||
remaining_args = sys.argv[arg_idx + 1:]
|
||||
remaining_args = sys.argv[arg_idx + 1 :]
|
||||
if not remaining_args:
|
||||
return False
|
||||
|
||||
first_arg = remaining_args[0]
|
||||
if first_arg in ('-h', '--help'):
|
||||
if first_arg in ("-h", "--help"):
|
||||
return False
|
||||
|
||||
return first_arg not in cls.legacy_model_subcommands[cmd_name]
|
||||
|
||||
|
||||
def get_command(self, ctx, cmd_name):
|
||||
# handle renamed commands
|
||||
if cmd_name in self.renamed_commands:
|
||||
new_name = self.renamed_commands[cmd_name]
|
||||
print(
|
||||
f' [violet]Hint:[/violet] `archivebox {cmd_name}` has been renamed to `archivebox {new_name}`',
|
||||
f" [violet]Hint:[/violet] `archivebox {cmd_name}` has been renamed to `archivebox {new_name}`",
|
||||
file=sys.stderr,
|
||||
)
|
||||
cmd_name = new_name
|
||||
@@ -114,11 +113,11 @@ class ArchiveBoxGroup(click.Group):
|
||||
|
||||
if self._should_use_legacy_model_command(cmd_name):
|
||||
return self._lazy_load(self.legacy_model_commands[cmd_name])
|
||||
|
||||
|
||||
# handle lazy loading of commands
|
||||
if cmd_name in self.all_subcommands:
|
||||
return self._lazy_load(cmd_name)
|
||||
|
||||
|
||||
# fall-back to using click's default command lookup
|
||||
return super().get_command(ctx, cmd_name)
|
||||
|
||||
@@ -127,72 +126,74 @@ class ArchiveBoxGroup(click.Group):
|
||||
import_path = cls.all_subcommands.get(cmd_name_or_path)
|
||||
if import_path is None:
|
||||
import_path = cmd_name_or_path
|
||||
modname, funcname = import_path.rsplit('.', 1)
|
||||
|
||||
modname, funcname = import_path.rsplit(".", 1)
|
||||
|
||||
# print(f'LAZY LOADING {import_path}')
|
||||
mod = import_module(modname)
|
||||
func = getattr(mod, funcname)
|
||||
|
||||
if not hasattr(func, '__doc__'):
|
||||
raise ValueError(f'lazy loading of {import_path} failed - no docstring found on method')
|
||||
|
||||
|
||||
if not hasattr(func, "__doc__"):
|
||||
raise ValueError(f"lazy loading of {import_path} failed - no docstring found on method")
|
||||
|
||||
# if not isinstance(cmd, click.BaseCommand):
|
||||
# raise ValueError(f'lazy loading of {import_path} failed - not a click command')
|
||||
|
||||
# raise ValueError(f'lazy loading of {import_path} failed - not a click command')
|
||||
|
||||
return func
|
||||
|
||||
|
||||
@click.group(cls=ArchiveBoxGroup, invoke_without_command=True)
|
||||
@click.option('--help', '-h', is_flag=True, help='Show help')
|
||||
@click.version_option(VERSION, '-v', '--version', package_name='archivebox', message='%(version)s')
|
||||
@click.option("--help", "-h", is_flag=True, help="Show help")
|
||||
@click.version_option(VERSION, "-v", "--version", package_name="archivebox", message="%(version)s")
|
||||
@click.pass_context
|
||||
def cli(ctx, help=False):
|
||||
"""ArchiveBox: The self-hosted internet archive"""
|
||||
|
||||
|
||||
subcommand = ArchiveBoxGroup.get_canonical_name(ctx.invoked_subcommand)
|
||||
|
||||
|
||||
# if --help is passed or no subcommand is given, show custom help message
|
||||
if help or ctx.invoked_subcommand is None:
|
||||
ctx.invoke(ctx.command.get_command(ctx, 'help'))
|
||||
|
||||
ctx.invoke(ctx.command.get_command(ctx, "help"))
|
||||
|
||||
# if the subcommand is in archive_commands or model_commands,
|
||||
# then we need to set up the django environment and check that we're in a valid data folder
|
||||
if subcommand in ArchiveBoxGroup.archive_commands or subcommand in ArchiveBoxGroup.model_commands:
|
||||
# print('SETUP DJANGO AND CHECK DATA FOLDER')
|
||||
try:
|
||||
if subcommand == 'server':
|
||||
run_in_debug = '--reload' in sys.argv or os.environ.get('DEBUG') in ('1', 'true', 'True', 'TRUE', 'yes')
|
||||
if subcommand == "server":
|
||||
run_in_debug = "--reload" in sys.argv or os.environ.get("DEBUG") in ("1", "true", "True", "TRUE", "yes")
|
||||
if run_in_debug:
|
||||
os.environ['ARCHIVEBOX_RUNSERVER'] = '1'
|
||||
if '--reload' in sys.argv:
|
||||
os.environ['ARCHIVEBOX_AUTORELOAD'] = '1'
|
||||
os.environ["ARCHIVEBOX_RUNSERVER"] = "1"
|
||||
if "--reload" in sys.argv:
|
||||
os.environ["ARCHIVEBOX_AUTORELOAD"] = "1"
|
||||
from archivebox.config.common import STORAGE_CONFIG
|
||||
os.environ['ARCHIVEBOX_RUNSERVER_PIDFILE'] = str(STORAGE_CONFIG.TMP_DIR / 'runserver.pid')
|
||||
|
||||
os.environ["ARCHIVEBOX_RUNSERVER_PIDFILE"] = str(STORAGE_CONFIG.TMP_DIR / "runserver.pid")
|
||||
|
||||
from archivebox.config.django import setup_django
|
||||
from archivebox.misc.checks import check_data_folder
|
||||
|
||||
setup_django()
|
||||
check_data_folder()
|
||||
except Exception as e:
|
||||
print(f'[red][X] Error setting up Django or checking data folder: {e}[/red]', file=sys.stderr)
|
||||
if subcommand not in ('manage', 'shell'): # not all management commands need django to be setup beforehand
|
||||
print(f"[red][X] Error setting up Django or checking data folder: {e}[/red]", file=sys.stderr)
|
||||
if subcommand not in ("manage", "shell"): # not all management commands need django to be setup beforehand
|
||||
raise
|
||||
|
||||
|
||||
|
||||
def main(args=None, prog_name=None, stdin=None):
|
||||
# show `docker run archivebox xyz` in help messages if running in docker
|
||||
IN_DOCKER = os.environ.get('IN_DOCKER', False) in ('1', 'true', 'True', 'TRUE', 'yes')
|
||||
IN_DOCKER = os.environ.get("IN_DOCKER", False) in ("1", "true", "True", "TRUE", "yes")
|
||||
IS_TTY = sys.stdin.isatty()
|
||||
prog_name = prog_name or (f'docker compose run{"" if IS_TTY else " -T"} archivebox' if IN_DOCKER else 'archivebox')
|
||||
|
||||
prog_name = prog_name or (f"docker compose run{'' if IS_TTY else ' -T'} archivebox" if IN_DOCKER else "archivebox")
|
||||
|
||||
# stdin param allows passing input data from caller (used by __main__.py)
|
||||
# currently not used by click-based CLI, but kept for backwards compatibility
|
||||
|
||||
try:
|
||||
cli(args=args, prog_name=prog_name)
|
||||
except KeyboardInterrupt:
|
||||
print('\n\n[red][X] Got CTRL+C. Exiting...[/red]')
|
||||
print("\n\n[red][X] Got CTRL+C. Exiting...[/red]")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox add'
|
||||
__package__ = "archivebox.cli"
|
||||
__command__ = "archivebox add"
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
@@ -14,6 +14,7 @@ from django.utils import timezone
|
||||
from django.db.models import QuerySet
|
||||
|
||||
from archivebox.misc.util import enforce_types, docstring
|
||||
from archivebox.misc.util import parse_filesize_to_bytes
|
||||
from archivebox import CONSTANTS
|
||||
from archivebox.config.common import ARCHIVING_CONFIG, SERVER_CONFIG
|
||||
from archivebox.config.permissions import USER, HOSTNAME
|
||||
@@ -29,34 +30,38 @@ def _collect_input_urls(args: tuple[str, ...]) -> list[str]:
|
||||
|
||||
urls: list[str] = []
|
||||
for record in read_args_or_stdin(args):
|
||||
url = record.get('url')
|
||||
url = record.get("url")
|
||||
if isinstance(url, str) and url:
|
||||
urls.append(url)
|
||||
|
||||
urls_field = record.get('urls')
|
||||
urls_field = record.get("urls")
|
||||
if isinstance(urls_field, str):
|
||||
for line in urls_field.splitlines():
|
||||
line = line.strip()
|
||||
if line and not line.startswith('#'):
|
||||
if line and not line.startswith("#"):
|
||||
urls.append(line)
|
||||
|
||||
return urls
|
||||
|
||||
|
||||
@enforce_types
|
||||
def add(urls: str | list[str],
|
||||
depth: int | str=0,
|
||||
tag: str='',
|
||||
url_allowlist: str='',
|
||||
url_denylist: str='',
|
||||
parser: str="auto",
|
||||
plugins: str="",
|
||||
persona: str='Default',
|
||||
overwrite: bool=False,
|
||||
update: bool | None=None,
|
||||
index_only: bool=False,
|
||||
bg: bool=False,
|
||||
created_by_id: int | None=None) -> tuple['Crawl', QuerySet['Snapshot']]:
|
||||
def add(
|
||||
urls: str | list[str],
|
||||
depth: int | str = 0,
|
||||
max_urls: int = 0,
|
||||
max_size: int | str = 0,
|
||||
tag: str = "",
|
||||
url_allowlist: str = "",
|
||||
url_denylist: str = "",
|
||||
parser: str = "auto",
|
||||
plugins: str = "",
|
||||
persona: str = "Default",
|
||||
overwrite: bool = False,
|
||||
update: bool | None = None,
|
||||
index_only: bool = False,
|
||||
bg: bool = False,
|
||||
created_by_id: int | None = None,
|
||||
) -> tuple["Crawl", QuerySet["Snapshot"]]:
|
||||
"""Add a new URL or list of URLs to your archive.
|
||||
|
||||
The flow is:
|
||||
@@ -72,8 +77,15 @@ def add(urls: str | list[str],
|
||||
from rich import print
|
||||
|
||||
depth = int(depth)
|
||||
max_urls = int(max_urls or 0)
|
||||
max_size = parse_filesize_to_bytes(max_size)
|
||||
|
||||
assert depth in (0, 1, 2, 3, 4), 'Depth must be 0-4'
|
||||
if depth not in (0, 1, 2, 3, 4):
|
||||
raise ValueError("Depth must be 0-4")
|
||||
if max_urls < 0:
|
||||
raise ValueError("max_urls must be >= 0")
|
||||
if max_size < 0:
|
||||
raise ValueError("max_size must be >= 0")
|
||||
|
||||
# import models once django is set up
|
||||
from archivebox.core.models import Snapshot
|
||||
@@ -91,47 +103,49 @@ def add(urls: str | list[str],
|
||||
update = not ARCHIVING_CONFIG.ONLY_NEW
|
||||
|
||||
# 1. Save the provided URLs to sources/2024-11-05__23-59-59__cli_add.txt
|
||||
sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__cli_add.txt'
|
||||
sources_file = CONSTANTS.SOURCES_DIR / f"{timezone.now().strftime('%Y-%m-%d__%H-%M-%S')}__cli_add.txt"
|
||||
sources_file.parent.mkdir(parents=True, exist_ok=True)
|
||||
sources_file.write_text(urls if isinstance(urls, str) else '\n'.join(urls))
|
||||
sources_file.write_text(urls if isinstance(urls, str) else "\n".join(urls))
|
||||
|
||||
# 2. Create a new Crawl with inline URLs
|
||||
cli_args = [*sys.argv]
|
||||
if cli_args[0].lower().endswith('archivebox'):
|
||||
cli_args[0] = 'archivebox'
|
||||
cmd_str = ' '.join(cli_args)
|
||||
if cli_args[0].lower().endswith("archivebox"):
|
||||
cli_args[0] = "archivebox"
|
||||
cmd_str = " ".join(cli_args)
|
||||
|
||||
timestamp = timezone.now().strftime("%Y-%m-%d__%H-%M-%S")
|
||||
|
||||
# Read URLs directly into crawl
|
||||
urls_content = sources_file.read_text()
|
||||
persona_name = (persona or 'Default').strip() or 'Default'
|
||||
plugins = plugins or str(get_config().get('PLUGINS') or '')
|
||||
persona_name = (persona or "Default").strip() or "Default"
|
||||
plugins = plugins or str(get_config().get("PLUGINS") or "")
|
||||
persona_obj, _ = Persona.objects.get_or_create(name=persona_name)
|
||||
persona_obj.ensure_dirs()
|
||||
|
||||
crawl = Crawl.objects.create(
|
||||
urls=urls_content,
|
||||
max_depth=depth,
|
||||
max_urls=max_urls,
|
||||
max_size=max_size,
|
||||
tags_str=tag,
|
||||
persona_id=persona_obj.id,
|
||||
label=f'{USER}@{HOSTNAME} $ {cmd_str} [{timestamp}]',
|
||||
label=f"{USER}@{HOSTNAME} $ {cmd_str} [{timestamp}]",
|
||||
created_by_id=created_by_id,
|
||||
config={
|
||||
'ONLY_NEW': not update,
|
||||
'INDEX_ONLY': index_only,
|
||||
'OVERWRITE': overwrite,
|
||||
'PLUGINS': plugins,
|
||||
'DEFAULT_PERSONA': persona_name,
|
||||
'PARSER': parser,
|
||||
**({'URL_ALLOWLIST': url_allowlist} if url_allowlist else {}),
|
||||
**({'URL_DENYLIST': url_denylist} if url_denylist else {}),
|
||||
}
|
||||
"ONLY_NEW": not update,
|
||||
"INDEX_ONLY": index_only,
|
||||
"OVERWRITE": overwrite,
|
||||
"PLUGINS": plugins,
|
||||
"DEFAULT_PERSONA": persona_name,
|
||||
"PARSER": parser,
|
||||
**({"URL_ALLOWLIST": url_allowlist} if url_allowlist else {}),
|
||||
**({"URL_DENYLIST": url_denylist} if url_denylist else {}),
|
||||
},
|
||||
)
|
||||
|
||||
print(f'[green]\\[+] Created Crawl {crawl.id} with max_depth={depth}[/green]')
|
||||
first_url = crawl.get_urls_list()[0] if crawl.get_urls_list() else ''
|
||||
print(f' [dim]First URL: {first_url}[/dim]')
|
||||
print(f"[green]\\[+] Created Crawl {crawl.id} with max_depth={depth}[/green]")
|
||||
first_url = crawl.get_urls_list()[0] if crawl.get_urls_list() else ""
|
||||
print(f" [dim]First URL: {first_url}[/dim]")
|
||||
|
||||
# 3. The CrawlMachine will create Snapshots from all URLs when started
|
||||
# Parser extractors run on snapshots and discover more URLs
|
||||
@@ -139,20 +153,21 @@ def add(urls: str | list[str],
|
||||
|
||||
if index_only:
|
||||
# Just create the crawl but don't start processing
|
||||
print('[yellow]\\[*] Index-only mode - crawl created but not started[/yellow]')
|
||||
print("[yellow]\\[*] Index-only mode - crawl created but not started[/yellow]")
|
||||
# Create snapshots for all URLs in the crawl
|
||||
for url in crawl.get_urls_list():
|
||||
snapshot, _ = Snapshot.objects.update_or_create(
|
||||
crawl=crawl, url=url,
|
||||
crawl=crawl,
|
||||
url=url,
|
||||
defaults={
|
||||
'status': Snapshot.INITIAL_STATE,
|
||||
'retry_at': timezone.now(),
|
||||
'timestamp': str(timezone.now().timestamp()),
|
||||
'depth': 0,
|
||||
"status": Snapshot.INITIAL_STATE,
|
||||
"retry_at": timezone.now(),
|
||||
"timestamp": str(timezone.now().timestamp()),
|
||||
"depth": 0,
|
||||
},
|
||||
)
|
||||
if tag:
|
||||
snapshot.save_tags(tag.split(','))
|
||||
snapshot.save_tags(tag.split(","))
|
||||
snapshot.ensure_crawl_symlink()
|
||||
return crawl, crawl.snapshot_set.all()
|
||||
|
||||
@@ -168,10 +183,12 @@ def add(urls: str | list[str],
|
||||
|
||||
if bg:
|
||||
# Background mode: just queue work and return (background runner via server will pick it up)
|
||||
print('[yellow]\\[*] URLs queued. The background runner will process them (run `archivebox server` or `archivebox run --daemon` if not already running).[/yellow]')
|
||||
print(
|
||||
"[yellow]\\[*] URLs queued. The background runner will process them (run `archivebox server` or `archivebox run --daemon` if not already running).[/yellow]",
|
||||
)
|
||||
else:
|
||||
# Foreground mode: run full crawl runner until all work is done
|
||||
print('[green]\\[*] Starting crawl runner to process crawl...[/green]')
|
||||
print("[green]\\[*] Starting crawl runner to process crawl...[/green]")
|
||||
run_crawl(str(crawl.id))
|
||||
|
||||
# Print summary for foreground runs
|
||||
@@ -179,7 +196,10 @@ def add(urls: str | list[str],
|
||||
crawl.refresh_from_db()
|
||||
snapshots_count = crawl.snapshot_set.count()
|
||||
try:
|
||||
total_bytes = sum(s.archive_size for s in crawl.snapshot_set.all())
|
||||
from django.db.models import Count, Sum
|
||||
|
||||
totals = crawl.snapshot_set.aggregate(snapshot_count=Count("id"), total_bytes=Sum("archiveresult__output_size"))
|
||||
total_bytes = int(totals["total_bytes"] or 0) if totals["snapshot_count"] else 0
|
||||
except Exception:
|
||||
total_bytes, _, _ = get_dir_size(crawl.output_dir)
|
||||
total_size = printable_filesize(total_bytes)
|
||||
@@ -197,23 +217,23 @@ def add(urls: str | list[str],
|
||||
# Output dir relative to DATA_DIR
|
||||
try:
|
||||
rel_output = Path(crawl.output_dir).relative_to(CONSTANTS.DATA_DIR)
|
||||
rel_output_str = f'./{rel_output}'
|
||||
rel_output_str = f"./{rel_output}"
|
||||
except Exception:
|
||||
rel_output_str = str(crawl.output_dir)
|
||||
|
||||
bind_addr = SERVER_CONFIG.BIND_ADDR or '127.0.0.1:8000'
|
||||
if bind_addr.startswith('http://') or bind_addr.startswith('https://'):
|
||||
bind_addr = SERVER_CONFIG.BIND_ADDR or "127.0.0.1:8000"
|
||||
if bind_addr.startswith("http://") or bind_addr.startswith("https://"):
|
||||
base_url = bind_addr
|
||||
else:
|
||||
base_url = f'http://{bind_addr}'
|
||||
admin_url = f'{base_url}/admin/crawls/crawl/{crawl.id}/change/'
|
||||
base_url = f"http://{bind_addr}"
|
||||
admin_url = f"{base_url}/admin/crawls/crawl/{crawl.id}/change/"
|
||||
|
||||
print('\n[bold]crawl output saved to:[/bold]')
|
||||
print(f' {rel_output_str}')
|
||||
print(f' {admin_url}')
|
||||
print(f'\n[bold]total urls snapshotted:[/bold] {snapshots_count}')
|
||||
print(f'[bold]total size:[/bold] {total_size}')
|
||||
print(f'[bold]total time:[/bold] {duration_str}')
|
||||
print("\n[bold]crawl output saved to:[/bold]")
|
||||
print(f" {rel_output_str}")
|
||||
print(f" {admin_url}")
|
||||
print(f"\n[bold]total urls snapshotted:[/bold] {snapshots_count}")
|
||||
print(f"[bold]total size:[/bold] {total_size}")
|
||||
print(f"[bold]total time:[/bold] {duration_str}")
|
||||
except Exception:
|
||||
# Summary is best-effort; avoid failing the command if something goes wrong
|
||||
pass
|
||||
@@ -224,29 +244,43 @@ def add(urls: str | list[str],
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option('--depth', '-d', type=click.Choice([str(i) for i in range(5)]), default='0', help='Recursively archive linked pages up to N hops away')
|
||||
@click.option('--tag', '-t', default='', help='Comma-separated list of tags to add to each snapshot e.g. tag1,tag2,tag3')
|
||||
@click.option('--url-allowlist', '--domain-allowlist', default='', help='Comma-separated URL/domain allowlist for this crawl')
|
||||
@click.option('--url-denylist', '--domain-denylist', default='', help='Comma-separated URL/domain denylist for this crawl')
|
||||
@click.option('--parser', default='auto', help='Parser for reading input URLs (auto, txt, html, rss, json, jsonl, netscape, ...)')
|
||||
@click.option('--plugins', '-p', default='', help='Comma-separated list of plugins to run e.g. title,favicon,screenshot,singlefile,...')
|
||||
@click.option('--persona', default='Default', help='Authentication profile to use when archiving')
|
||||
@click.option('--overwrite', '-F', is_flag=True, help='Overwrite existing data if URLs have been archived previously')
|
||||
@click.option('--update', is_flag=True, default=None, help='Retry any previously skipped/failed URLs when re-adding them')
|
||||
@click.option('--index-only', is_flag=True, help='Just add the URLs to the index without archiving them now')
|
||||
@click.option('--bg', is_flag=True, help='Run archiving in background (queue work and return immediately)')
|
||||
@click.argument('urls', nargs=-1, type=click.Path())
|
||||
@click.option(
|
||||
"--depth",
|
||||
"-d",
|
||||
type=click.Choice([str(i) for i in range(5)]),
|
||||
default="0",
|
||||
help="Recursively archive linked pages up to N hops away",
|
||||
)
|
||||
@click.option("--max-urls", type=int, default=0, help="Maximum number of URLs to snapshot for this crawl (0 = unlimited)")
|
||||
@click.option("--max-size", default="0", help="Maximum total crawl size in bytes or units like 45mb / 1gb (0 = unlimited)")
|
||||
@click.option("--tag", "-t", default="", help="Comma-separated list of tags to add to each snapshot e.g. tag1,tag2,tag3")
|
||||
@click.option("--url-allowlist", "--domain-allowlist", default="", help="Comma-separated URL/domain allowlist for this crawl")
|
||||
@click.option("--url-denylist", "--domain-denylist", default="", help="Comma-separated URL/domain denylist for this crawl")
|
||||
@click.option("--parser", default="auto", help="Parser for reading input URLs (auto, txt, html, rss, json, jsonl, netscape, ...)")
|
||||
@click.option("--plugins", "-p", default="", help="Comma-separated list of plugins to run e.g. title,favicon,screenshot,singlefile,...")
|
||||
@click.option("--persona", default="Default", help="Authentication profile to use when archiving")
|
||||
@click.option("--overwrite", "-F", is_flag=True, help="Overwrite existing data if URLs have been archived previously")
|
||||
@click.option("--update", is_flag=True, default=None, help="Retry any previously skipped/failed URLs when re-adding them")
|
||||
@click.option("--index-only", is_flag=True, help="Just add the URLs to the index without archiving them now")
|
||||
@click.option("--bg", is_flag=True, help="Run archiving in background (queue work and return immediately)")
|
||||
@click.argument("urls", nargs=-1, type=click.Path())
|
||||
@docstring(add.__doc__)
|
||||
def main(**kwargs):
|
||||
"""Add a new URL or list of URLs to your archive"""
|
||||
|
||||
raw_urls = kwargs.pop('urls')
|
||||
raw_urls = kwargs.pop("urls")
|
||||
urls = _collect_input_urls(raw_urls)
|
||||
if not urls:
|
||||
raise click.UsageError('No URLs provided. Pass URLs as arguments or via stdin.')
|
||||
raise click.UsageError("No URLs provided. Pass URLs as arguments or via stdin.")
|
||||
if int(kwargs.get("max_urls") or 0) < 0:
|
||||
raise click.BadParameter("max_urls must be 0 or a positive integer.", param_hint="--max-urls")
|
||||
try:
|
||||
kwargs["max_size"] = parse_filesize_to_bytes(kwargs.get("max_size"))
|
||||
except ValueError as err:
|
||||
raise click.BadParameter(str(err), param_hint="--max-size") from err
|
||||
|
||||
add(urls=urls, **kwargs)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -30,11 +30,10 @@ Examples:
|
||||
archivebox archiveresult list --status=failed | archivebox run
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox archiveresult'
|
||||
__package__ = "archivebox.cli"
|
||||
__command__ = "archivebox archiveresult"
|
||||
|
||||
import sys
|
||||
from typing import Optional
|
||||
|
||||
import rich_click as click
|
||||
from rich import print as rprint
|
||||
@@ -42,13 +41,13 @@ from rich import print as rprint
|
||||
from archivebox.cli.cli_utils import apply_filters
|
||||
|
||||
|
||||
def build_archiveresult_request(snapshot_id: str, plugin: str, hook_name: str = '', status: str = 'queued') -> dict:
|
||||
def build_archiveresult_request(snapshot_id: str, plugin: str, hook_name: str = "", status: str = "queued") -> dict:
|
||||
return {
|
||||
'type': 'ArchiveResult',
|
||||
'snapshot_id': str(snapshot_id),
|
||||
'plugin': plugin,
|
||||
'hook_name': hook_name,
|
||||
'status': status,
|
||||
"type": "ArchiveResult",
|
||||
"snapshot_id": str(snapshot_id),
|
||||
"plugin": plugin,
|
||||
"hook_name": hook_name,
|
||||
"status": status,
|
||||
}
|
||||
|
||||
|
||||
@@ -56,10 +55,11 @@ def build_archiveresult_request(snapshot_id: str, plugin: str, hook_name: str =
|
||||
# CREATE
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def create_archiveresults(
|
||||
snapshot_id: Optional[str] = None,
|
||||
plugin: Optional[str] = None,
|
||||
status: str = 'queued',
|
||||
snapshot_id: str | None = None,
|
||||
plugin: str | None = None,
|
||||
status: str = "queued",
|
||||
) -> int:
|
||||
"""
|
||||
Create ArchiveResult request records for Snapshots.
|
||||
@@ -86,13 +86,13 @@ def create_archiveresults(
|
||||
snapshots = [Snapshot.objects.get(id=snapshot_id)]
|
||||
pass_through_records = []
|
||||
except Snapshot.DoesNotExist:
|
||||
rprint(f'[red]Snapshot not found: {snapshot_id}[/red]', file=sys.stderr)
|
||||
rprint(f"[red]Snapshot not found: {snapshot_id}[/red]", file=sys.stderr)
|
||||
return 1
|
||||
else:
|
||||
# Read from stdin
|
||||
records = list(read_stdin())
|
||||
if not records:
|
||||
rprint('[yellow]No Snapshot records provided via stdin[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No Snapshot records provided via stdin[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Separate snapshot records from pass-through records
|
||||
@@ -100,17 +100,17 @@ def create_archiveresults(
|
||||
pass_through_records = []
|
||||
|
||||
for record in records:
|
||||
record_type = record.get('type', '')
|
||||
record_type = record.get("type", "")
|
||||
|
||||
if record_type == TYPE_SNAPSHOT:
|
||||
# Pass through the Snapshot record itself
|
||||
pass_through_records.append(record)
|
||||
if record.get('id'):
|
||||
snapshot_ids.append(record['id'])
|
||||
if record.get("id"):
|
||||
snapshot_ids.append(record["id"])
|
||||
|
||||
elif record_type == TYPE_ARCHIVERESULT:
|
||||
# ArchiveResult records: pass through if they have an id
|
||||
if record.get('id'):
|
||||
if record.get("id"):
|
||||
pass_through_records.append(record)
|
||||
# If no id, we could create it, but for now just pass through
|
||||
else:
|
||||
@@ -120,9 +120,9 @@ def create_archiveresults(
|
||||
# Other typed records (Crawl, Tag, etc): pass through
|
||||
pass_through_records.append(record)
|
||||
|
||||
elif record.get('id'):
|
||||
elif record.get("id"):
|
||||
# Untyped record with id - assume it's a snapshot ID
|
||||
snapshot_ids.append(record['id'])
|
||||
snapshot_ids.append(record["id"])
|
||||
|
||||
# Output pass-through records first
|
||||
if not is_tty:
|
||||
@@ -131,15 +131,15 @@ def create_archiveresults(
|
||||
|
||||
if not snapshot_ids:
|
||||
if pass_through_records:
|
||||
rprint(f'[dim]Passed through {len(pass_through_records)} records, no new snapshots to process[/dim]', file=sys.stderr)
|
||||
rprint(f"[dim]Passed through {len(pass_through_records)} records, no new snapshots to process[/dim]", file=sys.stderr)
|
||||
return 0
|
||||
rprint('[yellow]No valid Snapshot IDs in input[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No valid Snapshot IDs in input[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
snapshots = list(Snapshot.objects.filter(id__in=snapshot_ids))
|
||||
|
||||
if not snapshots:
|
||||
rprint('[yellow]No matching snapshots found[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No matching snapshots found[/yellow]", file=sys.stderr)
|
||||
return 0 if pass_through_records else 1
|
||||
|
||||
created_count = 0
|
||||
@@ -150,7 +150,7 @@ def create_archiveresults(
|
||||
created_count += 1
|
||||
else:
|
||||
config = get_config(crawl=snapshot.crawl, snapshot=snapshot)
|
||||
hooks = discover_hooks('Snapshot', config=config)
|
||||
hooks = discover_hooks("Snapshot", config=config)
|
||||
for hook_path in hooks:
|
||||
hook_name = hook_path.name
|
||||
plugin_name = hook_path.parent.name
|
||||
@@ -158,7 +158,7 @@ def create_archiveresults(
|
||||
write_record(build_archiveresult_request(snapshot.id, plugin_name, hook_name=hook_name, status=status))
|
||||
created_count += 1
|
||||
|
||||
rprint(f'[green]Created {created_count} archive result request records[/green]', file=sys.stderr)
|
||||
rprint(f"[green]Created {created_count} archive result request records[/green]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -166,11 +166,12 @@ def create_archiveresults(
|
||||
# LIST
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def list_archiveresults(
|
||||
status: Optional[str] = None,
|
||||
plugin: Optional[str] = None,
|
||||
snapshot_id: Optional[str] = None,
|
||||
limit: Optional[int] = None,
|
||||
status: str | None = None,
|
||||
plugin: str | None = None,
|
||||
snapshot_id: str | None = None,
|
||||
limit: int | None = None,
|
||||
) -> int:
|
||||
"""
|
||||
List ArchiveResults as JSONL with optional filters.
|
||||
@@ -183,13 +184,13 @@ def list_archiveresults(
|
||||
|
||||
is_tty = sys.stdout.isatty()
|
||||
|
||||
queryset = ArchiveResult.objects.all().order_by('-start_ts')
|
||||
queryset = ArchiveResult.objects.all().order_by("-start_ts")
|
||||
|
||||
# Apply filters
|
||||
filter_kwargs = {
|
||||
'status': status,
|
||||
'plugin': plugin,
|
||||
'snapshot_id': snapshot_id,
|
||||
"status": status,
|
||||
"plugin": plugin,
|
||||
"snapshot_id": snapshot_id,
|
||||
}
|
||||
queryset = apply_filters(queryset, filter_kwargs, limit=limit)
|
||||
|
||||
@@ -197,20 +198,22 @@ def list_archiveresults(
|
||||
for result in queryset:
|
||||
if is_tty:
|
||||
status_color = {
|
||||
'queued': 'yellow',
|
||||
'started': 'blue',
|
||||
'succeeded': 'green',
|
||||
'failed': 'red',
|
||||
'skipped': 'dim',
|
||||
'noresults': 'dim',
|
||||
'backoff': 'magenta',
|
||||
}.get(result.status, 'dim')
|
||||
rprint(f'[{status_color}]{result.status:10}[/{status_color}] {result.plugin:15} [dim]{result.id}[/dim] {result.snapshot.url[:40]}')
|
||||
"queued": "yellow",
|
||||
"started": "blue",
|
||||
"succeeded": "green",
|
||||
"failed": "red",
|
||||
"skipped": "dim",
|
||||
"noresults": "dim",
|
||||
"backoff": "magenta",
|
||||
}.get(result.status, "dim")
|
||||
rprint(
|
||||
f"[{status_color}]{result.status:10}[/{status_color}] {result.plugin:15} [dim]{result.id}[/dim] {result.snapshot.url[:40]}",
|
||||
)
|
||||
else:
|
||||
write_record(result.to_json())
|
||||
count += 1
|
||||
|
||||
rprint(f'[dim]Listed {count} archive results[/dim]', file=sys.stderr)
|
||||
rprint(f"[dim]Listed {count} archive results[/dim]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -218,8 +221,9 @@ def list_archiveresults(
|
||||
# UPDATE
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def update_archiveresults(
|
||||
status: Optional[str] = None,
|
||||
status: str | None = None,
|
||||
) -> int:
|
||||
"""
|
||||
Update ArchiveResults from stdin JSONL.
|
||||
@@ -238,12 +242,12 @@ def update_archiveresults(
|
||||
|
||||
records = list(read_stdin())
|
||||
if not records:
|
||||
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
updated_count = 0
|
||||
for record in records:
|
||||
result_id = record.get('id')
|
||||
result_id = record.get("id")
|
||||
if not result_id:
|
||||
continue
|
||||
|
||||
@@ -261,10 +265,10 @@ def update_archiveresults(
|
||||
write_record(result.to_json())
|
||||
|
||||
except ArchiveResult.DoesNotExist:
|
||||
rprint(f'[yellow]ArchiveResult not found: {result_id}[/yellow]', file=sys.stderr)
|
||||
rprint(f"[yellow]ArchiveResult not found: {result_id}[/yellow]", file=sys.stderr)
|
||||
continue
|
||||
|
||||
rprint(f'[green]Updated {updated_count} archive results[/green]', file=sys.stderr)
|
||||
rprint(f"[green]Updated {updated_count} archive results[/green]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -272,6 +276,7 @@ def update_archiveresults(
|
||||
# DELETE
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def delete_archiveresults(yes: bool = False, dry_run: bool = False) -> int:
|
||||
"""
|
||||
Delete ArchiveResults from stdin JSONL.
|
||||
@@ -287,37 +292,37 @@ def delete_archiveresults(yes: bool = False, dry_run: bool = False) -> int:
|
||||
|
||||
records = list(read_stdin())
|
||||
if not records:
|
||||
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
result_ids = [r.get('id') for r in records if r.get('id')]
|
||||
result_ids = [r.get("id") for r in records if r.get("id")]
|
||||
|
||||
if not result_ids:
|
||||
rprint('[yellow]No valid archive result IDs in input[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No valid archive result IDs in input[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
results = ArchiveResult.objects.filter(id__in=result_ids)
|
||||
count = results.count()
|
||||
|
||||
if count == 0:
|
||||
rprint('[yellow]No matching archive results found[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No matching archive results found[/yellow]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
if dry_run:
|
||||
rprint(f'[yellow]Would delete {count} archive results (dry run)[/yellow]', file=sys.stderr)
|
||||
rprint(f"[yellow]Would delete {count} archive results (dry run)[/yellow]", file=sys.stderr)
|
||||
for result in results[:10]:
|
||||
rprint(f' [dim]{result.id}[/dim] {result.plugin} {result.snapshot.url[:40]}', file=sys.stderr)
|
||||
rprint(f" [dim]{result.id}[/dim] {result.plugin} {result.snapshot.url[:40]}", file=sys.stderr)
|
||||
if count > 10:
|
||||
rprint(f' ... and {count - 10} more', file=sys.stderr)
|
||||
rprint(f" ... and {count - 10} more", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
if not yes:
|
||||
rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr)
|
||||
rprint("[red]Use --yes to confirm deletion[/red]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Perform deletion
|
||||
deleted_count, _ = results.delete()
|
||||
rprint(f'[green]Deleted {deleted_count} archive results[/green]', file=sys.stderr)
|
||||
rprint(f"[green]Deleted {deleted_count} archive results[/green]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -325,51 +330,58 @@ def delete_archiveresults(yes: bool = False, dry_run: bool = False) -> int:
|
||||
# CLI Commands
|
||||
# =============================================================================
|
||||
|
||||
|
||||
@click.group()
|
||||
def main():
|
||||
"""Manage ArchiveResult records (plugin extraction results)."""
|
||||
pass
|
||||
|
||||
|
||||
@main.command('create')
|
||||
@click.option('--snapshot-id', help='Snapshot ID to create results for')
|
||||
@click.option('--plugin', '-p', help='Plugin name (e.g., screenshot, singlefile)')
|
||||
@click.option('--status', '-s', default='queued', help='Initial status (default: queued)')
|
||||
def create_cmd(snapshot_id: Optional[str], plugin: Optional[str], status: str):
|
||||
@main.command("create")
|
||||
@click.option("--snapshot-id", help="Snapshot ID to create results for")
|
||||
@click.option("--plugin", "-p", help="Plugin name (e.g., screenshot, singlefile)")
|
||||
@click.option("--status", "-s", default="queued", help="Initial status (default: queued)")
|
||||
def create_cmd(snapshot_id: str | None, plugin: str | None, status: str):
|
||||
"""Create ArchiveResults for Snapshots from stdin JSONL."""
|
||||
sys.exit(create_archiveresults(snapshot_id=snapshot_id, plugin=plugin, status=status))
|
||||
|
||||
|
||||
@main.command('list')
|
||||
@click.option('--status', '-s', help='Filter by status (queued, started, succeeded, failed, skipped)')
|
||||
@click.option('--plugin', '-p', help='Filter by plugin name')
|
||||
@click.option('--snapshot-id', help='Filter by snapshot ID')
|
||||
@click.option('--limit', '-n', type=int, help='Limit number of results')
|
||||
def list_cmd(status: Optional[str], plugin: Optional[str],
|
||||
snapshot_id: Optional[str], limit: Optional[int]):
|
||||
@main.command("list")
|
||||
@click.option("--status", "-s", help="Filter by status (queued, started, succeeded, failed, skipped)")
|
||||
@click.option("--plugin", "-p", help="Filter by plugin name")
|
||||
@click.option("--snapshot-id", help="Filter by snapshot ID")
|
||||
@click.option("--limit", "-n", type=int, help="Limit number of results")
|
||||
def list_cmd(
|
||||
status: str | None,
|
||||
plugin: str | None,
|
||||
snapshot_id: str | None,
|
||||
limit: int | None,
|
||||
):
|
||||
"""List ArchiveResults as JSONL."""
|
||||
sys.exit(list_archiveresults(
|
||||
status=status,
|
||||
plugin=plugin,
|
||||
snapshot_id=snapshot_id,
|
||||
limit=limit,
|
||||
))
|
||||
sys.exit(
|
||||
list_archiveresults(
|
||||
status=status,
|
||||
plugin=plugin,
|
||||
snapshot_id=snapshot_id,
|
||||
limit=limit,
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@main.command('update')
|
||||
@click.option('--status', '-s', help='Set status')
|
||||
def update_cmd(status: Optional[str]):
|
||||
@main.command("update")
|
||||
@click.option("--status", "-s", help="Set status")
|
||||
def update_cmd(status: str | None):
|
||||
"""Update ArchiveResults from stdin JSONL."""
|
||||
sys.exit(update_archiveresults(status=status))
|
||||
|
||||
|
||||
@main.command('delete')
|
||||
@click.option('--yes', '-y', is_flag=True, help='Confirm deletion')
|
||||
@click.option('--dry-run', is_flag=True, help='Show what would be deleted')
|
||||
@main.command("delete")
|
||||
@click.option("--yes", "-y", is_flag=True, help="Confirm deletion")
|
||||
@click.option("--dry-run", is_flag=True, help="Show what would be deleted")
|
||||
def delete_cmd(yes: bool, dry_run: bool):
|
||||
"""Delete ArchiveResults from stdin JSONL."""
|
||||
sys.exit(delete_archiveresults(yes=yes, dry_run=dry_run))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -25,11 +25,10 @@ Examples:
|
||||
archivebox binary list --name=chrome | archivebox binary delete --yes
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox binary'
|
||||
__package__ = "archivebox.cli"
|
||||
__command__ = "archivebox binary"
|
||||
|
||||
import sys
|
||||
from typing import Optional
|
||||
|
||||
import rich_click as click
|
||||
from rich import print as rprint
|
||||
@@ -41,10 +40,11 @@ from archivebox.cli.cli_utils import apply_filters
|
||||
# CREATE
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def create_binary(
|
||||
name: str,
|
||||
abspath: str,
|
||||
version: str = '',
|
||||
version: str = "",
|
||||
) -> int:
|
||||
"""
|
||||
Create/register a Binary.
|
||||
@@ -59,7 +59,7 @@ def create_binary(
|
||||
is_tty = sys.stdout.isatty()
|
||||
|
||||
if not name or not abspath:
|
||||
rprint('[red]Both --name and --abspath are required[/red]', file=sys.stderr)
|
||||
rprint("[red]Both --name and --abspath are required[/red]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
try:
|
||||
@@ -76,28 +76,30 @@ def create_binary(
|
||||
# Mirror the Binary model lifecycle used elsewhere in the system so CLI
|
||||
# records are owned by the current machine and can be safely piped into
|
||||
# `archivebox run` without creating invalid rows missing machine_id.
|
||||
binary = Binary.from_json({
|
||||
'name': name,
|
||||
'abspath': abspath,
|
||||
'version': version,
|
||||
'binproviders': 'env',
|
||||
'binprovider': 'env',
|
||||
})
|
||||
binary = Binary.from_json(
|
||||
{
|
||||
"name": name,
|
||||
"abspath": abspath,
|
||||
"version": version,
|
||||
"binproviders": "env",
|
||||
"binprovider": "env",
|
||||
},
|
||||
)
|
||||
if binary is None:
|
||||
raise ValueError('failed to create binary record')
|
||||
raise ValueError("failed to create binary record")
|
||||
|
||||
if not is_tty:
|
||||
write_record(binary.to_json())
|
||||
|
||||
if created:
|
||||
rprint(f'[green]Created binary: {name} at {abspath}[/green]', file=sys.stderr)
|
||||
rprint(f"[green]Created binary: {name} at {abspath}[/green]", file=sys.stderr)
|
||||
else:
|
||||
rprint(f'[dim]Binary already exists: {name} at {abspath}[/dim]', file=sys.stderr)
|
||||
rprint(f"[dim]Binary already exists: {name} at {abspath}[/dim]", file=sys.stderr)
|
||||
|
||||
return 0
|
||||
|
||||
except Exception as e:
|
||||
rprint(f'[red]Error creating binary: {e}[/red]', file=sys.stderr)
|
||||
rprint(f"[red]Error creating binary: {e}[/red]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
|
||||
@@ -105,11 +107,12 @@ def create_binary(
|
||||
# LIST
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def list_binaries(
|
||||
name: Optional[str] = None,
|
||||
abspath__icontains: Optional[str] = None,
|
||||
version__icontains: Optional[str] = None,
|
||||
limit: Optional[int] = None,
|
||||
name: str | None = None,
|
||||
abspath__icontains: str | None = None,
|
||||
version__icontains: str | None = None,
|
||||
limit: int | None = None,
|
||||
) -> int:
|
||||
"""
|
||||
List Binaries as JSONL with optional filters.
|
||||
@@ -122,25 +125,25 @@ def list_binaries(
|
||||
|
||||
is_tty = sys.stdout.isatty()
|
||||
|
||||
queryset = Binary.objects.all().order_by('name', '-modified_at', '-created_at')
|
||||
queryset = Binary.objects.all().order_by("name", "-modified_at", "-created_at")
|
||||
|
||||
# Apply filters
|
||||
filter_kwargs = {
|
||||
'name': name,
|
||||
'abspath__icontains': abspath__icontains,
|
||||
'version__icontains': version__icontains,
|
||||
"name": name,
|
||||
"abspath__icontains": abspath__icontains,
|
||||
"version__icontains": version__icontains,
|
||||
}
|
||||
queryset = apply_filters(queryset, filter_kwargs, limit=limit)
|
||||
|
||||
count = 0
|
||||
for binary in queryset:
|
||||
if is_tty:
|
||||
rprint(f'[cyan]{binary.name:20}[/cyan] [dim]{binary.version:15}[/dim] {binary.abspath}')
|
||||
rprint(f"[cyan]{binary.name:20}[/cyan] [dim]{binary.version:15}[/dim] {binary.abspath}")
|
||||
else:
|
||||
write_record(binary.to_json())
|
||||
count += 1
|
||||
|
||||
rprint(f'[dim]Listed {count} binaries[/dim]', file=sys.stderr)
|
||||
rprint(f"[dim]Listed {count} binaries[/dim]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -148,9 +151,10 @@ def list_binaries(
|
||||
# UPDATE
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def update_binaries(
|
||||
version: Optional[str] = None,
|
||||
abspath: Optional[str] = None,
|
||||
version: str | None = None,
|
||||
abspath: str | None = None,
|
||||
) -> int:
|
||||
"""
|
||||
Update Binaries from stdin JSONL.
|
||||
@@ -169,12 +173,12 @@ def update_binaries(
|
||||
|
||||
records = list(read_stdin())
|
||||
if not records:
|
||||
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
updated_count = 0
|
||||
for record in records:
|
||||
binary_id = record.get('id')
|
||||
binary_id = record.get("id")
|
||||
if not binary_id:
|
||||
continue
|
||||
|
||||
@@ -194,10 +198,10 @@ def update_binaries(
|
||||
write_record(binary.to_json())
|
||||
|
||||
except Binary.DoesNotExist:
|
||||
rprint(f'[yellow]Binary not found: {binary_id}[/yellow]', file=sys.stderr)
|
||||
rprint(f"[yellow]Binary not found: {binary_id}[/yellow]", file=sys.stderr)
|
||||
continue
|
||||
|
||||
rprint(f'[green]Updated {updated_count} binaries[/green]', file=sys.stderr)
|
||||
rprint(f"[green]Updated {updated_count} binaries[/green]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -205,6 +209,7 @@ def update_binaries(
|
||||
# DELETE
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def delete_binaries(yes: bool = False, dry_run: bool = False) -> int:
|
||||
"""
|
||||
Delete Binaries from stdin JSONL.
|
||||
@@ -220,35 +225,35 @@ def delete_binaries(yes: bool = False, dry_run: bool = False) -> int:
|
||||
|
||||
records = list(read_stdin())
|
||||
if not records:
|
||||
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
binary_ids = [r.get('id') for r in records if r.get('id')]
|
||||
binary_ids = [r.get("id") for r in records if r.get("id")]
|
||||
|
||||
if not binary_ids:
|
||||
rprint('[yellow]No valid binary IDs in input[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No valid binary IDs in input[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
binaries = Binary.objects.filter(id__in=binary_ids)
|
||||
count = binaries.count()
|
||||
|
||||
if count == 0:
|
||||
rprint('[yellow]No matching binaries found[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No matching binaries found[/yellow]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
if dry_run:
|
||||
rprint(f'[yellow]Would delete {count} binaries (dry run)[/yellow]', file=sys.stderr)
|
||||
rprint(f"[yellow]Would delete {count} binaries (dry run)[/yellow]", file=sys.stderr)
|
||||
for binary in binaries:
|
||||
rprint(f' {binary.name} {binary.abspath}', file=sys.stderr)
|
||||
rprint(f" {binary.name} {binary.abspath}", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
if not yes:
|
||||
rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr)
|
||||
rprint("[red]Use --yes to confirm deletion[/red]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Perform deletion
|
||||
deleted_count, _ = binaries.delete()
|
||||
rprint(f'[green]Deleted {deleted_count} binaries[/green]', file=sys.stderr)
|
||||
rprint(f"[green]Deleted {deleted_count} binaries[/green]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -256,52 +261,59 @@ def delete_binaries(yes: bool = False, dry_run: bool = False) -> int:
|
||||
# CLI Commands
|
||||
# =============================================================================
|
||||
|
||||
|
||||
@click.group()
|
||||
def main():
|
||||
"""Manage Binary records (detected executables)."""
|
||||
pass
|
||||
|
||||
|
||||
@main.command('create')
|
||||
@click.option('--name', '-n', required=True, help='Binary name (e.g., chrome, wget)')
|
||||
@click.option('--abspath', '-p', required=True, help='Absolute path to binary')
|
||||
@click.option('--version', '-v', default='', help='Binary version')
|
||||
@main.command("create")
|
||||
@click.option("--name", "-n", required=True, help="Binary name (e.g., chrome, wget)")
|
||||
@click.option("--abspath", "-p", required=True, help="Absolute path to binary")
|
||||
@click.option("--version", "-v", default="", help="Binary version")
|
||||
def create_cmd(name: str, abspath: str, version: str):
|
||||
"""Create/register a Binary."""
|
||||
sys.exit(create_binary(name=name, abspath=abspath, version=version))
|
||||
|
||||
|
||||
@main.command('list')
|
||||
@click.option('--name', '-n', help='Filter by name')
|
||||
@click.option('--abspath__icontains', help='Filter by path contains')
|
||||
@click.option('--version__icontains', help='Filter by version contains')
|
||||
@click.option('--limit', type=int, help='Limit number of results')
|
||||
def list_cmd(name: Optional[str], abspath__icontains: Optional[str],
|
||||
version__icontains: Optional[str], limit: Optional[int]):
|
||||
@main.command("list")
|
||||
@click.option("--name", "-n", help="Filter by name")
|
||||
@click.option("--abspath__icontains", help="Filter by path contains")
|
||||
@click.option("--version__icontains", help="Filter by version contains")
|
||||
@click.option("--limit", type=int, help="Limit number of results")
|
||||
def list_cmd(
|
||||
name: str | None,
|
||||
abspath__icontains: str | None,
|
||||
version__icontains: str | None,
|
||||
limit: int | None,
|
||||
):
|
||||
"""List Binaries as JSONL."""
|
||||
sys.exit(list_binaries(
|
||||
name=name,
|
||||
abspath__icontains=abspath__icontains,
|
||||
version__icontains=version__icontains,
|
||||
limit=limit,
|
||||
))
|
||||
sys.exit(
|
||||
list_binaries(
|
||||
name=name,
|
||||
abspath__icontains=abspath__icontains,
|
||||
version__icontains=version__icontains,
|
||||
limit=limit,
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@main.command('update')
|
||||
@click.option('--version', '-v', help='Set version')
|
||||
@click.option('--abspath', '-p', help='Set path')
|
||||
def update_cmd(version: Optional[str], abspath: Optional[str]):
|
||||
@main.command("update")
|
||||
@click.option("--version", "-v", help="Set version")
|
||||
@click.option("--abspath", "-p", help="Set path")
|
||||
def update_cmd(version: str | None, abspath: str | None):
|
||||
"""Update Binaries from stdin JSONL."""
|
||||
sys.exit(update_binaries(version=version, abspath=abspath))
|
||||
|
||||
|
||||
@main.command('delete')
|
||||
@click.option('--yes', '-y', is_flag=True, help='Confirm deletion')
|
||||
@click.option('--dry-run', is_flag=True, help='Show what would be deleted')
|
||||
@main.command("delete")
|
||||
@click.option("--yes", "-y", is_flag=True, help="Confirm deletion")
|
||||
@click.option("--dry-run", is_flag=True, help="Show what would be deleted")
|
||||
def delete_cmd(yes: bool, dry_run: bool):
|
||||
"""Delete Binaries from stdin JSONL."""
|
||||
sys.exit(delete_binaries(yes=yes, dry_run=dry_run))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__package__ = "archivebox.cli"
|
||||
|
||||
import sys
|
||||
import rich_click as click
|
||||
@@ -12,12 +12,14 @@ from archivebox.misc.toml_util import CustomTOMLEncoder
|
||||
|
||||
|
||||
@enforce_types
|
||||
def config(*keys,
|
||||
get: bool=False,
|
||||
set: bool=False,
|
||||
search: bool=False,
|
||||
reset: bool=False,
|
||||
**kwargs) -> None:
|
||||
def config(
|
||||
*keys,
|
||||
get: bool = False,
|
||||
set: bool = False,
|
||||
search: bool = False,
|
||||
reset: bool = False,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
"""Get and set your ArchiveBox project configuration values"""
|
||||
|
||||
from archivebox.misc.checks import check_data_folder
|
||||
@@ -29,8 +31,8 @@ def config(*keys,
|
||||
|
||||
FLAT_CONFIG = get_flat_config()
|
||||
CONFIGS = get_all_configs()
|
||||
|
||||
config_options: list[str] = list(kwargs.pop('key=value', []) or keys or [f'{key}={val}' for key, val in kwargs.items()])
|
||||
|
||||
config_options: list[str] = list(kwargs.pop("key=value", []) or keys or [f"{key}={val}" for key, val in kwargs.items()])
|
||||
no_args = not (get or set or reset or config_options)
|
||||
|
||||
matching_config = {}
|
||||
@@ -39,19 +41,19 @@ def config(*keys,
|
||||
config_options = [get_real_name(key) for key in config_options]
|
||||
matching_config = {key: FLAT_CONFIG[key] for key in config_options if key in FLAT_CONFIG}
|
||||
for config_section in CONFIGS.values():
|
||||
aliases = getattr(config_section, 'aliases', {})
|
||||
|
||||
aliases = getattr(config_section, "aliases", {})
|
||||
|
||||
for search_key in config_options:
|
||||
# search all aliases in the section
|
||||
for alias_key, key in aliases.items():
|
||||
if search_key.lower() in alias_key.lower():
|
||||
matching_config[key] = dict(config_section)[key]
|
||||
|
||||
|
||||
# search all keys and values in the section
|
||||
for existing_key, value in dict(config_section).items():
|
||||
if search_key.lower() in existing_key.lower() or search_key.lower() in str(value).lower():
|
||||
matching_config[existing_key] = value
|
||||
|
||||
|
||||
print(printable_config(matching_config))
|
||||
raise SystemExit(not matching_config)
|
||||
|
||||
@@ -61,23 +63,23 @@ def config(*keys,
|
||||
matching_config = {key: FLAT_CONFIG[key] for key in config_options if key in FLAT_CONFIG}
|
||||
failed_config = [key for key in config_options if key not in FLAT_CONFIG]
|
||||
if failed_config:
|
||||
print('\n[red][X] These options failed to get[/red]')
|
||||
print(' {}'.format('\n '.join(config_options)))
|
||||
print("\n[red][X] These options failed to get[/red]")
|
||||
print(" {}".format("\n ".join(config_options)))
|
||||
raise SystemExit(1)
|
||||
else:
|
||||
matching_config = FLAT_CONFIG
|
||||
|
||||
# Display core config sections
|
||||
for config_section in CONFIGS.values():
|
||||
section_header = getattr(config_section, 'toml_section_header', '')
|
||||
section_header = getattr(config_section, "toml_section_header", "")
|
||||
if isinstance(section_header, str) and section_header:
|
||||
print(f'[grey53]\\[{section_header}][/grey53]')
|
||||
print(f"[grey53]\\[{section_header}][/grey53]")
|
||||
else:
|
||||
print('[grey53]\\[CONSTANTS] # (read-only)[/grey53]')
|
||||
print("[grey53]\\[CONSTANTS] # (read-only)[/grey53]")
|
||||
|
||||
kv_in_section = {key: val for key, val in dict(config_section).items() if key in matching_config}
|
||||
print(benedict(kv_in_section).to_toml(encoder=CustomTOMLEncoder()).strip().replace('\n\n', '\n'))
|
||||
print('[grey53]################################################################[/grey53]')
|
||||
print(benedict(kv_in_section).to_toml(encoder=CustomTOMLEncoder()).strip().replace("\n\n", "\n"))
|
||||
print("[grey53]################################################################[/grey53]")
|
||||
|
||||
# Display plugin config section
|
||||
from archivebox.hooks import discover_plugin_configs
|
||||
@@ -87,17 +89,17 @@ def config(*keys,
|
||||
|
||||
# Collect all plugin config keys
|
||||
for plugin_name, schema in plugin_configs.items():
|
||||
if 'properties' not in schema:
|
||||
if "properties" not in schema:
|
||||
continue
|
||||
for key in schema['properties'].keys():
|
||||
for key in schema["properties"].keys():
|
||||
if key in matching_config:
|
||||
plugin_keys[key] = matching_config[key]
|
||||
|
||||
# Display all plugin config in single [PLUGINS] section
|
||||
if plugin_keys:
|
||||
print('[grey53]\\[PLUGINS][/grey53]')
|
||||
print(benedict(plugin_keys).to_toml(encoder=CustomTOMLEncoder()).strip().replace('\n\n', '\n'))
|
||||
print('[grey53]################################################################[/grey53]')
|
||||
print("[grey53]\\[PLUGINS][/grey53]")
|
||||
print(benedict(plugin_keys).to_toml(encoder=CustomTOMLEncoder()).strip().replace("\n\n", "\n"))
|
||||
print("[grey53]################################################################[/grey53]")
|
||||
|
||||
raise SystemExit(not matching_config)
|
||||
|
||||
@@ -105,18 +107,20 @@ def config(*keys,
|
||||
new_config = {}
|
||||
failed_options = []
|
||||
for line in config_options:
|
||||
if line.startswith('#') or not line.strip():
|
||||
if line.startswith("#") or not line.strip():
|
||||
continue
|
||||
if '=' not in line:
|
||||
print('[red][X] Config KEY=VALUE must have an = sign in it[/red]')
|
||||
print(f' {line}')
|
||||
if "=" not in line:
|
||||
print("[red][X] Config KEY=VALUE must have an = sign in it[/red]")
|
||||
print(f" {line}")
|
||||
raise SystemExit(2)
|
||||
|
||||
raw_key, val = line.split('=', 1)
|
||||
raw_key, val = line.split("=", 1)
|
||||
raw_key = raw_key.upper().strip()
|
||||
key = get_real_name(raw_key)
|
||||
if key != raw_key:
|
||||
print(f'[yellow][i] Note: The config option {raw_key} has been renamed to {key}, please use the new name going forwards.[/yellow]')
|
||||
print(
|
||||
f"[yellow][i] Note: The config option {raw_key} has been renamed to {key}, please use the new name going forwards.[/yellow]",
|
||||
)
|
||||
|
||||
if key in FLAT_CONFIG:
|
||||
new_config[key] = val.strip()
|
||||
@@ -136,38 +140,38 @@ def config(*keys,
|
||||
|
||||
if side_effect_changes:
|
||||
print(file=sys.stderr)
|
||||
print('[yellow][i] Note: This change also affected these other options that depended on it:[/yellow]', file=sys.stderr)
|
||||
print(' {}'.format(printable_config(side_effect_changes, prefix=' ')), file=sys.stderr)
|
||||
print("[yellow][i] Note: This change also affected these other options that depended on it:[/yellow]", file=sys.stderr)
|
||||
print(" {}".format(printable_config(side_effect_changes, prefix=" ")), file=sys.stderr)
|
||||
|
||||
if failed_options:
|
||||
print()
|
||||
print('[red][X] These options failed to set (check for typos):[/red]')
|
||||
print(' {}'.format('\n '.join(failed_options)))
|
||||
print("[red][X] These options failed to set (check for typos):[/red]")
|
||||
print(" {}".format("\n ".join(failed_options)))
|
||||
raise SystemExit(1)
|
||||
|
||||
elif reset:
|
||||
print('[red][X] This command is not implemented yet.[/red]')
|
||||
print(' Please manually remove the relevant lines from your config file:')
|
||||
print("[red][X] This command is not implemented yet.[/red]")
|
||||
print(" Please manually remove the relevant lines from your config file:")
|
||||
raise SystemExit(2)
|
||||
|
||||
else:
|
||||
print('[red][X] You must pass either --get or --set, or no arguments to get the whole config.[/red]')
|
||||
print(' archivebox config')
|
||||
print(' archivebox config --get SOME_KEY')
|
||||
print(' archivebox config --set SOME_KEY=SOME_VALUE')
|
||||
print("[red][X] You must pass either --get or --set, or no arguments to get the whole config.[/red]")
|
||||
print(" archivebox config")
|
||||
print(" archivebox config --get SOME_KEY")
|
||||
print(" archivebox config --set SOME_KEY=SOME_VALUE")
|
||||
raise SystemExit(2)
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option('--search', is_flag=True, help='Search config KEYs, VALUEs, and ALIASES for the given term')
|
||||
@click.option('--get', is_flag=True, help='Get the value for the given config KEYs')
|
||||
@click.option('--set', is_flag=True, help='Set the given KEY=VALUE config values')
|
||||
@click.option('--reset', is_flag=True, help='Reset the given KEY config values to their defaults')
|
||||
@click.argument('KEY=VALUE', nargs=-1, type=str)
|
||||
@click.option("--search", is_flag=True, help="Search config KEYs, VALUEs, and ALIASES for the given term")
|
||||
@click.option("--get", is_flag=True, help="Get the value for the given config KEYs")
|
||||
@click.option("--set", is_flag=True, help="Set the given KEY=VALUE config values")
|
||||
@click.option("--reset", is_flag=True, help="Reset the given KEY config values to their defaults")
|
||||
@click.argument("KEY=VALUE", nargs=-1, type=str)
|
||||
@docstring(config.__doc__)
|
||||
def main(**kwargs) -> None:
|
||||
config(**kwargs)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -30,11 +30,11 @@ Examples:
|
||||
archivebox crawl create https://example.com | archivebox snapshot create | archivebox run
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox crawl'
|
||||
__package__ = "archivebox.cli"
|
||||
__command__ = "archivebox crawl"
|
||||
|
||||
import sys
|
||||
from typing import Optional, Iterable
|
||||
from collections.abc import Iterable
|
||||
|
||||
import rich_click as click
|
||||
from rich import print as rprint
|
||||
@@ -46,12 +46,13 @@ from archivebox.cli.cli_utils import apply_filters
|
||||
# CREATE
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def create_crawl(
|
||||
urls: Iterable[str],
|
||||
depth: int = 0,
|
||||
tag: str = '',
|
||||
status: str = 'queued',
|
||||
created_by_id: Optional[int] = None,
|
||||
tag: str = "",
|
||||
status: str = "queued",
|
||||
created_by_id: int | None = None,
|
||||
) -> int:
|
||||
"""
|
||||
Create a Crawl job from URLs.
|
||||
@@ -74,7 +75,7 @@ def create_crawl(
|
||||
records = list(read_args_or_stdin(urls))
|
||||
|
||||
if not records:
|
||||
rprint('[yellow]No URLs provided. Pass URLs as arguments or via stdin.[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No URLs provided. Pass URLs as arguments or via stdin.[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Separate pass-through records from URL records
|
||||
@@ -82,29 +83,29 @@ def create_crawl(
|
||||
pass_through_records = []
|
||||
|
||||
for record in records:
|
||||
record_type = record.get('type', '')
|
||||
record_type = record.get("type", "")
|
||||
|
||||
# Pass-through: output records that aren't URL/Crawl types
|
||||
if record_type and record_type != TYPE_CRAWL and not record.get('url') and not record.get('urls'):
|
||||
if record_type and record_type != TYPE_CRAWL and not record.get("url") and not record.get("urls"):
|
||||
pass_through_records.append(record)
|
||||
continue
|
||||
|
||||
# Handle existing Crawl records (just pass through with id)
|
||||
if record_type == TYPE_CRAWL and record.get('id'):
|
||||
if record_type == TYPE_CRAWL and record.get("id"):
|
||||
pass_through_records.append(record)
|
||||
continue
|
||||
|
||||
# Collect URLs
|
||||
url = record.get('url')
|
||||
url = record.get("url")
|
||||
if url:
|
||||
url_list.append(url)
|
||||
|
||||
# Handle 'urls' field (newline-separated)
|
||||
urls_field = record.get('urls')
|
||||
urls_field = record.get("urls")
|
||||
if urls_field:
|
||||
for line in urls_field.split('\n'):
|
||||
for line in urls_field.split("\n"):
|
||||
line = line.strip()
|
||||
if line and not line.startswith('#'):
|
||||
if line and not line.startswith("#"):
|
||||
url_list.append(line)
|
||||
|
||||
# Output pass-through records first
|
||||
@@ -115,44 +116,44 @@ def create_crawl(
|
||||
if not url_list:
|
||||
if pass_through_records:
|
||||
# If we had pass-through records but no URLs, that's OK
|
||||
rprint(f'[dim]Passed through {len(pass_through_records)} records, no new URLs[/dim]', file=sys.stderr)
|
||||
rprint(f"[dim]Passed through {len(pass_through_records)} records, no new URLs[/dim]", file=sys.stderr)
|
||||
return 0
|
||||
rprint('[red]No valid URLs found[/red]', file=sys.stderr)
|
||||
rprint("[red]No valid URLs found[/red]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
try:
|
||||
# Build crawl record with all URLs as newline-separated string
|
||||
crawl_record = {
|
||||
'urls': '\n'.join(url_list),
|
||||
'max_depth': depth,
|
||||
'tags_str': tag,
|
||||
'status': status,
|
||||
'label': '',
|
||||
"urls": "\n".join(url_list),
|
||||
"max_depth": depth,
|
||||
"tags_str": tag,
|
||||
"status": status,
|
||||
"label": "",
|
||||
}
|
||||
|
||||
crawl = Crawl.from_json(crawl_record, overrides={'created_by_id': created_by_id})
|
||||
crawl = Crawl.from_json(crawl_record, overrides={"created_by_id": created_by_id})
|
||||
if not crawl:
|
||||
rprint('[red]Failed to create crawl[/red]', file=sys.stderr)
|
||||
rprint("[red]Failed to create crawl[/red]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Output JSONL record (only when piped)
|
||||
if not is_tty:
|
||||
write_record(crawl.to_json())
|
||||
|
||||
rprint(f'[green]Created crawl with {len(url_list)} URLs[/green]', file=sys.stderr)
|
||||
rprint(f"[green]Created crawl with {len(url_list)} URLs[/green]", file=sys.stderr)
|
||||
|
||||
# If TTY, show human-readable output
|
||||
if is_tty:
|
||||
rprint(f' [dim]{crawl.id}[/dim]', file=sys.stderr)
|
||||
rprint(f" [dim]{crawl.id}[/dim]", file=sys.stderr)
|
||||
for url in url_list[:5]: # Show first 5 URLs
|
||||
rprint(f' {url[:70]}', file=sys.stderr)
|
||||
rprint(f" {url[:70]}", file=sys.stderr)
|
||||
if len(url_list) > 5:
|
||||
rprint(f' ... and {len(url_list) - 5} more', file=sys.stderr)
|
||||
rprint(f" ... and {len(url_list) - 5} more", file=sys.stderr)
|
||||
|
||||
return 0
|
||||
|
||||
except Exception as e:
|
||||
rprint(f'[red]Error creating crawl: {e}[/red]', file=sys.stderr)
|
||||
rprint(f"[red]Error creating crawl: {e}[/red]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
|
||||
@@ -160,11 +161,12 @@ def create_crawl(
|
||||
# LIST
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def list_crawls(
|
||||
status: Optional[str] = None,
|
||||
urls__icontains: Optional[str] = None,
|
||||
max_depth: Optional[int] = None,
|
||||
limit: Optional[int] = None,
|
||||
status: str | None = None,
|
||||
urls__icontains: str | None = None,
|
||||
max_depth: int | None = None,
|
||||
limit: int | None = None,
|
||||
) -> int:
|
||||
"""
|
||||
List Crawls as JSONL with optional filters.
|
||||
@@ -177,13 +179,13 @@ def list_crawls(
|
||||
|
||||
is_tty = sys.stdout.isatty()
|
||||
|
||||
queryset = Crawl.objects.all().order_by('-created_at')
|
||||
queryset = Crawl.objects.all().order_by("-created_at")
|
||||
|
||||
# Apply filters
|
||||
filter_kwargs = {
|
||||
'status': status,
|
||||
'urls__icontains': urls__icontains,
|
||||
'max_depth': max_depth,
|
||||
"status": status,
|
||||
"urls__icontains": urls__icontains,
|
||||
"max_depth": max_depth,
|
||||
}
|
||||
queryset = apply_filters(queryset, filter_kwargs, limit=limit)
|
||||
|
||||
@@ -191,17 +193,17 @@ def list_crawls(
|
||||
for crawl in queryset:
|
||||
if is_tty:
|
||||
status_color = {
|
||||
'queued': 'yellow',
|
||||
'started': 'blue',
|
||||
'sealed': 'green',
|
||||
}.get(crawl.status, 'dim')
|
||||
url_preview = crawl.urls[:50].replace('\n', ' ')
|
||||
rprint(f'[{status_color}]{crawl.status:8}[/{status_color}] [dim]{crawl.id}[/dim] {url_preview}...')
|
||||
"queued": "yellow",
|
||||
"started": "blue",
|
||||
"sealed": "green",
|
||||
}.get(crawl.status, "dim")
|
||||
url_preview = crawl.urls[:50].replace("\n", " ")
|
||||
rprint(f"[{status_color}]{crawl.status:8}[/{status_color}] [dim]{crawl.id}[/dim] {url_preview}...")
|
||||
else:
|
||||
write_record(crawl.to_json())
|
||||
count += 1
|
||||
|
||||
rprint(f'[dim]Listed {count} crawls[/dim]', file=sys.stderr)
|
||||
rprint(f"[dim]Listed {count} crawls[/dim]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -209,9 +211,10 @@ def list_crawls(
|
||||
# UPDATE
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def update_crawls(
|
||||
status: Optional[str] = None,
|
||||
max_depth: Optional[int] = None,
|
||||
status: str | None = None,
|
||||
max_depth: int | None = None,
|
||||
) -> int:
|
||||
"""
|
||||
Update Crawls from stdin JSONL.
|
||||
@@ -232,12 +235,12 @@ def update_crawls(
|
||||
|
||||
records = list(read_stdin())
|
||||
if not records:
|
||||
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
updated_count = 0
|
||||
for record in records:
|
||||
crawl_id = record.get('id')
|
||||
crawl_id = record.get("id")
|
||||
if not crawl_id:
|
||||
continue
|
||||
|
||||
@@ -258,10 +261,10 @@ def update_crawls(
|
||||
write_record(crawl.to_json())
|
||||
|
||||
except Crawl.DoesNotExist:
|
||||
rprint(f'[yellow]Crawl not found: {crawl_id}[/yellow]', file=sys.stderr)
|
||||
rprint(f"[yellow]Crawl not found: {crawl_id}[/yellow]", file=sys.stderr)
|
||||
continue
|
||||
|
||||
rprint(f'[green]Updated {updated_count} crawls[/green]', file=sys.stderr)
|
||||
rprint(f"[green]Updated {updated_count} crawls[/green]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -269,6 +272,7 @@ def update_crawls(
|
||||
# DELETE
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def delete_crawls(yes: bool = False, dry_run: bool = False) -> int:
|
||||
"""
|
||||
Delete Crawls from stdin JSONL.
|
||||
@@ -284,36 +288,36 @@ def delete_crawls(yes: bool = False, dry_run: bool = False) -> int:
|
||||
|
||||
records = list(read_stdin())
|
||||
if not records:
|
||||
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
crawl_ids = [r.get('id') for r in records if r.get('id')]
|
||||
crawl_ids = [r.get("id") for r in records if r.get("id")]
|
||||
|
||||
if not crawl_ids:
|
||||
rprint('[yellow]No valid crawl IDs in input[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No valid crawl IDs in input[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
crawls = Crawl.objects.filter(id__in=crawl_ids)
|
||||
count = crawls.count()
|
||||
|
||||
if count == 0:
|
||||
rprint('[yellow]No matching crawls found[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No matching crawls found[/yellow]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
if dry_run:
|
||||
rprint(f'[yellow]Would delete {count} crawls (dry run)[/yellow]', file=sys.stderr)
|
||||
rprint(f"[yellow]Would delete {count} crawls (dry run)[/yellow]", file=sys.stderr)
|
||||
for crawl in crawls:
|
||||
url_preview = crawl.urls[:50].replace('\n', ' ')
|
||||
rprint(f' [dim]{crawl.id}[/dim] {url_preview}...', file=sys.stderr)
|
||||
url_preview = crawl.urls[:50].replace("\n", " ")
|
||||
rprint(f" [dim]{crawl.id}[/dim] {url_preview}...", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
if not yes:
|
||||
rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr)
|
||||
rprint("[red]Use --yes to confirm deletion[/red]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Perform deletion
|
||||
deleted_count, _ = crawls.delete()
|
||||
rprint(f'[green]Deleted {deleted_count} crawls[/green]', file=sys.stderr)
|
||||
rprint(f"[green]Deleted {deleted_count} crawls[/green]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -321,53 +325,60 @@ def delete_crawls(yes: bool = False, dry_run: bool = False) -> int:
|
||||
# CLI Commands
|
||||
# =============================================================================
|
||||
|
||||
|
||||
@click.group()
|
||||
def main():
|
||||
"""Manage Crawl records."""
|
||||
pass
|
||||
|
||||
|
||||
@main.command('create')
|
||||
@click.argument('urls', nargs=-1)
|
||||
@click.option('--depth', '-d', type=int, default=0, help='Max crawl depth (default: 0)')
|
||||
@click.option('--tag', '-t', default='', help='Comma-separated tags to add')
|
||||
@click.option('--status', '-s', default='queued', help='Initial status (default: queued)')
|
||||
@main.command("create")
|
||||
@click.argument("urls", nargs=-1)
|
||||
@click.option("--depth", "-d", type=int, default=0, help="Max crawl depth (default: 0)")
|
||||
@click.option("--tag", "-t", default="", help="Comma-separated tags to add")
|
||||
@click.option("--status", "-s", default="queued", help="Initial status (default: queued)")
|
||||
def create_cmd(urls: tuple, depth: int, tag: str, status: str):
|
||||
"""Create a Crawl job from URLs or stdin."""
|
||||
sys.exit(create_crawl(urls, depth=depth, tag=tag, status=status))
|
||||
|
||||
|
||||
@main.command('list')
|
||||
@click.option('--status', '-s', help='Filter by status (queued, started, sealed)')
|
||||
@click.option('--urls__icontains', help='Filter by URLs contains')
|
||||
@click.option('--max-depth', type=int, help='Filter by max depth')
|
||||
@click.option('--limit', '-n', type=int, help='Limit number of results')
|
||||
def list_cmd(status: Optional[str], urls__icontains: Optional[str],
|
||||
max_depth: Optional[int], limit: Optional[int]):
|
||||
@main.command("list")
|
||||
@click.option("--status", "-s", help="Filter by status (queued, started, sealed)")
|
||||
@click.option("--urls__icontains", help="Filter by URLs contains")
|
||||
@click.option("--max-depth", type=int, help="Filter by max depth")
|
||||
@click.option("--limit", "-n", type=int, help="Limit number of results")
|
||||
def list_cmd(
|
||||
status: str | None,
|
||||
urls__icontains: str | None,
|
||||
max_depth: int | None,
|
||||
limit: int | None,
|
||||
):
|
||||
"""List Crawls as JSONL."""
|
||||
sys.exit(list_crawls(
|
||||
status=status,
|
||||
urls__icontains=urls__icontains,
|
||||
max_depth=max_depth,
|
||||
limit=limit,
|
||||
))
|
||||
sys.exit(
|
||||
list_crawls(
|
||||
status=status,
|
||||
urls__icontains=urls__icontains,
|
||||
max_depth=max_depth,
|
||||
limit=limit,
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@main.command('update')
|
||||
@click.option('--status', '-s', help='Set status')
|
||||
@click.option('--max-depth', type=int, help='Set max depth')
|
||||
def update_cmd(status: Optional[str], max_depth: Optional[int]):
|
||||
@main.command("update")
|
||||
@click.option("--status", "-s", help="Set status")
|
||||
@click.option("--max-depth", type=int, help="Set max depth")
|
||||
def update_cmd(status: str | None, max_depth: int | None):
|
||||
"""Update Crawls from stdin JSONL."""
|
||||
sys.exit(update_crawls(status=status, max_depth=max_depth))
|
||||
|
||||
|
||||
@main.command('delete')
|
||||
@click.option('--yes', '-y', is_flag=True, help='Confirm deletion')
|
||||
@click.option('--dry-run', is_flag=True, help='Show what would be deleted')
|
||||
@main.command("delete")
|
||||
@click.option("--yes", "-y", is_flag=True, help="Confirm deletion")
|
||||
@click.option("--dry-run", is_flag=True, help="Show what would be deleted")
|
||||
def delete_cmd(yes: bool, dry_run: bool):
|
||||
"""Delete Crawls from stdin JSONL."""
|
||||
sys.exit(delete_crawls(yes=yes, dry_run=dry_run))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox crawl'
|
||||
__package__ = "archivebox.cli"
|
||||
__command__ = "archivebox crawl"
|
||||
|
||||
import sys
|
||||
|
||||
@@ -10,12 +10,12 @@ import rich_click as click
|
||||
from archivebox.cli.archivebox_add import add
|
||||
|
||||
|
||||
@click.command(context_settings={'ignore_unknown_options': True})
|
||||
@click.option('--depth', '-d', type=int, default=0, help='Max crawl depth (default: 0)')
|
||||
@click.option('--tag', '-t', default='', help='Comma-separated tags to add')
|
||||
@click.option('--status', '-s', default='queued', help='Initial status (default: queued)')
|
||||
@click.option('--wait/--no-wait', 'wait', default=True, help='Accepted for backwards compatibility')
|
||||
@click.argument('urls', nargs=-1)
|
||||
@click.command(context_settings={"ignore_unknown_options": True})
|
||||
@click.option("--depth", "-d", type=int, default=0, help="Max crawl depth (default: 0)")
|
||||
@click.option("--tag", "-t", default="", help="Comma-separated tags to add")
|
||||
@click.option("--status", "-s", default="queued", help="Initial status (default: queued)")
|
||||
@click.option("--wait/--no-wait", "wait", default=True, help="Accepted for backwards compatibility")
|
||||
@click.argument("urls", nargs=-1)
|
||||
def main(depth: int, tag: str, status: str, wait: bool, urls: tuple[str, ...]):
|
||||
"""Backwards-compatible `archivebox crawl URL...` entrypoint."""
|
||||
del status, wait
|
||||
@@ -23,5 +23,5 @@ def main(depth: int, tag: str, status: str, wait: bool, urls: tuple[str, ...]):
|
||||
sys.exit(0)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -27,8 +27,8 @@ Examples:
|
||||
archivebox crawl https://example.com | archivebox snapshot | archivebox extract
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox extract'
|
||||
__package__ = "archivebox.cli"
|
||||
__command__ = "archivebox extract"
|
||||
|
||||
import sys
|
||||
from collections import defaultdict
|
||||
@@ -52,51 +52,52 @@ def process_archiveresult_by_id(archiveresult_id: str) -> int:
|
||||
try:
|
||||
archiveresult = ArchiveResult.objects.get(id=archiveresult_id)
|
||||
except ArchiveResult.DoesNotExist:
|
||||
rprint(f'[red]ArchiveResult {archiveresult_id} not found[/red]', file=sys.stderr)
|
||||
rprint(f"[red]ArchiveResult {archiveresult_id} not found[/red]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
rprint(f'[blue]Extracting {archiveresult.plugin} for {archiveresult.snapshot.url}[/blue]', file=sys.stderr)
|
||||
rprint(f"[blue]Extracting {archiveresult.plugin} for {archiveresult.snapshot.url}[/blue]", file=sys.stderr)
|
||||
|
||||
try:
|
||||
archiveresult.reset_for_retry()
|
||||
snapshot = archiveresult.snapshot
|
||||
snapshot.status = snapshot.StatusChoices.QUEUED
|
||||
snapshot.retry_at = timezone.now()
|
||||
snapshot.save(update_fields=['status', 'retry_at', 'modified_at'])
|
||||
snapshot.save(update_fields=["status", "retry_at", "modified_at"])
|
||||
|
||||
crawl = snapshot.crawl
|
||||
if crawl.status != crawl.StatusChoices.STARTED:
|
||||
crawl.status = crawl.StatusChoices.QUEUED
|
||||
crawl.retry_at = timezone.now()
|
||||
crawl.save(update_fields=['status', 'retry_at', 'modified_at'])
|
||||
crawl.save(update_fields=["status", "retry_at", "modified_at"])
|
||||
|
||||
run_crawl(str(crawl.id), snapshot_ids=[str(snapshot.id)], selected_plugins=[archiveresult.plugin])
|
||||
archiveresult.refresh_from_db()
|
||||
|
||||
if archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED:
|
||||
print(f'[green]Extraction succeeded: {archiveresult.output_str}[/green]')
|
||||
print(f"[green]Extraction succeeded: {archiveresult.output_str}[/green]")
|
||||
return 0
|
||||
elif archiveresult.status == ArchiveResult.StatusChoices.NORESULTS:
|
||||
print(f'[dim]Extraction completed with no results: {archiveresult.output_str}[/dim]')
|
||||
print(f"[dim]Extraction completed with no results: {archiveresult.output_str}[/dim]")
|
||||
return 0
|
||||
elif archiveresult.status == ArchiveResult.StatusChoices.FAILED:
|
||||
print(f'[red]Extraction failed: {archiveresult.output_str}[/red]', file=sys.stderr)
|
||||
print(f"[red]Extraction failed: {archiveresult.output_str}[/red]", file=sys.stderr)
|
||||
return 1
|
||||
else:
|
||||
# Still in progress or backoff - not a failure
|
||||
print(f'[yellow]Extraction status: {archiveresult.status}[/yellow]')
|
||||
print(f"[yellow]Extraction status: {archiveresult.status}[/yellow]")
|
||||
return 0
|
||||
|
||||
except Exception as e:
|
||||
print(f'[red]Extraction error: {type(e).__name__}: {e}[/red]', file=sys.stderr)
|
||||
print(f"[red]Extraction error: {type(e).__name__}: {e}[/red]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
|
||||
def run_plugins(
|
||||
args: tuple,
|
||||
records: list[dict] | None = None,
|
||||
plugins: str = '',
|
||||
plugins: str = "",
|
||||
wait: bool = True,
|
||||
emit_results: bool = True,
|
||||
) -> int:
|
||||
"""
|
||||
Run plugins on Snapshots from input.
|
||||
@@ -111,16 +112,18 @@ def run_plugins(
|
||||
from django.utils import timezone
|
||||
|
||||
from archivebox.misc.jsonl import (
|
||||
read_args_or_stdin, write_record,
|
||||
TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
|
||||
read_args_or_stdin,
|
||||
write_record,
|
||||
TYPE_SNAPSHOT,
|
||||
TYPE_ARCHIVERESULT,
|
||||
)
|
||||
from archivebox.core.models import Snapshot, ArchiveResult
|
||||
from archivebox.core.models import Snapshot
|
||||
from archivebox.services.runner import run_crawl
|
||||
|
||||
is_tty = sys.stdout.isatty()
|
||||
|
||||
# Parse comma-separated plugins list once (reused in creation and filtering)
|
||||
plugins_list = [p.strip() for p in plugins.split(',') if p.strip()] if plugins else []
|
||||
plugins_list = [p.strip() for p in plugins.split(",") if p.strip()] if plugins else []
|
||||
|
||||
# Parse stdin/args exactly once per CLI invocation.
|
||||
# `main()` may already have consumed stdin to distinguish Snapshot input from
|
||||
@@ -130,41 +133,41 @@ def run_plugins(
|
||||
records = list(read_args_or_stdin(args))
|
||||
|
||||
if not records:
|
||||
rprint('[yellow]No snapshots provided. Pass snapshot IDs as arguments or via stdin.[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No snapshots provided. Pass snapshot IDs as arguments or via stdin.[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Gather snapshot IDs and optional plugin constraints to process
|
||||
snapshot_ids = set()
|
||||
requested_plugins_by_snapshot: dict[str, set[str]] = defaultdict(set)
|
||||
for record in records:
|
||||
record_type = record.get('type')
|
||||
record_type = record.get("type")
|
||||
|
||||
if record_type == TYPE_SNAPSHOT:
|
||||
snapshot_id = record.get('id')
|
||||
snapshot_id = record.get("id")
|
||||
if snapshot_id:
|
||||
snapshot_ids.add(snapshot_id)
|
||||
elif record.get('url'):
|
||||
elif record.get("url"):
|
||||
# Look up by URL (get most recent if multiple exist)
|
||||
snap = Snapshot.objects.filter(url=record['url']).order_by('-created_at').first()
|
||||
snap = Snapshot.objects.filter(url=record["url"]).order_by("-created_at").first()
|
||||
if snap:
|
||||
snapshot_ids.add(str(snap.id))
|
||||
else:
|
||||
rprint(f'[yellow]Snapshot not found for URL: {record["url"]}[/yellow]', file=sys.stderr)
|
||||
rprint(f"[yellow]Snapshot not found for URL: {record['url']}[/yellow]", file=sys.stderr)
|
||||
|
||||
elif record_type == TYPE_ARCHIVERESULT:
|
||||
snapshot_id = record.get('snapshot_id')
|
||||
snapshot_id = record.get("snapshot_id")
|
||||
if snapshot_id:
|
||||
snapshot_ids.add(snapshot_id)
|
||||
plugin_name = record.get('plugin')
|
||||
plugin_name = record.get("plugin")
|
||||
if plugin_name and not plugins_list:
|
||||
requested_plugins_by_snapshot[str(snapshot_id)].add(str(plugin_name))
|
||||
|
||||
elif 'id' in record:
|
||||
elif "id" in record:
|
||||
# Assume it's a snapshot ID
|
||||
snapshot_ids.add(record['id'])
|
||||
snapshot_ids.add(record["id"])
|
||||
|
||||
if not snapshot_ids:
|
||||
rprint('[red]No valid snapshot IDs found in input[/red]', file=sys.stderr)
|
||||
rprint("[red]No valid snapshot IDs found in input[/red]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Get snapshots and ensure they have pending ArchiveResults
|
||||
@@ -173,17 +176,13 @@ def run_plugins(
|
||||
try:
|
||||
snapshot = Snapshot.objects.get(id=snapshot_id)
|
||||
except Snapshot.DoesNotExist:
|
||||
rprint(f'[yellow]Snapshot {snapshot_id} not found[/yellow]', file=sys.stderr)
|
||||
rprint(f"[yellow]Snapshot {snapshot_id} not found[/yellow]", file=sys.stderr)
|
||||
continue
|
||||
|
||||
for plugin_name in requested_plugins_by_snapshot.get(str(snapshot.id), set()):
|
||||
existing_result = snapshot.archiveresult_set.filter(plugin=plugin_name).order_by('-created_at').first()
|
||||
if existing_result and existing_result.status in [
|
||||
ArchiveResult.StatusChoices.FAILED,
|
||||
ArchiveResult.StatusChoices.SKIPPED,
|
||||
ArchiveResult.StatusChoices.NORESULTS,
|
||||
ArchiveResult.StatusChoices.BACKOFF,
|
||||
]:
|
||||
requested_plugin_names = set(plugins_list) | requested_plugins_by_snapshot.get(str(snapshot.id), set())
|
||||
for plugin_name in requested_plugin_names:
|
||||
existing_result = snapshot.archiveresult_set.filter(plugin=plugin_name).order_by("-created_at").first()
|
||||
if existing_result:
|
||||
existing_result.reset_for_retry()
|
||||
|
||||
# Reset snapshot status to allow processing
|
||||
@@ -195,34 +194,39 @@ def run_plugins(
|
||||
processed_count += 1
|
||||
|
||||
if processed_count == 0:
|
||||
rprint('[red]No snapshots to process[/red]', file=sys.stderr)
|
||||
rprint("[red]No snapshots to process[/red]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
rprint(f'[blue]Queued {processed_count} snapshots for extraction[/blue]', file=sys.stderr)
|
||||
rprint(f"[blue]Queued {processed_count} snapshots for extraction[/blue]", file=sys.stderr)
|
||||
|
||||
# Run orchestrator if --wait (default)
|
||||
if wait:
|
||||
rprint('[blue]Running plugins...[/blue]', file=sys.stderr)
|
||||
rprint("[blue]Running plugins...[/blue]", file=sys.stderr)
|
||||
snapshot_ids_by_crawl: dict[str, set[str]] = defaultdict(set)
|
||||
for snapshot_id in snapshot_ids:
|
||||
try:
|
||||
snapshot = Snapshot.objects.only('id', 'crawl_id').get(id=snapshot_id)
|
||||
snapshot = Snapshot.objects.only("id", "crawl_id").get(id=snapshot_id)
|
||||
except Snapshot.DoesNotExist:
|
||||
continue
|
||||
snapshot_ids_by_crawl[str(snapshot.crawl_id)].add(str(snapshot.id))
|
||||
|
||||
for crawl_id, crawl_snapshot_ids in snapshot_ids_by_crawl.items():
|
||||
selected_plugins = plugins_list or sorted({
|
||||
plugin
|
||||
for snapshot_id in crawl_snapshot_ids
|
||||
for plugin in requested_plugins_by_snapshot.get(str(snapshot_id), set())
|
||||
}) or None
|
||||
selected_plugins = (
|
||||
plugins_list
|
||||
or sorted(
|
||||
{plugin for snapshot_id in crawl_snapshot_ids for plugin in requested_plugins_by_snapshot.get(str(snapshot_id), set())},
|
||||
)
|
||||
or None
|
||||
)
|
||||
run_crawl(
|
||||
crawl_id,
|
||||
snapshot_ids=sorted(crawl_snapshot_ids),
|
||||
selected_plugins=selected_plugins,
|
||||
)
|
||||
|
||||
if not emit_results:
|
||||
return 0
|
||||
|
||||
# Output results as JSONL (when piped) or human-readable (when TTY)
|
||||
for snapshot_id in snapshot_ids:
|
||||
try:
|
||||
@@ -234,11 +238,14 @@ def run_plugins(
|
||||
for result in results:
|
||||
if is_tty:
|
||||
status_color = {
|
||||
'succeeded': 'green',
|
||||
'failed': 'red',
|
||||
'skipped': 'yellow',
|
||||
}.get(result.status, 'dim')
|
||||
rprint(f' [{status_color}]{result.status}[/{status_color}] {result.plugin} → {result.output_str or ""}', file=sys.stderr)
|
||||
"succeeded": "green",
|
||||
"failed": "red",
|
||||
"skipped": "yellow",
|
||||
}.get(result.status, "dim")
|
||||
rprint(
|
||||
f" [{status_color}]{result.status}[/{status_color}] {result.plugin} → {result.output_str or ''}",
|
||||
file=sys.stderr,
|
||||
)
|
||||
else:
|
||||
write_record(result.to_json())
|
||||
except Snapshot.DoesNotExist:
|
||||
@@ -250,18 +257,20 @@ def run_plugins(
|
||||
def is_archiveresult_id(value: str) -> bool:
|
||||
"""Check if value looks like an ArchiveResult UUID."""
|
||||
import re
|
||||
uuid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.I)
|
||||
|
||||
uuid_pattern = re.compile(r"^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$", re.I)
|
||||
if not uuid_pattern.match(value):
|
||||
return False
|
||||
# Verify it's actually an ArchiveResult (not a Snapshot or other object)
|
||||
from archivebox.core.models import ArchiveResult
|
||||
|
||||
return ArchiveResult.objects.filter(id=value).exists()
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option('--plugins', '--plugin', '-p', default='', help='Comma-separated list of plugins to run (e.g., screenshot,singlefile)')
|
||||
@click.option('--wait/--no-wait', default=True, help='Wait for plugins to complete (default: wait)')
|
||||
@click.argument('args', nargs=-1)
|
||||
@click.option("--plugins", "--plugin", "-p", default="", help="Comma-separated list of plugins to run (e.g., screenshot,singlefile)")
|
||||
@click.option("--wait/--no-wait", default=True, help="Wait for plugins to complete (default: wait)")
|
||||
@click.argument("args", nargs=-1)
|
||||
def main(plugins: str, wait: bool, args: tuple):
|
||||
"""Run plugins on Snapshots, or process existing ArchiveResults by ID"""
|
||||
from archivebox.misc.jsonl import read_args_or_stdin
|
||||
@@ -271,14 +280,12 @@ def main(plugins: str, wait: bool, args: tuple):
|
||||
|
||||
if not records:
|
||||
from rich import print as rprint
|
||||
rprint('[yellow]No Snapshot IDs or ArchiveResult IDs provided. Pass as arguments or via stdin.[/yellow]', file=sys.stderr)
|
||||
|
||||
rprint("[yellow]No Snapshot IDs or ArchiveResult IDs provided. Pass as arguments or via stdin.[/yellow]", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
# Check if input looks like existing ArchiveResult IDs to process
|
||||
all_are_archiveresult_ids = all(
|
||||
is_archiveresult_id(r.get('id') or r.get('url', ''))
|
||||
for r in records
|
||||
)
|
||||
all_are_archiveresult_ids = all(is_archiveresult_id(r.get("id") or r.get("url", "")) for r in records)
|
||||
|
||||
if all_are_archiveresult_ids:
|
||||
# Process existing ArchiveResults by ID
|
||||
@@ -286,9 +293,9 @@ def main(plugins: str, wait: bool, args: tuple):
|
||||
|
||||
exit_code = 0
|
||||
for record in records:
|
||||
archiveresult_id = record.get('id') or record.get('url')
|
||||
archiveresult_id = record.get("id") or record.get("url")
|
||||
if not isinstance(archiveresult_id, str):
|
||||
rprint(f'[red]Invalid ArchiveResult input: {record}[/red]', file=sys.stderr)
|
||||
rprint(f"[red]Invalid ArchiveResult input: {record}[/red]", file=sys.stderr)
|
||||
exit_code = 1
|
||||
continue
|
||||
result = process_archiveresult_by_id(archiveresult_id)
|
||||
@@ -300,5 +307,5 @@ def main(plugins: str, wait: bool, args: tuple):
|
||||
sys.exit(run_plugins(args, records=records, plugins=plugins, wait=wait))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
#!/usr/bin/env python3
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox help'
|
||||
__package__ = "archivebox.cli"
|
||||
__command__ = "archivebox help"
|
||||
|
||||
import os
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import click
|
||||
@@ -17,33 +17,44 @@ def help() -> None:
|
||||
from archivebox.config import CONSTANTS
|
||||
from archivebox.config.permissions import IN_DOCKER
|
||||
from archivebox.misc.logging_util import log_cli_command
|
||||
|
||||
log_cli_command('help', [], None, '.')
|
||||
|
||||
COMMANDS_HELP_TEXT = '\n '.join(
|
||||
f'[green]{cmd.ljust(20)}[/green] {ArchiveBoxGroup._lazy_load(cmd).__doc__}'
|
||||
for cmd in ArchiveBoxGroup.meta_commands.keys()
|
||||
) + '\n\n ' + '\n '.join(
|
||||
f'[green]{cmd.ljust(20)}[/green] {ArchiveBoxGroup._lazy_load(cmd).__doc__}'
|
||||
for cmd in ArchiveBoxGroup.setup_commands.keys()
|
||||
) + '\n\n ' + '\n '.join(
|
||||
f'[green]{cmd.ljust(20)}[/green] {ArchiveBoxGroup._lazy_load(cmd).__doc__}'
|
||||
for cmd in ArchiveBoxGroup.archive_commands.keys()
|
||||
|
||||
log_cli_command("help", [], None, ".")
|
||||
|
||||
COMMANDS_HELP_TEXT = (
|
||||
"\n ".join(
|
||||
f"[green]{cmd.ljust(20)}[/green] {ArchiveBoxGroup._lazy_load(cmd).__doc__}" for cmd in ArchiveBoxGroup.meta_commands.keys()
|
||||
)
|
||||
+ "\n\n "
|
||||
+ "\n ".join(
|
||||
f"[green]{cmd.ljust(20)}[/green] {ArchiveBoxGroup._lazy_load(cmd).__doc__}" for cmd in ArchiveBoxGroup.setup_commands.keys()
|
||||
)
|
||||
+ "\n\n "
|
||||
+ "\n ".join(
|
||||
f"[green]{cmd.ljust(20)}[/green] {ArchiveBoxGroup._lazy_load(cmd).__doc__}" for cmd in ArchiveBoxGroup.archive_commands.keys()
|
||||
)
|
||||
)
|
||||
|
||||
DOCKER_USAGE = '''
|
||||
|
||||
DOCKER_USAGE = (
|
||||
"""
|
||||
[dodger_blue3]Docker Usage:[/dodger_blue3]
|
||||
[grey53]# using Docker Compose:[/grey53]
|
||||
[blue]docker compose run[/blue] [dark_green]archivebox[/dark_green] [green]\\[command][/green] [green3][...args][/green3] [violet][--help][/violet] [grey53][--version][/grey53]
|
||||
|
||||
[grey53]# using Docker:[/grey53]
|
||||
[blue]docker run[/blue] -v [light_slate_blue]$PWD:/data[/light_slate_blue] [grey53]-p 8000:8000[/grey53] -it [dark_green]archivebox/archivebox[/dark_green] [green]\\[command][/green] [green3][...args][/green3] [violet][--help][/violet] [grey53][--version][/grey53]
|
||||
''' if IN_DOCKER else ''
|
||||
DOCKER_DOCS = '\n [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#usage]https://github.com/ArchiveBox/ArchiveBox/wiki/Docker[/link]' if IN_DOCKER else ''
|
||||
DOCKER_OUTSIDE_HINT = "\n [grey53]# outside of Docker:[/grey53]" if IN_DOCKER else ''
|
||||
DOCKER_CMD_PREFIX = "[blue]docker ... [/blue]" if IN_DOCKER else ''
|
||||
"""
|
||||
if IN_DOCKER
|
||||
else ""
|
||||
)
|
||||
DOCKER_DOCS = (
|
||||
"\n [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#usage]https://github.com/ArchiveBox/ArchiveBox/wiki/Docker[/link]"
|
||||
if IN_DOCKER
|
||||
else ""
|
||||
)
|
||||
DOCKER_OUTSIDE_HINT = "\n [grey53]# outside of Docker:[/grey53]" if IN_DOCKER else ""
|
||||
DOCKER_CMD_PREFIX = "[blue]docker ... [/blue]" if IN_DOCKER else ""
|
||||
|
||||
print(f'''{DOCKER_USAGE}
|
||||
print(f"""{DOCKER_USAGE}
|
||||
[deep_sky_blue4]Usage:[/deep_sky_blue4]{DOCKER_OUTSIDE_HINT}
|
||||
[dark_green]archivebox[/dark_green] [green]\\[command][/green] [green3][...args][/green3] [violet][--help][/violet] [grey53][--version][/grey53]
|
||||
|
||||
@@ -54,12 +65,11 @@ def help() -> None:
|
||||
[link=https://github.com/ArchiveBox/ArchiveBox/wiki]https://github.com/ArchiveBox/ArchiveBox/wiki[/link]{DOCKER_DOCS}
|
||||
[link=https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#cli-usage]https://github.com/ArchiveBox/ArchiveBox/wiki/Usage[/link]
|
||||
[link=https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration]https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration[/link]
|
||||
''')
|
||||
|
||||
|
||||
""")
|
||||
|
||||
if os.access(CONSTANTS.ARCHIVE_DIR, os.R_OK) and CONSTANTS.ARCHIVE_DIR.is_dir():
|
||||
pretty_out_dir = str(CONSTANTS.DATA_DIR).replace(str(Path('~').expanduser()), '~')
|
||||
EXAMPLE_USAGE = f'''
|
||||
pretty_out_dir = str(CONSTANTS.DATA_DIR).replace(str(Path("~").expanduser()), "~")
|
||||
EXAMPLE_USAGE = f"""
|
||||
[light_slate_blue]DATA DIR[/light_slate_blue]: [yellow]{pretty_out_dir}[/yellow]
|
||||
|
||||
[violet]Hint:[/violet] [i]Common maintenance tasks:[/i]
|
||||
@@ -73,33 +83,49 @@ def help() -> None:
|
||||
[dark_green]archivebox[/dark_green] [green]list[/green] --sort=timestamp --csv=timestamp,downloaded_at,url,title
|
||||
[dark_green]archivebox[/dark_green] [green]schedule[/green] --every=day --depth=1 "https://example.com/some/feed.rss"
|
||||
[dark_green]archivebox[/dark_green] [green]server[/green] [blue]0.0.0.0:8000[/blue] [grey53]# Start the Web UI / API server[/grey53]
|
||||
'''
|
||||
print(Panel(EXAMPLE_USAGE, expand=False, border_style='grey53', title='[green3]:white_check_mark: A collection [light_slate_blue]DATA DIR[/light_slate_blue] is currently active[/green3]', subtitle='Commands run inside this dir will only apply to this collection.'))
|
||||
"""
|
||||
print(
|
||||
Panel(
|
||||
EXAMPLE_USAGE,
|
||||
expand=False,
|
||||
border_style="grey53",
|
||||
title="[green3]:white_check_mark: A collection [light_slate_blue]DATA DIR[/light_slate_blue] is currently active[/green3]",
|
||||
subtitle="Commands run inside this dir will only apply to this collection.",
|
||||
),
|
||||
)
|
||||
else:
|
||||
DATA_SETUP_HELP = '\n'
|
||||
DATA_SETUP_HELP = "\n"
|
||||
if IN_DOCKER:
|
||||
DATA_SETUP_HELP += '[violet]Hint:[/violet] When using Docker, you need to mount a volume to use as your data dir:\n'
|
||||
DATA_SETUP_HELP += ' docker run [violet]-v /some/path/data:/data[/violet] archivebox/archivebox ...\n\n'
|
||||
DATA_SETUP_HELP += 'To load an [dark_blue]existing[/dark_blue] collection:\n'
|
||||
DATA_SETUP_HELP += ' 1. [green]cd[/green] ~/archivebox/data [grey53]# go into existing [light_slate_blue]DATA DIR[/light_slate_blue] (can be anywhere)[/grey53]\n'
|
||||
DATA_SETUP_HELP += f' 2. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]init[/green] [grey53]# migrate to latest version (safe to run multiple times)[/grey53]\n'
|
||||
DATA_SETUP_HELP += f' 3. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]install[/green] [grey53]# auto-update all plugins (wget, chrome, singlefile, etc.)[/grey53]\n'
|
||||
DATA_SETUP_HELP += f' 4. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]help[/green] [grey53]# ...get help with next steps... [/grey53]\n\n'
|
||||
DATA_SETUP_HELP += 'To start a [sea_green1]new[/sea_green1] collection:\n'
|
||||
DATA_SETUP_HELP += ' 1. [green]mkdir[/green] ~/archivebox/data [grey53]# create a new, empty [light_slate_blue]DATA DIR[/light_slate_blue] (can be anywhere)[/grey53]\n'
|
||||
DATA_SETUP_HELP += ' 2. [green]cd[/green] ~/archivebox/data [grey53]# cd into the new directory[/grey53]\n'
|
||||
DATA_SETUP_HELP += f' 3. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]init[/green] [grey53]# initialize ArchiveBox in the new data dir[/grey53]\n'
|
||||
DATA_SETUP_HELP += f' 4. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]install[/green] [grey53]# auto-install all plugins (wget, chrome, singlefile, etc.)[/grey53]\n'
|
||||
DATA_SETUP_HELP += f' 5. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]help[/green] [grey53]# ... get help with next steps... [/grey53]\n'
|
||||
print(Panel(DATA_SETUP_HELP, expand=False, border_style='grey53', title='[red]:cross_mark: No collection is currently active[/red]', subtitle='All archivebox [green]commands[/green] should be run from inside a collection [light_slate_blue]DATA DIR[/light_slate_blue]'))
|
||||
|
||||
DATA_SETUP_HELP += "[violet]Hint:[/violet] When using Docker, you need to mount a volume to use as your data dir:\n"
|
||||
DATA_SETUP_HELP += " docker run [violet]-v /some/path/data:/data[/violet] archivebox/archivebox ...\n\n"
|
||||
DATA_SETUP_HELP += "To load an [dark_blue]existing[/dark_blue] collection:\n"
|
||||
DATA_SETUP_HELP += " 1. [green]cd[/green] ~/archivebox/data [grey53]# go into existing [light_slate_blue]DATA DIR[/light_slate_blue] (can be anywhere)[/grey53]\n"
|
||||
DATA_SETUP_HELP += f" 2. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]init[/green] [grey53]# migrate to latest version (safe to run multiple times)[/grey53]\n"
|
||||
DATA_SETUP_HELP += f" 3. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]install[/green] [grey53]# auto-update all plugins (wget, chrome, singlefile, etc.)[/grey53]\n"
|
||||
DATA_SETUP_HELP += f" 4. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]help[/green] [grey53]# ...get help with next steps... [/grey53]\n\n"
|
||||
DATA_SETUP_HELP += "To start a [sea_green1]new[/sea_green1] collection:\n"
|
||||
DATA_SETUP_HELP += " 1. [green]mkdir[/green] ~/archivebox/data [grey53]# create a new, empty [light_slate_blue]DATA DIR[/light_slate_blue] (can be anywhere)[/grey53]\n"
|
||||
DATA_SETUP_HELP += " 2. [green]cd[/green] ~/archivebox/data [grey53]# cd into the new directory[/grey53]\n"
|
||||
DATA_SETUP_HELP += f" 3. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]init[/green] [grey53]# initialize ArchiveBox in the new data dir[/grey53]\n"
|
||||
DATA_SETUP_HELP += f" 4. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]install[/green] [grey53]# auto-install all plugins (wget, chrome, singlefile, etc.)[/grey53]\n"
|
||||
DATA_SETUP_HELP += f" 5. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]help[/green] [grey53]# ... get help with next steps... [/grey53]\n"
|
||||
print(
|
||||
Panel(
|
||||
DATA_SETUP_HELP,
|
||||
expand=False,
|
||||
border_style="grey53",
|
||||
title="[red]:cross_mark: No collection is currently active[/red]",
|
||||
subtitle="All archivebox [green]commands[/green] should be run from inside a collection [light_slate_blue]DATA DIR[/light_slate_blue]",
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option('--help', '-h', is_flag=True, help='Show help')
|
||||
@click.option("--help", "-h", is_flag=True, help="Show help")
|
||||
def main(**kwargs):
|
||||
"""Print the ArchiveBox help message and usage"""
|
||||
return help()
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -1,11 +1,11 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__package__ = "archivebox.cli"
|
||||
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Mapping
|
||||
from collections.abc import Mapping
|
||||
|
||||
from rich import print
|
||||
import rich_click as click
|
||||
@@ -14,12 +14,12 @@ from archivebox.misc.util import docstring, enforce_types
|
||||
|
||||
|
||||
def _normalize_snapshot_record(link_dict: Mapping[str, object]) -> tuple[str, dict[str, object]] | None:
|
||||
url = link_dict.get('url')
|
||||
url = link_dict.get("url")
|
||||
if not isinstance(url, str) or not url:
|
||||
return None
|
||||
|
||||
record: dict[str, object] = {'url': url}
|
||||
for key in ('timestamp', 'title', 'tags', 'sources'):
|
||||
record: dict[str, object] = {"url": url}
|
||||
for key in ("timestamp", "title", "tags", "sources"):
|
||||
value = link_dict.get(key)
|
||||
if value is not None:
|
||||
record[key] = value
|
||||
@@ -27,15 +27,15 @@ def _normalize_snapshot_record(link_dict: Mapping[str, object]) -> tuple[str, di
|
||||
|
||||
|
||||
@enforce_types
|
||||
def init(force: bool=False, quick: bool=False, install: bool=False) -> None:
|
||||
def init(force: bool = False, quick: bool = False, install: bool = False) -> None:
|
||||
"""Initialize a new ArchiveBox collection in the current directory"""
|
||||
|
||||
|
||||
from archivebox.config import CONSTANTS, VERSION, DATA_DIR
|
||||
from archivebox.config.common import SERVER_CONFIG
|
||||
from archivebox.config.collection import write_config_file
|
||||
from archivebox.misc.legacy import parse_json_main_index, parse_json_links_details
|
||||
from archivebox.misc.db import apply_migrations
|
||||
|
||||
|
||||
# if os.access(out_dir / CONSTANTS.JSON_INDEX_FILENAME, os.F_OK):
|
||||
# print("[red]:warning: This folder contains a JSON index. It is deprecated, and will no longer be kept up to date automatically.[/red]", file=sys.stderr)
|
||||
# print("[red] You can run `archivebox list --json --with-headers > static_index.json` to manually generate it.[/red]", file=sys.stderr)
|
||||
@@ -43,69 +43,71 @@ def init(force: bool=False, quick: bool=False, install: bool=False) -> None:
|
||||
is_empty = not len(set(os.listdir(DATA_DIR)) - CONSTANTS.ALLOWED_IN_DATA_DIR)
|
||||
existing_index = os.path.isfile(CONSTANTS.DATABASE_FILE)
|
||||
if is_empty and not existing_index:
|
||||
print(f'[turquoise4][+] Initializing a new ArchiveBox v{VERSION} collection...[/turquoise4]')
|
||||
print('[green]----------------------------------------------------------------------[/green]')
|
||||
print(f"[turquoise4][+] Initializing a new ArchiveBox v{VERSION} collection...[/turquoise4]")
|
||||
print("[green]----------------------------------------------------------------------[/green]")
|
||||
elif existing_index:
|
||||
# TODO: properly detect and print the existing version in current index as well
|
||||
print(f'[green][*] Verifying and updating existing ArchiveBox collection to v{VERSION}...[/green]')
|
||||
print('[green]----------------------------------------------------------------------[/green]')
|
||||
print(f"[green][*] Verifying and updating existing ArchiveBox collection to v{VERSION}...[/green]")
|
||||
print("[green]----------------------------------------------------------------------[/green]")
|
||||
else:
|
||||
if force:
|
||||
print('[red][!] This folder appears to already have files in it, but no index.sqlite3 is present.[/red]')
|
||||
print('[red] Because --force was passed, ArchiveBox will initialize anyway (which may overwrite existing files).[/red]')
|
||||
print("[red][!] This folder appears to already have files in it, but no index.sqlite3 is present.[/red]")
|
||||
print("[red] Because --force was passed, ArchiveBox will initialize anyway (which may overwrite existing files).[/red]")
|
||||
else:
|
||||
print(
|
||||
("[red][X] This folder appears to already have files in it, but no index.sqlite3 present.[/red]\n\n"
|
||||
"[red][X] This folder appears to already have files in it, but no index.sqlite3 present.[/red]\n\n"
|
||||
" You must run init in a completely empty directory, or an existing data folder.\n\n"
|
||||
" [violet]Hint:[/violet] To import an existing data folder make sure to cd into the folder first, \n"
|
||||
" then run and run 'archivebox init' to pick up where you left off.\n\n"
|
||||
" (Always make sure your data folder is backed up first before updating ArchiveBox)"
|
||||
)
|
||||
" (Always make sure your data folder is backed up first before updating ArchiveBox)",
|
||||
)
|
||||
raise SystemExit(2)
|
||||
|
||||
if existing_index:
|
||||
print('\n[green][*] Verifying archive folder structure...[/green]')
|
||||
print("\n[green][*] Verifying archive folder structure...[/green]")
|
||||
else:
|
||||
print('\n[green][+] Building archive folder structure...[/green]')
|
||||
|
||||
print(f' + ./{CONSTANTS.ARCHIVE_DIR.relative_to(DATA_DIR)}, ./{CONSTANTS.SOURCES_DIR.relative_to(DATA_DIR)}, ./{CONSTANTS.LOGS_DIR.relative_to(DATA_DIR)}...')
|
||||
print("\n[green][+] Building archive folder structure...[/green]")
|
||||
|
||||
print(
|
||||
f" + ./{CONSTANTS.ARCHIVE_DIR.relative_to(DATA_DIR)}, ./{CONSTANTS.SOURCES_DIR.relative_to(DATA_DIR)}, ./{CONSTANTS.LOGS_DIR.relative_to(DATA_DIR)}...",
|
||||
)
|
||||
Path(CONSTANTS.SOURCES_DIR).mkdir(exist_ok=True)
|
||||
Path(CONSTANTS.ARCHIVE_DIR).mkdir(exist_ok=True)
|
||||
Path(CONSTANTS.LOGS_DIR).mkdir(exist_ok=True)
|
||||
|
||||
print(f' + ./{CONSTANTS.CONFIG_FILE.relative_to(DATA_DIR)}...')
|
||||
|
||||
|
||||
print(f" + ./{CONSTANTS.CONFIG_FILE.relative_to(DATA_DIR)}...")
|
||||
|
||||
# create the .archivebox_id file with a unique ID for this collection
|
||||
from archivebox.config.paths import _get_collection_id
|
||||
_get_collection_id(DATA_DIR, force_create=True)
|
||||
|
||||
# create the ArchiveBox.conf file
|
||||
write_config_file({'SECRET_KEY': SERVER_CONFIG.SECRET_KEY})
|
||||
|
||||
_get_collection_id(DATA_DIR, force_create=True)
|
||||
|
||||
# create the ArchiveBox.conf file
|
||||
write_config_file({"SECRET_KEY": SERVER_CONFIG.SECRET_KEY})
|
||||
|
||||
if os.access(CONSTANTS.DATABASE_FILE, os.F_OK):
|
||||
print('\n[green][*] Verifying main SQL index and running any migrations needed...[/green]')
|
||||
print("\n[green][*] Verifying main SQL index and running any migrations needed...[/green]")
|
||||
else:
|
||||
print('\n[green][+] Building main SQL index and running initial migrations...[/green]')
|
||||
|
||||
print("\n[green][+] Building main SQL index and running initial migrations...[/green]")
|
||||
|
||||
from archivebox.config.django import setup_django
|
||||
|
||||
setup_django()
|
||||
|
||||
|
||||
for migration_line in apply_migrations(DATA_DIR):
|
||||
sys.stdout.write(f' {migration_line}\n')
|
||||
sys.stdout.write(f" {migration_line}\n")
|
||||
|
||||
assert os.path.isfile(CONSTANTS.DATABASE_FILE) and os.access(CONSTANTS.DATABASE_FILE, os.R_OK)
|
||||
print()
|
||||
print(f' √ ./{CONSTANTS.DATABASE_FILE.relative_to(DATA_DIR)}')
|
||||
|
||||
print(f" √ ./{CONSTANTS.DATABASE_FILE.relative_to(DATA_DIR)}")
|
||||
|
||||
# from django.contrib.auth.models import User
|
||||
# if SHELL_CONFIG.IS_TTY and not User.objects.filter(is_superuser=True).exclude(username='system').exists():
|
||||
# print('{green}[+] Creating admin user account...{reset}'.format(**SHELL_CONFIG.ANSI))
|
||||
# call_command("createsuperuser", interactive=True)
|
||||
|
||||
print()
|
||||
print('[dodger_blue3][*] Checking links from indexes and archive folders (safe to Ctrl+C)...[/dodger_blue3]')
|
||||
print("[dodger_blue3][*] Checking links from indexes and archive folders (safe to Ctrl+C)...[/dodger_blue3]")
|
||||
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
@@ -114,10 +116,10 @@ def init(force: bool=False, quick: bool=False, install: bool=False) -> None:
|
||||
|
||||
if existing_index:
|
||||
all_links = Snapshot.objects.all()
|
||||
print(f' √ Loaded {all_links.count()} links from existing main index.')
|
||||
print(f" √ Loaded {all_links.count()} links from existing main index.")
|
||||
|
||||
if quick:
|
||||
print(' > Skipping orphan snapshot import (quick mode)')
|
||||
print(" > Skipping orphan snapshot import (quick mode)")
|
||||
else:
|
||||
try:
|
||||
# Import orphaned links from legacy JSON indexes
|
||||
@@ -131,7 +133,7 @@ def init(force: bool=False, quick: bool=False, install: bool=False) -> None:
|
||||
orphaned_json_links[url] = record
|
||||
if orphaned_json_links:
|
||||
pending_links.update(orphaned_json_links)
|
||||
print(f' [yellow]√ Added {len(orphaned_json_links)} orphaned links from existing JSON index...[/yellow]')
|
||||
print(f" [yellow]√ Added {len(orphaned_json_links)} orphaned links from existing JSON index...[/yellow]")
|
||||
|
||||
orphaned_data_dir_links: dict[str, dict[str, object]] = {}
|
||||
for link_dict in parse_json_links_details(DATA_DIR):
|
||||
@@ -143,7 +145,7 @@ def init(force: bool=False, quick: bool=False, install: bool=False) -> None:
|
||||
orphaned_data_dir_links[url] = record
|
||||
if orphaned_data_dir_links:
|
||||
pending_links.update(orphaned_data_dir_links)
|
||||
print(f' [yellow]√ Added {len(orphaned_data_dir_links)} orphaned links from existing archive directories.[/yellow]')
|
||||
print(f" [yellow]√ Added {len(orphaned_data_dir_links)} orphaned links from existing archive directories.[/yellow]")
|
||||
|
||||
if pending_links:
|
||||
for link_dict in pending_links.values():
|
||||
@@ -151,42 +153,44 @@ def init(force: bool=False, quick: bool=False, install: bool=False) -> None:
|
||||
|
||||
# Hint for orphaned snapshot directories
|
||||
print()
|
||||
print(' [violet]Hint:[/violet] To import orphaned snapshot directories and reconcile filesystem state, run:')
|
||||
print(' archivebox update')
|
||||
print(" [violet]Hint:[/violet] To import orphaned snapshot directories and reconcile filesystem state, run:")
|
||||
print(" archivebox update")
|
||||
|
||||
except (KeyboardInterrupt, SystemExit):
|
||||
print(file=sys.stderr)
|
||||
print('[yellow]:stop_sign: Stopped checking archive directories due to Ctrl-C/SIGTERM[/yellow]', file=sys.stderr)
|
||||
print(' Your archive data is safe, but you should re-run `archivebox init` to finish the process later.', file=sys.stderr)
|
||||
print("[yellow]:stop_sign: Stopped checking archive directories due to Ctrl-C/SIGTERM[/yellow]", file=sys.stderr)
|
||||
print(" Your archive data is safe, but you should re-run `archivebox init` to finish the process later.", file=sys.stderr)
|
||||
print(file=sys.stderr)
|
||||
print(' [violet]Hint:[/violet] In the future you can run a quick init without checking dirs like so:', file=sys.stderr)
|
||||
print(' archivebox init --quick', file=sys.stderr)
|
||||
print(" [violet]Hint:[/violet] In the future you can run a quick init without checking dirs like so:", file=sys.stderr)
|
||||
print(" archivebox init --quick", file=sys.stderr)
|
||||
raise SystemExit(1)
|
||||
|
||||
print('\n[green]----------------------------------------------------------------------[/green]')
|
||||
print("\n[green]----------------------------------------------------------------------[/green]")
|
||||
|
||||
from django.contrib.auth.models import User
|
||||
|
||||
if (SERVER_CONFIG.ADMIN_USERNAME and SERVER_CONFIG.ADMIN_PASSWORD) and not User.objects.filter(username=SERVER_CONFIG.ADMIN_USERNAME).exists():
|
||||
print('[green][+] Found ADMIN_USERNAME and ADMIN_PASSWORD configuration options, creating new admin user.[/green]')
|
||||
if (SERVER_CONFIG.ADMIN_USERNAME and SERVER_CONFIG.ADMIN_PASSWORD) and not User.objects.filter(
|
||||
username=SERVER_CONFIG.ADMIN_USERNAME,
|
||||
).exists():
|
||||
print("[green][+] Found ADMIN_USERNAME and ADMIN_PASSWORD configuration options, creating new admin user.[/green]")
|
||||
User.objects.create_superuser(username=SERVER_CONFIG.ADMIN_USERNAME, password=SERVER_CONFIG.ADMIN_PASSWORD)
|
||||
|
||||
if existing_index:
|
||||
print('[green][√] Done. Verified and updated the existing ArchiveBox collection.[/green]')
|
||||
print("[green][√] Done. Verified and updated the existing ArchiveBox collection.[/green]")
|
||||
else:
|
||||
print(f'[green][√] Done. A new ArchiveBox collection was initialized ({len(all_links) + len(pending_links)} links).[/green]')
|
||||
print(f"[green][√] Done. A new ArchiveBox collection was initialized ({len(all_links) + len(pending_links)} links).[/green]")
|
||||
|
||||
|
||||
CONSTANTS.PERSONAS_DIR.mkdir(parents=True, exist_ok=True)
|
||||
CONSTANTS.DEFAULT_TMP_DIR.mkdir(parents=True, exist_ok=True)
|
||||
CONSTANTS.DEFAULT_LIB_DIR.mkdir(parents=True, exist_ok=True)
|
||||
(CONSTANTS.DEFAULT_LIB_DIR / 'bin').mkdir(parents=True, exist_ok=True)
|
||||
(CONSTANTS.DEFAULT_LIB_DIR / "bin").mkdir(parents=True, exist_ok=True)
|
||||
|
||||
from archivebox.config.common import STORAGE_CONFIG
|
||||
from archivebox.config.paths import get_or_create_working_tmp_dir, get_or_create_working_lib_dir
|
||||
|
||||
STORAGE_CONFIG.TMP_DIR.mkdir(parents=True, exist_ok=True)
|
||||
STORAGE_CONFIG.LIB_DIR.mkdir(parents=True, exist_ok=True)
|
||||
(STORAGE_CONFIG.LIB_DIR / 'bin').mkdir(parents=True, exist_ok=True)
|
||||
(STORAGE_CONFIG.LIB_DIR / "bin").mkdir(parents=True, exist_ok=True)
|
||||
|
||||
working_tmp_dir = get_or_create_working_tmp_dir(autofix=True, quiet=True)
|
||||
if working_tmp_dir:
|
||||
@@ -195,33 +199,35 @@ def init(force: bool=False, quick: bool=False, install: bool=False) -> None:
|
||||
working_lib_dir = get_or_create_working_lib_dir(autofix=True, quiet=True)
|
||||
if working_lib_dir:
|
||||
working_lib_dir.mkdir(parents=True, exist_ok=True)
|
||||
(working_lib_dir / 'bin').mkdir(parents=True, exist_ok=True)
|
||||
|
||||
(working_lib_dir / "bin").mkdir(parents=True, exist_ok=True)
|
||||
|
||||
if install:
|
||||
from archivebox.cli.archivebox_install import install as install_method
|
||||
|
||||
install_method()
|
||||
|
||||
if Snapshot.objects.count() < 25: # hide the hints for experienced users
|
||||
if Snapshot.objects.count() < 25: # hide the hints for experienced users
|
||||
print()
|
||||
print(' [violet]Hint:[/violet] To view your archive index, run:')
|
||||
print(' archivebox server # then visit [deep_sky_blue4][link=http://127.0.0.1:8000]http://127.0.0.1:8000[/link][/deep_sky_blue4]')
|
||||
print(" [violet]Hint:[/violet] To view your archive index, run:")
|
||||
print(
|
||||
" archivebox server # then visit [deep_sky_blue4][link=http://127.0.0.1:8000]http://127.0.0.1:8000[/link][/deep_sky_blue4]",
|
||||
)
|
||||
print()
|
||||
print(' To add new links, you can run:')
|
||||
print(" To add new links, you can run:")
|
||||
print(" archivebox add < ~/some/path/to/list_of_links.txt")
|
||||
print()
|
||||
print(' For more usage and examples, run:')
|
||||
print(' archivebox help')
|
||||
|
||||
print(" For more usage and examples, run:")
|
||||
print(" archivebox help")
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option('--force', '-f', is_flag=True, help='Ignore unrecognized files in current directory and initialize anyway')
|
||||
@click.option('--quick', '-q', is_flag=True, help='Run any updates or migrations without rechecking all snapshot dirs')
|
||||
@click.option('--install', '-s', is_flag=True, help='Automatically install dependencies and extras used for archiving')
|
||||
@click.option("--force", "-f", is_flag=True, help="Ignore unrecognized files in current directory and initialize anyway")
|
||||
@click.option("--quick", "-q", is_flag=True, help="Run any updates or migrations without rechecking all snapshot dirs")
|
||||
@click.option("--install", "-s", is_flag=True, help="Automatically install dependencies and extras used for archiving")
|
||||
@docstring(init.__doc__)
|
||||
def main(**kwargs) -> None:
|
||||
init(**kwargs)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__package__ = "archivebox.cli"
|
||||
|
||||
import os
|
||||
|
||||
@@ -11,7 +11,7 @@ from archivebox.misc.util import docstring, enforce_types
|
||||
|
||||
|
||||
@enforce_types
|
||||
def install(binaries: tuple[str, ...] = (), binproviders: str = '*', dry_run: bool = False) -> None:
|
||||
def install(binaries: tuple[str, ...] = (), binproviders: str = "*", dry_run: bool = False) -> None:
|
||||
"""Detect and install ArchiveBox dependencies by running the abx-dl install flow
|
||||
|
||||
Examples:
|
||||
@@ -31,33 +31,34 @@ def install(binaries: tuple[str, ...] = (), binproviders: str = '*', dry_run: bo
|
||||
|
||||
# Show what we're installing
|
||||
if binaries:
|
||||
print(f'\n[green][+] Installing specific binaries: {", ".join(binaries)}[/green]')
|
||||
print(f"\n[green][+] Installing specific binaries: {', '.join(binaries)}[/green]")
|
||||
else:
|
||||
print('\n[green][+] Detecting and installing all ArchiveBox dependencies...[/green]')
|
||||
print("\n[green][+] Detecting and installing all ArchiveBox dependencies...[/green]")
|
||||
|
||||
if binproviders != '*':
|
||||
print(f'[green][+] Using providers: {binproviders}[/green]')
|
||||
if binproviders != "*":
|
||||
print(f"[green][+] Using providers: {binproviders}[/green]")
|
||||
|
||||
if IS_ROOT:
|
||||
EUID = os.geteuid()
|
||||
print()
|
||||
print(f'[yellow]:warning: Running as UID=[blue]{EUID}[/blue].[/yellow]')
|
||||
print(f' DATA_DIR will be owned by [blue]{ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/blue].')
|
||||
print(f"[yellow]:warning: Running as UID=[blue]{EUID}[/blue].[/yellow]")
|
||||
print(f" DATA_DIR will be owned by [blue]{ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/blue].")
|
||||
print()
|
||||
|
||||
if dry_run:
|
||||
print('[dim]Dry run - would run the abx-dl install flow[/dim]')
|
||||
print("[dim]Dry run - would run the abx-dl install flow[/dim]")
|
||||
return
|
||||
|
||||
# Set up Django
|
||||
from archivebox.config.django import setup_django
|
||||
|
||||
setup_django()
|
||||
|
||||
plugin_names = list(binaries)
|
||||
if binproviders != '*':
|
||||
plugin_names.extend(provider.strip() for provider in binproviders.split(',') if provider.strip())
|
||||
if binproviders != "*":
|
||||
plugin_names.extend(provider.strip() for provider in binproviders.split(",") if provider.strip())
|
||||
|
||||
print('[+] Running installer via abx-dl bus...')
|
||||
print("[+] Running installer via abx-dl bus...")
|
||||
print()
|
||||
|
||||
from archivebox.services.runner import run_install
|
||||
@@ -68,28 +69,36 @@ def install(binaries: tuple[str, ...] = (), binproviders: str = '*', dry_run: bo
|
||||
|
||||
# Check for superuser
|
||||
from django.contrib.auth import get_user_model
|
||||
|
||||
User = get_user_model()
|
||||
|
||||
if not User.objects.filter(is_superuser=True).exclude(username='system').exists():
|
||||
stderr('\n[+] Don\'t forget to create a new admin user for the Web UI...', color='green')
|
||||
stderr(' archivebox manage createsuperuser')
|
||||
if not User.objects.filter(is_superuser=True).exclude(username="system").exists():
|
||||
stderr("\n[+] Don't forget to create a new admin user for the Web UI...", color="green")
|
||||
stderr(" archivebox manage createsuperuser")
|
||||
|
||||
print()
|
||||
|
||||
# Show version to display full status including installed binaries
|
||||
# Django is already loaded, so just import and call the function directly
|
||||
from archivebox.cli.archivebox_version import version as show_version
|
||||
|
||||
show_version(quiet=False)
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.argument('binaries', nargs=-1, type=str, required=False)
|
||||
@click.option('--binproviders', '-p', default='*', help='Comma-separated list of providers to use (pip,npm,brew,apt,env,custom) or * for all', show_default=True)
|
||||
@click.option('--dry-run', '-d', is_flag=True, help='Show what would happen without actually running', default=False)
|
||||
@click.argument("binaries", nargs=-1, type=str, required=False)
|
||||
@click.option(
|
||||
"--binproviders",
|
||||
"-p",
|
||||
default="*",
|
||||
help="Comma-separated list of providers to use (pip,npm,brew,apt,env,custom) or * for all",
|
||||
show_default=True,
|
||||
)
|
||||
@click.option("--dry-run", "-d", is_flag=True, help="Show what would happen without actually running", default=False)
|
||||
@docstring(install.__doc__)
|
||||
def main(**kwargs) -> None:
|
||||
install(**kwargs)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -1,10 +1,9 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox list'
|
||||
__package__ = "archivebox.cli"
|
||||
__command__ = "archivebox list"
|
||||
|
||||
import sys
|
||||
from typing import Optional
|
||||
|
||||
import rich_click as click
|
||||
|
||||
@@ -12,31 +11,47 @@ from archivebox.cli.archivebox_snapshot import list_snapshots
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option('--status', '-s', help='Filter by status (queued, started, sealed)')
|
||||
@click.option('--url__icontains', help='Filter by URL contains')
|
||||
@click.option('--url__istartswith', help='Filter by URL starts with')
|
||||
@click.option('--tag', '-t', help='Filter by tag name')
|
||||
@click.option('--crawl-id', help='Filter by crawl ID')
|
||||
@click.option('--limit', '-n', type=int, help='Limit number of results')
|
||||
@click.option('--sort', '-o', type=str, help='Field to sort by, e.g. url, created_at, bookmarked_at, downloaded_at')
|
||||
@click.option('--csv', '-C', type=str, help='Print output as CSV with the provided fields, e.g.: timestamp,url,title')
|
||||
@click.option('--with-headers', is_flag=True, help='Include column headers in structured output')
|
||||
def main(status: Optional[str], url__icontains: Optional[str], url__istartswith: Optional[str],
|
||||
tag: Optional[str], crawl_id: Optional[str], limit: Optional[int],
|
||||
sort: Optional[str], csv: Optional[str], with_headers: bool) -> None:
|
||||
@click.option("--status", "-s", help="Filter by status (queued, started, sealed)")
|
||||
@click.option("--url__icontains", help="Filter by URL contains")
|
||||
@click.option("--url__istartswith", help="Filter by URL starts with")
|
||||
@click.option("--tag", "-t", help="Filter by tag name")
|
||||
@click.option("--crawl-id", help="Filter by crawl ID")
|
||||
@click.option("--limit", "-n", type=int, help="Limit number of results")
|
||||
@click.option("--sort", "-o", type=str, help="Field to sort by, e.g. url, created_at, bookmarked_at, downloaded_at")
|
||||
@click.option("--csv", "-C", type=str, help="Print output as CSV with the provided fields, e.g.: timestamp,url,title")
|
||||
@click.option("--with-headers", is_flag=True, help="Include column headers in structured output")
|
||||
@click.option("--search", type=click.Choice(["meta", "content", "contents", "deep"]), help="Search mode to use for the query")
|
||||
@click.argument("query", nargs=-1)
|
||||
def main(
|
||||
status: str | None,
|
||||
url__icontains: str | None,
|
||||
url__istartswith: str | None,
|
||||
tag: str | None,
|
||||
crawl_id: str | None,
|
||||
limit: int | None,
|
||||
sort: str | None,
|
||||
csv: str | None,
|
||||
with_headers: bool,
|
||||
search: str | None,
|
||||
query: tuple[str, ...],
|
||||
) -> None:
|
||||
"""List Snapshots."""
|
||||
sys.exit(list_snapshots(
|
||||
status=status,
|
||||
url__icontains=url__icontains,
|
||||
url__istartswith=url__istartswith,
|
||||
tag=tag,
|
||||
crawl_id=crawl_id,
|
||||
limit=limit,
|
||||
sort=sort,
|
||||
csv=csv,
|
||||
with_headers=with_headers,
|
||||
))
|
||||
sys.exit(
|
||||
list_snapshots(
|
||||
status=status,
|
||||
url__icontains=url__icontains,
|
||||
url__istartswith=url__istartswith,
|
||||
tag=tag,
|
||||
crawl_id=crawl_id,
|
||||
limit=limit,
|
||||
sort=sort,
|
||||
csv=csv,
|
||||
with_headers=with_headers,
|
||||
search=search,
|
||||
query=" ".join(query),
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -19,11 +19,10 @@ Examples:
|
||||
archivebox machine list --hostname__icontains=myserver
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox machine'
|
||||
__package__ = "archivebox.cli"
|
||||
__command__ = "archivebox machine"
|
||||
|
||||
import sys
|
||||
from typing import Optional
|
||||
|
||||
import rich_click as click
|
||||
from rich import print as rprint
|
||||
@@ -35,10 +34,11 @@ from archivebox.cli.cli_utils import apply_filters
|
||||
# LIST
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def list_machines(
|
||||
hostname__icontains: Optional[str] = None,
|
||||
os_platform: Optional[str] = None,
|
||||
limit: Optional[int] = None,
|
||||
hostname__icontains: str | None = None,
|
||||
os_platform: str | None = None,
|
||||
limit: int | None = None,
|
||||
) -> int:
|
||||
"""
|
||||
List Machines as JSONL with optional filters.
|
||||
@@ -51,24 +51,24 @@ def list_machines(
|
||||
|
||||
is_tty = sys.stdout.isatty()
|
||||
|
||||
queryset = Machine.objects.all().order_by('-created_at')
|
||||
queryset = Machine.objects.all().order_by("-created_at")
|
||||
|
||||
# Apply filters
|
||||
filter_kwargs = {
|
||||
'hostname__icontains': hostname__icontains,
|
||||
'os_platform': os_platform,
|
||||
"hostname__icontains": hostname__icontains,
|
||||
"os_platform": os_platform,
|
||||
}
|
||||
queryset = apply_filters(queryset, filter_kwargs, limit=limit)
|
||||
|
||||
count = 0
|
||||
for machine in queryset:
|
||||
if is_tty:
|
||||
rprint(f'[cyan]{machine.hostname:30}[/cyan] [dim]{machine.os_platform:10}[/dim] {machine.id}')
|
||||
rprint(f"[cyan]{machine.hostname:30}[/cyan] [dim]{machine.os_platform:10}[/dim] {machine.id}")
|
||||
else:
|
||||
write_record(machine.to_json())
|
||||
count += 1
|
||||
|
||||
rprint(f'[dim]Listed {count} machines[/dim]', file=sys.stderr)
|
||||
rprint(f"[dim]Listed {count} machines[/dim]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -76,24 +76,27 @@ def list_machines(
|
||||
# CLI Commands
|
||||
# =============================================================================
|
||||
|
||||
|
||||
@click.group()
|
||||
def main():
|
||||
"""Manage Machine records (read-only, system-managed)."""
|
||||
pass
|
||||
|
||||
|
||||
@main.command('list')
|
||||
@click.option('--hostname__icontains', help='Filter by hostname contains')
|
||||
@click.option('--os-platform', help='Filter by OS platform')
|
||||
@click.option('--limit', '-n', type=int, help='Limit number of results')
|
||||
def list_cmd(hostname__icontains: Optional[str], os_platform: Optional[str], limit: Optional[int]):
|
||||
@main.command("list")
|
||||
@click.option("--hostname__icontains", help="Filter by hostname contains")
|
||||
@click.option("--os-platform", help="Filter by OS platform")
|
||||
@click.option("--limit", "-n", type=int, help="Limit number of results")
|
||||
def list_cmd(hostname__icontains: str | None, os_platform: str | None, limit: int | None):
|
||||
"""List Machines as JSONL."""
|
||||
sys.exit(list_machines(
|
||||
hostname__icontains=hostname__icontains,
|
||||
os_platform=os_platform,
|
||||
limit=limit,
|
||||
))
|
||||
sys.exit(
|
||||
list_machines(
|
||||
hostname__icontains=hostname__icontains,
|
||||
os_platform=os_platform,
|
||||
limit=limit,
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -1,33 +1,34 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__package__ = "archivebox.cli"
|
||||
|
||||
import rich_click as click
|
||||
from archivebox.misc.util import docstring, enforce_types
|
||||
|
||||
|
||||
@enforce_types
|
||||
def manage(args: list[str] | None=None) -> None:
|
||||
def manage(args: list[str] | None = None) -> None:
|
||||
"""Run an ArchiveBox Django management command"""
|
||||
|
||||
from archivebox.config.common import SHELL_CONFIG
|
||||
from archivebox.misc.logging import stderr
|
||||
|
||||
if (args and "createsuperuser" in args) and (SHELL_CONFIG.IN_DOCKER and not SHELL_CONFIG.IS_TTY):
|
||||
stderr('[!] Warning: you need to pass -it to use interactive commands in docker', color='lightyellow')
|
||||
stderr(' docker run -it archivebox manage {}'.format(' '.join(args or ['...'])), color='lightyellow')
|
||||
stderr('')
|
||||
stderr("[!] Warning: you need to pass -it to use interactive commands in docker", color="lightyellow")
|
||||
stderr(" docker run -it archivebox manage {}".format(" ".join(args or ["..."])), color="lightyellow")
|
||||
stderr("")
|
||||
|
||||
from django.core.management import execute_from_command_line
|
||||
execute_from_command_line(['manage.py', *(args or ['help'])])
|
||||
|
||||
execute_from_command_line(["manage.py", *(args or ["help"])])
|
||||
|
||||
|
||||
@click.command(add_help_option=False, context_settings=dict(ignore_unknown_options=True))
|
||||
@click.argument('args', nargs=-1)
|
||||
@click.argument("args", nargs=-1)
|
||||
@docstring(manage.__doc__)
|
||||
def main(args: list[str] | None=None) -> None:
|
||||
def main(args: list[str] | None = None) -> None:
|
||||
manage(args=args)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -6,8 +6,8 @@ Start the Model Context Protocol (MCP) server in stdio mode.
|
||||
Exposes all ArchiveBox CLI commands as MCP tools for AI agents.
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox mcp'
|
||||
__package__ = "archivebox.cli"
|
||||
__command__ = "archivebox mcp"
|
||||
|
||||
import rich_click as click
|
||||
|
||||
@@ -45,5 +45,5 @@ def main(**kwargs):
|
||||
mcp()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -24,8 +24,8 @@ Examples:
|
||||
archivebox persona list --name=old | archivebox persona delete --yes
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox persona'
|
||||
__package__ = "archivebox.cli"
|
||||
__command__ = "archivebox persona"
|
||||
|
||||
import os
|
||||
import sys
|
||||
@@ -35,7 +35,7 @@ import subprocess
|
||||
import tempfile
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Optional, Iterable
|
||||
from collections.abc import Iterable
|
||||
from collections import OrderedDict
|
||||
|
||||
import rich_click as click
|
||||
@@ -49,134 +49,145 @@ from archivebox.personas import importers as persona_importers
|
||||
# Browser Profile Locations
|
||||
# =============================================================================
|
||||
|
||||
def get_chrome_user_data_dir() -> Optional[Path]:
|
||||
|
||||
def get_chrome_user_data_dir() -> Path | None:
|
||||
"""Get the default Chrome user data directory for the current platform."""
|
||||
system = platform.system()
|
||||
home = Path.home()
|
||||
|
||||
if system == 'Darwin': # macOS
|
||||
if system == "Darwin": # macOS
|
||||
candidates = [
|
||||
home / 'Library' / 'Application Support' / 'Google' / 'Chrome',
|
||||
home / 'Library' / 'Application Support' / 'Chromium',
|
||||
home / "Library" / "Application Support" / "Google" / "Chrome",
|
||||
home / "Library" / "Application Support" / "Chromium",
|
||||
]
|
||||
elif system == 'Linux':
|
||||
elif system == "Linux":
|
||||
candidates = [
|
||||
home / '.config' / 'google-chrome',
|
||||
home / '.config' / 'chromium',
|
||||
home / '.config' / 'chrome',
|
||||
home / 'snap' / 'chromium' / 'common' / 'chromium',
|
||||
home / ".config" / "google-chrome",
|
||||
home / ".config" / "chromium",
|
||||
home / ".config" / "chrome",
|
||||
home / "snap" / "chromium" / "common" / "chromium",
|
||||
]
|
||||
elif system == 'Windows':
|
||||
local_app_data = Path(os.environ.get('LOCALAPPDATA', home / 'AppData' / 'Local'))
|
||||
elif system == "Windows":
|
||||
local_app_data = Path(os.environ.get("LOCALAPPDATA", home / "AppData" / "Local"))
|
||||
candidates = [
|
||||
local_app_data / 'Google' / 'Chrome' / 'User Data',
|
||||
local_app_data / 'Chromium' / 'User Data',
|
||||
local_app_data / "Google" / "Chrome" / "User Data",
|
||||
local_app_data / "Chromium" / "User Data",
|
||||
]
|
||||
else:
|
||||
candidates = []
|
||||
|
||||
for candidate in candidates:
|
||||
if candidate.exists() and (candidate / 'Default').exists():
|
||||
if candidate.exists() and (candidate / "Default").exists():
|
||||
return candidate
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def get_brave_user_data_dir() -> Optional[Path]:
|
||||
def get_brave_user_data_dir() -> Path | None:
|
||||
"""Get the default Brave user data directory for the current platform."""
|
||||
system = platform.system()
|
||||
home = Path.home()
|
||||
|
||||
if system == 'Darwin':
|
||||
if system == "Darwin":
|
||||
candidates = [
|
||||
home / 'Library' / 'Application Support' / 'BraveSoftware' / 'Brave-Browser',
|
||||
home / "Library" / "Application Support" / "BraveSoftware" / "Brave-Browser",
|
||||
]
|
||||
elif system == 'Linux':
|
||||
elif system == "Linux":
|
||||
candidates = [
|
||||
home / '.config' / 'BraveSoftware' / 'Brave-Browser',
|
||||
home / ".config" / "BraveSoftware" / "Brave-Browser",
|
||||
]
|
||||
elif system == 'Windows':
|
||||
local_app_data = Path(os.environ.get('LOCALAPPDATA', home / 'AppData' / 'Local'))
|
||||
elif system == "Windows":
|
||||
local_app_data = Path(os.environ.get("LOCALAPPDATA", home / "AppData" / "Local"))
|
||||
candidates = [
|
||||
local_app_data / 'BraveSoftware' / 'Brave-Browser' / 'User Data',
|
||||
local_app_data / "BraveSoftware" / "Brave-Browser" / "User Data",
|
||||
]
|
||||
else:
|
||||
candidates = []
|
||||
|
||||
for candidate in candidates:
|
||||
if candidate.exists() and (candidate / 'Default').exists():
|
||||
if candidate.exists() and (candidate / "Default").exists():
|
||||
return candidate
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def get_edge_user_data_dir() -> Optional[Path]:
|
||||
def get_edge_user_data_dir() -> Path | None:
|
||||
"""Get the default Edge user data directory for the current platform."""
|
||||
system = platform.system()
|
||||
home = Path.home()
|
||||
|
||||
if system == 'Darwin':
|
||||
if system == "Darwin":
|
||||
candidates = [
|
||||
home / 'Library' / 'Application Support' / 'Microsoft Edge',
|
||||
home / "Library" / "Application Support" / "Microsoft Edge",
|
||||
]
|
||||
elif system == 'Linux':
|
||||
elif system == "Linux":
|
||||
candidates = [
|
||||
home / '.config' / 'microsoft-edge',
|
||||
home / '.config' / 'microsoft-edge-beta',
|
||||
home / '.config' / 'microsoft-edge-dev',
|
||||
home / ".config" / "microsoft-edge",
|
||||
home / ".config" / "microsoft-edge-beta",
|
||||
home / ".config" / "microsoft-edge-dev",
|
||||
]
|
||||
elif system == 'Windows':
|
||||
local_app_data = Path(os.environ.get('LOCALAPPDATA', home / 'AppData' / 'Local'))
|
||||
elif system == "Windows":
|
||||
local_app_data = Path(os.environ.get("LOCALAPPDATA", home / "AppData" / "Local"))
|
||||
candidates = [
|
||||
local_app_data / 'Microsoft' / 'Edge' / 'User Data',
|
||||
local_app_data / "Microsoft" / "Edge" / "User Data",
|
||||
]
|
||||
else:
|
||||
candidates = []
|
||||
|
||||
for candidate in candidates:
|
||||
if candidate.exists() and (candidate / 'Default').exists():
|
||||
if candidate.exists() and (candidate / "Default").exists():
|
||||
return candidate
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def get_browser_binary(browser: str) -> Optional[str]:
|
||||
def get_browser_binary(browser: str) -> str | None:
|
||||
system = platform.system()
|
||||
home = Path.home()
|
||||
browser = browser.lower()
|
||||
|
||||
if system == 'Darwin':
|
||||
if system == "Darwin":
|
||||
candidates = {
|
||||
'chrome': ['/Applications/Google Chrome.app/Contents/MacOS/Google Chrome'],
|
||||
'chromium': ['/Applications/Chromium.app/Contents/MacOS/Chromium'],
|
||||
'brave': ['/Applications/Brave Browser.app/Contents/MacOS/Brave Browser'],
|
||||
'edge': ['/Applications/Microsoft Edge.app/Contents/MacOS/Microsoft Edge'],
|
||||
"chrome": ["/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"],
|
||||
"chromium": ["/Applications/Chromium.app/Contents/MacOS/Chromium"],
|
||||
"brave": ["/Applications/Brave Browser.app/Contents/MacOS/Brave Browser"],
|
||||
"edge": ["/Applications/Microsoft Edge.app/Contents/MacOS/Microsoft Edge"],
|
||||
}.get(browser, [])
|
||||
elif system == 'Linux':
|
||||
elif system == "Linux":
|
||||
candidates = {
|
||||
'chrome': ['/usr/bin/google-chrome', '/usr/bin/google-chrome-stable', '/usr/bin/google-chrome-beta', '/usr/bin/google-chrome-unstable'],
|
||||
'chromium': ['/usr/bin/chromium', '/usr/bin/chromium-browser'],
|
||||
'brave': ['/usr/bin/brave-browser', '/usr/bin/brave-browser-beta', '/usr/bin/brave-browser-nightly'],
|
||||
'edge': ['/usr/bin/microsoft-edge', '/usr/bin/microsoft-edge-stable', '/usr/bin/microsoft-edge-beta', '/usr/bin/microsoft-edge-dev'],
|
||||
"chrome": [
|
||||
"/usr/bin/google-chrome",
|
||||
"/usr/bin/google-chrome-stable",
|
||||
"/usr/bin/google-chrome-beta",
|
||||
"/usr/bin/google-chrome-unstable",
|
||||
],
|
||||
"chromium": ["/usr/bin/chromium", "/usr/bin/chromium-browser"],
|
||||
"brave": ["/usr/bin/brave-browser", "/usr/bin/brave-browser-beta", "/usr/bin/brave-browser-nightly"],
|
||||
"edge": [
|
||||
"/usr/bin/microsoft-edge",
|
||||
"/usr/bin/microsoft-edge-stable",
|
||||
"/usr/bin/microsoft-edge-beta",
|
||||
"/usr/bin/microsoft-edge-dev",
|
||||
],
|
||||
}.get(browser, [])
|
||||
elif system == 'Windows':
|
||||
local_app_data = Path(os.environ.get('LOCALAPPDATA', home / 'AppData' / 'Local'))
|
||||
elif system == "Windows":
|
||||
local_app_data = Path(os.environ.get("LOCALAPPDATA", home / "AppData" / "Local"))
|
||||
candidates = {
|
||||
'chrome': [
|
||||
str(local_app_data / 'Google' / 'Chrome' / 'Application' / 'chrome.exe'),
|
||||
'C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe',
|
||||
'C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe',
|
||||
"chrome": [
|
||||
str(local_app_data / "Google" / "Chrome" / "Application" / "chrome.exe"),
|
||||
"C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe",
|
||||
"C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe",
|
||||
],
|
||||
'chromium': [str(local_app_data / 'Chromium' / 'Application' / 'chrome.exe')],
|
||||
'brave': [
|
||||
str(local_app_data / 'BraveSoftware' / 'Brave-Browser' / 'Application' / 'brave.exe'),
|
||||
'C:\\Program Files\\BraveSoftware\\Brave-Browser\\Application\\brave.exe',
|
||||
'C:\\Program Files (x86)\\BraveSoftware\\Brave-Browser\\Application\\brave.exe',
|
||||
"chromium": [str(local_app_data / "Chromium" / "Application" / "chrome.exe")],
|
||||
"brave": [
|
||||
str(local_app_data / "BraveSoftware" / "Brave-Browser" / "Application" / "brave.exe"),
|
||||
"C:\\Program Files\\BraveSoftware\\Brave-Browser\\Application\\brave.exe",
|
||||
"C:\\Program Files (x86)\\BraveSoftware\\Brave-Browser\\Application\\brave.exe",
|
||||
],
|
||||
'edge': [
|
||||
str(local_app_data / 'Microsoft' / 'Edge' / 'Application' / 'msedge.exe'),
|
||||
'C:\\Program Files\\Microsoft\\Edge\\Application\\msedge.exe',
|
||||
'C:\\Program Files (x86)\\Microsoft\\Edge\\Application\\msedge.exe',
|
||||
"edge": [
|
||||
str(local_app_data / "Microsoft" / "Edge" / "Application" / "msedge.exe"),
|
||||
"C:\\Program Files\\Microsoft\\Edge\\Application\\msedge.exe",
|
||||
"C:\\Program Files (x86)\\Microsoft\\Edge\\Application\\msedge.exe",
|
||||
],
|
||||
}.get(browser, [])
|
||||
else:
|
||||
@@ -190,13 +201,13 @@ def get_browser_binary(browser: str) -> Optional[str]:
|
||||
|
||||
|
||||
BROWSER_PROFILE_FINDERS = {
|
||||
'chrome': get_chrome_user_data_dir,
|
||||
'chromium': get_chrome_user_data_dir, # Same locations
|
||||
'brave': get_brave_user_data_dir,
|
||||
'edge': get_edge_user_data_dir,
|
||||
"chrome": get_chrome_user_data_dir,
|
||||
"chromium": get_chrome_user_data_dir, # Same locations
|
||||
"brave": get_brave_user_data_dir,
|
||||
"edge": get_edge_user_data_dir,
|
||||
}
|
||||
|
||||
CHROMIUM_BROWSERS = {'chrome', 'chromium', 'brave', 'edge'}
|
||||
CHROMIUM_BROWSERS = {"chrome", "chromium", "brave", "edge"}
|
||||
|
||||
|
||||
# =============================================================================
|
||||
@@ -204,12 +215,12 @@ CHROMIUM_BROWSERS = {'chrome', 'chromium', 'brave', 'edge'}
|
||||
# =============================================================================
|
||||
|
||||
NETSCAPE_COOKIE_HEADER = [
|
||||
'# Netscape HTTP Cookie File',
|
||||
'# https://curl.se/docs/http-cookies.html',
|
||||
'# This file was generated by ArchiveBox persona cookie extraction',
|
||||
'#',
|
||||
'# Format: domain\\tincludeSubdomains\\tpath\\tsecure\\texpiry\\tname\\tvalue',
|
||||
'',
|
||||
"# Netscape HTTP Cookie File",
|
||||
"# https://curl.se/docs/http-cookies.html",
|
||||
"# This file was generated by ArchiveBox persona cookie extraction",
|
||||
"#",
|
||||
"# Format: domain\\tincludeSubdomains\\tpath\\tsecure\\texpiry\\tname\\tvalue",
|
||||
"",
|
||||
]
|
||||
|
||||
|
||||
@@ -219,9 +230,9 @@ def _parse_netscape_cookies(path: Path) -> "OrderedDict[tuple[str, str, str], tu
|
||||
return cookies
|
||||
|
||||
for line in path.read_text().splitlines():
|
||||
if not line or line.startswith('#'):
|
||||
if not line or line.startswith("#"):
|
||||
continue
|
||||
parts = line.split('\t')
|
||||
parts = line.split("\t")
|
||||
if len(parts) < 7:
|
||||
continue
|
||||
domain, include_subdomains, cookie_path, secure, expiry, name, value = parts[:7]
|
||||
@@ -233,8 +244,8 @@ def _parse_netscape_cookies(path: Path) -> "OrderedDict[tuple[str, str, str], tu
|
||||
def _write_netscape_cookies(path: Path, cookies: "OrderedDict[tuple[str, str, str], tuple[str, str, str, str, str, str, str]]") -> None:
|
||||
lines = list(NETSCAPE_COOKIE_HEADER)
|
||||
for cookie in cookies.values():
|
||||
lines.append('\t'.join(cookie))
|
||||
path.write_text('\n'.join(lines) + '\n')
|
||||
lines.append("\t".join(cookie))
|
||||
path.write_text("\n".join(lines) + "\n")
|
||||
|
||||
|
||||
def _merge_netscape_cookies(existing_file: Path, new_file: Path) -> None:
|
||||
@@ -259,52 +270,52 @@ def extract_cookies_via_cdp(
|
||||
from archivebox.config.common import STORAGE_CONFIG
|
||||
|
||||
# Find the cookie extraction script
|
||||
chrome_plugin_dir = Path(__file__).parent.parent / 'plugins' / 'chrome'
|
||||
extract_script = chrome_plugin_dir / 'extract_cookies.js'
|
||||
chrome_plugin_dir = Path(__file__).parent.parent / "plugins" / "chrome"
|
||||
extract_script = chrome_plugin_dir / "extract_cookies.js"
|
||||
|
||||
if not extract_script.exists():
|
||||
rprint(f'[yellow]Cookie extraction script not found at {extract_script}[/yellow]', file=sys.stderr)
|
||||
rprint(f"[yellow]Cookie extraction script not found at {extract_script}[/yellow]", file=sys.stderr)
|
||||
return False
|
||||
|
||||
# Get node modules dir
|
||||
node_modules_dir = STORAGE_CONFIG.LIB_DIR / 'npm' / 'node_modules'
|
||||
node_modules_dir = STORAGE_CONFIG.LIB_DIR / "npm" / "node_modules"
|
||||
|
||||
# Set up environment
|
||||
env = os.environ.copy()
|
||||
env['NODE_MODULES_DIR'] = str(node_modules_dir)
|
||||
env['CHROME_USER_DATA_DIR'] = str(user_data_dir)
|
||||
env['CHROME_HEADLESS'] = 'true'
|
||||
env["NODE_MODULES_DIR"] = str(node_modules_dir)
|
||||
env["CHROME_USER_DATA_DIR"] = str(user_data_dir)
|
||||
env["CHROME_HEADLESS"] = "true"
|
||||
if chrome_binary:
|
||||
env['CHROME_BINARY'] = str(chrome_binary)
|
||||
env["CHROME_BINARY"] = str(chrome_binary)
|
||||
output_path = output_file
|
||||
temp_output = None
|
||||
temp_dir = None
|
||||
if output_file.exists():
|
||||
temp_dir = Path(tempfile.mkdtemp(prefix='ab_cookies_'))
|
||||
temp_output = temp_dir / 'cookies.txt'
|
||||
temp_dir = Path(tempfile.mkdtemp(prefix="ab_cookies_"))
|
||||
temp_output = temp_dir / "cookies.txt"
|
||||
output_path = temp_output
|
||||
if profile_dir:
|
||||
extra_arg = f'--profile-directory={profile_dir}'
|
||||
existing_extra = env.get('CHROME_ARGS_EXTRA', '').strip()
|
||||
extra_arg = f"--profile-directory={profile_dir}"
|
||||
existing_extra = env.get("CHROME_ARGS_EXTRA", "").strip()
|
||||
args_list = []
|
||||
if existing_extra:
|
||||
if existing_extra.startswith('['):
|
||||
if existing_extra.startswith("["):
|
||||
try:
|
||||
parsed = json.loads(existing_extra)
|
||||
if isinstance(parsed, list):
|
||||
args_list.extend(str(x) for x in parsed)
|
||||
except Exception:
|
||||
args_list.extend([s.strip() for s in existing_extra.split(',') if s.strip()])
|
||||
args_list.extend([s.strip() for s in existing_extra.split(",") if s.strip()])
|
||||
else:
|
||||
args_list.extend([s.strip() for s in existing_extra.split(',') if s.strip()])
|
||||
args_list.extend([s.strip() for s in existing_extra.split(",") if s.strip()])
|
||||
args_list.append(extra_arg)
|
||||
env['CHROME_ARGS_EXTRA'] = json.dumps(args_list)
|
||||
env["CHROME_ARGS_EXTRA"] = json.dumps(args_list)
|
||||
|
||||
env['COOKIES_OUTPUT_FILE'] = str(output_path)
|
||||
env["COOKIES_OUTPUT_FILE"] = str(output_path)
|
||||
|
||||
try:
|
||||
result = subprocess.run(
|
||||
['node', str(extract_script)],
|
||||
["node", str(extract_script)],
|
||||
env=env,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
@@ -316,17 +327,17 @@ def extract_cookies_via_cdp(
|
||||
_merge_netscape_cookies(output_file, temp_output)
|
||||
return True
|
||||
else:
|
||||
rprint(f'[yellow]Cookie extraction failed: {result.stderr}[/yellow]', file=sys.stderr)
|
||||
rprint(f"[yellow]Cookie extraction failed: {result.stderr}[/yellow]", file=sys.stderr)
|
||||
return False
|
||||
|
||||
except subprocess.TimeoutExpired:
|
||||
rprint('[yellow]Cookie extraction timed out[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]Cookie extraction timed out[/yellow]", file=sys.stderr)
|
||||
return False
|
||||
except FileNotFoundError:
|
||||
rprint('[yellow]Node.js not found. Cannot extract cookies.[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]Node.js not found. Cannot extract cookies.[/yellow]", file=sys.stderr)
|
||||
return False
|
||||
except Exception as e:
|
||||
rprint(f'[yellow]Cookie extraction error: {e}[/yellow]', file=sys.stderr)
|
||||
rprint(f"[yellow]Cookie extraction error: {e}[/yellow]", file=sys.stderr)
|
||||
return False
|
||||
finally:
|
||||
if temp_dir and temp_dir.exists():
|
||||
@@ -337,6 +348,7 @@ def extract_cookies_via_cdp(
|
||||
# Validation Helpers
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def validate_persona_name(name: str) -> tuple[bool, str]:
|
||||
"""
|
||||
Validate persona name to prevent path traversal attacks.
|
||||
@@ -348,19 +360,19 @@ def validate_persona_name(name: str) -> tuple[bool, str]:
|
||||
return False, "Persona name cannot be empty"
|
||||
|
||||
# Check for path separators
|
||||
if '/' in name or '\\' in name:
|
||||
if "/" in name or "\\" in name:
|
||||
return False, "Persona name cannot contain path separators (/ or \\)"
|
||||
|
||||
# Check for parent directory references
|
||||
if '..' in name:
|
||||
if ".." in name:
|
||||
return False, "Persona name cannot contain parent directory references (..)"
|
||||
|
||||
# Check for hidden files/directories
|
||||
if name.startswith('.'):
|
||||
if name.startswith("."):
|
||||
return False, "Persona name cannot start with a dot (.)"
|
||||
|
||||
# Ensure name doesn't contain null bytes or other dangerous chars
|
||||
if '\x00' in name or '\n' in name or '\r' in name:
|
||||
if "\x00" in name or "\n" in name or "\r" in name:
|
||||
return False, "Persona name contains invalid characters"
|
||||
|
||||
return True, ""
|
||||
@@ -394,10 +406,11 @@ def ensure_path_within_personas_dir(persona_path: Path) -> bool:
|
||||
# CREATE
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def create_personas(
|
||||
names: Iterable[str],
|
||||
import_from: Optional[str] = None,
|
||||
profile: Optional[str] = None,
|
||||
import_from: str | None = None,
|
||||
profile: str | None = None,
|
||||
) -> int:
|
||||
"""
|
||||
Create Personas from names.
|
||||
@@ -416,7 +429,7 @@ def create_personas(
|
||||
name_list = list(names) if names else []
|
||||
|
||||
if not name_list:
|
||||
rprint('[yellow]No persona names provided. Pass names as arguments.[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No persona names provided. Pass names as arguments.[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Validate import source if specified
|
||||
@@ -424,23 +437,23 @@ def create_personas(
|
||||
if import_from:
|
||||
import_from = import_from.lower()
|
||||
if import_from not in BROWSER_PROFILE_FINDERS:
|
||||
rprint(f'[red]Unknown browser: {import_from}[/red]', file=sys.stderr)
|
||||
rprint(f'[dim]Supported browsers: {", ".join(BROWSER_PROFILE_FINDERS.keys())}[/dim]', file=sys.stderr)
|
||||
rprint(f"[red]Unknown browser: {import_from}[/red]", file=sys.stderr)
|
||||
rprint(f"[dim]Supported browsers: {', '.join(BROWSER_PROFILE_FINDERS.keys())}[/dim]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
source_profile_dir = BROWSER_PROFILE_FINDERS[import_from]()
|
||||
if not source_profile_dir:
|
||||
rprint(f'[red]Could not find {import_from} profile directory[/red]', file=sys.stderr)
|
||||
rprint(f"[red]Could not find {import_from} profile directory[/red]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
rprint(f'[dim]Found {import_from} profile: {source_profile_dir}[/dim]', file=sys.stderr)
|
||||
rprint(f"[dim]Found {import_from} profile: {source_profile_dir}[/dim]", file=sys.stderr)
|
||||
|
||||
if profile is None and (source_profile_dir / 'Default').exists():
|
||||
profile = 'Default'
|
||||
if profile is None and (source_profile_dir / "Default").exists():
|
||||
profile = "Default"
|
||||
|
||||
browser_binary = get_browser_binary(import_from)
|
||||
if browser_binary:
|
||||
rprint(f'[dim]Using {import_from} binary: {browser_binary}[/dim]', file=sys.stderr)
|
||||
rprint(f"[dim]Using {import_from} binary: {browser_binary}[/dim]", file=sys.stderr)
|
||||
|
||||
created_count = 0
|
||||
for name in name_list:
|
||||
@@ -459,11 +472,11 @@ def create_personas(
|
||||
if created:
|
||||
persona.ensure_dirs()
|
||||
created_count += 1
|
||||
rprint(f'[green]Created persona: {name}[/green]', file=sys.stderr)
|
||||
rprint(f"[green]Created persona: {name}[/green]", file=sys.stderr)
|
||||
else:
|
||||
rprint(f'[dim]Persona already exists: {name}[/dim]', file=sys.stderr)
|
||||
rprint(f"[dim]Persona already exists: {name}[/dim]", file=sys.stderr)
|
||||
|
||||
cookies_file = Path(persona.path) / 'cookies.txt'
|
||||
cookies_file = Path(persona.path) / "cookies.txt"
|
||||
|
||||
# Import browser profile if requested
|
||||
if import_from in CHROMIUM_BROWSERS and source_profile_dir is not None:
|
||||
@@ -477,29 +490,31 @@ def create_personas(
|
||||
capture_storage=False,
|
||||
)
|
||||
except Exception as e:
|
||||
rprint(f'[red]Failed to import browser profile: {e}[/red]', file=sys.stderr)
|
||||
rprint(f"[red]Failed to import browser profile: {e}[/red]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
if import_result.profile_copied:
|
||||
rprint('[green]Copied browser profile to persona[/green]', file=sys.stderr)
|
||||
rprint("[green]Copied browser profile to persona[/green]", file=sys.stderr)
|
||||
if import_result.cookies_imported:
|
||||
rprint(f'[green]Extracted cookies to {cookies_file}[/green]', file=sys.stderr)
|
||||
rprint(f"[green]Extracted cookies to {cookies_file}[/green]", file=sys.stderr)
|
||||
elif not import_result.profile_copied:
|
||||
rprint('[yellow]Could not import cookies automatically.[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]Could not import cookies automatically.[/yellow]", file=sys.stderr)
|
||||
|
||||
for warning in import_result.warnings:
|
||||
rprint(f'[yellow]{warning}[/yellow]', file=sys.stderr)
|
||||
rprint(f"[yellow]{warning}[/yellow]", file=sys.stderr)
|
||||
|
||||
if not is_tty:
|
||||
write_record({
|
||||
'id': str(persona.id) if hasattr(persona, 'id') else None,
|
||||
'name': persona.name,
|
||||
'path': str(persona.path),
|
||||
'CHROME_USER_DATA_DIR': persona.CHROME_USER_DATA_DIR,
|
||||
'COOKIES_FILE': persona.COOKIES_FILE,
|
||||
})
|
||||
write_record(
|
||||
{
|
||||
"id": str(persona.id) if hasattr(persona, "id") else None,
|
||||
"name": persona.name,
|
||||
"path": str(persona.path),
|
||||
"CHROME_USER_DATA_DIR": persona.CHROME_USER_DATA_DIR,
|
||||
"COOKIES_FILE": persona.COOKIES_FILE,
|
||||
},
|
||||
)
|
||||
|
||||
rprint(f'[green]Created {created_count} new persona(s)[/green]', file=sys.stderr)
|
||||
rprint(f"[green]Created {created_count} new persona(s)[/green]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -507,10 +522,11 @@ def create_personas(
|
||||
# LIST
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def list_personas(
|
||||
name: Optional[str] = None,
|
||||
name__icontains: Optional[str] = None,
|
||||
limit: Optional[int] = None,
|
||||
name: str | None = None,
|
||||
name__icontains: str | None = None,
|
||||
limit: int | None = None,
|
||||
) -> int:
|
||||
"""
|
||||
List Personas as JSONL with optional filters.
|
||||
@@ -523,33 +539,35 @@ def list_personas(
|
||||
|
||||
is_tty = sys.stdout.isatty()
|
||||
|
||||
queryset = Persona.objects.all().order_by('name')
|
||||
queryset = Persona.objects.all().order_by("name")
|
||||
|
||||
# Apply filters
|
||||
filter_kwargs = {
|
||||
'name': name,
|
||||
'name__icontains': name__icontains,
|
||||
"name": name,
|
||||
"name__icontains": name__icontains,
|
||||
}
|
||||
queryset = apply_filters(queryset, filter_kwargs, limit=limit)
|
||||
|
||||
count = 0
|
||||
for persona in queryset:
|
||||
cookies_status = '[green]✓[/green]' if persona.COOKIES_FILE else '[dim]✗[/dim]'
|
||||
chrome_status = '[green]✓[/green]' if Path(persona.CHROME_USER_DATA_DIR).exists() else '[dim]✗[/dim]'
|
||||
cookies_status = "[green]✓[/green]" if persona.COOKIES_FILE else "[dim]✗[/dim]"
|
||||
chrome_status = "[green]✓[/green]" if Path(persona.CHROME_USER_DATA_DIR).exists() else "[dim]✗[/dim]"
|
||||
|
||||
if is_tty:
|
||||
rprint(f'[cyan]{persona.name:20}[/cyan] cookies:{cookies_status} chrome:{chrome_status} [dim]{persona.path}[/dim]')
|
||||
rprint(f"[cyan]{persona.name:20}[/cyan] cookies:{cookies_status} chrome:{chrome_status} [dim]{persona.path}[/dim]")
|
||||
else:
|
||||
write_record({
|
||||
'id': str(persona.id) if hasattr(persona, 'id') else None,
|
||||
'name': persona.name,
|
||||
'path': str(persona.path),
|
||||
'CHROME_USER_DATA_DIR': persona.CHROME_USER_DATA_DIR,
|
||||
'COOKIES_FILE': persona.COOKIES_FILE,
|
||||
})
|
||||
write_record(
|
||||
{
|
||||
"id": str(persona.id) if hasattr(persona, "id") else None,
|
||||
"name": persona.name,
|
||||
"path": str(persona.path),
|
||||
"CHROME_USER_DATA_DIR": persona.CHROME_USER_DATA_DIR,
|
||||
"COOKIES_FILE": persona.COOKIES_FILE,
|
||||
},
|
||||
)
|
||||
count += 1
|
||||
|
||||
rprint(f'[dim]Listed {count} persona(s)[/dim]', file=sys.stderr)
|
||||
rprint(f"[dim]Listed {count} persona(s)[/dim]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -557,7 +575,8 @@ def list_personas(
|
||||
# UPDATE
|
||||
# =============================================================================
|
||||
|
||||
def update_personas(name: Optional[str] = None) -> int:
|
||||
|
||||
def update_personas(name: str | None = None) -> int:
|
||||
"""
|
||||
Update Personas from stdin JSONL.
|
||||
|
||||
@@ -575,13 +594,13 @@ def update_personas(name: Optional[str] = None) -> int:
|
||||
|
||||
records = list(read_stdin())
|
||||
if not records:
|
||||
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
updated_count = 0
|
||||
for record in records:
|
||||
persona_id = record.get('id')
|
||||
old_name = record.get('name')
|
||||
persona_id = record.get("id")
|
||||
old_name = record.get("name")
|
||||
|
||||
if not persona_id and not old_name:
|
||||
continue
|
||||
@@ -613,17 +632,19 @@ def update_personas(name: Optional[str] = None) -> int:
|
||||
updated_count += 1
|
||||
|
||||
if not is_tty:
|
||||
write_record({
|
||||
'id': str(persona.id) if hasattr(persona, 'id') else None,
|
||||
'name': persona.name,
|
||||
'path': str(persona.path),
|
||||
})
|
||||
write_record(
|
||||
{
|
||||
"id": str(persona.id) if hasattr(persona, "id") else None,
|
||||
"name": persona.name,
|
||||
"path": str(persona.path),
|
||||
},
|
||||
)
|
||||
|
||||
except Persona.DoesNotExist:
|
||||
rprint(f'[yellow]Persona not found: {persona_id or old_name}[/yellow]', file=sys.stderr)
|
||||
rprint(f"[yellow]Persona not found: {persona_id or old_name}[/yellow]", file=sys.stderr)
|
||||
continue
|
||||
|
||||
rprint(f'[green]Updated {updated_count} persona(s)[/green]', file=sys.stderr)
|
||||
rprint(f"[green]Updated {updated_count} persona(s)[/green]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -631,6 +652,7 @@ def update_personas(name: Optional[str] = None) -> int:
|
||||
# DELETE
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def delete_personas(yes: bool = False, dry_run: bool = False) -> int:
|
||||
"""
|
||||
Delete Personas from stdin JSONL.
|
||||
@@ -646,23 +668,24 @@ def delete_personas(yes: bool = False, dry_run: bool = False) -> int:
|
||||
|
||||
records = list(read_stdin())
|
||||
if not records:
|
||||
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Collect persona IDs or names
|
||||
persona_ids = []
|
||||
persona_names = []
|
||||
for r in records:
|
||||
if r.get('id'):
|
||||
persona_ids.append(r['id'])
|
||||
elif r.get('name'):
|
||||
persona_names.append(r['name'])
|
||||
if r.get("id"):
|
||||
persona_ids.append(r["id"])
|
||||
elif r.get("name"):
|
||||
persona_names.append(r["name"])
|
||||
|
||||
if not persona_ids and not persona_names:
|
||||
rprint('[yellow]No valid persona IDs or names in input[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No valid persona IDs or names in input[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
from django.db.models import Q
|
||||
|
||||
query = Q()
|
||||
if persona_ids:
|
||||
query |= Q(id__in=persona_ids)
|
||||
@@ -673,17 +696,17 @@ def delete_personas(yes: bool = False, dry_run: bool = False) -> int:
|
||||
count = personas.count()
|
||||
|
||||
if count == 0:
|
||||
rprint('[yellow]No matching personas found[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No matching personas found[/yellow]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
if dry_run:
|
||||
rprint(f'[yellow]Would delete {count} persona(s) (dry run)[/yellow]', file=sys.stderr)
|
||||
rprint(f"[yellow]Would delete {count} persona(s) (dry run)[/yellow]", file=sys.stderr)
|
||||
for persona in personas:
|
||||
rprint(f' {persona.name} ({persona.path})', file=sys.stderr)
|
||||
rprint(f" {persona.name} ({persona.path})", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
if not yes:
|
||||
rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr)
|
||||
rprint("[red]Use --yes to confirm deletion[/red]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Delete persona directories and database records
|
||||
@@ -701,7 +724,7 @@ def delete_personas(yes: bool = False, dry_run: bool = False) -> int:
|
||||
persona.delete()
|
||||
deleted_count += 1
|
||||
|
||||
rprint(f'[green]Deleted {deleted_count} persona(s)[/green]', file=sys.stderr)
|
||||
rprint(f"[green]Deleted {deleted_count} persona(s)[/green]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -709,44 +732,45 @@ def delete_personas(yes: bool = False, dry_run: bool = False) -> int:
|
||||
# CLI Commands
|
||||
# =============================================================================
|
||||
|
||||
|
||||
@click.group()
|
||||
def main():
|
||||
"""Manage Persona records (browser profiles)."""
|
||||
pass
|
||||
|
||||
|
||||
@main.command('create')
|
||||
@click.argument('names', nargs=-1)
|
||||
@click.option('--import', 'import_from', help='Import profile from browser (chrome, chromium, brave, edge)')
|
||||
@click.option('--profile', help='Profile directory name under the user data dir (e.g. Default, Profile 1)')
|
||||
def create_cmd(names: tuple, import_from: Optional[str], profile: Optional[str]):
|
||||
@main.command("create")
|
||||
@click.argument("names", nargs=-1)
|
||||
@click.option("--import", "import_from", help="Import profile from browser (chrome, chromium, brave, edge)")
|
||||
@click.option("--profile", help="Profile directory name under the user data dir (e.g. Default, Profile 1)")
|
||||
def create_cmd(names: tuple, import_from: str | None, profile: str | None):
|
||||
"""Create Personas, optionally importing from a browser profile."""
|
||||
sys.exit(create_personas(names, import_from=import_from, profile=profile))
|
||||
|
||||
|
||||
@main.command('list')
|
||||
@click.option('--name', help='Filter by exact name')
|
||||
@click.option('--name__icontains', help='Filter by name contains')
|
||||
@click.option('--limit', '-n', type=int, help='Limit number of results')
|
||||
def list_cmd(name: Optional[str], name__icontains: Optional[str], limit: Optional[int]):
|
||||
@main.command("list")
|
||||
@click.option("--name", help="Filter by exact name")
|
||||
@click.option("--name__icontains", help="Filter by name contains")
|
||||
@click.option("--limit", "-n", type=int, help="Limit number of results")
|
||||
def list_cmd(name: str | None, name__icontains: str | None, limit: int | None):
|
||||
"""List Personas as JSONL."""
|
||||
sys.exit(list_personas(name=name, name__icontains=name__icontains, limit=limit))
|
||||
|
||||
|
||||
@main.command('update')
|
||||
@click.option('--name', '-n', help='Set new name')
|
||||
def update_cmd(name: Optional[str]):
|
||||
@main.command("update")
|
||||
@click.option("--name", "-n", help="Set new name")
|
||||
def update_cmd(name: str | None):
|
||||
"""Update Personas from stdin JSONL."""
|
||||
sys.exit(update_personas(name=name))
|
||||
|
||||
|
||||
@main.command('delete')
|
||||
@click.option('--yes', '-y', is_flag=True, help='Confirm deletion')
|
||||
@click.option('--dry-run', is_flag=True, help='Show what would be deleted')
|
||||
@main.command("delete")
|
||||
@click.option("--yes", "-y", is_flag=True, help="Confirm deletion")
|
||||
@click.option("--dry-run", is_flag=True, help="Show what would be deleted")
|
||||
def delete_cmd(yes: bool, dry_run: bool):
|
||||
"""Delete Personas from stdin JSONL."""
|
||||
sys.exit(delete_personas(yes=yes, dry_run=dry_run))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -1,8 +1,7 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__package__ = "archivebox.cli"
|
||||
|
||||
from typing import Optional
|
||||
|
||||
import rich_click as click
|
||||
|
||||
@@ -137,7 +136,7 @@ BINARY_MACHINE_DIAGRAM = """
|
||||
@enforce_types
|
||||
def pluginmap(
|
||||
show_disabled: bool = False,
|
||||
model: Optional[str] = None,
|
||||
model: str | None = None,
|
||||
quiet: bool = False,
|
||||
) -> dict:
|
||||
"""
|
||||
@@ -164,25 +163,25 @@ def pluginmap(
|
||||
|
||||
# Model event types that can have hooks
|
||||
model_events = {
|
||||
'Crawl': {
|
||||
'description': 'Hooks run when a Crawl starts (QUEUED→STARTED)',
|
||||
'machine': 'CrawlMachine',
|
||||
'diagram': CRAWL_MACHINE_DIAGRAM,
|
||||
"Crawl": {
|
||||
"description": "Hooks run when a Crawl starts (QUEUED→STARTED)",
|
||||
"machine": "CrawlMachine",
|
||||
"diagram": CRAWL_MACHINE_DIAGRAM,
|
||||
},
|
||||
'CrawlEnd': {
|
||||
'description': 'Hooks run when a Crawl finishes (STARTED→SEALED)',
|
||||
'machine': 'CrawlMachine',
|
||||
'diagram': None, # Part of CrawlMachine
|
||||
"CrawlEnd": {
|
||||
"description": "Hooks run when a Crawl finishes (STARTED→SEALED)",
|
||||
"machine": "CrawlMachine",
|
||||
"diagram": None, # Part of CrawlMachine
|
||||
},
|
||||
'Snapshot': {
|
||||
'description': 'Hooks run for each Snapshot (creates ArchiveResults)',
|
||||
'machine': 'SnapshotMachine',
|
||||
'diagram': SNAPSHOT_MACHINE_DIAGRAM,
|
||||
"Snapshot": {
|
||||
"description": "Hooks run for each Snapshot (creates ArchiveResults)",
|
||||
"machine": "SnapshotMachine",
|
||||
"diagram": SNAPSHOT_MACHINE_DIAGRAM,
|
||||
},
|
||||
'Binary': {
|
||||
'description': 'Hooks for installing binary dependencies (providers)',
|
||||
'machine': 'BinaryMachine',
|
||||
'diagram': BINARY_MACHINE_DIAGRAM,
|
||||
"Binary": {
|
||||
"description": "Hooks for installing binary dependencies (providers)",
|
||||
"machine": "BinaryMachine",
|
||||
"diagram": BINARY_MACHINE_DIAGRAM,
|
||||
},
|
||||
}
|
||||
|
||||
@@ -195,16 +194,16 @@ def pluginmap(
|
||||
model_events = {model: model_events[model]}
|
||||
|
||||
result = {
|
||||
'models': {},
|
||||
'plugins_dir': str(BUILTIN_PLUGINS_DIR),
|
||||
'user_plugins_dir': str(USER_PLUGINS_DIR),
|
||||
"models": {},
|
||||
"plugins_dir": str(BUILTIN_PLUGINS_DIR),
|
||||
"user_plugins_dir": str(USER_PLUGINS_DIR),
|
||||
}
|
||||
|
||||
if not quiet:
|
||||
prnt()
|
||||
prnt('[bold cyan]ArchiveBox Plugin Map[/bold cyan]')
|
||||
prnt(f'[dim]Built-in plugins: {BUILTIN_PLUGINS_DIR}[/dim]')
|
||||
prnt(f'[dim]User plugins: {USER_PLUGINS_DIR}[/dim]')
|
||||
prnt("[bold cyan]ArchiveBox Plugin Map[/bold cyan]")
|
||||
prnt(f"[dim]Built-in plugins: {BUILTIN_PLUGINS_DIR}[/dim]")
|
||||
prnt(f"[dim]User plugins: {USER_PLUGINS_DIR}[/dim]")
|
||||
prnt()
|
||||
|
||||
for event_name, info in model_events.items():
|
||||
@@ -218,88 +217,93 @@ def pluginmap(
|
||||
plugin_name = hook_path.parent.name
|
||||
is_bg = is_background_hook(hook_path.name)
|
||||
|
||||
hook_infos.append({
|
||||
'path': str(hook_path),
|
||||
'name': hook_path.name,
|
||||
'plugin': plugin_name,
|
||||
'is_background': is_bg,
|
||||
'extension': hook_path.suffix,
|
||||
})
|
||||
hook_infos.append(
|
||||
{
|
||||
"path": str(hook_path),
|
||||
"name": hook_path.name,
|
||||
"plugin": plugin_name,
|
||||
"is_background": is_bg,
|
||||
"extension": hook_path.suffix,
|
||||
},
|
||||
)
|
||||
|
||||
result['models'][event_name] = {
|
||||
'description': info['description'],
|
||||
'machine': info['machine'],
|
||||
'hooks': hook_infos,
|
||||
'hook_count': len(hook_infos),
|
||||
result["models"][event_name] = {
|
||||
"description": info["description"],
|
||||
"machine": info["machine"],
|
||||
"hooks": hook_infos,
|
||||
"hook_count": len(hook_infos),
|
||||
}
|
||||
|
||||
if not quiet:
|
||||
# Show diagram if this model has one
|
||||
if info.get('diagram'):
|
||||
assert info['diagram'] is not None
|
||||
prnt(Panel(
|
||||
info['diagram'],
|
||||
title=f'[bold green]{info["machine"]}[/bold green]',
|
||||
border_style='green',
|
||||
expand=False,
|
||||
))
|
||||
if info.get("diagram"):
|
||||
assert info["diagram"] is not None
|
||||
prnt(
|
||||
Panel(
|
||||
info["diagram"],
|
||||
title=f"[bold green]{info['machine']}[/bold green]",
|
||||
border_style="green",
|
||||
expand=False,
|
||||
),
|
||||
)
|
||||
prnt()
|
||||
|
||||
# Create hooks table
|
||||
table = Table(
|
||||
title=f'[bold yellow]on_{event_name}__* Hooks[/bold yellow] ({len(hooks)} found)',
|
||||
title=f"[bold yellow]on_{event_name}__* Hooks[/bold yellow] ({len(hooks)} found)",
|
||||
box=box.ROUNDED,
|
||||
show_header=True,
|
||||
header_style='bold magenta',
|
||||
header_style="bold magenta",
|
||||
)
|
||||
table.add_column('Plugin', style='cyan', width=20)
|
||||
table.add_column('Hook Name', style='green')
|
||||
table.add_column('BG', justify='center', width=4)
|
||||
table.add_column('Type', justify='center', width=5)
|
||||
table.add_column("Plugin", style="cyan", width=20)
|
||||
table.add_column("Hook Name", style="green")
|
||||
table.add_column("BG", justify="center", width=4)
|
||||
table.add_column("Type", justify="center", width=5)
|
||||
|
||||
# Sort lexicographically by hook name
|
||||
sorted_hooks = sorted(hook_infos, key=lambda h: h['name'])
|
||||
sorted_hooks = sorted(hook_infos, key=lambda h: h["name"])
|
||||
|
||||
for hook in sorted_hooks:
|
||||
bg_marker = '[yellow]bg[/yellow]' if hook['is_background'] else ''
|
||||
ext = hook['extension'].lstrip('.')
|
||||
bg_marker = "[yellow]bg[/yellow]" if hook["is_background"] else ""
|
||||
ext = hook["extension"].lstrip(".")
|
||||
table.add_row(
|
||||
hook['plugin'],
|
||||
hook['name'],
|
||||
hook["plugin"],
|
||||
hook["name"],
|
||||
bg_marker,
|
||||
ext,
|
||||
)
|
||||
|
||||
prnt(table)
|
||||
prnt()
|
||||
prnt(f'[dim]{info["description"]}[/dim]')
|
||||
prnt(f"[dim]{info['description']}[/dim]")
|
||||
prnt()
|
||||
|
||||
# Summary
|
||||
if not quiet:
|
||||
total_hooks = sum(m['hook_count'] for m in result['models'].values())
|
||||
prnt(f'[bold]Total hooks discovered: {total_hooks}[/bold]')
|
||||
total_hooks = sum(m["hook_count"] for m in result["models"].values())
|
||||
prnt(f"[bold]Total hooks discovered: {total_hooks}[/bold]")
|
||||
prnt()
|
||||
prnt('[dim]Hook naming convention: on_{Model}__{XX}_{description}[.bg].{ext}[/dim]')
|
||||
prnt('[dim] - XX: Two-digit lexicographic order (00-99)[/dim]')
|
||||
prnt('[dim] - .bg: Background hook (non-blocking)[/dim]')
|
||||
prnt('[dim] - ext: py, sh, or js[/dim]')
|
||||
prnt("[dim]Hook naming convention: on_{Model}__{XX}_{description}[.bg].{ext}[/dim]")
|
||||
prnt("[dim] - XX: Two-digit lexicographic order (00-99)[/dim]")
|
||||
prnt("[dim] - .bg: Background hook (non-blocking)[/dim]")
|
||||
prnt("[dim] - ext: py, sh, or js[/dim]")
|
||||
prnt()
|
||||
|
||||
return result
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option('--show-disabled', '-a', is_flag=True, help='Show hooks from disabled plugins too')
|
||||
@click.option('--model', '-m', type=str, default=None, help='Filter to specific model (Crawl, Snapshot, Binary, CrawlEnd)')
|
||||
@click.option('--quiet', '-q', is_flag=True, help='Output JSON only, no ASCII diagrams')
|
||||
@click.option("--show-disabled", "-a", is_flag=True, help="Show hooks from disabled plugins too")
|
||||
@click.option("--model", "-m", type=str, default=None, help="Filter to specific model (Crawl, Snapshot, Binary, CrawlEnd)")
|
||||
@click.option("--quiet", "-q", is_flag=True, help="Output JSON only, no ASCII diagrams")
|
||||
@docstring(pluginmap.__doc__)
|
||||
def main(**kwargs):
|
||||
import json
|
||||
|
||||
result = pluginmap(**kwargs)
|
||||
if kwargs.get('quiet'):
|
||||
if kwargs.get("quiet"):
|
||||
print(json.dumps(result, indent=2))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -22,11 +22,10 @@ Examples:
|
||||
archivebox process list --limit=10
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox process'
|
||||
__package__ = "archivebox.cli"
|
||||
__command__ = "archivebox process"
|
||||
|
||||
import sys
|
||||
from typing import Optional
|
||||
|
||||
import rich_click as click
|
||||
from rich import print as rprint
|
||||
@@ -38,10 +37,11 @@ from archivebox.cli.cli_utils import apply_filters
|
||||
# LIST
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def list_processes(
|
||||
binary_name: Optional[str] = None,
|
||||
machine_id: Optional[str] = None,
|
||||
limit: Optional[int] = None,
|
||||
binary_name: str | None = None,
|
||||
machine_id: str | None = None,
|
||||
limit: int | None = None,
|
||||
) -> int:
|
||||
"""
|
||||
List Processes as JSONL with optional filters.
|
||||
@@ -54,29 +54,29 @@ def list_processes(
|
||||
|
||||
is_tty = sys.stdout.isatty()
|
||||
|
||||
queryset = Process.objects.all().select_related('binary', 'machine').order_by('-start_ts')
|
||||
queryset = Process.objects.all().select_related("binary", "machine").order_by("-start_ts")
|
||||
|
||||
# Apply filters
|
||||
filter_kwargs = {}
|
||||
if binary_name:
|
||||
filter_kwargs['binary__name'] = binary_name
|
||||
filter_kwargs["binary__name"] = binary_name
|
||||
if machine_id:
|
||||
filter_kwargs['machine_id'] = machine_id
|
||||
filter_kwargs["machine_id"] = machine_id
|
||||
|
||||
queryset = apply_filters(queryset, filter_kwargs, limit=limit)
|
||||
|
||||
count = 0
|
||||
for process in queryset:
|
||||
if is_tty:
|
||||
binary_name_str = process.binary.name if process.binary else 'unknown'
|
||||
exit_code = process.exit_code if process.exit_code is not None else '?'
|
||||
status_color = 'green' if process.exit_code == 0 else 'red' if process.exit_code else 'yellow'
|
||||
rprint(f'[{status_color}]exit={exit_code:3}[/{status_color}] [cyan]{binary_name_str:15}[/cyan] [dim]{process.id}[/dim]')
|
||||
binary_name_str = process.binary.name if process.binary else "unknown"
|
||||
exit_code = process.exit_code if process.exit_code is not None else "?"
|
||||
status_color = "green" if process.exit_code == 0 else "red" if process.exit_code else "yellow"
|
||||
rprint(f"[{status_color}]exit={exit_code:3}[/{status_color}] [cyan]{binary_name_str:15}[/cyan] [dim]{process.id}[/dim]")
|
||||
else:
|
||||
write_record(process.to_json())
|
||||
count += 1
|
||||
|
||||
rprint(f'[dim]Listed {count} processes[/dim]', file=sys.stderr)
|
||||
rprint(f"[dim]Listed {count} processes[/dim]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -84,24 +84,27 @@ def list_processes(
|
||||
# CLI Commands
|
||||
# =============================================================================
|
||||
|
||||
|
||||
@click.group()
|
||||
def main():
|
||||
"""Manage Process records (read-only, system-managed)."""
|
||||
pass
|
||||
|
||||
|
||||
@main.command('list')
|
||||
@click.option('--binary-name', '-b', help='Filter by binary name')
|
||||
@click.option('--machine-id', '-m', help='Filter by machine ID')
|
||||
@click.option('--limit', '-n', type=int, help='Limit number of results')
|
||||
def list_cmd(binary_name: Optional[str], machine_id: Optional[str], limit: Optional[int]):
|
||||
@main.command("list")
|
||||
@click.option("--binary-name", "-b", help="Filter by binary name")
|
||||
@click.option("--machine-id", "-m", help="Filter by machine ID")
|
||||
@click.option("--limit", "-n", type=int, help="Limit number of results")
|
||||
def list_cmd(binary_name: str | None, machine_id: str | None, limit: int | None):
|
||||
"""List Processes as JSONL."""
|
||||
sys.exit(list_processes(
|
||||
binary_name=binary_name,
|
||||
machine_id=machine_id,
|
||||
limit=limit,
|
||||
))
|
||||
sys.exit(
|
||||
list_processes(
|
||||
binary_name=binary_name,
|
||||
machine_id=machine_id,
|
||||
limit=limit,
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -1,11 +1,11 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox remove'
|
||||
__package__ = "archivebox.cli"
|
||||
__command__ = "archivebox remove"
|
||||
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from typing import Iterable
|
||||
from collections.abc import Iterable
|
||||
|
||||
import rich_click as click
|
||||
|
||||
@@ -26,25 +26,27 @@ from archivebox.misc.logging_util import (
|
||||
|
||||
|
||||
@enforce_types
|
||||
def remove(filter_patterns: Iterable[str]=(),
|
||||
filter_type: str='exact',
|
||||
snapshots: QuerySet | None=None,
|
||||
after: float | None=None,
|
||||
before: float | None=None,
|
||||
yes: bool=False,
|
||||
delete: bool=False,
|
||||
out_dir: Path=DATA_DIR) -> QuerySet:
|
||||
def remove(
|
||||
filter_patterns: Iterable[str] = (),
|
||||
filter_type: str = "exact",
|
||||
snapshots: QuerySet | None = None,
|
||||
after: float | None = None,
|
||||
before: float | None = None,
|
||||
yes: bool = False,
|
||||
delete: bool = False,
|
||||
out_dir: Path = DATA_DIR,
|
||||
) -> QuerySet:
|
||||
"""Remove the specified URLs from the archive"""
|
||||
|
||||
|
||||
setup_django()
|
||||
check_data_folder()
|
||||
|
||||
|
||||
from archivebox.cli.archivebox_search import get_snapshots
|
||||
|
||||
pattern_list = list(filter_patterns)
|
||||
|
||||
log_list_started(pattern_list or None, filter_type)
|
||||
timer = TimedProgress(360, prefix=' ')
|
||||
timer = TimedProgress(360, prefix=" ")
|
||||
try:
|
||||
snapshots = get_snapshots(
|
||||
snapshots=snapshots,
|
||||
@@ -63,7 +65,7 @@ def remove(filter_patterns: Iterable[str]=(),
|
||||
log_list_finished(snapshots)
|
||||
log_removal_started(snapshots, yes=yes, delete=delete)
|
||||
|
||||
timer = TimedProgress(360, prefix=' ')
|
||||
timer = TimedProgress(360, prefix=" ")
|
||||
try:
|
||||
for snapshot in snapshots:
|
||||
if delete:
|
||||
@@ -88,17 +90,23 @@ def remove(filter_patterns: Iterable[str]=(),
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option('--yes', is_flag=True, help='Remove links instantly without prompting to confirm')
|
||||
@click.option('--delete', is_flag=True, help='Delete the archived content and metadata folder in addition to removing from index')
|
||||
@click.option('--before', type=float, help='Remove only URLs bookmarked before timestamp')
|
||||
@click.option('--after', type=float, help='Remove only URLs bookmarked after timestamp')
|
||||
@click.option('--filter-type', '-f', type=click.Choice(('exact', 'substring', 'domain', 'regex', 'tag')), default='exact', help='Type of pattern matching to use when filtering URLs')
|
||||
@click.argument('filter_patterns', nargs=-1)
|
||||
@click.option("--yes", is_flag=True, help="Remove links instantly without prompting to confirm")
|
||||
@click.option("--delete", is_flag=True, help="Delete the archived content and metadata folder in addition to removing from index")
|
||||
@click.option("--before", type=float, help="Remove only URLs bookmarked before timestamp")
|
||||
@click.option("--after", type=float, help="Remove only URLs bookmarked after timestamp")
|
||||
@click.option(
|
||||
"--filter-type",
|
||||
"-f",
|
||||
type=click.Choice(("exact", "substring", "domain", "regex", "tag")),
|
||||
default="exact",
|
||||
help="Type of pattern matching to use when filtering URLs",
|
||||
)
|
||||
@click.argument("filter_patterns", nargs=-1)
|
||||
@docstring(remove.__doc__)
|
||||
def main(**kwargs):
|
||||
"""Remove the specified URLs from the archive"""
|
||||
remove(**kwargs)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -37,8 +37,8 @@ Examples:
|
||||
archivebox run --binary-id=019b7e90-5a8e-712c-9877-2c70eebe80ad
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox run'
|
||||
__package__ = "archivebox.cli"
|
||||
__command__ = "archivebox run"
|
||||
|
||||
import sys
|
||||
from collections import defaultdict
|
||||
@@ -87,8 +87,8 @@ def process_stdin_records() -> int:
|
||||
binary_ids: list[str] = []
|
||||
|
||||
for record in records:
|
||||
record_type = record.get('type', '')
|
||||
record_id = record.get('id')
|
||||
record_type = record.get("type", "")
|
||||
record_id = record.get("id")
|
||||
|
||||
try:
|
||||
if record_type == TYPE_CRAWL:
|
||||
@@ -97,10 +97,10 @@ def process_stdin_records() -> int:
|
||||
try:
|
||||
crawl = Crawl.objects.get(id=record_id)
|
||||
except Crawl.DoesNotExist:
|
||||
crawl = Crawl.from_json(record, overrides={'created_by_id': created_by_id})
|
||||
crawl = Crawl.from_json(record, overrides={"created_by_id": created_by_id})
|
||||
else:
|
||||
# New crawl - create it
|
||||
crawl = Crawl.from_json(record, overrides={'created_by_id': created_by_id})
|
||||
crawl = Crawl.from_json(record, overrides={"created_by_id": created_by_id})
|
||||
|
||||
if crawl:
|
||||
crawl.retry_at = timezone.now()
|
||||
@@ -112,16 +112,16 @@ def process_stdin_records() -> int:
|
||||
output_records.append(crawl.to_json())
|
||||
queued_count += 1
|
||||
|
||||
elif record_type == TYPE_SNAPSHOT or (record.get('url') and not record_type):
|
||||
elif record_type == TYPE_SNAPSHOT or (record.get("url") and not record_type):
|
||||
if record_id:
|
||||
# Existing snapshot - re-queue
|
||||
try:
|
||||
snapshot = Snapshot.objects.get(id=record_id)
|
||||
except Snapshot.DoesNotExist:
|
||||
snapshot = Snapshot.from_json(record, overrides={'created_by_id': created_by_id})
|
||||
snapshot = Snapshot.from_json(record, overrides={"created_by_id": created_by_id})
|
||||
else:
|
||||
# New snapshot - create it
|
||||
snapshot = Snapshot.from_json(record, overrides={'created_by_id': created_by_id})
|
||||
snapshot = Snapshot.from_json(record, overrides={"created_by_id": created_by_id})
|
||||
|
||||
if snapshot:
|
||||
snapshot.retry_at = timezone.now()
|
||||
@@ -132,7 +132,7 @@ def process_stdin_records() -> int:
|
||||
crawl.retry_at = timezone.now()
|
||||
if crawl.status != Crawl.StatusChoices.STARTED:
|
||||
crawl.status = Crawl.StatusChoices.QUEUED
|
||||
crawl.save(update_fields=['status', 'retry_at', 'modified_at'])
|
||||
crawl.save(update_fields=["status", "retry_at", "modified_at"])
|
||||
crawl_id = str(snapshot.crawl_id)
|
||||
snapshot_ids_by_crawl[crawl_id].add(str(snapshot.id))
|
||||
run_all_plugins_for_crawl.add(crawl_id)
|
||||
@@ -149,11 +149,16 @@ def process_stdin_records() -> int:
|
||||
else:
|
||||
archiveresult = None
|
||||
|
||||
snapshot_id = record.get('snapshot_id')
|
||||
plugin_name = record.get('plugin')
|
||||
snapshot_id = record.get("snapshot_id")
|
||||
plugin_name = record.get("plugin")
|
||||
snapshot = None
|
||||
if archiveresult:
|
||||
if archiveresult.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED, ArchiveResult.StatusChoices.NORESULTS, ArchiveResult.StatusChoices.BACKOFF]:
|
||||
if archiveresult.status in [
|
||||
ArchiveResult.StatusChoices.FAILED,
|
||||
ArchiveResult.StatusChoices.SKIPPED,
|
||||
ArchiveResult.StatusChoices.NORESULTS,
|
||||
ArchiveResult.StatusChoices.BACKOFF,
|
||||
]:
|
||||
archiveresult.reset_for_retry()
|
||||
snapshot = archiveresult.snapshot
|
||||
plugin_name = plugin_name or archiveresult.plugin
|
||||
@@ -167,12 +172,12 @@ def process_stdin_records() -> int:
|
||||
snapshot.retry_at = timezone.now()
|
||||
if snapshot.status != Snapshot.StatusChoices.STARTED:
|
||||
snapshot.status = Snapshot.StatusChoices.QUEUED
|
||||
snapshot.save(update_fields=['status', 'retry_at', 'modified_at'])
|
||||
snapshot.save(update_fields=["status", "retry_at", "modified_at"])
|
||||
crawl = snapshot.crawl
|
||||
crawl.retry_at = timezone.now()
|
||||
if crawl.status != Crawl.StatusChoices.STARTED:
|
||||
crawl.status = Crawl.StatusChoices.QUEUED
|
||||
crawl.save(update_fields=['status', 'retry_at', 'modified_at'])
|
||||
crawl.save(update_fields=["status", "retry_at", "modified_at"])
|
||||
crawl_id = str(snapshot.crawl_id)
|
||||
snapshot_ids_by_crawl[crawl_id].add(str(snapshot.id))
|
||||
if plugin_name:
|
||||
@@ -203,7 +208,7 @@ def process_stdin_records() -> int:
|
||||
output_records.append(record)
|
||||
|
||||
except Exception as e:
|
||||
rprint(f'[yellow]Error processing record: {e}[/yellow]', file=sys.stderr)
|
||||
rprint(f"[yellow]Error processing record: {e}[/yellow]", file=sys.stderr)
|
||||
continue
|
||||
|
||||
# Output all processed records (for chaining)
|
||||
@@ -212,10 +217,10 @@ def process_stdin_records() -> int:
|
||||
write_record(rec)
|
||||
|
||||
if queued_count == 0:
|
||||
rprint('[yellow]No records to process[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No records to process[/yellow]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
rprint(f'[blue]Processing {queued_count} records...[/blue]', file=sys.stderr)
|
||||
rprint(f"[blue]Processing {queued_count} records...[/blue]", file=sys.stderr)
|
||||
|
||||
for binary_id in binary_ids:
|
||||
run_binary(binary_id)
|
||||
@@ -245,13 +250,14 @@ def run_runner(daemon: bool = False) -> int:
|
||||
from archivebox.services.runner import recover_orphaned_crawls, recover_orphaned_snapshots, run_pending_crawls
|
||||
|
||||
Process.cleanup_stale_running()
|
||||
Process.cleanup_orphaned_workers()
|
||||
recover_orphaned_snapshots()
|
||||
recover_orphaned_crawls()
|
||||
Machine.current()
|
||||
current = Process.current()
|
||||
if current.process_type != Process.TypeChoices.ORCHESTRATOR:
|
||||
current.process_type = Process.TypeChoices.ORCHESTRATOR
|
||||
current.save(update_fields=['process_type', 'modified_at'])
|
||||
current.save(update_fields=["process_type", "modified_at"])
|
||||
|
||||
try:
|
||||
run_pending_crawls(daemon=daemon)
|
||||
@@ -259,21 +265,21 @@ def run_runner(daemon: bool = False) -> int:
|
||||
except KeyboardInterrupt:
|
||||
return 0
|
||||
except Exception as e:
|
||||
rprint(f'[red]Runner error: {type(e).__name__}: {e}[/red]', file=sys.stderr)
|
||||
rprint(f"[red]Runner error: {type(e).__name__}: {e}[/red]", file=sys.stderr)
|
||||
return 1
|
||||
finally:
|
||||
current.refresh_from_db()
|
||||
if current.status != Process.StatusChoices.EXITED:
|
||||
current.status = Process.StatusChoices.EXITED
|
||||
current.ended_at = current.ended_at or timezone.now()
|
||||
current.save(update_fields=['status', 'ended_at', 'modified_at'])
|
||||
current.save(update_fields=["status", "ended_at", "modified_at"])
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option('--daemon', '-d', is_flag=True, help="Run forever (don't exit on idle)")
|
||||
@click.option('--crawl-id', help="Run the crawl runner for a specific crawl only")
|
||||
@click.option('--snapshot-id', help="Run one snapshot through its crawl")
|
||||
@click.option('--binary-id', help="Run one queued binary install directly on the bus")
|
||||
@click.option("--daemon", "-d", is_flag=True, help="Run forever (don't exit on idle)")
|
||||
@click.option("--crawl-id", help="Run the crawl runner for a specific crawl only")
|
||||
@click.option("--snapshot-id", help="Run one snapshot through its crawl")
|
||||
@click.option("--binary-id", help="Run one queued binary install directly on the bus")
|
||||
def main(daemon: bool, crawl_id: str, snapshot_id: str, binary_id: str):
|
||||
"""
|
||||
Process queued work.
|
||||
@@ -297,21 +303,24 @@ def main(daemon: bool, crawl_id: str, snapshot_id: str, binary_id: str):
|
||||
except KeyboardInterrupt:
|
||||
sys.exit(0)
|
||||
except Exception as e:
|
||||
rprint(f'[red]Runner error: {type(e).__name__}: {e}[/red]', file=sys.stderr)
|
||||
rprint(f"[red]Runner error: {type(e).__name__}: {e}[/red]", file=sys.stderr)
|
||||
import traceback
|
||||
|
||||
traceback.print_exc()
|
||||
sys.exit(1)
|
||||
|
||||
if crawl_id:
|
||||
try:
|
||||
from archivebox.services.runner import run_crawl
|
||||
|
||||
run_crawl(crawl_id)
|
||||
sys.exit(0)
|
||||
except KeyboardInterrupt:
|
||||
sys.exit(0)
|
||||
except Exception as e:
|
||||
rprint(f'[red]Runner error: {type(e).__name__}: {e}[/red]', file=sys.stderr)
|
||||
rprint(f"[red]Runner error: {type(e).__name__}: {e}[/red]", file=sys.stderr)
|
||||
import traceback
|
||||
|
||||
traceback.print_exc()
|
||||
sys.exit(1)
|
||||
|
||||
@@ -333,17 +342,18 @@ def run_snapshot_worker(snapshot_id: str) -> int:
|
||||
from archivebox.services.runner import run_crawl
|
||||
|
||||
try:
|
||||
snapshot = Snapshot.objects.select_related('crawl').get(id=snapshot_id)
|
||||
snapshot = Snapshot.objects.select_related("crawl").get(id=snapshot_id)
|
||||
run_crawl(str(snapshot.crawl_id), snapshot_ids=[str(snapshot.id)])
|
||||
return 0
|
||||
except KeyboardInterrupt:
|
||||
return 0
|
||||
except Exception as e:
|
||||
rprint(f'[red]Runner error: {type(e).__name__}: {e}[/red]', file=sys.stderr)
|
||||
rprint(f"[red]Runner error: {type(e).__name__}: {e}[/red]", file=sys.stderr)
|
||||
import traceback
|
||||
|
||||
traceback.print_exc()
|
||||
return 1
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__package__ = "archivebox.cli"
|
||||
|
||||
import rich_click as click
|
||||
from rich import print
|
||||
@@ -10,18 +10,20 @@ from archivebox.config.common import ARCHIVING_CONFIG
|
||||
|
||||
|
||||
@enforce_types
|
||||
def schedule(add: bool = False,
|
||||
show: bool = False,
|
||||
clear: bool = False,
|
||||
foreground: bool = False,
|
||||
run_all: bool = False,
|
||||
quiet: bool = False,
|
||||
every: str | None = None,
|
||||
tag: str = '',
|
||||
depth: int | str = 0,
|
||||
overwrite: bool = False,
|
||||
update: bool = not ARCHIVING_CONFIG.ONLY_NEW,
|
||||
import_path: str | None = None):
|
||||
def schedule(
|
||||
add: bool = False,
|
||||
show: bool = False,
|
||||
clear: bool = False,
|
||||
foreground: bool = False,
|
||||
run_all: bool = False,
|
||||
quiet: bool = False,
|
||||
every: str | None = None,
|
||||
tag: str = "",
|
||||
depth: int | str = 0,
|
||||
overwrite: bool = False,
|
||||
update: bool = not ARCHIVING_CONFIG.ONLY_NEW,
|
||||
import_path: str | None = None,
|
||||
):
|
||||
"""Manage database-backed scheduled crawls processed by the crawl runner."""
|
||||
|
||||
from django.utils import timezone
|
||||
@@ -33,55 +35,51 @@ def schedule(add: bool = False,
|
||||
|
||||
depth = int(depth)
|
||||
result: dict[str, object] = {
|
||||
'created_schedule_ids': [],
|
||||
'disabled_count': 0,
|
||||
'run_all_enqueued': 0,
|
||||
'active_schedule_ids': [],
|
||||
"created_schedule_ids": [],
|
||||
"disabled_count": 0,
|
||||
"run_all_enqueued": 0,
|
||||
"active_schedule_ids": [],
|
||||
}
|
||||
|
||||
def _active_schedules():
|
||||
return CrawlSchedule.objects.filter(is_enabled=True).select_related('template').order_by('created_at')
|
||||
return CrawlSchedule.objects.filter(is_enabled=True).select_related("template").order_by("created_at")
|
||||
|
||||
if clear:
|
||||
disabled_count = CrawlSchedule.objects.filter(is_enabled=True).update(
|
||||
is_enabled=False,
|
||||
modified_at=timezone.now(),
|
||||
)
|
||||
result['disabled_count'] = disabled_count
|
||||
print(f'[green]\\[√] Disabled {disabled_count} scheduled crawl(s).[/green]')
|
||||
result["disabled_count"] = disabled_count
|
||||
print(f"[green]\\[√] Disabled {disabled_count} scheduled crawl(s).[/green]")
|
||||
|
||||
if every or add:
|
||||
schedule_str = (every or 'day').strip()
|
||||
schedule_str = (every or "day").strip()
|
||||
validate_schedule(schedule_str)
|
||||
|
||||
created_by_id = get_or_create_system_user_pk()
|
||||
is_update_schedule = not import_path
|
||||
template_urls = import_path or 'archivebox://update'
|
||||
template_label = (
|
||||
f'Scheduled import: {template_urls}'
|
||||
if import_path else
|
||||
'Scheduled ArchiveBox update'
|
||||
)[:64]
|
||||
template_urls = import_path or "archivebox://update"
|
||||
template_label = (f"Scheduled import: {template_urls}" if import_path else "Scheduled ArchiveBox update")[:64]
|
||||
template_notes = (
|
||||
f'Created by archivebox schedule for {template_urls}'
|
||||
if import_path else
|
||||
'Created by archivebox schedule to queue recurring archivebox://update maintenance crawls.'
|
||||
f"Created by archivebox schedule for {template_urls}"
|
||||
if import_path
|
||||
else "Created by archivebox schedule to queue recurring archivebox://update maintenance crawls."
|
||||
)
|
||||
|
||||
template = Crawl.objects.create(
|
||||
urls=template_urls,
|
||||
max_depth=0 if is_update_schedule else depth,
|
||||
tags_str='' if is_update_schedule else tag,
|
||||
tags_str="" if is_update_schedule else tag,
|
||||
label=template_label,
|
||||
notes=template_notes,
|
||||
created_by_id=created_by_id,
|
||||
status=Crawl.StatusChoices.SEALED,
|
||||
retry_at=None,
|
||||
config={
|
||||
'ONLY_NEW': not update,
|
||||
'OVERWRITE': overwrite,
|
||||
'DEPTH': 0 if is_update_schedule else depth,
|
||||
'SCHEDULE_KIND': 'update' if is_update_schedule else 'crawl',
|
||||
"ONLY_NEW": not update,
|
||||
"OVERWRITE": overwrite,
|
||||
"DEPTH": 0 if is_update_schedule else depth,
|
||||
"SCHEDULE_KIND": "update" if is_update_schedule else "crawl",
|
||||
},
|
||||
)
|
||||
crawl_schedule = CrawlSchedule.objects.create(
|
||||
@@ -92,31 +90,31 @@ def schedule(add: bool = False,
|
||||
notes=template_notes,
|
||||
created_by_id=created_by_id,
|
||||
)
|
||||
result['created_schedule_ids'] = [str(crawl_schedule.id)]
|
||||
result["created_schedule_ids"] = [str(crawl_schedule.id)]
|
||||
|
||||
schedule_type = 'maintenance update' if is_update_schedule else 'crawl'
|
||||
print(f'[green]\\[√] Created scheduled {schedule_type}.[/green]')
|
||||
print(f' id={crawl_schedule.id}')
|
||||
print(f' every={crawl_schedule.schedule}')
|
||||
print(f' next_run={crawl_schedule.next_run_at.isoformat()}')
|
||||
schedule_type = "maintenance update" if is_update_schedule else "crawl"
|
||||
print(f"[green]\\[√] Created scheduled {schedule_type}.[/green]")
|
||||
print(f" id={crawl_schedule.id}")
|
||||
print(f" every={crawl_schedule.schedule}")
|
||||
print(f" next_run={crawl_schedule.next_run_at.isoformat()}")
|
||||
if import_path:
|
||||
print(f' source={import_path}')
|
||||
print(f" source={import_path}")
|
||||
|
||||
schedules = list(_active_schedules())
|
||||
result['active_schedule_ids'] = [str(schedule.id) for schedule in schedules]
|
||||
result["active_schedule_ids"] = [str(schedule.id) for schedule in schedules]
|
||||
|
||||
if show:
|
||||
if schedules:
|
||||
print(f'[green]\\[*] Active scheduled crawls: {len(schedules)}[/green]')
|
||||
print(f"[green]\\[*] Active scheduled crawls: {len(schedules)}[/green]")
|
||||
for scheduled_crawl in schedules:
|
||||
template = scheduled_crawl.template
|
||||
print(
|
||||
f' - id={scheduled_crawl.id} every={scheduled_crawl.schedule} '
|
||||
f'next_run={scheduled_crawl.next_run_at.isoformat()} '
|
||||
f'source={template.urls.splitlines()[0] if template.urls else ""}'
|
||||
f" - id={scheduled_crawl.id} every={scheduled_crawl.schedule} "
|
||||
f"next_run={scheduled_crawl.next_run_at.isoformat()} "
|
||||
f"source={template.urls.splitlines()[0] if template.urls else ''}",
|
||||
)
|
||||
else:
|
||||
print('[yellow]\\[*] No scheduled crawls are enabled.[/yellow]')
|
||||
print("[yellow]\\[*] No scheduled crawls are enabled.[/yellow]")
|
||||
|
||||
if run_all:
|
||||
enqueued = 0
|
||||
@@ -124,13 +122,17 @@ def schedule(add: bool = False,
|
||||
for scheduled_crawl in schedules:
|
||||
scheduled_crawl.enqueue(queued_at=now)
|
||||
enqueued += 1
|
||||
result['run_all_enqueued'] = enqueued
|
||||
print(f'[green]\\[*] Enqueued {enqueued} scheduled crawl(s) immediately.[/green]')
|
||||
result["run_all_enqueued"] = enqueued
|
||||
print(f"[green]\\[*] Enqueued {enqueued} scheduled crawl(s) immediately.[/green]")
|
||||
if enqueued:
|
||||
print('[yellow]\\[*] Start `archivebox server`, `archivebox run --daemon`, or `archivebox schedule --foreground` to process the queued crawls.[/yellow]')
|
||||
print(
|
||||
"[yellow]\\[*] Start `archivebox server`, `archivebox run --daemon`, or `archivebox schedule --foreground` to process the queued crawls.[/yellow]",
|
||||
)
|
||||
|
||||
if foreground:
|
||||
print('[green]\\[*] Starting global crawl runner in foreground mode. It will materialize scheduled crawls and process queued work.[/green]')
|
||||
print(
|
||||
"[green]\\[*] Starting global crawl runner in foreground mode. It will materialize scheduled crawls and process queued work.[/green]",
|
||||
)
|
||||
run_pending_crawls(daemon=True)
|
||||
|
||||
if quiet:
|
||||
@@ -138,33 +140,38 @@ def schedule(add: bool = False,
|
||||
|
||||
if not any((every, add, show, clear, foreground, run_all)):
|
||||
if schedules:
|
||||
print('[green]\\[*] Active scheduled crawls:[/green]')
|
||||
print("[green]\\[*] Active scheduled crawls:[/green]")
|
||||
for scheduled_crawl in schedules:
|
||||
print(f' - {scheduled_crawl.id} every={scheduled_crawl.schedule} next_run={scheduled_crawl.next_run_at.isoformat()}')
|
||||
print(f" - {scheduled_crawl.id} every={scheduled_crawl.schedule} next_run={scheduled_crawl.next_run_at.isoformat()}")
|
||||
else:
|
||||
print('[yellow]\\[*] No scheduled crawls are enabled.[/yellow]')
|
||||
print("[yellow]\\[*] No scheduled crawls are enabled.[/yellow]")
|
||||
|
||||
return result
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option('--quiet', '-q', is_flag=True, help="Return structured results without extra summary output")
|
||||
@click.option('--add', is_flag=True, help='Create a new scheduled crawl')
|
||||
@click.option('--every', type=str, help='Run on an alias like daily/weekly/monthly or a cron expression such as "0 */6 * * *"')
|
||||
@click.option('--tag', '-t', default='', help='Comma-separated tags to apply to scheduled crawl snapshots')
|
||||
@click.option('--depth', type=click.Choice([str(i) for i in range(5)]), default='0', help='Recursively archive linked pages up to N hops away')
|
||||
@click.option('--overwrite', is_flag=True, help='Overwrite existing data if URLs have been archived previously')
|
||||
@click.option('--update', is_flag=True, help='Retry previously failed/skipped URLs when scheduled crawls run')
|
||||
@click.option('--clear', is_flag=True, help='Disable all currently enabled schedules')
|
||||
@click.option('--show', is_flag=True, help='Print all currently enabled schedules')
|
||||
@click.option('--foreground', '-f', is_flag=True, help='Run the global crawl runner in the foreground (no crontab required)')
|
||||
@click.option('--run-all', is_flag=True, help='Enqueue all enabled schedules immediately and process them once')
|
||||
@click.argument('import_path', required=False)
|
||||
@click.option("--quiet", "-q", is_flag=True, help="Return structured results without extra summary output")
|
||||
@click.option("--add", is_flag=True, help="Create a new scheduled crawl")
|
||||
@click.option("--every", type=str, help='Run on an alias like daily/weekly/monthly or a cron expression such as "0 */6 * * *"')
|
||||
@click.option("--tag", "-t", default="", help="Comma-separated tags to apply to scheduled crawl snapshots")
|
||||
@click.option(
|
||||
"--depth",
|
||||
type=click.Choice([str(i) for i in range(5)]),
|
||||
default="0",
|
||||
help="Recursively archive linked pages up to N hops away",
|
||||
)
|
||||
@click.option("--overwrite", is_flag=True, help="Overwrite existing data if URLs have been archived previously")
|
||||
@click.option("--update", is_flag=True, help="Retry previously failed/skipped URLs when scheduled crawls run")
|
||||
@click.option("--clear", is_flag=True, help="Disable all currently enabled schedules")
|
||||
@click.option("--show", is_flag=True, help="Print all currently enabled schedules")
|
||||
@click.option("--foreground", "-f", is_flag=True, help="Run the global crawl runner in the foreground (no crontab required)")
|
||||
@click.option("--run-all", is_flag=True, help="Enqueue all enabled schedules immediately and process them once")
|
||||
@click.argument("import_path", required=False)
|
||||
@docstring(schedule.__doc__)
|
||||
def main(**kwargs):
|
||||
"""Manage database-backed scheduled crawls processed by the crawl runner."""
|
||||
schedule(**kwargs)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -1,11 +1,12 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox search'
|
||||
__package__ = "archivebox.cli"
|
||||
__command__ = "archivebox search"
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Callable
|
||||
from typing import TYPE_CHECKING
|
||||
from collections.abc import Callable
|
||||
|
||||
import rich_click as click
|
||||
|
||||
@@ -20,30 +21,28 @@ if TYPE_CHECKING:
|
||||
|
||||
# Filter types for URL matching
|
||||
LINK_FILTERS: dict[str, Callable[[str], Q]] = {
|
||||
'exact': lambda pattern: Q(url=pattern),
|
||||
'substring': lambda pattern: Q(url__icontains=pattern),
|
||||
'regex': lambda pattern: Q(url__iregex=pattern),
|
||||
'domain': lambda pattern: (
|
||||
Q(url__istartswith=f'http://{pattern}')
|
||||
| Q(url__istartswith=f'https://{pattern}')
|
||||
| Q(url__istartswith=f'ftp://{pattern}')
|
||||
"exact": lambda pattern: Q(url=pattern),
|
||||
"substring": lambda pattern: Q(url__icontains=pattern),
|
||||
"regex": lambda pattern: Q(url__iregex=pattern),
|
||||
"domain": lambda pattern: (
|
||||
Q(url__istartswith=f"http://{pattern}") | Q(url__istartswith=f"https://{pattern}") | Q(url__istartswith=f"ftp://{pattern}")
|
||||
),
|
||||
'tag': lambda pattern: Q(tags__name=pattern),
|
||||
'timestamp': lambda pattern: Q(timestamp=pattern),
|
||||
"tag": lambda pattern: Q(tags__name=pattern),
|
||||
"timestamp": lambda pattern: Q(timestamp=pattern),
|
||||
}
|
||||
|
||||
STATUS_CHOICES = ['indexed', 'archived', 'unarchived']
|
||||
STATUS_CHOICES = ["indexed", "archived", "unarchived"]
|
||||
|
||||
|
||||
def _apply_pattern_filters(
|
||||
snapshots: QuerySet['Snapshot', 'Snapshot'],
|
||||
snapshots: QuerySet["Snapshot", "Snapshot"],
|
||||
filter_patterns: list[str],
|
||||
filter_type: str,
|
||||
) -> QuerySet['Snapshot', 'Snapshot']:
|
||||
) -> QuerySet["Snapshot", "Snapshot"]:
|
||||
filter_builder = LINK_FILTERS.get(filter_type)
|
||||
if filter_builder is None:
|
||||
stderr()
|
||||
stderr(f'[X] Got invalid pattern for --filter-type={filter_type}', color='red')
|
||||
stderr(f"[X] Got invalid pattern for --filter-type={filter_type}", color="red")
|
||||
raise SystemExit(2)
|
||||
|
||||
query = Q()
|
||||
@@ -53,7 +52,7 @@ def _apply_pattern_filters(
|
||||
|
||||
|
||||
def _snapshots_to_json(
|
||||
snapshots: QuerySet['Snapshot', 'Snapshot'],
|
||||
snapshots: QuerySet["Snapshot", "Snapshot"],
|
||||
*,
|
||||
with_headers: bool,
|
||||
) -> str:
|
||||
@@ -63,31 +62,35 @@ def _snapshots_to_json(
|
||||
from archivebox.config.common import SERVER_CONFIG
|
||||
from archivebox.misc.util import to_json
|
||||
|
||||
main_index_header = {
|
||||
'info': 'This is an index of site data archived by ArchiveBox: The self-hosted web archive.',
|
||||
'schema': 'archivebox.index.json',
|
||||
'copyright_info': SERVER_CONFIG.FOOTER_INFO,
|
||||
'meta': {
|
||||
'project': 'ArchiveBox',
|
||||
'version': VERSION,
|
||||
'git_sha': VERSION,
|
||||
'website': 'https://ArchiveBox.io',
|
||||
'docs': 'https://github.com/ArchiveBox/ArchiveBox/wiki',
|
||||
'source': 'https://github.com/ArchiveBox/ArchiveBox',
|
||||
'issues': 'https://github.com/ArchiveBox/ArchiveBox/issues',
|
||||
'dependencies': {},
|
||||
},
|
||||
} if with_headers else {}
|
||||
main_index_header = (
|
||||
{
|
||||
"info": "This is an index of site data archived by ArchiveBox: The self-hosted web archive.",
|
||||
"schema": "archivebox.index.json",
|
||||
"copyright_info": SERVER_CONFIG.FOOTER_INFO,
|
||||
"meta": {
|
||||
"project": "ArchiveBox",
|
||||
"version": VERSION,
|
||||
"git_sha": VERSION,
|
||||
"website": "https://ArchiveBox.io",
|
||||
"docs": "https://github.com/ArchiveBox/ArchiveBox/wiki",
|
||||
"source": "https://github.com/ArchiveBox/ArchiveBox",
|
||||
"issues": "https://github.com/ArchiveBox/ArchiveBox/issues",
|
||||
"dependencies": {},
|
||||
},
|
||||
}
|
||||
if with_headers
|
||||
else {}
|
||||
)
|
||||
|
||||
snapshot_dicts = [snapshot.to_dict(extended=True) for snapshot in snapshots.iterator(chunk_size=500)]
|
||||
output: dict[str, object] | list[dict[str, object]]
|
||||
if with_headers:
|
||||
output = {
|
||||
**main_index_header,
|
||||
'num_links': len(snapshot_dicts),
|
||||
'updated': datetime.now(tz.utc),
|
||||
'last_run_cmd': sys.argv,
|
||||
'links': snapshot_dicts,
|
||||
"num_links": len(snapshot_dicts),
|
||||
"updated": datetime.now(tz.utc),
|
||||
"last_run_cmd": sys.argv,
|
||||
"links": snapshot_dicts,
|
||||
}
|
||||
else:
|
||||
output = snapshot_dicts
|
||||
@@ -96,18 +99,18 @@ def _snapshots_to_json(
|
||||
|
||||
|
||||
def _snapshots_to_csv(
|
||||
snapshots: QuerySet['Snapshot', 'Snapshot'],
|
||||
snapshots: QuerySet["Snapshot", "Snapshot"],
|
||||
*,
|
||||
cols: list[str],
|
||||
with_headers: bool,
|
||||
) -> str:
|
||||
header = ','.join(cols) if with_headers else ''
|
||||
rows = [snapshot.to_csv(cols=cols, separator=',') for snapshot in snapshots.iterator(chunk_size=500)]
|
||||
return '\n'.join((header, *rows))
|
||||
header = ",".join(cols) if with_headers else ""
|
||||
rows = [snapshot.to_csv(cols=cols, separator=",") for snapshot in snapshots.iterator(chunk_size=500)]
|
||||
return "\n".join((header, *rows))
|
||||
|
||||
|
||||
def _snapshots_to_html(
|
||||
snapshots: QuerySet['Snapshot', 'Snapshot'],
|
||||
snapshots: QuerySet["Snapshot", "Snapshot"],
|
||||
*,
|
||||
with_headers: bool,
|
||||
) -> str:
|
||||
@@ -119,26 +122,31 @@ def _snapshots_to_html(
|
||||
from archivebox.config.common import SERVER_CONFIG
|
||||
from archivebox.config.version import get_COMMIT_HASH
|
||||
|
||||
template = 'static_index.html' if with_headers else 'minimal_index.html'
|
||||
template = "static_index.html" if with_headers else "minimal_index.html"
|
||||
snapshot_list = list(snapshots.iterator(chunk_size=500))
|
||||
|
||||
return render_to_string(template, {
|
||||
'version': VERSION,
|
||||
'git_sha': get_COMMIT_HASH() or VERSION,
|
||||
'num_links': str(len(snapshot_list)),
|
||||
'date_updated': datetime.now(tz.utc).strftime('%Y-%m-%d'),
|
||||
'time_updated': datetime.now(tz.utc).strftime('%Y-%m-%d %H:%M'),
|
||||
'links': snapshot_list,
|
||||
'FOOTER_INFO': SERVER_CONFIG.FOOTER_INFO,
|
||||
})
|
||||
return render_to_string(
|
||||
template,
|
||||
{
|
||||
"version": VERSION,
|
||||
"git_sha": get_COMMIT_HASH() or VERSION,
|
||||
"num_links": str(len(snapshot_list)),
|
||||
"date_updated": datetime.now(tz.utc).strftime("%Y-%m-%d"),
|
||||
"time_updated": datetime.now(tz.utc).strftime("%Y-%m-%d %H:%M"),
|
||||
"links": snapshot_list,
|
||||
"FOOTER_INFO": SERVER_CONFIG.FOOTER_INFO,
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
def get_snapshots(snapshots: QuerySet['Snapshot', 'Snapshot'] | None=None,
|
||||
filter_patterns: list[str] | None=None,
|
||||
filter_type: str='substring',
|
||||
after: float | None=None,
|
||||
before: float | None=None,
|
||||
out_dir: Path=DATA_DIR) -> QuerySet['Snapshot', 'Snapshot']:
|
||||
def get_snapshots(
|
||||
snapshots: QuerySet["Snapshot", "Snapshot"] | None = None,
|
||||
filter_patterns: list[str] | None = None,
|
||||
filter_type: str = "substring",
|
||||
after: float | None = None,
|
||||
before: float | None = None,
|
||||
out_dir: Path = DATA_DIR,
|
||||
) -> QuerySet["Snapshot", "Snapshot"]:
|
||||
"""Filter and return Snapshots matching the given criteria."""
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
@@ -155,29 +163,31 @@ def get_snapshots(snapshots: QuerySet['Snapshot', 'Snapshot'] | None=None,
|
||||
result = _apply_pattern_filters(result, filter_patterns, filter_type)
|
||||
|
||||
# Prefetch crawl relationship to avoid N+1 queries when accessing output_dir
|
||||
result = result.select_related('crawl', 'crawl__created_by')
|
||||
result = result.select_related("crawl", "crawl__created_by")
|
||||
|
||||
if not result.exists():
|
||||
stderr('[!] No Snapshots matched your filters:', filter_patterns, f'({filter_type})', color='lightyellow')
|
||||
stderr("[!] No Snapshots matched your filters:", filter_patterns, f"({filter_type})", color="lightyellow")
|
||||
|
||||
return result
|
||||
|
||||
|
||||
@enforce_types
|
||||
def search(filter_patterns: list[str] | None=None,
|
||||
filter_type: str='substring',
|
||||
status: str='indexed',
|
||||
before: float | None=None,
|
||||
after: float | None=None,
|
||||
sort: str | None=None,
|
||||
json: bool=False,
|
||||
html: bool=False,
|
||||
csv: str | None=None,
|
||||
with_headers: bool=False):
|
||||
def search(
|
||||
filter_patterns: list[str] | None = None,
|
||||
filter_type: str = "substring",
|
||||
status: str = "indexed",
|
||||
before: float | None = None,
|
||||
after: float | None = None,
|
||||
sort: str | None = None,
|
||||
json: bool = False,
|
||||
html: bool = False,
|
||||
csv: str | None = None,
|
||||
with_headers: bool = False,
|
||||
):
|
||||
"""List, filter, and export information about archive entries"""
|
||||
|
||||
if with_headers and not (json or html or csv):
|
||||
stderr('[X] --with-headers requires --json, --html or --csv\n', color='red')
|
||||
stderr("[X] --with-headers requires --json, --html or --csv\n", color="red")
|
||||
raise SystemExit(2)
|
||||
|
||||
# Query DB directly - no filesystem scanning
|
||||
@@ -189,9 +199,9 @@ def search(filter_patterns: list[str] | None=None,
|
||||
)
|
||||
|
||||
# Apply status filter
|
||||
if status == 'archived':
|
||||
if status == "archived":
|
||||
snapshots = snapshots.filter(downloaded_at__isnull=False)
|
||||
elif status == 'unarchived':
|
||||
elif status == "unarchived":
|
||||
snapshots = snapshots.filter(downloaded_at__isnull=True)
|
||||
# 'indexed' = all snapshots (no filter)
|
||||
|
||||
@@ -204,9 +214,10 @@ def search(filter_patterns: list[str] | None=None,
|
||||
elif html:
|
||||
output = _snapshots_to_html(snapshots, with_headers=with_headers)
|
||||
elif csv:
|
||||
output = _snapshots_to_csv(snapshots, cols=csv.split(','), with_headers=with_headers)
|
||||
output = _snapshots_to_csv(snapshots, cols=csv.split(","), with_headers=with_headers)
|
||||
else:
|
||||
from archivebox.misc.logging_util import printable_folders
|
||||
|
||||
# Convert to dict for printable_folders
|
||||
folders: dict[str, Snapshot | None] = {str(snapshot.output_dir): snapshot for snapshot in snapshots}
|
||||
output = printable_folders(folders, with_headers)
|
||||
@@ -214,28 +225,33 @@ def search(filter_patterns: list[str] | None=None,
|
||||
# Structured exports must be written directly to stdout.
|
||||
# rich.print() reflows long lines to console width, which corrupts JSON/CSV/HTML output.
|
||||
sys.stdout.write(output)
|
||||
if not output.endswith('\n'):
|
||||
sys.stdout.write('\n')
|
||||
if not output.endswith("\n"):
|
||||
sys.stdout.write("\n")
|
||||
return output
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option('--filter-type', '-f', type=click.Choice(['search', *LINK_FILTERS.keys()]), default='substring', help='Pattern matching type for filtering URLs')
|
||||
@click.option('--status', '-s', type=click.Choice(STATUS_CHOICES), default='indexed', help='List snapshots with the given status')
|
||||
@click.option('--before', '-b', type=float, help='List snapshots bookmarked before the given UNIX timestamp')
|
||||
@click.option('--after', '-a', type=float, help='List snapshots bookmarked after the given UNIX timestamp')
|
||||
@click.option('--sort', '-o', type=str, help='Field to sort by, e.g. url, created_at, bookmarked_at, downloaded_at')
|
||||
@click.option('--json', '-J', is_flag=True, help='Print output in JSON format')
|
||||
@click.option('--html', '-M', is_flag=True, help='Print output in HTML format (suitable for viewing statically without a server)')
|
||||
@click.option('--csv', '-C', type=str, help='Print output as CSV with the provided fields, e.g.: created_at,url,title')
|
||||
@click.option('--with-headers', '-H', is_flag=True, help='Include extra CSV/HTML headers in the output')
|
||||
@click.help_option('--help', '-h')
|
||||
@click.argument('filter_patterns', nargs=-1)
|
||||
@click.option(
|
||||
"--filter-type",
|
||||
"-f",
|
||||
type=click.Choice(["search", *LINK_FILTERS.keys()]),
|
||||
default="substring",
|
||||
help="Pattern matching type for filtering URLs",
|
||||
)
|
||||
@click.option("--status", "-s", type=click.Choice(STATUS_CHOICES), default="indexed", help="List snapshots with the given status")
|
||||
@click.option("--before", "-b", type=float, help="List snapshots bookmarked before the given UNIX timestamp")
|
||||
@click.option("--after", "-a", type=float, help="List snapshots bookmarked after the given UNIX timestamp")
|
||||
@click.option("--sort", "-o", type=str, help="Field to sort by, e.g. url, created_at, bookmarked_at, downloaded_at")
|
||||
@click.option("--json", "-J", is_flag=True, help="Print output in JSON format")
|
||||
@click.option("--html", "-M", is_flag=True, help="Print output in HTML format (suitable for viewing statically without a server)")
|
||||
@click.option("--csv", "-C", type=str, help="Print output as CSV with the provided fields, e.g.: created_at,url,title")
|
||||
@click.option("--with-headers", "-H", is_flag=True, help="Include extra CSV/HTML headers in the output")
|
||||
@click.help_option("--help", "-h")
|
||||
@click.argument("filter_patterns", nargs=-1)
|
||||
@docstring(search.__doc__)
|
||||
def main(**kwargs):
|
||||
return search(**kwargs)
|
||||
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__package__ = "archivebox.cli"
|
||||
|
||||
from typing import Iterable
|
||||
from collections.abc import Iterable
|
||||
import sys
|
||||
|
||||
import rich_click as click
|
||||
@@ -15,20 +15,23 @@ from archivebox.config.common import SERVER_CONFIG
|
||||
def stop_existing_background_runner(*, machine, process_model, supervisor=None, stop_worker_fn=None, log=print) -> int:
|
||||
"""Stop any existing orchestrator process so the server can take ownership."""
|
||||
process_model.cleanup_stale_running(machine=machine)
|
||||
process_model.cleanup_orphaned_workers()
|
||||
|
||||
running_runners = list(process_model.objects.filter(
|
||||
machine=machine,
|
||||
status=process_model.StatusChoices.RUNNING,
|
||||
process_type=process_model.TypeChoices.ORCHESTRATOR,
|
||||
).order_by('created_at'))
|
||||
running_runners = list(
|
||||
process_model.objects.filter(
|
||||
machine=machine,
|
||||
status=process_model.StatusChoices.RUNNING,
|
||||
process_type=process_model.TypeChoices.ORCHESTRATOR,
|
||||
).order_by("created_at"),
|
||||
)
|
||||
|
||||
if not running_runners:
|
||||
return 0
|
||||
|
||||
log('[yellow][*] Stopping existing ArchiveBox background runner...[/yellow]')
|
||||
log("[yellow][*] Stopping existing ArchiveBox background runner...[/yellow]")
|
||||
|
||||
if supervisor is not None and stop_worker_fn is not None:
|
||||
for worker_name in ('worker_runner', 'worker_runner_watch'):
|
||||
for worker_name in ("worker_runner", "worker_runner_watch"):
|
||||
try:
|
||||
stop_worker_fn(supervisor, worker_name)
|
||||
except Exception:
|
||||
@@ -47,23 +50,70 @@ def stop_existing_background_runner(*, machine, process_model, supervisor=None,
|
||||
return len(running_runners)
|
||||
|
||||
|
||||
def _read_supervisor_worker_command(worker_name: str) -> str:
    """Return the ``command=`` value from a worker's supervisord config file.

    The config is looked up as ``<worker_name>.conf`` inside the
    ``WORKERS_DIR_NAME`` directory located beside the path returned by
    ``get_sock_file()``.

    Args:
        worker_name: Name of the worker whose config file should be read.

    Returns:
        The text after ``command=`` on the first line that starts with that
        prefix, stripped of surrounding whitespace. An empty string is
        returned when the file does not exist or has no such line.
    """
    from archivebox.workers.supervisord_util import WORKERS_DIR_NAME, get_sock_file

    prefix = "command="
    conf_path = get_sock_file().parent / WORKERS_DIR_NAME / f"{worker_name}.conf"

    if conf_path.exists():
        for conf_line in conf_path.read_text().splitlines():
            if conf_line.startswith(prefix):
                # Only the first matching line is used.
                return conf_line[len(prefix):].strip()

    return ""
|
||||
|
||||
|
||||
def _worker_command_matches_bind(command: str, host: str, port: str) -> bool:
    """Report whether a worker command line is bound to exactly ``host:port``.

    Two spellings are recognised:

    * a combined ``host:port`` address anywhere in the command, or
    * a ``--bind=host`` flag together with a ``--port=port`` flag.

    Matching is boundary-aware rather than a plain substring test, so a
    request for port ``80`` does not match a worker listening on ``8000``,
    and host ``27.0.0.1`` does not match ``127.0.0.1``.

    Args:
        command: The worker's command line (may be empty).
        host: Host/IP the caller wants to bind.
        port: Port the caller wants to bind, as a string.

    Returns:
        True if the command targets the same host and port, otherwise False.
        An empty command never matches.
    """
    # Local import keeps this helper self-contained; `re` is standard library.
    import re

    if not command:
        return False

    host_re = re.escape(host)
    port_re = re.escape(port)

    # The host must not be the tail of a longer hostname/IP, and the port must
    # not be the head of a longer number.
    address_pattern = rf"(?<![\w.\-]){host_re}:{port_re}(?!\d)"
    bind_flag_pattern = rf"--bind={host_re}(?![\w.\-])"
    port_flag_pattern = rf"--port={port_re}(?!\d)"

    if re.search(address_pattern, command):
        return True
    return bool(re.search(bind_flag_pattern, command) and re.search(port_flag_pattern, command))
|
||||
|
||||
|
||||
def stop_existing_server_workers(*, supervisor, stop_worker_fn, host: str, port: str, log=print) -> int:
    """Stop existing ArchiveBox web workers if they already own the requested bind.

    Checks ``worker_runserver`` and ``worker_daphne`` in that order. A worker
    is stopped only when supervisor reports it as ``RUNNING`` and its
    configured command matches ``host``/``port``.

    Args:
        supervisor: Supervisor handle exposing ``getProcessInfo(name)``; a
            falsy value means no worker is inspected.
        stop_worker_fn: Callable invoked as ``stop_worker_fn(supervisor, name)``.
        host: Host the new server wants to bind.
        port: Port the new server wants to bind.
        log: Callable used to emit the one-time takeover notice.

    Returns:
        The number of workers that were stopped.
    """
    taken_over = []

    for name in ("worker_runserver", "worker_daphne"):
        info = None
        try:
            if supervisor:
                info = supervisor.getProcessInfo(name)
        except Exception:
            # Best-effort lookup: a failed query is treated as "not running".
            info = None

        is_running = isinstance(info, dict) and info.get("statename") == "RUNNING"
        if is_running and _worker_command_matches_bind(_read_supervisor_worker_command(name), host, port):
            if not taken_over:
                # Announce the takeover once, before the first worker is stopped.
                log("[yellow][*] Taking over existing ArchiveBox web server on same port...[/yellow]")
            stop_worker_fn(supervisor, name)
            taken_over.append(name)

    return len(taken_over)
|
||||
|
||||
|
||||
@enforce_types
|
||||
def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
|
||||
reload: bool=False,
|
||||
init: bool=False,
|
||||
debug: bool=False,
|
||||
daemonize: bool=False,
|
||||
nothreading: bool=False) -> None:
|
||||
def server(
|
||||
runserver_args: Iterable[str] = (SERVER_CONFIG.BIND_ADDR,),
|
||||
reload: bool = False,
|
||||
init: bool = False,
|
||||
debug: bool = False,
|
||||
daemonize: bool = False,
|
||||
nothreading: bool = False,
|
||||
) -> None:
|
||||
"""Run the ArchiveBox HTTP server"""
|
||||
|
||||
runserver_args = list(runserver_args)
|
||||
|
||||
|
||||
if init:
|
||||
from archivebox.cli.archivebox_init import init as archivebox_init
|
||||
|
||||
archivebox_init(quick=True)
|
||||
print()
|
||||
|
||||
from archivebox.misc.checks import check_data_folder
|
||||
|
||||
check_data_folder()
|
||||
|
||||
from archivebox.config.common import SHELL_CONFIG
|
||||
@@ -73,22 +123,24 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
|
||||
SHELL_CONFIG.DEBUG = True
|
||||
|
||||
from django.contrib.auth.models import User
|
||||
|
||||
if not User.objects.filter(is_superuser=True).exclude(username='system').exists():
|
||||
|
||||
if not User.objects.filter(is_superuser=True).exclude(username="system").exists():
|
||||
print()
|
||||
print('[violet]Hint:[/violet] To create an [bold]admin username & password[/bold] for the [deep_sky_blue3][underline][link=http://{host}:{port}/admin]Admin UI[/link][/underline][/deep_sky_blue3], run:')
|
||||
print(' [green]archivebox manage createsuperuser[/green]')
|
||||
print(
|
||||
"[violet]Hint:[/violet] To create an [bold]admin username & password[/bold] for the [deep_sky_blue3][underline][link=http://{host}:{port}/admin]Admin UI[/link][/underline][/deep_sky_blue3], run:",
|
||||
)
|
||||
print(" [green]archivebox manage createsuperuser[/green]")
|
||||
print()
|
||||
|
||||
host = '127.0.0.1'
|
||||
port = '8000'
|
||||
|
||||
host = "127.0.0.1"
|
||||
port = "8000"
|
||||
|
||||
try:
|
||||
host_and_port = [arg for arg in runserver_args if arg.replace('.', '').replace(':', '').isdigit()][0]
|
||||
if ':' in host_and_port:
|
||||
host, port = host_and_port.split(':')
|
||||
host_and_port = [arg for arg in runserver_args if arg.replace(".", "").replace(":", "").isdigit()][0]
|
||||
if ":" in host_and_port:
|
||||
host, port = host_and_port.split(":")
|
||||
else:
|
||||
if '.' in host_and_port:
|
||||
if "." in host_and_port:
|
||||
host = host_and_port
|
||||
else:
|
||||
port = host_and_port
|
||||
@@ -104,66 +156,80 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
|
||||
)
|
||||
from archivebox.machine.models import Machine, Process
|
||||
|
||||
# Check if port is already in use
|
||||
if is_port_in_use(host, int(port)):
|
||||
print(f'[red][X] Error: Port {port} is already in use[/red]')
|
||||
print(f' Another process (possibly daphne or runserver) is already listening on {host}:{port}')
|
||||
print(' Stop the conflicting process or choose a different port')
|
||||
sys.exit(1)
|
||||
|
||||
machine = Machine.current()
|
||||
supervisor = get_existing_supervisord_process()
|
||||
stop_existing_background_runner(
|
||||
machine=machine,
|
||||
process_model=Process,
|
||||
supervisor=get_existing_supervisord_process(),
|
||||
supervisor=supervisor,
|
||||
stop_worker_fn=stop_worker,
|
||||
)
|
||||
if supervisor:
|
||||
stop_existing_server_workers(
|
||||
supervisor=supervisor,
|
||||
stop_worker_fn=stop_worker,
|
||||
host=host,
|
||||
port=port,
|
||||
)
|
||||
|
||||
# Check if port is already in use
|
||||
if is_port_in_use(host, int(port)):
|
||||
print(f"[red][X] Error: Port {port} is already in use[/red]")
|
||||
print(f" Another process (possibly daphne or runserver) is already listening on {host}:{port}")
|
||||
print(" Stop the conflicting process or choose a different port")
|
||||
sys.exit(1)
|
||||
|
||||
supervisor = get_existing_supervisord_process()
|
||||
if supervisor:
|
||||
server_worker_name = 'worker_runserver' if run_in_debug else 'worker_daphne'
|
||||
server_worker_name = "worker_runserver" if run_in_debug else "worker_daphne"
|
||||
server_proc = get_worker(supervisor, server_worker_name)
|
||||
server_state = server_proc.get('statename') if isinstance(server_proc, dict) else None
|
||||
if server_state == 'RUNNING':
|
||||
runner_proc = get_worker(supervisor, 'worker_runner')
|
||||
runner_watch_proc = get_worker(supervisor, 'worker_runner_watch')
|
||||
runner_state = runner_proc.get('statename') if isinstance(runner_proc, dict) else None
|
||||
runner_watch_state = runner_watch_proc.get('statename') if isinstance(runner_watch_proc, dict) else None
|
||||
print('[red][X] Error: ArchiveBox server is already running[/red]')
|
||||
print(f' [green]√[/green] Web server ({server_worker_name}) is RUNNING on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
|
||||
if runner_state == 'RUNNING':
|
||||
print(' [green]√[/green] Background runner (worker_runner) is RUNNING')
|
||||
if runner_watch_state == 'RUNNING':
|
||||
print(' [green]√[/green] Reload watcher (worker_runner_watch) is RUNNING')
|
||||
server_state = server_proc.get("statename") if isinstance(server_proc, dict) else None
|
||||
if server_state == "RUNNING":
|
||||
runner_proc = get_worker(supervisor, "worker_runner")
|
||||
runner_watch_proc = get_worker(supervisor, "worker_runner_watch")
|
||||
runner_state = runner_proc.get("statename") if isinstance(runner_proc, dict) else None
|
||||
runner_watch_state = runner_watch_proc.get("statename") if isinstance(runner_watch_proc, dict) else None
|
||||
print("[red][X] Error: ArchiveBox server is already running[/red]")
|
||||
print(
|
||||
f" [green]√[/green] Web server ({server_worker_name}) is RUNNING on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]",
|
||||
)
|
||||
if runner_state == "RUNNING":
|
||||
print(" [green]√[/green] Background runner (worker_runner) is RUNNING")
|
||||
if runner_watch_state == "RUNNING":
|
||||
print(" [green]√[/green] Reload watcher (worker_runner_watch) is RUNNING")
|
||||
print()
|
||||
print('[yellow]To stop the existing server, run:[/yellow]')
|
||||
print("[yellow]To stop the existing server, run:[/yellow]")
|
||||
print(' pkill -f "archivebox server"')
|
||||
print(' pkill -f supervisord')
|
||||
print(" pkill -f supervisord")
|
||||
sys.exit(1)
|
||||
|
||||
if run_in_debug:
|
||||
print('[green][+] Starting ArchiveBox webserver in DEBUG mode...[/green]')
|
||||
print("[green][+] Starting ArchiveBox webserver in DEBUG mode...[/green]")
|
||||
else:
|
||||
print('[green][+] Starting ArchiveBox webserver...[/green]')
|
||||
print(f' [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
|
||||
print(f' [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]')
|
||||
print(' > Writing ArchiveBox error log to ./logs/errors.log')
|
||||
print("[green][+] Starting ArchiveBox webserver...[/green]")
|
||||
print(
|
||||
f" [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]",
|
||||
)
|
||||
print(
|
||||
f" [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]",
|
||||
)
|
||||
print(" > Writing ArchiveBox error log to ./logs/errors.log")
|
||||
print()
|
||||
start_server_workers(host=host, port=port, daemonize=daemonize, debug=run_in_debug, reload=reload, nothreading=nothreading)
|
||||
print("\n[i][green][🟩] ArchiveBox server shut down gracefully.[/green][/i]")
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.argument('runserver_args', nargs=-1)
|
||||
@click.option('--reload', is_flag=True, help='Enable auto-reloading when code or templates change')
|
||||
@click.option('--debug', is_flag=True, help='Enable DEBUG=True mode with more verbose errors')
|
||||
@click.option('--nothreading', is_flag=True, help='Force runserver to run in single-threaded mode')
|
||||
@click.option('--init', is_flag=True, help='Run a full archivebox init/upgrade before starting the server')
|
||||
@click.option('--daemonize', is_flag=True, help='Run the server in the background as a daemon')
|
||||
@click.argument("runserver_args", nargs=-1)
|
||||
@click.option("--reload", is_flag=True, help="Enable auto-reloading when code or templates change")
|
||||
@click.option("--debug", is_flag=True, help="Enable DEBUG=True mode with more verbose errors")
|
||||
@click.option("--nothreading", is_flag=True, help="Force runserver to run in single-threaded mode")
|
||||
@click.option("--init", is_flag=True, help="Run a full archivebox init/upgrade before starting the server")
|
||||
@click.option("--daemonize", is_flag=True, help="Run the server in the background as a daemon")
|
||||
@docstring(server.__doc__)
|
||||
def main(**kwargs):
|
||||
server(**kwargs)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -1,27 +1,28 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__package__ = "archivebox.cli"
|
||||
|
||||
from typing import Iterable
|
||||
from collections.abc import Iterable
|
||||
|
||||
import rich_click as click
|
||||
|
||||
from archivebox.misc.util import docstring
|
||||
|
||||
|
||||
def shell(args: Iterable[str]=()) -> None:
|
||||
def shell(args: Iterable[str] = ()) -> None:
|
||||
"""Enter an interactive ArchiveBox Django shell"""
|
||||
|
||||
from django.core.management import call_command
|
||||
|
||||
call_command("shell_plus", *args)
|
||||
|
||||
|
||||
@click.command(add_help_option=False, context_settings=dict(ignore_unknown_options=True))
|
||||
@click.argument('args', nargs=-1)
|
||||
@click.argument("args", nargs=-1)
|
||||
@docstring(shell.__doc__)
|
||||
def main(args: Iterable[str]=()) -> None:
|
||||
def main(args: Iterable[str] = ()) -> None:
|
||||
shell(args=args)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -27,14 +27,16 @@ Examples:
|
||||
archivebox snapshot list --url__icontains=spam.com | archivebox snapshot delete --yes
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox snapshot'
|
||||
__package__ = "archivebox.cli"
|
||||
__command__ = "archivebox snapshot"
|
||||
|
||||
import sys
|
||||
from typing import Optional, Iterable
|
||||
from collections.abc import Iterable
|
||||
|
||||
import rich_click as click
|
||||
from rich import print as rprint
|
||||
from django.db.models import Q, Sum
|
||||
from django.db.models.functions import Coalesce
|
||||
|
||||
from archivebox.cli.cli_utils import apply_filters
|
||||
|
||||
@@ -43,12 +45,13 @@ from archivebox.cli.cli_utils import apply_filters
|
||||
# CREATE
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def create_snapshots(
|
||||
urls: Iterable[str],
|
||||
tag: str = '',
|
||||
status: str = 'queued',
|
||||
tag: str = "",
|
||||
status: str = "queued",
|
||||
depth: int = 0,
|
||||
created_by_id: Optional[int] = None,
|
||||
created_by_id: int | None = None,
|
||||
) -> int:
|
||||
"""
|
||||
Create Snapshots from URLs or stdin JSONL (Crawl or Snapshot records).
|
||||
@@ -59,8 +62,10 @@ def create_snapshots(
|
||||
1: Failure
|
||||
"""
|
||||
from archivebox.misc.jsonl import (
|
||||
read_args_or_stdin, write_record,
|
||||
TYPE_SNAPSHOT, TYPE_CRAWL
|
||||
read_args_or_stdin,
|
||||
write_record,
|
||||
TYPE_SNAPSHOT,
|
||||
TYPE_CRAWL,
|
||||
)
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
from archivebox.core.models import Snapshot
|
||||
@@ -73,7 +78,7 @@ def create_snapshots(
|
||||
records = list(read_args_or_stdin(urls))
|
||||
|
||||
if not records:
|
||||
rprint('[yellow]No URLs or Crawls provided. Pass URLs as arguments or via stdin.[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No URLs or Crawls provided. Pass URLs as arguments or via stdin.[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Process each record - handle Crawls and plain URLs/Snapshots
|
||||
@@ -81,7 +86,7 @@ def create_snapshots(
|
||||
pass_through_count = 0
|
||||
|
||||
for record in records:
|
||||
record_type = record.get('type', '')
|
||||
record_type = record.get("type", "")
|
||||
|
||||
try:
|
||||
if record_type == TYPE_CRAWL:
|
||||
@@ -91,14 +96,14 @@ def create_snapshots(
|
||||
|
||||
# Input is a Crawl - get or create it, then create Snapshots for its URLs
|
||||
crawl = None
|
||||
crawl_id = record.get('id')
|
||||
crawl_id = record.get("id")
|
||||
if crawl_id:
|
||||
try:
|
||||
crawl = Crawl.objects.get(id=crawl_id)
|
||||
except Crawl.DoesNotExist:
|
||||
crawl = Crawl.from_json(record, overrides={'created_by_id': created_by_id})
|
||||
crawl = Crawl.from_json(record, overrides={"created_by_id": created_by_id})
|
||||
else:
|
||||
crawl = Crawl.from_json(record, overrides={'created_by_id': created_by_id})
|
||||
crawl = Crawl.from_json(record, overrides={"created_by_id": created_by_id})
|
||||
|
||||
if not crawl:
|
||||
continue
|
||||
@@ -109,27 +114,27 @@ def create_snapshots(
|
||||
if tag:
|
||||
merged_tags = f"{merged_tags},{tag}" if merged_tags else tag
|
||||
snapshot_record = {
|
||||
'url': url,
|
||||
'tags': merged_tags,
|
||||
'crawl_id': str(crawl.id),
|
||||
'depth': depth,
|
||||
'status': status,
|
||||
"url": url,
|
||||
"tags": merged_tags,
|
||||
"crawl_id": str(crawl.id),
|
||||
"depth": depth,
|
||||
"status": status,
|
||||
}
|
||||
snapshot = Snapshot.from_json(snapshot_record, overrides={'created_by_id': created_by_id})
|
||||
snapshot = Snapshot.from_json(snapshot_record, overrides={"created_by_id": created_by_id})
|
||||
if snapshot:
|
||||
created_snapshots.append(snapshot)
|
||||
if not is_tty:
|
||||
write_record(snapshot.to_json())
|
||||
|
||||
elif record_type == TYPE_SNAPSHOT or record.get('url'):
|
||||
elif record_type == TYPE_SNAPSHOT or record.get("url"):
|
||||
# Input is a Snapshot or plain URL
|
||||
if tag and not record.get('tags'):
|
||||
record['tags'] = tag
|
||||
if tag and not record.get("tags"):
|
||||
record["tags"] = tag
|
||||
if status:
|
||||
record['status'] = status
|
||||
record['depth'] = record.get('depth', depth)
|
||||
record["status"] = status
|
||||
record["depth"] = record.get("depth", depth)
|
||||
|
||||
snapshot = Snapshot.from_json(record, overrides={'created_by_id': created_by_id})
|
||||
snapshot = Snapshot.from_json(record, overrides={"created_by_id": created_by_id})
|
||||
if snapshot:
|
||||
created_snapshots.append(snapshot)
|
||||
if not is_tty:
|
||||
@@ -142,21 +147,21 @@ def create_snapshots(
|
||||
pass_through_count += 1
|
||||
|
||||
except Exception as e:
|
||||
rprint(f'[red]Error creating snapshot: {e}[/red]', file=sys.stderr)
|
||||
rprint(f"[red]Error creating snapshot: {e}[/red]", file=sys.stderr)
|
||||
continue
|
||||
|
||||
if not created_snapshots:
|
||||
if pass_through_count > 0:
|
||||
rprint(f'[dim]Passed through {pass_through_count} records, no new snapshots[/dim]', file=sys.stderr)
|
||||
rprint(f"[dim]Passed through {pass_through_count} records, no new snapshots[/dim]", file=sys.stderr)
|
||||
return 0
|
||||
rprint('[red]No snapshots created[/red]', file=sys.stderr)
|
||||
rprint("[red]No snapshots created[/red]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
rprint(f'[green]Created {len(created_snapshots)} snapshots[/green]', file=sys.stderr)
|
||||
rprint(f"[green]Created {len(created_snapshots)} snapshots[/green]", file=sys.stderr)
|
||||
|
||||
if is_tty:
|
||||
for snapshot in created_snapshots:
|
||||
rprint(f' [dim]{snapshot.id}[/dim] {snapshot.url[:60]}', file=sys.stderr)
|
||||
rprint(f" [dim]{snapshot.id}[/dim] {snapshot.url[:60]}", file=sys.stderr)
|
||||
|
||||
return 0
|
||||
|
||||
@@ -165,16 +170,19 @@ def create_snapshots(
|
||||
# LIST
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def list_snapshots(
|
||||
status: Optional[str] = None,
|
||||
url__icontains: Optional[str] = None,
|
||||
url__istartswith: Optional[str] = None,
|
||||
tag: Optional[str] = None,
|
||||
crawl_id: Optional[str] = None,
|
||||
limit: Optional[int] = None,
|
||||
sort: Optional[str] = None,
|
||||
csv: Optional[str] = None,
|
||||
status: str | None = None,
|
||||
url__icontains: str | None = None,
|
||||
url__istartswith: str | None = None,
|
||||
tag: str | None = None,
|
||||
crawl_id: str | None = None,
|
||||
limit: int | None = None,
|
||||
sort: str | None = None,
|
||||
csv: str | None = None,
|
||||
with_headers: bool = False,
|
||||
search: str | None = None,
|
||||
query: str | None = None,
|
||||
) -> int:
|
||||
"""
|
||||
List Snapshots as JSONL with optional filters.
|
||||
@@ -184,64 +192,106 @@ def list_snapshots(
|
||||
"""
|
||||
from archivebox.misc.jsonl import write_record
|
||||
from archivebox.core.models import Snapshot
|
||||
from archivebox.search import (
|
||||
get_default_search_mode,
|
||||
get_search_mode,
|
||||
prioritize_metadata_matches,
|
||||
query_search_index,
|
||||
)
|
||||
|
||||
if with_headers and not csv:
|
||||
rprint('[red]--with-headers requires --csv[/red]', file=sys.stderr)
|
||||
rprint("[red]--with-headers requires --csv[/red]", file=sys.stderr)
|
||||
return 2
|
||||
|
||||
is_tty = sys.stdout.isatty() and not csv
|
||||
|
||||
queryset = Snapshot.objects.all().order_by('-created_at')
|
||||
queryset = Snapshot.objects.annotate(output_size_sum=Coalesce(Sum("archiveresult__output_size"), 0)).order_by("-created_at")
|
||||
|
||||
# Apply filters
|
||||
filter_kwargs = {
|
||||
'status': status,
|
||||
'url__icontains': url__icontains,
|
||||
'url__istartswith': url__istartswith,
|
||||
'crawl_id': crawl_id,
|
||||
"status": status,
|
||||
"url__icontains": url__icontains,
|
||||
"url__istartswith": url__istartswith,
|
||||
"crawl_id": crawl_id,
|
||||
}
|
||||
queryset = apply_filters(queryset, filter_kwargs, limit=limit)
|
||||
queryset = apply_filters(queryset, filter_kwargs)
|
||||
|
||||
# Tag filter requires special handling (M2M)
|
||||
if tag:
|
||||
queryset = queryset.filter(tags__name__iexact=tag)
|
||||
|
||||
query = (query or "").strip()
|
||||
if query:
|
||||
metadata_qs = queryset.filter(
|
||||
Q(title__icontains=query) | Q(url__icontains=query) | Q(timestamp__icontains=query) | Q(tags__name__icontains=query),
|
||||
)
|
||||
requested_search_mode = (search or "").strip().lower()
|
||||
if requested_search_mode == "content":
|
||||
requested_search_mode = "contents"
|
||||
search_mode = get_default_search_mode() if not requested_search_mode else get_search_mode(requested_search_mode)
|
||||
|
||||
if search_mode == "meta":
|
||||
queryset = metadata_qs
|
||||
else:
|
||||
try:
|
||||
deep_qsearch = None
|
||||
if search_mode == "deep":
|
||||
qsearch = query_search_index(query, search_mode="contents")
|
||||
deep_qsearch = query_search_index(query, search_mode="deep")
|
||||
else:
|
||||
qsearch = query_search_index(query, search_mode=search_mode)
|
||||
queryset = prioritize_metadata_matches(
|
||||
queryset,
|
||||
metadata_qs,
|
||||
qsearch,
|
||||
deep_queryset=deep_qsearch,
|
||||
ordering=("-created_at",) if not sort else None,
|
||||
)
|
||||
except Exception as err:
|
||||
rprint(
|
||||
f"[yellow]Search backend error, falling back to metadata search: {err}[/yellow]",
|
||||
file=sys.stderr,
|
||||
)
|
||||
queryset = metadata_qs
|
||||
|
||||
if sort:
|
||||
queryset = queryset.order_by(sort)
|
||||
if limit:
|
||||
queryset = queryset[:limit]
|
||||
|
||||
count = 0
|
||||
if csv:
|
||||
cols = [col.strip() for col in csv.split(',') if col.strip()]
|
||||
cols = [col.strip() for col in csv.split(",") if col.strip()]
|
||||
if not cols:
|
||||
rprint('[red]No CSV columns provided[/red]', file=sys.stderr)
|
||||
rprint("[red]No CSV columns provided[/red]", file=sys.stderr)
|
||||
return 2
|
||||
rows: list[str] = []
|
||||
if with_headers:
|
||||
rows.append(','.join(cols))
|
||||
rows.append(",".join(cols))
|
||||
for snapshot in queryset.iterator(chunk_size=500):
|
||||
rows.append(snapshot.to_csv(cols=cols, separator=','))
|
||||
rows.append(snapshot.to_csv(cols=cols, separator=","))
|
||||
count += 1
|
||||
output = '\n'.join(rows)
|
||||
output = "\n".join(rows)
|
||||
if output:
|
||||
sys.stdout.write(output)
|
||||
if not output.endswith('\n'):
|
||||
sys.stdout.write('\n')
|
||||
rprint(f'[dim]Listed {count} snapshots[/dim]', file=sys.stderr)
|
||||
if not output.endswith("\n"):
|
||||
sys.stdout.write("\n")
|
||||
rprint(f"[dim]Listed {count} snapshots[/dim]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
for snapshot in queryset:
|
||||
if is_tty:
|
||||
status_color = {
|
||||
'queued': 'yellow',
|
||||
'started': 'blue',
|
||||
'sealed': 'green',
|
||||
}.get(snapshot.status, 'dim')
|
||||
rprint(f'[{status_color}]{snapshot.status:8}[/{status_color}] [dim]{snapshot.id}[/dim] {snapshot.url[:60]}')
|
||||
"queued": "yellow",
|
||||
"started": "blue",
|
||||
"sealed": "green",
|
||||
}.get(snapshot.status, "dim")
|
||||
rprint(f"[{status_color}]{snapshot.status:8}[/{status_color}] [dim]{snapshot.id}[/dim] {snapshot.url[:60]}")
|
||||
else:
|
||||
write_record(snapshot.to_json())
|
||||
count += 1
|
||||
|
||||
rprint(f'[dim]Listed {count} snapshots[/dim]', file=sys.stderr)
|
||||
rprint(f"[dim]Listed {count} snapshots[/dim]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -249,9 +299,10 @@ def list_snapshots(
|
||||
# UPDATE
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def update_snapshots(
|
||||
status: Optional[str] = None,
|
||||
tag: Optional[str] = None,
|
||||
status: str | None = None,
|
||||
tag: str | None = None,
|
||||
) -> int:
|
||||
"""
|
||||
Update Snapshots from stdin JSONL.
|
||||
@@ -272,12 +323,12 @@ def update_snapshots(
|
||||
|
||||
records = list(read_stdin())
|
||||
if not records:
|
||||
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
updated_count = 0
|
||||
for record in records:
|
||||
snapshot_id = record.get('id')
|
||||
snapshot_id = record.get("id")
|
||||
if not snapshot_id:
|
||||
continue
|
||||
|
||||
@@ -292,6 +343,7 @@ def update_snapshots(
|
||||
# Add tag to existing tags
|
||||
snapshot.save() # Ensure saved before M2M
|
||||
from archivebox.core.models import Tag
|
||||
|
||||
tag_obj, _ = Tag.objects.get_or_create(name=tag)
|
||||
snapshot.tags.add(tag_obj)
|
||||
|
||||
@@ -302,10 +354,10 @@ def update_snapshots(
|
||||
write_record(snapshot.to_json())
|
||||
|
||||
except Snapshot.DoesNotExist:
|
||||
rprint(f'[yellow]Snapshot not found: {snapshot_id}[/yellow]', file=sys.stderr)
|
||||
rprint(f"[yellow]Snapshot not found: {snapshot_id}[/yellow]", file=sys.stderr)
|
||||
continue
|
||||
|
||||
rprint(f'[green]Updated {updated_count} snapshots[/green]', file=sys.stderr)
|
||||
rprint(f"[green]Updated {updated_count} snapshots[/green]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -313,6 +365,7 @@ def update_snapshots(
|
||||
# DELETE
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def delete_snapshots(yes: bool = False, dry_run: bool = False) -> int:
|
||||
"""
|
||||
Delete Snapshots from stdin JSONL.
|
||||
@@ -328,35 +381,35 @@ def delete_snapshots(yes: bool = False, dry_run: bool = False) -> int:
|
||||
|
||||
records = list(read_stdin())
|
||||
if not records:
|
||||
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
snapshot_ids = [r.get('id') for r in records if r.get('id')]
|
||||
snapshot_ids = [r.get("id") for r in records if r.get("id")]
|
||||
|
||||
if not snapshot_ids:
|
||||
rprint('[yellow]No valid snapshot IDs in input[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No valid snapshot IDs in input[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
snapshots = Snapshot.objects.filter(id__in=snapshot_ids)
|
||||
count = snapshots.count()
|
||||
|
||||
if count == 0:
|
||||
rprint('[yellow]No matching snapshots found[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No matching snapshots found[/yellow]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
if dry_run:
|
||||
rprint(f'[yellow]Would delete {count} snapshots (dry run)[/yellow]', file=sys.stderr)
|
||||
rprint(f"[yellow]Would delete {count} snapshots (dry run)[/yellow]", file=sys.stderr)
|
||||
for snapshot in snapshots:
|
||||
rprint(f' [dim]{snapshot.id}[/dim] {snapshot.url[:60]}', file=sys.stderr)
|
||||
rprint(f" [dim]{snapshot.id}[/dim] {snapshot.url[:60]}", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
if not yes:
|
||||
rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr)
|
||||
rprint("[red]Use --yes to confirm deletion[/red]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Perform deletion
|
||||
deleted_count, _ = snapshots.delete()
|
||||
rprint(f'[green]Deleted {deleted_count} snapshots[/green]', file=sys.stderr)
|
||||
rprint(f"[green]Deleted {deleted_count} snapshots[/green]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -364,57 +417,81 @@ def delete_snapshots(yes: bool = False, dry_run: bool = False) -> int:
|
||||
# CLI Commands
|
||||
# =============================================================================
|
||||
|
||||
|
||||
@click.group()
|
||||
def main():
|
||||
"""Manage Snapshot records."""
|
||||
pass
|
||||
|
||||
|
||||
@main.command('create')
|
||||
@click.argument('urls', nargs=-1)
|
||||
@click.option('--tag', '-t', default='', help='Comma-separated tags to add')
|
||||
@click.option('--status', '-s', default='queued', help='Initial status (default: queued)')
|
||||
@click.option('--depth', '-d', type=int, default=0, help='Crawl depth (default: 0)')
|
||||
@main.command("create")
|
||||
@click.argument("urls", nargs=-1)
|
||||
@click.option("--tag", "-t", default="", help="Comma-separated tags to add")
|
||||
@click.option("--status", "-s", default="queued", help="Initial status (default: queued)")
|
||||
@click.option("--depth", "-d", type=int, default=0, help="Crawl depth (default: 0)")
|
||||
def create_cmd(urls: tuple, tag: str, status: str, depth: int):
|
||||
"""Create Snapshots from URLs or stdin JSONL."""
|
||||
sys.exit(create_snapshots(urls, tag=tag, status=status, depth=depth))
|
||||
|
||||
|
||||
@main.command('list')
|
||||
@click.option('--status', '-s', help='Filter by status (queued, started, sealed)')
|
||||
@click.option('--url__icontains', help='Filter by URL contains')
|
||||
@click.option('--url__istartswith', help='Filter by URL starts with')
|
||||
@click.option('--tag', '-t', help='Filter by tag name')
|
||||
@click.option('--crawl-id', help='Filter by crawl ID')
|
||||
@click.option('--limit', '-n', type=int, help='Limit number of results')
|
||||
def list_cmd(status: Optional[str], url__icontains: Optional[str], url__istartswith: Optional[str],
|
||||
tag: Optional[str], crawl_id: Optional[str], limit: Optional[int]):
|
||||
@main.command("list")
|
||||
@click.option("--status", "-s", help="Filter by status (queued, started, sealed)")
|
||||
@click.option("--url__icontains", help="Filter by URL contains")
|
||||
@click.option("--url__istartswith", help="Filter by URL starts with")
|
||||
@click.option("--tag", "-t", help="Filter by tag name")
|
||||
@click.option("--crawl-id", help="Filter by crawl ID")
|
||||
@click.option("--limit", "-n", type=int, help="Limit number of results")
|
||||
@click.option("--sort", "-o", type=str, help="Field to sort by, e.g. url, created_at, bookmarked_at, downloaded_at")
|
||||
@click.option("--csv", "-C", type=str, help="Print output as CSV with the provided fields, e.g.: timestamp,url,title")
|
||||
@click.option("--with-headers", is_flag=True, help="Include column headers in structured output")
|
||||
@click.option("--search", type=click.Choice(["meta", "content", "contents", "deep"]), help="Search mode to use for the query")
|
||||
@click.argument("query", nargs=-1)
|
||||
def list_cmd(
|
||||
status: str | None,
|
||||
url__icontains: str | None,
|
||||
url__istartswith: str | None,
|
||||
tag: str | None,
|
||||
crawl_id: str | None,
|
||||
limit: int | None,
|
||||
sort: str | None,
|
||||
csv: str | None,
|
||||
with_headers: bool,
|
||||
search: str | None,
|
||||
query: tuple[str, ...],
|
||||
):
|
||||
"""List Snapshots as JSONL."""
|
||||
sys.exit(list_snapshots(
|
||||
status=status,
|
||||
url__icontains=url__icontains,
|
||||
url__istartswith=url__istartswith,
|
||||
tag=tag,
|
||||
crawl_id=crawl_id,
|
||||
limit=limit,
|
||||
))
|
||||
sys.exit(
|
||||
list_snapshots(
|
||||
status=status,
|
||||
url__icontains=url__icontains,
|
||||
url__istartswith=url__istartswith,
|
||||
tag=tag,
|
||||
crawl_id=crawl_id,
|
||||
limit=limit,
|
||||
sort=sort,
|
||||
csv=csv,
|
||||
with_headers=with_headers,
|
||||
search=search,
|
||||
query=" ".join(query),
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@main.command('update')
|
||||
@click.option('--status', '-s', help='Set status')
|
||||
@click.option('--tag', '-t', help='Add tag')
|
||||
def update_cmd(status: Optional[str], tag: Optional[str]):
|
||||
@main.command("update")
|
||||
@click.option("--status", "-s", help="Set status")
|
||||
@click.option("--tag", "-t", help="Add tag")
|
||||
def update_cmd(status: str | None, tag: str | None):
|
||||
"""Update Snapshots from stdin JSONL."""
|
||||
sys.exit(update_snapshots(status=status, tag=tag))
|
||||
|
||||
|
||||
@main.command('delete')
|
||||
@click.option('--yes', '-y', is_flag=True, help='Confirm deletion')
|
||||
@click.option('--dry-run', is_flag=True, help='Show what would be deleted')
|
||||
@main.command("delete")
|
||||
@click.option("--yes", "-y", is_flag=True, help="Confirm deletion")
|
||||
@click.option("--dry-run", is_flag=True, help="Show what would be deleted")
|
||||
def delete_cmd(yes: bool, dry_run: bool):
|
||||
"""Delete Snapshots from stdin JSONL."""
|
||||
sys.exit(delete_snapshots(yes=yes, dry_run=dry_run))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox snapshot'
|
||||
__package__ = "archivebox.cli"
|
||||
__command__ = "archivebox snapshot"
|
||||
|
||||
import sys
|
||||
|
||||
@@ -10,15 +10,15 @@ import rich_click as click
|
||||
from archivebox.cli.archivebox_snapshot import create_snapshots
|
||||
|
||||
|
||||
@click.command(context_settings={'ignore_unknown_options': True})
|
||||
@click.option('--tag', '-t', default='', help='Comma-separated tags to add')
|
||||
@click.option('--status', '-s', default='queued', help='Initial status (default: queued)')
|
||||
@click.option('--depth', '-d', type=int, default=0, help='Crawl depth (default: 0)')
|
||||
@click.argument('urls', nargs=-1)
|
||||
@click.command(context_settings={"ignore_unknown_options": True})
|
||||
@click.option("--tag", "-t", default="", help="Comma-separated tags to add")
|
||||
@click.option("--status", "-s", default="queued", help="Initial status (default: queued)")
|
||||
@click.option("--depth", "-d", type=int, default=0, help="Crawl depth (default: 0)")
|
||||
@click.argument("urls", nargs=-1)
|
||||
def main(tag: str, status: str, depth: int, urls: tuple[str, ...]):
|
||||
"""Backwards-compatible `archivebox snapshot URL...` entrypoint."""
|
||||
sys.exit(create_snapshots(urls, tag=tag, status=status, depth=depth))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__package__ = "archivebox.cli"
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
@@ -16,31 +16,34 @@ from archivebox.misc.logging_util import printable_filesize
|
||||
|
||||
|
||||
@enforce_types
|
||||
def status(out_dir: Path=DATA_DIR) -> None:
|
||||
def status(out_dir: Path = DATA_DIR) -> None:
|
||||
"""Print out some info and statistics about the archive collection"""
|
||||
|
||||
from django.contrib.auth import get_user_model
|
||||
from django.db.models import Sum
|
||||
from django.db.models.functions import Coalesce
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
User = get_user_model()
|
||||
|
||||
print('[green]\\[*] Scanning archive main index...[/green]')
|
||||
print(f'[yellow] {out_dir}/*[/yellow]')
|
||||
num_bytes, num_dirs, num_files = get_dir_size(out_dir, recursive=False, pattern='index.')
|
||||
print("[green]\\[*] Scanning archive main index...[/green]")
|
||||
print(f"[yellow] {out_dir}/*[/yellow]")
|
||||
num_bytes, num_dirs, num_files = get_dir_size(out_dir, recursive=False, pattern="index.")
|
||||
size = printable_filesize(num_bytes)
|
||||
print(f' Index size: {size} across {num_files} files')
|
||||
print(f" Index size: {size} across {num_files} files")
|
||||
print()
|
||||
|
||||
links = list(Snapshot.objects.all())
|
||||
links = list(Snapshot.objects.annotate(output_size_sum=Coalesce(Sum("archiveresult__output_size"), 0)))
|
||||
num_sql_links = len(links)
|
||||
num_link_details = sum(1 for link in parse_json_links_details(out_dir=out_dir))
|
||||
print(f' > SQL Main Index: {num_sql_links} links'.ljust(36), f'(found in {CONSTANTS.SQL_INDEX_FILENAME})')
|
||||
print(f' > JSON Link Details: {num_link_details} links'.ljust(36), f'(found in {ARCHIVE_DIR.name}/*/index.json)')
|
||||
print(f" > SQL Main Index: {num_sql_links} links".ljust(36), f"(found in {CONSTANTS.SQL_INDEX_FILENAME})")
|
||||
print(f" > JSON Link Details: {num_link_details} links".ljust(36), f"(found in {ARCHIVE_DIR.name}/*/index.json)")
|
||||
print()
|
||||
print('[green]\\[*] Scanning archive data directories...[/green]')
|
||||
users_dir = out_dir / 'users'
|
||||
print("[green]\\[*] Scanning archive data directories...[/green]")
|
||||
users_dir = out_dir / "users"
|
||||
scan_roots = [root for root in (ARCHIVE_DIR, users_dir) if root.exists()]
|
||||
scan_roots_display = ', '.join(str(root) for root in scan_roots) if scan_roots else str(ARCHIVE_DIR)
|
||||
print(f'[yellow] {scan_roots_display}[/yellow]')
|
||||
scan_roots_display = ", ".join(str(root) for root in scan_roots) if scan_roots else str(ARCHIVE_DIR)
|
||||
print(f"[yellow] {scan_roots_display}[/yellow]")
|
||||
num_bytes = num_dirs = num_files = 0
|
||||
for root in scan_roots:
|
||||
root_bytes, root_dirs, root_files = get_dir_size(root)
|
||||
@@ -48,80 +51,66 @@ def status(out_dir: Path=DATA_DIR) -> None:
|
||||
num_dirs += root_dirs
|
||||
num_files += root_files
|
||||
size = printable_filesize(num_bytes)
|
||||
print(f' Size: {size} across {num_files} files in {num_dirs} directories')
|
||||
print(f" Size: {size} across {num_files} files in {num_dirs} directories")
|
||||
|
||||
# Use DB as source of truth for snapshot status
|
||||
num_indexed = len(links)
|
||||
num_archived = sum(1 for snapshot in links if snapshot.is_archived)
|
||||
num_unarchived = max(num_indexed - num_archived, 0)
|
||||
print(f' > indexed: {num_indexed}'.ljust(36), '(total snapshots in DB)')
|
||||
print(f' > archived: {num_archived}'.ljust(36), '(snapshots with archived content)')
|
||||
print(f' > unarchived: {num_unarchived}'.ljust(36), '(snapshots pending archiving)')
|
||||
print(f" > indexed: {num_indexed}".ljust(36), "(total snapshots in DB)")
|
||||
print(f" > archived: {num_archived}".ljust(36), "(snapshots with archived content)")
|
||||
print(f" > unarchived: {num_unarchived}".ljust(36), "(snapshots pending archiving)")
|
||||
|
||||
# Count snapshot directories on filesystem across both legacy and current layouts.
|
||||
expected_snapshot_dirs = {
|
||||
str(Path(snapshot.output_dir).resolve())
|
||||
for snapshot in links
|
||||
if Path(snapshot.output_dir).exists()
|
||||
}
|
||||
expected_snapshot_dirs = {str(Path(snapshot.output_dir).resolve()) for snapshot in links if Path(snapshot.output_dir).exists()}
|
||||
discovered_snapshot_dirs = set()
|
||||
|
||||
if ARCHIVE_DIR.exists():
|
||||
discovered_snapshot_dirs.update(
|
||||
str(entry.resolve())
|
||||
for entry in ARCHIVE_DIR.iterdir()
|
||||
if entry.is_dir()
|
||||
)
|
||||
discovered_snapshot_dirs.update(str(entry.resolve()) for entry in ARCHIVE_DIR.iterdir() if entry.is_dir())
|
||||
|
||||
if users_dir.exists():
|
||||
discovered_snapshot_dirs.update(
|
||||
str(entry.resolve())
|
||||
for entry in users_dir.glob('*/snapshots/*/*/*')
|
||||
if entry.is_dir()
|
||||
)
|
||||
discovered_snapshot_dirs.update(str(entry.resolve()) for entry in users_dir.glob("*/snapshots/*/*/*") if entry.is_dir())
|
||||
|
||||
orphaned_dirs = sorted(discovered_snapshot_dirs - expected_snapshot_dirs)
|
||||
num_present = len(discovered_snapshot_dirs)
|
||||
num_valid = len(discovered_snapshot_dirs & expected_snapshot_dirs)
|
||||
print()
|
||||
print(f' > present: {num_present}'.ljust(36), '(snapshot directories on disk)')
|
||||
print(f' > [green]valid:[/green] {num_valid}'.ljust(36), ' (directories with matching DB entry)')
|
||||
print(f" > present: {num_present}".ljust(36), "(snapshot directories on disk)")
|
||||
print(f" > [green]valid:[/green] {num_valid}".ljust(36), " (directories with matching DB entry)")
|
||||
|
||||
num_orphaned = len(orphaned_dirs)
|
||||
print(f' > [red]orphaned:[/red] {num_orphaned}'.ljust(36), ' (directories without matching DB entry)')
|
||||
print(f" > [red]orphaned:[/red] {num_orphaned}".ljust(36), " (directories without matching DB entry)")
|
||||
|
||||
if num_indexed:
|
||||
print(' [violet]Hint:[/violet] You can list snapshots by status like so:')
|
||||
print(' [green]archivebox list --status=<status> (e.g. archived, queued, etc.)[/green]')
|
||||
print(" [violet]Hint:[/violet] You can list snapshots by status like so:")
|
||||
print(" [green]archivebox list --status=<status> (e.g. archived, queued, etc.)[/green]")
|
||||
|
||||
if orphaned_dirs:
|
||||
print(' [violet]Hint:[/violet] To automatically import orphaned data directories into the main index, run:')
|
||||
print(' [green]archivebox init[/green]')
|
||||
print(" [violet]Hint:[/violet] To automatically import orphaned data directories into the main index, run:")
|
||||
print(" [green]archivebox init[/green]")
|
||||
|
||||
print()
|
||||
print('[green]\\[*] Scanning recent archive changes and user logins:[/green]')
|
||||
print(f'[yellow] {CONSTANTS.LOGS_DIR}/*[/yellow]')
|
||||
admin_users = User.objects.filter(is_superuser=True).exclude(username='system')
|
||||
print("[green]\\[*] Scanning recent archive changes and user logins:[/green]")
|
||||
print(f"[yellow] {CONSTANTS.LOGS_DIR}/*[/yellow]")
|
||||
admin_users = User.objects.filter(is_superuser=True).exclude(username="system")
|
||||
users = [user.get_username() for user in admin_users]
|
||||
print(f' UI users {len(users)}: {", ".join(users)}')
|
||||
last_login = admin_users.order_by('last_login').last()
|
||||
print(f" UI users {len(users)}: {', '.join(users)}")
|
||||
last_login = admin_users.order_by("last_login").last()
|
||||
if last_login:
|
||||
print(f' Last UI login: {last_login.get_username()} @ {str(last_login.last_login)[:16]}')
|
||||
last_downloaded = Snapshot.objects.order_by('downloaded_at').last()
|
||||
print(f" Last UI login: {last_login.get_username()} @ {str(last_login.last_login)[:16]}")
|
||||
last_downloaded = Snapshot.objects.order_by("downloaded_at").last()
|
||||
if last_downloaded:
|
||||
print(f' Last changes: {str(last_downloaded.downloaded_at)[:16]}')
|
||||
print(f" Last changes: {str(last_downloaded.downloaded_at)[:16]}")
|
||||
|
||||
if not users:
|
||||
print()
|
||||
print(' [violet]Hint:[/violet] You can create an admin user by running:')
|
||||
print(' [green]archivebox manage createsuperuser[/green]')
|
||||
print(" [violet]Hint:[/violet] You can create an admin user by running:")
|
||||
print(" [green]archivebox manage createsuperuser[/green]")
|
||||
|
||||
print()
|
||||
recent_snapshots = sorted(
|
||||
links,
|
||||
key=lambda snapshot: (
|
||||
snapshot.downloaded_at or snapshot.modified_at or snapshot.created_at
|
||||
),
|
||||
key=lambda snapshot: snapshot.downloaded_at or snapshot.modified_at or snapshot.created_at,
|
||||
reverse=True,
|
||||
)[:10]
|
||||
for snapshot in recent_snapshots:
|
||||
@@ -129,14 +118,14 @@ def status(out_dir: Path=DATA_DIR) -> None:
|
||||
continue
|
||||
print(
|
||||
(
|
||||
'[grey53] '
|
||||
f' > {str(snapshot.downloaded_at)[:16]} '
|
||||
f'[{snapshot.num_outputs} {("X", "√")[snapshot.is_archived]} {printable_filesize(snapshot.archive_size)}] '
|
||||
"[grey53] "
|
||||
f" > {str(snapshot.downloaded_at)[:16]} "
|
||||
f"[{snapshot.num_outputs} {('X', '√')[snapshot.is_archived]} {printable_filesize(snapshot.archive_size)}] "
|
||||
f'"{snapshot.title}": {snapshot.url}'
|
||||
'[/grey53]'
|
||||
)[:SHELL_CONFIG.TERM_WIDTH],
|
||||
"[/grey53]"
|
||||
)[: SHELL_CONFIG.TERM_WIDTH],
|
||||
)
|
||||
print('[grey53] ...')
|
||||
print("[grey53] ...")
|
||||
|
||||
|
||||
@click.command()
|
||||
@@ -146,5 +135,5 @@ def main(**kwargs):
|
||||
status(**kwargs)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -27,11 +27,11 @@ Examples:
|
||||
archivebox tag list --name=unused | archivebox tag delete --yes
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox tag'
|
||||
__package__ = "archivebox.cli"
|
||||
__command__ = "archivebox tag"
|
||||
|
||||
import sys
|
||||
from typing import Optional, Iterable
|
||||
from collections.abc import Iterable
|
||||
|
||||
import rich_click as click
|
||||
from rich import print as rprint
|
||||
@@ -43,6 +43,7 @@ from archivebox.cli.cli_utils import apply_filters
|
||||
# CREATE
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def create_tags(names: Iterable[str]) -> int:
|
||||
"""
|
||||
Create Tags from names.
|
||||
@@ -60,7 +61,7 @@ def create_tags(names: Iterable[str]) -> int:
|
||||
name_list = list(names) if names else []
|
||||
|
||||
if not name_list:
|
||||
rprint('[yellow]No tag names provided. Pass names as arguments.[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No tag names provided. Pass names as arguments.[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
created_count = 0
|
||||
@@ -76,11 +77,11 @@ def create_tags(names: Iterable[str]) -> int:
|
||||
|
||||
if created:
|
||||
created_count += 1
|
||||
rprint(f'[green]Created tag: {name}[/green]', file=sys.stderr)
|
||||
rprint(f"[green]Created tag: {name}[/green]", file=sys.stderr)
|
||||
else:
|
||||
rprint(f'[dim]Tag already exists: {name}[/dim]', file=sys.stderr)
|
||||
rprint(f"[dim]Tag already exists: {name}[/dim]", file=sys.stderr)
|
||||
|
||||
rprint(f'[green]Created {created_count} new tags[/green]', file=sys.stderr)
|
||||
rprint(f"[green]Created {created_count} new tags[/green]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -88,10 +89,11 @@ def create_tags(names: Iterable[str]) -> int:
|
||||
# LIST
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def list_tags(
|
||||
name: Optional[str] = None,
|
||||
name__icontains: Optional[str] = None,
|
||||
limit: Optional[int] = None,
|
||||
name: str | None = None,
|
||||
name__icontains: str | None = None,
|
||||
limit: int | None = None,
|
||||
) -> int:
|
||||
"""
|
||||
List Tags as JSONL with optional filters.
|
||||
@@ -104,12 +106,12 @@ def list_tags(
|
||||
|
||||
is_tty = sys.stdout.isatty()
|
||||
|
||||
queryset = Tag.objects.all().order_by('name')
|
||||
queryset = Tag.objects.all().order_by("name")
|
||||
|
||||
# Apply filters
|
||||
filter_kwargs = {
|
||||
'name': name,
|
||||
'name__icontains': name__icontains,
|
||||
"name": name,
|
||||
"name__icontains": name__icontains,
|
||||
}
|
||||
queryset = apply_filters(queryset, filter_kwargs, limit=limit)
|
||||
|
||||
@@ -117,12 +119,12 @@ def list_tags(
|
||||
for tag in queryset:
|
||||
snapshot_count = tag.snapshot_set.count()
|
||||
if is_tty:
|
||||
rprint(f'[cyan]{tag.name:30}[/cyan] [dim]({snapshot_count} snapshots)[/dim]')
|
||||
rprint(f"[cyan]{tag.name:30}[/cyan] [dim]({snapshot_count} snapshots)[/dim]")
|
||||
else:
|
||||
write_record(tag.to_json())
|
||||
count += 1
|
||||
|
||||
rprint(f'[dim]Listed {count} tags[/dim]', file=sys.stderr)
|
||||
rprint(f"[dim]Listed {count} tags[/dim]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -130,7 +132,8 @@ def list_tags(
|
||||
# UPDATE
|
||||
# =============================================================================
|
||||
|
||||
def update_tags(name: Optional[str] = None) -> int:
|
||||
|
||||
def update_tags(name: str | None = None) -> int:
|
||||
"""
|
||||
Update Tags from stdin JSONL.
|
||||
|
||||
@@ -148,13 +151,13 @@ def update_tags(name: Optional[str] = None) -> int:
|
||||
|
||||
records = list(read_stdin())
|
||||
if not records:
|
||||
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
updated_count = 0
|
||||
for record in records:
|
||||
tag_id = record.get('id')
|
||||
old_name = record.get('name')
|
||||
tag_id = record.get("id")
|
||||
old_name = record.get("name")
|
||||
|
||||
if not tag_id and not old_name:
|
||||
continue
|
||||
@@ -176,10 +179,10 @@ def update_tags(name: Optional[str] = None) -> int:
|
||||
write_record(tag.to_json())
|
||||
|
||||
except Tag.DoesNotExist:
|
||||
rprint(f'[yellow]Tag not found: {tag_id or old_name}[/yellow]', file=sys.stderr)
|
||||
rprint(f"[yellow]Tag not found: {tag_id or old_name}[/yellow]", file=sys.stderr)
|
||||
continue
|
||||
|
||||
rprint(f'[green]Updated {updated_count} tags[/green]', file=sys.stderr)
|
||||
rprint(f"[green]Updated {updated_count} tags[/green]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -187,6 +190,7 @@ def update_tags(name: Optional[str] = None) -> int:
|
||||
# DELETE
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def delete_tags(yes: bool = False, dry_run: bool = False) -> int:
|
||||
"""
|
||||
Delete Tags from stdin JSONL.
|
||||
@@ -202,23 +206,24 @@ def delete_tags(yes: bool = False, dry_run: bool = False) -> int:
|
||||
|
||||
records = list(read_stdin())
|
||||
if not records:
|
||||
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Collect tag IDs or names
|
||||
tag_ids = []
|
||||
tag_names = []
|
||||
for r in records:
|
||||
if r.get('id'):
|
||||
tag_ids.append(r['id'])
|
||||
elif r.get('name'):
|
||||
tag_names.append(r['name'])
|
||||
if r.get("id"):
|
||||
tag_ids.append(r["id"])
|
||||
elif r.get("name"):
|
||||
tag_names.append(r["name"])
|
||||
|
||||
if not tag_ids and not tag_names:
|
||||
rprint('[yellow]No valid tag IDs or names in input[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No valid tag IDs or names in input[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
from django.db.models import Q
|
||||
|
||||
query = Q()
|
||||
if tag_ids:
|
||||
query |= Q(id__in=tag_ids)
|
||||
@@ -229,22 +234,22 @@ def delete_tags(yes: bool = False, dry_run: bool = False) -> int:
|
||||
count = tags.count()
|
||||
|
||||
if count == 0:
|
||||
rprint('[yellow]No matching tags found[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No matching tags found[/yellow]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
if dry_run:
|
||||
rprint(f'[yellow]Would delete {count} tags (dry run)[/yellow]', file=sys.stderr)
|
||||
rprint(f"[yellow]Would delete {count} tags (dry run)[/yellow]", file=sys.stderr)
|
||||
for tag in tags:
|
||||
rprint(f' {tag.name}', file=sys.stderr)
|
||||
rprint(f" {tag.name}", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
if not yes:
|
||||
rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr)
|
||||
rprint("[red]Use --yes to confirm deletion[/red]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Perform deletion
|
||||
deleted_count, _ = tags.delete()
|
||||
rprint(f'[green]Deleted {deleted_count} tags[/green]', file=sys.stderr)
|
||||
rprint(f"[green]Deleted {deleted_count} tags[/green]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -252,42 +257,43 @@ def delete_tags(yes: bool = False, dry_run: bool = False) -> int:
|
||||
# CLI Commands
|
||||
# =============================================================================
|
||||
|
||||
|
||||
@click.group()
|
||||
def main():
|
||||
"""Manage Tag records."""
|
||||
pass
|
||||
|
||||
|
||||
@main.command('create')
|
||||
@click.argument('names', nargs=-1)
|
||||
@main.command("create")
|
||||
@click.argument("names", nargs=-1)
|
||||
def create_cmd(names: tuple):
|
||||
"""Create Tags from names."""
|
||||
sys.exit(create_tags(names))
|
||||
|
||||
|
||||
@main.command('list')
|
||||
@click.option('--name', help='Filter by exact name')
|
||||
@click.option('--name__icontains', help='Filter by name contains')
|
||||
@click.option('--limit', '-n', type=int, help='Limit number of results')
|
||||
def list_cmd(name: Optional[str], name__icontains: Optional[str], limit: Optional[int]):
|
||||
@main.command("list")
|
||||
@click.option("--name", help="Filter by exact name")
|
||||
@click.option("--name__icontains", help="Filter by name contains")
|
||||
@click.option("--limit", "-n", type=int, help="Limit number of results")
|
||||
def list_cmd(name: str | None, name__icontains: str | None, limit: int | None):
|
||||
"""List Tags as JSONL."""
|
||||
sys.exit(list_tags(name=name, name__icontains=name__icontains, limit=limit))
|
||||
|
||||
|
||||
@main.command('update')
|
||||
@click.option('--name', '-n', help='Set new name')
|
||||
def update_cmd(name: Optional[str]):
|
||||
@main.command("update")
|
||||
@click.option("--name", "-n", help="Set new name")
|
||||
def update_cmd(name: str | None):
|
||||
"""Update Tags from stdin JSONL."""
|
||||
sys.exit(update_tags(name=name))
|
||||
|
||||
|
||||
@main.command('delete')
|
||||
@click.option('--yes', '-y', is_flag=True, help='Confirm deletion')
|
||||
@click.option('--dry-run', is_flag=True, help='Show what would be deleted')
|
||||
@main.command("delete")
|
||||
@click.option("--yes", "-y", is_flag=True, help="Confirm deletion")
|
||||
@click.option("--dry-run", is_flag=True, help="Show what would be deleted")
|
||||
def delete_cmd(yes: bool, dry_run: bool):
|
||||
"""Delete Tags from stdin JSONL."""
|
||||
sys.exit(delete_tags(yes=yes, dry_run=dry_run))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -1,11 +1,12 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__package__ = "archivebox.cli"
|
||||
|
||||
import os
|
||||
import time
|
||||
|
||||
from typing import TYPE_CHECKING, Callable, Iterable
|
||||
from typing import TYPE_CHECKING, Any
|
||||
from collections.abc import Callable, Iterable
|
||||
from pathlib import Path
|
||||
|
||||
import rich_click as click
|
||||
@@ -20,24 +21,22 @@ if TYPE_CHECKING:
|
||||
|
||||
|
||||
LINK_FILTERS: dict[str, Callable[[str], Q]] = {
|
||||
'exact': lambda pattern: Q(url=pattern),
|
||||
'substring': lambda pattern: Q(url__icontains=pattern),
|
||||
'regex': lambda pattern: Q(url__iregex=pattern),
|
||||
'domain': lambda pattern: (
|
||||
Q(url__istartswith=f'http://{pattern}')
|
||||
| Q(url__istartswith=f'https://{pattern}')
|
||||
| Q(url__istartswith=f'ftp://{pattern}')
|
||||
"exact": lambda pattern: Q(url=pattern),
|
||||
"substring": lambda pattern: Q(url__icontains=pattern),
|
||||
"regex": lambda pattern: Q(url__iregex=pattern),
|
||||
"domain": lambda pattern: (
|
||||
Q(url__istartswith=f"http://{pattern}") | Q(url__istartswith=f"https://{pattern}") | Q(url__istartswith=f"ftp://{pattern}")
|
||||
),
|
||||
'tag': lambda pattern: Q(tags__name=pattern),
|
||||
'timestamp': lambda pattern: Q(timestamp=pattern),
|
||||
"tag": lambda pattern: Q(tags__name=pattern),
|
||||
"timestamp": lambda pattern: Q(timestamp=pattern),
|
||||
}
|
||||
|
||||
|
||||
def _apply_pattern_filters(
|
||||
snapshots: QuerySet['Snapshot', 'Snapshot'],
|
||||
snapshots: QuerySet["Snapshot", "Snapshot"],
|
||||
filter_patterns: list[str],
|
||||
filter_type: str,
|
||||
) -> QuerySet['Snapshot', 'Snapshot']:
|
||||
) -> QuerySet["Snapshot", "Snapshot"]:
|
||||
filter_builder = LINK_FILTERS.get(filter_type)
|
||||
if filter_builder is None:
|
||||
raise SystemExit(2)
|
||||
@@ -48,21 +47,120 @@ def _apply_pattern_filters(
|
||||
return snapshots.filter(query)
|
||||
|
||||
|
||||
def _get_snapshot_crawl(snapshot: 'Snapshot') -> 'Crawl | None':
|
||||
def _get_snapshot_crawl(snapshot: "Snapshot") -> "Crawl | None":
|
||||
try:
|
||||
return snapshot.crawl
|
||||
except ObjectDoesNotExist:
|
||||
return None
|
||||
|
||||
|
||||
def _get_search_indexing_plugins() -> list[str]:
|
||||
from abx_dl.models import discover_plugins
|
||||
from archivebox.hooks import get_search_backends
|
||||
|
||||
available_backends = set(get_search_backends())
|
||||
plugins = discover_plugins()
|
||||
return sorted(
|
||||
plugin_name
|
||||
for plugin_name, plugin in plugins.items()
|
||||
if plugin_name.startswith("search_backend_")
|
||||
and plugin_name.removeprefix("search_backend_") in available_backends
|
||||
and any("Snapshot" in hook.name and "index" in hook.name.lower() for hook in plugin.hooks)
|
||||
)
|
||||
|
||||
|
||||
def _build_filtered_snapshots_queryset(
|
||||
*,
|
||||
filter_patterns: Iterable[str],
|
||||
filter_type: str,
|
||||
before: float | None,
|
||||
after: float | None,
|
||||
resume: str | None = None,
|
||||
):
|
||||
from archivebox.core.models import Snapshot
|
||||
from datetime import datetime
|
||||
|
||||
snapshots = Snapshot.objects.all()
|
||||
|
||||
if filter_patterns:
|
||||
snapshots = _apply_pattern_filters(snapshots, list(filter_patterns), filter_type)
|
||||
|
||||
if before:
|
||||
snapshots = snapshots.filter(bookmarked_at__lt=datetime.fromtimestamp(before))
|
||||
if after:
|
||||
snapshots = snapshots.filter(bookmarked_at__gt=datetime.fromtimestamp(after))
|
||||
if resume:
|
||||
snapshots = snapshots.filter(timestamp__lte=resume)
|
||||
|
||||
return snapshots.select_related("crawl").order_by("-bookmarked_at")
|
||||
|
||||
|
||||
def reindex_snapshots(
|
||||
snapshots: QuerySet["Snapshot", "Snapshot"],
|
||||
*,
|
||||
search_plugins: list[str],
|
||||
batch_size: int,
|
||||
) -> dict[str, int]:
|
||||
from archivebox.cli.archivebox_extract import run_plugins
|
||||
|
||||
stats = {"processed": 0, "reconciled": 0, "queued": 0, "reindexed": 0}
|
||||
records: list[dict[str, str]] = []
|
||||
|
||||
total = snapshots.count()
|
||||
print(f"[*] Reindexing {total} snapshots with search plugins: {', '.join(search_plugins)}")
|
||||
|
||||
for snapshot in snapshots.iterator(chunk_size=batch_size):
|
||||
stats["processed"] += 1
|
||||
|
||||
if _get_snapshot_crawl(snapshot) is None:
|
||||
continue
|
||||
|
||||
output_dir = Path(snapshot.output_dir)
|
||||
has_directory = output_dir.exists() and output_dir.is_dir()
|
||||
if has_directory:
|
||||
snapshot.reconcile_with_index_json()
|
||||
stats["reconciled"] += 1
|
||||
|
||||
for plugin_name in search_plugins:
|
||||
existing_result = snapshot.archiveresult_set.filter(plugin=plugin_name).order_by("-created_at").first()
|
||||
if existing_result:
|
||||
existing_result.reset_for_retry()
|
||||
records.append(
|
||||
{
|
||||
"type": "ArchiveResult",
|
||||
"snapshot_id": str(snapshot.id),
|
||||
"plugin": plugin_name,
|
||||
},
|
||||
)
|
||||
stats["queued"] += 1
|
||||
|
||||
if not records:
|
||||
return stats
|
||||
|
||||
exit_code = run_plugins(
|
||||
args=(),
|
||||
records=records,
|
||||
wait=True,
|
||||
emit_results=False,
|
||||
)
|
||||
if exit_code != 0:
|
||||
raise SystemExit(exit_code)
|
||||
|
||||
stats["reindexed"] = len(records)
|
||||
return stats
|
||||
|
||||
|
||||
@enforce_types
|
||||
def update(filter_patterns: Iterable[str] = (),
|
||||
filter_type: str = 'exact',
|
||||
before: float | None = None,
|
||||
after: float | None = None,
|
||||
resume: str | None = None,
|
||||
batch_size: int = 100,
|
||||
continuous: bool = False) -> None:
|
||||
def update(
|
||||
filter_patterns: Iterable[str] = (),
|
||||
filter_type: str = "exact",
|
||||
before: float | None = None,
|
||||
after: float | None = None,
|
||||
resume: str | None = None,
|
||||
batch_size: int = 100,
|
||||
continuous: bool = False,
|
||||
index_only: bool = False,
|
||||
) -> None:
|
||||
"""
|
||||
Update snapshots: migrate old dirs, reconcile DB, and re-queue for archiving.
|
||||
|
||||
@@ -77,41 +175,69 @@ def update(filter_patterns: Iterable[str] = (),
|
||||
|
||||
from rich import print
|
||||
from archivebox.config.django import setup_django
|
||||
|
||||
setup_django()
|
||||
|
||||
from django.core.management import call_command
|
||||
|
||||
# Run migrations first to ensure DB schema is up-to-date
|
||||
print('[*] Checking for pending migrations...')
|
||||
print("[*] Checking for pending migrations...")
|
||||
try:
|
||||
call_command('migrate', '--no-input', verbosity=0)
|
||||
call_command("migrate", "--no-input", verbosity=0)
|
||||
except Exception as e:
|
||||
print(f'[!] Warning: Migration check failed: {e}')
|
||||
print(f"[!] Warning: Migration check failed: {e}")
|
||||
|
||||
while True:
|
||||
if filter_patterns or before or after:
|
||||
if index_only:
|
||||
search_plugins = _get_search_indexing_plugins()
|
||||
if not search_plugins:
|
||||
print("[*] No search indexing plugins are available, nothing to backfill.")
|
||||
break
|
||||
|
||||
if not (filter_patterns or before or after):
|
||||
print("[*] Phase 1: Draining old archive/ directories (0.8.x → 0.9.x migration)...")
|
||||
drain_old_archive_dirs(
|
||||
resume_from=resume,
|
||||
batch_size=batch_size,
|
||||
)
|
||||
|
||||
snapshots = _build_filtered_snapshots_queryset(
|
||||
filter_patterns=filter_patterns,
|
||||
filter_type=filter_type,
|
||||
before=before,
|
||||
after=after,
|
||||
resume=resume,
|
||||
)
|
||||
stats = reindex_snapshots(
|
||||
snapshots,
|
||||
search_plugins=search_plugins,
|
||||
batch_size=batch_size,
|
||||
)
|
||||
print_index_stats(stats)
|
||||
elif filter_patterns or before or after:
|
||||
# Filtered mode: query DB only
|
||||
print('[*] Processing filtered snapshots from database...')
|
||||
print("[*] Processing filtered snapshots from database...")
|
||||
stats = process_filtered_snapshots(
|
||||
filter_patterns=filter_patterns,
|
||||
filter_type=filter_type,
|
||||
before=before,
|
||||
after=after,
|
||||
batch_size=batch_size
|
||||
resume=resume,
|
||||
batch_size=batch_size,
|
||||
)
|
||||
print_stats(stats)
|
||||
else:
|
||||
# Full mode: drain old dirs + process DB
|
||||
stats_combined = {'phase1': {}, 'phase2': {}}
|
||||
stats_combined = {"phase1": {}, "phase2": {}}
|
||||
|
||||
print('[*] Phase 1: Draining old archive/ directories (0.8.x → 0.9.x migration)...')
|
||||
stats_combined['phase1'] = drain_old_archive_dirs(
|
||||
print("[*] Phase 1: Draining old archive/ directories (0.8.x → 0.9.x migration)...")
|
||||
stats_combined["phase1"] = drain_old_archive_dirs(
|
||||
resume_from=resume,
|
||||
batch_size=batch_size
|
||||
batch_size=batch_size,
|
||||
)
|
||||
|
||||
print('[*] Phase 2: Processing all database snapshots (most recent first)...')
|
||||
stats_combined['phase2'] = process_all_db_snapshots(batch_size=batch_size)
|
||||
print("[*] Phase 2: Processing all database snapshots (most recent first)...")
|
||||
stats_combined["phase2"] = process_all_db_snapshots(batch_size=batch_size, resume=resume)
|
||||
|
||||
# Phase 3: Deduplication (disabled for now)
|
||||
# print('[*] Phase 3: Deduplicating...')
|
||||
@@ -122,7 +248,7 @@ def update(filter_patterns: Iterable[str] = (),
|
||||
if not continuous:
|
||||
break
|
||||
|
||||
print('[yellow]Sleeping 60s before next pass...[/yellow]')
|
||||
print("[yellow]Sleeping 60s before next pass...[/yellow]")
|
||||
time.sleep(60)
|
||||
resume = None
|
||||
|
||||
@@ -144,34 +270,34 @@ def drain_old_archive_dirs(resume_from: str | None = None, batch_size: int = 100
|
||||
from archivebox.config import CONSTANTS
|
||||
from django.db import transaction
|
||||
|
||||
stats = {'processed': 0, 'migrated': 0, 'skipped': 0, 'invalid': 0}
|
||||
stats = {"processed": 0, "migrated": 0, "skipped": 0, "invalid": 0}
|
||||
|
||||
archive_dir = CONSTANTS.ARCHIVE_DIR
|
||||
if not archive_dir.exists():
|
||||
return stats
|
||||
|
||||
print('[DEBUG Phase1] Scanning for old directories in archive/...')
|
||||
print("[DEBUG Phase1] Scanning for old directories in archive/...")
|
||||
|
||||
# Scan for real directories only (skip symlinks - they're already migrated)
|
||||
all_entries = list(os.scandir(archive_dir))
|
||||
print(f'[DEBUG Phase1] Total entries in archive/: {len(all_entries)}')
|
||||
print(f"[DEBUG Phase1] Total entries in archive/: {len(all_entries)}")
|
||||
entries = [
|
||||
(e.stat().st_mtime, e.path)
|
||||
for e in all_entries
|
||||
if e.is_dir(follow_symlinks=False) # Skip symlinks
|
||||
]
|
||||
entries.sort(reverse=True) # Newest first
|
||||
print(f'[DEBUG Phase1] Real directories (not symlinks): {len(entries)}')
|
||||
print(f'[*] Found {len(entries)} old directories to drain')
|
||||
print(f"[DEBUG Phase1] Real directories (not symlinks): {len(entries)}")
|
||||
print(f"[*] Found {len(entries)} old directories to drain")
|
||||
|
||||
for mtime, entry_path in entries:
|
||||
entry_path = Path(entry_path)
|
||||
|
||||
# Resume from timestamp if specified
|
||||
if resume_from and entry_path.name < resume_from:
|
||||
if resume_from and entry_path.name > resume_from:
|
||||
continue
|
||||
|
||||
stats['processed'] += 1
|
||||
stats["processed"] += 1
|
||||
|
||||
# Try to load existing snapshot from DB
|
||||
snapshot = Snapshot.load_from_directory(entry_path)
|
||||
@@ -182,16 +308,16 @@ def drain_old_archive_dirs(resume_from: str | None = None, batch_size: int = 100
|
||||
if not snapshot:
|
||||
# Invalid directory - move to invalid/
|
||||
Snapshot.move_directory_to_invalid(entry_path)
|
||||
stats['invalid'] += 1
|
||||
stats["invalid"] += 1
|
||||
print(f" [{stats['processed']}] Invalid: {entry_path.name}")
|
||||
continue
|
||||
|
||||
try:
|
||||
snapshot.save()
|
||||
stats['migrated'] += 1
|
||||
stats["migrated"] += 1
|
||||
print(f" [{stats['processed']}] Imported orphaned snapshot: {entry_path.name}")
|
||||
except Exception as e:
|
||||
stats['skipped'] += 1
|
||||
stats["skipped"] += 1
|
||||
print(f" [{stats['processed']}] Skipped (error: {e}): {entry_path.name}")
|
||||
continue
|
||||
|
||||
@@ -201,30 +327,35 @@ def drain_old_archive_dirs(resume_from: str | None = None, batch_size: int = 100
|
||||
if not has_valid_crawl:
|
||||
# Create a new crawl (created_by will default to system user)
|
||||
from archivebox.crawls.models import Crawl
|
||||
|
||||
crawl = Crawl.objects.create(urls=snapshot.url)
|
||||
# Use queryset update to avoid triggering save() hooks
|
||||
from archivebox.core.models import Snapshot as SnapshotModel
|
||||
|
||||
SnapshotModel.objects.filter(pk=snapshot.pk).update(crawl=crawl)
|
||||
# Refresh the instance
|
||||
snapshot.crawl = crawl
|
||||
print(f"[DEBUG Phase1] Created missing crawl for snapshot {str(snapshot.id)[:8]}")
|
||||
|
||||
# Check if needs migration (0.8.x → 0.9.x)
|
||||
print(f"[DEBUG Phase1] Snapshot {str(snapshot.id)[:8]}: fs_version={snapshot.fs_version}, needs_migration={snapshot.fs_migration_needed}")
|
||||
print(
|
||||
f"[DEBUG Phase1] Snapshot {str(snapshot.id)[:8]}: fs_version={snapshot.fs_version}, needs_migration={snapshot.fs_migration_needed}",
|
||||
)
|
||||
if snapshot.fs_migration_needed:
|
||||
try:
|
||||
# Calculate paths using actual directory (entry_path), not snapshot.timestamp
|
||||
# because snapshot.timestamp might be truncated
|
||||
old_dir = entry_path
|
||||
new_dir = snapshot.get_storage_path_for_version('0.9.0')
|
||||
new_dir = snapshot.get_storage_path_for_version("0.9.0")
|
||||
print(f"[DEBUG Phase1] Migrating {old_dir.name} → {new_dir}")
|
||||
|
||||
# Manually migrate files
|
||||
if not new_dir.exists() and old_dir.exists():
|
||||
new_dir.mkdir(parents=True, exist_ok=True)
|
||||
import shutil
|
||||
|
||||
file_count = 0
|
||||
for old_file in old_dir.rglob('*'):
|
||||
for old_file in old_dir.rglob("*"):
|
||||
if old_file.is_file():
|
||||
rel_path = old_file.relative_to(old_dir)
|
||||
new_file = new_dir / rel_path
|
||||
@@ -236,7 +367,8 @@ def drain_old_archive_dirs(resume_from: str | None = None, batch_size: int = 100
|
||||
|
||||
# Update only fs_version field using queryset update (bypasses validation)
|
||||
from archivebox.core.models import Snapshot as SnapshotModel
|
||||
SnapshotModel.objects.filter(pk=snapshot.pk).update(fs_version='0.9.0')
|
||||
|
||||
SnapshotModel.objects.filter(pk=snapshot.pk).update(fs_version="0.9.0")
|
||||
|
||||
# Commit the transaction
|
||||
transaction.commit()
|
||||
@@ -245,22 +377,22 @@ def drain_old_archive_dirs(resume_from: str | None = None, batch_size: int = 100
|
||||
if old_dir.exists() and old_dir != new_dir:
|
||||
snapshot._cleanup_old_migration_dir(old_dir, new_dir)
|
||||
|
||||
stats['migrated'] += 1
|
||||
stats["migrated"] += 1
|
||||
print(f" [{stats['processed']}] Migrated: {entry_path.name}")
|
||||
except Exception as e:
|
||||
stats['skipped'] += 1
|
||||
stats["skipped"] += 1
|
||||
print(f" [{stats['processed']}] Skipped (error: {e}): {entry_path.name}")
|
||||
else:
|
||||
stats['skipped'] += 1
|
||||
stats["skipped"] += 1
|
||||
|
||||
if stats['processed'] % batch_size == 0:
|
||||
if stats["processed"] % batch_size == 0:
|
||||
transaction.commit()
|
||||
|
||||
transaction.commit()
|
||||
return stats
|
||||
|
||||
|
||||
def process_all_db_snapshots(batch_size: int = 100) -> dict[str, int]:
|
||||
def process_all_db_snapshots(batch_size: int = 100, resume: str | None = None) -> dict[str, int]:
|
||||
"""
|
||||
O(n) scan over entire DB from most recent to least recent.
|
||||
|
||||
@@ -275,24 +407,30 @@ def process_all_db_snapshots(batch_size: int = 100) -> dict[str, int]:
|
||||
from django.db import transaction
|
||||
from django.utils import timezone
|
||||
|
||||
stats = {'processed': 0, 'reconciled': 0, 'queued': 0}
|
||||
stats = {"processed": 0, "reconciled": 0, "queued": 0}
|
||||
|
||||
total = Snapshot.objects.count()
|
||||
print(f'[*] Processing {total} snapshots from database (most recent first)...')
|
||||
queryset = Snapshot.objects.all()
|
||||
if resume:
|
||||
queryset = queryset.filter(timestamp__lte=resume)
|
||||
total = queryset.count()
|
||||
print(f"[*] Processing {total} snapshots from database (most recent first)...")
|
||||
|
||||
# Process from most recent to least recent
|
||||
for snapshot in Snapshot.objects.select_related('crawl').order_by('-bookmarked_at').iterator(chunk_size=batch_size):
|
||||
stats['processed'] += 1
|
||||
for snapshot in queryset.select_related("crawl").order_by("-bookmarked_at").iterator(chunk_size=batch_size):
|
||||
stats["processed"] += 1
|
||||
|
||||
# Skip snapshots with missing crawl references (orphaned by migration errors)
|
||||
if _get_snapshot_crawl(snapshot) is None:
|
||||
continue
|
||||
|
||||
try:
|
||||
print(f"[DEBUG Phase2] Snapshot {str(snapshot.id)[:8]}: fs_version={snapshot.fs_version}, needs_migration={snapshot.fs_migration_needed}")
|
||||
print(
|
||||
f"[DEBUG Phase2] Snapshot {str(snapshot.id)[:8]}: fs_version={snapshot.fs_version}, needs_migration={snapshot.fs_migration_needed}",
|
||||
)
|
||||
|
||||
# Check if snapshot has a directory on disk
|
||||
from pathlib import Path
|
||||
|
||||
output_dir = Path(snapshot.output_dir)
|
||||
has_directory = output_dir.exists() and output_dir.is_dir()
|
||||
|
||||
@@ -313,22 +451,23 @@ def process_all_db_snapshots(batch_size: int = 100) -> dict[str, int]:
|
||||
print(f"[DEBUG Phase2] Orphan snapshot {str(snapshot.id)[:8]} - marking as migrated without filesystem operation")
|
||||
# Use queryset update to set fs_version without triggering save() hooks
|
||||
from archivebox.core.models import Snapshot as SnapshotModel
|
||||
SnapshotModel.objects.filter(pk=snapshot.pk).update(fs_version='0.9.0')
|
||||
snapshot.fs_version = '0.9.0'
|
||||
|
||||
SnapshotModel.objects.filter(pk=snapshot.pk).update(fs_version="0.9.0")
|
||||
snapshot.fs_version = "0.9.0"
|
||||
|
||||
# Queue for archiving (state machine will handle it)
|
||||
snapshot.status = Snapshot.StatusChoices.QUEUED
|
||||
snapshot.retry_at = timezone.now()
|
||||
snapshot.save()
|
||||
|
||||
stats['reconciled'] += 1 if has_directory else 0
|
||||
stats['queued'] += 1
|
||||
stats["reconciled"] += 1 if has_directory else 0
|
||||
stats["queued"] += 1
|
||||
except Exception as e:
|
||||
# Skip snapshots that can't be processed (e.g., missing crawl)
|
||||
print(f" [!] Skipping snapshot {snapshot.id}: {e}")
|
||||
continue
|
||||
|
||||
if stats['processed'] % batch_size == 0:
|
||||
if stats["processed"] % batch_size == 0:
|
||||
transaction.commit()
|
||||
print(f" [{stats['processed']}/{total}] Processed...")
|
||||
|
||||
@@ -341,31 +480,28 @@ def process_filtered_snapshots(
|
||||
filter_type: str,
|
||||
before: float | None,
|
||||
after: float | None,
|
||||
batch_size: int
|
||||
resume: str | None,
|
||||
batch_size: int,
|
||||
) -> dict[str, int]:
|
||||
"""Process snapshots matching filters (DB query only)."""
|
||||
from archivebox.core.models import Snapshot
|
||||
from django.db import transaction
|
||||
from django.utils import timezone
|
||||
from datetime import datetime
|
||||
|
||||
stats = {'processed': 0, 'reconciled': 0, 'queued': 0}
|
||||
stats = {"processed": 0, "reconciled": 0, "queued": 0}
|
||||
|
||||
snapshots = Snapshot.objects.all()
|
||||
|
||||
if filter_patterns:
|
||||
snapshots = _apply_pattern_filters(snapshots, list(filter_patterns), filter_type)
|
||||
|
||||
if before:
|
||||
snapshots = snapshots.filter(bookmarked_at__lt=datetime.fromtimestamp(before))
|
||||
if after:
|
||||
snapshots = snapshots.filter(bookmarked_at__gt=datetime.fromtimestamp(after))
|
||||
snapshots = _build_filtered_snapshots_queryset(
|
||||
filter_patterns=filter_patterns,
|
||||
filter_type=filter_type,
|
||||
before=before,
|
||||
after=after,
|
||||
resume=resume,
|
||||
)
|
||||
|
||||
total = snapshots.count()
|
||||
print(f'[*] Found {total} matching snapshots')
|
||||
print(f"[*] Found {total} matching snapshots")
|
||||
|
||||
for snapshot in snapshots.select_related('crawl').iterator(chunk_size=batch_size):
|
||||
stats['processed'] += 1
|
||||
for snapshot in snapshots.select_related("crawl").iterator(chunk_size=batch_size):
|
||||
stats["processed"] += 1
|
||||
|
||||
# Skip snapshots with missing crawl references
|
||||
if _get_snapshot_crawl(snapshot) is None:
|
||||
@@ -384,14 +520,14 @@ def process_filtered_snapshots(
|
||||
snapshot.retry_at = timezone.now()
|
||||
snapshot.save()
|
||||
|
||||
stats['reconciled'] += 1
|
||||
stats['queued'] += 1
|
||||
stats["reconciled"] += 1
|
||||
stats["queued"] += 1
|
||||
except Exception as e:
|
||||
# Skip snapshots that can't be processed
|
||||
print(f" [!] Skipping snapshot {snapshot.id}: {e}")
|
||||
continue
|
||||
|
||||
if stats['processed'] % batch_size == 0:
|
||||
if stats["processed"] % batch_size == 0:
|
||||
transaction.commit()
|
||||
print(f" [{stats['processed']}/{total}] Processed...")
|
||||
|
||||
@@ -405,9 +541,9 @@ def print_stats(stats: dict):
|
||||
|
||||
print(f"""
|
||||
[green]Update Complete[/green]
|
||||
Processed: {stats['processed']}
|
||||
Reconciled: {stats['reconciled']}
|
||||
Queued: {stats['queued']}
|
||||
Processed: {stats["processed"]}
|
||||
Reconciled: {stats["reconciled"]}
|
||||
Queued: {stats["queued"]}
|
||||
""")
|
||||
|
||||
|
||||
@@ -415,37 +551,50 @@ def print_combined_stats(stats_combined: dict):
|
||||
"""Print statistics for full mode."""
|
||||
from rich import print
|
||||
|
||||
s1 = stats_combined['phase1']
|
||||
s2 = stats_combined['phase2']
|
||||
s1 = stats_combined["phase1"]
|
||||
s2 = stats_combined["phase2"]
|
||||
|
||||
print(f"""
|
||||
[green]Archive Update Complete[/green]
|
||||
|
||||
Phase 1 (Drain Old Dirs):
|
||||
Checked: {s1.get('processed', 0)}
|
||||
Migrated: {s1.get('migrated', 0)}
|
||||
Skipped: {s1.get('skipped', 0)}
|
||||
Invalid: {s1.get('invalid', 0)}
|
||||
Checked: {s1.get("processed", 0)}
|
||||
Migrated: {s1.get("migrated", 0)}
|
||||
Skipped: {s1.get("skipped", 0)}
|
||||
Invalid: {s1.get("invalid", 0)}
|
||||
|
||||
Phase 2 (Process DB):
|
||||
Processed: {s2.get('processed', 0)}
|
||||
Reconciled: {s2.get('reconciled', 0)}
|
||||
Queued: {s2.get('queued', 0)}
|
||||
Processed: {s2.get("processed", 0)}
|
||||
Reconciled: {s2.get("reconciled", 0)}
|
||||
Queued: {s2.get("queued", 0)}
|
||||
""")
|
||||
|
||||
|
||||
def print_index_stats(stats: dict[str, Any]) -> None:
|
||||
from rich import print
|
||||
|
||||
print(f"""
|
||||
[green]Search Reindex Complete[/green]
|
||||
Processed: {stats["processed"]}
|
||||
Reconciled: {stats["reconciled"]}
|
||||
Queued: {stats["queued"]}
|
||||
Reindexed: {stats["reindexed"]}
|
||||
""")
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option('--resume', type=str, help='Resume from timestamp')
|
||||
@click.option('--before', type=float, help='Only snapshots before timestamp')
|
||||
@click.option('--after', type=float, help='Only snapshots after timestamp')
|
||||
@click.option('--filter-type', '-t', type=click.Choice(['exact', 'substring', 'regex', 'domain', 'tag', 'timestamp']), default='exact')
|
||||
@click.option('--batch-size', type=int, default=100, help='Commit every N snapshots')
|
||||
@click.option('--continuous', is_flag=True, help='Run continuously as background worker')
|
||||
@click.argument('filter_patterns', nargs=-1)
|
||||
@click.option("--resume", type=str, help="Resume from timestamp")
|
||||
@click.option("--before", type=float, help="Only snapshots before timestamp")
|
||||
@click.option("--after", type=float, help="Only snapshots after timestamp")
|
||||
@click.option("--filter-type", "-t", type=click.Choice(["exact", "substring", "regex", "domain", "tag", "timestamp"]), default="exact")
|
||||
@click.option("--batch-size", type=int, default=100, help="Commit every N snapshots")
|
||||
@click.option("--continuous", is_flag=True, help="Run continuously as background worker")
|
||||
@click.option("--index-only", is_flag=True, help="Backfill available search indexes from existing archived content")
|
||||
@click.argument("filter_patterns", nargs=-1)
|
||||
@docstring(update.__doc__)
|
||||
def main(**kwargs):
|
||||
update(**kwargs)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -1,12 +1,12 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__package__ = "archivebox.cli"
|
||||
|
||||
import sys
|
||||
import os
|
||||
import platform
|
||||
from pathlib import Path
|
||||
from typing import Iterable
|
||||
from collections.abc import Iterable
|
||||
|
||||
import rich_click as click
|
||||
|
||||
@@ -14,19 +14,22 @@ from archivebox.misc.util import docstring, enforce_types
|
||||
|
||||
|
||||
@enforce_types
|
||||
def version(quiet: bool=False,
|
||||
binaries: Iterable[str]=()) -> list[str]:
|
||||
def version(
|
||||
quiet: bool = False,
|
||||
binaries: Iterable[str] = (),
|
||||
) -> list[str]:
|
||||
"""Print the ArchiveBox version, debug metadata, and installed dependency versions"""
|
||||
|
||||
|
||||
# fast path for just getting the version and exiting, dont do any slower imports
|
||||
from archivebox.config.version import VERSION
|
||||
|
||||
print(VERSION)
|
||||
if quiet or '--version' in sys.argv:
|
||||
if quiet or "--version" in sys.argv:
|
||||
return []
|
||||
|
||||
|
||||
from rich.panel import Panel
|
||||
from rich.console import Console
|
||||
|
||||
|
||||
from archivebox.config import CONSTANTS, DATA_DIR
|
||||
from archivebox.config.version import get_COMMIT_HASH, get_BUILD_TIME
|
||||
from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, RUNNING_AS_UID, RUNNING_AS_GID, IN_DOCKER
|
||||
@@ -34,78 +37,89 @@ def version(quiet: bool=False,
|
||||
from archivebox.config.common import SHELL_CONFIG, STORAGE_CONFIG, SEARCH_BACKEND_CONFIG
|
||||
from archivebox.misc.logging_util import printable_folder_status
|
||||
from archivebox.config.configset import get_config
|
||||
|
||||
|
||||
console = Console()
|
||||
prnt = console.print
|
||||
|
||||
|
||||
# Check if LDAP is enabled (simple config lookup)
|
||||
config = get_config()
|
||||
LDAP_ENABLED = config.get('LDAP_ENABLED', False)
|
||||
LDAP_ENABLED = config.get("LDAP_ENABLED", False)
|
||||
|
||||
p = platform.uname()
|
||||
COMMIT_HASH = get_COMMIT_HASH()
|
||||
prnt(
|
||||
'[dark_green]ArchiveBox[/dark_green] [dark_goldenrod]v{}[/dark_goldenrod]'.format(CONSTANTS.VERSION),
|
||||
f'COMMIT_HASH={COMMIT_HASH[:7] if COMMIT_HASH else "unknown"}',
|
||||
f'BUILD_TIME={get_BUILD_TIME()}',
|
||||
f"[dark_green]ArchiveBox[/dark_green] [dark_goldenrod]v{CONSTANTS.VERSION}[/dark_goldenrod]",
|
||||
f"COMMIT_HASH={COMMIT_HASH[:7] if COMMIT_HASH else 'unknown'}",
|
||||
f"BUILD_TIME={get_BUILD_TIME()}",
|
||||
)
|
||||
prnt(
|
||||
f'IN_DOCKER={IN_DOCKER}',
|
||||
f'IN_QEMU={SHELL_CONFIG.IN_QEMU}',
|
||||
f'ARCH={p.machine}',
|
||||
f'OS={p.system}',
|
||||
f'PLATFORM={platform.platform()}',
|
||||
f'PYTHON={sys.implementation.name.title()}' + (' (venv)' if CONSTANTS.IS_INSIDE_VENV else ''),
|
||||
f"IN_DOCKER={IN_DOCKER}",
|
||||
f"IN_QEMU={SHELL_CONFIG.IN_QEMU}",
|
||||
f"ARCH={p.machine}",
|
||||
f"OS={p.system}",
|
||||
f"PLATFORM={platform.platform()}",
|
||||
f"PYTHON={sys.implementation.name.title()}" + (" (venv)" if CONSTANTS.IS_INSIDE_VENV else ""),
|
||||
)
|
||||
|
||||
|
||||
try:
|
||||
OUTPUT_IS_REMOTE_FS = get_data_locations().DATA_DIR.is_mount or get_data_locations().ARCHIVE_DIR.is_mount
|
||||
except Exception:
|
||||
OUTPUT_IS_REMOTE_FS = False
|
||||
|
||||
|
||||
try:
|
||||
DATA_DIR_STAT = CONSTANTS.DATA_DIR.stat()
|
||||
prnt(
|
||||
f'EUID={os.geteuid()}:{os.getegid()} UID={RUNNING_AS_UID}:{RUNNING_AS_GID} PUID={ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}',
|
||||
f'FS_UID={DATA_DIR_STAT.st_uid}:{DATA_DIR_STAT.st_gid}',
|
||||
f'FS_PERMS={STORAGE_CONFIG.OUTPUT_PERMISSIONS}',
|
||||
f'FS_ATOMIC={STORAGE_CONFIG.ENFORCE_ATOMIC_WRITES}',
|
||||
f'FS_REMOTE={OUTPUT_IS_REMOTE_FS}',
|
||||
f"EUID={os.geteuid()}:{os.getegid()} UID={RUNNING_AS_UID}:{RUNNING_AS_GID} PUID={ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}",
|
||||
f"FS_UID={DATA_DIR_STAT.st_uid}:{DATA_DIR_STAT.st_gid}",
|
||||
f"FS_PERMS={STORAGE_CONFIG.OUTPUT_PERMISSIONS}",
|
||||
f"FS_ATOMIC={STORAGE_CONFIG.ENFORCE_ATOMIC_WRITES}",
|
||||
f"FS_REMOTE={OUTPUT_IS_REMOTE_FS}",
|
||||
)
|
||||
except Exception:
|
||||
prnt(
|
||||
f'EUID={os.geteuid()}:{os.getegid()} UID={RUNNING_AS_UID}:{RUNNING_AS_GID} PUID={ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}',
|
||||
f"EUID={os.geteuid()}:{os.getegid()} UID={RUNNING_AS_UID}:{RUNNING_AS_GID} PUID={ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}",
|
||||
)
|
||||
|
||||
|
||||
prnt(
|
||||
f'DEBUG={SHELL_CONFIG.DEBUG}',
|
||||
f'IS_TTY={SHELL_CONFIG.IS_TTY}',
|
||||
f'SUDO={CONSTANTS.IS_ROOT}',
|
||||
f'ID={CONSTANTS.MACHINE_ID}:{CONSTANTS.COLLECTION_ID}',
|
||||
f'SEARCH_BACKEND={SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE}',
|
||||
f'LDAP={LDAP_ENABLED}',
|
||||
f"DEBUG={SHELL_CONFIG.DEBUG}",
|
||||
f"IS_TTY={SHELL_CONFIG.IS_TTY}",
|
||||
f"SUDO={CONSTANTS.IS_ROOT}",
|
||||
f"ID={CONSTANTS.MACHINE_ID}:{CONSTANTS.COLLECTION_ID}",
|
||||
f"SEARCH_BACKEND={SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE}",
|
||||
f"LDAP={LDAP_ENABLED}",
|
||||
)
|
||||
prnt()
|
||||
|
||||
|
||||
if not (os.access(CONSTANTS.ARCHIVE_DIR, os.R_OK) and os.access(CONSTANTS.CONFIG_FILE, os.R_OK)):
|
||||
PANEL_TEXT = '\n'.join((
|
||||
'',
|
||||
'[violet]Hint:[/violet] [green]cd[/green] into a collection [blue]DATA_DIR[/blue] and run [green]archivebox version[/green] again...',
|
||||
' [grey53]OR[/grey53] run [green]archivebox init[/green] to create a new collection in the current dir.',
|
||||
'',
|
||||
' [i][grey53](this is [red]REQUIRED[/red] if you are opening a Github Issue to get help)[/grey53][/i]',
|
||||
'',
|
||||
))
|
||||
prnt(Panel(PANEL_TEXT, expand=False, border_style='grey53', title='[red]:exclamation: No collection [blue]DATA_DIR[/blue] is currently active[/red]', subtitle='Full version info is only available when inside a collection [light_slate_blue]DATA DIR[/light_slate_blue]'))
|
||||
PANEL_TEXT = "\n".join(
|
||||
(
|
||||
"",
|
||||
"[violet]Hint:[/violet] [green]cd[/green] into a collection [blue]DATA_DIR[/blue] and run [green]archivebox version[/green] again...",
|
||||
" [grey53]OR[/grey53] run [green]archivebox init[/green] to create a new collection in the current dir.",
|
||||
"",
|
||||
" [i][grey53](this is [red]REQUIRED[/red] if you are opening a Github Issue to get help)[/grey53][/i]",
|
||||
"",
|
||||
),
|
||||
)
|
||||
prnt(
|
||||
Panel(
|
||||
PANEL_TEXT,
|
||||
expand=False,
|
||||
border_style="grey53",
|
||||
title="[red]:exclamation: No collection [blue]DATA_DIR[/blue] is currently active[/red]",
|
||||
subtitle="Full version info is only available when inside a collection [light_slate_blue]DATA DIR[/light_slate_blue]",
|
||||
),
|
||||
)
|
||||
prnt()
|
||||
return []
|
||||
|
||||
prnt('[pale_green1][i] Binary Dependencies:[/pale_green1]')
|
||||
prnt("[pale_green1][i] Binary Dependencies:[/pale_green1]")
|
||||
failures = []
|
||||
|
||||
# Setup Django before importing models
|
||||
try:
|
||||
from archivebox.config.django import setup_django
|
||||
|
||||
setup_django()
|
||||
|
||||
from archivebox.machine.models import Machine, Binary
|
||||
@@ -113,12 +127,17 @@ def version(quiet: bool=False,
|
||||
machine = Machine.current()
|
||||
|
||||
# Get all binaries from the database with timeout protection
|
||||
all_installed = Binary.objects.filter(
|
||||
machine=machine
|
||||
).exclude(abspath='').exclude(abspath__isnull=True).order_by('name')
|
||||
all_installed = (
|
||||
Binary.objects.filter(
|
||||
machine=machine,
|
||||
)
|
||||
.exclude(abspath="")
|
||||
.exclude(abspath__isnull=True)
|
||||
.order_by("name")
|
||||
)
|
||||
|
||||
if not all_installed.exists():
|
||||
prnt('', '[grey53]No binaries detected. Run [green]archivebox install[/green] to detect dependencies.[/grey53]')
|
||||
prnt("", "[grey53]No binaries detected. Run [green]archivebox install[/green] to detect dependencies.[/grey53]")
|
||||
else:
|
||||
for installed in all_installed:
|
||||
# Skip if user specified specific binaries and this isn't one
|
||||
@@ -126,71 +145,91 @@ def version(quiet: bool=False,
|
||||
continue
|
||||
|
||||
if installed.is_valid:
|
||||
display_path = installed.abspath.replace(str(DATA_DIR), '.').replace(str(Path('~').expanduser()), '~')
|
||||
version_str = (installed.version or 'unknown')[:15]
|
||||
provider = (installed.binprovider or 'env')[:8]
|
||||
prnt('', '[green]√[/green]', '', installed.name.ljust(18), version_str.ljust(16), provider.ljust(8), display_path, overflow='ignore', crop=False)
|
||||
display_path = installed.abspath.replace(str(DATA_DIR), ".").replace(str(Path("~").expanduser()), "~")
|
||||
version_str = (installed.version or "unknown")[:15]
|
||||
provider = (installed.binprovider or "env")[:8]
|
||||
prnt(
|
||||
"",
|
||||
"[green]√[/green]",
|
||||
"",
|
||||
installed.name.ljust(18),
|
||||
version_str.ljust(16),
|
||||
provider.ljust(8),
|
||||
display_path,
|
||||
overflow="ignore",
|
||||
crop=False,
|
||||
)
|
||||
else:
|
||||
prnt('', '[red]X[/red]', '', installed.name.ljust(18), '[grey53]not installed[/grey53]', overflow='ignore', crop=False)
|
||||
prnt("", "[red]X[/red]", "", installed.name.ljust(18), "[grey53]not installed[/grey53]", overflow="ignore", crop=False)
|
||||
failures.append(installed.name)
|
||||
|
||||
# Show hint if no binaries are installed yet
|
||||
has_any_installed = Binary.objects.filter(machine=machine).exclude(abspath='').exists()
|
||||
has_any_installed = Binary.objects.filter(machine=machine).exclude(abspath="").exists()
|
||||
if not has_any_installed:
|
||||
prnt()
|
||||
prnt('', '[grey53]Run [green]archivebox install[/green] to detect and install dependencies.[/grey53]')
|
||||
prnt("", "[grey53]Run [green]archivebox install[/green] to detect and install dependencies.[/grey53]")
|
||||
|
||||
except Exception as e:
|
||||
# Handle database errors gracefully (locked, missing, etc.)
|
||||
prnt()
|
||||
prnt('', f'[yellow]Warning: Could not query binaries from database: {e}[/yellow]')
|
||||
prnt('', '[grey53]Run [green]archivebox init[/green] and [green]archivebox install[/green] to set up dependencies.[/grey53]')
|
||||
prnt("", f"[yellow]Warning: Could not query binaries from database: {e}[/yellow]")
|
||||
prnt("", "[grey53]Run [green]archivebox init[/green] and [green]archivebox install[/green] to set up dependencies.[/grey53]")
|
||||
|
||||
if not binaries:
|
||||
# Show code and data locations
|
||||
prnt()
|
||||
prnt('[deep_sky_blue3][i] Code locations:[/deep_sky_blue3]')
|
||||
prnt("[deep_sky_blue3][i] Code locations:[/deep_sky_blue3]")
|
||||
try:
|
||||
for name, path in get_code_locations().items():
|
||||
if isinstance(name, str) and isinstance(path, dict):
|
||||
prnt(printable_folder_status(name, path), overflow='ignore', crop=False)
|
||||
prnt(printable_folder_status(name, path), overflow="ignore", crop=False)
|
||||
except Exception as e:
|
||||
prnt(f' [red]Error getting code locations: {e}[/red]')
|
||||
prnt(f" [red]Error getting code locations: {e}[/red]")
|
||||
|
||||
prnt()
|
||||
if os.access(CONSTANTS.ARCHIVE_DIR, os.R_OK) or os.access(CONSTANTS.CONFIG_FILE, os.R_OK):
|
||||
prnt('[bright_yellow][i] Data locations:[/bright_yellow]')
|
||||
prnt("[bright_yellow][i] Data locations:[/bright_yellow]")
|
||||
try:
|
||||
for name, path in get_data_locations().items():
|
||||
if isinstance(name, str) and isinstance(path, dict):
|
||||
prnt(printable_folder_status(name, path), overflow='ignore', crop=False)
|
||||
prnt(printable_folder_status(name, path), overflow="ignore", crop=False)
|
||||
except Exception as e:
|
||||
prnt(f' [red]Error getting data locations: {e}[/red]')
|
||||
|
||||
prnt(f" [red]Error getting data locations: {e}[/red]")
|
||||
|
||||
try:
|
||||
from archivebox.misc.checks import check_data_dir_permissions
|
||||
|
||||
check_data_dir_permissions()
|
||||
except Exception:
|
||||
pass
|
||||
else:
|
||||
prnt()
|
||||
prnt('[red][i] Data locations:[/red] (not in a data directory)')
|
||||
|
||||
prnt("[red][i] Data locations:[/red] (not in a data directory)")
|
||||
|
||||
prnt()
|
||||
|
||||
|
||||
if failures:
|
||||
prnt('[red]Error:[/red] [yellow]Failed to detect the following binaries:[/yellow]')
|
||||
prnt(f' [red]{", ".join(failures)}[/red]')
|
||||
prnt("[red]Error:[/red] [yellow]Failed to detect the following binaries:[/yellow]")
|
||||
prnt(f" [red]{', '.join(failures)}[/red]")
|
||||
prnt()
|
||||
prnt('[violet]Hint:[/violet] To install missing binaries automatically, run:')
|
||||
prnt(' [green]archivebox install[/green]')
|
||||
prnt("[violet]Hint:[/violet] To install missing binaries automatically, run:")
|
||||
prnt(" [green]archivebox install[/green]")
|
||||
prnt()
|
||||
return failures
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option('--quiet', '-q', is_flag=True, help='Only print ArchiveBox version number and nothing else. (equivalent to archivebox --version)')
|
||||
@click.option('--binaries', '-b', help='Select binaries to detect DEFAULT=curl,wget,git,yt-dlp,chrome,single-file,readability-extractor,postlight-parser,... (all)')
|
||||
@click.option(
|
||||
"--quiet",
|
||||
"-q",
|
||||
is_flag=True,
|
||||
help="Only print ArchiveBox version number and nothing else. (equivalent to archivebox --version)",
|
||||
)
|
||||
@click.option(
|
||||
"--binaries",
|
||||
"-b",
|
||||
help="Select binaries to detect DEFAULT=curl,wget,git,yt-dlp,chrome,single-file,readability-extractor,postlight-parser,... (all)",
|
||||
)
|
||||
@docstring(version.__doc__)
|
||||
def main(**kwargs):
|
||||
failures = version(**kwargs)
|
||||
@@ -198,5 +237,5 @@ def main(**kwargs):
|
||||
raise SystemExit(1)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -5,12 +5,10 @@ This module contains common utilities used across multiple CLI commands,
|
||||
extracted to avoid code duplication.
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
|
||||
from typing import Optional
|
||||
__package__ = "archivebox.cli"
|
||||
|
||||
|
||||
def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None):
|
||||
def apply_filters(queryset, filter_kwargs: dict, limit: int | None = None):
|
||||
"""
|
||||
Apply Django-style filters from CLI kwargs to a QuerySet.
|
||||
|
||||
@@ -31,11 +29,11 @@ def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None):
|
||||
"""
|
||||
filters = {}
|
||||
for key, value in filter_kwargs.items():
|
||||
if value is None or key in ('limit', 'offset'):
|
||||
if value is None or key in ("limit", "offset"):
|
||||
continue
|
||||
# Handle CSV lists for __in filters
|
||||
if key.endswith('__in') and isinstance(value, str):
|
||||
value = [v.strip() for v in value.split(',')]
|
||||
if key.endswith("__in") and isinstance(value, str):
|
||||
value = [v.strip() for v in value.split(",")]
|
||||
filters[key] = value
|
||||
|
||||
if filters:
|
||||
|
||||
@@ -5,16 +5,16 @@ This module provides backwards-compatible config exports for extractors
|
||||
and other modules that expect to import config values directly.
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.config'
|
||||
__package__ = "archivebox.config"
|
||||
__order__ = 200
|
||||
|
||||
from .paths import (
|
||||
PACKAGE_DIR, # noqa
|
||||
DATA_DIR, # noqa
|
||||
ARCHIVE_DIR, # noqa
|
||||
PACKAGE_DIR,
|
||||
DATA_DIR,
|
||||
ARCHIVE_DIR,
|
||||
)
|
||||
from .constants import CONSTANTS, CONSTANTS_CONFIG, PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR # noqa
|
||||
from .version import VERSION # noqa
|
||||
from .constants import CONSTANTS, CONSTANTS_CONFIG, PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR # noqa
|
||||
from .version import VERSION # noqa
|
||||
|
||||
|
||||
###############################################################################
|
||||
@@ -22,15 +22,18 @@ from .version import VERSION # noqa
|
||||
# These provide backwards compatibility with extractors that import from ..config
|
||||
###############################################################################
|
||||
|
||||
|
||||
def _get_config():
|
||||
"""Lazy import to avoid circular imports."""
|
||||
from .common import ARCHIVING_CONFIG, STORAGE_CONFIG
|
||||
|
||||
return ARCHIVING_CONFIG, STORAGE_CONFIG
|
||||
|
||||
|
||||
# Direct exports (evaluated at import time for backwards compat)
|
||||
# These are recalculated each time the module attribute is accessed
|
||||
|
||||
|
||||
def __getattr__(name: str):
|
||||
"""
|
||||
Module-level __getattr__ for lazy config loading.
|
||||
@@ -40,38 +43,38 @@ def __getattr__(name: str):
|
||||
"""
|
||||
|
||||
# Generic timeout settings (used by multiple plugins)
|
||||
if name == 'TIMEOUT':
|
||||
if name == "TIMEOUT":
|
||||
cfg, _ = _get_config()
|
||||
return cfg.TIMEOUT
|
||||
|
||||
# Generic SSL/Security settings (used by multiple plugins)
|
||||
if name == 'CHECK_SSL_VALIDITY':
|
||||
if name == "CHECK_SSL_VALIDITY":
|
||||
cfg, _ = _get_config()
|
||||
return cfg.CHECK_SSL_VALIDITY
|
||||
|
||||
# Generic storage settings (used by multiple plugins)
|
||||
if name == 'RESTRICT_FILE_NAMES':
|
||||
if name == "RESTRICT_FILE_NAMES":
|
||||
_, storage = _get_config()
|
||||
return storage.RESTRICT_FILE_NAMES
|
||||
|
||||
# Generic user agent / cookies (used by multiple plugins)
|
||||
if name == 'COOKIES_FILE':
|
||||
if name == "COOKIES_FILE":
|
||||
cfg, _ = _get_config()
|
||||
return cfg.COOKIES_FILE
|
||||
if name == 'USER_AGENT':
|
||||
if name == "USER_AGENT":
|
||||
cfg, _ = _get_config()
|
||||
return cfg.USER_AGENT
|
||||
|
||||
# Generic resolution settings (used by multiple plugins)
|
||||
if name == 'RESOLUTION':
|
||||
if name == "RESOLUTION":
|
||||
cfg, _ = _get_config()
|
||||
return cfg.RESOLUTION
|
||||
|
||||
# Allowlist/Denylist patterns (compiled regexes)
|
||||
if name == 'SAVE_ALLOWLIST_PTN':
|
||||
if name == "SAVE_ALLOWLIST_PTN":
|
||||
cfg, _ = _get_config()
|
||||
return cfg.SAVE_ALLOWLIST_PTNS
|
||||
if name == 'SAVE_DENYLIST_PTN':
|
||||
if name == "SAVE_DENYLIST_PTN":
|
||||
cfg, _ = _get_config()
|
||||
return cfg.SAVE_DENYLIST_PTNS
|
||||
|
||||
@@ -90,12 +93,13 @@ def get_CONFIG():
|
||||
SEARCH_BACKEND_CONFIG,
|
||||
)
|
||||
from .ldap import LDAP_CONFIG
|
||||
|
||||
return {
|
||||
'SHELL_CONFIG': SHELL_CONFIG,
|
||||
'STORAGE_CONFIG': STORAGE_CONFIG,
|
||||
'GENERAL_CONFIG': GENERAL_CONFIG,
|
||||
'SERVER_CONFIG': SERVER_CONFIG,
|
||||
'ARCHIVING_CONFIG': ARCHIVING_CONFIG,
|
||||
'SEARCHBACKEND_CONFIG': SEARCH_BACKEND_CONFIG,
|
||||
'LDAP_CONFIG': LDAP_CONFIG,
|
||||
"SHELL_CONFIG": SHELL_CONFIG,
|
||||
"STORAGE_CONFIG": STORAGE_CONFIG,
|
||||
"GENERAL_CONFIG": GENERAL_CONFIG,
|
||||
"SERVER_CONFIG": SERVER_CONFIG,
|
||||
"ARCHIVING_CONFIG": ARCHIVING_CONFIG,
|
||||
"SEARCHBACKEND_CONFIG": SEARCH_BACKEND_CONFIG,
|
||||
"LDAP_CONFIG": LDAP_CONFIG,
|
||||
}
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
__package__ = 'archivebox.config'
|
||||
__package__ = "archivebox.config"
|
||||
|
||||
import os
|
||||
import json
|
||||
from typing import Any, Optional, Type, Tuple, Dict
|
||||
from typing import Any
|
||||
|
||||
from pathlib import Path
|
||||
from configparser import ConfigParser
|
||||
@@ -27,13 +27,15 @@ def get_real_name(key: str) -> str:
|
||||
return key
|
||||
|
||||
|
||||
def load_config_val(key: str,
|
||||
default: Any=None,
|
||||
type: Optional[Type]=None,
|
||||
aliases: Optional[Tuple[str, ...]]=None,
|
||||
config: Optional[benedict]=None,
|
||||
env_vars: Optional[os._Environ]=None,
|
||||
config_file_vars: Optional[Dict[str, str]]=None) -> Any:
|
||||
def load_config_val(
|
||||
key: str,
|
||||
default: Any = None,
|
||||
type: type | None = None,
|
||||
aliases: tuple[str, ...] | None = None,
|
||||
config: benedict | None = None,
|
||||
env_vars: os._Environ | None = None,
|
||||
config_file_vars: dict[str, str] | None = None,
|
||||
) -> Any:
|
||||
"""parse bool, int, and str key=value pairs from env"""
|
||||
|
||||
assert isinstance(config, dict)
|
||||
@@ -67,8 +69,8 @@ def load_config_val(key: str,
|
||||
assert isinstance(val, str)
|
||||
|
||||
# calculate value based on expected type
|
||||
BOOL_TRUEIES = ('true', 'yes', '1')
|
||||
BOOL_FALSEIES = ('false', 'no', '0')
|
||||
BOOL_TRUEIES = ("true", "yes", "1")
|
||||
BOOL_FALSEIES = ("false", "no", "0")
|
||||
|
||||
if type is bool:
|
||||
if val.lower() in BOOL_TRUEIES:
|
||||
@@ -76,28 +78,28 @@ def load_config_val(key: str,
|
||||
elif val.lower() in BOOL_FALSEIES:
|
||||
return False
|
||||
else:
|
||||
raise ValueError(f'Invalid configuration option {key}={val} (expected a boolean: True/False)')
|
||||
raise ValueError(f"Invalid configuration option {key}={val} (expected a boolean: True/False)")
|
||||
|
||||
elif type is str:
|
||||
if val.lower() in (*BOOL_TRUEIES, *BOOL_FALSEIES):
|
||||
raise ValueError(f'Invalid configuration option {key}={val} (expected a string, but value looks like a boolean)')
|
||||
raise ValueError(f"Invalid configuration option {key}={val} (expected a string, but value looks like a boolean)")
|
||||
return val.strip()
|
||||
|
||||
elif type is int:
|
||||
if not val.strip().isdigit():
|
||||
raise ValueError(f'Invalid configuration option {key}={val} (expected an integer)')
|
||||
raise ValueError(f"Invalid configuration option {key}={val} (expected an integer)")
|
||||
return int(val.strip())
|
||||
|
||||
elif type is list or type is dict:
|
||||
return json.loads(val)
|
||||
|
||||
|
||||
elif type is Path:
|
||||
return Path(val)
|
||||
|
||||
raise Exception('Config values can only be str, bool, int, or json')
|
||||
raise Exception("Config values can only be str, bool, int, or json")
|
||||
|
||||
|
||||
def load_config_file() -> Optional[benedict]:
|
||||
def load_config_file() -> benedict | None:
|
||||
"""load the ini-formatted config file from DATA_DIR/Archivebox.conf"""
|
||||
|
||||
config_path = CONSTANTS.CONFIG_FILE
|
||||
@@ -105,17 +107,16 @@ def load_config_file() -> Optional[benedict]:
|
||||
config_file = CaseConfigParser()
|
||||
config_file.read(config_path)
|
||||
# flatten into one namespace
|
||||
config_file_vars = benedict({
|
||||
key.upper(): val
|
||||
for section, options in config_file.items()
|
||||
for key, val in options.items()
|
||||
})
|
||||
config_file_vars = benedict({key.upper(): val for section, options in config_file.items() for key, val in options.items()})
|
||||
# print('[i] Loaded config file', os.path.abspath(config_path))
|
||||
# print(config_file_vars)
|
||||
return config_file_vars
|
||||
return None
|
||||
|
||||
|
||||
class PluginConfigSection:
|
||||
"""Pseudo-section for all plugin config keys written to [PLUGINS] section in ArchiveBox.conf"""
|
||||
|
||||
toml_section_header = "PLUGINS"
|
||||
|
||||
def __init__(self, key: str):
|
||||
@@ -144,8 +145,14 @@ def section_for_key(key: str) -> Any:
|
||||
)
|
||||
|
||||
# First check core config sections
|
||||
for section in [SHELL_CONFIG, STORAGE_CONFIG, GENERAL_CONFIG,
|
||||
SERVER_CONFIG, ARCHIVING_CONFIG, SEARCH_BACKEND_CONFIG]:
|
||||
for section in [
|
||||
SHELL_CONFIG,
|
||||
STORAGE_CONFIG,
|
||||
GENERAL_CONFIG,
|
||||
SERVER_CONFIG,
|
||||
ARCHIVING_CONFIG,
|
||||
SEARCH_BACKEND_CONFIG,
|
||||
]:
|
||||
if hasattr(section, key):
|
||||
return section
|
||||
|
||||
@@ -154,20 +161,19 @@ def section_for_key(key: str) -> Any:
|
||||
|
||||
plugin_configs = discover_plugin_configs()
|
||||
for plugin_name, schema in plugin_configs.items():
|
||||
if 'properties' in schema and key in schema['properties']:
|
||||
if "properties" in schema and key in schema["properties"]:
|
||||
# All plugin config goes to [PLUGINS] section
|
||||
return PluginConfigSection(key)
|
||||
|
||||
raise ValueError(f'No config section found for key: {key}')
|
||||
raise ValueError(f"No config section found for key: {key}")
|
||||
|
||||
|
||||
def write_config_file(config: Dict[str, str]) -> benedict:
|
||||
def write_config_file(config: dict[str, str]) -> benedict:
|
||||
"""load the ini-formatted config file from DATA_DIR/Archivebox.conf"""
|
||||
|
||||
from archivebox.misc.system import atomic_write
|
||||
|
||||
CONFIG_HEADER = (
|
||||
"""# This is the config file for your ArchiveBox collection.
|
||||
CONFIG_HEADER = """# This is the config file for your ArchiveBox collection.
|
||||
#
|
||||
# You can add options here manually in INI format, or automatically by running:
|
||||
# archivebox config --set KEY=VALUE
|
||||
@@ -178,7 +184,7 @@ def write_config_file(config: Dict[str, str]) -> benedict:
|
||||
# A list of all possible config with documentation and examples can be found here:
|
||||
# https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration
|
||||
|
||||
""")
|
||||
"""
|
||||
|
||||
config_path = CONSTANTS.CONFIG_FILE
|
||||
|
||||
@@ -188,57 +194,56 @@ def write_config_file(config: Dict[str, str]) -> benedict:
|
||||
config_file = CaseConfigParser()
|
||||
config_file.read(config_path)
|
||||
|
||||
with open(config_path, 'r', encoding='utf-8') as old:
|
||||
atomic_write(f'{config_path}.bak', old.read())
|
||||
with open(config_path, encoding="utf-8") as old:
|
||||
atomic_write(f"{config_path}.bak", old.read())
|
||||
|
||||
# Set up sections in empty config file
|
||||
for key, val in config.items():
|
||||
section = section_for_key(key)
|
||||
assert section is not None
|
||||
|
||||
if not hasattr(section, 'toml_section_header'):
|
||||
raise ValueError(f'{key} is read-only (defined in {type(section).__module__}.{type(section).__name__}). Refusing to set.')
|
||||
|
||||
|
||||
if not hasattr(section, "toml_section_header"):
|
||||
raise ValueError(f"{key} is read-only (defined in {type(section).__module__}.{type(section).__name__}). Refusing to set.")
|
||||
|
||||
section_name = section.toml_section_header
|
||||
|
||||
|
||||
if section_name in config_file:
|
||||
existing_config = dict(config_file[section_name])
|
||||
else:
|
||||
existing_config = {}
|
||||
|
||||
|
||||
config_file[section_name] = benedict({**existing_config, key: val})
|
||||
section.update_in_place(warn=False, persist=False, **{key: val})
|
||||
|
||||
with open(config_path, 'w+', encoding='utf-8') as new:
|
||||
with open(config_path, "w+", encoding="utf-8") as new:
|
||||
config_file.write(new)
|
||||
|
||||
updated_config = {}
|
||||
try:
|
||||
# validate the updated_config by attempting to re-parse it
|
||||
from archivebox.config.configset import get_flat_config
|
||||
|
||||
updated_config = {**load_all_config(), **get_flat_config()}
|
||||
except BaseException: # lgtm [py/catch-base-exception]
|
||||
except BaseException: # lgtm [py/catch-base-exception]
|
||||
# something went horribly wrong, revert to the previous version
|
||||
with open(f'{config_path}.bak', 'r', encoding='utf-8') as old:
|
||||
with open(f"{config_path}.bak", encoding="utf-8") as old:
|
||||
atomic_write(config_path, old.read())
|
||||
|
||||
raise
|
||||
|
||||
if os.access(f'{config_path}.bak', os.F_OK):
|
||||
os.remove(f'{config_path}.bak')
|
||||
if os.access(f"{config_path}.bak", os.F_OK):
|
||||
os.remove(f"{config_path}.bak")
|
||||
|
||||
return benedict({
|
||||
key.upper(): updated_config.get(key.upper())
|
||||
for key in config.keys()
|
||||
})
|
||||
return benedict({key.upper(): updated_config.get(key.upper()) for key in config.keys()})
|
||||
|
||||
|
||||
|
||||
def load_config(defaults: Dict[str, Any],
|
||||
config: Optional[benedict]=None,
|
||||
out_dir: Optional[str]=None,
|
||||
env_vars: Optional[os._Environ]=None,
|
||||
config_file_vars: Optional[Dict[str, str]]=None) -> benedict:
|
||||
def load_config(
|
||||
defaults: dict[str, Any],
|
||||
config: benedict | None = None,
|
||||
out_dir: str | None = None,
|
||||
env_vars: os._Environ | None = None,
|
||||
config_file_vars: dict[str, str] | None = None,
|
||||
) -> benedict:
|
||||
|
||||
env_vars = env_vars or os.environ
|
||||
config_file_vars = config_file_vars or load_config_file()
|
||||
@@ -249,9 +254,9 @@ def load_config(defaults: Dict[str, Any],
|
||||
# print('LOADING CONFIG KEY:', key, 'DEFAULT=', default)
|
||||
extended_config[key] = load_config_val(
|
||||
key,
|
||||
default=default['default'],
|
||||
type=default.get('type'),
|
||||
aliases=default.get('aliases'),
|
||||
default=default["default"],
|
||||
type=default.get("type"),
|
||||
aliases=default.get("aliases"),
|
||||
config=extended_config,
|
||||
env_vars=env_vars,
|
||||
config_file_vars=config_file_vars,
|
||||
@@ -260,19 +265,20 @@ def load_config(defaults: Dict[str, Any],
|
||||
raise SystemExit(0)
|
||||
except Exception as e:
|
||||
stderr()
|
||||
stderr(f'[X] Error while loading configuration value: {key}', color='red', config=extended_config)
|
||||
stderr(' {}: {}'.format(e.__class__.__name__, e))
|
||||
stderr(f"[X] Error while loading configuration value: {key}", color="red", config=extended_config)
|
||||
stderr(f" {e.__class__.__name__}: {e}")
|
||||
stderr()
|
||||
stderr(' Check your config for mistakes and try again (your archive data is unaffected).')
|
||||
stderr(" Check your config for mistakes and try again (your archive data is unaffected).")
|
||||
stderr()
|
||||
stderr(' For config documentation and examples see:')
|
||||
stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration')
|
||||
stderr(" For config documentation and examples see:")
|
||||
stderr(" https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration")
|
||||
stderr()
|
||||
# raise
|
||||
# raise SystemExit(2)
|
||||
|
||||
return benedict(extended_config)
|
||||
|
||||
|
||||
def load_all_config():
|
||||
"""Load all config sections and return as a flat dict."""
|
||||
from archivebox.config.common import (
|
||||
@@ -283,11 +289,17 @@ def load_all_config():
|
||||
ARCHIVING_CONFIG,
|
||||
SEARCH_BACKEND_CONFIG,
|
||||
)
|
||||
|
||||
|
||||
flat_config = benedict()
|
||||
|
||||
for config_section in [SHELL_CONFIG, STORAGE_CONFIG, GENERAL_CONFIG,
|
||||
SERVER_CONFIG, ARCHIVING_CONFIG, SEARCH_BACKEND_CONFIG]:
|
||||
|
||||
for config_section in [
|
||||
SHELL_CONFIG,
|
||||
STORAGE_CONFIG,
|
||||
GENERAL_CONFIG,
|
||||
SERVER_CONFIG,
|
||||
ARCHIVING_CONFIG,
|
||||
SEARCH_BACKEND_CONFIG,
|
||||
]:
|
||||
flat_config.update(dict(config_section))
|
||||
|
||||
|
||||
return flat_config
|
||||
|
||||
@@ -4,7 +4,7 @@ import re
|
||||
import secrets
|
||||
import sys
|
||||
import shutil
|
||||
from typing import ClassVar, Dict, Optional, List
|
||||
from typing import ClassVar
|
||||
from pathlib import Path
|
||||
|
||||
from rich.console import Console
|
||||
@@ -39,8 +39,8 @@ class ShellConfig(BaseConfigSet):
|
||||
IN_DOCKER: bool = Field(default=IN_DOCKER)
|
||||
IN_QEMU: bool = Field(default=False)
|
||||
|
||||
ANSI: Dict[str, str] = Field(
|
||||
default_factory=lambda: CONSTANTS.DEFAULT_CLI_COLORS if sys.stdout.isatty() else CONSTANTS.DISABLED_CLI_COLORS
|
||||
ANSI: dict[str, str] = Field(
|
||||
default_factory=lambda: CONSTANTS.DEFAULT_CLI_COLORS if sys.stdout.isatty() else CONSTANTS.DISABLED_CLI_COLORS,
|
||||
)
|
||||
|
||||
@property
|
||||
@@ -50,7 +50,7 @@ class ShellConfig(BaseConfigSet):
|
||||
return shutil.get_terminal_size((140, 10)).columns
|
||||
|
||||
@property
|
||||
def COMMIT_HASH(self) -> Optional[str]:
|
||||
def COMMIT_HASH(self) -> str | None:
|
||||
return get_COMMIT_HASH()
|
||||
|
||||
@property
|
||||
@@ -112,7 +112,7 @@ class ServerConfig(BaseConfigSet):
|
||||
"danger-onedomain-fullreplay",
|
||||
)
|
||||
|
||||
SECRET_KEY: str = Field(default_factory=lambda: ''.join(secrets.choice("abcdefghijklmnopqrstuvwxyz0123456789_") for _ in range(50)))
|
||||
SECRET_KEY: str = Field(default_factory=lambda: "".join(secrets.choice("abcdefghijklmnopqrstuvwxyz0123456789_") for _ in range(50)))
|
||||
BIND_ADDR: str = Field(default="127.0.0.1:8000")
|
||||
LISTEN_HOST: str = Field(default="archivebox.localhost:8000")
|
||||
ADMIN_BASE_URL: str = Field(default="")
|
||||
@@ -124,7 +124,7 @@ class ServerConfig(BaseConfigSet):
|
||||
SNAPSHOTS_PER_PAGE: int = Field(default=40)
|
||||
PREVIEW_ORIGINALS: bool = Field(default=True)
|
||||
FOOTER_INFO: str = Field(
|
||||
default="Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests."
|
||||
default="Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.",
|
||||
)
|
||||
# CUSTOM_TEMPLATES_DIR: Path = Field(default=None) # this is now a constant
|
||||
|
||||
@@ -132,8 +132,8 @@ class ServerConfig(BaseConfigSet):
|
||||
PUBLIC_SNAPSHOTS: bool = Field(default=True)
|
||||
PUBLIC_ADD_VIEW: bool = Field(default=False)
|
||||
|
||||
ADMIN_USERNAME: Optional[str] = Field(default=None)
|
||||
ADMIN_PASSWORD: Optional[str] = Field(default=None)
|
||||
ADMIN_USERNAME: str | None = Field(default=None)
|
||||
ADMIN_PASSWORD: str | None = Field(default=None)
|
||||
|
||||
REVERSE_PROXY_USER_HEADER: str = Field(default="Remote-User")
|
||||
REVERSE_PROXY_WHITELIST: str = Field(default="")
|
||||
@@ -234,22 +234,22 @@ class ArchivingConfig(BaseConfigSet):
|
||||
RESOLUTION: str = Field(default="1440,2000")
|
||||
CHECK_SSL_VALIDITY: bool = Field(default=True)
|
||||
USER_AGENT: str = Field(
|
||||
default=f"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)"
|
||||
default=f"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)",
|
||||
)
|
||||
COOKIES_FILE: Path | None = Field(default=None)
|
||||
|
||||
URL_DENYLIST: str = Field(default=r"\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$", alias="URL_BLACKLIST")
|
||||
URL_ALLOWLIST: str | None = Field(default=None, alias="URL_WHITELIST")
|
||||
|
||||
SAVE_ALLOWLIST: Dict[str, List[str]] = Field(default={}) # mapping of regex patterns to list of archive methods
|
||||
SAVE_DENYLIST: Dict[str, List[str]] = Field(default={})
|
||||
SAVE_ALLOWLIST: dict[str, list[str]] = Field(default={}) # mapping of regex patterns to list of archive methods
|
||||
SAVE_DENYLIST: dict[str, list[str]] = Field(default={})
|
||||
|
||||
DEFAULT_PERSONA: str = Field(default="Default")
|
||||
|
||||
def warn_if_invalid(self) -> None:
|
||||
if int(self.TIMEOUT) < 5:
|
||||
rprint(f"[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.TIMEOUT} seconds)[/red]", file=sys.stderr)
|
||||
rprint(" You must allow *at least* 5 seconds for indexing and archive methods to run succesfully.", file=sys.stderr)
|
||||
rprint(" You must allow *at least* 5 seconds for indexing and archive methods to run successfully.", file=sys.stderr)
|
||||
rprint(" (Setting it to somewhere between 30 and 3000 seconds is recommended)", file=sys.stderr)
|
||||
rprint(file=sys.stderr)
|
||||
rprint(" If you want to make ArchiveBox run faster, disable specific archive methods instead:", file=sys.stderr)
|
||||
@@ -274,7 +274,7 @@ class ArchivingConfig(BaseConfigSet):
|
||||
return re.compile(self.URL_DENYLIST, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS)
|
||||
|
||||
@property
|
||||
def SAVE_ALLOWLIST_PTNS(self) -> Dict[re.Pattern, List[str]]:
|
||||
def SAVE_ALLOWLIST_PTNS(self) -> dict[re.Pattern, list[str]]:
|
||||
return (
|
||||
{
|
||||
# regexp: methods list
|
||||
@@ -286,7 +286,7 @@ class ArchivingConfig(BaseConfigSet):
|
||||
)
|
||||
|
||||
@property
|
||||
def SAVE_DENYLIST_PTNS(self) -> Dict[re.Pattern, List[str]]:
|
||||
def SAVE_DENYLIST_PTNS(self) -> dict[re.Pattern, list[str]]:
|
||||
return (
|
||||
{
|
||||
# regexp: methods list
|
||||
|
||||
@@ -11,7 +11,7 @@ __package__ = "archivebox.config"
|
||||
import os
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Optional, Type, Tuple
|
||||
from typing import Any
|
||||
from configparser import ConfigParser
|
||||
|
||||
from pydantic_settings import BaseSettings, PydanticBaseSettingsSource, SettingsConfigDict
|
||||
@@ -28,17 +28,18 @@ class IniConfigSettingsSource(PydanticBaseSettingsSource):
|
||||
Flattens all sections into a single namespace.
|
||||
"""
|
||||
|
||||
def get_field_value(self, field: Any, field_name: str) -> Tuple[Any, str, bool]:
|
||||
def get_field_value(self, field: Any, field_name: str) -> tuple[Any, str, bool]:
|
||||
config_vals = self._load_config_file()
|
||||
field_value = config_vals.get(field_name.upper())
|
||||
return field_value, field_name, False
|
||||
|
||||
def __call__(self) -> Dict[str, Any]:
|
||||
def __call__(self) -> dict[str, Any]:
|
||||
return self._load_config_file()
|
||||
|
||||
def _load_config_file(self) -> Dict[str, Any]:
|
||||
def _load_config_file(self) -> dict[str, Any]:
|
||||
try:
|
||||
from archivebox.config.constants import CONSTANTS
|
||||
|
||||
config_path = CONSTANTS.CONFIG_FILE
|
||||
except ImportError:
|
||||
return {}
|
||||
@@ -78,25 +79,25 @@ class BaseConfigSet(BaseSettings):
|
||||
@classmethod
|
||||
def settings_customise_sources(
|
||||
cls,
|
||||
settings_cls: Type[BaseSettings],
|
||||
settings_cls: type[BaseSettings],
|
||||
init_settings: PydanticBaseSettingsSource,
|
||||
env_settings: PydanticBaseSettingsSource,
|
||||
dotenv_settings: PydanticBaseSettingsSource,
|
||||
file_secret_settings: PydanticBaseSettingsSource,
|
||||
) -> Tuple[PydanticBaseSettingsSource, ...]:
|
||||
) -> tuple[PydanticBaseSettingsSource, ...]:
|
||||
"""
|
||||
Define the order of settings sources (first = highest priority).
|
||||
"""
|
||||
return (
|
||||
init_settings, # 1. Passed to __init__
|
||||
env_settings, # 2. Environment variables
|
||||
init_settings, # 1. Passed to __init__
|
||||
env_settings, # 2. Environment variables
|
||||
IniConfigSettingsSource(settings_cls), # 3. ArchiveBox.conf file
|
||||
# dotenv_settings, # Skip .env files
|
||||
# file_secret_settings, # Skip secrets files
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def load_from_file(cls, config_path: Path) -> Dict[str, str]:
|
||||
def load_from_file(cls, config_path: Path) -> dict[str, str]:
|
||||
"""Load config values from INI file."""
|
||||
if not config_path.exists():
|
||||
return {}
|
||||
@@ -120,14 +121,14 @@ class BaseConfigSet(BaseSettings):
|
||||
|
||||
|
||||
def get_config(
|
||||
defaults: Optional[Dict] = None,
|
||||
defaults: dict | None = None,
|
||||
persona: Any = None,
|
||||
user: Any = None,
|
||||
crawl: Any = None,
|
||||
snapshot: Any = None,
|
||||
archiveresult: Any = None,
|
||||
machine: Any = None,
|
||||
) -> Dict[str, Any]:
|
||||
) -> dict[str, Any]:
|
||||
"""
|
||||
Get merged config from all sources.
|
||||
|
||||
@@ -176,7 +177,7 @@ def get_config(
|
||||
if persona_id:
|
||||
persona = Persona.objects.filter(id=persona_id).first()
|
||||
if persona is None:
|
||||
raise Persona.DoesNotExist(f'Crawl {getattr(crawl, "id", None)} references missing Persona {persona_id}')
|
||||
raise Persona.DoesNotExist(f"Crawl {getattr(crawl, 'id', None)} references missing Persona {persona_id}")
|
||||
|
||||
if persona is None:
|
||||
crawl_config = getattr(crawl, "config", None) or {}
|
||||
@@ -200,6 +201,7 @@ def get_config(
|
||||
# Add plugin config defaults from JSONSchema config.json files
|
||||
try:
|
||||
from archivebox.hooks import get_config_defaults_from_plugins
|
||||
|
||||
plugin_defaults = get_config_defaults_from_plugins()
|
||||
config.update(plugin_defaults)
|
||||
except ImportError:
|
||||
@@ -224,6 +226,7 @@ def get_config(
|
||||
# Default to current machine if not provided
|
||||
try:
|
||||
from archivebox.machine.models import Machine
|
||||
|
||||
machine = Machine.current()
|
||||
except Exception:
|
||||
pass # Machine might not be available during early init
|
||||
@@ -246,16 +249,17 @@ def get_config(
|
||||
# Also check plugin config aliases in environment
|
||||
try:
|
||||
from archivebox.hooks import discover_plugin_configs
|
||||
|
||||
plugin_configs = discover_plugin_configs()
|
||||
for plugin_name, schema in plugin_configs.items():
|
||||
for key, prop_schema in schema.get('properties', {}).items():
|
||||
for key, prop_schema in schema.get("properties", {}).items():
|
||||
# Check x-aliases
|
||||
for alias in prop_schema.get('x-aliases', []):
|
||||
for alias in prop_schema.get("x-aliases", []):
|
||||
if alias in os.environ and key not in os.environ:
|
||||
config[key] = _parse_env_value(os.environ[alias], config.get(key))
|
||||
break
|
||||
# Check x-fallback
|
||||
fallback = prop_schema.get('x-fallback')
|
||||
fallback = prop_schema.get("x-fallback")
|
||||
if fallback and fallback in config and key not in config:
|
||||
config[key] = config[fallback]
|
||||
except ImportError:
|
||||
@@ -275,33 +279,34 @@ def get_config(
|
||||
|
||||
# Add crawl path aliases for hooks that need shared crawl state.
|
||||
if crawl and hasattr(crawl, "output_dir"):
|
||||
config['CRAWL_OUTPUT_DIR'] = str(crawl.output_dir)
|
||||
config['CRAWL_DIR'] = str(crawl.output_dir)
|
||||
config['CRAWL_ID'] = str(getattr(crawl, "id", "")) if getattr(crawl, "id", None) else config.get('CRAWL_ID')
|
||||
config["CRAWL_OUTPUT_DIR"] = str(crawl.output_dir)
|
||||
config["CRAWL_DIR"] = str(crawl.output_dir)
|
||||
config["CRAWL_ID"] = str(getattr(crawl, "id", "")) if getattr(crawl, "id", None) else config.get("CRAWL_ID")
|
||||
|
||||
# Apply snapshot config overrides (highest priority)
|
||||
if snapshot and hasattr(snapshot, "config") and snapshot.config:
|
||||
config.update(snapshot.config)
|
||||
|
||||
if snapshot:
|
||||
config['SNAPSHOT_ID'] = str(getattr(snapshot, "id", "")) if getattr(snapshot, "id", None) else config.get('SNAPSHOT_ID')
|
||||
config['SNAPSHOT_DEPTH'] = int(getattr(snapshot, "depth", 0) or 0)
|
||||
config["SNAPSHOT_ID"] = str(getattr(snapshot, "id", "")) if getattr(snapshot, "id", None) else config.get("SNAPSHOT_ID")
|
||||
config["SNAPSHOT_DEPTH"] = int(getattr(snapshot, "depth", 0) or 0)
|
||||
if hasattr(snapshot, "output_dir"):
|
||||
config['SNAP_DIR'] = str(snapshot.output_dir)
|
||||
config["SNAP_DIR"] = str(snapshot.output_dir)
|
||||
if getattr(snapshot, "crawl_id", None):
|
||||
config['CRAWL_ID'] = str(snapshot.crawl_id)
|
||||
config["CRAWL_ID"] = str(snapshot.crawl_id)
|
||||
|
||||
# Normalize all aliases to canonical names (after all sources merged)
|
||||
# This handles aliases that came from user/crawl/snapshot configs, not just env
|
||||
try:
|
||||
from archivebox.hooks import discover_plugin_configs
|
||||
|
||||
plugin_configs = discover_plugin_configs()
|
||||
aliases_to_normalize = {} # {alias_key: canonical_key}
|
||||
|
||||
# Build alias mapping from all plugin schemas
|
||||
for plugin_name, schema in plugin_configs.items():
|
||||
for canonical_key, prop_schema in schema.get('properties', {}).items():
|
||||
for alias in prop_schema.get('x-aliases', []):
|
||||
for canonical_key, prop_schema in schema.get("properties", {}).items():
|
||||
for alias in prop_schema.get("x-aliases", []):
|
||||
aliases_to_normalize[alias] = canonical_key
|
||||
|
||||
# Normalize: copy alias values to canonical keys (aliases take precedence)
|
||||
@@ -314,10 +319,14 @@ def get_config(
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
if not config.get("DATA_DIR"):
|
||||
config["DATA_DIR"] = str(CONSTANTS.DATA_DIR)
|
||||
config["ABX_RUNTIME"] = "archivebox"
|
||||
|
||||
return config
|
||||
|
||||
|
||||
def get_flat_config() -> Dict[str, Any]:
|
||||
def get_flat_config() -> dict[str, Any]:
|
||||
"""
|
||||
Get a flat dictionary of all config values.
|
||||
|
||||
@@ -326,20 +335,24 @@ def get_flat_config() -> Dict[str, Any]:
|
||||
return get_config()
|
||||
|
||||
|
||||
def get_all_configs() -> Dict[str, BaseConfigSet]:
|
||||
def get_all_configs() -> dict[str, BaseConfigSet]:
|
||||
"""
|
||||
Get all config section objects as a dictionary.
|
||||
|
||||
Replaces abx.pm.hook.get_CONFIGS()
|
||||
"""
|
||||
from archivebox.config.common import (
|
||||
SHELL_CONFIG, SERVER_CONFIG, ARCHIVING_CONFIG, SEARCH_BACKEND_CONFIG
|
||||
SHELL_CONFIG,
|
||||
SERVER_CONFIG,
|
||||
ARCHIVING_CONFIG,
|
||||
SEARCH_BACKEND_CONFIG,
|
||||
)
|
||||
|
||||
return {
|
||||
'SHELL_CONFIG': SHELL_CONFIG,
|
||||
'SERVER_CONFIG': SERVER_CONFIG,
|
||||
'ARCHIVING_CONFIG': ARCHIVING_CONFIG,
|
||||
'SEARCH_BACKEND_CONFIG': SEARCH_BACKEND_CONFIG,
|
||||
"SHELL_CONFIG": SHELL_CONFIG,
|
||||
"SERVER_CONFIG": SERVER_CONFIG,
|
||||
"ARCHIVING_CONFIG": ARCHIVING_CONFIG,
|
||||
"SEARCH_BACKEND_CONFIG": SEARCH_BACKEND_CONFIG,
|
||||
}
|
||||
|
||||
|
||||
@@ -394,7 +407,7 @@ DEFAULT_WORKER_CONCURRENCY = {
|
||||
}
|
||||
|
||||
|
||||
def get_worker_concurrency() -> Dict[str, int]:
|
||||
def get_worker_concurrency() -> dict[str, int]:
|
||||
"""
|
||||
Get worker concurrency settings.
|
||||
|
||||
|
||||
@@ -5,17 +5,16 @@ Constants are for things that never change at runtime.
|
||||
DATA_DIR will never change at runtime, but you can run
|
||||
archivebox from inside a different DATA_DIR on the same machine.
|
||||
|
||||
This is loaded very early in the archivebox startup flow, so nothing in this file
|
||||
or imported from this file should import anything from archivebox.config.common,
|
||||
This is loaded very early in the archivebox startup flow, so nothing in this file
|
||||
or imported from this file should import anything from archivebox.config.common,
|
||||
django, other INSTALLED_APPS, or anything else that is not in a standard library.
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.config'
|
||||
__package__ = "archivebox.config"
|
||||
|
||||
import re
|
||||
import sys
|
||||
|
||||
from typing import Dict
|
||||
from pathlib import Path
|
||||
|
||||
from benedict import benedict
|
||||
@@ -46,184 +45,235 @@ from .version import detect_installed_version
|
||||
|
||||
|
||||
class ConstantsDict:
|
||||
PACKAGE_DIR: Path = PACKAGE_DIR
|
||||
DATA_DIR: Path = DATA_DIR
|
||||
ARCHIVE_DIR: Path = ARCHIVE_DIR
|
||||
|
||||
MACHINE_TYPE: str = get_machine_type()
|
||||
MACHINE_ID: str = get_machine_id()
|
||||
COLLECTION_ID: str = get_collection_id(DATA_DIR)
|
||||
|
||||
PACKAGE_DIR: Path = PACKAGE_DIR
|
||||
DATA_DIR: Path = DATA_DIR
|
||||
ARCHIVE_DIR: Path = ARCHIVE_DIR
|
||||
|
||||
MACHINE_TYPE: str = get_machine_type()
|
||||
MACHINE_ID: str = get_machine_id()
|
||||
COLLECTION_ID: str = get_collection_id(DATA_DIR)
|
||||
|
||||
# Host system
|
||||
VERSION: str = detect_installed_version(PACKAGE_DIR)
|
||||
IN_DOCKER: bool = IN_DOCKER
|
||||
|
||||
VERSION: str = detect_installed_version(PACKAGE_DIR)
|
||||
IN_DOCKER: bool = IN_DOCKER
|
||||
|
||||
# Permissions
|
||||
IS_ROOT: bool = IS_ROOT
|
||||
ARCHIVEBOX_USER: int = ARCHIVEBOX_USER
|
||||
ARCHIVEBOX_GROUP: int = ARCHIVEBOX_GROUP
|
||||
RUNNING_AS_UID: int = RUNNING_AS_UID
|
||||
RUNNING_AS_GID: int = RUNNING_AS_GID
|
||||
DEFAULT_PUID: int = DEFAULT_PUID
|
||||
DEFAULT_PGID: int = DEFAULT_PGID
|
||||
IS_INSIDE_VENV: bool = sys.prefix != sys.base_prefix
|
||||
|
||||
IS_ROOT: bool = IS_ROOT
|
||||
ARCHIVEBOX_USER: int = ARCHIVEBOX_USER
|
||||
ARCHIVEBOX_GROUP: int = ARCHIVEBOX_GROUP
|
||||
RUNNING_AS_UID: int = RUNNING_AS_UID
|
||||
RUNNING_AS_GID: int = RUNNING_AS_GID
|
||||
DEFAULT_PUID: int = DEFAULT_PUID
|
||||
DEFAULT_PGID: int = DEFAULT_PGID
|
||||
IS_INSIDE_VENV: bool = sys.prefix != sys.base_prefix
|
||||
|
||||
# Source code dirs
|
||||
PACKAGE_DIR_NAME: str = PACKAGE_DIR.name
|
||||
TEMPLATES_DIR_NAME: str = 'templates'
|
||||
TEMPLATES_DIR: Path = PACKAGE_DIR / TEMPLATES_DIR_NAME
|
||||
STATIC_DIR_NAME: str = 'static'
|
||||
STATIC_DIR: Path = TEMPLATES_DIR / STATIC_DIR_NAME
|
||||
PACKAGE_DIR_NAME: str = PACKAGE_DIR.name
|
||||
TEMPLATES_DIR_NAME: str = "templates"
|
||||
TEMPLATES_DIR: Path = PACKAGE_DIR / TEMPLATES_DIR_NAME
|
||||
STATIC_DIR_NAME: str = "static"
|
||||
STATIC_DIR: Path = TEMPLATES_DIR / STATIC_DIR_NAME
|
||||
|
||||
# Data dirs
|
||||
ARCHIVE_DIR_NAME: str = 'archive'
|
||||
SOURCES_DIR_NAME: str = 'sources'
|
||||
PERSONAS_DIR_NAME: str = 'personas'
|
||||
CACHE_DIR_NAME: str = 'cache'
|
||||
LOGS_DIR_NAME: str = 'logs'
|
||||
CUSTOM_PLUGINS_DIR_NAME: str = 'custom_plugins'
|
||||
CUSTOM_TEMPLATES_DIR_NAME: str = 'custom_templates'
|
||||
ARCHIVE_DIR: Path = DATA_DIR / ARCHIVE_DIR_NAME
|
||||
SOURCES_DIR: Path = DATA_DIR / SOURCES_DIR_NAME
|
||||
PERSONAS_DIR: Path = DATA_DIR / PERSONAS_DIR_NAME
|
||||
LOGS_DIR: Path = DATA_DIR / LOGS_DIR_NAME
|
||||
CACHE_DIR: Path = DATA_DIR / CACHE_DIR_NAME
|
||||
CUSTOM_TEMPLATES_DIR: Path = DATA_DIR / CUSTOM_TEMPLATES_DIR_NAME
|
||||
USER_PLUGINS_DIR: Path = DATA_DIR / CUSTOM_PLUGINS_DIR_NAME
|
||||
ARCHIVE_DIR_NAME: str = "archive"
|
||||
SOURCES_DIR_NAME: str = "sources"
|
||||
PERSONAS_DIR_NAME: str = "personas"
|
||||
CACHE_DIR_NAME: str = "cache"
|
||||
LOGS_DIR_NAME: str = "logs"
|
||||
CUSTOM_PLUGINS_DIR_NAME: str = "custom_plugins"
|
||||
CUSTOM_TEMPLATES_DIR_NAME: str = "custom_templates"
|
||||
ARCHIVE_DIR: Path = DATA_DIR / ARCHIVE_DIR_NAME
|
||||
SOURCES_DIR: Path = DATA_DIR / SOURCES_DIR_NAME
|
||||
PERSONAS_DIR: Path = DATA_DIR / PERSONAS_DIR_NAME
|
||||
LOGS_DIR: Path = DATA_DIR / LOGS_DIR_NAME
|
||||
CACHE_DIR: Path = DATA_DIR / CACHE_DIR_NAME
|
||||
CUSTOM_TEMPLATES_DIR: Path = DATA_DIR / CUSTOM_TEMPLATES_DIR_NAME
|
||||
USER_PLUGINS_DIR: Path = DATA_DIR / CUSTOM_PLUGINS_DIR_NAME
|
||||
|
||||
# Data dir files
|
||||
CONFIG_FILENAME: str = 'ArchiveBox.conf'
|
||||
SQL_INDEX_FILENAME: str = 'index.sqlite3'
|
||||
CONFIG_FILE: Path = DATA_DIR / CONFIG_FILENAME
|
||||
DATABASE_FILE: Path = DATA_DIR / SQL_INDEX_FILENAME
|
||||
|
||||
JSON_INDEX_FILENAME: str = 'index.json'
|
||||
JSONL_INDEX_FILENAME: str = 'index.jsonl'
|
||||
HTML_INDEX_FILENAME: str = 'index.html'
|
||||
ROBOTS_TXT_FILENAME: str = 'robots.txt'
|
||||
FAVICON_FILENAME: str = 'favicon.ico'
|
||||
|
||||
# Runtime dirs
|
||||
TMP_DIR_NAME: str = 'tmp'
|
||||
DEFAULT_TMP_DIR: Path = DATA_DIR / TMP_DIR_NAME / MACHINE_ID # ./data/tmp/abc3244323
|
||||
CONFIG_FILENAME: str = "ArchiveBox.conf"
|
||||
SQL_INDEX_FILENAME: str = "index.sqlite3"
|
||||
CONFIG_FILE: Path = DATA_DIR / CONFIG_FILENAME
|
||||
DATABASE_FILE: Path = DATA_DIR / SQL_INDEX_FILENAME
|
||||
|
||||
LIB_DIR_NAME: str = 'lib'
|
||||
DEFAULT_LIB_DIR: Path = DATA_DIR / LIB_DIR_NAME / MACHINE_TYPE # ./data/lib/arm64-linux-docker
|
||||
DEFAULT_LIB_BIN_DIR: Path = DEFAULT_LIB_DIR / 'bin' # ./data/lib/arm64-linux-docker/bin
|
||||
JSON_INDEX_FILENAME: str = "index.json"
|
||||
JSONL_INDEX_FILENAME: str = "index.jsonl"
|
||||
HTML_INDEX_FILENAME: str = "index.html"
|
||||
ROBOTS_TXT_FILENAME: str = "robots.txt"
|
||||
FAVICON_FILENAME: str = "favicon.ico"
|
||||
|
||||
# Runtime dirs
|
||||
TMP_DIR_NAME: str = "tmp"
|
||||
DEFAULT_TMP_DIR: Path = DATA_DIR / TMP_DIR_NAME / MACHINE_ID # ./data/tmp/abc3244323
|
||||
|
||||
LIB_DIR_NAME: str = "lib"
|
||||
DEFAULT_LIB_DIR: Path = DATA_DIR / LIB_DIR_NAME / MACHINE_TYPE # ./data/lib/arm64-linux-docker
|
||||
DEFAULT_LIB_BIN_DIR: Path = DEFAULT_LIB_DIR / "bin" # ./data/lib/arm64-linux-docker/bin
|
||||
|
||||
# Config constants
|
||||
TIMEZONE: str = 'UTC'
|
||||
DEFAULT_CLI_COLORS: Dict[str, str] = DEFAULT_CLI_COLORS
|
||||
DISABLED_CLI_COLORS: Dict[str, str] = benedict({k: '' for k in DEFAULT_CLI_COLORS})
|
||||
TIMEZONE: str = "UTC"
|
||||
DEFAULT_CLI_COLORS: dict[str, str] = DEFAULT_CLI_COLORS
|
||||
DISABLED_CLI_COLORS: dict[str, str] = benedict({k: "" for k in DEFAULT_CLI_COLORS})
|
||||
|
||||
# Hard safety limits (seconds)
|
||||
MAX_HOOK_RUNTIME_SECONDS: int = 60 * 60 * 12 # 12 hours
|
||||
MAX_SNAPSHOT_RUNTIME_SECONDS: int = 60 * 60 * 12 # 12 hours
|
||||
MAX_HOOK_RUNTIME_SECONDS: int = 60 * 60 * 12 # 12 hours
|
||||
MAX_SNAPSHOT_RUNTIME_SECONDS: int = 60 * 60 * 12 # 12 hours
|
||||
|
||||
ALLOWDENYLIST_REGEX_FLAGS: int = re.IGNORECASE | re.UNICODE | re.MULTILINE
|
||||
ALLOWDENYLIST_REGEX_FLAGS: int = re.IGNORECASE | re.UNICODE | re.MULTILINE
|
||||
|
||||
STATICFILE_EXTENSIONS: frozenset[str] = frozenset((
|
||||
# 99.999% of the time, URLs ending in these extensions are static files
|
||||
# that can be downloaded as-is, not html pages that need to be rendered
|
||||
'gif', 'jpeg', 'jpg', 'png', 'tif', 'tiff', 'wbmp', 'ico', 'jng', 'bmp',
|
||||
'svg', 'svgz', 'webp', 'ps', 'eps', 'ai',
|
||||
'mp3', 'mp4', 'm4a', 'mpeg', 'mpg', 'mkv', 'mov', 'webm', 'm4v',
|
||||
'flv', 'wmv', 'avi', 'ogg', 'ts', 'm3u8',
|
||||
'pdf', 'txt', 'rtf', 'rtfd', 'doc', 'docx', 'ppt', 'pptx', 'xls', 'xlsx',
|
||||
'atom', 'rss', 'css', 'js', 'json',
|
||||
'dmg', 'iso', 'img',
|
||||
'rar', 'war', 'hqx', 'zip', 'gz', 'bz2', '7z',
|
||||
STATICFILE_EXTENSIONS: frozenset[str] = frozenset(
|
||||
(
|
||||
# 99.999% of the time, URLs ending in these extensions are static files
|
||||
# that can be downloaded as-is, not html pages that need to be rendered
|
||||
"gif",
|
||||
"jpeg",
|
||||
"jpg",
|
||||
"png",
|
||||
"tif",
|
||||
"tiff",
|
||||
"wbmp",
|
||||
"ico",
|
||||
"jng",
|
||||
"bmp",
|
||||
"svg",
|
||||
"svgz",
|
||||
"webp",
|
||||
"ps",
|
||||
"eps",
|
||||
"ai",
|
||||
"mp3",
|
||||
"mp4",
|
||||
"m4a",
|
||||
"mpeg",
|
||||
"mpg",
|
||||
"mkv",
|
||||
"mov",
|
||||
"webm",
|
||||
"m4v",
|
||||
"flv",
|
||||
"wmv",
|
||||
"avi",
|
||||
"ogg",
|
||||
"ts",
|
||||
"m3u8",
|
||||
"pdf",
|
||||
"txt",
|
||||
"rtf",
|
||||
"rtfd",
|
||||
"doc",
|
||||
"docx",
|
||||
"ppt",
|
||||
"pptx",
|
||||
"xls",
|
||||
"xlsx",
|
||||
"atom",
|
||||
"rss",
|
||||
"css",
|
||||
"js",
|
||||
"json",
|
||||
"dmg",
|
||||
"iso",
|
||||
"img",
|
||||
"rar",
|
||||
"war",
|
||||
"hqx",
|
||||
"zip",
|
||||
"gz",
|
||||
"bz2",
|
||||
"7z",
|
||||
# Less common extensions to consider adding later
|
||||
# jar, swf, bin, com, exe, dll, deb
|
||||
# ear, hqx, eot, wmlc, kml, kmz, cco, jardiff, jnlp, run, msi, msp, msm,
|
||||
# pl pm, prc pdb, rar, rpm, sea, sit, tcl tk, der, pem, crt, xpi, xspf,
|
||||
# ra, mng, asx, asf, 3gpp, 3gp, mid, midi, kar, jad, wml, htc, mml
|
||||
# These are always treated as pages, not as static files, never add them:
|
||||
# html, htm, shtml, xhtml, xml, aspx, php, cgi
|
||||
),
|
||||
)
|
||||
|
||||
# Less common extensions to consider adding later
|
||||
# jar, swf, bin, com, exe, dll, deb
|
||||
# ear, hqx, eot, wmlc, kml, kmz, cco, jardiff, jnlp, run, msi, msp, msm,
|
||||
# pl pm, prc pdb, rar, rpm, sea, sit, tcl tk, der, pem, crt, xpi, xspf,
|
||||
# ra, mng, asx, asf, 3gpp, 3gp, mid, midi, kar, jad, wml, htc, mml
|
||||
|
||||
# These are always treated as pages, not as static files, never add them:
|
||||
# html, htm, shtml, xhtml, xml, aspx, php, cgi
|
||||
))
|
||||
|
||||
PIP_RELATED_NAMES: frozenset[str] = frozenset((
|
||||
".venv",
|
||||
"venv",
|
||||
"virtualenv",
|
||||
".virtualenv",
|
||||
))
|
||||
NPM_RELATED_NAMES: frozenset[str] = frozenset((
|
||||
"node_modules",
|
||||
"package.json",
|
||||
"package-lock.json",
|
||||
"yarn.lock",
|
||||
))
|
||||
PIP_RELATED_NAMES: frozenset[str] = frozenset(
|
||||
(
|
||||
".venv",
|
||||
"venv",
|
||||
"virtualenv",
|
||||
".virtualenv",
|
||||
),
|
||||
)
|
||||
NPM_RELATED_NAMES: frozenset[str] = frozenset(
|
||||
(
|
||||
"node_modules",
|
||||
"package.json",
|
||||
"package-lock.json",
|
||||
"yarn.lock",
|
||||
),
|
||||
)
|
||||
|
||||
# When initializing archivebox in a new directory, we check to make sure the dir is
|
||||
# actually empty so that we dont clobber someone's home directory or desktop by accident.
|
||||
# These files are exceptions to the is_empty check when we're trying to init a new dir,
|
||||
# as they could be from a previous archivebox version, system artifacts, dependencies, etc.
|
||||
ALLOWED_IN_DATA_DIR: frozenset[str] = frozenset((
|
||||
*PIP_RELATED_NAMES,
|
||||
*NPM_RELATED_NAMES,
|
||||
|
||||
### Dirs:
|
||||
ARCHIVE_DIR_NAME,
|
||||
SOURCES_DIR_NAME,
|
||||
LOGS_DIR_NAME,
|
||||
CACHE_DIR_NAME,
|
||||
LIB_DIR_NAME,
|
||||
TMP_DIR_NAME,
|
||||
PERSONAS_DIR_NAME,
|
||||
CUSTOM_TEMPLATES_DIR_NAME,
|
||||
CUSTOM_PLUGINS_DIR_NAME,
|
||||
"invalid",
|
||||
"users",
|
||||
"machine",
|
||||
# Backwards compatibility with old directory names
|
||||
"user_plugins", # old name for USER_PLUGINS_DIR (now 'plugins')
|
||||
"user_templates", # old name for CUSTOM_TEMPLATES_DIR (now 'templates')
|
||||
"static", # created by old static exports <v0.6.0
|
||||
"sonic", # created by docker bind mount / sonic FTS process
|
||||
".git",
|
||||
".svn",
|
||||
|
||||
### Files:
|
||||
CONFIG_FILENAME,
|
||||
SQL_INDEX_FILENAME,
|
||||
f"{SQL_INDEX_FILENAME}-wal",
|
||||
f"{SQL_INDEX_FILENAME}-shm",
|
||||
"search.sqlite3",
|
||||
"queue.sqlite3",
|
||||
"queue.sqlite3-wal",
|
||||
"queue.sqlite3-shm",
|
||||
JSON_INDEX_FILENAME,
|
||||
JSONL_INDEX_FILENAME,
|
||||
HTML_INDEX_FILENAME,
|
||||
ROBOTS_TXT_FILENAME,
|
||||
FAVICON_FILENAME,
|
||||
CONFIG_FILENAME,
|
||||
f"{CONFIG_FILENAME}.bak",
|
||||
f".{CONFIG_FILENAME}.bak",
|
||||
"static_index.json",
|
||||
".DS_Store",
|
||||
".gitignore",
|
||||
"lost+found",
|
||||
".DS_Store",
|
||||
".env",
|
||||
".collection_id",
|
||||
".archivebox_id",
|
||||
"Dockerfile",
|
||||
))
|
||||
|
||||
ALLOWED_IN_DATA_DIR: frozenset[str] = frozenset(
|
||||
(
|
||||
*PIP_RELATED_NAMES,
|
||||
*NPM_RELATED_NAMES,
|
||||
### Dirs:
|
||||
ARCHIVE_DIR_NAME,
|
||||
SOURCES_DIR_NAME,
|
||||
LOGS_DIR_NAME,
|
||||
CACHE_DIR_NAME,
|
||||
LIB_DIR_NAME,
|
||||
TMP_DIR_NAME,
|
||||
PERSONAS_DIR_NAME,
|
||||
CUSTOM_TEMPLATES_DIR_NAME,
|
||||
CUSTOM_PLUGINS_DIR_NAME,
|
||||
"invalid",
|
||||
"users",
|
||||
"machine",
|
||||
# Backwards compatibility with old directory names
|
||||
"user_plugins", # old name for USER_PLUGINS_DIR (now 'plugins')
|
||||
"user_templates", # old name for CUSTOM_TEMPLATES_DIR (now 'templates')
|
||||
"static", # created by old static exports <v0.6.0
|
||||
"sonic", # created by docker bind mount / sonic FTS process
|
||||
".git",
|
||||
".svn",
|
||||
### Files:
|
||||
CONFIG_FILENAME,
|
||||
SQL_INDEX_FILENAME,
|
||||
f"{SQL_INDEX_FILENAME}-wal",
|
||||
f"{SQL_INDEX_FILENAME}-shm",
|
||||
"search.sqlite3",
|
||||
"queue.sqlite3",
|
||||
"queue.sqlite3-wal",
|
||||
"queue.sqlite3-shm",
|
||||
JSON_INDEX_FILENAME,
|
||||
JSONL_INDEX_FILENAME,
|
||||
HTML_INDEX_FILENAME,
|
||||
ROBOTS_TXT_FILENAME,
|
||||
FAVICON_FILENAME,
|
||||
CONFIG_FILENAME,
|
||||
f"{CONFIG_FILENAME}.bak",
|
||||
f".{CONFIG_FILENAME}.bak",
|
||||
"static_index.json",
|
||||
".DS_Store",
|
||||
".gitignore",
|
||||
"lost+found",
|
||||
".DS_Store",
|
||||
".env",
|
||||
".collection_id",
|
||||
".archivebox_id",
|
||||
"Dockerfile",
|
||||
),
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def __getitem__(cls, key: str):
|
||||
# so it behaves like a dict[key] == dict.key or object attr
|
||||
return getattr(cls, key)
|
||||
|
||||
|
||||
@classmethod
|
||||
def __benedict__(cls):
|
||||
# when casting to benedict, only include uppercase keys that don't start with an underscore
|
||||
return benedict({key: value for key, value in cls.__dict__.items() if key.isupper() and not key.startswith('_')})
|
||||
return benedict({key: value for key, value in cls.__dict__.items() if key.isupper() and not key.startswith("_")})
|
||||
|
||||
|
||||
CONSTANTS = ConstantsDict
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
__package__ = 'archivebox.config'
|
||||
__package__ = "archivebox.config"
|
||||
|
||||
import os
|
||||
import sys
|
||||
@@ -17,9 +17,9 @@ from .common import SHELL_CONFIG
|
||||
|
||||
|
||||
if not SHELL_CONFIG.USE_COLOR:
|
||||
os.environ['NO_COLOR'] = '1'
|
||||
os.environ["NO_COLOR"] = "1"
|
||||
if not SHELL_CONFIG.SHOW_PROGRESS:
|
||||
os.environ['TERM'] = 'dumb'
|
||||
os.environ["TERM"] = "dumb"
|
||||
|
||||
# recreate rich console obj based on new config values
|
||||
STDOUT = CONSOLE = Console()
|
||||
@@ -32,7 +32,8 @@ def setup_django_minimal():
|
||||
# os.environ.setdefault('ARCHIVEBOX_DATA_DIR', str(CONSTANTS.DATA_DIR))
|
||||
# os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings')
|
||||
# django.setup()
|
||||
raise Exception('dont use this anymore')
|
||||
raise Exception("dont use this anymore")
|
||||
|
||||
|
||||
DJANGO_SET_UP = False
|
||||
|
||||
@@ -61,15 +62,18 @@ def setup_django(check_db=False, in_memory_db=False) -> None:
|
||||
# This warning can be triggered during django.setup() but is safe to ignore
|
||||
# since we're doing intentional setup operations
|
||||
import warnings
|
||||
warnings.filterwarnings('ignore',
|
||||
message='.*Accessing the database during app initialization.*',
|
||||
category=RuntimeWarning)
|
||||
|
||||
warnings.filterwarnings(
|
||||
"ignore",
|
||||
message=".*Accessing the database during app initialization.*",
|
||||
category=RuntimeWarning,
|
||||
)
|
||||
|
||||
try:
|
||||
from django.core.management import call_command
|
||||
|
||||
if in_memory_db:
|
||||
raise Exception('dont use this anymore')
|
||||
raise Exception("dont use this anymore")
|
||||
|
||||
# some commands dont store a long-lived sqlite3 db file on disk.
|
||||
# in those cases we create a temporary in-memory db and run the migrations
|
||||
@@ -84,19 +88,22 @@ def setup_django(check_db=False, in_memory_db=False) -> None:
|
||||
try:
|
||||
django.setup()
|
||||
except Exception as e:
|
||||
is_using_meta_cmd = any(ignored_subcommand in sys.argv for ignored_subcommand in ('help', 'version', '--help', '--version'))
|
||||
is_using_meta_cmd = any(ignored_subcommand in sys.argv for ignored_subcommand in ("help", "version", "--help", "--version"))
|
||||
if not is_using_meta_cmd:
|
||||
# show error message to user only if they're not running a meta command / just trying to get help
|
||||
STDERR.print()
|
||||
STDERR.print(Panel(
|
||||
f'\n[red]{e.__class__.__name__}[/red]: [yellow]{e}[/yellow]\nPlease check your config and [blue]DATA_DIR[/blue] permissions.\n',
|
||||
title='\n\n[red][X] Error while trying to load database![/red]',
|
||||
subtitle='[grey53]NO WRITES CAN BE PERFORMED[/grey53]',
|
||||
expand=False,
|
||||
style='bold red',
|
||||
))
|
||||
STDERR.print(
|
||||
Panel(
|
||||
f"\n[red]{e.__class__.__name__}[/red]: [yellow]{e}[/yellow]\nPlease check your config and [blue]DATA_DIR[/blue] permissions.\n",
|
||||
title="\n\n[red][X] Error while trying to load database![/red]",
|
||||
subtitle="[grey53]NO WRITES CAN BE PERFORMED[/grey53]",
|
||||
expand=False,
|
||||
style="bold red",
|
||||
),
|
||||
)
|
||||
STDERR.print()
|
||||
import traceback
|
||||
|
||||
traceback.print_exc()
|
||||
return
|
||||
|
||||
@@ -104,28 +111,29 @@ def setup_django(check_db=False, in_memory_db=False) -> None:
|
||||
from archivebox.core.settings_logging import ERROR_LOG as DEFAULT_ERROR_LOG
|
||||
|
||||
# log startup message to the error log
|
||||
error_log = getattr(settings, 'ERROR_LOG', DEFAULT_ERROR_LOG)
|
||||
with open(error_log, "a", encoding='utf-8') as f:
|
||||
command = ' '.join(sys.argv)
|
||||
ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S')
|
||||
error_log = getattr(settings, "ERROR_LOG", DEFAULT_ERROR_LOG)
|
||||
with open(error_log, "a", encoding="utf-8") as f:
|
||||
command = " ".join(sys.argv)
|
||||
ts = datetime.now(timezone.utc).strftime("%Y-%m-%d__%H:%M:%S")
|
||||
f.write(f"\n> {command}; TS={ts} VERSION={CONSTANTS.VERSION} IN_DOCKER={SHELL_CONFIG.IN_DOCKER} IS_TTY={SHELL_CONFIG.IS_TTY}\n")
|
||||
|
||||
if check_db:
|
||||
# make sure the data dir is owned by a non-root user
|
||||
if CONSTANTS.DATA_DIR.stat().st_uid == 0:
|
||||
STDERR.print('[red][X] Error: ArchiveBox DATA_DIR cannot be owned by root![/red]')
|
||||
STDERR.print(f' {CONSTANTS.DATA_DIR}')
|
||||
STDERR.print("[red][X] Error: ArchiveBox DATA_DIR cannot be owned by root![/red]")
|
||||
STDERR.print(f" {CONSTANTS.DATA_DIR}")
|
||||
STDERR.print()
|
||||
STDERR.print('[violet]Hint:[/violet] Are you running archivebox in the right folder? (and as a non-root user?)')
|
||||
STDERR.print(' cd path/to/your/archive/data')
|
||||
STDERR.print(' archivebox [command]')
|
||||
STDERR.print("[violet]Hint:[/violet] Are you running archivebox in the right folder? (and as a non-root user?)")
|
||||
STDERR.print(" cd path/to/your/archive/data")
|
||||
STDERR.print(" archivebox [command]")
|
||||
STDERR.print()
|
||||
raise SystemExit(9)
|
||||
|
||||
# Create cache table in DB if needed
|
||||
try:
|
||||
from django.core.cache import cache
|
||||
cache.get('test', None)
|
||||
|
||||
cache.get("test", None)
|
||||
except django.db.utils.OperationalError:
|
||||
call_command("createcachetable", verbosity=0)
|
||||
|
||||
@@ -133,12 +141,14 @@ def setup_django(check_db=False, in_memory_db=False) -> None:
|
||||
# the sqlite3 whenever we init from scratch to avoid multiple threads
|
||||
# sharing the same connection by accident
|
||||
from django.db import connections
|
||||
|
||||
for conn in connections.all():
|
||||
conn.close_if_unusable_or_obsolete()
|
||||
|
||||
sql_index_path = CONSTANTS.DATABASE_FILE
|
||||
assert os.access(sql_index_path, os.F_OK), (
|
||||
f'No database file {sql_index_path} found in: {CONSTANTS.DATA_DIR} (Are you in an ArchiveBox collection directory?)')
|
||||
f"No database file {sql_index_path} found in: {CONSTANTS.DATA_DIR} (Are you in an ArchiveBox collection directory?)"
|
||||
)
|
||||
|
||||
# https://docs.pydantic.dev/logfire/integrations/django/ Logfire Debugging
|
||||
# if settings.DEBUG_LOGFIRE:
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
__package__ = "archivebox.config"
|
||||
|
||||
from typing import Optional
|
||||
from pydantic import Field
|
||||
|
||||
from archivebox.config.configset import BaseConfigSet
|
||||
@@ -13,13 +12,14 @@ class LDAPConfig(BaseConfigSet):
|
||||
Only loads and validates if django-auth-ldap is installed.
|
||||
These settings integrate with Django's LDAP authentication backend.
|
||||
"""
|
||||
|
||||
toml_section_header: str = "LDAP_CONFIG"
|
||||
|
||||
LDAP_ENABLED: bool = Field(default=False)
|
||||
LDAP_SERVER_URI: Optional[str] = Field(default=None)
|
||||
LDAP_BIND_DN: Optional[str] = Field(default=None)
|
||||
LDAP_BIND_PASSWORD: Optional[str] = Field(default=None)
|
||||
LDAP_USER_BASE: Optional[str] = Field(default=None)
|
||||
LDAP_SERVER_URI: str | None = Field(default=None)
|
||||
LDAP_BIND_DN: str | None = Field(default=None)
|
||||
LDAP_BIND_PASSWORD: str | None = Field(default=None)
|
||||
LDAP_USER_BASE: str | None = Field(default=None)
|
||||
LDAP_USER_FILTER: str = Field(default="(uid=%(user)s)")
|
||||
LDAP_USERNAME_ATTR: str = Field(default="username")
|
||||
LDAP_FIRSTNAME_ATTR: str = Field(default="givenName")
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
__package__ = 'archivebox.config'
|
||||
__package__ = "archivebox.config"
|
||||
|
||||
import os
|
||||
import socket
|
||||
@@ -15,24 +15,25 @@ from .permissions import SudoPermission, IS_ROOT, ARCHIVEBOX_USER
|
||||
|
||||
#############################################################################################
|
||||
|
||||
PACKAGE_DIR: Path = Path(__file__).resolve().parent.parent # archivebox source code dir
|
||||
DATA_DIR: Path = Path(os.environ.get('DATA_DIR', os.getcwd())).resolve() # archivebox user data dir
|
||||
ARCHIVE_DIR: Path = DATA_DIR / 'archive' # archivebox snapshot data dir
|
||||
PACKAGE_DIR: Path = Path(__file__).resolve().parent.parent # archivebox source code dir
|
||||
DATA_DIR: Path = Path(os.environ.get("DATA_DIR", os.getcwd())).resolve() # archivebox user data dir
|
||||
ARCHIVE_DIR: Path = DATA_DIR / "archive" # archivebox snapshot data dir
|
||||
|
||||
IN_DOCKER = os.environ.get('IN_DOCKER', False) in ('1', 'true', 'True', 'TRUE', 'yes')
|
||||
IN_DOCKER = os.environ.get("IN_DOCKER", False) in ("1", "true", "True", "TRUE", "yes")
|
||||
|
||||
DATABASE_FILE = DATA_DIR / 'index.sqlite3'
|
||||
DATABASE_FILE = DATA_DIR / "index.sqlite3"
|
||||
|
||||
#############################################################################################
|
||||
|
||||
|
||||
def _get_collection_id(DATA_DIR=DATA_DIR, force_create=False) -> str:
|
||||
collection_id_file = DATA_DIR / '.archivebox_id'
|
||||
|
||||
collection_id_file = DATA_DIR / ".archivebox_id"
|
||||
|
||||
try:
|
||||
return collection_id_file.read_text().strip()
|
||||
except (OSError, FileNotFoundError, PermissionError):
|
||||
pass
|
||||
|
||||
|
||||
# hash the machine_id + collection dir path + creation time to get a unique collection_id
|
||||
machine_id = get_machine_id()
|
||||
collection_path = DATA_DIR.resolve()
|
||||
@@ -40,55 +41,60 @@ def _get_collection_id(DATA_DIR=DATA_DIR, force_create=False) -> str:
|
||||
creation_date = DATA_DIR.stat().st_ctime
|
||||
except Exception:
|
||||
creation_date = datetime.now().isoformat()
|
||||
collection_id = hashlib.sha256(f'{machine_id}:{collection_path}@{creation_date}'.encode()).hexdigest()[:8]
|
||||
|
||||
collection_id = hashlib.sha256(f"{machine_id}:{collection_path}@{creation_date}".encode()).hexdigest()[:8]
|
||||
|
||||
try:
|
||||
# only persist collection_id file if we already have an index.sqlite3 file present
|
||||
# otherwise we might be running in a directory that is not a collection, no point creating cruft files
|
||||
collection_is_active = os.path.isfile(DATABASE_FILE) and os.path.isdir(ARCHIVE_DIR) and os.access(DATA_DIR, os.W_OK)
|
||||
if collection_is_active or force_create:
|
||||
collection_id_file.write_text(collection_id)
|
||||
|
||||
|
||||
# if we're running as root right now, make sure the collection_id file is owned by the archivebox user
|
||||
if IS_ROOT:
|
||||
with SudoPermission(uid=0):
|
||||
if ARCHIVEBOX_USER == 0:
|
||||
os.system(f'chmod 777 "{collection_id_file}"')
|
||||
else:
|
||||
else:
|
||||
os.system(f'chown {ARCHIVEBOX_USER} "{collection_id_file}"')
|
||||
except (OSError, FileNotFoundError, PermissionError):
|
||||
pass
|
||||
return collection_id
|
||||
|
||||
|
||||
@cache
|
||||
def get_collection_id(DATA_DIR=DATA_DIR) -> str:
|
||||
"""Get a short, stable, unique ID for the current collection (e.g. abc45678)"""
|
||||
return _get_collection_id(DATA_DIR=DATA_DIR)
|
||||
|
||||
|
||||
@cache
|
||||
def get_machine_id() -> str:
|
||||
"""Get a short, stable, unique ID for the current machine (e.g. abc45678)"""
|
||||
|
||||
MACHINE_ID = 'unknown'
|
||||
|
||||
MACHINE_ID = "unknown"
|
||||
try:
|
||||
import machineid
|
||||
MACHINE_ID = machineid.hashed_id('archivebox')[:8]
|
||||
|
||||
MACHINE_ID = machineid.hashed_id("archivebox")[:8]
|
||||
except Exception:
|
||||
try:
|
||||
import uuid
|
||||
import hashlib
|
||||
|
||||
MACHINE_ID = hashlib.sha256(str(uuid.getnode()).encode()).hexdigest()[:8]
|
||||
except Exception:
|
||||
pass
|
||||
return MACHINE_ID
|
||||
|
||||
|
||||
@cache
|
||||
def get_machine_type() -> str:
|
||||
"""Get a short, stable, unique type identifier for the current machine (e.g. linux-x86_64-docker)"""
|
||||
|
||||
OS: str = platform.system().lower() # darwin, linux, etc.
|
||||
ARCH: str = platform.machine().lower() # arm64, x86_64, aarch64, etc.
|
||||
LIB_DIR_SCOPE: str = f'{ARCH}-{OS}-docker' if IN_DOCKER else f'{ARCH}-{OS}'
|
||||
|
||||
OS: str = platform.system().lower() # darwin, linux, etc.
|
||||
ARCH: str = platform.machine().lower() # arm64, x86_64, aarch64, etc.
|
||||
LIB_DIR_SCOPE: str = f"{ARCH}-{OS}-docker" if IN_DOCKER else f"{ARCH}-{OS}"
|
||||
return LIB_DIR_SCOPE
|
||||
|
||||
|
||||
@@ -97,27 +103,28 @@ def dir_is_writable(dir_path: Path, uid: int | None = None, gid: int | None = No
|
||||
current_uid, current_gid = os.geteuid(), os.getegid()
|
||||
uid, gid = uid or current_uid, gid or current_gid
|
||||
|
||||
test_file = dir_path / '.permissions_test'
|
||||
test_file = dir_path / ".permissions_test"
|
||||
try:
|
||||
with SudoPermission(uid=uid, fallback=fallback):
|
||||
test_file.exists()
|
||||
test_file.write_text(f'Checking if PUID={uid} PGID={gid} can write to dir')
|
||||
test_file.write_text(f"Checking if PUID={uid} PGID={gid} can write to dir")
|
||||
test_file.unlink()
|
||||
return True
|
||||
except (IOError, OSError, PermissionError):
|
||||
if chown:
|
||||
except (OSError, PermissionError):
|
||||
if chown:
|
||||
# try fixing it using sudo permissions
|
||||
with SudoPermission(uid=uid, fallback=fallback):
|
||||
os.system(f'chown {uid}:{gid} "{dir_path}" 2>/dev/null')
|
||||
return dir_is_writable(dir_path, uid=uid, gid=gid, fallback=fallback, chown=False)
|
||||
return False
|
||||
|
||||
|
||||
def assert_dir_can_contain_unix_sockets(dir_path: Path) -> bool:
|
||||
"""Check if a given directory can contain unix sockets (e.g. /tmp/supervisord.sock)"""
|
||||
from archivebox.misc.logging_util import pretty_path
|
||||
|
||||
|
||||
try:
|
||||
socket_path = str(dir_path / '.test_socket.sock')
|
||||
socket_path = str(dir_path / ".test_socket.sock")
|
||||
s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
|
||||
try:
|
||||
os.remove(socket_path)
|
||||
@@ -130,8 +137,8 @@ def assert_dir_can_contain_unix_sockets(dir_path: Path) -> bool:
|
||||
except OSError:
|
||||
pass
|
||||
except Exception as e:
|
||||
raise Exception(f'ArchiveBox failed to create a test UNIX socket file in {pretty_path(dir_path, color=False)}') from e
|
||||
|
||||
raise Exception(f"ArchiveBox failed to create a test UNIX socket file in {pretty_path(dir_path, color=False)}") from e
|
||||
|
||||
return True
|
||||
|
||||
|
||||
@@ -143,8 +150,9 @@ def create_and_chown_dir(dir_path: Path) -> None:
|
||||
|
||||
|
||||
def tmp_dir_socket_path_is_short_enough(dir_path: Path) -> bool:
|
||||
socket_file = dir_path.absolute().resolve() / 'supervisord.sock'
|
||||
return len(f'file://{socket_file}') <= 96
|
||||
socket_file = dir_path.absolute().resolve() / "supervisord.sock"
|
||||
return len(f"file://{socket_file}") <= 96
|
||||
|
||||
|
||||
@cache
|
||||
def get_or_create_working_tmp_dir(autofix=True, quiet=True):
|
||||
@@ -154,14 +162,18 @@ def get_or_create_working_tmp_dir(autofix=True, quiet=True):
|
||||
|
||||
# try a few potential directories in order of preference
|
||||
CANDIDATES = [
|
||||
STORAGE_CONFIG.TMP_DIR, # <user-specified>
|
||||
CONSTANTS.DEFAULT_TMP_DIR, # ./data/tmp/<machine_id>
|
||||
Path('/var/run/archivebox') / get_collection_id(), # /var/run/archivebox/abc5d8512
|
||||
Path('/tmp') / 'archivebox' / get_collection_id(), # /tmp/archivebox/abc5d8512
|
||||
Path('~/.tmp/archivebox').expanduser() / get_collection_id(), # ~/.tmp/archivebox/abc5d8512
|
||||
Path(tempfile.gettempdir()) / 'archivebox' / get_collection_id(), # /var/folders/qy/6tpfrpx100j1t4l312nz683m0000gn/T/archivebox/abc5d8512
|
||||
Path(tempfile.gettempdir()) / 'archivebox' / get_collection_id()[:4], # /var/folders/qy/6tpfrpx100j1t4l312nz683m0000gn/T/archivebox/abc5d
|
||||
Path(tempfile.gettempdir()) / 'abx' / get_collection_id()[:4], # /var/folders/qy/6tpfrpx100j1t4l312nz683m0000gn/T/abx/abc5
|
||||
STORAGE_CONFIG.TMP_DIR, # <user-specified>
|
||||
CONSTANTS.DEFAULT_TMP_DIR, # ./data/tmp/<machine_id>
|
||||
Path("/var/run/archivebox") / get_collection_id(), # /var/run/archivebox/abc5d8512
|
||||
Path("/tmp") / "archivebox" / get_collection_id(), # /tmp/archivebox/abc5d8512
|
||||
Path("~/.tmp/archivebox").expanduser() / get_collection_id(), # ~/.tmp/archivebox/abc5d8512
|
||||
Path(tempfile.gettempdir())
|
||||
/ "archivebox"
|
||||
/ get_collection_id(), # /var/folders/qy/6tpfrpx100j1t4l312nz683m0000gn/T/archivebox/abc5d8512
|
||||
Path(tempfile.gettempdir())
|
||||
/ "archivebox"
|
||||
/ get_collection_id()[:4], # /var/folders/qy/6tpfrpx100j1t4l312nz683m0000gn/T/archivebox/abc5d
|
||||
Path(tempfile.gettempdir()) / "abx" / get_collection_id()[:4], # /var/folders/qy/6tpfrpx100j1t4l312nz683m0000gn/T/abx/abc5
|
||||
]
|
||||
fallback_candidate = None
|
||||
for candidate in CANDIDATES:
|
||||
@@ -174,7 +186,12 @@ def get_or_create_working_tmp_dir(autofix=True, quiet=True):
|
||||
STORAGE_CONFIG.update_in_place(TMP_DIR=candidate)
|
||||
return candidate
|
||||
try:
|
||||
if fallback_candidate is None and candidate.exists() and dir_is_writable(candidate) and tmp_dir_socket_path_is_short_enough(candidate):
|
||||
if (
|
||||
fallback_candidate is None
|
||||
and candidate.exists()
|
||||
and dir_is_writable(candidate)
|
||||
and tmp_dir_socket_path_is_short_enough(candidate)
|
||||
):
|
||||
fallback_candidate = candidate
|
||||
except Exception:
|
||||
pass
|
||||
@@ -186,25 +203,28 @@ def get_or_create_working_tmp_dir(autofix=True, quiet=True):
|
||||
if autofix and STORAGE_CONFIG.TMP_DIR != fallback_candidate:
|
||||
STORAGE_CONFIG.update_in_place(TMP_DIR=fallback_candidate)
|
||||
return fallback_candidate
|
||||
|
||||
|
||||
if not quiet:
|
||||
raise OSError(f'ArchiveBox is unable to find a writable TMP_DIR, tried {CANDIDATES}!')
|
||||
raise OSError(f"ArchiveBox is unable to find a writable TMP_DIR, tried {CANDIDATES}!")
|
||||
|
||||
|
||||
@cache
|
||||
def get_or_create_working_lib_dir(autofix=True, quiet=False):
|
||||
from archivebox import CONSTANTS
|
||||
from archivebox.config.common import STORAGE_CONFIG
|
||||
from archivebox.misc.checks import check_lib_dir
|
||||
|
||||
|
||||
# try a few potential directories in order of preference
|
||||
CANDIDATES = [
|
||||
STORAGE_CONFIG.LIB_DIR, # <user-specified>
|
||||
CONSTANTS.DEFAULT_LIB_DIR, # ./data/lib/arm64-linux-docker
|
||||
Path('/usr/local/share/archivebox') / get_collection_id(), # /usr/local/share/archivebox/abc5
|
||||
*([Path('/opt/homebrew/share/archivebox') / get_collection_id()] if os.path.isfile('/opt/homebrew/bin/archivebox') else []), # /opt/homebrew/share/archivebox/abc5
|
||||
Path('~/.local/share/archivebox').expanduser() / get_collection_id(), # ~/.local/share/archivebox/abc5
|
||||
STORAGE_CONFIG.LIB_DIR, # <user-specified>
|
||||
CONSTANTS.DEFAULT_LIB_DIR, # ./data/lib/arm64-linux-docker
|
||||
Path("/usr/local/share/archivebox") / get_collection_id(), # /usr/local/share/archivebox/abc5
|
||||
*(
|
||||
[Path("/opt/homebrew/share/archivebox") / get_collection_id()] if os.path.isfile("/opt/homebrew/bin/archivebox") else []
|
||||
), # /opt/homebrew/share/archivebox/abc5
|
||||
Path("~/.local/share/archivebox").expanduser() / get_collection_id(), # ~/.local/share/archivebox/abc5
|
||||
]
|
||||
|
||||
|
||||
for candidate in CANDIDATES:
|
||||
try:
|
||||
create_and_chown_dir(candidate)
|
||||
@@ -214,10 +234,9 @@ def get_or_create_working_lib_dir(autofix=True, quiet=False):
|
||||
if autofix and STORAGE_CONFIG.LIB_DIR != candidate:
|
||||
STORAGE_CONFIG.update_in_place(LIB_DIR=candidate)
|
||||
return candidate
|
||||
|
||||
if not quiet:
|
||||
raise OSError(f'ArchiveBox is unable to find a writable LIB_DIR, tried {CANDIDATES}!')
|
||||
|
||||
if not quiet:
|
||||
raise OSError(f"ArchiveBox is unable to find a writable LIB_DIR, tried {CANDIDATES}!")
|
||||
|
||||
|
||||
@cache
|
||||
@@ -229,57 +248,68 @@ def get_data_locations():
|
||||
tmp_dir = get_or_create_working_tmp_dir(autofix=True, quiet=True) or STORAGE_CONFIG.TMP_DIR
|
||||
except Exception:
|
||||
tmp_dir = STORAGE_CONFIG.TMP_DIR
|
||||
|
||||
return benedict({
|
||||
"DATA_DIR": {
|
||||
"path": DATA_DIR.resolve(),
|
||||
"enabled": True,
|
||||
"is_valid": os.path.isdir(DATA_DIR) and os.access(DATA_DIR, os.R_OK) and os.access(DATA_DIR, os.W_OK),
|
||||
"is_mount": os.path.ismount(DATA_DIR.resolve()),
|
||||
|
||||
return benedict(
|
||||
{
|
||||
"DATA_DIR": {
|
||||
"path": DATA_DIR.resolve(),
|
||||
"enabled": True,
|
||||
"is_valid": os.path.isdir(DATA_DIR) and os.access(DATA_DIR, os.R_OK) and os.access(DATA_DIR, os.W_OK),
|
||||
"is_mount": os.path.ismount(DATA_DIR.resolve()),
|
||||
},
|
||||
"CONFIG_FILE": {
|
||||
"path": CONSTANTS.CONFIG_FILE.resolve(),
|
||||
"enabled": True,
|
||||
"is_valid": os.path.isfile(CONSTANTS.CONFIG_FILE)
|
||||
and os.access(CONSTANTS.CONFIG_FILE, os.R_OK)
|
||||
and os.access(CONSTANTS.CONFIG_FILE, os.W_OK),
|
||||
},
|
||||
"SQL_INDEX": {
|
||||
"path": DATABASE_FILE.resolve(),
|
||||
"enabled": True,
|
||||
"is_valid": os.path.isfile(DATABASE_FILE) and os.access(DATABASE_FILE, os.R_OK) and os.access(DATABASE_FILE, os.W_OK),
|
||||
"is_mount": os.path.ismount(DATABASE_FILE.resolve()),
|
||||
},
|
||||
"ARCHIVE_DIR": {
|
||||
"path": ARCHIVE_DIR.resolve(),
|
||||
"enabled": True,
|
||||
"is_valid": os.path.isdir(ARCHIVE_DIR) and os.access(ARCHIVE_DIR, os.R_OK) and os.access(ARCHIVE_DIR, os.W_OK),
|
||||
"is_mount": os.path.ismount(ARCHIVE_DIR.resolve()),
|
||||
},
|
||||
"SOURCES_DIR": {
|
||||
"path": CONSTANTS.SOURCES_DIR.resolve(),
|
||||
"enabled": True,
|
||||
"is_valid": os.path.isdir(CONSTANTS.SOURCES_DIR)
|
||||
and os.access(CONSTANTS.SOURCES_DIR, os.R_OK)
|
||||
and os.access(CONSTANTS.SOURCES_DIR, os.W_OK),
|
||||
},
|
||||
"PERSONAS_DIR": {
|
||||
"path": CONSTANTS.PERSONAS_DIR.resolve(),
|
||||
"enabled": os.path.isdir(CONSTANTS.PERSONAS_DIR),
|
||||
"is_valid": os.path.isdir(CONSTANTS.PERSONAS_DIR)
|
||||
and os.access(CONSTANTS.PERSONAS_DIR, os.R_OK)
|
||||
and os.access(CONSTANTS.PERSONAS_DIR, os.W_OK), # read + write
|
||||
},
|
||||
"LOGS_DIR": {
|
||||
"path": CONSTANTS.LOGS_DIR.resolve(),
|
||||
"enabled": True,
|
||||
"is_valid": os.path.isdir(CONSTANTS.LOGS_DIR)
|
||||
and os.access(CONSTANTS.LOGS_DIR, os.R_OK)
|
||||
and os.access(CONSTANTS.LOGS_DIR, os.W_OK), # read + write
|
||||
},
|
||||
"TMP_DIR": {
|
||||
"path": tmp_dir.resolve(),
|
||||
"enabled": True,
|
||||
"is_valid": os.path.isdir(tmp_dir) and os.access(tmp_dir, os.R_OK) and os.access(tmp_dir, os.W_OK), # read + write
|
||||
},
|
||||
# "CACHE_DIR": {
|
||||
# "path": CACHE_DIR.resolve(),
|
||||
# "enabled": True,
|
||||
# "is_valid": os.access(CACHE_DIR, os.R_OK) and os.access(CACHE_DIR, os.W_OK), # read + write
|
||||
# },
|
||||
},
|
||||
"CONFIG_FILE": {
|
||||
"path": CONSTANTS.CONFIG_FILE.resolve(),
|
||||
"enabled": True,
|
||||
"is_valid": os.path.isfile(CONSTANTS.CONFIG_FILE) and os.access(CONSTANTS.CONFIG_FILE, os.R_OK) and os.access(CONSTANTS.CONFIG_FILE, os.W_OK),
|
||||
},
|
||||
"SQL_INDEX": {
|
||||
"path": DATABASE_FILE.resolve(),
|
||||
"enabled": True,
|
||||
"is_valid": os.path.isfile(DATABASE_FILE) and os.access(DATABASE_FILE, os.R_OK) and os.access(DATABASE_FILE, os.W_OK),
|
||||
"is_mount": os.path.ismount(DATABASE_FILE.resolve()),
|
||||
},
|
||||
"ARCHIVE_DIR": {
|
||||
"path": ARCHIVE_DIR.resolve(),
|
||||
"enabled": True,
|
||||
"is_valid": os.path.isdir(ARCHIVE_DIR) and os.access(ARCHIVE_DIR, os.R_OK) and os.access(ARCHIVE_DIR, os.W_OK),
|
||||
"is_mount": os.path.ismount(ARCHIVE_DIR.resolve()),
|
||||
},
|
||||
"SOURCES_DIR": {
|
||||
"path": CONSTANTS.SOURCES_DIR.resolve(),
|
||||
"enabled": True,
|
||||
"is_valid": os.path.isdir(CONSTANTS.SOURCES_DIR) and os.access(CONSTANTS.SOURCES_DIR, os.R_OK) and os.access(CONSTANTS.SOURCES_DIR, os.W_OK),
|
||||
},
|
||||
"PERSONAS_DIR": {
|
||||
"path": CONSTANTS.PERSONAS_DIR.resolve(),
|
||||
"enabled": os.path.isdir(CONSTANTS.PERSONAS_DIR),
|
||||
"is_valid": os.path.isdir(CONSTANTS.PERSONAS_DIR) and os.access(CONSTANTS.PERSONAS_DIR, os.R_OK) and os.access(CONSTANTS.PERSONAS_DIR, os.W_OK), # read + write
|
||||
},
|
||||
"LOGS_DIR": {
|
||||
"path": CONSTANTS.LOGS_DIR.resolve(),
|
||||
"enabled": True,
|
||||
"is_valid": os.path.isdir(CONSTANTS.LOGS_DIR) and os.access(CONSTANTS.LOGS_DIR, os.R_OK) and os.access(CONSTANTS.LOGS_DIR, os.W_OK), # read + write
|
||||
},
|
||||
'TMP_DIR': {
|
||||
'path': tmp_dir.resolve(),
|
||||
'enabled': True,
|
||||
'is_valid': os.path.isdir(tmp_dir) and os.access(tmp_dir, os.R_OK) and os.access(tmp_dir, os.W_OK), # read + write
|
||||
},
|
||||
# "CACHE_DIR": {
|
||||
# "path": CACHE_DIR.resolve(),
|
||||
# "enabled": True,
|
||||
# "is_valid": os.access(CACHE_DIR, os.R_OK) and os.access(CACHE_DIR, os.W_OK), # read + write
|
||||
# },
|
||||
})
|
||||
)
|
||||
|
||||
|
||||
@cache
|
||||
def get_code_locations():
|
||||
@@ -291,41 +321,45 @@ def get_code_locations():
|
||||
except Exception:
|
||||
lib_dir = STORAGE_CONFIG.LIB_DIR
|
||||
|
||||
lib_bin_dir = lib_dir / 'bin'
|
||||
|
||||
return benedict({
|
||||
'PACKAGE_DIR': {
|
||||
'path': (PACKAGE_DIR).resolve(),
|
||||
'enabled': True,
|
||||
'is_valid': os.access(PACKAGE_DIR / '__main__.py', os.X_OK), # executable
|
||||
},
|
||||
'TEMPLATES_DIR': {
|
||||
'path': CONSTANTS.TEMPLATES_DIR.resolve(),
|
||||
'enabled': True,
|
||||
'is_valid': os.access(CONSTANTS.STATIC_DIR, os.R_OK) and os.access(CONSTANTS.STATIC_DIR, os.X_OK), # read + list
|
||||
},
|
||||
'CUSTOM_TEMPLATES_DIR': {
|
||||
'path': STORAGE_CONFIG.CUSTOM_TEMPLATES_DIR.resolve(),
|
||||
'enabled': os.path.isdir(STORAGE_CONFIG.CUSTOM_TEMPLATES_DIR),
|
||||
'is_valid': os.path.isdir(STORAGE_CONFIG.CUSTOM_TEMPLATES_DIR) and os.access(STORAGE_CONFIG.CUSTOM_TEMPLATES_DIR, os.R_OK), # read
|
||||
},
|
||||
'USER_PLUGINS_DIR': {
|
||||
'path': CONSTANTS.USER_PLUGINS_DIR.resolve(),
|
||||
'enabled': os.path.isdir(CONSTANTS.USER_PLUGINS_DIR),
|
||||
'is_valid': os.path.isdir(CONSTANTS.USER_PLUGINS_DIR) and os.access(CONSTANTS.USER_PLUGINS_DIR, os.R_OK), # read
|
||||
},
|
||||
'LIB_DIR': {
|
||||
'path': lib_dir.resolve(),
|
||||
'enabled': True,
|
||||
'is_valid': os.path.isdir(lib_dir) and os.access(lib_dir, os.R_OK) and os.access(lib_dir, os.W_OK), # read + write
|
||||
},
|
||||
'LIB_BIN_DIR': {
|
||||
'path': lib_bin_dir.resolve(),
|
||||
'enabled': True,
|
||||
'is_valid': os.path.isdir(lib_bin_dir) and os.access(lib_bin_dir, os.R_OK) and os.access(lib_bin_dir, os.W_OK), # read + write
|
||||
},
|
||||
})
|
||||
lib_bin_dir = lib_dir / "bin"
|
||||
|
||||
return benedict(
|
||||
{
|
||||
"PACKAGE_DIR": {
|
||||
"path": (PACKAGE_DIR).resolve(),
|
||||
"enabled": True,
|
||||
"is_valid": os.access(PACKAGE_DIR / "__main__.py", os.X_OK), # executable
|
||||
},
|
||||
"TEMPLATES_DIR": {
|
||||
"path": CONSTANTS.TEMPLATES_DIR.resolve(),
|
||||
"enabled": True,
|
||||
"is_valid": os.access(CONSTANTS.STATIC_DIR, os.R_OK) and os.access(CONSTANTS.STATIC_DIR, os.X_OK), # read + list
|
||||
},
|
||||
"CUSTOM_TEMPLATES_DIR": {
|
||||
"path": STORAGE_CONFIG.CUSTOM_TEMPLATES_DIR.resolve(),
|
||||
"enabled": os.path.isdir(STORAGE_CONFIG.CUSTOM_TEMPLATES_DIR),
|
||||
"is_valid": os.path.isdir(STORAGE_CONFIG.CUSTOM_TEMPLATES_DIR)
|
||||
and os.access(STORAGE_CONFIG.CUSTOM_TEMPLATES_DIR, os.R_OK), # read
|
||||
},
|
||||
"USER_PLUGINS_DIR": {
|
||||
"path": CONSTANTS.USER_PLUGINS_DIR.resolve(),
|
||||
"enabled": os.path.isdir(CONSTANTS.USER_PLUGINS_DIR),
|
||||
"is_valid": os.path.isdir(CONSTANTS.USER_PLUGINS_DIR) and os.access(CONSTANTS.USER_PLUGINS_DIR, os.R_OK), # read
|
||||
},
|
||||
"LIB_DIR": {
|
||||
"path": lib_dir.resolve(),
|
||||
"enabled": True,
|
||||
"is_valid": os.path.isdir(lib_dir) and os.access(lib_dir, os.R_OK) and os.access(lib_dir, os.W_OK), # read + write
|
||||
},
|
||||
"LIB_BIN_DIR": {
|
||||
"path": lib_bin_dir.resolve(),
|
||||
"enabled": True,
|
||||
"is_valid": os.path.isdir(lib_bin_dir)
|
||||
and os.access(lib_bin_dir, os.R_OK)
|
||||
and os.access(lib_bin_dir, os.W_OK), # read + write
|
||||
},
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
# @cache
|
||||
@@ -340,9 +374,9 @@ def get_code_locations():
|
||||
# - ok to have a long path (doesnt contain SOCKETS)
|
||||
# """
|
||||
# from .version import detect_installed_version
|
||||
|
||||
|
||||
# HOST_DIRS = PlatformDirs(appname='archivebox', appauthor='ArchiveBox', version=detect_installed_version(), opinion=True, ensure_exists=False)
|
||||
|
||||
|
||||
# lib_dir = tempfile.gettempdir()
|
||||
# try:
|
||||
# if 'SYSTEM_LIB_DIR' in os.environ:
|
||||
@@ -350,7 +384,7 @@ def get_code_locations():
|
||||
# else:
|
||||
# with SudoPermission(uid=ARCHIVEBOX_USER, fallback=True):
|
||||
# lib_dir = HOST_DIRS.site_data_path
|
||||
|
||||
|
||||
# # Docker: /usr/local/share/archivebox/0.8.5
|
||||
# # Ubuntu: /usr/local/share/archivebox/0.8.5
|
||||
# # macOS: /Library/Application Support/archivebox
|
||||
@@ -358,16 +392,16 @@ def get_code_locations():
|
||||
# with SudoPermission(uid=0, fallback=True):
|
||||
# lib_dir.mkdir(parents=True, exist_ok=True)
|
||||
# except PermissionError:
|
||||
# # our user cannot
|
||||
# # our user cannot
|
||||
# lib_dir = HOST_DIRS.user_data_path
|
||||
# lib_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
|
||||
# if IS_ROOT or not dir_is_writable(lib_dir, uid=ARCHIVEBOX_USER):
|
||||
# if IS_ROOT:
|
||||
# # make sure lib dir is owned by the archivebox user, not root
|
||||
# with SudoPermission(uid=0):
|
||||
# if ARCHIVEBOX_USER == 0:
|
||||
# # print(f'[yellow]:warning: Waring: Creating SYSTEM_LIB_DIR {lib_dir} with mode 777 so that non-root archivebox users can share it.[/yellow] (caches shared libs used by archivebox for performance)', file=sys.stderr)
|
||||
# # print(f'[yellow]:warning: Warning: Creating SYSTEM_LIB_DIR {lib_dir} with mode 777 so that non-root archivebox users can share it.[/yellow] (caches shared libs used by archivebox for performance)', file=sys.stderr)
|
||||
# os.system(f'chmod -R 777 "{lib_dir}"')
|
||||
# else:
|
||||
# os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{lib_dir}"')
|
||||
@@ -376,9 +410,9 @@ def get_code_locations():
|
||||
# except (PermissionError, AssertionError):
|
||||
# # raise PermissionError(f'SYSTEM_LIB_DIR {lib_dir} is not writable by archivebox user {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}')
|
||||
# print(f'[red]:cross_mark: ERROR: SYSTEM_LIB_DIR {lib_dir} is not writable by archivebox user {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/red]', file=sys.stderr)
|
||||
|
||||
|
||||
# return lib_dir
|
||||
|
||||
|
||||
# @cache
|
||||
# def get_TMP_DIR():
|
||||
# """
|
||||
@@ -390,9 +424,9 @@ def get_code_locations():
|
||||
# - must be cleared on every archivebox version upgrade
|
||||
# """
|
||||
# from .version import detect_installed_version
|
||||
|
||||
|
||||
# HOST_DIRS = PlatformDirs(appname='archivebox', appauthor='ArchiveBox', version=detect_installed_version(), opinion=True, ensure_exists=False)
|
||||
|
||||
|
||||
# # print('DATA_DIR OWNED BY:', ARCHIVEBOX_USER, ARCHIVEBOX_GROUP)
|
||||
# # print('RUNNING AS:', self.PUID, self.PGID)
|
||||
# run_dir = tempfile.gettempdir()
|
||||
@@ -405,7 +439,7 @@ def get_code_locations():
|
||||
# if IS_ROOT:
|
||||
# with SudoPermission(uid=0, fallback=False):
|
||||
# if ARCHIVEBOX_USER == 0:
|
||||
# # print(f'[yellow]:warning: Waring: Creating SYSTEM_TMP_DIR {run_dir} with mode 777 so that non-root archivebox users can access it.[/yellow]', file=sys.stderr)
|
||||
# # print(f'[yellow]:warning: Warning: Creating SYSTEM_TMP_DIR {run_dir} with mode 777 so that non-root archivebox users can access it.[/yellow]', file=sys.stderr)
|
||||
# os.system(f'chmod -R 777 "{run_dir}"')
|
||||
# else:
|
||||
# os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{run_dir}"')
|
||||
@@ -413,30 +447,30 @@ def get_code_locations():
|
||||
# raise PermissionError()
|
||||
# assert len(str(run_dir / 'supervisord.conf')) < 95, 'SYSTEM_TMP_DIR path is too long, please set SYSTEM_TMP_DIR env variable to a shorter path (unfortunately unix requires socket paths be < 108 chars)'
|
||||
# return run_dir
|
||||
|
||||
|
||||
# run_dir = (HOST_DIRS.site_runtime_path / get_collection_id(DATA_DIR=DATA_DIR)).resolve()
|
||||
# try:
|
||||
# assert len(str(run_dir)) + len('/supervisord.sock') < 95
|
||||
# except AssertionError:
|
||||
# run_dir = Path(tempfile.gettempdir()).resolve() / 'archivebox' / get_collection_id(DATA_DIR=DATA_DIR)
|
||||
# assert len(str(run_dir)) + len('/supervisord.sock') < 95, 'SYSTEM_TMP_DIR path is too long, please set SYSTEM_TMP_DIR env variable to a shorter path (unfortunately unix requires socket paths be < 108 chars)'
|
||||
|
||||
|
||||
# with SudoPermission(uid=0, fallback=True):
|
||||
# run_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
|
||||
# if IS_ROOT or not dir_is_writable(run_dir, uid=ARCHIVEBOX_USER):
|
||||
# if IS_ROOT:
|
||||
# with SudoPermission(uid=0):
|
||||
# if ARCHIVEBOX_USER == 0:
|
||||
# # print(f'[yellow]:warning: Waring: Creating SYSTEM_TMP_DIR {run_dir} with mode 777 so that non-root archivebox users can access it.[/yellow]', file=sys.stderr)
|
||||
# # print(f'[yellow]:warning: Warning: Creating SYSTEM_TMP_DIR {run_dir} with mode 777 so that non-root archivebox users can access it.[/yellow]', file=sys.stderr)
|
||||
# os.system(f'chmod -R 777 "{run_dir}"')
|
||||
# else:
|
||||
# os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{run_dir}"')
|
||||
# else:
|
||||
# raise PermissionError()
|
||||
|
||||
|
||||
# except (PermissionError, AssertionError):
|
||||
# # raise PermissionError(f'SYSTEM_TMP_DIR {run_dir} is not writable by archivebox user {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}')
|
||||
# print(f'[red]:cross_mark: ERROR: SYSTEM_TMP_DIR {run_dir} is not writable by archivebox user {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/red]', file=sys.stderr)
|
||||
|
||||
|
||||
# return run_dir
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
__package__ = 'archivebox.config'
|
||||
__package__ = "archivebox.config"
|
||||
|
||||
import os
|
||||
import pwd
|
||||
@@ -17,26 +17,26 @@ from contextlib import contextmanager
|
||||
DATA_DIR = Path(os.getcwd())
|
||||
|
||||
try:
|
||||
DATA_DIR_STAT = DATA_DIR.stat()
|
||||
DATA_DIR_UID = DATA_DIR_STAT.st_uid
|
||||
DATA_DIR_GID = DATA_DIR_STAT.st_gid
|
||||
DATA_DIR_STAT = DATA_DIR.stat()
|
||||
DATA_DIR_UID = DATA_DIR_STAT.st_uid
|
||||
DATA_DIR_GID = DATA_DIR_STAT.st_gid
|
||||
except PermissionError:
|
||||
DATA_DIR_UID = 0
|
||||
DATA_DIR_GID = 0
|
||||
DATA_DIR_UID = 0
|
||||
DATA_DIR_GID = 0
|
||||
|
||||
DEFAULT_PUID = 911
|
||||
DEFAULT_PGID = 911
|
||||
RUNNING_AS_UID = os.getuid()
|
||||
RUNNING_AS_GID = os.getgid()
|
||||
EUID = os.geteuid()
|
||||
EGID = os.getegid()
|
||||
SUDO_UID = int(os.environ.get('SUDO_UID', 0))
|
||||
SUDO_GID = int(os.environ.get('SUDO_GID', 0))
|
||||
USER: str = Path('~').expanduser().resolve().name
|
||||
HOSTNAME: str = cast(str, max([socket.gethostname(), platform.node()], key=len))
|
||||
DEFAULT_PUID = 911
|
||||
DEFAULT_PGID = 911
|
||||
RUNNING_AS_UID = os.getuid()
|
||||
RUNNING_AS_GID = os.getgid()
|
||||
EUID = os.geteuid()
|
||||
EGID = os.getegid()
|
||||
SUDO_UID = int(os.environ.get("SUDO_UID", 0))
|
||||
SUDO_GID = int(os.environ.get("SUDO_GID", 0))
|
||||
USER: str = Path("~").expanduser().resolve().name
|
||||
HOSTNAME: str = cast(str, max([socket.gethostname(), platform.node()], key=len))
|
||||
|
||||
IS_ROOT = RUNNING_AS_UID == 0
|
||||
IN_DOCKER = os.environ.get('IN_DOCKER', False) in ('1', 'true', 'True', 'TRUE', 'yes')
|
||||
IN_DOCKER = os.environ.get("IN_DOCKER", False) in ("1", "true", "True", "TRUE", "yes")
|
||||
# IN_DOCKER_COMPOSE = # TODO: figure out a way to detect if running in docker compose
|
||||
|
||||
|
||||
@@ -47,74 +47,79 @@ if RUNNING_AS_UID == 0:
|
||||
# if we are running as root it's really hard to figure out what the correct archivebox user should be
|
||||
# as a last resort instead of setting DATA_DIR ownership to 0:0 (which breaks it for non-root users)
|
||||
# check if 911:911 archivebox user exists on host system, and use it instead of 0
|
||||
if pwd.getpwuid(DEFAULT_PUID).pw_name == 'archivebox':
|
||||
if pwd.getpwuid(DEFAULT_PUID).pw_name == "archivebox":
|
||||
FALLBACK_UID = DEFAULT_PUID
|
||||
FALLBACK_GID = DEFAULT_PGID
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
os.environ.setdefault('PUID', str(DATA_DIR_UID or EUID or RUNNING_AS_UID or FALLBACK_UID))
|
||||
os.environ.setdefault('PGID', str(DATA_DIR_GID or EGID or RUNNING_AS_GID or FALLBACK_GID))
|
||||
os.environ.setdefault("PUID", str(DATA_DIR_UID or EUID or RUNNING_AS_UID or FALLBACK_UID))
|
||||
os.environ.setdefault("PGID", str(DATA_DIR_GID or EGID or RUNNING_AS_GID or FALLBACK_GID))
|
||||
|
||||
ARCHIVEBOX_USER = int(os.environ['PUID'])
|
||||
ARCHIVEBOX_GROUP = int(os.environ['PGID'])
|
||||
ARCHIVEBOX_USER = int(os.environ["PUID"])
|
||||
ARCHIVEBOX_GROUP = int(os.environ["PGID"])
|
||||
if not USER:
|
||||
try:
|
||||
# alternative method 1 to get username
|
||||
USER = pwd.getpwuid(ARCHIVEBOX_USER).pw_name
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
if not USER:
|
||||
try:
|
||||
# alternative method 2 to get username
|
||||
import getpass
|
||||
|
||||
USER = getpass.getuser()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
if not USER:
|
||||
try:
|
||||
# alternative method 3 to get username
|
||||
USER = os.getlogin() or 'archivebox'
|
||||
USER = os.getlogin() or "archivebox"
|
||||
except Exception:
|
||||
USER = 'archivebox'
|
||||
|
||||
USER = "archivebox"
|
||||
|
||||
ARCHIVEBOX_USER_EXISTS = False
|
||||
try:
|
||||
pwd.getpwuid(ARCHIVEBOX_USER)
|
||||
ARCHIVEBOX_USER_EXISTS = True
|
||||
except Exception:
|
||||
ARCHIVEBOX_USER_EXISTS = False
|
||||
|
||||
|
||||
|
||||
#############################################################################################
|
||||
|
||||
|
||||
def drop_privileges():
|
||||
"""If running as root, drop privileges to the user that owns the data dir (or PUID)"""
|
||||
|
||||
|
||||
# always run archivebox as the user that owns the data dir, never as root
|
||||
if os.getuid() == 0:
|
||||
# drop permissions to the user that owns the data dir / provided PUID
|
||||
if os.geteuid() != ARCHIVEBOX_USER and ARCHIVEBOX_USER != 0 and ARCHIVEBOX_USER_EXISTS:
|
||||
# drop our effective UID to the archivebox user's UID
|
||||
os.seteuid(ARCHIVEBOX_USER)
|
||||
|
||||
|
||||
# update environment variables so that subprocesses dont try to write to /root
|
||||
pw_record = pwd.getpwuid(ARCHIVEBOX_USER)
|
||||
os.environ['HOME'] = pw_record.pw_dir
|
||||
os.environ['LOGNAME'] = pw_record.pw_name
|
||||
os.environ['USER'] = pw_record.pw_name
|
||||
os.environ["HOME"] = pw_record.pw_dir
|
||||
os.environ["LOGNAME"] = pw_record.pw_name
|
||||
os.environ["USER"] = pw_record.pw_name
|
||||
|
||||
if ARCHIVEBOX_USER == 0 or not ARCHIVEBOX_USER_EXISTS:
|
||||
print('[yellow]:warning: Running as [red]root[/red] is not recommended and may make your [blue]DATA_DIR[/blue] inaccessible to other users on your system.[/yellow] (use [blue]sudo[/blue] instead)', file=sys.stderr)
|
||||
print(
|
||||
"[yellow]:warning: Running as [red]root[/red] is not recommended and may make your [blue]DATA_DIR[/blue] inaccessible to other users on your system.[/yellow] (use [blue]sudo[/blue] instead)",
|
||||
file=sys.stderr,
|
||||
)
|
||||
|
||||
|
||||
@contextmanager
|
||||
def SudoPermission(uid=0, fallback=False):
|
||||
"""Attempt to run code with sudo permissions for a given user (or root)"""
|
||||
|
||||
|
||||
if os.geteuid() == uid:
|
||||
# no need to change effective UID, we are already that user
|
||||
yield
|
||||
@@ -125,7 +130,7 @@ def SudoPermission(uid=0, fallback=False):
|
||||
os.seteuid(uid)
|
||||
except PermissionError as err:
|
||||
if not fallback:
|
||||
raise PermissionError(f'Not enough permissions to run code as uid={uid}, please retry with sudo') from err
|
||||
raise PermissionError(f"Not enough permissions to run code as uid={uid}, please retry with sudo") from err
|
||||
try:
|
||||
# yield back to the caller so they can run code inside context as root
|
||||
yield
|
||||
@@ -135,4 +140,4 @@ def SudoPermission(uid=0, fallback=False):
|
||||
os.seteuid(ARCHIVEBOX_USER)
|
||||
except PermissionError as err:
|
||||
if not fallback:
|
||||
raise PermissionError(f'Failed to revert uid={uid} back to {ARCHIVEBOX_USER} after running code with sudo') from err
|
||||
raise PermissionError(f"Failed to revert uid={uid} back to {ARCHIVEBOX_USER} after running code with sudo") from err
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
__package__ = 'archivebox.config'
|
||||
__package__ = "archivebox.config"
|
||||
|
||||
import os
|
||||
import importlib.metadata
|
||||
@@ -6,71 +6,71 @@ import importlib.metadata
|
||||
from pathlib import Path
|
||||
from functools import cache
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
|
||||
#############################################################################################
|
||||
|
||||
IN_DOCKER = os.environ.get('IN_DOCKER', False) in ('1', 'true', 'True', 'TRUE', 'yes')
|
||||
IN_DOCKER = os.environ.get("IN_DOCKER", False) in ("1", "true", "True", "TRUE", "yes")
|
||||
|
||||
PACKAGE_DIR: Path = Path(__file__).resolve().parent.parent # archivebox source code dir
|
||||
DATA_DIR: Path = Path(os.environ.get('DATA_DIR', os.getcwd())).resolve() # archivebox user data dir
|
||||
ARCHIVE_DIR: Path = DATA_DIR / 'archive' # archivebox snapshot data dir
|
||||
PACKAGE_DIR: Path = Path(__file__).resolve().parent.parent # archivebox source code dir
|
||||
DATA_DIR: Path = Path(os.environ.get("DATA_DIR", os.getcwd())).resolve() # archivebox user data dir
|
||||
ARCHIVE_DIR: Path = DATA_DIR / "archive" # archivebox snapshot data dir
|
||||
|
||||
#############################################################################################
|
||||
|
||||
|
||||
@cache
|
||||
def detect_installed_version(PACKAGE_DIR: Path=PACKAGE_DIR):
|
||||
def detect_installed_version(PACKAGE_DIR: Path = PACKAGE_DIR):
|
||||
"""Autodetect the installed archivebox version by using pip package metadata, pyproject.toml file, or package.json file"""
|
||||
try:
|
||||
# if in production install, use pip-installed package metadata
|
||||
return importlib.metadata.version('archivebox').strip()
|
||||
return importlib.metadata.version("archivebox").strip()
|
||||
except importlib.metadata.PackageNotFoundError:
|
||||
pass
|
||||
|
||||
try:
|
||||
# if in dev Git repo dir, use pyproject.toml file
|
||||
pyproject_config = (PACKAGE_DIR.parent / 'pyproject.toml').read_text().split('\n')
|
||||
pyproject_config = (PACKAGE_DIR.parent / "pyproject.toml").read_text().split("\n")
|
||||
for line in pyproject_config:
|
||||
if line.startswith('version = '):
|
||||
return line.split(' = ', 1)[-1].strip('"').strip()
|
||||
if line.startswith("version = "):
|
||||
return line.split(" = ", 1)[-1].strip('"').strip()
|
||||
except FileNotFoundError:
|
||||
# building docs, pyproject.toml is not available
|
||||
pass
|
||||
|
||||
# raise Exception('Failed to detect installed archivebox version!')
|
||||
return 'dev'
|
||||
return "dev"
|
||||
|
||||
|
||||
@cache
|
||||
def get_COMMIT_HASH() -> Optional[str]:
|
||||
def get_COMMIT_HASH() -> str | None:
|
||||
try:
|
||||
git_dir = PACKAGE_DIR.parent / '.git'
|
||||
ref = (git_dir / 'HEAD').read_text().strip().split(' ')[-1]
|
||||
git_dir = PACKAGE_DIR.parent / ".git"
|
||||
ref = (git_dir / "HEAD").read_text().strip().split(" ")[-1]
|
||||
commit_hash = git_dir.joinpath(ref).read_text().strip()
|
||||
return commit_hash
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
try:
|
||||
return list((PACKAGE_DIR.parent / '.git/refs/heads/').glob('*'))[0].read_text().strip()
|
||||
return list((PACKAGE_DIR.parent / ".git/refs/heads/").glob("*"))[0].read_text().strip()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
return None
|
||||
|
||||
|
||||
|
||||
@cache
|
||||
def get_BUILD_TIME() -> str:
|
||||
if IN_DOCKER:
|
||||
try:
|
||||
# if we're in the archivebox official docker image, /VERSION.txt will contain the build time
|
||||
docker_build_end_time = Path('/VERSION.txt').read_text().rsplit('BUILD_END_TIME=')[-1].split('\n', 1)[0]
|
||||
docker_build_end_time = Path("/VERSION.txt").read_text().rsplit("BUILD_END_TIME=")[-1].split("\n", 1)[0]
|
||||
return docker_build_end_time
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
src_last_modified_unix_timestamp = (PACKAGE_DIR / 'README.md').stat().st_mtime
|
||||
return datetime.fromtimestamp(src_last_modified_unix_timestamp).strftime('%Y-%m-%d %H:%M:%S %s')
|
||||
src_last_modified_unix_timestamp = (PACKAGE_DIR / "README.md").stat().st_mtime
|
||||
return datetime.fromtimestamp(src_last_modified_unix_timestamp).strftime("%Y-%m-%d %H:%M:%S %s")
|
||||
|
||||
|
||||
# def get_versions_available_on_github(config):
|
||||
@@ -78,14 +78,14 @@ def get_BUILD_TIME() -> str:
|
||||
# returns a dictionary containing the ArchiveBox GitHub release info for
|
||||
# the recommended upgrade version and the currently installed version
|
||||
# """
|
||||
|
||||
|
||||
# # we only want to perform the (relatively expensive) check for new versions
|
||||
# # when its most relevant, e.g. when the user runs a long-running command
|
||||
# subcommand_run_by_user = sys.argv[3] if len(sys.argv) > 3 else 'help'
|
||||
# long_running_commands = ('add', 'schedule', 'update', 'status', 'server')
|
||||
# if subcommand_run_by_user not in long_running_commands:
|
||||
# return None
|
||||
|
||||
|
||||
# github_releases_api = "https://api.github.com/repos/ArchiveBox/ArchiveBox/releases"
|
||||
# response = requests.get(github_releases_api)
|
||||
# if response.status_code != 200:
|
||||
@@ -104,7 +104,7 @@ def get_BUILD_TIME() -> str:
|
||||
# break
|
||||
|
||||
# current_version = current_version or all_releases[-1]
|
||||
|
||||
|
||||
# # recommended version is whatever comes after current_version in the release list
|
||||
# # (perhaps too conservative to only recommend upgrading one version at a time, but it's safest)
|
||||
# try:
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
__package__ = 'archivebox.config'
|
||||
__package__ = "archivebox.config"
|
||||
|
||||
import html
|
||||
import json
|
||||
@@ -6,7 +6,8 @@ import os
|
||||
import inspect
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Any, Callable, Dict
|
||||
from typing import Any
|
||||
from collections.abc import Callable
|
||||
from urllib.parse import quote, urlencode
|
||||
from django.http import HttpRequest
|
||||
from django.utils import timezone
|
||||
@@ -21,30 +22,48 @@ from archivebox.misc.util import parse_date
|
||||
|
||||
from archivebox.machine.models import Binary
|
||||
|
||||
ABX_PLUGINS_DOCS_BASE_URL = 'https://archivebox.github.io/abx-plugins/'
|
||||
ABX_PLUGINS_GITHUB_BASE_URL = 'https://github.com/ArchiveBox/abx-plugins/tree/main/abx_plugins/plugins/'
|
||||
LIVE_CONFIG_BASE_URL = '/admin/environment/config/'
|
||||
ENVIRONMENT_BINARIES_BASE_URL = '/admin/environment/binaries/'
|
||||
INSTALLED_BINARIES_BASE_URL = '/admin/machine/binary/'
|
||||
ABX_PLUGINS_DOCS_BASE_URL = "https://archivebox.github.io/abx-plugins/"
|
||||
ABX_PLUGINS_GITHUB_BASE_URL = "https://github.com/ArchiveBox/abx-plugins/tree/main/abx_plugins/plugins/"
|
||||
LIVE_CONFIG_BASE_URL = "/admin/environment/config/"
|
||||
ENVIRONMENT_BINARIES_BASE_URL = "/admin/environment/binaries/"
|
||||
INSTALLED_BINARIES_BASE_URL = "/admin/machine/binary/"
|
||||
|
||||
|
||||
# Common binaries to check for
|
||||
KNOWN_BINARIES = [
|
||||
'wget', 'curl', 'chromium', 'chrome', 'google-chrome', 'google-chrome-stable',
|
||||
'node', 'npm', 'npx', 'yt-dlp',
|
||||
'git', 'singlefile', 'readability-extractor', 'mercury-parser',
|
||||
'python3', 'python', 'bash', 'zsh',
|
||||
'ffmpeg', 'ripgrep', 'rg', 'sonic', 'archivebox',
|
||||
"wget",
|
||||
"curl",
|
||||
"chromium",
|
||||
"chrome",
|
||||
"google-chrome",
|
||||
"google-chrome-stable",
|
||||
"node",
|
||||
"npm",
|
||||
"npx",
|
||||
"yt-dlp",
|
||||
"git",
|
||||
"singlefile",
|
||||
"readability-extractor",
|
||||
"mercury-parser",
|
||||
"python3",
|
||||
"python",
|
||||
"bash",
|
||||
"zsh",
|
||||
"ffmpeg",
|
||||
"ripgrep",
|
||||
"rg",
|
||||
"sonic",
|
||||
"archivebox",
|
||||
]
|
||||
|
||||
CANONICAL_BINARY_ALIASES = {
|
||||
'youtube-dl': 'yt-dlp',
|
||||
'ytdlp': 'yt-dlp',
|
||||
"youtube-dl": "yt-dlp",
|
||||
"ytdlp": "yt-dlp",
|
||||
}
|
||||
|
||||
|
||||
def is_superuser(request: HttpRequest) -> bool:
|
||||
return bool(getattr(request.user, 'is_superuser', False))
|
||||
return bool(getattr(request.user, "is_superuser", False))
|
||||
|
||||
|
||||
def format_parsed_datetime(value: object) -> str:
|
||||
@@ -55,9 +74,9 @@ def format_parsed_datetime(value: object) -> str:
|
||||
JSON_TOKEN_RE = re.compile(
|
||||
r'(?P<key>"(?:\\u[a-fA-F0-9]{4}|\\[^u]|[^\\"])*")(?=\s*:)'
|
||||
r'|(?P<string>"(?:\\u[a-fA-F0-9]{4}|\\[^u]|[^\\"])*")'
|
||||
r'|(?P<boolean>\btrue\b|\bfalse\b)'
|
||||
r'|(?P<null>\bnull\b)'
|
||||
r'|(?P<number>-?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?)'
|
||||
r"|(?P<boolean>\btrue\b|\bfalse\b)"
|
||||
r"|(?P<null>\bnull\b)"
|
||||
r"|(?P<number>-?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?)",
|
||||
)
|
||||
|
||||
|
||||
@@ -65,13 +84,14 @@ def render_code_block(text: str, *, highlighted: bool = False) -> str:
|
||||
code = html.escape(text, quote=False)
|
||||
|
||||
if highlighted:
|
||||
|
||||
def _wrap_token(match: re.Match[str]) -> str:
|
||||
styles = {
|
||||
'key': 'color: #0550ae;',
|
||||
'string': 'color: #0a7f45;',
|
||||
'boolean': 'color: #8250df; font-weight: 600;',
|
||||
'null': 'color: #6e7781; font-style: italic;',
|
||||
'number': 'color: #b35900;',
|
||||
"key": "color: #0550ae;",
|
||||
"string": "color: #0a7f45;",
|
||||
"boolean": "color: #8250df; font-weight: 600;",
|
||||
"null": "color: #6e7781; font-style: italic;",
|
||||
"number": "color: #b35900;",
|
||||
}
|
||||
token_type = next(name for name, value in match.groupdict().items() if value is not None)
|
||||
return f'<span style="{styles[token_type]}">{match.group(0)}</span>'
|
||||
@@ -82,9 +102,9 @@ def render_code_block(text: str, *, highlighted: bool = False) -> str:
|
||||
'<pre style="max-height: 600px; overflow: auto; background: #f6f8fa; '
|
||||
'border: 1px solid #d0d7de; border-radius: 6px; padding: 12px; margin: 0;">'
|
||||
'<code style="font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, '
|
||||
'\'Liberation Mono\', monospace; white-space: pre; line-height: 1.5;">'
|
||||
f'{code}'
|
||||
'</code></pre>'
|
||||
"'Liberation Mono', monospace; white-space: pre; line-height: 1.5;\">"
|
||||
f"{code}"
|
||||
"</code></pre>"
|
||||
)
|
||||
|
||||
|
||||
@@ -93,34 +113,35 @@ def render_highlighted_json_block(value: Any) -> str:
|
||||
|
||||
|
||||
def get_plugin_docs_url(plugin_name: str) -> str:
|
||||
return f'{ABX_PLUGINS_DOCS_BASE_URL}#{plugin_name}'
|
||||
return f"{ABX_PLUGINS_DOCS_BASE_URL}#{plugin_name}"
|
||||
|
||||
|
||||
def get_plugin_hook_source_url(plugin_name: str, hook_name: str) -> str:
|
||||
return f'{ABX_PLUGINS_GITHUB_BASE_URL}{quote(plugin_name)}/{quote(hook_name)}'
|
||||
return f"{ABX_PLUGINS_GITHUB_BASE_URL}{quote(plugin_name)}/{quote(hook_name)}"
|
||||
|
||||
|
||||
def get_live_config_url(key: str) -> str:
|
||||
return f'{LIVE_CONFIG_BASE_URL}{quote(key)}/'
|
||||
return f"{LIVE_CONFIG_BASE_URL}{quote(key)}/"
|
||||
|
||||
|
||||
def get_environment_binary_url(name: str) -> str:
|
||||
return f'{ENVIRONMENT_BINARIES_BASE_URL}{quote(name)}/'
|
||||
return f"{ENVIRONMENT_BINARIES_BASE_URL}{quote(name)}/"
|
||||
|
||||
|
||||
def get_installed_binary_change_url(name: str, binary: Any) -> str | None:
|
||||
binary_id = getattr(binary, 'id', None)
|
||||
binary_id = getattr(binary, "id", None)
|
||||
if not binary_id:
|
||||
return None
|
||||
|
||||
base_url = getattr(binary, 'admin_change_url', None) or f'{INSTALLED_BINARIES_BASE_URL}{binary_id}/change/'
|
||||
changelist_filters = urlencode({'q': canonical_binary_name(name)})
|
||||
return f'{base_url}?{urlencode({"_changelist_filters": changelist_filters})}'
|
||||
base_url = getattr(binary, "admin_change_url", None) or f"{INSTALLED_BINARIES_BASE_URL}{binary_id}/change/"
|
||||
changelist_filters = urlencode({"q": canonical_binary_name(name)})
|
||||
return f"{base_url}?{urlencode({'_changelist_filters': changelist_filters})}"
|
||||
|
||||
|
||||
def get_machine_admin_url() -> str | None:
|
||||
try:
|
||||
from archivebox.machine.models import Machine
|
||||
|
||||
return Machine.current().admin_change_url
|
||||
except Exception:
|
||||
return None
|
||||
@@ -130,12 +151,14 @@ def render_code_tag_list(values: list[str]) -> str:
|
||||
if not values:
|
||||
return '<span style="color: #6e7781;">(none)</span>'
|
||||
|
||||
tags = ''.join(
|
||||
str(format_html(
|
||||
'<code style="display: inline-block; margin: 0 6px 6px 0; padding: 2px 6px; '
|
||||
'background: #f6f8fa; border: 1px solid #d0d7de; border-radius: 999px;">{}</code>',
|
||||
value,
|
||||
))
|
||||
tags = "".join(
|
||||
str(
|
||||
format_html(
|
||||
'<code style="display: inline-block; margin: 0 6px 6px 0; padding: 2px 6px; '
|
||||
'background: #f6f8fa; border: 1px solid #d0d7de; border-radius: 999px;">{}</code>',
|
||||
value,
|
||||
),
|
||||
)
|
||||
for value in values
|
||||
)
|
||||
return f'<div style="display: flex; flex-wrap: wrap;">{tags}</div>'
|
||||
@@ -143,22 +166,21 @@ def render_code_tag_list(values: list[str]) -> str:
|
||||
|
||||
def render_plugin_metadata_html(config: dict[str, Any]) -> str:
|
||||
rows = (
|
||||
('Title', config.get('title') or '(none)'),
|
||||
('Description', config.get('description') or '(none)'),
|
||||
('Required Plugins', mark_safe(render_link_tag_list(config.get('required_plugins') or [], get_plugin_docs_url))),
|
||||
('Required Binaries', mark_safe(render_link_tag_list(config.get('required_binaries') or [], get_environment_binary_url))),
|
||||
('Output MIME Types', mark_safe(render_code_tag_list(config.get('output_mimetypes') or []))),
|
||||
("Title", config.get("title") or "(none)"),
|
||||
("Description", config.get("description") or "(none)"),
|
||||
("Required Plugins", mark_safe(render_link_tag_list(config.get("required_plugins") or [], get_plugin_docs_url))),
|
||||
("Required Binaries", mark_safe(render_link_tag_list(config.get("required_binaries") or [], get_environment_binary_url))),
|
||||
("Output MIME Types", mark_safe(render_code_tag_list(config.get("output_mimetypes") or []))),
|
||||
)
|
||||
|
||||
rendered_rows = ''.join(
|
||||
str(format_html(
|
||||
'<div style="margin: 0 0 14px 0;">'
|
||||
'<div style="font-weight: 600; margin-bottom: 4px;">{}</div>'
|
||||
'<div>{}</div>'
|
||||
'</div>',
|
||||
label,
|
||||
value,
|
||||
))
|
||||
rendered_rows = "".join(
|
||||
str(
|
||||
format_html(
|
||||
'<div style="margin: 0 0 14px 0;"><div style="font-weight: 600; margin-bottom: 4px;">{}</div><div>{}</div></div>',
|
||||
label,
|
||||
value,
|
||||
),
|
||||
)
|
||||
for label, value in rows
|
||||
)
|
||||
return f'<div style="margin: 4px 0 0 0;">{rendered_rows}</div>'
|
||||
@@ -171,20 +193,28 @@ def render_link_tag_list(values: list[str], url_resolver: Callable[[str], str] |
|
||||
tags = []
|
||||
for value in values:
|
||||
if url_resolver is None:
|
||||
tags.append(str(format_html(
|
||||
'<code style="display: inline-block; margin: 0 6px 6px 0; padding: 2px 6px; '
|
||||
'background: #f6f8fa; border: 1px solid #d0d7de; border-radius: 999px;">{}</code>',
|
||||
value,
|
||||
)))
|
||||
tags.append(
|
||||
str(
|
||||
format_html(
|
||||
'<code style="display: inline-block; margin: 0 6px 6px 0; padding: 2px 6px; '
|
||||
'background: #f6f8fa; border: 1px solid #d0d7de; border-radius: 999px;">{}</code>',
|
||||
value,
|
||||
),
|
||||
),
|
||||
)
|
||||
else:
|
||||
tags.append(str(format_html(
|
||||
'<a href="{}" style="text-decoration: none;">'
|
||||
'<code style="display: inline-block; margin: 0 6px 6px 0; padding: 2px 6px; '
|
||||
'background: #f6f8fa; border: 1px solid #d0d7de; border-radius: 999px;">{}</code>'
|
||||
'</a>',
|
||||
url_resolver(value),
|
||||
value,
|
||||
)))
|
||||
tags.append(
|
||||
str(
|
||||
format_html(
|
||||
'<a href="{}" style="text-decoration: none;">'
|
||||
'<code style="display: inline-block; margin: 0 6px 6px 0; padding: 2px 6px; '
|
||||
'background: #f6f8fa; border: 1px solid #d0d7de; border-radius: 999px;">{}</code>'
|
||||
"</a>",
|
||||
url_resolver(value),
|
||||
value,
|
||||
),
|
||||
),
|
||||
)
|
||||
return f'<div style="display: flex; flex-wrap: wrap;">{"".join(tags)}</div>'
|
||||
|
||||
|
||||
@@ -195,21 +225,21 @@ def render_property_links(prop_name: str, prop_info: dict[str, Any], machine_adm
|
||||
if machine_admin_url:
|
||||
links.append(str(format_html('<a href="{}">Edit override</a>', machine_admin_url)))
|
||||
|
||||
fallback = prop_info.get('x-fallback')
|
||||
fallback = prop_info.get("x-fallback")
|
||||
if isinstance(fallback, str) and fallback:
|
||||
links.append(str(format_html('<a href="{}">Fallback: <code>{}</code></a>', get_live_config_url(fallback), fallback)))
|
||||
|
||||
aliases = prop_info.get('x-aliases') or []
|
||||
aliases = prop_info.get("x-aliases") or []
|
||||
if isinstance(aliases, list):
|
||||
for alias in aliases:
|
||||
if isinstance(alias, str) and alias:
|
||||
links.append(str(format_html('<a href="{}">Alias: <code>{}</code></a>', get_live_config_url(alias), alias)))
|
||||
|
||||
default = prop_info.get('default')
|
||||
if prop_name.endswith('_BINARY') and isinstance(default, str) and default:
|
||||
default = prop_info.get("default")
|
||||
if prop_name.endswith("_BINARY") and isinstance(default, str) and default:
|
||||
links.append(str(format_html('<a href="{}">Binary: <code>{}</code></a>', get_environment_binary_url(default), default)))
|
||||
|
||||
return ' '.join(links)
|
||||
return " ".join(links)
|
||||
|
||||
|
||||
def render_config_properties_html(properties: dict[str, Any], machine_admin_url: str | None) -> str:
|
||||
@@ -221,42 +251,48 @@ def render_config_properties_html(properties: dict[str, Any], machine_admin_url:
|
||||
header_links.insert(0, str(format_html('<a href="{}">Machine Config Editor</a>', machine_admin_url)))
|
||||
|
||||
cards = [
|
||||
f'<div style="margin: 0 0 16px 0;">{" | ".join(header_links)}</div>'
|
||||
f'<div style="margin: 0 0 16px 0;">{" | ".join(header_links)}</div>',
|
||||
]
|
||||
|
||||
for prop_name, prop_info in properties.items():
|
||||
prop_type = prop_info.get('type', 'unknown')
|
||||
prop_type = prop_info.get("type", "unknown")
|
||||
if isinstance(prop_type, list):
|
||||
prop_type = ' | '.join(str(type_name) for type_name in prop_type)
|
||||
prop_desc = prop_info.get('description', '')
|
||||
prop_type = " | ".join(str(type_name) for type_name in prop_type)
|
||||
prop_desc = prop_info.get("description", "")
|
||||
|
||||
default_html = ''
|
||||
if 'default' in prop_info:
|
||||
default_html = str(format_html(
|
||||
'<div style="margin-top: 6px;"><b>Default:</b> <code>{}</code></div>',
|
||||
prop_info['default'],
|
||||
))
|
||||
default_html = ""
|
||||
if "default" in prop_info:
|
||||
default_html = str(
|
||||
format_html(
|
||||
'<div style="margin-top: 6px;"><b>Default:</b> <code>{}</code></div>',
|
||||
prop_info["default"],
|
||||
),
|
||||
)
|
||||
|
||||
description_html = prop_desc or mark_safe('<span style="color: #6e7781;">(no description)</span>')
|
||||
cards.append(str(format_html(
|
||||
'<div style="margin: 0 0 14px 0; padding: 12px; background: #f6f8fa; border: 1px solid #d0d7de; border-radius: 6px;">'
|
||||
'<div style="margin-bottom: 6px;">'
|
||||
'<a href="{}" style="font-weight: 600;"><code>{}</code></a>'
|
||||
' <span style="color: #6e7781;">({})</span>'
|
||||
'</div>'
|
||||
'<div style="margin-bottom: 6px;">{}</div>'
|
||||
'<div style="font-size: 0.95em;">{}</div>'
|
||||
'{}'
|
||||
'</div>',
|
||||
get_live_config_url(prop_name),
|
||||
prop_name,
|
||||
prop_type,
|
||||
description_html,
|
||||
mark_safe(render_property_links(prop_name, prop_info, machine_admin_url)),
|
||||
mark_safe(default_html),
|
||||
)))
|
||||
cards.append(
|
||||
str(
|
||||
format_html(
|
||||
'<div style="margin: 0 0 14px 0; padding: 12px; background: #f6f8fa; border: 1px solid #d0d7de; border-radius: 6px;">'
|
||||
'<div style="margin-bottom: 6px;">'
|
||||
'<a href="{}" style="font-weight: 600;"><code>{}</code></a>'
|
||||
' <span style="color: #6e7781;">({})</span>'
|
||||
"</div>"
|
||||
'<div style="margin-bottom: 6px;">{}</div>'
|
||||
'<div style="font-size: 0.95em;">{}</div>'
|
||||
"{}"
|
||||
"</div>",
|
||||
get_live_config_url(prop_name),
|
||||
prop_name,
|
||||
prop_type,
|
||||
description_html,
|
||||
mark_safe(render_property_links(prop_name, prop_info, machine_admin_url)),
|
||||
mark_safe(default_html),
|
||||
),
|
||||
),
|
||||
)
|
||||
|
||||
return ''.join(cards)
|
||||
return "".join(cards)
|
||||
|
||||
|
||||
def render_hook_links_html(plugin_name: str, hooks: list[str], source: str) -> str:
|
||||
@@ -265,40 +301,47 @@ def render_hook_links_html(plugin_name: str, hooks: list[str], source: str) -> s
|
||||
|
||||
items = []
|
||||
for hook_name in hooks:
|
||||
if source == 'builtin':
|
||||
items.append(str(format_html(
|
||||
'<div style="margin: 0 0 8px 0;">'
|
||||
'<a href="{}" target="_blank" rel="noopener noreferrer"><code>{}</code></a>'
|
||||
'</div>',
|
||||
get_plugin_hook_source_url(plugin_name, hook_name),
|
||||
hook_name,
|
||||
)))
|
||||
if source == "builtin":
|
||||
items.append(
|
||||
str(
|
||||
format_html(
|
||||
'<div style="margin: 0 0 8px 0;"><a href="{}" target="_blank" rel="noopener noreferrer"><code>{}</code></a></div>',
|
||||
get_plugin_hook_source_url(plugin_name, hook_name),
|
||||
hook_name,
|
||||
),
|
||||
),
|
||||
)
|
||||
else:
|
||||
items.append(str(format_html(
|
||||
'<div style="margin: 0 0 8px 0;"><code>{}</code></div>',
|
||||
hook_name,
|
||||
)))
|
||||
return ''.join(items)
|
||||
items.append(
|
||||
str(
|
||||
format_html(
|
||||
'<div style="margin: 0 0 8px 0;"><code>{}</code></div>',
|
||||
hook_name,
|
||||
),
|
||||
),
|
||||
)
|
||||
return "".join(items)
|
||||
|
||||
|
||||
def render_binary_detail_description(name: str, merged: dict[str, Any], db_binary: Any) -> str:
|
||||
installed_binary_url = get_installed_binary_change_url(name, db_binary)
|
||||
|
||||
if installed_binary_url:
|
||||
return str(format_html(
|
||||
'<code>{}</code><br/>'
|
||||
'<a href="{}">View Installed Binary Record</a>',
|
||||
merged['abspath'],
|
||||
installed_binary_url,
|
||||
))
|
||||
return str(
|
||||
format_html(
|
||||
'<code>{}</code><br/><a href="{}">View Installed Binary Record</a>',
|
||||
merged["abspath"],
|
||||
installed_binary_url,
|
||||
),
|
||||
)
|
||||
|
||||
return str(format_html('<code>{}</code>', merged['abspath']))
|
||||
return str(format_html("<code>{}</code>", merged["abspath"]))
|
||||
|
||||
|
||||
def obj_to_yaml(obj: Any, indent: int = 0) -> str:
|
||||
indent_str = " " * indent
|
||||
if indent == 0:
|
||||
indent_str = '\n' # put extra newline between top-level entries
|
||||
indent_str = "\n" # put extra newline between top-level entries
|
||||
|
||||
if isinstance(obj, dict):
|
||||
if not obj:
|
||||
@@ -326,11 +369,11 @@ def obj_to_yaml(obj: Any, indent: int = 0) -> str:
|
||||
return f" {str(obj)}"
|
||||
|
||||
elif callable(obj):
|
||||
source = '\n'.join(
|
||||
'' if 'def ' in line else line
|
||||
for line in inspect.getsource(obj).split('\n')
|
||||
if line.strip()
|
||||
).split('lambda: ')[-1].rstrip(',')
|
||||
source = (
|
||||
"\n".join("" if "def " in line else line for line in inspect.getsource(obj).split("\n") if line.strip())
|
||||
.split("lambda: ")[-1]
|
||||
.rstrip(",")
|
||||
)
|
||||
return f" {indent_str} " + source.replace("\n", f"\n{indent_str} ")
|
||||
|
||||
else:
|
||||
@@ -350,67 +393,64 @@ def _binary_sort_key(binary: Binary) -> tuple[int, int, int, Any]:
|
||||
)
|
||||
|
||||
|
||||
def get_db_binaries_by_name() -> Dict[str, Binary]:
|
||||
grouped: Dict[str, list[Binary]] = {}
|
||||
def get_db_binaries_by_name() -> dict[str, Binary]:
|
||||
grouped: dict[str, list[Binary]] = {}
|
||||
for binary in Binary.objects.all():
|
||||
grouped.setdefault(canonical_binary_name(binary.name), []).append(binary)
|
||||
|
||||
return {
|
||||
name: max(records, key=_binary_sort_key)
|
||||
for name, records in grouped.items()
|
||||
}
|
||||
return {name: max(records, key=_binary_sort_key) for name, records in grouped.items()}
|
||||
|
||||
|
||||
def serialize_binary_record(name: str, binary: Binary | None) -> Dict[str, Any]:
|
||||
def serialize_binary_record(name: str, binary: Binary | None) -> dict[str, Any]:
|
||||
is_installed = bool(binary and binary.status == Binary.StatusChoices.INSTALLED)
|
||||
return {
|
||||
'name': canonical_binary_name(name),
|
||||
'version': str(getattr(binary, 'version', '') or ''),
|
||||
'binprovider': str(getattr(binary, 'binprovider', '') or ''),
|
||||
'abspath': str(getattr(binary, 'abspath', '') or ''),
|
||||
'sha256': str(getattr(binary, 'sha256', '') or ''),
|
||||
'status': str(getattr(binary, 'status', '') or ''),
|
||||
'is_available': is_installed and bool(getattr(binary, 'abspath', '') or ''),
|
||||
"name": canonical_binary_name(name),
|
||||
"version": str(getattr(binary, "version", "") or ""),
|
||||
"binprovider": str(getattr(binary, "binprovider", "") or ""),
|
||||
"abspath": str(getattr(binary, "abspath", "") or ""),
|
||||
"sha256": str(getattr(binary, "sha256", "") or ""),
|
||||
"status": str(getattr(binary, "status", "") or ""),
|
||||
"is_available": is_installed and bool(getattr(binary, "abspath", "") or ""),
|
||||
}
|
||||
|
||||
|
||||
def get_filesystem_plugins() -> Dict[str, Dict[str, Any]]:
|
||||
def get_filesystem_plugins() -> dict[str, dict[str, Any]]:
|
||||
"""Discover plugins from filesystem directories."""
|
||||
import json
|
||||
from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR
|
||||
|
||||
plugins = {}
|
||||
|
||||
for base_dir, source in [(BUILTIN_PLUGINS_DIR, 'builtin'), (USER_PLUGINS_DIR, 'user')]:
|
||||
for base_dir, source in [(BUILTIN_PLUGINS_DIR, "builtin"), (USER_PLUGINS_DIR, "user")]:
|
||||
if not base_dir.exists():
|
||||
continue
|
||||
|
||||
for plugin_dir in base_dir.iterdir():
|
||||
if plugin_dir.is_dir() and not plugin_dir.name.startswith('_'):
|
||||
plugin_id = f'{source}.{plugin_dir.name}'
|
||||
if plugin_dir.is_dir() and not plugin_dir.name.startswith("_"):
|
||||
plugin_id = f"{source}.{plugin_dir.name}"
|
||||
|
||||
# Find hook scripts
|
||||
hooks = []
|
||||
for ext in ('sh', 'py', 'js'):
|
||||
hooks.extend(plugin_dir.glob(f'on_*__*.{ext}'))
|
||||
for ext in ("sh", "py", "js"):
|
||||
hooks.extend(plugin_dir.glob(f"on_*__*.{ext}"))
|
||||
|
||||
# Load config.json if it exists
|
||||
config_file = plugin_dir / 'config.json'
|
||||
config_file = plugin_dir / "config.json"
|
||||
config_data = None
|
||||
if config_file.exists():
|
||||
try:
|
||||
with open(config_file, 'r') as f:
|
||||
with open(config_file) as f:
|
||||
config_data = json.load(f)
|
||||
except (json.JSONDecodeError, IOError):
|
||||
except (json.JSONDecodeError, OSError):
|
||||
config_data = None
|
||||
|
||||
plugins[plugin_id] = {
|
||||
'id': plugin_id,
|
||||
'name': plugin_dir.name,
|
||||
'path': str(plugin_dir),
|
||||
'source': source,
|
||||
'hooks': [str(h.name) for h in hooks],
|
||||
'config': config_data,
|
||||
"id": plugin_id,
|
||||
"name": plugin_dir.name,
|
||||
"path": str(plugin_dir),
|
||||
"source": source,
|
||||
"hooks": [str(h.name) for h in hooks],
|
||||
"config": config_data,
|
||||
}
|
||||
|
||||
return plugins
|
||||
@@ -418,7 +458,7 @@ def get_filesystem_plugins() -> Dict[str, Dict[str, Any]]:
|
||||
|
||||
@render_with_table_view
|
||||
def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
|
||||
assert is_superuser(request), 'Must be a superuser to view configuration settings.'
|
||||
assert is_superuser(request), "Must be a superuser to view configuration settings."
|
||||
|
||||
rows = {
|
||||
"Binary Name": [],
|
||||
@@ -433,16 +473,16 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
|
||||
for name in all_binary_names:
|
||||
merged = serialize_binary_record(name, db_binaries.get(name))
|
||||
|
||||
rows['Binary Name'].append(ItemLink(name, key=name))
|
||||
rows["Binary Name"].append(ItemLink(name, key=name))
|
||||
|
||||
if merged['is_available']:
|
||||
rows['Found Version'].append(f"✅ {merged['version']}" if merged['version'] else '✅ found')
|
||||
rows['Provided By'].append(merged['binprovider'] or '-')
|
||||
rows['Found Abspath'].append(merged['abspath'] or '-')
|
||||
if merged["is_available"]:
|
||||
rows["Found Version"].append(f"✅ {merged['version']}" if merged["version"] else "✅ found")
|
||||
rows["Provided By"].append(merged["binprovider"] or "-")
|
||||
rows["Found Abspath"].append(merged["abspath"] or "-")
|
||||
else:
|
||||
rows['Found Version'].append('❌ missing')
|
||||
rows['Provided By'].append('-')
|
||||
rows['Found Abspath'].append('-')
|
||||
rows["Found Version"].append("❌ missing")
|
||||
rows["Provided By"].append("-")
|
||||
rows["Found Abspath"].append("-")
|
||||
|
||||
return TableContext(
|
||||
title="Binaries",
|
||||
@@ -452,23 +492,23 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
|
||||
|
||||
@render_with_item_view
|
||||
def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
|
||||
assert is_superuser(request), 'Must be a superuser to view configuration settings.'
|
||||
assert is_superuser(request), "Must be a superuser to view configuration settings."
|
||||
key = canonical_binary_name(key)
|
||||
|
||||
db_binary = get_db_binaries_by_name().get(key)
|
||||
merged = serialize_binary_record(key, db_binary)
|
||||
|
||||
if merged['is_available']:
|
||||
if merged["is_available"]:
|
||||
section: SectionData = {
|
||||
"name": key,
|
||||
"description": mark_safe(render_binary_detail_description(key, merged, db_binary)),
|
||||
"fields": {
|
||||
'name': key,
|
||||
'binprovider': merged['binprovider'] or '-',
|
||||
'abspath': merged['abspath'] or 'not found',
|
||||
'version': merged['version'] or 'unknown',
|
||||
'sha256': merged['sha256'],
|
||||
'status': merged['status'],
|
||||
"name": key,
|
||||
"binprovider": merged["binprovider"] or "-",
|
||||
"abspath": merged["abspath"] or "not found",
|
||||
"version": merged["version"] or "unknown",
|
||||
"sha256": merged["sha256"],
|
||||
"status": merged["status"],
|
||||
},
|
||||
"help_texts": {},
|
||||
}
|
||||
@@ -482,11 +522,11 @@ def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
|
||||
"name": key,
|
||||
"description": "No persisted Binary record found",
|
||||
"fields": {
|
||||
'name': key,
|
||||
'binprovider': merged['binprovider'] or 'not recorded',
|
||||
'abspath': merged['abspath'] or 'not recorded',
|
||||
'version': merged['version'] or 'N/A',
|
||||
'status': merged['status'] or 'unrecorded',
|
||||
"name": key,
|
||||
"binprovider": merged["binprovider"] or "not recorded",
|
||||
"abspath": merged["abspath"] or "not recorded",
|
||||
"version": merged["version"] or "N/A",
|
||||
"status": merged["status"] or "unrecorded",
|
||||
},
|
||||
"help_texts": {},
|
||||
}
|
||||
@@ -499,7 +539,7 @@ def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
|
||||
|
||||
@render_with_table_view
|
||||
def plugins_list_view(request: HttpRequest, **kwargs) -> TableContext:
|
||||
assert is_superuser(request), 'Must be a superuser to view configuration settings.'
|
||||
assert is_superuser(request), "Must be a superuser to view configuration settings."
|
||||
|
||||
rows = {
|
||||
"Name": [],
|
||||
@@ -512,26 +552,26 @@ def plugins_list_view(request: HttpRequest, **kwargs) -> TableContext:
|
||||
plugins = get_filesystem_plugins()
|
||||
|
||||
for plugin_id, plugin in plugins.items():
|
||||
rows['Name'].append(ItemLink(plugin['name'], key=plugin_id))
|
||||
rows['Source'].append(plugin['source'])
|
||||
rows['Path'].append(format_html('<code>{}</code>', plugin['path']))
|
||||
rows['Hooks'].append(', '.join(plugin['hooks']) or '(none)')
|
||||
rows["Name"].append(ItemLink(plugin["name"], key=plugin_id))
|
||||
rows["Source"].append(plugin["source"])
|
||||
rows["Path"].append(format_html("<code>{}</code>", plugin["path"]))
|
||||
rows["Hooks"].append(", ".join(plugin["hooks"]) or "(none)")
|
||||
|
||||
# Show config status
|
||||
if plugin.get('config'):
|
||||
config_properties = plugin['config'].get('properties', {})
|
||||
if plugin.get("config"):
|
||||
config_properties = plugin["config"].get("properties", {})
|
||||
config_count = len(config_properties)
|
||||
rows['Config'].append(f'✅ {config_count} properties' if config_count > 0 else '✅ present')
|
||||
rows["Config"].append(f"✅ {config_count} properties" if config_count > 0 else "✅ present")
|
||||
else:
|
||||
rows['Config'].append('❌ none')
|
||||
rows["Config"].append("❌ none")
|
||||
|
||||
if not plugins:
|
||||
# Show a helpful message when no plugins found
|
||||
rows['Name'].append('(no plugins found)')
|
||||
rows['Source'].append('-')
|
||||
rows['Path'].append(mark_safe('<code>abx_plugins/plugins/</code> or <code>data/custom_plugins/</code>'))
|
||||
rows['Hooks'].append('-')
|
||||
rows['Config'].append('-')
|
||||
rows["Name"].append("(no plugins found)")
|
||||
rows["Source"].append("-")
|
||||
rows["Path"].append(mark_safe("<code>abx_plugins/plugins/</code> or <code>data/custom_plugins/</code>"))
|
||||
rows["Hooks"].append("-")
|
||||
rows["Config"].append("-")
|
||||
|
||||
return TableContext(
|
||||
title="Installed plugins",
|
||||
@@ -541,7 +581,7 @@ def plugins_list_view(request: HttpRequest, **kwargs) -> TableContext:
|
||||
|
||||
@render_with_item_view
|
||||
def plugin_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
|
||||
assert is_superuser(request), 'Must be a superuser to view configuration settings.'
|
||||
assert is_superuser(request), "Must be a superuser to view configuration settings."
|
||||
|
||||
plugins = get_filesystem_plugins()
|
||||
|
||||
@@ -549,65 +589,75 @@ def plugin_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
|
||||
if not plugin:
|
||||
return ItemContext(
|
||||
slug=key,
|
||||
title=f'Plugin not found: {key}',
|
||||
title=f"Plugin not found: {key}",
|
||||
data=[],
|
||||
)
|
||||
|
||||
# Base fields that all plugins have
|
||||
docs_url = get_plugin_docs_url(plugin['name'])
|
||||
docs_url = get_plugin_docs_url(plugin["name"])
|
||||
machine_admin_url = get_machine_admin_url()
|
||||
fields = {
|
||||
"id": plugin['id'],
|
||||
"name": plugin['name'],
|
||||
"source": plugin['source'],
|
||||
"id": plugin["id"],
|
||||
"name": plugin["name"],
|
||||
"source": plugin["source"],
|
||||
}
|
||||
|
||||
sections: list[SectionData] = [{
|
||||
"name": plugin['name'],
|
||||
"description": format_html(
|
||||
'<code>{}</code><br/><a href="{}" target="_blank" rel="noopener noreferrer">ABX Plugin Docs</a>',
|
||||
plugin['path'],
|
||||
docs_url,
|
||||
),
|
||||
"fields": fields,
|
||||
"help_texts": {},
|
||||
}]
|
||||
|
||||
if plugin['hooks']:
|
||||
sections.append({
|
||||
"name": "Hooks",
|
||||
"description": mark_safe(render_hook_links_html(plugin['name'], plugin['hooks'], plugin['source'])),
|
||||
"fields": {},
|
||||
sections: list[SectionData] = [
|
||||
{
|
||||
"name": plugin["name"],
|
||||
"description": format_html(
|
||||
'<code>{}</code><br/><a href="{}" target="_blank" rel="noopener noreferrer">ABX Plugin Docs</a>',
|
||||
plugin["path"],
|
||||
docs_url,
|
||||
),
|
||||
"fields": fields,
|
||||
"help_texts": {},
|
||||
})
|
||||
},
|
||||
]
|
||||
|
||||
if plugin.get('config'):
|
||||
sections.append({
|
||||
"name": "Plugin Metadata",
|
||||
"description": mark_safe(render_plugin_metadata_html(plugin['config'])),
|
||||
"fields": {},
|
||||
"help_texts": {},
|
||||
})
|
||||
|
||||
sections.append({
|
||||
"name": "config.json",
|
||||
"description": mark_safe(render_highlighted_json_block(plugin['config'])),
|
||||
"fields": {},
|
||||
"help_texts": {},
|
||||
})
|
||||
|
||||
config_properties = plugin['config'].get('properties', {})
|
||||
if config_properties:
|
||||
sections.append({
|
||||
"name": "Config Properties",
|
||||
"description": mark_safe(render_config_properties_html(config_properties, machine_admin_url)),
|
||||
if plugin["hooks"]:
|
||||
sections.append(
|
||||
{
|
||||
"name": "Hooks",
|
||||
"description": mark_safe(render_hook_links_html(plugin["name"], plugin["hooks"], plugin["source"])),
|
||||
"fields": {},
|
||||
"help_texts": {},
|
||||
})
|
||||
},
|
||||
)
|
||||
|
||||
if plugin.get("config"):
|
||||
sections.append(
|
||||
{
|
||||
"name": "Plugin Metadata",
|
||||
"description": mark_safe(render_plugin_metadata_html(plugin["config"])),
|
||||
"fields": {},
|
||||
"help_texts": {},
|
||||
},
|
||||
)
|
||||
|
||||
sections.append(
|
||||
{
|
||||
"name": "config.json",
|
||||
"description": mark_safe(render_highlighted_json_block(plugin["config"])),
|
||||
"fields": {},
|
||||
"help_texts": {},
|
||||
},
|
||||
)
|
||||
|
||||
config_properties = plugin["config"].get("properties", {})
|
||||
if config_properties:
|
||||
sections.append(
|
||||
{
|
||||
"name": "Config Properties",
|
||||
"description": mark_safe(render_config_properties_html(config_properties, machine_admin_url)),
|
||||
"fields": {},
|
||||
"help_texts": {},
|
||||
},
|
||||
)
|
||||
|
||||
return ItemContext(
|
||||
slug=key,
|
||||
title=plugin['name'],
|
||||
title=plugin["name"],
|
||||
data=sections,
|
||||
)
|
||||
|
||||
@@ -648,20 +698,20 @@ def worker_list_view(request: HttpRequest, **kwargs) -> TableContext:
|
||||
all_config[config_name] = config_data
|
||||
|
||||
# Add top row for supervisord process manager
|
||||
rows["Name"].append(ItemLink('supervisord', key='supervisord'))
|
||||
rows["Name"].append(ItemLink("supervisord", key="supervisord"))
|
||||
supervisor_state = supervisor.getState()
|
||||
rows["State"].append(str(supervisor_state.get('statename') if isinstance(supervisor_state, dict) else ''))
|
||||
rows['PID'].append(str(supervisor.getPID()))
|
||||
rows["Started"].append('-')
|
||||
rows["Command"].append('supervisord --configuration=tmp/supervisord.conf')
|
||||
rows["State"].append(str(supervisor_state.get("statename") if isinstance(supervisor_state, dict) else ""))
|
||||
rows["PID"].append(str(supervisor.getPID()))
|
||||
rows["Started"].append("-")
|
||||
rows["Command"].append("supervisord --configuration=tmp/supervisord.conf")
|
||||
rows["Logfile"].append(
|
||||
format_html(
|
||||
'<a href="/admin/environment/logs/{}/">{}</a>',
|
||||
'supervisord',
|
||||
'logs/supervisord.log',
|
||||
)
|
||||
"supervisord",
|
||||
"logs/supervisord.log",
|
||||
),
|
||||
)
|
||||
rows['Exit Status'].append('0')
|
||||
rows["Exit Status"].append("0")
|
||||
|
||||
# Add a row for each worker process managed by supervisord
|
||||
process_items = supervisor.getAllProcessInfo()
|
||||
@@ -678,15 +728,15 @@ def worker_list_view(request: HttpRequest, **kwargs) -> TableContext:
|
||||
|
||||
rows["Name"].append(ItemLink(proc_name, key=proc_name))
|
||||
rows["State"].append(str(proc_data.get("statename") or ""))
|
||||
rows['PID'].append(proc_description.replace('pid ', ''))
|
||||
rows["PID"].append(proc_description.replace("pid ", ""))
|
||||
rows["Started"].append(format_parsed_datetime(proc_start))
|
||||
rows["Command"].append(str(proc_config.get("command") or ""))
|
||||
rows["Logfile"].append(
|
||||
format_html(
|
||||
'<a href="/admin/environment/logs/{}/">{}</a>',
|
||||
proc_logfile.split("/")[-1].split('.')[0],
|
||||
proc_logfile.split("/")[-1].split(".")[0],
|
||||
proc_logfile,
|
||||
)
|
||||
),
|
||||
)
|
||||
rows["Exit Status"].append(str(proc_data.get("exitstatus") or ""))
|
||||
|
||||
@@ -708,8 +758,8 @@ def worker_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
|
||||
supervisor = get_existing_supervisord_process()
|
||||
if supervisor is None:
|
||||
return ItemContext(
|
||||
slug='none',
|
||||
title='error: No running supervisord process.',
|
||||
slug="none",
|
||||
title="error: No running supervisord process.",
|
||||
data=[],
|
||||
)
|
||||
|
||||
@@ -721,7 +771,7 @@ def worker_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
|
||||
if isinstance(config_data, dict):
|
||||
all_config.append(config_data)
|
||||
|
||||
if key == 'supervisord':
|
||||
if key == "supervisord":
|
||||
relevant_config = CONFIG_FILE.read_text()
|
||||
relevant_logs = str(supervisor.readLog(0, 10_000_000))
|
||||
start_ts = [line for line in relevant_logs.split("\n") if "RPC interface 'supervisor' initialized" in line][-1].split(",", 1)[0]
|
||||
@@ -729,7 +779,7 @@ def worker_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
|
||||
uptime = str(timezone.now() - start_dt).split(".")[0] if start_dt else ""
|
||||
supervisor_state = supervisor.getState()
|
||||
|
||||
proc: Dict[str, object] = {
|
||||
proc: dict[str, object] = {
|
||||
"name": "supervisord",
|
||||
"pid": supervisor.getPID(),
|
||||
"statename": str(supervisor_state.get("statename") if isinstance(supervisor_state, dict) else ""),
|
||||
@@ -737,12 +787,12 @@ def worker_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
|
||||
"stop": None,
|
||||
"exitstatus": "",
|
||||
"stdout_logfile": "logs/supervisord.log",
|
||||
"description": f'pid 000, uptime {uptime}',
|
||||
"description": f"pid 000, uptime {uptime}",
|
||||
}
|
||||
else:
|
||||
worker_data = get_worker(supervisor, key)
|
||||
proc = worker_data if isinstance(worker_data, dict) else {}
|
||||
relevant_config = next((config for config in all_config if config.get('name') == key), {})
|
||||
relevant_config = next((config for config in all_config if config.get("name") == key), {})
|
||||
log_result = supervisor.tailProcessStdoutLog(key, 0, 10_000_000)
|
||||
relevant_logs = str(log_result[0] if isinstance(log_result, tuple) else log_result)
|
||||
|
||||
@@ -775,7 +825,6 @@ def worker_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
|
||||
def log_list_view(request: HttpRequest, **kwargs) -> TableContext:
|
||||
assert is_superuser(request), "Must be a superuser to view configuration settings."
|
||||
|
||||
|
||||
log_files: list[Path] = []
|
||||
for logfile in sorted(CONSTANTS.LOGS_DIR.glob("*.log"), key=os.path.getmtime)[::-1]:
|
||||
if isinstance(logfile, Path):
|
||||
@@ -793,14 +842,14 @@ def log_list_view(request: HttpRequest, **kwargs) -> TableContext:
|
||||
st = logfile.stat()
|
||||
rows["Name"].append(ItemLink("logs" + str(logfile).rsplit("/logs", 1)[-1], key=logfile.name))
|
||||
rows["Last Updated"].append(format_parsed_datetime(st.st_mtime))
|
||||
rows["Size"].append(f'{st.st_size//1000} kb')
|
||||
rows["Size"].append(f"{st.st_size // 1000} kb")
|
||||
|
||||
with open(logfile, 'rb') as f:
|
||||
with open(logfile, "rb") as f:
|
||||
try:
|
||||
f.seek(-1024, os.SEEK_END)
|
||||
except OSError:
|
||||
f.seek(0)
|
||||
last_lines = f.read().decode('utf-8', errors='replace').split("\n")
|
||||
last_lines = f.read().decode("utf-8", errors="replace").split("\n")
|
||||
non_empty_lines = [line for line in last_lines if line.strip()]
|
||||
rows["Most Recent Lines"].append(non_empty_lines[-1])
|
||||
|
||||
@@ -814,7 +863,7 @@ def log_list_view(request: HttpRequest, **kwargs) -> TableContext:
|
||||
def log_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
|
||||
assert is_superuser(request), "Must be a superuser to view configuration settings."
|
||||
|
||||
log_file = [logfile for logfile in CONSTANTS.LOGS_DIR.glob('*.log') if key in logfile.name][0]
|
||||
log_file = [logfile for logfile in CONSTANTS.LOGS_DIR.glob("*.log") if key in logfile.name][0]
|
||||
|
||||
log_text = log_file.read_text()
|
||||
log_stat = log_file.stat()
|
||||
@@ -824,7 +873,7 @@ def log_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
|
||||
"description": key,
|
||||
"fields": {
|
||||
"Path": str(log_file),
|
||||
"Size": f"{log_stat.st_size//1000} kb",
|
||||
"Size": f"{log_stat.st_size // 1000} kb",
|
||||
"Last Updated": format_parsed_datetime(log_stat.st_mtime),
|
||||
"Tail": "\n".join(log_text[-10_000:].split("\n")[-20:]),
|
||||
"Full Log": log_text,
|
||||
|
||||
@@ -1,10 +1,11 @@
|
||||
__package__ = 'archivebox.core'
|
||||
__package__ = "archivebox.core"
|
||||
__order__ = 100
|
||||
|
||||
|
||||
def register_admin(admin_site):
|
||||
"""Register the core.models views (Snapshot, ArchiveResult, Tag, etc.) with the admin site"""
|
||||
from archivebox.core.admin import register_admin as do_register
|
||||
|
||||
do_register(admin_site)
|
||||
|
||||
|
||||
@@ -17,11 +18,12 @@ def get_CONFIG():
|
||||
ARCHIVING_CONFIG,
|
||||
SEARCH_BACKEND_CONFIG,
|
||||
)
|
||||
|
||||
return {
|
||||
'SHELL_CONFIG': SHELL_CONFIG,
|
||||
'STORAGE_CONFIG': STORAGE_CONFIG,
|
||||
'GENERAL_CONFIG': GENERAL_CONFIG,
|
||||
'SERVER_CONFIG': SERVER_CONFIG,
|
||||
'ARCHIVING_CONFIG': ARCHIVING_CONFIG,
|
||||
'SEARCHBACKEND_CONFIG': SEARCH_BACKEND_CONFIG,
|
||||
"SHELL_CONFIG": SHELL_CONFIG,
|
||||
"STORAGE_CONFIG": STORAGE_CONFIG,
|
||||
"GENERAL_CONFIG": GENERAL_CONFIG,
|
||||
"SERVER_CONFIG": SERVER_CONFIG,
|
||||
"ARCHIVING_CONFIG": ARCHIVING_CONFIG,
|
||||
"SEARCHBACKEND_CONFIG": SEARCH_BACKEND_CONFIG,
|
||||
}
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
__package__ = 'archivebox.core'
|
||||
__package__ = "archivebox.core"
|
||||
|
||||
from django.contrib.auth import get_user_model
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
__package__ = 'archivebox.core'
|
||||
__package__ = "archivebox.core"
|
||||
|
||||
import html
|
||||
import json
|
||||
@@ -21,57 +21,45 @@ from django.utils.text import smart_split
|
||||
|
||||
from archivebox.config import DATA_DIR
|
||||
from archivebox.config.common import SERVER_CONFIG
|
||||
from archivebox.misc.paginators import AccelleratedPaginator
|
||||
from archivebox.misc.paginators import AcceleratedPaginator
|
||||
from archivebox.base_models.admin import BaseModelAdmin
|
||||
from archivebox.hooks import get_plugin_icon
|
||||
from archivebox.core.host_utils import build_snapshot_url
|
||||
from archivebox.core.widgets import InlineTagEditorWidget
|
||||
from archivebox.core.views import LIVE_PLUGIN_BASE_URL
|
||||
from archivebox.machine.env_utils import env_to_shell_exports
|
||||
|
||||
|
||||
from archivebox.core.models import ArchiveResult, Snapshot
|
||||
|
||||
|
||||
def _stringify_env_value(value) -> str:
|
||||
if value is None:
|
||||
return ''
|
||||
if isinstance(value, str):
|
||||
return value
|
||||
return json.dumps(value, separators=(',', ':'))
|
||||
|
||||
|
||||
def _quote_shell_string(value: str) -> str:
|
||||
return "'" + str(value).replace("'", "'\"'\"'") + "'"
|
||||
|
||||
|
||||
def _get_replay_source_url(result: ArchiveResult) -> str:
|
||||
process_env = getattr(getattr(result, 'process', None), 'env', None) or {}
|
||||
return str(process_env.get('SOURCE_URL') or result.snapshot.url or '')
|
||||
process_env = getattr(getattr(result, "process", None), "env", None) or {}
|
||||
return str(process_env.get("SOURCE_URL") or result.snapshot.url or "")
|
||||
|
||||
|
||||
def build_abx_dl_display_command(result: ArchiveResult) -> str:
|
||||
source_url = _get_replay_source_url(result)
|
||||
plugin_name = str(result.plugin or '').strip()
|
||||
plugin_name = str(result.plugin or "").strip()
|
||||
if not plugin_name and not source_url:
|
||||
return 'abx-dl'
|
||||
return "abx-dl"
|
||||
if not source_url:
|
||||
return f'abx-dl --plugins={plugin_name}'
|
||||
return f'abx-dl --plugins={plugin_name} {_quote_shell_string(source_url)}'
|
||||
return f"abx-dl --plugins={plugin_name}"
|
||||
return f"abx-dl --plugins={plugin_name} {_quote_shell_string(source_url)}"
|
||||
|
||||
|
||||
def build_abx_dl_replay_command(result: ArchiveResult) -> str:
|
||||
display_command = build_abx_dl_display_command(result)
|
||||
process = getattr(result, 'process', None)
|
||||
env = getattr(process, 'env', None) or {}
|
||||
env_items = ' '.join(
|
||||
f'{key}={shlex.quote(_stringify_env_value(value))}'
|
||||
for key, value in sorted(env.items())
|
||||
if value is not None
|
||||
)
|
||||
process = getattr(result, "process", None)
|
||||
env_items = env_to_shell_exports(getattr(process, "env", None) or {})
|
||||
snapshot_dir = shlex.quote(str(result.snapshot_dir))
|
||||
if env_items:
|
||||
return f'cd {snapshot_dir}; env {env_items} {display_command}'
|
||||
return f'cd {snapshot_dir}; {display_command}'
|
||||
return f"cd {snapshot_dir}; env {env_items} {display_command}"
|
||||
return f"cd {snapshot_dir}; {display_command}"
|
||||
|
||||
|
||||
def get_plugin_admin_url(plugin_name: str) -> str:
|
||||
@@ -81,50 +69,87 @@ def get_plugin_admin_url(plugin_name: str) -> str:
|
||||
if plugin_dir:
|
||||
builtin_root = BUILTIN_PLUGINS_DIR.resolve()
|
||||
if plugin_dir.is_relative_to(builtin_root):
|
||||
return f'{LIVE_PLUGIN_BASE_URL}builtin.{quote(plugin_name)}/'
|
||||
return f"{LIVE_PLUGIN_BASE_URL}builtin.{quote(plugin_name)}/"
|
||||
|
||||
user_root = USER_PLUGINS_DIR.resolve()
|
||||
if plugin_dir.is_relative_to(user_root):
|
||||
return f'{LIVE_PLUGIN_BASE_URL}user.{quote(plugin_name)}/'
|
||||
return f"{LIVE_PLUGIN_BASE_URL}user.{quote(plugin_name)}/"
|
||||
|
||||
return f'{LIVE_PLUGIN_BASE_URL}builtin.{quote(plugin_name)}/'
|
||||
return f"{LIVE_PLUGIN_BASE_URL}builtin.{quote(plugin_name)}/"
|
||||
|
||||
|
||||
def render_archiveresults_list(archiveresults_qs, limit=50):
|
||||
"""Render a nice inline list view of archive results with status, plugin, output, and actions."""
|
||||
|
||||
results = list(archiveresults_qs.order_by('plugin').select_related('snapshot')[:limit])
|
||||
result_ids = list(archiveresults_qs.order_by("plugin").values_list("pk", flat=True)[:limit])
|
||||
if not result_ids:
|
||||
return mark_safe('<div style="color: #64748b; font-style: italic; padding: 16px 0;">No Archive Results yet...</div>')
|
||||
|
||||
results_by_id = {
|
||||
result.pk: result
|
||||
for result in ArchiveResult.objects.filter(pk__in=result_ids).select_related("snapshot", "process", "process__machine")
|
||||
}
|
||||
results = [results_by_id[result_id] for result_id in result_ids if result_id in results_by_id]
|
||||
|
||||
if not results:
|
||||
return mark_safe('<div style="color: #64748b; font-style: italic; padding: 16px 0;">No Archive Results yet...</div>')
|
||||
|
||||
# Status colors
|
||||
status_colors = {
|
||||
'succeeded': ('#166534', '#dcfce7'), # green
|
||||
'failed': ('#991b1b', '#fee2e2'), # red
|
||||
'queued': ('#6b7280', '#f3f4f6'), # gray
|
||||
'started': ('#92400e', '#fef3c7'), # amber
|
||||
'backoff': ('#92400e', '#fef3c7'),
|
||||
'skipped': ('#475569', '#f1f5f9'),
|
||||
'noresults': ('#475569', '#f1f5f9'),
|
||||
"succeeded": ("#166534", "#dcfce7"), # green
|
||||
"failed": ("#991b1b", "#fee2e2"), # red
|
||||
"queued": ("#6b7280", "#f3f4f6"), # gray
|
||||
"started": ("#92400e", "#fef3c7"), # amber
|
||||
"backoff": ("#92400e", "#fef3c7"),
|
||||
"skipped": ("#475569", "#f1f5f9"),
|
||||
"noresults": ("#475569", "#f1f5f9"),
|
||||
}
|
||||
|
||||
rows = []
|
||||
for idx, result in enumerate(results):
|
||||
status = result.status or 'queued'
|
||||
color, bg = status_colors.get(status, ('#6b7280', '#f3f4f6'))
|
||||
status = result.status or "queued"
|
||||
color, bg = status_colors.get(status, ("#6b7280", "#f3f4f6"))
|
||||
output_files = result.output_files or {}
|
||||
if isinstance(output_files, dict):
|
||||
output_file_count = len(output_files)
|
||||
elif isinstance(output_files, (list, tuple, set)):
|
||||
output_file_count = len(output_files)
|
||||
elif isinstance(output_files, str):
|
||||
try:
|
||||
parsed = json.loads(output_files)
|
||||
output_file_count = len(parsed) if isinstance(parsed, (dict, list, tuple, set)) else 0
|
||||
except Exception:
|
||||
output_file_count = 0
|
||||
else:
|
||||
output_file_count = 0
|
||||
|
||||
# Get plugin icon
|
||||
icon = get_plugin_icon(result.plugin)
|
||||
|
||||
# Format timestamp
|
||||
end_time = result.end_ts.strftime('%Y-%m-%d %H:%M:%S') if result.end_ts else '-'
|
||||
end_time = result.end_ts.strftime("%Y-%m-%d %H:%M:%S") if result.end_ts else "-"
|
||||
|
||||
process_display = "-"
|
||||
if result.process_id and result.process:
|
||||
process_display = f'''
|
||||
<a href="{reverse("admin:machine_process_change", args=[result.process_id])}"
|
||||
style="color: #2563eb; text-decoration: none; font-family: ui-monospace, monospace; font-size: 12px;"
|
||||
title="View process">{result.process.pid or "-"}</a>
|
||||
'''
|
||||
|
||||
machine_display = "-"
|
||||
if result.process_id and result.process and result.process.machine_id:
|
||||
machine_display = f'''
|
||||
<a href="{reverse("admin:machine_machine_change", args=[result.process.machine_id])}"
|
||||
style="color: #2563eb; text-decoration: none; font-size: 12px;"
|
||||
title="View machine">{result.process.machine.hostname}</a>
|
||||
'''
|
||||
|
||||
# Truncate output for display
|
||||
full_output = result.output_str or '-'
|
||||
full_output = result.output_str or "-"
|
||||
output_display = full_output[:60]
|
||||
if len(full_output) > 60:
|
||||
output_display += '...'
|
||||
output_display += "..."
|
||||
|
||||
display_cmd = build_abx_dl_display_command(result)
|
||||
replay_cmd = build_abx_dl_replay_command(result)
|
||||
@@ -132,23 +157,23 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
|
||||
cmd_attr = html.escape(replay_cmd, quote=True)
|
||||
|
||||
# Build output link - use embed_path() which checks output_files first
|
||||
embed_path = result.embed_path() if hasattr(result, 'embed_path') else None
|
||||
snapshot_id = str(getattr(result, 'snapshot_id', ''))
|
||||
if embed_path and result.status == 'succeeded':
|
||||
embed_path = result.embed_path() if hasattr(result, "embed_path") else None
|
||||
snapshot_id = str(getattr(result, "snapshot_id", ""))
|
||||
if embed_path and result.status == "succeeded":
|
||||
output_link = build_snapshot_url(snapshot_id, embed_path)
|
||||
else:
|
||||
output_link = build_snapshot_url(snapshot_id, '')
|
||||
output_link = build_snapshot_url(snapshot_id, "")
|
||||
|
||||
# Get version - try cmd_version field
|
||||
version = result.cmd_version if result.cmd_version else '-'
|
||||
version = result.cmd_version if result.cmd_version else "-"
|
||||
|
||||
# Unique ID for this row's expandable output
|
||||
row_id = f'output_{idx}_{str(result.id)[:8]}'
|
||||
row_id = f"output_{idx}_{str(result.id)[:8]}"
|
||||
|
||||
rows.append(f'''
|
||||
<tr style="border-bottom: 1px solid #f1f5f9; transition: background 0.15s;" onmouseover="this.style.background='#f8fafc'" onmouseout="this.style.background='transparent'">
|
||||
<td style="padding: 10px 12px; white-space: nowrap;">
|
||||
<a href="{reverse('admin:core_archiveresult_change', args=[result.id])}"
|
||||
<a href="{reverse("admin:core_archiveresult_change", args=[result.id])}"
|
||||
style="color: #2563eb; text-decoration: none; font-family: ui-monospace, monospace; font-size: 11px;"
|
||||
title="View/edit archive result">
|
||||
<code>{str(result.id)[-8:]}</code>
|
||||
@@ -178,9 +203,18 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
|
||||
{output_display}
|
||||
</span>
|
||||
</td>
|
||||
<td style="padding: 10px 12px; white-space: nowrap; color: #64748b; font-size: 12px; text-align: right;">
|
||||
{output_file_count}
|
||||
</td>
|
||||
<td style="padding: 10px 12px; white-space: nowrap; color: #64748b; font-size: 12px;">
|
||||
{end_time}
|
||||
</td>
|
||||
<td style="padding: 10px 12px; white-space: nowrap;">
|
||||
{process_display}
|
||||
</td>
|
||||
<td style="padding: 10px 12px; white-space: nowrap;">
|
||||
{machine_display}
|
||||
</td>
|
||||
<td style="padding: 10px 12px; white-space: nowrap; font-family: ui-monospace, monospace; font-size: 11px; color: #64748b;">
|
||||
{version}
|
||||
</td>
|
||||
@@ -189,14 +223,14 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
|
||||
<a href="{output_link}" target="_blank"
|
||||
style="padding: 4px 8px; background: #f1f5f9; border-radius: 4px; color: #475569; text-decoration: none; font-size: 11px;"
|
||||
title="View output">📄</a>
|
||||
<a href="{reverse('admin:core_archiveresult_change', args=[result.id])}"
|
||||
<a href="{reverse("admin:core_archiveresult_change", args=[result.id])}"
|
||||
style="padding: 4px 8px; background: #f1f5f9; border-radius: 4px; color: #475569; text-decoration: none; font-size: 11px;"
|
||||
title="Edit">✏️</a>
|
||||
</div>
|
||||
</td>
|
||||
</tr>
|
||||
<tr style="border-bottom: 1px solid #e2e8f0;">
|
||||
<td colspan="8" style="padding: 0 12px 10px 12px;">
|
||||
<td colspan="11" style="padding: 0 12px 10px 12px;">
|
||||
<details id="{row_id}" style="margin: 0;">
|
||||
<summary style="cursor: pointer; font-size: 11px; color: #94a3b8; user-select: none;">
|
||||
Details & Output
|
||||
@@ -205,7 +239,7 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
|
||||
<div style="font-size: 11px; color: #64748b; margin-bottom: 8px;">
|
||||
<span style="margin-right: 16px;"><b>ID:</b> <code>{str(result.id)}</code></span>
|
||||
<span style="margin-right: 16px;"><b>Version:</b> <code>{version}</code></span>
|
||||
<span style="margin-right: 16px;"><b>PWD:</b> <code>{result.pwd or '-'}</code></span>
|
||||
<span style="margin-right: 16px;"><b>PWD:</b> <code>{result.pwd or "-"}</code></span>
|
||||
</div>
|
||||
<div style="font-size: 11px; color: #64748b; margin-bottom: 8px;">
|
||||
<b>Output:</b>
|
||||
@@ -230,19 +264,19 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
|
||||
''')
|
||||
|
||||
total_count = archiveresults_qs.count()
|
||||
footer = ''
|
||||
footer = ""
|
||||
if total_count > limit:
|
||||
footer = f'''
|
||||
footer = f"""
|
||||
<tr>
|
||||
<td colspan="8" style="padding: 12px; text-align: center; color: #64748b; font-size: 13px; background: #f8fafc;">
|
||||
<td colspan="11" style="padding: 12px; text-align: center; color: #64748b; font-size: 13px; background: #f8fafc;">
|
||||
Showing {limit} of {total_count} results
|
||||
<a href="/admin/core/archiveresult/?snapshot__id__exact={results[0].snapshot_id if results else ''}"
|
||||
<a href="/admin/core/archiveresult/?snapshot__id__exact={results[0].snapshot_id if results else ""}"
|
||||
style="color: #2563eb;">View all →</a>
|
||||
</td>
|
||||
</tr>
|
||||
'''
|
||||
"""
|
||||
|
||||
return mark_safe(f'''
|
||||
return mark_safe(f"""
|
||||
<div style="border: 1px solid #e2e8f0; border-radius: 8px; overflow: hidden; background: #fff; width: 100%;">
|
||||
<table style="width: 100%; border-collapse: collapse; font-size: 14px;">
|
||||
<thead>
|
||||
@@ -252,86 +286,92 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
|
||||
<th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; width: 32px;"></th>
|
||||
<th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Plugin</th>
|
||||
<th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Output</th>
|
||||
<th style="padding: 10px 12px; text-align: right; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Files</th>
|
||||
<th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Completed</th>
|
||||
<th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Process</th>
|
||||
<th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Machine</th>
|
||||
<th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Version</th>
|
||||
<th style="padding: 10px 8px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Actions</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{''.join(rows)}
|
||||
{"".join(rows)}
|
||||
{footer}
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
''')
|
||||
|
||||
""")
|
||||
|
||||
|
||||
class ArchiveResultInline(admin.TabularInline):
|
||||
name = 'Archive Results Log'
|
||||
name = "Archive Results Log"
|
||||
model = ArchiveResult
|
||||
parent_model = Snapshot
|
||||
# fk_name = 'snapshot'
|
||||
extra = 0
|
||||
sort_fields = ('end_ts', 'plugin', 'output_str', 'status', 'cmd_version')
|
||||
readonly_fields = ('id', 'result_id', 'completed', 'command', 'version')
|
||||
fields = ('start_ts', 'end_ts', *readonly_fields, 'plugin', 'cmd', 'cmd_version', 'pwd', 'status', 'output_str')
|
||||
sort_fields = ("end_ts", "plugin", "output_str", "status", "cmd_version")
|
||||
readonly_fields = ("id", "result_id", "completed", "command", "version")
|
||||
fields = ("start_ts", "end_ts", *readonly_fields, "plugin", "cmd", "cmd_version", "pwd", "status", "output_str")
|
||||
# exclude = ('id',)
|
||||
ordering = ('end_ts',)
|
||||
ordering = ("end_ts",)
|
||||
show_change_link = True
|
||||
# # classes = ['collapse']
|
||||
|
||||
def get_parent_object_from_request(self, request):
|
||||
resolved = resolve(request.path_info)
|
||||
try:
|
||||
return self.parent_model.objects.get(pk=resolved.kwargs['object_id'])
|
||||
return self.parent_model.objects.get(pk=resolved.kwargs["object_id"])
|
||||
except (self.parent_model.DoesNotExist, ValidationError):
|
||||
return None
|
||||
|
||||
@admin.display(
|
||||
description='Completed',
|
||||
ordering='end_ts',
|
||||
description="Completed",
|
||||
ordering="end_ts",
|
||||
)
|
||||
def completed(self, obj):
|
||||
return format_html('<p style="white-space: nowrap">{}</p>', obj.end_ts.strftime('%Y-%m-%d %H:%M:%S'))
|
||||
return format_html('<p style="white-space: nowrap">{}</p>', obj.end_ts.strftime("%Y-%m-%d %H:%M:%S"))
|
||||
|
||||
def result_id(self, obj):
|
||||
return format_html('<a href="{}"><code style="font-size: 10px">[{}]</code></a>', reverse('admin:core_archiveresult_change', args=(obj.id,)), str(obj.id)[:8])
|
||||
|
||||
return format_html(
|
||||
'<a href="{}"><code style="font-size: 10px">[{}]</code></a>',
|
||||
reverse("admin:core_archiveresult_change", args=(obj.id,)),
|
||||
str(obj.id)[:8],
|
||||
)
|
||||
|
||||
def command(self, obj):
|
||||
return format_html('<small><code>{}</code></small>', " ".join(obj.cmd or []))
|
||||
|
||||
return format_html("<small><code>{}</code></small>", " ".join(obj.cmd or []))
|
||||
|
||||
def version(self, obj):
|
||||
return format_html('<small><code>{}</code></small>', obj.cmd_version or '-')
|
||||
|
||||
return format_html("<small><code>{}</code></small>", obj.cmd_version or "-")
|
||||
|
||||
def get_formset(self, request, obj=None, **kwargs):
|
||||
formset = super().get_formset(request, obj, **kwargs)
|
||||
snapshot = self.get_parent_object_from_request(request)
|
||||
form_class = getattr(formset, 'form', None)
|
||||
base_fields = getattr(form_class, 'base_fields', {})
|
||||
snapshot_output_dir = str(snapshot.output_dir) if snapshot else ''
|
||||
form_class = getattr(formset, "form", None)
|
||||
base_fields = getattr(form_class, "base_fields", {})
|
||||
snapshot_output_dir = str(snapshot.output_dir) if snapshot else ""
|
||||
|
||||
# import ipdb; ipdb.set_trace()
|
||||
# formset.form.base_fields['id'].widget = formset.form.base_fields['id'].hidden_widget()
|
||||
|
||||
|
||||
# default values for new entries
|
||||
base_fields['status'].initial = 'succeeded'
|
||||
base_fields['start_ts'].initial = timezone.now()
|
||||
base_fields['end_ts'].initial = timezone.now()
|
||||
base_fields['cmd_version'].initial = '-'
|
||||
base_fields['pwd'].initial = snapshot_output_dir
|
||||
base_fields['cmd'].initial = '["-"]'
|
||||
base_fields['output_str'].initial = 'Manually recorded cmd output...'
|
||||
base_fields["status"].initial = "succeeded"
|
||||
base_fields["start_ts"].initial = timezone.now()
|
||||
base_fields["end_ts"].initial = timezone.now()
|
||||
base_fields["cmd_version"].initial = "-"
|
||||
base_fields["pwd"].initial = snapshot_output_dir
|
||||
base_fields["cmd"].initial = '["-"]'
|
||||
base_fields["output_str"].initial = "Manually recorded cmd output..."
|
||||
|
||||
if obj is not None:
|
||||
# hidden values for existing entries and new entries
|
||||
base_fields['start_ts'].widget = base_fields['start_ts'].hidden_widget()
|
||||
base_fields['end_ts'].widget = base_fields['end_ts'].hidden_widget()
|
||||
base_fields['cmd'].widget = base_fields['cmd'].hidden_widget()
|
||||
base_fields['pwd'].widget = base_fields['pwd'].hidden_widget()
|
||||
base_fields['cmd_version'].widget = base_fields['cmd_version'].hidden_widget()
|
||||
base_fields["start_ts"].widget = base_fields["start_ts"].hidden_widget()
|
||||
base_fields["end_ts"].widget = base_fields["end_ts"].hidden_widget()
|
||||
base_fields["cmd"].widget = base_fields["cmd"].hidden_widget()
|
||||
base_fields["pwd"].widget = base_fields["pwd"].hidden_widget()
|
||||
base_fields["cmd_version"].widget = base_fields["cmd_version"].hidden_widget()
|
||||
return formset
|
||||
|
||||
|
||||
def get_readonly_fields(self, request, obj=None):
|
||||
if obj is not None:
|
||||
return self.readonly_fields
|
||||
@@ -339,62 +379,122 @@ class ArchiveResultInline(admin.TabularInline):
|
||||
return []
|
||||
|
||||
|
||||
|
||||
class ArchiveResultAdmin(BaseModelAdmin):
|
||||
list_display = ('details_link', 'created_at', 'snapshot_info', 'tags_inline', 'status_badge', 'plugin_with_icon', 'process_link', 'machine_link', 'cmd_str', 'output_str_display')
|
||||
list_display = (
|
||||
"details_link",
|
||||
"zip_link",
|
||||
"created_at",
|
||||
"snapshot_info",
|
||||
"tags_inline",
|
||||
"status_badge",
|
||||
"plugin_with_icon",
|
||||
"process_link",
|
||||
"machine_link",
|
||||
"cmd_str",
|
||||
"output_str_display",
|
||||
)
|
||||
list_display_links = None
|
||||
sort_fields = ('id', 'created_at', 'plugin', 'status')
|
||||
readonly_fields = ('cmd', 'cmd_version', 'pwd', 'cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'plugin_with_icon', 'process_link')
|
||||
search_fields = ()
|
||||
autocomplete_fields = ['snapshot']
|
||||
sort_fields = ("id", "created_at", "plugin", "status")
|
||||
readonly_fields = (
|
||||
"admin_actions",
|
||||
"cmd",
|
||||
"cmd_version",
|
||||
"pwd",
|
||||
"cmd_str",
|
||||
"snapshot_info",
|
||||
"tags_str",
|
||||
"created_at",
|
||||
"modified_at",
|
||||
"output_summary",
|
||||
"plugin_with_icon",
|
||||
"process_link",
|
||||
)
|
||||
search_fields = (
|
||||
"snapshot__id",
|
||||
"snapshot__url",
|
||||
"snapshot__tags__name",
|
||||
"snapshot__crawl_id",
|
||||
"plugin",
|
||||
"hook_name",
|
||||
"output_str",
|
||||
"output_json",
|
||||
"process__cmd",
|
||||
)
|
||||
autocomplete_fields = ["snapshot"]
|
||||
|
||||
fieldsets = (
|
||||
('Snapshot', {
|
||||
'fields': ('snapshot', 'snapshot_info', 'tags_str'),
|
||||
'classes': ('card', 'wide'),
|
||||
}),
|
||||
('Plugin', {
|
||||
'fields': ('plugin_with_icon', 'process_link', 'status'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Timing', {
|
||||
'fields': ('start_ts', 'end_ts', 'created_at', 'modified_at'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Command', {
|
||||
'fields': ('cmd', 'cmd_str', 'cmd_version', 'pwd'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Output', {
|
||||
'fields': ('output_str', 'output_json', 'output_files', 'output_size', 'output_mimetypes', 'output_summary'),
|
||||
'classes': ('card', 'wide'),
|
||||
}),
|
||||
(
|
||||
"Actions",
|
||||
{
|
||||
"fields": ("admin_actions",),
|
||||
"classes": ("card", "wide"),
|
||||
},
|
||||
),
|
||||
(
|
||||
"Snapshot",
|
||||
{
|
||||
"fields": ("snapshot", "snapshot_info", "tags_str"),
|
||||
"classes": ("card", "wide"),
|
||||
},
|
||||
),
|
||||
(
|
||||
"Plugin",
|
||||
{
|
||||
"fields": ("plugin_with_icon", "process_link", "status"),
|
||||
"classes": ("card",),
|
||||
},
|
||||
),
|
||||
(
|
||||
"Timing",
|
||||
{
|
||||
"fields": ("start_ts", "end_ts", "created_at", "modified_at"),
|
||||
"classes": ("card",),
|
||||
},
|
||||
),
|
||||
(
|
||||
"Command",
|
||||
{
|
||||
"fields": ("cmd", "cmd_str", "cmd_version", "pwd"),
|
||||
"classes": ("card",),
|
||||
},
|
||||
),
|
||||
(
|
||||
"Output",
|
||||
{
|
||||
"fields": ("output_str", "output_json", "output_files", "output_size", "output_mimetypes", "output_summary"),
|
||||
"classes": ("card", "wide"),
|
||||
},
|
||||
),
|
||||
)
|
||||
|
||||
list_filter = ('status', 'plugin', 'start_ts')
|
||||
ordering = ['-start_ts']
|
||||
list_filter = ("status", "plugin", "start_ts")
|
||||
ordering = ["-start_ts"]
|
||||
list_per_page = SERVER_CONFIG.SNAPSHOTS_PER_PAGE
|
||||
|
||||
paginator = AccelleratedPaginator
|
||||
paginator = AcceleratedPaginator
|
||||
save_on_top = True
|
||||
|
||||
actions = ['delete_selected']
|
||||
actions = ["delete_selected"]
|
||||
|
||||
class Meta:
|
||||
verbose_name = 'Archive Result'
|
||||
verbose_name_plural = 'Archive Results'
|
||||
verbose_name = "Archive Result"
|
||||
verbose_name_plural = "Archive Results"
|
||||
|
||||
def change_view(self, request, object_id, form_url="", extra_context=None):
|
||||
self.request = request
|
||||
return super().change_view(request, object_id, form_url, extra_context)
|
||||
|
||||
def changelist_view(self, request, extra_context=None):
|
||||
self.request = request
|
||||
return super().changelist_view(request, extra_context)
|
||||
|
||||
def get_queryset(self, request):
|
||||
return (
|
||||
super()
|
||||
.get_queryset(request)
|
||||
.select_related('snapshot', 'process')
|
||||
.prefetch_related('snapshot__tags')
|
||||
.annotate(snapshot_first_tag=Min('snapshot__tags__name'))
|
||||
.select_related("snapshot", "process")
|
||||
.prefetch_related("snapshot__tags")
|
||||
.annotate(snapshot_first_tag=Min("snapshot__tags__name"))
|
||||
)
|
||||
|
||||
def get_search_results(self, request, queryset, search_term):
|
||||
@@ -402,15 +502,14 @@ class ArchiveResultAdmin(BaseModelAdmin):
|
||||
return queryset, False
|
||||
|
||||
queryset = queryset.annotate(
|
||||
snapshot_id_text=Cast('snapshot__id', output_field=TextField()),
|
||||
snapshot_crawl_id_text=Cast('snapshot__crawl_id', output_field=TextField()),
|
||||
output_json_text=Cast('output_json', output_field=TextField()),
|
||||
cmd_text=Cast('process__cmd', output_field=TextField()),
|
||||
snapshot_id_text=Cast("snapshot__id", output_field=TextField()),
|
||||
snapshot_crawl_id_text=Cast("snapshot__crawl_id", output_field=TextField()),
|
||||
output_json_text=Cast("output_json", output_field=TextField()),
|
||||
cmd_text=Cast("process__cmd", output_field=TextField()),
|
||||
)
|
||||
|
||||
search_bits = [
|
||||
bit[1:-1] if len(bit) >= 2 and bit[0] == bit[-1] and bit[0] in {'"', "'"} else bit
|
||||
for bit in smart_split(search_term)
|
||||
bit[1:-1] if len(bit) >= 2 and bit[0] == bit[-1] and bit[0] in {'"', "'"} else bit for bit in smart_split(search_term)
|
||||
]
|
||||
search_bits = [bit.strip() for bit in search_bits if bit.strip()]
|
||||
if not search_bits:
|
||||
@@ -427,22 +526,44 @@ class ArchiveResultAdmin(BaseModelAdmin):
|
||||
| Q(hook_name__icontains=bit)
|
||||
| Q(output_str__icontains=bit)
|
||||
| Q(output_json_text__icontains=bit)
|
||||
| Q(cmd_text__icontains=bit)
|
||||
| Q(cmd_text__icontains=bit),
|
||||
)
|
||||
|
||||
return queryset.filter(reduce(and_, filters)).distinct(), True
|
||||
|
||||
@admin.display(description='Details', ordering='id')
|
||||
def get_snapshot_view_url(self, result: ArchiveResult) -> str:
|
||||
return build_snapshot_url(str(result.snapshot_id), request=getattr(self, "request", None))
|
||||
|
||||
def get_output_view_url(self, result: ArchiveResult) -> str:
|
||||
output_path = result.embed_path() if hasattr(result, "embed_path") else None
|
||||
if not output_path:
|
||||
output_path = result.plugin or ""
|
||||
return build_snapshot_url(str(result.snapshot_id), output_path, request=getattr(self, "request", None))
|
||||
|
||||
def get_output_files_url(self, result: ArchiveResult) -> str:
|
||||
return f"{build_snapshot_url(str(result.snapshot_id), result.plugin, request=getattr(self, 'request', None))}/?files=1"
|
||||
|
||||
def get_output_zip_url(self, result: ArchiveResult) -> str:
|
||||
return f"{self.get_output_files_url(result)}&download=zip"
|
||||
|
||||
@admin.display(description="Details", ordering="id")
|
||||
def details_link(self, result):
|
||||
return format_html(
|
||||
'<a href="{}"><code>{}</code></a>',
|
||||
reverse('admin:core_archiveresult_change', args=[result.id]),
|
||||
reverse("admin:core_archiveresult_change", args=[result.id]),
|
||||
str(result.id)[-8:],
|
||||
)
|
||||
|
||||
@admin.display(description="Zip")
|
||||
def zip_link(self, result):
|
||||
return format_html(
|
||||
'<a href="{}" class="archivebox-zip-button" data-loading-mode="spinner-only" onclick="return window.archiveboxHandleZipClick(this, event);" style="display:inline-flex; align-items:center; justify-content:center; gap:4px; width:48px; min-width:48px; height:24px; padding:0; box-sizing:border-box; border-radius:999px; border:1px solid #bfdbfe; background:#eff6ff; color:#1d4ed8; font-size:11px; font-weight:600; line-height:1; text-decoration:none;"><span class="archivebox-zip-spinner" aria-hidden="true"></span><span class="archivebox-zip-label">⬇ ZIP</span></a>',
|
||||
self.get_output_zip_url(result),
|
||||
)
|
||||
|
||||
@admin.display(
|
||||
description='Snapshot',
|
||||
ordering='snapshot__url',
|
||||
description="Snapshot",
|
||||
ordering="snapshot__url",
|
||||
)
|
||||
def snapshot_info(self, result):
|
||||
snapshot_id = str(result.snapshot_id)
|
||||
@@ -450,29 +571,28 @@ class ArchiveResultAdmin(BaseModelAdmin):
|
||||
'<a href="{}"><b><code>[{}]</code></b> {} {}</a><br/>',
|
||||
build_snapshot_url(snapshot_id, "index.html"),
|
||||
snapshot_id[:8],
|
||||
result.snapshot.bookmarked_at.strftime('%Y-%m-%d %H:%M'),
|
||||
result.snapshot.bookmarked_at.strftime("%Y-%m-%d %H:%M"),
|
||||
result.snapshot.url[:128],
|
||||
)
|
||||
|
||||
|
||||
@admin.display(
|
||||
description='Snapshot Tags'
|
||||
description="Snapshot Tags",
|
||||
)
|
||||
def tags_str(self, result):
|
||||
return result.snapshot.tags_str()
|
||||
|
||||
@admin.display(description='Tags', ordering='snapshot_first_tag')
|
||||
@admin.display(description="Tags", ordering="snapshot_first_tag")
|
||||
def tags_inline(self, result):
|
||||
widget = InlineTagEditorWidget(snapshot_id=str(result.snapshot_id), editable=False)
|
||||
tags_html = widget.render(
|
||||
name=f'tags_{result.snapshot_id}',
|
||||
name=f"tags_{result.snapshot_id}",
|
||||
value=result.snapshot.tags.all(),
|
||||
attrs={'id': f'tags_{result.snapshot_id}'},
|
||||
attrs={"id": f"tags_{result.snapshot_id}"},
|
||||
snapshot_id=str(result.snapshot_id),
|
||||
)
|
||||
return mark_safe(f'<span class="tags-inline-editor">{tags_html}</span>')
|
||||
|
||||
@admin.display(description='Status', ordering='status')
|
||||
@admin.display(description="Status", ordering="status")
|
||||
def status_badge(self, result):
|
||||
status = result.status or ArchiveResult.StatusChoices.QUEUED
|
||||
return format_html(
|
||||
@@ -482,7 +602,7 @@ class ArchiveResultAdmin(BaseModelAdmin):
|
||||
result.get_status_display() or status,
|
||||
)
|
||||
|
||||
@admin.display(description='Plugin', ordering='plugin')
|
||||
@admin.display(description="Plugin", ordering="plugin")
|
||||
def plugin_with_icon(self, result):
|
||||
icon = get_plugin_icon(result.plugin)
|
||||
return format_html(
|
||||
@@ -494,36 +614,36 @@ class ArchiveResultAdmin(BaseModelAdmin):
|
||||
result.plugin,
|
||||
)
|
||||
|
||||
@admin.display(description='Process', ordering='process__pid')
|
||||
@admin.display(description="Process", ordering="process__pid")
|
||||
def process_link(self, result):
|
||||
if not result.process_id:
|
||||
return '-'
|
||||
process_label = result.process.pid if result.process and result.process.pid else '-'
|
||||
return "-"
|
||||
process_label = result.process.pid if result.process and result.process.pid else "-"
|
||||
return format_html(
|
||||
'<a href="{}"><code>{}</code></a>',
|
||||
reverse('admin:machine_process_change', args=[result.process_id]),
|
||||
reverse("admin:machine_process_change", args=[result.process_id]),
|
||||
process_label,
|
||||
)
|
||||
|
||||
@admin.display(description='Machine', ordering='process__machine__hostname')
|
||||
@admin.display(description="Machine", ordering="process__machine__hostname")
|
||||
def machine_link(self, result):
|
||||
if not result.process_id or not result.process or not result.process.machine_id:
|
||||
return '-'
|
||||
return "-"
|
||||
machine = result.process.machine
|
||||
return format_html(
|
||||
'<a href="{}"><code>{}</code> {}</a>',
|
||||
reverse('admin:machine_machine_change', args=[machine.id]),
|
||||
reverse("admin:machine_machine_change", args=[machine.id]),
|
||||
str(machine.id)[:8],
|
||||
machine.hostname,
|
||||
)
|
||||
|
||||
@admin.display(description='Command')
|
||||
@admin.display(description="Command")
|
||||
def cmd_str(self, result):
|
||||
display_cmd = build_abx_dl_display_command(result)
|
||||
replay_cmd = build_abx_dl_replay_command(result)
|
||||
return format_html(
|
||||
'''
|
||||
<div style="position: relative; width: 300px; min-width: 300px; max-width: 300px; overflow: hidden; box-sizing: border-box;">
|
||||
"""
|
||||
<div style="position: relative; width: 100%; max-width: 100%; overflow: hidden; box-sizing: border-box;">
|
||||
<button type="button"
|
||||
data-command="{}"
|
||||
onclick="(function(btn){{var text=btn.dataset.command||''; if(navigator.clipboard&&navigator.clipboard.writeText){{navigator.clipboard.writeText(text);}} else {{var ta=document.createElement('textarea'); ta.value=text; document.body.appendChild(ta); ta.select(); document.execCommand('copy'); document.body.removeChild(ta);}}}})(this); return false;"
|
||||
@@ -534,7 +654,7 @@ class ArchiveResultAdmin(BaseModelAdmin):
|
||||
{}
|
||||
</code>
|
||||
</div>
|
||||
''',
|
||||
""",
|
||||
replay_cmd,
|
||||
replay_cmd,
|
||||
display_cmd,
|
||||
@@ -542,8 +662,8 @@ class ArchiveResultAdmin(BaseModelAdmin):
|
||||
|
||||
def output_display(self, result):
|
||||
# Determine output link path - use embed_path() which checks output_files
|
||||
embed_path = result.embed_path() if hasattr(result, 'embed_path') else None
|
||||
output_path = embed_path if (result.status == 'succeeded' and embed_path) else 'index.html'
|
||||
embed_path = result.embed_path() if hasattr(result, "embed_path") else None
|
||||
output_path = embed_path if (result.status == "succeeded" and embed_path) else "index.html"
|
||||
snapshot_id = str(result.snapshot_id)
|
||||
return format_html(
|
||||
'<a href="{}" class="output-link">↗️</a><pre>{}</pre>',
|
||||
@@ -551,13 +671,13 @@ class ArchiveResultAdmin(BaseModelAdmin):
|
||||
result.output_str,
|
||||
)
|
||||
|
||||
@admin.display(description='Output', ordering='output_str')
|
||||
@admin.display(description="Output", ordering="output_str")
|
||||
def output_str_display(self, result):
|
||||
output_text = str(result.output_str or '').strip()
|
||||
output_text = str(result.output_str or "").strip()
|
||||
if not output_text:
|
||||
return '-'
|
||||
return "-"
|
||||
|
||||
live_path = result.embed_path() if hasattr(result, 'embed_path') else None
|
||||
live_path = result.embed_path() if hasattr(result, "embed_path") else None
|
||||
if live_path:
|
||||
return format_html(
|
||||
'<a href="{}" title="{}"><code>{}</code></a>',
|
||||
@@ -572,8 +692,48 @@ class ArchiveResultAdmin(BaseModelAdmin):
|
||||
output_text,
|
||||
)
|
||||
|
||||
@admin.display(description="")
|
||||
def admin_actions(self, result):
|
||||
return format_html(
|
||||
"""
|
||||
<div style="display:flex; flex-wrap:wrap; gap:12px; align-items:center;">
|
||||
<a class="btn" style="display:inline-flex; align-items:center; gap:6px; padding:10px 16px; background:#f8fafc; border:1px solid #e2e8f0; border-radius:8px; color:#334155; text-decoration:none; font-size:14px; font-weight:500; transition:all 0.15s;"
|
||||
href="{}"
|
||||
onmouseover="this.style.background='#f1f5f9'; this.style.borderColor='#cbd5e1';"
|
||||
onmouseout="this.style.background='#f8fafc'; this.style.borderColor='#e2e8f0';">
|
||||
📄 View Output
|
||||
</a>
|
||||
<a class="btn" style="display:inline-flex; align-items:center; gap:6px; padding:10px 16px; background:#f8fafc; border:1px solid #e2e8f0; border-radius:8px; color:#334155; text-decoration:none; font-size:14px; font-weight:500; transition:all 0.15s;"
|
||||
href="{}"
|
||||
onmouseover="this.style.background='#f1f5f9'; this.style.borderColor='#cbd5e1';"
|
||||
onmouseout="this.style.background='#f8fafc'; this.style.borderColor='#e2e8f0';">
|
||||
📁 Output files
|
||||
</a>
|
||||
<a class="btn archivebox-zip-button" style="display:inline-flex; align-items:center; gap:6px; padding:10px 16px; background:#eff6ff; border:1px solid #bfdbfe; border-radius:8px; color:#1d4ed8; text-decoration:none; font-size:14px; font-weight:500; transition:all 0.15s;"
|
||||
href="{}"
|
||||
data-loading-label="Preparing..."
|
||||
onclick="return window.archiveboxHandleZipClick(this, event);"
|
||||
onmouseover="this.style.background='#dbeafe'; this.style.borderColor='#93c5fd';"
|
||||
onmouseout="this.style.background='#eff6ff'; this.style.borderColor='#bfdbfe';">
|
||||
<span class="archivebox-zip-spinner" aria-hidden="true"></span>
|
||||
<span class="archivebox-zip-label">⬇ Download Zip</span>
|
||||
</a>
|
||||
<a class="btn" style="display:inline-flex; align-items:center; gap:6px; padding:10px 16px; background:#f8fafc; border:1px solid #e2e8f0; border-radius:8px; color:#334155; text-decoration:none; font-size:14px; font-weight:500; transition:all 0.15s;"
|
||||
href="{}"
|
||||
onmouseover="this.style.background='#f1f5f9'; this.style.borderColor='#cbd5e1';"
|
||||
onmouseout="this.style.background='#f8fafc'; this.style.borderColor='#e2e8f0';">
|
||||
🗂 Snapshot
|
||||
</a>
|
||||
</div>
|
||||
""",
|
||||
self.get_output_view_url(result),
|
||||
self.get_output_files_url(result),
|
||||
self.get_output_zip_url(result),
|
||||
self.get_snapshot_view_url(result),
|
||||
)
|
||||
|
||||
def output_summary(self, result):
|
||||
snapshot_dir = Path(DATA_DIR) / str(result.pwd).split('data/', 1)[-1]
|
||||
snapshot_dir = Path(DATA_DIR) / str(result.pwd).split("data/", 1)[-1]
|
||||
output_html = format_html(
|
||||
'<pre style="display: inline-block">{}</pre><br/>',
|
||||
result.output_str,
|
||||
@@ -583,9 +743,13 @@ class ArchiveResultAdmin(BaseModelAdmin):
|
||||
'<a href="{}#all">See result files ...</a><br/><pre><code>',
|
||||
build_snapshot_url(snapshot_id, "index.html"),
|
||||
)
|
||||
embed_path = result.embed_path() if hasattr(result, 'embed_path') else ''
|
||||
path_from_embed = (snapshot_dir / (embed_path or ''))
|
||||
output_html += format_html('<i style="padding: 1px">{}</i><b style="padding-right: 20px">/</b><i>{}</i><br/><hr/>', str(snapshot_dir), str(embed_path))
|
||||
embed_path = result.embed_path() if hasattr(result, "embed_path") else ""
|
||||
path_from_embed = snapshot_dir / (embed_path or "")
|
||||
output_html += format_html(
|
||||
'<i style="padding: 1px">{}</i><b style="padding-right: 20px">/</b><i>{}</i><br/><hr/>',
|
||||
str(snapshot_dir),
|
||||
str(embed_path),
|
||||
)
|
||||
if os.access(path_from_embed, os.R_OK):
|
||||
root_dir = str(path_from_embed)
|
||||
else:
|
||||
@@ -594,19 +758,22 @@ class ArchiveResultAdmin(BaseModelAdmin):
|
||||
# print(root_dir, str(list(os.walk(root_dir))))
|
||||
|
||||
for root, dirs, files in os.walk(root_dir):
|
||||
depth = root.replace(root_dir, '').count(os.sep) + 1
|
||||
depth = root.replace(root_dir, "").count(os.sep) + 1
|
||||
if depth > 2:
|
||||
continue
|
||||
indent = ' ' * 4 * (depth)
|
||||
indent = " " * 4 * (depth)
|
||||
output_html += format_html('<b style="padding: 1px">{}{}/</b><br/>', indent, os.path.basename(root))
|
||||
indentation_str = ' ' * 4 * (depth + 1)
|
||||
indentation_str = " " * 4 * (depth + 1)
|
||||
for filename in sorted(files):
|
||||
is_hidden = filename.startswith('.')
|
||||
output_html += format_html('<span style="opacity: {}.2">{}{}</span><br/>', int(not is_hidden), indentation_str, filename.strip())
|
||||
|
||||
return output_html + mark_safe('</code></pre>')
|
||||
|
||||
is_hidden = filename.startswith(".")
|
||||
output_html += format_html(
|
||||
'<span style="opacity: {}.2">{}{}</span><br/>',
|
||||
int(not is_hidden),
|
||||
indentation_str,
|
||||
filename.strip(),
|
||||
)
|
||||
|
||||
return output_html + mark_safe("</code></pre>")
|
||||
|
||||
|
||||
def register_admin(admin_site):
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
__package__ = 'archivebox.core'
|
||||
__package__ = "archivebox.core"
|
||||
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
@@ -18,23 +18,23 @@ if TYPE_CHECKING:
|
||||
|
||||
|
||||
class ArchiveBoxAdmin(admin.AdminSite):
|
||||
site_header = 'ArchiveBox'
|
||||
index_title = 'Admin Views'
|
||||
site_title = 'Admin'
|
||||
namespace = 'admin'
|
||||
site_header = "ArchiveBox"
|
||||
index_title = "Admin Views"
|
||||
site_title = "Admin"
|
||||
namespace = "admin"
|
||||
|
||||
def get_app_list(self, request: 'HttpRequest', app_label: str | None = None) -> list['AppDict']:
|
||||
def get_app_list(self, request: "HttpRequest", app_label: str | None = None) -> list["AppDict"]:
|
||||
if app_label is None:
|
||||
return adv_get_app_list(self, request)
|
||||
return adv_get_app_list(self, request, app_label)
|
||||
|
||||
def admin_data_index_view(self, request: 'HttpRequest', **kwargs: Any) -> 'TemplateResponse':
|
||||
def admin_data_index_view(self, request: "HttpRequest", **kwargs: Any) -> "TemplateResponse":
|
||||
return adv_admin_data_index_view(self, request, **kwargs)
|
||||
|
||||
def get_admin_data_urls(self) -> list['URLResolver | URLPattern']:
|
||||
def get_admin_data_urls(self) -> list["URLResolver | URLPattern"]:
|
||||
return adv_get_admin_data_urls(self)
|
||||
|
||||
def get_urls(self) -> list['URLResolver | URLPattern']:
|
||||
def get_urls(self) -> list["URLResolver | URLPattern"]:
|
||||
return self.get_admin_data_urls() + super().get_urls()
|
||||
|
||||
|
||||
@@ -43,7 +43,6 @@ archivebox_admin = ArchiveBoxAdmin()
|
||||
# TODO: https://stackoverflow.com/questions/40760880/add-custom-button-to-django-admin-panel
|
||||
|
||||
|
||||
|
||||
############### Admin Data View sections are defined in settings.ADMIN_DATA_VIEWS #########
|
||||
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,4 +1,4 @@
|
||||
__package__ = 'archivebox.core'
|
||||
__package__ = "archivebox.core"
|
||||
|
||||
from urllib.parse import quote
|
||||
|
||||
@@ -28,92 +28,107 @@ from archivebox.core.host_utils import build_snapshot_url
|
||||
|
||||
class TagInline(admin.TabularInline):
|
||||
model = SnapshotTag
|
||||
fields = ('id', 'tag')
|
||||
fields = ("id", "tag")
|
||||
extra = 1
|
||||
max_num = 1000
|
||||
autocomplete_fields = (
|
||||
'tag',
|
||||
)
|
||||
autocomplete_fields = ("tag",)
|
||||
|
||||
|
||||
class TagAdminForm(forms.ModelForm):
|
||||
class Meta:
|
||||
model = Tag
|
||||
fields = '__all__'
|
||||
fields = "__all__"
|
||||
widgets = {
|
||||
'name': forms.TextInput(attrs={
|
||||
'placeholder': 'research, receipts, product-design...',
|
||||
'autocomplete': 'off',
|
||||
'spellcheck': 'false',
|
||||
'data-tag-name-input': '1',
|
||||
}),
|
||||
"name": forms.TextInput(
|
||||
attrs={
|
||||
"placeholder": "research, receipts, product-design...",
|
||||
"autocomplete": "off",
|
||||
"spellcheck": "false",
|
||||
"data-tag-name-input": "1",
|
||||
},
|
||||
),
|
||||
}
|
||||
|
||||
def clean_name(self):
|
||||
name = (self.cleaned_data.get('name') or '').strip()
|
||||
name = (self.cleaned_data.get("name") or "").strip()
|
||||
if not name:
|
||||
raise forms.ValidationError('Tag name is required.')
|
||||
raise forms.ValidationError("Tag name is required.")
|
||||
return name
|
||||
|
||||
|
||||
class TagAdmin(BaseModelAdmin):
|
||||
form = TagAdminForm
|
||||
change_list_template = 'admin/core/tag/change_list.html'
|
||||
change_form_template = 'admin/core/tag/change_form.html'
|
||||
list_display = ('name', 'num_snapshots', 'created_at', 'created_by')
|
||||
list_filter = ('created_at', 'created_by')
|
||||
search_fields = ('id', 'name', 'slug')
|
||||
readonly_fields = ('slug', 'id', 'created_at', 'modified_at', 'snapshots')
|
||||
actions = ['delete_selected']
|
||||
ordering = ['name', 'id']
|
||||
change_list_template = "admin/core/tag/change_list.html"
|
||||
change_form_template = "admin/core/tag/change_form.html"
|
||||
list_display = ("name", "num_snapshots", "created_at", "created_by")
|
||||
list_filter = ("created_at", "created_by")
|
||||
search_fields = ("id", "name", "slug")
|
||||
readonly_fields = ("slug", "id", "created_at", "modified_at", "snapshots")
|
||||
actions = ["delete_selected"]
|
||||
ordering = ["name", "id"]
|
||||
|
||||
fieldsets = (
|
||||
('Tag', {
|
||||
'fields': ('name', 'slug'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Metadata', {
|
||||
'fields': ('id', 'created_by', 'created_at', 'modified_at'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Recent Snapshots', {
|
||||
'fields': ('snapshots',),
|
||||
'classes': ('card', 'wide'),
|
||||
}),
|
||||
(
|
||||
"Tag",
|
||||
{
|
||||
"fields": ("name", "slug"),
|
||||
"classes": ("card",),
|
||||
},
|
||||
),
|
||||
(
|
||||
"Metadata",
|
||||
{
|
||||
"fields": ("id", "created_by", "created_at", "modified_at"),
|
||||
"classes": ("card",),
|
||||
},
|
||||
),
|
||||
(
|
||||
"Recent Snapshots",
|
||||
{
|
||||
"fields": ("snapshots",),
|
||||
"classes": ("card", "wide"),
|
||||
},
|
||||
),
|
||||
)
|
||||
|
||||
add_fieldsets = (
|
||||
('Tag', {
|
||||
'fields': ('name',),
|
||||
'classes': ('card', 'wide'),
|
||||
}),
|
||||
('Metadata', {
|
||||
'fields': ('created_by',),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
(
|
||||
"Tag",
|
||||
{
|
||||
"fields": ("name",),
|
||||
"classes": ("card", "wide"),
|
||||
},
|
||||
),
|
||||
(
|
||||
"Metadata",
|
||||
{
|
||||
"fields": ("created_by",),
|
||||
"classes": ("card",),
|
||||
},
|
||||
),
|
||||
)
|
||||
|
||||
def get_fieldsets(self, request: HttpRequest, obj: Tag | None = None):
|
||||
return self.fieldsets if obj else self.add_fieldsets
|
||||
|
||||
def changelist_view(self, request: HttpRequest, extra_context=None):
|
||||
query = (request.GET.get('q') or '').strip()
|
||||
sort = normalize_tag_sort((request.GET.get('sort') or 'created_desc').strip())
|
||||
created_by = normalize_created_by_filter((request.GET.get('created_by') or '').strip())
|
||||
year = normalize_created_year_filter((request.GET.get('year') or '').strip())
|
||||
has_snapshots = normalize_has_snapshots_filter((request.GET.get('has_snapshots') or 'all').strip())
|
||||
query = (request.GET.get("q") or "").strip()
|
||||
sort = normalize_tag_sort((request.GET.get("sort") or "created_desc").strip())
|
||||
created_by = normalize_created_by_filter((request.GET.get("created_by") or "").strip())
|
||||
year = normalize_created_year_filter((request.GET.get("year") or "").strip())
|
||||
has_snapshots = normalize_has_snapshots_filter((request.GET.get("has_snapshots") or "all").strip())
|
||||
extra_context = {
|
||||
**(extra_context or {}),
|
||||
'initial_query': query,
|
||||
'initial_sort': sort,
|
||||
'initial_created_by': created_by,
|
||||
'initial_year': year,
|
||||
'initial_has_snapshots': has_snapshots,
|
||||
'tag_sort_choices': TAG_SORT_CHOICES,
|
||||
'tag_has_snapshots_choices': TAG_HAS_SNAPSHOTS_CHOICES,
|
||||
'tag_created_by_choices': get_tag_creator_choices(),
|
||||
'tag_year_choices': get_tag_year_choices(),
|
||||
'initial_tag_cards': build_tag_cards(
|
||||
"initial_query": query,
|
||||
"initial_sort": sort,
|
||||
"initial_created_by": created_by,
|
||||
"initial_year": year,
|
||||
"initial_has_snapshots": has_snapshots,
|
||||
"tag_sort_choices": TAG_SORT_CHOICES,
|
||||
"tag_has_snapshots_choices": TAG_HAS_SNAPSHOTS_CHOICES,
|
||||
"tag_created_by_choices": get_tag_creator_choices(),
|
||||
"tag_year_choices": get_tag_year_choices(),
|
||||
"initial_tag_cards": build_tag_cards(
|
||||
query=query,
|
||||
request=request,
|
||||
sort=sort,
|
||||
@@ -121,62 +136,67 @@ class TagAdmin(BaseModelAdmin):
|
||||
year=year,
|
||||
has_snapshots=has_snapshots,
|
||||
),
|
||||
'tag_search_api_url': reverse('api-1:search_tags'),
|
||||
'tag_create_api_url': reverse('api-1:tags_create'),
|
||||
"tag_search_api_url": reverse("api-1:search_tags"),
|
||||
"tag_create_api_url": reverse("api-1:tags_create"),
|
||||
}
|
||||
return super().changelist_view(request, extra_context=extra_context)
|
||||
|
||||
def render_change_form(self, request, context, add=False, change=False, form_url='', obj=None):
|
||||
current_name = (request.POST.get('name') or '').strip()
|
||||
def render_change_form(self, request, context, add=False, change=False, form_url="", obj=None):
|
||||
current_name = (request.POST.get("name") or "").strip()
|
||||
if not current_name and obj:
|
||||
current_name = obj.name
|
||||
|
||||
similar_tag_cards = build_tag_cards(query=current_name, request=request, limit=12) if current_name else build_tag_cards(request=request, limit=12)
|
||||
similar_tag_cards = (
|
||||
build_tag_cards(query=current_name, request=request, limit=12) if current_name else build_tag_cards(request=request, limit=12)
|
||||
)
|
||||
if obj:
|
||||
similar_tag_cards = [card for card in similar_tag_cards if card['id'] != obj.pk]
|
||||
similar_tag_cards = [card for card in similar_tag_cards if card["id"] != obj.pk]
|
||||
|
||||
context.update({
|
||||
'tag_search_api_url': reverse('api-1:search_tags'),
|
||||
'tag_similar_cards': similar_tag_cards,
|
||||
'tag_similar_query': current_name,
|
||||
})
|
||||
context.update(
|
||||
{
|
||||
"tag_search_api_url": reverse("api-1:search_tags"),
|
||||
"tag_similar_cards": similar_tag_cards,
|
||||
"tag_similar_query": current_name,
|
||||
},
|
||||
)
|
||||
return super().render_change_form(request, context, add=add, change=change, form_url=form_url, obj=obj)
|
||||
|
||||
def response_add(self, request: HttpRequest, obj: Tag, post_url_continue=None):
|
||||
if IS_POPUP_VAR in request.POST or '_continue' in request.POST or '_addanother' in request.POST:
|
||||
if IS_POPUP_VAR in request.POST or "_continue" in request.POST or "_addanother" in request.POST:
|
||||
return super().response_add(request, obj, post_url_continue=post_url_continue)
|
||||
|
||||
self.message_user(request, f'Tag "{obj.name}" saved.', level=messages.SUCCESS)
|
||||
return self._redirect_to_changelist(obj.name)
|
||||
|
||||
def response_change(self, request: HttpRequest, obj: Tag):
|
||||
if IS_POPUP_VAR in request.POST or '_continue' in request.POST or '_addanother' in request.POST or '_saveasnew' in request.POST:
|
||||
if IS_POPUP_VAR in request.POST or "_continue" in request.POST or "_addanother" in request.POST or "_saveasnew" in request.POST:
|
||||
return super().response_change(request, obj)
|
||||
|
||||
self.message_user(request, f'Tag "{obj.name}" updated.', level=messages.SUCCESS)
|
||||
return self._redirect_to_changelist(obj.name)
|
||||
|
||||
def _redirect_to_changelist(self, query: str = '') -> HttpResponseRedirect:
|
||||
changelist_url = reverse('admin:core_tag_changelist')
|
||||
def _redirect_to_changelist(self, query: str = "") -> HttpResponseRedirect:
|
||||
changelist_url = reverse("admin:core_tag_changelist")
|
||||
if query:
|
||||
changelist_url = f'{changelist_url}?q={quote(query)}'
|
||||
changelist_url = f"{changelist_url}?q={quote(query)}"
|
||||
return HttpResponseRedirect(changelist_url)
|
||||
|
||||
@admin.display(description='Snapshots')
|
||||
@admin.display(description="Snapshots")
|
||||
def snapshots(self, tag: Tag):
|
||||
snapshots = tag.snapshot_set.select_related('crawl__created_by').order_by('-downloaded_at', '-created_at', '-pk')[:10]
|
||||
snapshots = tag.snapshot_set.select_related("crawl__created_by").order_by("-downloaded_at", "-created_at", "-pk")[:10]
|
||||
total_count = tag.snapshot_set.count()
|
||||
if not snapshots:
|
||||
return mark_safe(
|
||||
f'<p style="margin:0;color:#64748b;">No snapshots use this tag yet. '
|
||||
f'<a href="/admin/core/snapshot/?tags__id__exact={tag.id}">Open filtered snapshot list</a>.</p>'
|
||||
f'<a href="/admin/core/snapshot/?tags__id__exact={tag.id}">Open filtered snapshot list</a>.</p>',
|
||||
)
|
||||
|
||||
cards = []
|
||||
for snapshot in snapshots:
|
||||
title = (snapshot.title or '').strip() or snapshot.url
|
||||
cards.append(format_html(
|
||||
'''
|
||||
title = (snapshot.title or "").strip() or snapshot.url
|
||||
cards.append(
|
||||
format_html(
|
||||
"""
|
||||
<a href="{}" style="display:flex;align-items:center;gap:10px;padding:10px 12px;border:1px solid #e2e8f0;border-radius:12px;background:#fff;text-decoration:none;color:#0f172a;">
|
||||
<img src="{}" alt="" style="width:18px;height:18px;border-radius:4px;flex:0 0 auto;" onerror="this.style.display='none'">
|
||||
<span style="min-width:0;">
|
||||
@@ -184,23 +204,26 @@ class TagAdmin(BaseModelAdmin):
|
||||
<code style="display:block;color:#64748b;white-space:nowrap;overflow:hidden;text-overflow:ellipsis;">{}</code>
|
||||
</span>
|
||||
</a>
|
||||
''',
|
||||
reverse('admin:core_snapshot_change', args=[snapshot.pk]),
|
||||
build_snapshot_url(str(snapshot.pk), 'favicon.ico'),
|
||||
title[:120],
|
||||
snapshot.url[:120],
|
||||
))
|
||||
""",
|
||||
reverse("admin:core_snapshot_change", args=[snapshot.pk]),
|
||||
build_snapshot_url(str(snapshot.pk), "favicon.ico"),
|
||||
title[:120],
|
||||
snapshot.url[:120],
|
||||
),
|
||||
)
|
||||
|
||||
cards.append(format_html(
|
||||
'<a href="/admin/core/snapshot/?tags__id__exact={}" style="display:inline-flex;margin-top:10px;font-weight:600;">View all {} tagged snapshots</a>',
|
||||
tag.id,
|
||||
total_count,
|
||||
))
|
||||
return mark_safe('<div style="display:grid;gap:10px;">' + ''.join(cards) + '</div>')
|
||||
cards.append(
|
||||
format_html(
|
||||
'<a href="/admin/core/snapshot/?tags__id__exact={}" style="display:inline-flex;margin-top:10px;font-weight:600;">View all {} tagged snapshots</a>',
|
||||
tag.id,
|
||||
total_count,
|
||||
),
|
||||
)
|
||||
return mark_safe('<div style="display:grid;gap:10px;">' + "".join(cards) + "</div>")
|
||||
|
||||
@admin.display(description='Snapshots', ordering='num_snapshots')
|
||||
@admin.display(description="Snapshots", ordering="num_snapshots")
|
||||
def num_snapshots(self, tag: Tag):
|
||||
count = getattr(tag, 'num_snapshots', tag.snapshot_set.count())
|
||||
count = getattr(tag, "num_snapshots", tag.snapshot_set.count())
|
||||
return format_html(
|
||||
'<a href="/admin/core/snapshot/?tags__id__exact={}">{} total</a>',
|
||||
tag.id,
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
__package__ = 'archivebox.core'
|
||||
__package__ = "archivebox.core"
|
||||
|
||||
from django.contrib import admin
|
||||
from django.contrib.auth.admin import UserAdmin
|
||||
@@ -8,87 +8,100 @@ from django.utils.safestring import mark_safe
|
||||
|
||||
|
||||
class CustomUserAdmin(UserAdmin):
|
||||
sort_fields = ['id', 'email', 'username', 'is_superuser', 'last_login', 'date_joined']
|
||||
list_display = ['username', 'id', 'email', 'is_superuser', 'last_login', 'date_joined']
|
||||
readonly_fields = ('snapshot_set', 'archiveresult_set', 'tag_set', 'apitoken_set', 'outboundwebhook_set')
|
||||
sort_fields = ["id", "email", "username", "is_superuser", "last_login", "date_joined"]
|
||||
list_display = ["username", "id", "email", "is_superuser", "last_login", "date_joined"]
|
||||
readonly_fields = ("snapshot_set", "archiveresult_set", "tag_set", "apitoken_set", "outboundwebhook_set")
|
||||
|
||||
# Preserve Django's default user creation form and fieldsets
|
||||
# This ensures passwords are properly hashed and permissions are set correctly
|
||||
add_fieldsets = UserAdmin.add_fieldsets
|
||||
|
||||
# Extend fieldsets for change form only (not user creation)
|
||||
fieldsets = [*(UserAdmin.fieldsets or ()), ('Data', {'fields': readonly_fields})]
|
||||
fieldsets = [*(UserAdmin.fieldsets or ()), ("Data", {"fields": readonly_fields})]
|
||||
|
||||
@admin.display(description='Snapshots')
|
||||
@admin.display(description="Snapshots")
|
||||
def snapshot_set(self, obj):
|
||||
total_count = obj.snapshot_set.count()
|
||||
return mark_safe('<br/>'.join(
|
||||
format_html(
|
||||
'<code><a href="/admin/core/snapshot/{}/change"><b>[{}]</b></a></code> <b>📅 {}</b> {}',
|
||||
snap.pk,
|
||||
str(snap.id)[:8],
|
||||
snap.downloaded_at.strftime('%Y-%m-%d %H:%M') if snap.downloaded_at else 'pending...',
|
||||
snap.url[:64],
|
||||
return mark_safe(
|
||||
"<br/>".join(
|
||||
format_html(
|
||||
'<code><a href="/admin/core/snapshot/{}/change"><b>[{}]</b></a></code> <b>📅 {}</b> {}',
|
||||
snap.pk,
|
||||
str(snap.id)[:8],
|
||||
snap.downloaded_at.strftime("%Y-%m-%d %H:%M") if snap.downloaded_at else "pending...",
|
||||
snap.url[:64],
|
||||
)
|
||||
for snap in obj.snapshot_set.order_by("-modified_at")[:10]
|
||||
)
|
||||
for snap in obj.snapshot_set.order_by('-modified_at')[:10]
|
||||
) + f'<br/><a href="/admin/core/snapshot/?created_by__id__exact={obj.pk}">{total_count} total records...<a>')
|
||||
+ f'<br/><a href="/admin/core/snapshot/?created_by__id__exact={obj.pk}">{total_count} total records...<a>',
|
||||
)
|
||||
|
||||
@admin.display(description='Archive Result Logs')
|
||||
@admin.display(description="Archive Result Logs")
|
||||
def archiveresult_set(self, obj):
|
||||
total_count = obj.archiveresult_set.count()
|
||||
return mark_safe('<br/>'.join(
|
||||
format_html(
|
||||
'<code><a href="/admin/core/archiveresult/{}/change"><b>[{}]</b></a></code> <b>📅 {}</b> <b>📄 {}</b> {}',
|
||||
result.pk,
|
||||
str(result.id)[:8],
|
||||
result.snapshot.downloaded_at.strftime('%Y-%m-%d %H:%M') if result.snapshot.downloaded_at else 'pending...',
|
||||
result.extractor,
|
||||
result.snapshot.url[:64],
|
||||
return mark_safe(
|
||||
"<br/>".join(
|
||||
format_html(
|
||||
'<code><a href="/admin/core/archiveresult/{}/change"><b>[{}]</b></a></code> <b>📅 {}</b> <b>📄 {}</b> {}',
|
||||
result.pk,
|
||||
str(result.id)[:8],
|
||||
result.snapshot.downloaded_at.strftime("%Y-%m-%d %H:%M") if result.snapshot.downloaded_at else "pending...",
|
||||
result.extractor,
|
||||
result.snapshot.url[:64],
|
||||
)
|
||||
for result in obj.archiveresult_set.order_by("-modified_at")[:10]
|
||||
)
|
||||
for result in obj.archiveresult_set.order_by('-modified_at')[:10]
|
||||
) + f'<br/><a href="/admin/core/archiveresult/?created_by__id__exact={obj.pk}">{total_count} total records...<a>')
|
||||
+ f'<br/><a href="/admin/core/archiveresult/?created_by__id__exact={obj.pk}">{total_count} total records...<a>',
|
||||
)
|
||||
|
||||
@admin.display(description='Tags')
|
||||
@admin.display(description="Tags")
|
||||
def tag_set(self, obj):
|
||||
total_count = obj.tag_set.count()
|
||||
return mark_safe(', '.join(
|
||||
format_html(
|
||||
'<code><a href="/admin/core/tag/{}/change"><b>{}</b></a></code>',
|
||||
tag.pk,
|
||||
tag.name,
|
||||
return mark_safe(
|
||||
", ".join(
|
||||
format_html(
|
||||
'<code><a href="/admin/core/tag/{}/change"><b>{}</b></a></code>',
|
||||
tag.pk,
|
||||
tag.name,
|
||||
)
|
||||
for tag in obj.tag_set.order_by("-modified_at")[:10]
|
||||
)
|
||||
for tag in obj.tag_set.order_by('-modified_at')[:10]
|
||||
) + f'<br/><a href="/admin/core/tag/?created_by__id__exact={obj.pk}">{total_count} total records...<a>')
|
||||
+ f'<br/><a href="/admin/core/tag/?created_by__id__exact={obj.pk}">{total_count} total records...<a>',
|
||||
)
|
||||
|
||||
@admin.display(description='API Tokens')
|
||||
@admin.display(description="API Tokens")
|
||||
def apitoken_set(self, obj):
|
||||
total_count = obj.apitoken_set.count()
|
||||
return mark_safe('<br/>'.join(
|
||||
format_html(
|
||||
'<code><a href="/admin/api/apitoken/{}/change"><b>[{}]</b></a></code> {} (expires {})',
|
||||
apitoken.pk,
|
||||
str(apitoken.id)[:8],
|
||||
apitoken.token_redacted[:64],
|
||||
apitoken.expires,
|
||||
return mark_safe(
|
||||
"<br/>".join(
|
||||
format_html(
|
||||
'<code><a href="/admin/api/apitoken/{}/change"><b>[{}]</b></a></code> {} (expires {})',
|
||||
apitoken.pk,
|
||||
str(apitoken.id)[:8],
|
||||
apitoken.token_redacted[:64],
|
||||
apitoken.expires,
|
||||
)
|
||||
for apitoken in obj.apitoken_set.order_by("-modified_at")[:10]
|
||||
)
|
||||
for apitoken in obj.apitoken_set.order_by('-modified_at')[:10]
|
||||
) + f'<br/><a href="/admin/api/apitoken/?created_by__id__exact={obj.pk}">{total_count} total records...<a>')
|
||||
+ f'<br/><a href="/admin/api/apitoken/?created_by__id__exact={obj.pk}">{total_count} total records...<a>',
|
||||
)
|
||||
|
||||
@admin.display(description='API Outbound Webhooks')
|
||||
@admin.display(description="API Outbound Webhooks")
|
||||
def outboundwebhook_set(self, obj):
|
||||
total_count = obj.outboundwebhook_set.count()
|
||||
return mark_safe('<br/>'.join(
|
||||
format_html(
|
||||
'<code><a href="/admin/api/outboundwebhook/{}/change"><b>[{}]</b></a></code> {} -> {}',
|
||||
outboundwebhook.pk,
|
||||
str(outboundwebhook.id)[:8],
|
||||
outboundwebhook.referenced_model,
|
||||
outboundwebhook.endpoint,
|
||||
return mark_safe(
|
||||
"<br/>".join(
|
||||
format_html(
|
||||
'<code><a href="/admin/api/outboundwebhook/{}/change"><b>[{}]</b></a></code> {} -> {}',
|
||||
outboundwebhook.pk,
|
||||
str(outboundwebhook.id)[:8],
|
||||
outboundwebhook.referenced_model,
|
||||
outboundwebhook.endpoint,
|
||||
)
|
||||
for outboundwebhook in obj.outboundwebhook_set.order_by("-modified_at")[:10]
|
||||
)
|
||||
for outboundwebhook in obj.outboundwebhook_set.order_by('-modified_at')[:10]
|
||||
) + f'<br/><a href="/admin/api/outboundwebhook/?created_by__id__exact={obj.pk}">{total_count} total records...<a>')
|
||||
|
||||
|
||||
+ f'<br/><a href="/admin/api/outboundwebhook/?created_by__id__exact={obj.pk}">{total_count} total records...<a>',
|
||||
)
|
||||
|
||||
|
||||
def register_admin(admin_site):
|
||||
|
||||
@@ -1,12 +1,12 @@
|
||||
__package__ = 'archivebox.core'
|
||||
__package__ = "archivebox.core"
|
||||
|
||||
from django.apps import AppConfig
|
||||
import os
|
||||
|
||||
|
||||
class CoreConfig(AppConfig):
|
||||
name = 'archivebox.core'
|
||||
label = 'core'
|
||||
name = "archivebox.core"
|
||||
label = "core"
|
||||
|
||||
def ready(self):
|
||||
"""Register the archivebox.core.admin_site as the main django admin site"""
|
||||
@@ -14,29 +14,30 @@ class CoreConfig(AppConfig):
|
||||
from django.utils.autoreload import DJANGO_AUTORELOAD_ENV
|
||||
|
||||
from archivebox.core.admin_site import register_admin_site
|
||||
|
||||
register_admin_site()
|
||||
|
||||
# Import models to register state machines with the registry
|
||||
# Skip during makemigrations to avoid premature state machine access
|
||||
if 'makemigrations' not in sys.argv:
|
||||
if "makemigrations" not in sys.argv:
|
||||
from archivebox.core import models # noqa: F401
|
||||
|
||||
pidfile = os.environ.get('ARCHIVEBOX_RUNSERVER_PIDFILE')
|
||||
pidfile = os.environ.get("ARCHIVEBOX_RUNSERVER_PIDFILE")
|
||||
if pidfile:
|
||||
should_write_pid = True
|
||||
if os.environ.get('ARCHIVEBOX_AUTORELOAD') == '1':
|
||||
should_write_pid = os.environ.get(DJANGO_AUTORELOAD_ENV) == 'true'
|
||||
if os.environ.get("ARCHIVEBOX_AUTORELOAD") == "1":
|
||||
should_write_pid = os.environ.get(DJANGO_AUTORELOAD_ENV) == "true"
|
||||
if should_write_pid:
|
||||
try:
|
||||
with open(pidfile, 'w') as handle:
|
||||
with open(pidfile, "w") as handle:
|
||||
handle.write(str(os.getpid()))
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
def _should_prepare_runtime() -> bool:
|
||||
if os.environ.get('ARCHIVEBOX_RUNSERVER') == '1':
|
||||
if os.environ.get('ARCHIVEBOX_AUTORELOAD') == '1':
|
||||
return os.environ.get(DJANGO_AUTORELOAD_ENV) == 'true'
|
||||
if os.environ.get("ARCHIVEBOX_RUNSERVER") == "1":
|
||||
if os.environ.get("ARCHIVEBOX_AUTORELOAD") == "1":
|
||||
return os.environ.get(DJANGO_AUTORELOAD_ENV) == "true"
|
||||
return True
|
||||
return False
|
||||
|
||||
@@ -44,4 +45,5 @@ class CoreConfig(AppConfig):
|
||||
from archivebox.machine.models import Process, Machine
|
||||
|
||||
Process.cleanup_stale_running()
|
||||
Process.cleanup_orphaned_workers()
|
||||
Machine.current()
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
__package__ = 'archivebox.core'
|
||||
__package__ = "archivebox.core"
|
||||
|
||||
from django import forms
|
||||
from django.utils.html import format_html
|
||||
|
||||
from archivebox.misc.util import URL_REGEX, find_all_urls
|
||||
from archivebox.misc.util import URL_REGEX, find_all_urls, parse_filesize_to_bytes
|
||||
from taggit.utils import edit_string_for_tags, parse_tags
|
||||
from archivebox.base_models.admin import KeyValueWidget
|
||||
from archivebox.crawls.schedule_utils import validate_schedule
|
||||
@@ -13,11 +13,11 @@ from archivebox.hooks import get_plugins, discover_plugin_configs, get_plugin_ic
|
||||
from archivebox.personas.models import Persona
|
||||
|
||||
DEPTH_CHOICES = (
|
||||
('0', 'depth = 0 (archive just these URLs)'),
|
||||
('1', 'depth = 1 (+ URLs one hop away)'),
|
||||
('2', 'depth = 2 (+ URLs two hops away)'),
|
||||
('3', 'depth = 3 (+ URLs three hops away)'),
|
||||
('4', 'depth = 4 (+ URLs four hops away)'),
|
||||
("0", "depth = 0 (archive just these URLs)"),
|
||||
("1", "depth = 1 (+ URLs one hop away)"),
|
||||
("2", "depth = 2 (+ URLs two hops away)"),
|
||||
("3", "depth = 3 (+ URLs three hops away)"),
|
||||
("4", "depth = 4 (+ URLs four hops away)"),
|
||||
)
|
||||
|
||||
|
||||
@@ -28,7 +28,7 @@ def get_plugin_choices():
|
||||
|
||||
def get_plugin_choice_label(plugin_name: str, plugin_configs: dict[str, dict]) -> str:
|
||||
schema = plugin_configs.get(plugin_name, {})
|
||||
description = str(schema.get('description') or '').strip()
|
||||
description = str(schema.get("description") or "").strip()
|
||||
if not description:
|
||||
return plugin_name
|
||||
icon_html = get_plugin_icon(plugin_name)
|
||||
@@ -45,7 +45,7 @@ def get_plugin_choice_label(plugin_name: str, plugin_configs: dict[str, dict]) -
|
||||
def get_choice_field(form: forms.Form, name: str) -> forms.ChoiceField:
|
||||
field = form.fields[name]
|
||||
if not isinstance(field, forms.ChoiceField):
|
||||
raise TypeError(f'{name} must be a ChoiceField')
|
||||
raise TypeError(f"{name} must be a ChoiceField")
|
||||
return field
|
||||
|
||||
|
||||
@@ -54,10 +54,12 @@ class AddLinkForm(forms.Form):
|
||||
url = forms.CharField(
|
||||
label="URLs",
|
||||
strip=True,
|
||||
widget=forms.Textarea(attrs={
|
||||
'data-url-regex': URL_REGEX.pattern,
|
||||
}),
|
||||
required=True
|
||||
widget=forms.Textarea(
|
||||
attrs={
|
||||
"data-url-regex": URL_REGEX.pattern,
|
||||
},
|
||||
),
|
||||
required=True,
|
||||
)
|
||||
tag = forms.CharField(
|
||||
label="Tags",
|
||||
@@ -68,16 +70,41 @@ class AddLinkForm(forms.Form):
|
||||
depth = forms.ChoiceField(
|
||||
label="Archive depth",
|
||||
choices=DEPTH_CHOICES,
|
||||
initial='0',
|
||||
widget=forms.RadioSelect(attrs={"class": "depth-selection"})
|
||||
initial="0",
|
||||
widget=forms.RadioSelect(attrs={"class": "depth-selection"}),
|
||||
)
|
||||
max_urls = forms.IntegerField(
|
||||
label="Max URLs",
|
||||
required=False,
|
||||
min_value=0,
|
||||
initial=0,
|
||||
widget=forms.NumberInput(
|
||||
attrs={
|
||||
"min": 0,
|
||||
"step": 1,
|
||||
"placeholder": "0 = unlimited",
|
||||
},
|
||||
),
|
||||
)
|
||||
max_size = forms.CharField(
|
||||
label="Max size",
|
||||
required=False,
|
||||
initial="0",
|
||||
widget=forms.TextInput(
|
||||
attrs={
|
||||
"placeholder": "0 = unlimited, or e.g. 45mb / 1gb",
|
||||
},
|
||||
),
|
||||
)
|
||||
notes = forms.CharField(
|
||||
label="Notes",
|
||||
strip=True,
|
||||
required=False,
|
||||
widget=forms.TextInput(attrs={
|
||||
'placeholder': 'Optional notes about this crawl',
|
||||
})
|
||||
widget=forms.TextInput(
|
||||
attrs={
|
||||
"placeholder": "Optional notes about this crawl",
|
||||
},
|
||||
),
|
||||
)
|
||||
url_filters = forms.Field(
|
||||
label="URL allowlist / denylist",
|
||||
@@ -128,16 +155,18 @@ class AddLinkForm(forms.Form):
|
||||
label="Repeat schedule",
|
||||
max_length=64,
|
||||
required=False,
|
||||
widget=forms.TextInput(attrs={
|
||||
'placeholder': 'e.g., daily, weekly, 0 */6 * * * (every 6 hours)',
|
||||
})
|
||||
widget=forms.TextInput(
|
||||
attrs={
|
||||
"placeholder": "e.g., daily, weekly, 0 */6 * * * (every 6 hours)",
|
||||
},
|
||||
),
|
||||
)
|
||||
persona = forms.ModelChoiceField(
|
||||
label="Persona (authentication profile)",
|
||||
required=False,
|
||||
queryset=Persona.objects.none(),
|
||||
empty_label=None,
|
||||
to_field_name='name',
|
||||
to_field_name="name",
|
||||
)
|
||||
index_only = forms.BooleanField(
|
||||
label="Index only dry run (add crawl but don't archive yet)",
|
||||
@@ -155,8 +184,8 @@ class AddLinkForm(forms.Form):
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
default_persona = Persona.get_or_create_default()
|
||||
self.fields['persona'].queryset = Persona.objects.order_by('name')
|
||||
self.fields['persona'].initial = default_persona.name
|
||||
self.fields["persona"].queryset = Persona.objects.order_by("name")
|
||||
self.fields["persona"].initial = default_persona.name
|
||||
|
||||
# Get all plugins
|
||||
all_plugins = get_plugins()
|
||||
@@ -164,86 +193,136 @@ class AddLinkForm(forms.Form):
|
||||
|
||||
# Define plugin groups
|
||||
chrome_dependent = {
|
||||
'accessibility', 'chrome', 'consolelog', 'dom', 'headers',
|
||||
'parse_dom_outlinks', 'pdf', 'redirects', 'responses',
|
||||
'screenshot', 'seo', 'singlefile', 'ssl', 'staticfile', 'title'
|
||||
"accessibility",
|
||||
"chrome",
|
||||
"consolelog",
|
||||
"dom",
|
||||
"headers",
|
||||
"parse_dom_outlinks",
|
||||
"pdf",
|
||||
"redirects",
|
||||
"responses",
|
||||
"screenshot",
|
||||
"seo",
|
||||
"singlefile",
|
||||
"ssl",
|
||||
"staticfile",
|
||||
"title",
|
||||
}
|
||||
archiving = {
|
||||
'archivedotorg', 'defuddle', 'favicon', 'forumdl', 'gallerydl', 'git',
|
||||
'htmltotext', 'mercury', 'papersdl', 'readability', 'trafilatura', 'wget', 'ytdlp'
|
||||
"archivedotorg",
|
||||
"defuddle",
|
||||
"favicon",
|
||||
"forumdl",
|
||||
"gallerydl",
|
||||
"git",
|
||||
"htmltotext",
|
||||
"mercury",
|
||||
"papersdl",
|
||||
"readability",
|
||||
"trafilatura",
|
||||
"wget",
|
||||
"ytdlp",
|
||||
}
|
||||
parsing = {
|
||||
'parse_html_urls', 'parse_jsonl_urls',
|
||||
'parse_netscape_urls', 'parse_rss_urls', 'parse_txt_urls'
|
||||
"parse_html_urls",
|
||||
"parse_jsonl_urls",
|
||||
"parse_netscape_urls",
|
||||
"parse_rss_urls",
|
||||
"parse_txt_urls",
|
||||
}
|
||||
search = {
|
||||
'search_backend_ripgrep', 'search_backend_sonic', 'search_backend_sqlite'
|
||||
"search_backend_ripgrep",
|
||||
"search_backend_sonic",
|
||||
"search_backend_sqlite",
|
||||
}
|
||||
binary = {'apt', 'brew', 'custom', 'env', 'npm', 'pip'}
|
||||
extensions = {'twocaptcha', 'istilldontcareaboutcookies', 'ublock'}
|
||||
binary = {"apt", "brew", "custom", "env", "npm", "pip"}
|
||||
extensions = {"twocaptcha", "istilldontcareaboutcookies", "ublock"}
|
||||
|
||||
# Populate plugin field choices
|
||||
get_choice_field(self, 'chrome_plugins').choices = [
|
||||
get_choice_field(self, "chrome_plugins").choices = [
|
||||
(p, get_plugin_choice_label(p, plugin_configs)) for p in sorted(all_plugins) if p in chrome_dependent
|
||||
]
|
||||
get_choice_field(self, 'archiving_plugins').choices = [
|
||||
get_choice_field(self, "archiving_plugins").choices = [
|
||||
(p, get_plugin_choice_label(p, plugin_configs)) for p in sorted(all_plugins) if p in archiving
|
||||
]
|
||||
get_choice_field(self, 'parsing_plugins').choices = [
|
||||
get_choice_field(self, "parsing_plugins").choices = [
|
||||
(p, get_plugin_choice_label(p, plugin_configs)) for p in sorted(all_plugins) if p in parsing
|
||||
]
|
||||
get_choice_field(self, 'search_plugins').choices = [
|
||||
get_choice_field(self, "search_plugins").choices = [
|
||||
(p, get_plugin_choice_label(p, plugin_configs)) for p in sorted(all_plugins) if p in search
|
||||
]
|
||||
get_choice_field(self, 'binary_plugins').choices = [
|
||||
get_choice_field(self, "binary_plugins").choices = [
|
||||
(p, get_plugin_choice_label(p, plugin_configs)) for p in sorted(all_plugins) if p in binary
|
||||
]
|
||||
get_choice_field(self, 'extension_plugins').choices = [
|
||||
get_choice_field(self, "extension_plugins").choices = [
|
||||
(p, get_plugin_choice_label(p, plugin_configs)) for p in sorted(all_plugins) if p in extensions
|
||||
]
|
||||
|
||||
required_search_plugin = f'search_backend_{SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE}'.strip()
|
||||
search_choices = [choice[0] for choice in get_choice_field(self, 'search_plugins').choices]
|
||||
required_search_plugin = f"search_backend_{SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE}".strip()
|
||||
search_choices = [choice[0] for choice in get_choice_field(self, "search_plugins").choices]
|
||||
if required_search_plugin in search_choices:
|
||||
get_choice_field(self, 'search_plugins').initial = [required_search_plugin]
|
||||
get_choice_field(self, "search_plugins").initial = [required_search_plugin]
|
||||
|
||||
def clean(self):
|
||||
cleaned_data = super().clean() or {}
|
||||
|
||||
# Combine all plugin groups into single list
|
||||
all_selected_plugins = []
|
||||
for field in ['chrome_plugins', 'archiving_plugins', 'parsing_plugins',
|
||||
'search_plugins', 'binary_plugins', 'extension_plugins']:
|
||||
for field in [
|
||||
"chrome_plugins",
|
||||
"archiving_plugins",
|
||||
"parsing_plugins",
|
||||
"search_plugins",
|
||||
"binary_plugins",
|
||||
"extension_plugins",
|
||||
]:
|
||||
selected = cleaned_data.get(field)
|
||||
if isinstance(selected, list):
|
||||
all_selected_plugins.extend(selected)
|
||||
|
||||
# Store combined list for easy access
|
||||
cleaned_data['plugins'] = all_selected_plugins
|
||||
cleaned_data["plugins"] = all_selected_plugins
|
||||
|
||||
return cleaned_data
|
||||
|
||||
def clean_url(self):
|
||||
value = self.cleaned_data.get('url') or ''
|
||||
urls = '\n'.join(find_all_urls(value))
|
||||
value = self.cleaned_data.get("url") or ""
|
||||
urls = "\n".join(find_all_urls(value))
|
||||
if not urls:
|
||||
raise forms.ValidationError('Enter at least one valid URL.')
|
||||
raise forms.ValidationError("Enter at least one valid URL.")
|
||||
return urls
|
||||
|
||||
def clean_url_filters(self):
|
||||
from archivebox.crawls.models import Crawl
|
||||
|
||||
value = self.cleaned_data.get('url_filters') or {}
|
||||
value = self.cleaned_data.get("url_filters") or {}
|
||||
return {
|
||||
'allowlist': '\n'.join(Crawl.split_filter_patterns(value.get('allowlist', ''))),
|
||||
'denylist': '\n'.join(Crawl.split_filter_patterns(value.get('denylist', ''))),
|
||||
'same_domain_only': bool(value.get('same_domain_only')),
|
||||
"allowlist": "\n".join(Crawl.split_filter_patterns(value.get("allowlist", ""))),
|
||||
"denylist": "\n".join(Crawl.split_filter_patterns(value.get("denylist", ""))),
|
||||
"same_domain_only": bool(value.get("same_domain_only")),
|
||||
}
|
||||
|
||||
def clean_max_urls(self):
|
||||
value = self.cleaned_data.get("max_urls")
|
||||
return int(value or 0)
|
||||
|
||||
def clean_max_size(self):
|
||||
raw_value = str(self.cleaned_data.get("max_size") or "").strip()
|
||||
if not raw_value:
|
||||
return 0
|
||||
try:
|
||||
value = parse_filesize_to_bytes(raw_value)
|
||||
except ValueError as err:
|
||||
raise forms.ValidationError(str(err))
|
||||
if value < 0:
|
||||
raise forms.ValidationError("Max size must be 0 or a positive number of bytes.")
|
||||
return value
|
||||
|
||||
def clean_schedule(self):
|
||||
schedule = (self.cleaned_data.get('schedule') or '').strip()
|
||||
schedule = (self.cleaned_data.get("schedule") or "").strip()
|
||||
if not schedule:
|
||||
return ''
|
||||
return ""
|
||||
|
||||
try:
|
||||
validate_schedule(schedule)
|
||||
@@ -269,7 +348,7 @@ class TagField(forms.CharField):
|
||||
return parse_tags(value)
|
||||
except ValueError:
|
||||
raise forms.ValidationError(
|
||||
"Please provide a comma-separated list of tags."
|
||||
"Please provide a comma-separated list of tags.",
|
||||
)
|
||||
|
||||
def has_changed(self, initial, data):
|
||||
|
||||
@@ -1,7 +1,5 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from urllib.parse import urlparse
|
||||
|
||||
@@ -9,6 +7,7 @@ from archivebox.config.common import SERVER_CONFIG
|
||||
|
||||
|
||||
_SNAPSHOT_ID_RE = re.compile(r"^[0-9a-fA-F-]{8,36}$")
|
||||
_SNAPSHOT_SUBDOMAIN_RE = re.compile(r"^snap-(?P<suffix>[0-9a-fA-F]{12})$")
|
||||
|
||||
|
||||
def split_host_port(host: str) -> tuple[str, str | None]:
|
||||
@@ -71,21 +70,29 @@ def get_web_host() -> str:
|
||||
return urlparse(override).netloc.lower()
|
||||
return _build_listen_host("web")
|
||||
|
||||
|
||||
def get_api_host() -> str:
|
||||
if not SERVER_CONFIG.USES_SUBDOMAIN_ROUTING:
|
||||
return get_listen_host().lower()
|
||||
return _build_listen_host("api")
|
||||
|
||||
|
||||
def get_public_host() -> str:
|
||||
if not SERVER_CONFIG.USES_SUBDOMAIN_ROUTING:
|
||||
return get_listen_host().lower()
|
||||
return _build_listen_host("public")
|
||||
|
||||
|
||||
def get_snapshot_subdomain(snapshot_id: str) -> str:
|
||||
normalized = re.sub(r"[^0-9a-fA-F]", "", snapshot_id or "")
|
||||
suffix = (normalized[-12:] if len(normalized) >= 12 else normalized).lower()
|
||||
return f"snap-{suffix}"
|
||||
|
||||
|
||||
def get_snapshot_host(snapshot_id: str) -> str:
|
||||
if not SERVER_CONFIG.USES_SUBDOMAIN_ROUTING:
|
||||
return get_listen_host().lower()
|
||||
return _build_listen_host(snapshot_id)
|
||||
return _build_listen_host(get_snapshot_subdomain(snapshot_id))
|
||||
|
||||
|
||||
def get_original_host(domain: str) -> str:
|
||||
@@ -95,7 +102,16 @@ def get_original_host(domain: str) -> str:
|
||||
|
||||
|
||||
def is_snapshot_subdomain(subdomain: str) -> bool:
|
||||
return bool(_SNAPSHOT_ID_RE.match(subdomain or ""))
|
||||
value = (subdomain or "").strip()
|
||||
return bool(_SNAPSHOT_SUBDOMAIN_RE.match(value) or _SNAPSHOT_ID_RE.match(value))
|
||||
|
||||
|
||||
def get_snapshot_lookup_key(snapshot_ref: str) -> str:
|
||||
value = (snapshot_ref or "").strip().lower()
|
||||
match = _SNAPSHOT_SUBDOMAIN_RE.match(value)
|
||||
if match:
|
||||
return match.group("suffix")
|
||||
return value
|
||||
|
||||
|
||||
def get_listen_subdomain(request_host: str) -> str:
|
||||
@@ -141,22 +157,23 @@ def _build_base_url_for_host(host: str, request=None) -> str:
|
||||
|
||||
|
||||
def get_admin_base_url(request=None) -> str:
|
||||
if not SERVER_CONFIG.USES_SUBDOMAIN_ROUTING:
|
||||
return _build_base_url_for_host(get_listen_host(), request=request)
|
||||
override = _normalize_base_url(SERVER_CONFIG.ADMIN_BASE_URL)
|
||||
if override:
|
||||
return override
|
||||
if not SERVER_CONFIG.USES_SUBDOMAIN_ROUTING:
|
||||
return _build_base_url_for_host(get_listen_host(), request=request)
|
||||
return _build_base_url_for_host(get_admin_host(), request=request)
|
||||
|
||||
|
||||
def get_web_base_url(request=None) -> str:
|
||||
if not SERVER_CONFIG.USES_SUBDOMAIN_ROUTING:
|
||||
return _build_base_url_for_host(get_listen_host(), request=request)
|
||||
override = _normalize_base_url(SERVER_CONFIG.ARCHIVE_BASE_URL)
|
||||
if override:
|
||||
return override
|
||||
if not SERVER_CONFIG.USES_SUBDOMAIN_ROUTING:
|
||||
return _build_base_url_for_host(get_listen_host(), request=request)
|
||||
return _build_base_url_for_host(get_web_host(), request=request)
|
||||
|
||||
|
||||
def get_api_base_url(request=None) -> str:
|
||||
if not SERVER_CONFIG.USES_SUBDOMAIN_ROUTING:
|
||||
return _build_base_url_for_host(get_listen_host(), request=request)
|
||||
@@ -191,6 +208,7 @@ def build_admin_url(path: str = "", request=None) -> str:
|
||||
def build_web_url(path: str = "", request=None) -> str:
|
||||
return _build_url(get_web_base_url(request), path)
|
||||
|
||||
|
||||
def build_api_url(path: str = "", request=None) -> str:
|
||||
return _build_url(get_api_base_url(request), path)
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
__package__ = 'archivebox'
|
||||
__package__ = "archivebox"
|
||||
|
||||
from django.core.management.base import BaseCommand
|
||||
|
||||
@@ -6,13 +6,12 @@ from archivebox.cli import main as run_cli
|
||||
|
||||
|
||||
class Command(BaseCommand):
|
||||
help = 'Run an ArchiveBox CLI subcommand (e.g. add, remove, list, etc)'
|
||||
help = "Run an ArchiveBox CLI subcommand (e.g. add, remove, list, etc)"
|
||||
|
||||
def add_arguments(self, parser):
|
||||
parser.add_argument('subcommand', type=str, help='The subcommand you want to run')
|
||||
parser.add_argument('command_args', nargs='*', help='Arguments to pass to the subcommand')
|
||||
|
||||
parser.add_argument("subcommand", type=str, help="The subcommand you want to run")
|
||||
parser.add_argument("command_args", nargs="*", help="Arguments to pass to the subcommand")
|
||||
|
||||
def handle(self, *args, **kwargs):
|
||||
command_args = [kwargs['subcommand'], *kwargs['command_args']]
|
||||
command_args = [kwargs["subcommand"], *kwargs["command_args"]]
|
||||
run_cli(args=command_args)
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
__package__ = 'archivebox.core'
|
||||
__package__ = "archivebox.core"
|
||||
|
||||
import ipaddress
|
||||
import re
|
||||
@@ -16,6 +16,7 @@ from archivebox.config.common import SERVER_CONFIG
|
||||
from archivebox.config import VERSION
|
||||
from archivebox.config.version import get_COMMIT_HASH
|
||||
from archivebox.core.host_utils import (
|
||||
build_snapshot_url,
|
||||
build_admin_url,
|
||||
build_web_url,
|
||||
get_api_host,
|
||||
@@ -31,10 +32,10 @@ from archivebox.core.host_utils import (
|
||||
from archivebox.core.views import SnapshotHostView, OriginalDomainHostView
|
||||
|
||||
|
||||
def detect_timezone(request, activate: bool=True):
|
||||
gmt_offset = (request.COOKIES.get('GMT_OFFSET') or '').strip()
|
||||
def detect_timezone(request, activate: bool = True):
|
||||
gmt_offset = (request.COOKIES.get("GMT_OFFSET") or "").strip()
|
||||
tz = None
|
||||
if gmt_offset.replace('-', '').isdigit():
|
||||
if gmt_offset.replace("-", "").isdigit():
|
||||
tz = timezone.get_fixed_timezone(int(gmt_offset))
|
||||
if activate:
|
||||
timezone.activate(tz)
|
||||
@@ -53,11 +54,12 @@ def TimezoneMiddleware(get_response):
|
||||
def CacheControlMiddleware(get_response):
|
||||
snapshot_path_re = re.compile(r"^/[^/]+/\\d{8}/[^/]+/[0-9a-fA-F-]{8,36}/")
|
||||
static_cache_key = (get_COMMIT_HASH() or VERSION or "dev").strip()
|
||||
|
||||
def middleware(request):
|
||||
response = get_response(request)
|
||||
|
||||
if request.path.startswith('/static/'):
|
||||
rel_path = request.path[len('/static/'):]
|
||||
if request.path.startswith("/static/"):
|
||||
rel_path = request.path[len("/static/") :]
|
||||
static_path = finders.find(rel_path)
|
||||
if static_path:
|
||||
try:
|
||||
@@ -81,10 +83,10 @@ def CacheControlMiddleware(get_response):
|
||||
response.headers["Last-Modified"] = http_date(mtime)
|
||||
return response
|
||||
|
||||
if '/archive/' in request.path or '/static/' in request.path or snapshot_path_re.match(request.path):
|
||||
if not response.get('Cache-Control'):
|
||||
policy = 'public' if SERVER_CONFIG.PUBLIC_SNAPSHOTS else 'private'
|
||||
response['Cache-Control'] = f'{policy}, max-age=60, stale-while-revalidate=300'
|
||||
if "/archive/" in request.path or "/static/" in request.path or snapshot_path_re.match(request.path):
|
||||
if not response.get("Cache-Control"):
|
||||
policy = "public" if SERVER_CONFIG.PUBLIC_SNAPSHOTS else "private"
|
||||
response["Cache-Control"] = f"{policy}, max-age=60, stale-while-revalidate=300"
|
||||
# print('Set Cache-Control header to', response['Cache-Control'])
|
||||
return response
|
||||
|
||||
@@ -115,6 +117,10 @@ def ServerSecurityModeMiddleware(get_response):
|
||||
|
||||
|
||||
def HostRoutingMiddleware(get_response):
|
||||
snapshot_path_re = re.compile(
|
||||
r"^/(?P<username>[^/]+)/(?P<date>\d{4}(?:\d{2})?(?:\d{2})?)/(?P<domain>[^/]+)/(?P<snapshot_id>[0-9a-fA-F-]{8,36})(?:/(?P<path>.*))?$",
|
||||
)
|
||||
|
||||
def middleware(request):
|
||||
request_host = (request.get_host() or "").lower()
|
||||
admin_host = get_admin_host()
|
||||
@@ -124,6 +130,23 @@ def HostRoutingMiddleware(get_response):
|
||||
listen_host = get_listen_host()
|
||||
subdomain = get_listen_subdomain(request_host)
|
||||
|
||||
# Framework-owned assets must bypass snapshot/original-domain replay routing.
|
||||
# Otherwise pages on snapshot subdomains can receive HTML for JS/CSS requests.
|
||||
if request.path.startswith("/static/") or request.path in {"/favicon.ico", "/robots.txt"}:
|
||||
return get_response(request)
|
||||
|
||||
if SERVER_CONFIG.USES_SUBDOMAIN_ROUTING and not host_matches(request_host, admin_host):
|
||||
if (
|
||||
request.path == "/admin"
|
||||
or request.path.startswith("/admin/")
|
||||
or request.path == "/accounts"
|
||||
or request.path.startswith("/accounts/")
|
||||
):
|
||||
target = build_admin_url(request.path, request=request)
|
||||
if request.META.get("QUERY_STRING"):
|
||||
target = f"{target}?{request.META['QUERY_STRING']}"
|
||||
return redirect(target)
|
||||
|
||||
if not SERVER_CONFIG.USES_SUBDOMAIN_ROUTING:
|
||||
if host_matches(request_host, listen_host):
|
||||
return get_response(request)
|
||||
@@ -140,6 +163,16 @@ def HostRoutingMiddleware(get_response):
|
||||
return get_response(request)
|
||||
|
||||
if host_matches(request_host, admin_host):
|
||||
snapshot_match = snapshot_path_re.match(request.path)
|
||||
if SERVER_CONFIG.USES_SUBDOMAIN_ROUTING and snapshot_match:
|
||||
snapshot_id = snapshot_match.group("snapshot_id")
|
||||
replay_path = (snapshot_match.group("path") or "").strip("/")
|
||||
if replay_path == "index.html":
|
||||
replay_path = ""
|
||||
target = build_snapshot_url(snapshot_id, replay_path, request=request)
|
||||
if request.META.get("QUERY_STRING"):
|
||||
target = f"{target}?{request.META['QUERY_STRING']}"
|
||||
return redirect(target)
|
||||
return get_response(request)
|
||||
|
||||
if host_matches(request_host, api_host):
|
||||
@@ -160,16 +193,9 @@ def HostRoutingMiddleware(get_response):
|
||||
if host_matches(request_host, web_host):
|
||||
request.user = AnonymousUser()
|
||||
request._cached_user = request.user
|
||||
if request.path.startswith("/admin"):
|
||||
target = build_admin_url(request.path, request=request)
|
||||
if request.META.get("QUERY_STRING"):
|
||||
target = f"{target}?{request.META['QUERY_STRING']}"
|
||||
return redirect(target)
|
||||
return get_response(request)
|
||||
|
||||
if host_matches(request_host, public_host):
|
||||
request.user = AnonymousUser()
|
||||
request._cached_user = request.user
|
||||
return get_response(request)
|
||||
|
||||
if subdomain:
|
||||
@@ -196,24 +222,26 @@ def HostRoutingMiddleware(get_response):
|
||||
|
||||
return middleware
|
||||
|
||||
|
||||
class ReverseProxyAuthMiddleware(RemoteUserMiddleware):
|
||||
header = 'HTTP_{normalized}'.format(normalized=SERVER_CONFIG.REVERSE_PROXY_USER_HEADER.replace('-', '_').upper())
|
||||
header = "HTTP_{normalized}".format(normalized=SERVER_CONFIG.REVERSE_PROXY_USER_HEADER.replace("-", "_").upper())
|
||||
|
||||
def process_request(self, request):
|
||||
if SERVER_CONFIG.REVERSE_PROXY_WHITELIST == '':
|
||||
if SERVER_CONFIG.REVERSE_PROXY_WHITELIST == "":
|
||||
return
|
||||
|
||||
ip = request.META.get('REMOTE_ADDR')
|
||||
ip = request.META.get("REMOTE_ADDR")
|
||||
if not isinstance(ip, str):
|
||||
return
|
||||
|
||||
for cidr in SERVER_CONFIG.REVERSE_PROXY_WHITELIST.split(','):
|
||||
for cidr in SERVER_CONFIG.REVERSE_PROXY_WHITELIST.split(","):
|
||||
try:
|
||||
network = ipaddress.ip_network(cidr)
|
||||
except ValueError:
|
||||
raise ImproperlyConfigured(
|
||||
"The REVERSE_PROXY_WHITELIST config paramater is in invalid format, or "
|
||||
"contains invalid CIDR. Correct format is a coma-separated list of IPv4/IPv6 CIDRs.")
|
||||
"The REVERSE_PROXY_WHITELIST config parameter is in invalid format, or "
|
||||
"contains invalid CIDR. Correct format is a coma-separated list of IPv4/IPv6 CIDRs.",
|
||||
)
|
||||
|
||||
if ipaddress.ip_address(ip) in network:
|
||||
return super().process_request(request)
|
||||
|
||||
@@ -5,23 +5,21 @@ import uuid
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
initial = True
|
||||
|
||||
dependencies = [
|
||||
]
|
||||
dependencies = []
|
||||
|
||||
operations = [
|
||||
migrations.CreateModel(
|
||||
name='Snapshot',
|
||||
name="Snapshot",
|
||||
fields=[
|
||||
('id', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)),
|
||||
('url', models.URLField(unique=True)),
|
||||
('timestamp', models.CharField(default=None, max_length=32, null=True, unique=True)),
|
||||
('title', models.CharField(default=None, max_length=128, null=True)),
|
||||
('tags', models.CharField(default=None, max_length=256, null=True)),
|
||||
('added', models.DateTimeField(auto_now_add=True)),
|
||||
('updated', models.DateTimeField(default=None, null=True)),
|
||||
("id", models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)),
|
||||
("url", models.URLField(unique=True)),
|
||||
("timestamp", models.CharField(default=None, max_length=32, null=True, unique=True)),
|
||||
("title", models.CharField(default=None, max_length=128, null=True)),
|
||||
("tags", models.CharField(default=None, max_length=256, null=True)),
|
||||
("added", models.DateTimeField(auto_now_add=True)),
|
||||
("updated", models.DateTimeField(default=None, null=True)),
|
||||
],
|
||||
),
|
||||
]
|
||||
|
||||
@@ -4,15 +4,14 @@ from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0001_initial'),
|
||||
("core", "0001_initial"),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='timestamp',
|
||||
model_name="snapshot",
|
||||
name="timestamp",
|
||||
field=models.CharField(default=None, max_length=32, null=True),
|
||||
),
|
||||
]
|
||||
|
||||
@@ -4,35 +4,34 @@ from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0002_auto_20200625_1521'),
|
||||
("core", "0002_auto_20200625_1521"),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='added',
|
||||
model_name="snapshot",
|
||||
name="added",
|
||||
field=models.DateTimeField(auto_now_add=True, db_index=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='tags',
|
||||
model_name="snapshot",
|
||||
name="tags",
|
||||
field=models.CharField(db_index=True, default=None, max_length=256, null=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='timestamp',
|
||||
model_name="snapshot",
|
||||
name="timestamp",
|
||||
field=models.CharField(db_index=True, default=None, max_length=32, null=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='title',
|
||||
model_name="snapshot",
|
||||
name="title",
|
||||
field=models.CharField(db_index=True, default=None, max_length=128, null=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='updated',
|
||||
model_name="snapshot",
|
||||
name="updated",
|
||||
field=models.DateTimeField(db_index=True, default=None, null=True),
|
||||
),
|
||||
]
|
||||
|
||||
@@ -4,15 +4,14 @@ from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0003_auto_20200630_1034'),
|
||||
("core", "0003_auto_20200630_1034"),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='timestamp',
|
||||
model_name="snapshot",
|
||||
name="timestamp",
|
||||
field=models.CharField(db_index=True, default=None, max_length=32, unique=True),
|
||||
preserve_default=False,
|
||||
),
|
||||
|
||||
@@ -4,25 +4,24 @@ from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0004_auto_20200713_1552'),
|
||||
("core", "0004_auto_20200713_1552"),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='tags',
|
||||
model_name="snapshot",
|
||||
name="tags",
|
||||
field=models.CharField(blank=True, db_index=True, max_length=256, null=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='title',
|
||||
model_name="snapshot",
|
||||
name="title",
|
||||
field=models.CharField(blank=True, db_index=True, max_length=128, null=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='updated',
|
||||
model_name="snapshot",
|
||||
name="updated",
|
||||
field=models.DateTimeField(blank=True, db_index=True, null=True),
|
||||
),
|
||||
]
|
||||
|
||||
@@ -3,19 +3,18 @@
|
||||
from django.db import migrations, models
|
||||
from django.utils.text import slugify
|
||||
|
||||
|
||||
def forwards_func(apps, schema_editor):
|
||||
SnapshotModel = apps.get_model("core", "Snapshot")
|
||||
TagModel = apps.get_model("core", "Tag")
|
||||
|
||||
snapshots = SnapshotModel.objects.all()
|
||||
for snapshot in snapshots:
|
||||
tag_set = (
|
||||
set(tag.strip() for tag in (snapshot.tags_old or '').split(','))
|
||||
)
|
||||
tag_set = {tag.strip() for tag in (snapshot.tags_old or "").split(",")}
|
||||
tag_set.discard("")
|
||||
|
||||
for tag in tag_set:
|
||||
to_add, _ = TagModel.objects.get_or_create(name=tag, defaults={'slug': slugify(tag)})
|
||||
to_add, _ = TagModel.objects.get_or_create(name=tag, defaults={"slug": slugify(tag)})
|
||||
snapshot.tags.add(to_add)
|
||||
|
||||
|
||||
@@ -30,37 +29,36 @@ def reverse_func(apps, schema_editor):
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0005_auto_20200728_0326'),
|
||||
("core", "0005_auto_20200728_0326"),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.RenameField(
|
||||
model_name='snapshot',
|
||||
old_name='tags',
|
||||
new_name='tags_old',
|
||||
model_name="snapshot",
|
||||
old_name="tags",
|
||||
new_name="tags_old",
|
||||
),
|
||||
migrations.CreateModel(
|
||||
name='Tag',
|
||||
name="Tag",
|
||||
fields=[
|
||||
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
|
||||
('name', models.CharField(max_length=100, unique=True, verbose_name='name')),
|
||||
('slug', models.SlugField(max_length=100, unique=True, verbose_name='slug')),
|
||||
("id", models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")),
|
||||
("name", models.CharField(max_length=100, unique=True, verbose_name="name")),
|
||||
("slug", models.SlugField(max_length=100, unique=True, verbose_name="slug")),
|
||||
],
|
||||
options={
|
||||
'verbose_name': 'Tag',
|
||||
'verbose_name_plural': 'Tags',
|
||||
"verbose_name": "Tag",
|
||||
"verbose_name_plural": "Tags",
|
||||
},
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='tags',
|
||||
field=models.ManyToManyField(to='core.Tag'),
|
||||
model_name="snapshot",
|
||||
name="tags",
|
||||
field=models.ManyToManyField(to="core.Tag"),
|
||||
),
|
||||
migrations.RunPython(forwards_func, reverse_func),
|
||||
migrations.RemoveField(
|
||||
model_name='snapshot',
|
||||
name='tags_old',
|
||||
model_name="snapshot",
|
||||
name="tags_old",
|
||||
),
|
||||
]
|
||||
|
||||
@@ -9,13 +9,15 @@ import django.db.models.deletion
|
||||
# Handle old vs new import paths
|
||||
try:
|
||||
from archivebox.config import CONSTANTS
|
||||
|
||||
ARCHIVE_DIR = CONSTANTS.ARCHIVE_DIR
|
||||
except ImportError:
|
||||
try:
|
||||
from archivebox.config import CONFIG
|
||||
ARCHIVE_DIR = Path(CONFIG.get('ARCHIVE_DIR', './archive'))
|
||||
|
||||
ARCHIVE_DIR = Path(CONFIG.get("ARCHIVE_DIR", "./archive"))
|
||||
except ImportError:
|
||||
ARCHIVE_DIR = Path('./archive')
|
||||
ARCHIVE_DIR = Path("./archive")
|
||||
|
||||
try:
|
||||
from archivebox.misc.util import to_json
|
||||
@@ -29,6 +31,7 @@ try:
|
||||
JSONField = models.JSONField
|
||||
except AttributeError:
|
||||
import jsonfield
|
||||
|
||||
JSONField = jsonfield.JSONField
|
||||
|
||||
|
||||
@@ -41,7 +44,7 @@ def forwards_func(apps, schema_editor):
|
||||
out_dir = Path(ARCHIVE_DIR) / snapshot.timestamp
|
||||
|
||||
try:
|
||||
with open(out_dir / "index.json", "r") as f:
|
||||
with open(out_dir / "index.json") as f:
|
||||
fs_index = json.load(f)
|
||||
except Exception:
|
||||
continue
|
||||
@@ -56,37 +59,46 @@ def forwards_func(apps, schema_editor):
|
||||
snapshot=snapshot,
|
||||
pwd=result["pwd"],
|
||||
cmd=result.get("cmd") or [],
|
||||
cmd_version=result.get("cmd_version") or 'unknown',
|
||||
cmd_version=result.get("cmd_version") or "unknown",
|
||||
start_ts=result["start_ts"],
|
||||
end_ts=result["end_ts"],
|
||||
status=result["status"],
|
||||
output=result.get("output") or 'null',
|
||||
output=result.get("output") or "null",
|
||||
)
|
||||
except Exception as e:
|
||||
print(
|
||||
' ! Skipping import due to missing/invalid index.json:',
|
||||
" ! Skipping import due to missing/invalid index.json:",
|
||||
out_dir,
|
||||
e,
|
||||
'(open an issue with this index.json for help)',
|
||||
"(open an issue with this index.json for help)",
|
||||
)
|
||||
|
||||
|
||||
def verify_json_index_integrity(snapshot):
|
||||
results = snapshot.archiveresult_set.all()
|
||||
out_dir = Path(ARCHIVE_DIR) / snapshot.timestamp
|
||||
with open(out_dir / "index.json", "r") as f:
|
||||
with open(out_dir / "index.json") as f:
|
||||
index = json.load(f)
|
||||
|
||||
history = index["history"]
|
||||
index_results = [result for extractor in history for result in history[extractor]]
|
||||
flattened_results = [result["start_ts"] for result in index_results]
|
||||
|
||||
|
||||
missing_results = [result for result in results if result.start_ts.isoformat() not in flattened_results]
|
||||
|
||||
for missing in missing_results:
|
||||
index["history"][missing.extractor].append({"cmd": missing.cmd, "cmd_version": missing.cmd_version, "end_ts": missing.end_ts.isoformat(),
|
||||
"start_ts": missing.start_ts.isoformat(), "pwd": missing.pwd, "output": missing.output,
|
||||
"schema": "ArchiveResult", "status": missing.status})
|
||||
index["history"][missing.extractor].append(
|
||||
{
|
||||
"cmd": missing.cmd,
|
||||
"cmd_version": missing.cmd_version,
|
||||
"end_ts": missing.end_ts.isoformat(),
|
||||
"start_ts": missing.start_ts.isoformat(),
|
||||
"pwd": missing.pwd,
|
||||
"output": missing.output,
|
||||
"schema": "ArchiveResult",
|
||||
"status": missing.status,
|
||||
},
|
||||
)
|
||||
|
||||
json_index = to_json(index)
|
||||
with open(out_dir / "index.json", "w") as f:
|
||||
@@ -103,25 +115,47 @@ def reverse_func(apps, schema_editor):
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0006_auto_20201012_1520'),
|
||||
("core", "0006_auto_20201012_1520"),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.CreateModel(
|
||||
name='ArchiveResult',
|
||||
name="ArchiveResult",
|
||||
fields=[
|
||||
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
|
||||
('cmd', JSONField()),
|
||||
('pwd', models.CharField(max_length=256)),
|
||||
('cmd_version', models.CharField(max_length=32)),
|
||||
('status', models.CharField(choices=[('succeeded', 'succeeded'), ('failed', 'failed'), ('skipped', 'skipped')], max_length=16)),
|
||||
('output', models.CharField(max_length=512)),
|
||||
('start_ts', models.DateTimeField()),
|
||||
('end_ts', models.DateTimeField()),
|
||||
('extractor', models.CharField(choices=[('title', 'title'), ('favicon', 'favicon'), ('wget', 'wget'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('headers', 'headers'), ('archivedotorg', 'archivedotorg')], max_length=32)),
|
||||
('snapshot', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='core.Snapshot')),
|
||||
("id", models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")),
|
||||
("cmd", JSONField()),
|
||||
("pwd", models.CharField(max_length=256)),
|
||||
("cmd_version", models.CharField(max_length=32)),
|
||||
(
|
||||
"status",
|
||||
models.CharField(choices=[("succeeded", "succeeded"), ("failed", "failed"), ("skipped", "skipped")], max_length=16),
|
||||
),
|
||||
("output", models.CharField(max_length=512)),
|
||||
("start_ts", models.DateTimeField()),
|
||||
("end_ts", models.DateTimeField()),
|
||||
(
|
||||
"extractor",
|
||||
models.CharField(
|
||||
choices=[
|
||||
("title", "title"),
|
||||
("favicon", "favicon"),
|
||||
("wget", "wget"),
|
||||
("singlefile", "singlefile"),
|
||||
("pdf", "pdf"),
|
||||
("screenshot", "screenshot"),
|
||||
("dom", "dom"),
|
||||
("readability", "readability"),
|
||||
("mercury", "mercury"),
|
||||
("git", "git"),
|
||||
("media", "media"),
|
||||
("headers", "headers"),
|
||||
("archivedotorg", "archivedotorg"),
|
||||
],
|
||||
max_length=32,
|
||||
),
|
||||
),
|
||||
("snapshot", models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to="core.Snapshot")),
|
||||
],
|
||||
),
|
||||
migrations.RunPython(forwards_func, reverse_func),
|
||||
|
||||
@@ -4,15 +4,14 @@ from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0007_archiveresult'),
|
||||
("core", "0007_archiveresult"),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='cmd_version',
|
||||
model_name="archiveresult",
|
||||
name="cmd_version",
|
||||
field=models.CharField(blank=True, default=None, max_length=32, null=True),
|
||||
),
|
||||
]
|
||||
|
||||
@@ -4,15 +4,14 @@ from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0008_auto_20210105_1421'),
|
||||
("core", "0008_auto_20210105_1421"),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='updated',
|
||||
model_name="snapshot",
|
||||
name="updated",
|
||||
field=models.DateTimeField(auto_now=True, db_index=True, null=True),
|
||||
),
|
||||
]
|
||||
|
||||
@@ -4,15 +4,14 @@ from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0009_auto_20210216_1038'),
|
||||
("core", "0009_auto_20210216_1038"),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='start_ts',
|
||||
model_name="archiveresult",
|
||||
name="start_ts",
|
||||
field=models.DateTimeField(db_index=True),
|
||||
),
|
||||
]
|
||||
|
||||
@@ -5,20 +5,36 @@ import uuid
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0010_auto_20210216_1055'),
|
||||
("core", "0010_auto_20210216_1055"),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='uuid',
|
||||
model_name="archiveresult",
|
||||
name="uuid",
|
||||
field=models.UUIDField(default=uuid.uuid4, editable=False),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='extractor',
|
||||
field=models.CharField(choices=[('title', 'title'), ('favicon', 'favicon'), ('headers', 'headers'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('wget', 'wget'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('archivedotorg', 'archivedotorg')], max_length=32),
|
||||
model_name="archiveresult",
|
||||
name="extractor",
|
||||
field=models.CharField(
|
||||
choices=[
|
||||
("title", "title"),
|
||||
("favicon", "favicon"),
|
||||
("headers", "headers"),
|
||||
("singlefile", "singlefile"),
|
||||
("pdf", "pdf"),
|
||||
("screenshot", "screenshot"),
|
||||
("dom", "dom"),
|
||||
("wget", "wget"),
|
||||
("readability", "readability"),
|
||||
("mercury", "mercury"),
|
||||
("git", "git"),
|
||||
("media", "media"),
|
||||
("archivedotorg", "archivedotorg"),
|
||||
],
|
||||
max_length=32,
|
||||
),
|
||||
),
|
||||
]
|
||||
|
||||
@@ -4,20 +4,19 @@ from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0011_auto_20210216_1331'),
|
||||
("core", "0011_auto_20210216_1331"),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='cmd_version',
|
||||
model_name="archiveresult",
|
||||
name="cmd_version",
|
||||
field=models.CharField(blank=True, default=None, max_length=128, null=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='output',
|
||||
model_name="archiveresult",
|
||||
name="output",
|
||||
field=models.CharField(max_length=1024),
|
||||
),
|
||||
]
|
||||
|
||||
@@ -4,15 +4,14 @@ from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0012_auto_20210216_1425'),
|
||||
("core", "0012_auto_20210216_1425"),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='title',
|
||||
model_name="snapshot",
|
||||
name="title",
|
||||
field=models.CharField(blank=True, db_index=True, max_length=256, null=True),
|
||||
),
|
||||
]
|
||||
|
||||
@@ -4,15 +4,14 @@ from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0013_auto_20210218_0729'),
|
||||
("core", "0013_auto_20210218_0729"),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='title',
|
||||
model_name="snapshot",
|
||||
name="title",
|
||||
field=models.CharField(blank=True, db_index=True, max_length=1024, null=True),
|
||||
),
|
||||
]
|
||||
|
||||
@@ -4,15 +4,14 @@ from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0014_auto_20210218_0729'),
|
||||
("core", "0014_auto_20210218_0729"),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='title',
|
||||
model_name="snapshot",
|
||||
name="title",
|
||||
field=models.CharField(blank=True, db_index=True, max_length=512, null=True),
|
||||
),
|
||||
]
|
||||
|
||||
@@ -4,15 +4,14 @@ from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0015_auto_20210218_0730'),
|
||||
("core", "0015_auto_20210218_0730"),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='tags',
|
||||
field=models.ManyToManyField(blank=True, to='core.Tag'),
|
||||
model_name="snapshot",
|
||||
name="tags",
|
||||
field=models.ManyToManyField(blank=True, to="core.Tag"),
|
||||
),
|
||||
]
|
||||
|
||||
@@ -4,15 +4,14 @@ from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0016_auto_20210218_1204'),
|
||||
("core", "0016_auto_20210218_1204"),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='tag',
|
||||
name='slug',
|
||||
field=models.SlugField(blank=True, max_length=100, unique=True, verbose_name='slug'),
|
||||
model_name="tag",
|
||||
name="slug",
|
||||
field=models.SlugField(blank=True, max_length=100, unique=True, verbose_name="slug"),
|
||||
),
|
||||
]
|
||||
|
||||
@@ -4,20 +4,19 @@ from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0017_auto_20210219_0211'),
|
||||
("core", "0017_auto_20210219_0211"),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='tag',
|
||||
name='name',
|
||||
model_name="tag",
|
||||
name="name",
|
||||
field=models.CharField(max_length=100, unique=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='tag',
|
||||
name='slug',
|
||||
model_name="tag",
|
||||
name="slug",
|
||||
field=models.SlugField(blank=True, max_length=100, unique=True),
|
||||
),
|
||||
]
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user