From b749b26c5dda39c4167ab8b3333378a88febd536 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 23 Mar 2026 03:58:32 -0700 Subject: [PATCH] wip --- .github/ISSUE_TEMPLATE/2-feature_request.yml | 4 +- .github/ISSUE_TEMPLATE/config.yml | 2 +- .github/workflows/claude.yml | 1 - .github/workflows/lint.yml | 30 +- .github/workflows/pip.yml | 1 - .pre-commit-config.yaml | 67 + CLAUDE.md | 4 +- Dockerfile | 8 +- README.md | 8 +- archivebox/__init__.py | 46 +- archivebox/__main__.py | 7 +- archivebox/api/__init__.py | 2 +- archivebox/api/admin.py | 108 +- archivebox/api/apps.py | 7 +- archivebox/api/auth.py | 42 +- archivebox/api/middleware.py | 18 +- archivebox/api/migrations/0001_initial.py | 179 +- archivebox/api/models.py | 12 +- archivebox/api/urls.py | 12 +- archivebox/api/v1_api.py | 55 +- archivebox/api/v1_auth.py | 28 +- archivebox/api/v1_cli.py | 157 +- archivebox/api/v1_core.py | 424 ++-- archivebox/api/v1_crawls.py | 102 +- archivebox/api/v1_machine.py | 60 +- archivebox/base_models/__init__.py | 2 +- archivebox/base_models/admin.py | 50 +- archivebox/base_models/models.py | 36 +- archivebox/cli/__init__.py | 151 +- archivebox/cli/archivebox_add.py | 184 +- archivebox/cli/archivebox_archiveresult.py | 176 +- archivebox/cli/archivebox_binary.py | 142 +- archivebox/cli/archivebox_config.py | 98 +- archivebox/cli/archivebox_crawl.py | 179 +- archivebox/cli/archivebox_crawl_compat.py | 18 +- archivebox/cli/archivebox_extract.py | 131 +- archivebox/cli/archivebox_help.py | 120 +- archivebox/cli/archivebox_init.py | 140 +- archivebox/cli/archivebox_install.py | 49 +- archivebox/cli/archivebox_list.py | 69 +- archivebox/cli/archivebox_machine.py | 47 +- archivebox/cli/archivebox_manage.py | 19 +- archivebox/cli/archivebox_mcp.py | 6 +- archivebox/cli/archivebox_persona.py | 396 ++-- archivebox/cli/archivebox_pluginmap.py | 140 +- archivebox/cli/archivebox_process.py | 53 +- archivebox/cli/archivebox_remove.py | 52 +- archivebox/cli/archivebox_run.py | 70 +- archivebox/cli/archivebox_schedule.py | 143 +- archivebox/cli/archivebox_search.py | 192 +- archivebox/cli/archivebox_server.py | 190 +- archivebox/cli/archivebox_shell.py | 13 +- archivebox/cli/archivebox_snapshot.py | 283 ++- archivebox/cli/archivebox_snapshot_compat.py | 16 +- archivebox/cli/archivebox_status.py | 107 +- archivebox/cli/archivebox_tag.py | 98 +- archivebox/cli/archivebox_update.py | 355 ++- archivebox/cli/archivebox_version.py | 191 +- archivebox/cli/cli_utils.py | 12 +- archivebox/config/__init__.py | 46 +- archivebox/config/collection.py | 144 +- archivebox/config/common.py | 28 +- archivebox/config/configset.py | 77 +- archivebox/config/constants.py | 358 +-- archivebox/config/django.py | 64 +- archivebox/config/ldap.py | 10 +- archivebox/config/paths.py | 338 +-- archivebox/config/permissions.py | 79 +- archivebox/config/version.py | 48 +- archivebox/config/views.py | 569 ++--- archivebox/core/__init__.py | 16 +- archivebox/core/admin.py | 2 +- archivebox/core/admin_archiveresults.py | 517 +++-- archivebox/core/admin_site.py | 19 +- archivebox/core/admin_snapshots.py | 818 ++++--- archivebox/core/admin_tags.py | 209 +- archivebox/core/admin_users.py | 125 +- archivebox/core/apps.py | 24 +- archivebox/core/forms.py | 191 +- archivebox/core/host_utils.py | 34 +- .../core/management/commands/archivebox.py | 11 +- archivebox/core/middleware.py | 74 +- archivebox/core/migrations/0001_initial.py | 20 +- .../migrations/0002_auto_20200625_1521.py | 7 +- .../migrations/0003_auto_20200630_1034.py | 23 +- .../migrations/0004_auto_20200713_1552.py | 7 +- .../migrations/0005_auto_20200728_0326.py | 15 +- .../migrations/0006_auto_20201012_1520.py | 38 +- .../core/migrations/0007_archiveresult.py | 84 +- .../migrations/0008_auto_20210105_1421.py | 7 +- .../migrations/0009_auto_20210216_1038.py | 7 +- .../migrations/0010_auto_20210216_1055.py | 7 +- .../migrations/0011_auto_20210216_1331.py | 30 +- .../migrations/0012_auto_20210216_1425.py | 11 +- .../migrations/0013_auto_20210218_0729.py | 7 +- .../migrations/0014_auto_20210218_0729.py | 7 +- .../migrations/0015_auto_20210218_0730.py | 7 +- .../migrations/0016_auto_20210218_1204.py | 9 +- .../migrations/0017_auto_20210219_0211.py | 9 +- .../migrations/0018_auto_20210327_0952.py | 11 +- .../migrations/0019_auto_20210401_0654.py | 7 +- .../migrations/0020_auto_20210410_1031.py | 15 +- .../migrations/0021_auto_20220914_0934.py | 26 +- .../migrations/0022_auto_20231023_2008.py | 27 +- .../core/migrations/0023_upgrade_to_0_9_0.py | 149 +- .../migrations/0024_assign_default_crawl.py | 33 +- ...options_alter_snapshot_options_and_more.py | 277 ++- .../0026_add_process_to_archiveresult.py | 26 +- .../0027_copy_archiveresult_to_process.py | 153 +- .../0028_alter_snapshot_fs_version.py | 13 +- .../0029_migrate_archiveresult_to_uuid_pk.py | 64 +- .../migrations/0030_alter_archiveresult_id.py | 7 +- ...add_archiveresult_snapshot_status_index.py | 7 +- .../0032_remove_archiveresult_retry_at.py | 1 - .../0033_alter_archiveresult_status.py | 28 + archivebox/core/models.py | 1932 ++++++++++------- archivebox/core/settings.py | 5 +- archivebox/core/settings_logging.py | 28 +- archivebox/core/tag_utils.py | 179 +- archivebox/core/templatetags/core_tags.py | 455 ++-- archivebox/core/urls.py | 99 +- archivebox/core/views.py | 1355 ++++++------ archivebox/core/widgets.py | 89 +- archivebox/core/wsgi.py | 2 +- archivebox/crawls/__init__.py | 3 +- archivebox/crawls/admin.py | 537 +++-- archivebox/crawls/apps.py | 2 +- archivebox/crawls/migrations/0001_initial.py | 118 +- .../migrations/0002_upgrade_from_0_8_6.py | 11 +- ..._crawlschedule_num_uses_failed_and_more.py | 11 +- .../0004_remove_crawl_output_dir.py | 7 +- .../migrations/0005_add_crawl_limits.py | 31 + archivebox/crawls/models.py | 426 ++-- archivebox/crawls/schedule_utils.py | 2 +- archivebox/hooks.py | 350 +-- archivebox/ideas/process_plugin.py | 64 +- archivebox/ldap/apps.py | 6 +- archivebox/ldap/auth.py | 1 + archivebox/machine/__init__.py | 2 +- archivebox/machine/admin.py | 715 ++++-- archivebox/machine/apps.py | 13 +- archivebox/machine/detect.py | 166 +- archivebox/machine/env_utils.py | 51 + archivebox/machine/migrations/0001_initial.py | 184 +- .../migrations/0005_converge_binary_model.py | 17 +- archivebox/machine/migrations/0006_process.py | 106 +- .../0007_add_process_type_and_parent.py | 34 +- .../migrations/0008_add_worker_type_field.py | 15 +- .../migrations/0009_alter_binary_status.py | 14 +- .../0010_alter_process_process_type.py | 22 +- .../0011_remove_binary_output_dir.py | 13 +- archivebox/machine/models.py | 804 ++++--- archivebox/manage.py | 18 +- archivebox/mcp/__init__.py | 2 +- archivebox/mcp/apps.py | 8 +- archivebox/mcp/server.py | 124 +- archivebox/misc/__init__.py | 2 +- archivebox/misc/checks.py | 254 ++- archivebox/misc/db.py | 24 +- archivebox/misc/debugging.py | 11 +- archivebox/misc/folders.py | 13 +- archivebox/misc/hashing.py | 113 +- archivebox/misc/jsonl.py | 55 +- archivebox/misc/legacy.py | 44 +- archivebox/misc/logging.py | 63 +- archivebox/misc/logging_util.py | 612 +++--- archivebox/misc/monkey_patches.py | 32 +- archivebox/misc/paginators.py | 12 +- archivebox/misc/serve_static.py | 477 +++- archivebox/misc/shell_welcome_message.py | 74 +- archivebox/misc/system.py | 80 +- archivebox/misc/toml_util.py | 24 +- archivebox/misc/util.py | 442 ++-- archivebox/personas/admin.py | 73 +- archivebox/personas/forms.py | 14 +- archivebox/personas/importers.py | 52 +- .../personas/migrations/0001_initial.py | 20 +- .../migrations/0002_alter_persona_id.py | 7 +- archivebox/personas/models.py | 115 +- archivebox/personas/views.py | 1 - archivebox/search/__init__.py | 114 +- archivebox/search/admin.py | 51 +- archivebox/services/__init__.py | 2 + archivebox/services/archive_result_service.py | 160 +- archivebox/services/binary_service.py | 4 +- archivebox/services/live_ui.py | 52 +- .../services/process_request_service.py | 179 ++ archivebox/services/process_service.py | 25 +- archivebox/services/runner.py | 116 +- archivebox/services/snapshot_service.py | 28 +- archivebox/templates/admin/actions.html | 34 + archivebox/templates/admin/base.html | 322 +-- archivebox/templates/admin/change_list.html | 175 ++ .../templates/admin/change_list_results.html | 38 + .../admin/core/archiveresult/change_list.html | 142 ++ .../templates/admin/core/tag/change_list.html | 30 +- archivebox/templates/admin/private_index.html | 11 + .../templates/admin/private_index_grid.html | 11 + .../templates/admin/progress_monitor.html | 4 +- archivebox/templates/admin/search_form.html | 43 + .../templates/admin/snapshots_grid.html | 8 +- archivebox/templates/core/add.html | 67 +- archivebox/templates/core/public_index.html | 7 +- archivebox/templates/core/snapshot.html | 1307 ++++++++--- archivebox/templates/core/snapshot_live.html | 1325 ----------- archivebox/templates/static/add.css | 20 + archivebox/templates/static/admin.css | 208 +- archivebox/templates/static/bootstrap.min.css | 2 +- .../templates/static/directory_index.html | 396 ++++ archivebox/templates/static/select2.min.js | 2 +- archivebox/tests/conftest.py | 211 +- archivebox/tests/fixtures.py | 38 +- archivebox/tests/migrations_helpers.py | 710 +++--- archivebox/tests/test_add_view.py | 244 ++- archivebox/tests/test_admin_config_widget.py | 142 +- archivebox/tests/test_admin_links.py | 252 ++- archivebox/tests/test_admin_views.py | 1537 +++++++++++-- archivebox/tests/test_api_cli_schedule.py | 22 +- .../tests/test_archive_result_service.py | 297 ++- archivebox/tests/test_auth_ldap.py | 52 +- archivebox/tests/test_cli_add.py | 122 +- archivebox/tests/test_cli_archiveresult.py | 128 +- archivebox/tests/test_cli_config.py | 60 +- archivebox/tests/test_cli_crawl.py | 90 +- archivebox/tests/test_cli_extract.py | 8 +- archivebox/tests/test_cli_extract_input.py | 89 +- archivebox/tests/test_cli_help.py | 10 +- archivebox/tests/test_cli_init.py | 52 +- archivebox/tests/test_cli_install.py | 56 +- archivebox/tests/test_cli_list.py | 107 +- archivebox/tests/test_cli_manage.py | 12 +- archivebox/tests/test_cli_piping.py | 37 +- archivebox/tests/test_cli_remove.py | 50 +- archivebox/tests/test_cli_run.py | 104 +- archivebox/tests/test_cli_schedule.py | 15 +- archivebox/tests/test_cli_search.py | 60 +- archivebox/tests/test_cli_server.py | 66 +- archivebox/tests/test_cli_shell.py | 6 +- archivebox/tests/test_cli_snapshot.py | 131 +- archivebox/tests/test_cli_status.py | 72 +- archivebox/tests/test_cli_update.py | 22 +- archivebox/tests/test_cli_version.py | 30 +- archivebox/tests/test_config.py | 43 +- archivebox/tests/test_config_views.py | 361 +-- archivebox/tests/test_crawl.py | 67 +- archivebox/tests/test_crawl_admin.py | 166 +- archivebox/tests/test_hooks.py | 394 ++-- archivebox/tests/test_machine_models.py | 274 ++- archivebox/tests/test_migrations_04_to_09.py | 65 +- archivebox/tests/test_migrations_07_to_09.py | 114 +- archivebox/tests/test_migrations_08_to_09.py | 312 +-- archivebox/tests/test_migrations_fresh.py | 84 +- archivebox/tests/test_persona_runtime.py | 40 +- .../tests/test_process_runtime_paths.py | 27 +- archivebox/tests/test_recursive_crawl.py | 267 +-- archivebox/tests/test_runner.py | 567 +++-- archivebox/tests/test_savepagenow.py | 260 ++- archivebox/tests/test_schedule.py | 41 +- archivebox/tests/test_schedule_e2e.py | 150 +- .../tests/test_server_security_browser.py | 42 +- archivebox/tests/test_snapshot.py | 131 +- archivebox/tests/test_tag_admin.py | 119 +- archivebox/tests/test_title.py | 6 +- archivebox/tests/test_update.py | 178 +- archivebox/tests/test_urls.py | 403 +++- archivebox/tests/test_util.py | 1 + archivebox/uuid_compat.py | 6 +- archivebox/workers/__init__.py | 3 +- archivebox/workers/admin.py | 2 +- archivebox/workers/apps.py | 7 +- .../management/commands/runner_watch.py | 3 +- archivebox/workers/models.py | 237 +- archivebox/workers/supervisord_util.py | 147 +- archivebox/workers/tasks.py | 12 +- bin/build_git.sh | 1 - bin/docker_entrypoint.sh | 6 +- bin/setup.sh | 2 +- etc/ArchiveBox.conf.default | 2 +- etc/nginx.conf | 1 - old/Architecture.md | 2 +- old/TODO_hook_statemachine_cleanup.md | 1 - old/archivebox.ts | 1 - pyproject.toml | 9 +- uv.lock | 24 + website/CNAME | 2 +- website/shadcn-theme.css | 2 +- 286 files changed, 21704 insertions(+), 13480 deletions(-) create mode 100644 .pre-commit-config.yaml create mode 100644 archivebox/core/migrations/0033_alter_archiveresult_status.py create mode 100644 archivebox/crawls/migrations/0005_add_crawl_limits.py create mode 100644 archivebox/machine/env_utils.py create mode 100644 archivebox/services/process_request_service.py create mode 100644 archivebox/templates/admin/change_list.html create mode 100644 archivebox/templates/admin/change_list_results.html create mode 100644 archivebox/templates/admin/core/archiveresult/change_list.html create mode 100644 archivebox/templates/admin/search_form.html delete mode 100644 archivebox/templates/core/snapshot_live.html create mode 100644 archivebox/templates/static/directory_index.html diff --git a/.github/ISSUE_TEMPLATE/2-feature_request.yml b/.github/ISSUE_TEMPLATE/2-feature_request.yml index 71effaec..7a30d3b9 100644 --- a/.github/ISSUE_TEMPLATE/2-feature_request.yml +++ b/.github/ISSUE_TEMPLATE/2-feature_request.yml @@ -57,9 +57,9 @@ body: - type: textarea id: version attributes: - label: Share the entire output of the `archivebox version` command for the current verison you are using. + label: Share the entire output of the `archivebox version` command for the current version you are using. description: | - DO NOT JUST ENTER "the latest verion" OR YOUR ISSUE WILL BE CLOSED. + DO NOT JUST ENTER "the latest version" OR YOUR ISSUE WILL BE CLOSED. We need to know what version of ArchiveBox and what feature flags you're currently running with in order to contextualize your feature request. Sometimes we've already fixed the issues in newer BETA versions, sometimes features already exist but may not be available in your specific environment. diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index 4cc6265f..110053cc 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -8,4 +8,4 @@ contact_links: about: "Join us on our Zulip forum to chat with the developers and other users (it's similar to Discord but self-hosted)." - name: 💁‍♂️ Hire us for professional support with fast response times url: https://docs.monadical.com/s/archivebox-consulting-services - about: "We provide hosting, develoment, and support, including on-prem/cloud w/ SSO & storage, CAPTCHA-solving, proxies, etc." + about: "We provide hosting, development, and support, including on-prem/cloud w/ SSO & storage, CAPTCHA-solving, proxies, etc." diff --git a/.github/workflows/claude.yml b/.github/workflows/claude.yml index a9e72708..2feee2e3 100644 --- a/.github/workflows/claude.yml +++ b/.github/workflows/claude.yml @@ -47,4 +47,3 @@ jobs: # See https://github.com/anthropics/claude-code-action/blob/main/docs/usage.md # or https://code.claude.com/docs/en/cli-reference for available options claude_args: '--allowed-tools Bash(gh pr:*)' - diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 5a402b25..272b8869 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -4,32 +4,28 @@ on: workflow_dispatch: push: -env: - MAX_LINE_LENGTH: 110 - jobs: lint: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 with: submodules: true fetch-depth: 1 - name: Set up Python - uses: actions/setup-python@v1 + uses: actions/setup-python@v5 with: - python-version: 3.9 + python-version: "3.13" architecture: x64 - - name: Install flake8 - run: | - pip install flake8 + - name: Install uv + uses: astral-sh/setup-uv@v6 + with: + enable-cache: true - - name: Lint with flake8 - run: | - cd archivebox - # one pass for show-stopper syntax errors or undefined names - flake8 . --count --show-source --statistics - # one pass for small stylistic things - flake8 . --count --max-line-length="$MAX_LINE_LENGTH" --statistics + - name: Install dependencies with uv + run: uv sync --all-extras --all-groups --no-sources --no-cache + + - name: Run prek + run: uv run prek run --all-files diff --git a/.github/workflows/pip.yml b/.github/workflows/pip.yml index f4e75503..709c95c3 100755 --- a/.github/workflows/pip.yml +++ b/.github/workflows/pip.yml @@ -63,4 +63,3 @@ jobs: # && uv run archivebox add 'https://example.com' \ # && uv run archivebox status \ # || (echo "UV Failed to run archivebox!" && exit 1) - diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..62a3d0b4 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,67 @@ +default_language_version: + python: python3.13 + +repos: + - repo: https://github.com/asottile/yesqa + rev: v1.5.0 + hooks: + - id: yesqa + + - repo: https://github.com/codespell-project/codespell + rev: v2.4.1 + hooks: + - id: codespell + additional_dependencies: + - tomli + + - repo: https://github.com/asottile/pyupgrade + rev: v3.20.0 + hooks: + - id: pyupgrade + args: [--py313-plus] + + - repo: https://github.com/asottile/add-trailing-comma + rev: v3.1.0 + hooks: + - id: add-trailing-comma + + - repo: local + hooks: + - id: ruff-format + name: ruff-format + entry: uv run --active ruff format + language: system + types_or: [python, pyi] + - id: ruff-check + name: ruff-check + entry: uv run --active ruff check --fix + language: system + types_or: [python, pyi] + + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v5.0.0 + hooks: + - id: check-ast + - id: check-toml + - id: check-yaml + exclude: ^\.github/workflows/homebrew\.yml$ + - id: check-json + - id: check-merge-conflict + - id: check-symlinks + - id: destroyed-symlinks + - id: check-case-conflict + - id: check-illegal-windows-names + - id: check-shebang-scripts-are-executable + exclude: ^(archivebox/.*\.py|archivebox/tests/.*\.py|archivebox/personas/export_browser_state\.js)$ + - id: mixed-line-ending + - id: fix-byte-order-marker + - id: end-of-file-fixer + - id: detect-private-key + - id: debug-statements + - id: forbid-submodules + exclude: ^docs$ + - id: check-added-large-files + args: ["--maxkb=600"] + - id: name-tests-test + args: ["--pytest-test-first"] + exclude: ^archivebox/tests/(data/|fixtures\.py$|migrations_helpers\.py$) diff --git a/CLAUDE.md b/CLAUDE.md index 5adf1178..f923e3c1 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -196,9 +196,9 @@ with tempfile.TemporaryDirectory() as tmpdir: # Run hook in its output directory result = subprocess.run( - ['node', str(SCREENSHOT_HOOK), '--url=https://example.com', '--snapshot-id=snap-456'], + ['node', str(SCREENSHOT_HOOK), '--url=https://example.com'], cwd=str(screenshot_dir), - env=get_test_env(), + env={**get_test_env(), 'EXTRA_CONTEXT': '{"snapshot_id":"snap-456"}'}, capture_output=True, timeout=120 ) diff --git a/Dockerfile b/Dockerfile index 5e670837..e483bcb8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -56,7 +56,7 @@ ARG TARGETARCH ARG TARGETVARIANT ######### Environment Variables ################################# -# Global built-time and runtime environment constants + default pkg manager config +# Global build-time and runtime environment constants + default pkg manager config ENV TZ=UTC \ LANGUAGE=en_US:en \ LC_ALL=C.UTF-8 \ @@ -121,7 +121,7 @@ RUN echo "[*] Setting up $ARCHIVEBOX_USER user uid=${DEFAULT_PUID}..." \ && groupmod -g "$DEFAULT_PGID" "$ARCHIVEBOX_USER" \ && echo -e "\nARCHIVEBOX_USER=$ARCHIVEBOX_USER PUID=$(id -u $ARCHIVEBOX_USER) PGID=$(id -g $ARCHIVEBOX_USER)\n\n" \ | tee -a /VERSION.txt - # DEFAULT_PUID and DEFAULT_PID are overriden by PUID and PGID in /bin/docker_entrypoint.sh at runtime + # DEFAULT_PUID and DEFAULT_PID are overridden by PUID and PGID in /bin/docker_entrypoint.sh at runtime # https://docs.linuxserver.io/general/understanding-puid-and-pgid # Install system apt dependencies (adding backports to access more recent apt updates) @@ -139,7 +139,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T # nano iputils-ping dnsutils htop procps jq yq && rm -rf /var/lib/apt/lists/* -# Install apt binary dependencies for exractors +# Install apt binary dependencies for extractors # COPY --from=selenium/ffmpeg:latest /usr/local/bin/ffmpeg /usr/local/bin/ffmpeg RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT \ echo "[+] APT Installing extractor dependencies for $TARGETPLATFORM..." \ @@ -373,7 +373,7 @@ RUN openssl rand -hex 16 > /etc/machine-id \ && echo -e "\nTMP_DIR=$TMP_DIR\nLIB_DIR=$LIB_DIR\nMACHINE_ID=$(cat /etc/machine-id)\n" | tee -a /VERSION.txt # Print version for nice docker finish summary -RUN (echo -e "\n\n[√] Finished Docker build succesfully. Saving build summary in: /VERSION.txt" \ +RUN (echo -e "\n\n[√] Finished Docker build successfully. Saving build summary in: /VERSION.txt" \ && echo -e "PLATFORM=${TARGETPLATFORM} ARCH=$(uname -m) ($(uname -s) ${TARGETARCH} ${TARGETVARIANT})\n" \ && echo -e "BUILD_END_TIME=$(date +"%Y-%m-%d %H:%M:%S %s")\n\n" \ ) | tee -a /VERSION.txt diff --git a/README.md b/README.md index ea9e84e1..d6c9f63e 100644 --- a/README.md +++ b/README.md @@ -153,7 +153,7 @@ ArchiveBox is free for everyone to self-host, but we also provide support, secur - **Individuals:** `saving bookmarks`, `preserving portfolio content`, `legacy / memoirs archival` - **Governments:** - `snapshoting public service sites`, `recordkeeping compliance` + `snapshotting public service sites`, `recordkeeping compliance` > ***[Contact us](https://zulip.archivebox.io/#narrow/stream/167-enterprise/topic/welcome/near/1191102)** if your org wants help using ArchiveBox professionally.* > We offer: setup & support, CAPTCHA/ratelimit unblocking, SSO, audit logging/chain-of-custody, and more @@ -423,7 +423,7 @@ See below for usage examples using the CLI, W Served by ArchiveBox v{VERSION} ({COMMIT_HASH[:8]}), API powered by django-ninja. -''' +""" def register_urls(api: NinjaAPI) -> NinjaAPI: - api.add_router('/auth/', 'archivebox.api.v1_auth.router') - api.add_router('/core/', 'archivebox.api.v1_core.router') - api.add_router('/crawls/', 'archivebox.api.v1_crawls.router') - api.add_router('/cli/', 'archivebox.api.v1_cli.router') - api.add_router('/machine/', 'archivebox.api.v1_machine.router') + api.add_router("/auth/", "archivebox.api.v1_auth.router") + api.add_router("/core/", "archivebox.api.v1_core.router") + api.add_router("/crawls/", "archivebox.api.v1_crawls.router") + api.add_router("/cli/", "archivebox.api.v1_cli.router") + api.add_router("/machine/", "archivebox.api.v1_machine.router") return api -class NinjaAPIWithIOCapture(NinjaAPI): +class NinjaAPIWithIOCapture(NinjaAPI): def create_temporal_response(self, request: HttpRequest) -> HttpResponse: stdout, stderr = StringIO(), StringIO() with redirect_stderr(stderr): with redirect_stdout(stdout): - setattr(request, 'stdout', stdout) - setattr(request, 'stderr', stderr) + setattr(request, "stdout", stdout) + setattr(request, "stderr", stderr) response = super().create_temporal_response(request) - # Diable caching of API responses entirely - response['Cache-Control'] = 'no-store' + # Disable caching of API responses entirely + response["Cache-Control"] = "no-store" # Add debug stdout and stderr headers to response - response['X-ArchiveBox-Stdout'] = stdout.getvalue().replace('\n', '\\n')[:200] - response['X-ArchiveBox-Stderr'] = stderr.getvalue().replace('\n', '\\n')[:200] + response["X-ArchiveBox-Stdout"] = stdout.getvalue().replace("\n", "\\n")[:200] + response["X-ArchiveBox-Stderr"] = stderr.getvalue().replace("\n", "\\n")[:200] # response['X-ArchiveBox-View'] = self.get_openapi_operation_id(request) or 'Unknown' # Add Auth Headers to response - api_token_attr = getattr(request, '_api_token', None) + api_token_attr = getattr(request, "_api_token", None) api_token = api_token_attr if isinstance(api_token_attr, APIToken) else None - token_expiry = api_token.expires.isoformat() if api_token and api_token.expires else 'Never' + token_expiry = api_token.expires.isoformat() if api_token and api_token.expires else "Never" - response['X-ArchiveBox-Auth-Method'] = str(getattr(request, '_api_auth_method', 'None')) - response['X-ArchiveBox-Auth-Expires'] = token_expiry - response['X-ArchiveBox-Auth-Token-Id'] = str(api_token.id) if api_token else 'None' - response['X-ArchiveBox-Auth-User-Id'] = str(request.user.pk) if getattr(request.user, 'pk', None) else 'None' - response['X-ArchiveBox-Auth-User-Username'] = request.user.username if isinstance(request.user, User) else 'None' + response["X-ArchiveBox-Auth-Method"] = str(getattr(request, "_api_auth_method", "None")) + response["X-ArchiveBox-Auth-Expires"] = token_expiry + response["X-ArchiveBox-Auth-Token-Id"] = str(api_token.id) if api_token else "None" + response["X-ArchiveBox-Auth-User-Id"] = str(request.user.pk) if getattr(request.user, "pk", None) else "None" + response["X-ArchiveBox-Auth-User-Username"] = request.user.username if isinstance(request.user, User) else "None" # import ipdb; ipdb.set_trace() # print('RESPONDING NOW', response) @@ -84,7 +84,7 @@ class NinjaAPIWithIOCapture(NinjaAPI): api = NinjaAPIWithIOCapture( - title='ArchiveBox API', + title="ArchiveBox API", description=html_description, version=VERSION, auth=API_AUTH_METHODS, @@ -103,15 +103,15 @@ def generic_exception_handler(request, err): if isinstance(err, (ObjectDoesNotExist, EmptyResultSet, PermissionDenied)): status = 404 - print(''.join(format_exception(err))) + print("".join(format_exception(err))) return api.create_response( request, { "succeeded": False, - "message": f'{err.__class__.__name__}: {err}', + "message": f"{err.__class__.__name__}: {err}", "errors": [ - ''.join(format_exception(err)), + "".join(format_exception(err)), # or send simpler parent-only traceback: # *([str(err.__context__)] if getattr(err, '__context__', None) else []), ], @@ -120,7 +120,6 @@ def generic_exception_handler(request, err): ) - # import orjson # from ninja.renderers import BaseRenderer # class ORJSONRenderer(BaseRenderer): diff --git a/archivebox/api/v1_auth.py b/archivebox/api/v1_auth.py index e5c829fb..e8c61e17 100644 --- a/archivebox/api/v1_auth.py +++ b/archivebox/api/v1_auth.py @@ -1,6 +1,5 @@ -__package__ = 'archivebox.api' +__package__ = "archivebox.api" -from typing import Optional from django.http import HttpRequest from ninja import Router, Schema @@ -8,16 +7,21 @@ from ninja import Router, Schema from archivebox.api.auth import auth_using_token, auth_using_password, get_or_create_api_token -router = Router(tags=['Authentication'], auth=None) +router = Router(tags=["Authentication"], auth=None) class PasswordAuthSchema(Schema): """Schema for a /get_api_token request""" - username: Optional[str] = None - password: Optional[str] = None + + username: str | None = None + password: str | None = None -@router.post("/get_api_token", auth=None, summary='Generate an API token for a given username & password (or currently logged-in user)') # auth=None because they are not authed yet +@router.post( + "/get_api_token", + auth=None, + summary="Generate an API token for a given username & password (or currently logged-in user)", +) # auth=None because they are not authed yet def get_api_token(request: HttpRequest, auth_data: PasswordAuthSchema): user = auth_using_password( username=auth_data.username, @@ -35,17 +39,21 @@ def get_api_token(request: HttpRequest, auth_data: PasswordAuthSchema): "token": api_token.token, "expires": api_token.expires.isoformat() if api_token.expires else None, } - - return {"success": False, "errors": ["Invalid credentials"]} + return {"success": False, "errors": ["Invalid credentials"]} class TokenAuthSchema(Schema): """Schema for a /check_api_token request""" + token: str -@router.post("/check_api_token", auth=None, summary='Validate an API token to make sure its valid and non-expired') # auth=None because they are not authed yet +@router.post( + "/check_api_token", + auth=None, + summary="Validate an API token to make sure its valid and non-expired", +) # auth=None because they are not authed yet def check_api_token(request: HttpRequest, token_data: TokenAuthSchema): user = auth_using_token( token=token_data.token, @@ -53,5 +61,5 @@ def check_api_token(request: HttpRequest, token_data: TokenAuthSchema): ) if user: return {"success": True, "user_id": str(user.pk)} - + return {"success": False, "user_id": None} diff --git a/archivebox/api/v1_cli.py b/archivebox/api/v1_cli.py index 1cae7231..2c317ad4 100644 --- a/archivebox/api/v1_cli.py +++ b/archivebox/api/v1_cli.py @@ -1,8 +1,8 @@ -__package__ = 'archivebox.api' +__package__ = "archivebox.api" import json from io import StringIO -from typing import List, Dict, Any, Optional +from typing import Any from enum import Enum from django.http import HttpRequest @@ -16,44 +16,47 @@ from archivebox.config.common import ARCHIVING_CONFIG # from .auth import API_AUTH_METHODS # router for API that exposes archivebox cli subcommands as REST endpoints -router = Router(tags=['ArchiveBox CLI Sub-Commands']) +router = Router(tags=["ArchiveBox CLI Sub-Commands"]) # Schemas -JSONType = List[Any] | Dict[str, Any] | bool | int | str | None +JSONType = list[Any] | dict[str, Any] | bool | int | str | None + class CLICommandResponseSchema(Schema): success: bool - errors: List[str] + errors: list[str] result: JSONType - result_format: str = 'str' + result_format: str = "str" stdout: str stderr: str + class FilterTypeChoices(str, Enum): - exact = 'exact' - substring = 'substring' - regex = 'regex' - domain = 'domain' - tag = 'tag' - timestamp = 'timestamp' + exact = "exact" + substring = "substring" + regex = "regex" + domain = "domain" + tag = "tag" + timestamp = "timestamp" + class StatusChoices(str, Enum): - indexed = 'indexed' - archived = 'archived' - unarchived = 'unarchived' - present = 'present' - valid = 'valid' - invalid = 'invalid' - duplicate = 'duplicate' - orphaned = 'orphaned' - corrupted = 'corrupted' - unrecognized = 'unrecognized' + indexed = "indexed" + archived = "archived" + unarchived = "unarchived" + present = "present" + valid = "valid" + invalid = "invalid" + duplicate = "duplicate" + orphaned = "orphaned" + corrupted = "corrupted" + unrecognized = "unrecognized" class AddCommandSchema(Schema): - urls: List[str] + urls: list[str] tag: str = "" depth: int = 0 parser: str = "auto" @@ -62,53 +65,54 @@ class AddCommandSchema(Schema): overwrite: bool = False index_only: bool = False + class UpdateCommandSchema(Schema): - resume: Optional[str] = None - after: Optional[float] = 0 - before: Optional[float] = 999999999999999 - filter_type: Optional[str] = FilterTypeChoices.substring - filter_patterns: Optional[List[str]] = ['https://example.com'] + resume: str | None = None + after: float | None = 0 + before: float | None = 999999999999999 + filter_type: str | None = FilterTypeChoices.substring + filter_patterns: list[str] | None = ["https://example.com"] batch_size: int = 100 continuous: bool = False + class ScheduleCommandSchema(Schema): - import_path: Optional[str] = None + import_path: str | None = None add: bool = False show: bool = False foreground: bool = False run_all: bool = False quiet: bool = False - every: Optional[str] = None - tag: str = '' + every: str | None = None + tag: str = "" depth: int = 0 overwrite: bool = False update: bool = not ARCHIVING_CONFIG.ONLY_NEW clear: bool = False + class ListCommandSchema(Schema): - filter_patterns: Optional[List[str]] = ['https://example.com'] + filter_patterns: list[str] | None = ["https://example.com"] filter_type: str = FilterTypeChoices.substring status: StatusChoices = StatusChoices.indexed - after: Optional[float] = 0 - before: Optional[float] = 999999999999999 - sort: str = 'bookmarked_at' + after: float | None = 0 + before: float | None = 999999999999999 + sort: str = "bookmarked_at" as_json: bool = True as_html: bool = False - as_csv: str | None = 'timestamp,url' + as_csv: str | None = "timestamp,url" with_headers: bool = False + class RemoveCommandSchema(Schema): delete: bool = True - after: Optional[float] = 0 - before: Optional[float] = 999999999999999 + after: float | None = 0 + before: float | None = 999999999999999 filter_type: str = FilterTypeChoices.exact - filter_patterns: Optional[List[str]] = ['https://example.com'] + filter_patterns: list[str] | None = ["https://example.com"] - - - -@router.post("/add", response=CLICommandResponseSchema, summary='archivebox add [args] [urls]') +@router.post("/add", response=CLICommandResponseSchema, summary="archivebox add [args] [urls]") def cli_add(request: HttpRequest, args: AddCommandSchema): from archivebox.cli.archivebox_add import add @@ -125,30 +129,30 @@ def cli_add(request: HttpRequest, args: AddCommandSchema): created_by_id=request.user.pk, ) - snapshot_ids = [str(snapshot_id) for snapshot_id in snapshots.values_list('id', flat=True)] + snapshot_ids = [str(snapshot_id) for snapshot_id in snapshots.values_list("id", flat=True)] result_payload = { "crawl_id": str(crawl.id), "num_snapshots": len(snapshot_ids), "snapshot_ids": snapshot_ids, "queued_urls": args.urls, } - stdout = getattr(request, 'stdout', None) - stderr = getattr(request, 'stderr', None) + stdout = getattr(request, "stdout", None) + stderr = getattr(request, "stderr", None) return { "success": True, "errors": [], "result": result_payload, "result_format": "json", - "stdout": ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else '', - "stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else '', + "stdout": ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else "", + "stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else "", } -@router.post("/update", response=CLICommandResponseSchema, summary='archivebox update [args] [filter_patterns]') +@router.post("/update", response=CLICommandResponseSchema, summary="archivebox update [args] [filter_patterns]") def cli_update(request: HttpRequest, args: UpdateCommandSchema): from archivebox.cli.archivebox_update import update - + result = update( filter_patterns=args.filter_patterns or [], filter_type=args.filter_type or FilterTypeChoices.substring, @@ -158,21 +162,21 @@ def cli_update(request: HttpRequest, args: UpdateCommandSchema): batch_size=args.batch_size, continuous=args.continuous, ) - stdout = getattr(request, 'stdout', None) - stderr = getattr(request, 'stderr', None) + stdout = getattr(request, "stdout", None) + stderr = getattr(request, "stderr", None) return { "success": True, "errors": [], "result": result, - "stdout": ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else '', - "stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else '', + "stdout": ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else "", + "stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else "", } -@router.post("/schedule", response=CLICommandResponseSchema, summary='archivebox schedule [args] [import_path]') +@router.post("/schedule", response=CLICommandResponseSchema, summary="archivebox schedule [args] [import_path]") def cli_schedule(request: HttpRequest, args: ScheduleCommandSchema): from archivebox.cli.archivebox_schedule import schedule - + result = schedule( import_path=args.import_path, add=args.add, @@ -188,23 +192,22 @@ def cli_schedule(request: HttpRequest, args: ScheduleCommandSchema): update=args.update, ) - stdout = getattr(request, 'stdout', None) - stderr = getattr(request, 'stderr', None) + stdout = getattr(request, "stdout", None) + stderr = getattr(request, "stderr", None) return { "success": True, "errors": [], "result": result, "result_format": "json", - "stdout": ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else '', - "stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else '', + "stdout": ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else "", + "stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else "", } - -@router.post("/search", response=CLICommandResponseSchema, summary='archivebox search [args] [filter_patterns]') +@router.post("/search", response=CLICommandResponseSchema, summary="archivebox search [args] [filter_patterns]") def cli_search(request: HttpRequest, args: ListCommandSchema): from archivebox.cli.archivebox_search import search - + result = search( filter_patterns=args.filter_patterns, filter_type=args.filter_type, @@ -218,7 +221,7 @@ def cli_search(request: HttpRequest, args: ListCommandSchema): with_headers=args.with_headers, ) - result_format = 'txt' + result_format = "txt" if args.as_json: result_format = "json" result = json.loads(result) @@ -227,20 +230,19 @@ def cli_search(request: HttpRequest, args: ListCommandSchema): elif args.as_csv: result_format = "csv" - stdout = getattr(request, 'stdout', None) - stderr = getattr(request, 'stderr', None) + stdout = getattr(request, "stdout", None) + stderr = getattr(request, "stderr", None) return { "success": True, "errors": [], "result": result, "result_format": result_format, - "stdout": ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else '', - "stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else '', + "stdout": ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else "", + "stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else "", } - -@router.post("/remove", response=CLICommandResponseSchema, summary='archivebox remove [args] [filter_patterns]') +@router.post("/remove", response=CLICommandResponseSchema, summary="archivebox remove [args] [filter_patterns]") def cli_remove(request: HttpRequest, args: RemoveCommandSchema): from archivebox.cli.archivebox_remove import remove from archivebox.cli.archivebox_search import get_snapshots @@ -253,10 +255,10 @@ def cli_remove(request: HttpRequest, args: RemoveCommandSchema): after=args.after, before=args.before, ) - removed_snapshot_ids = [str(snapshot_id) for snapshot_id in snapshots_to_remove.values_list('id', flat=True)] - + removed_snapshot_ids = [str(snapshot_id) for snapshot_id in snapshots_to_remove.values_list("id", flat=True)] + remove( - yes=True, # no way to interactively ask for confirmation via API, so we force yes + yes=True, # no way to interactively ask for confirmation via API, so we force yes delete=args.delete, snapshots=snapshots_to_remove, before=args.before, @@ -270,14 +272,13 @@ def cli_remove(request: HttpRequest, args: RemoveCommandSchema): "removed_snapshot_ids": removed_snapshot_ids, "remaining_snapshots": Snapshot.objects.count(), } - stdout = getattr(request, 'stdout', None) - stderr = getattr(request, 'stderr', None) + stdout = getattr(request, "stdout", None) + stderr = getattr(request, "stderr", None) return { "success": True, "errors": [], "result": result, "result_format": "json", - "stdout": ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else '', - "stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else '', + "stdout": ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else "", + "stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else "", } - diff --git a/archivebox/api/v1_core.py b/archivebox/api/v1_core.py index 51dab0e9..8f4b4ae0 100644 --- a/archivebox/api/v1_core.py +++ b/archivebox/api/v1_core.py @@ -1,11 +1,13 @@ -__package__ = 'archivebox.api' +__package__ = "archivebox.api" import math +from collections import defaultdict from uuid import UUID -from typing import List, Optional, Union, Any, Annotated +from typing import Union, Any, Annotated from datetime import datetime -from django.db.models import Model, Q +from django.db.models import Model, Q, Sum +from django.db.models.functions import Coalesce from django.conf import settings from django.http import HttpRequest, HttpResponse from django.core.exceptions import ValidationError @@ -39,7 +41,7 @@ from archivebox.crawls.models import Crawl from archivebox.api.v1_crawls import CrawlSchema -router = Router(tags=['Core Models']) +router = Router(tags=["Core Models"]) class CustomPagination(PaginationBase): @@ -49,13 +51,14 @@ class CustomPagination(PaginationBase): page: int = 0 class Output(PaginationBase.Output): + count: int total_items: int total_pages: int page: int limit: int offset: int num_items: int - items: List[Any] + items: list[Any] def paginate_queryset(self, queryset, pagination: Input, request: HttpRequest, **params): limit = min(pagination.limit, 500) @@ -65,27 +68,29 @@ class CustomPagination(PaginationBase): current_page = math.ceil(offset / (limit + 1)) items = queryset[offset : offset + limit] return { - 'total_items': total, - 'total_pages': total_pages, - 'page': current_page, - 'limit': limit, - 'offset': offset, - 'num_items': len(items), - 'items': items, + "count": total, + "total_items": total, + "total_pages": total_pages, + "page": current_page, + "limit": limit, + "offset": offset, + "num_items": len(items), + "items": items, } ### ArchiveResult ######################################################################### + class MinimalArchiveResultSchema(Schema): - TYPE: str = 'core.models.ArchiveResult' + TYPE: str = "core.models.ArchiveResult" id: UUID created_at: datetime | None modified_at: datetime | None created_by_id: str created_by_username: str status: str - retry_at: datetime | None + retry_at: datetime | None = None plugin: str hook_name: str process_id: UUID | None @@ -93,8 +98,8 @@ class MinimalArchiveResultSchema(Schema): cmd: list[str] | None pwd: str | None output_str: str - output_json: dict | None - output_files: dict | None + output_json: dict[str, Any] | None + output_files: dict[str, dict[str, Any]] | None output_size: int output_mimetypes: str start_ts: datetime | None @@ -108,13 +113,34 @@ class MinimalArchiveResultSchema(Schema): def resolve_created_by_username(obj) -> str: return obj.created_by.username + @staticmethod + def resolve_output_files(obj): + return obj.output_file_map() + + @staticmethod + def resolve_output_mimetypes(obj) -> str: + mime_sizes: dict[str, int] = defaultdict(int) + for metadata in obj.output_file_map().values(): + if not isinstance(metadata, dict): + continue + mimetype = str(metadata.get("mimetype") or "").strip() + try: + size = max(int(metadata.get("size") or 0), 0) + except (TypeError, ValueError): + size = 0 + if mimetype and size: + mime_sizes[mimetype] += size + if mime_sizes: + return ",".join(mime for mime, _size in sorted(mime_sizes.items(), key=lambda item: item[1], reverse=True)) + return obj.output_mimetypes or "" + class ArchiveResultSchema(MinimalArchiveResultSchema): - TYPE: str = 'core.models.ArchiveResult' + TYPE: str = "core.models.ArchiveResult" snapshot_id: UUID snapshot_timestamp: str snapshot_url: str - snapshot_tags: List[str] + snapshot_tags: list[str] @staticmethod def resolve_snapshot_timestamp(obj): @@ -134,25 +160,39 @@ class ArchiveResultSchema(MinimalArchiveResultSchema): class ArchiveResultFilterSchema(FilterSchema): - id: Annotated[Optional[str], FilterLookup(['id__startswith', 'snapshot__id__startswith', 'snapshot__timestamp__startswith'])] = None - search: Annotated[Optional[str], FilterLookup(['snapshot__url__icontains', 'snapshot__title__icontains', 'snapshot__tags__name__icontains', 'plugin', 'output_str__icontains', 'id__startswith', 'snapshot__id__startswith', 'snapshot__timestamp__startswith'])] = None - snapshot_id: Annotated[Optional[str], FilterLookup(['snapshot__id__startswith', 'snapshot__timestamp__startswith'])] = None - snapshot_url: Annotated[Optional[str], FilterLookup('snapshot__url__icontains')] = None - snapshot_tag: Annotated[Optional[str], FilterLookup('snapshot__tags__name__icontains')] = None - status: Annotated[Optional[str], FilterLookup('status')] = None - output_str: Annotated[Optional[str], FilterLookup('output_str__icontains')] = None - plugin: Annotated[Optional[str], FilterLookup('plugin__icontains')] = None - hook_name: Annotated[Optional[str], FilterLookup('hook_name__icontains')] = None - process_id: Annotated[Optional[str], FilterLookup('process__id__startswith')] = None - cmd: Annotated[Optional[str], FilterLookup('cmd__0__icontains')] = None - pwd: Annotated[Optional[str], FilterLookup('pwd__icontains')] = None - cmd_version: Annotated[Optional[str], FilterLookup('cmd_version')] = None - created_at: Annotated[Optional[datetime], FilterLookup('created_at')] = None - created_at__gte: Annotated[Optional[datetime], FilterLookup('created_at__gte')] = None - created_at__lt: Annotated[Optional[datetime], FilterLookup('created_at__lt')] = None + id: Annotated[str | None, FilterLookup(["id__startswith", "snapshot__id__startswith", "snapshot__timestamp__startswith"])] = None + search: Annotated[ + str | None, + FilterLookup( + [ + "snapshot__url__icontains", + "snapshot__title__icontains", + "snapshot__tags__name__icontains", + "plugin", + "output_str__icontains", + "id__startswith", + "snapshot__id__startswith", + "snapshot__timestamp__startswith", + ], + ), + ] = None + snapshot_id: Annotated[str | None, FilterLookup(["snapshot__id__startswith", "snapshot__timestamp__startswith"])] = None + snapshot_url: Annotated[str | None, FilterLookup("snapshot__url__icontains")] = None + snapshot_tag: Annotated[str | None, FilterLookup("snapshot__tags__name__icontains")] = None + status: Annotated[str | None, FilterLookup("status")] = None + output_str: Annotated[str | None, FilterLookup("output_str__icontains")] = None + plugin: Annotated[str | None, FilterLookup("plugin__icontains")] = None + hook_name: Annotated[str | None, FilterLookup("hook_name__icontains")] = None + process_id: Annotated[str | None, FilterLookup("process__id__startswith")] = None + cmd: Annotated[str | None, FilterLookup("cmd__0__icontains")] = None + pwd: Annotated[str | None, FilterLookup("pwd__icontains")] = None + cmd_version: Annotated[str | None, FilterLookup("cmd_version")] = None + created_at: Annotated[datetime | None, FilterLookup("created_at")] = None + created_at__gte: Annotated[datetime | None, FilterLookup("created_at__gte")] = None + created_at__lt: Annotated[datetime | None, FilterLookup("created_at__lt")] = None -@router.get("/archiveresults", response=List[ArchiveResultSchema], url_name="get_archiveresult") +@router.get("/archiveresults", response=list[ArchiveResultSchema], url_name="get_archiveresult") @paginate(CustomPagination) def get_archiveresults(request: HttpRequest, filters: Query[ArchiveResultFilterSchema]): """List all ArchiveResult entries matching these filters.""" @@ -167,8 +207,9 @@ def get_archiveresult(request: HttpRequest, archiveresult_id: str): ### Snapshot ######################################################################### + class SnapshotSchema(Schema): - TYPE: str = 'core.models.Snapshot' + TYPE: str = "core.models.Snapshot" id: UUID created_by_id: str created_by_username: str @@ -177,14 +218,16 @@ class SnapshotSchema(Schema): status: str retry_at: datetime | None bookmarked_at: datetime - downloaded_at: Optional[datetime] + downloaded_at: datetime | None url: str - tags: List[str] - title: Optional[str] + tags: list[str] + title: str | None timestamp: str archive_path: str + archive_size: int + output_size: int num_archiveresults: int - archiveresults: List[MinimalArchiveResultSchema] + archiveresults: list[MinimalArchiveResultSchema] @staticmethod def resolve_created_by_id(obj): @@ -198,13 +241,21 @@ class SnapshotSchema(Schema): def resolve_tags(obj): return sorted(tag.name for tag in obj.tags.all()) + @staticmethod + def resolve_archive_size(obj): + return int(getattr(obj, "output_size_sum", obj.archive_size) or 0) + + @staticmethod + def resolve_output_size(obj): + return SnapshotSchema.resolve_archive_size(obj) + @staticmethod def resolve_num_archiveresults(obj, context): return obj.archiveresult_set.all().distinct().count() @staticmethod def resolve_archiveresults(obj, context): - if bool(getattr(context['request'], 'with_archiveresults', False)): + if bool(getattr(context["request"], "with_archiveresults", False)): return obj.archiveresult_set.all().distinct() return ArchiveResult.objects.none() @@ -212,16 +263,16 @@ class SnapshotSchema(Schema): class SnapshotUpdateSchema(Schema): status: str | None = None retry_at: datetime | None = None - tags: Optional[List[str]] = None + tags: list[str] | None = None class SnapshotCreateSchema(Schema): url: str - crawl_id: Optional[str] = None + crawl_id: str | None = None depth: int = 0 - title: Optional[str] = None - tags: Optional[List[str]] = None - status: Optional[str] = None + title: str | None = None + tags: list[str] | None = None + status: str | None = None class SnapshotDeleteResponseSchema(Schema): @@ -231,77 +282,82 @@ class SnapshotDeleteResponseSchema(Schema): deleted_count: int -def normalize_tag_list(tags: Optional[List[str]] = None) -> List[str]: +def normalize_tag_list(tags: list[str] | None = None) -> list[str]: return [tag.strip() for tag in (tags or []) if tag and tag.strip()] class SnapshotFilterSchema(FilterSchema): - id: Annotated[Optional[str], FilterLookup(['id__icontains', 'timestamp__startswith'])] = None - created_by_id: Annotated[Optional[str], FilterLookup('crawl__created_by_id')] = None - created_by_username: Annotated[Optional[str], FilterLookup('crawl__created_by__username__icontains')] = None - created_at__gte: Annotated[Optional[datetime], FilterLookup('created_at__gte')] = None - created_at__lt: Annotated[Optional[datetime], FilterLookup('created_at__lt')] = None - created_at: Annotated[Optional[datetime], FilterLookup('created_at')] = None - modified_at: Annotated[Optional[datetime], FilterLookup('modified_at')] = None - modified_at__gte: Annotated[Optional[datetime], FilterLookup('modified_at__gte')] = None - modified_at__lt: Annotated[Optional[datetime], FilterLookup('modified_at__lt')] = None - search: Annotated[Optional[str], FilterLookup(['url__icontains', 'title__icontains', 'tags__name__icontains', 'id__icontains', 'timestamp__startswith'])] = None - url: Annotated[Optional[str], FilterLookup('url')] = None - tag: Annotated[Optional[str], FilterLookup('tags__name')] = None - title: Annotated[Optional[str], FilterLookup('title__icontains')] = None - timestamp: Annotated[Optional[str], FilterLookup('timestamp__startswith')] = None - bookmarked_at__gte: Annotated[Optional[datetime], FilterLookup('bookmarked_at__gte')] = None - bookmarked_at__lt: Annotated[Optional[datetime], FilterLookup('bookmarked_at__lt')] = None + id: Annotated[str | None, FilterLookup(["id__icontains", "timestamp__startswith"])] = None + created_by_id: Annotated[str | None, FilterLookup("crawl__created_by_id")] = None + created_by_username: Annotated[str | None, FilterLookup("crawl__created_by__username__icontains")] = None + created_at__gte: Annotated[datetime | None, FilterLookup("created_at__gte")] = None + created_at__lt: Annotated[datetime | None, FilterLookup("created_at__lt")] = None + created_at: Annotated[datetime | None, FilterLookup("created_at")] = None + modified_at: Annotated[datetime | None, FilterLookup("modified_at")] = None + modified_at__gte: Annotated[datetime | None, FilterLookup("modified_at__gte")] = None + modified_at__lt: Annotated[datetime | None, FilterLookup("modified_at__lt")] = None + search: Annotated[ + str | None, + FilterLookup(["url__icontains", "title__icontains", "tags__name__icontains", "id__icontains", "timestamp__startswith"]), + ] = None + url: Annotated[str | None, FilterLookup("url")] = None + tag: Annotated[str | None, FilterLookup("tags__name")] = None + title: Annotated[str | None, FilterLookup("title__icontains")] = None + timestamp: Annotated[str | None, FilterLookup("timestamp__startswith")] = None + bookmarked_at__gte: Annotated[datetime | None, FilterLookup("bookmarked_at__gte")] = None + bookmarked_at__lt: Annotated[datetime | None, FilterLookup("bookmarked_at__lt")] = None -@router.get("/snapshots", response=List[SnapshotSchema], url_name="get_snapshots") +@router.get("/snapshots", response=list[SnapshotSchema], url_name="get_snapshots") @paginate(CustomPagination) def get_snapshots(request: HttpRequest, filters: Query[SnapshotFilterSchema], with_archiveresults: bool = False): """List all Snapshot entries matching these filters.""" - setattr(request, 'with_archiveresults', with_archiveresults) - return filters.filter(Snapshot.objects.all()).distinct() + setattr(request, "with_archiveresults", with_archiveresults) + queryset = Snapshot.objects.annotate(output_size_sum=Coalesce(Sum("archiveresult__output_size"), 0)) + return filters.filter(queryset).distinct() @router.get("/snapshot/{snapshot_id}", response=SnapshotSchema, url_name="get_snapshot") def get_snapshot(request: HttpRequest, snapshot_id: str, with_archiveresults: bool = True): """Get a specific Snapshot by id.""" - setattr(request, 'with_archiveresults', with_archiveresults) + setattr(request, "with_archiveresults", with_archiveresults) + queryset = Snapshot.objects.annotate(output_size_sum=Coalesce(Sum("archiveresult__output_size"), 0)) try: - return Snapshot.objects.get(Q(id__startswith=snapshot_id) | Q(timestamp__startswith=snapshot_id)) + return queryset.get(Q(id__startswith=snapshot_id) | Q(timestamp__startswith=snapshot_id)) except Snapshot.DoesNotExist: - return Snapshot.objects.get(Q(id__icontains=snapshot_id)) + return queryset.get(Q(id__icontains=snapshot_id)) @router.post("/snapshots", response=SnapshotSchema, url_name="create_snapshot") def create_snapshot(request: HttpRequest, data: SnapshotCreateSchema): tags = normalize_tag_list(data.tags) if data.status is not None and data.status not in Snapshot.StatusChoices.values: - raise HttpError(400, f'Invalid status: {data.status}') + raise HttpError(400, f"Invalid status: {data.status}") if not data.url.strip(): - raise HttpError(400, 'URL is required') + raise HttpError(400, "URL is required") if data.depth not in (0, 1, 2, 3, 4): - raise HttpError(400, 'depth must be between 0 and 4') + raise HttpError(400, "depth must be between 0 and 4") if data.crawl_id: crawl = Crawl.objects.get(id__icontains=data.crawl_id) - crawl_tags = normalize_tag_list(crawl.tags_str.split(',')) + crawl_tags = normalize_tag_list(crawl.tags_str.split(",")) tags = tags or crawl_tags else: crawl = Crawl.objects.create( urls=data.url, max_depth=max(data.depth, 0), - tags_str=','.join(tags), + tags_str=",".join(tags), status=Crawl.StatusChoices.QUEUED, retry_at=timezone.now(), created_by=request.user if isinstance(request.user, User) else None, ) snapshot_defaults = { - 'depth': data.depth, - 'title': data.title, - 'timestamp': str(timezone.now().timestamp()), - 'status': data.status or Snapshot.StatusChoices.QUEUED, - 'retry_at': timezone.now(), + "depth": data.depth, + "title": data.title, + "timestamp": str(timezone.now().timestamp()), + "status": data.status or Snapshot.StatusChoices.QUEUED, + "retry_at": timezone.now(), } snapshot, _ = Snapshot.objects.get_or_create( url=data.url, @@ -309,17 +365,17 @@ def create_snapshot(request: HttpRequest, data: SnapshotCreateSchema): defaults=snapshot_defaults, ) - update_fields: List[str] = [] + update_fields: list[str] = [] if data.title is not None and snapshot.title != data.title: snapshot.title = data.title - update_fields.append('title') + update_fields.append("title") if data.status is not None and snapshot.status != data.status: if data.status not in Snapshot.StatusChoices.values: - raise HttpError(400, f'Invalid status: {data.status}') + raise HttpError(400, f"Invalid status: {data.status}") snapshot.status = data.status - update_fields.append('status') + update_fields.append("status") if update_fields: - update_fields.append('modified_at') + update_fields.append("modified_at") snapshot.save(update_fields=update_fields) if tags: @@ -330,7 +386,7 @@ def create_snapshot(request: HttpRequest, data: SnapshotCreateSchema): except Exception: pass - setattr(request, 'with_archiveresults', False) + setattr(request, "with_archiveresults", False) return snapshot @@ -343,26 +399,26 @@ def patch_snapshot(request: HttpRequest, snapshot_id: str, data: SnapshotUpdateS snapshot = Snapshot.objects.get(Q(id__icontains=snapshot_id)) payload = data.dict(exclude_unset=True) - update_fields = ['modified_at'] - tags = payload.pop('tags', None) + update_fields = ["modified_at"] + tags = payload.pop("tags", None) - if 'status' in payload: - if payload['status'] not in Snapshot.StatusChoices.values: - raise HttpError(400, f'Invalid status: {payload["status"]}') - snapshot.status = payload['status'] - if snapshot.status == Snapshot.StatusChoices.SEALED and 'retry_at' not in payload: + if "status" in payload: + if payload["status"] not in Snapshot.StatusChoices.values: + raise HttpError(400, f"Invalid status: {payload['status']}") + snapshot.status = payload["status"] + if snapshot.status == Snapshot.StatusChoices.SEALED and "retry_at" not in payload: snapshot.retry_at = None - update_fields.append('status') + update_fields.append("status") - if 'retry_at' in payload: - snapshot.retry_at = payload['retry_at'] - update_fields.append('retry_at') + if "retry_at" in payload: + snapshot.retry_at = payload["retry_at"] + update_fields.append("retry_at") if tags is not None: snapshot.save_tags(normalize_tag_list(tags)) snapshot.save(update_fields=update_fields) - setattr(request, 'with_archiveresults', False) + setattr(request, "with_archiveresults", False) return snapshot @@ -373,17 +429,18 @@ def delete_snapshot(request: HttpRequest, snapshot_id: str): crawl_id_str = str(snapshot.crawl.pk) deleted_count, _ = snapshot.delete() return { - 'success': True, - 'snapshot_id': snapshot_id_str, - 'crawl_id': crawl_id_str, - 'deleted_count': deleted_count, + "success": True, + "snapshot_id": snapshot_id_str, + "crawl_id": crawl_id_str, + "deleted_count": deleted_count, } ### Tag ######################################################################### + class TagSchema(Schema): - TYPE: str = 'core.models.Tag' + TYPE: str = "core.models.Tag" id: int modified_at: datetime created_at: datetime @@ -392,7 +449,7 @@ class TagSchema(Schema): name: str slug: str num_snapshots: int - snapshots: List[SnapshotSchema] + snapshots: list[SnapshotSchema] @staticmethod def resolve_created_by_id(obj): @@ -402,7 +459,7 @@ class TagSchema(Schema): def resolve_created_by_username(obj): user_model = get_user_model() user = user_model.objects.get(id=obj.created_by_id) - username = getattr(user, 'username', None) + username = getattr(user, "username", None) return username if isinstance(username, str) else str(user) @staticmethod @@ -411,58 +468,67 @@ class TagSchema(Schema): @staticmethod def resolve_snapshots(obj, context): - if bool(getattr(context['request'], 'with_snapshots', False)): + if bool(getattr(context["request"], "with_snapshots", False)): return obj.snapshot_set.all().distinct() return Snapshot.objects.none() -@router.get("/tags", response=List[TagSchema], url_name="get_tags") +@router.get("/tags", response=list[TagSchema], url_name="get_tags") @paginate(CustomPagination) def get_tags(request: HttpRequest): - setattr(request, 'with_snapshots', False) - setattr(request, 'with_archiveresults', False) + setattr(request, "with_snapshots", False) + setattr(request, "with_archiveresults", False) return get_matching_tags() @router.get("/tag/{tag_id}", response=TagSchema, url_name="get_tag") def get_tag(request: HttpRequest, tag_id: str, with_snapshots: bool = True): - setattr(request, 'with_snapshots', with_snapshots) - setattr(request, 'with_archiveresults', False) + setattr(request, "with_snapshots", with_snapshots) + setattr(request, "with_archiveresults", False) try: return get_tag_by_ref(tag_id) except (Tag.DoesNotExist, ValidationError): - raise HttpError(404, 'Tag not found') + raise HttpError(404, "Tag not found") -@router.get("/any/{id}", response=Union[SnapshotSchema, ArchiveResultSchema, TagSchema, CrawlSchema], url_name="get_any", summary="Get any object by its ID") +@router.get( + "/any/{id}", + response=Union[SnapshotSchema, ArchiveResultSchema, TagSchema, CrawlSchema], + url_name="get_any", + summary="Get any object by its ID", +) def get_any(request: HttpRequest, id: str): """Get any object by its ID (e.g. snapshot, archiveresult, tag, crawl, etc.).""" - setattr(request, 'with_snapshots', False) - setattr(request, 'with_archiveresults', False) + setattr(request, "with_snapshots", False) + setattr(request, "with_archiveresults", False) for getter in [get_snapshot, get_archiveresult, get_tag]: try: response = getter(request, id) if isinstance(response, Model): - return redirect(f"/api/v1/{response._meta.app_label}/{response._meta.model_name}/{response.pk}?{request.META['QUERY_STRING']}") + return redirect( + f"/api/v1/{response._meta.app_label}/{response._meta.model_name}/{response.pk}?{request.META['QUERY_STRING']}", + ) except Exception: pass try: from archivebox.api.v1_crawls import get_crawl + response = get_crawl(request, id) if isinstance(response, Model): return redirect(f"/api/v1/{response._meta.app_label}/{response._meta.model_name}/{response.pk}?{request.META['QUERY_STRING']}") except Exception: pass - raise HttpError(404, 'Object with given ID not found') + raise HttpError(404, "Object with given ID not found") ### Tag Editor API Endpoints ######################################################################### + class TagAutocompleteSchema(Schema): - tags: List[dict] + tags: list[dict] class TagCreateSchema(Schema): @@ -483,7 +549,7 @@ class TagSearchSnapshotSchema(Schema): favicon_url: str admin_url: str archive_url: str - downloaded_at: Optional[str] = None + downloaded_at: str | None = None class TagSearchCardSchema(Schema): @@ -497,11 +563,11 @@ class TagSearchCardSchema(Schema): export_jsonl_url: str rename_url: str delete_url: str - snapshots: List[TagSearchSnapshotSchema] + snapshots: list[TagSearchSnapshotSchema] class TagSearchResponseSchema(Schema): - tags: List[TagSearchCardSchema] + tags: list[TagSearchCardSchema] sort: str created_by: str year: str @@ -527,8 +593,8 @@ class TagDeleteResponseSchema(Schema): class TagSnapshotRequestSchema(Schema): snapshot_id: str - tag_name: Optional[str] = None - tag_id: Optional[int] = None + tag_name: str | None = None + tag_id: int | None = None class TagSnapshotResponseSchema(Schema): @@ -541,10 +607,10 @@ class TagSnapshotResponseSchema(Schema): def search_tags( request: HttpRequest, q: str = "", - sort: str = 'created_desc', - created_by: str = '', - year: str = '', - has_snapshots: str = 'all', + sort: str = "created_desc", + created_by: str = "", + year: str = "", + has_snapshots: str = "all", ): """Return detailed tag cards for admin/live-search UIs.""" normalized_sort = normalize_tag_sort(sort) @@ -552,7 +618,7 @@ def search_tags( normalized_year = normalize_created_year_filter(year) normalized_has_snapshots = normalize_has_snapshots_filter(has_snapshots) return { - 'tags': build_tag_cards( + "tags": build_tag_cards( query=q, request=request, sort=normalized_sort, @@ -560,28 +626,28 @@ def search_tags( year=normalized_year, has_snapshots=normalized_has_snapshots, ), - 'sort': normalized_sort, - 'created_by': normalized_created_by, - 'year': normalized_year, - 'has_snapshots': normalized_has_snapshots, + "sort": normalized_sort, + "created_by": normalized_created_by, + "year": normalized_year, + "has_snapshots": normalized_has_snapshots, } def _public_tag_listing_enabled() -> bool: - explicit = getattr(settings, 'PUBLIC_SNAPSHOTS_LIST', None) + explicit = getattr(settings, "PUBLIC_SNAPSHOTS_LIST", None) if explicit is not None: return bool(explicit) - return bool(getattr(settings, 'PUBLIC_INDEX', SERVER_CONFIG.PUBLIC_INDEX)) + return bool(getattr(settings, "PUBLIC_INDEX", SERVER_CONFIG.PUBLIC_INDEX)) def _request_has_tag_autocomplete_access(request: HttpRequest) -> bool: - user = getattr(request, 'user', None) - if getattr(user, 'is_authenticated', False): + user = getattr(request, "user", None) + if getattr(user, "is_authenticated", False): return True - token = request.GET.get('api_key') or request.headers.get('X-ArchiveBox-API-Key') - auth_header = request.headers.get('Authorization', '') - if not token and auth_header.lower().startswith('bearer '): + token = request.GET.get("api_key") or request.headers.get("X-ArchiveBox-API-Key") + auth_header = request.headers.get("Authorization", "") + if not token and auth_header.lower().startswith("bearer "): token = auth_header.split(None, 1)[1].strip() if token and auth_using_token(token=token, request=request): @@ -594,12 +660,12 @@ def _request_has_tag_autocomplete_access(request: HttpRequest) -> bool: def tags_autocomplete(request: HttpRequest, q: str = ""): """Return tags matching the query for autocomplete.""" if not _request_has_tag_autocomplete_access(request): - raise HttpError(401, 'Authentication required') + raise HttpError(401, "Authentication required") - tags = get_matching_tags(q)[:50 if not q else 20] + tags = get_matching_tags(q)[: 50 if not q else 20] return { - 'tags': [{'id': tag.pk, 'name': tag.name, 'slug': tag.slug, 'num_snapshots': getattr(tag, 'num_snapshots', 0)} for tag in tags] + "tags": [{"id": tag.pk, "name": tag.name, "slug": tag.slug, "num_snapshots": getattr(tag, "num_snapshots", 0)} for tag in tags], } @@ -615,10 +681,10 @@ def tags_create(request: HttpRequest, data: TagCreateSchema): raise HttpError(400, str(err)) from err return { - 'success': True, - 'tag_id': tag.pk, - 'tag_name': tag.name, - 'created': created, + "success": True, + "tag_id": tag.pk, + "tag_name": tag.name, + "created": created, } @@ -627,15 +693,15 @@ def rename_tag(request: HttpRequest, tag_id: int, data: TagUpdateSchema): try: tag = rename_tag_record(get_tag_by_ref(tag_id), data.name) except Tag.DoesNotExist as err: - raise HttpError(404, 'Tag not found') from err + raise HttpError(404, "Tag not found") from err except ValueError as err: raise HttpError(400, str(err)) from err return { - 'success': True, - 'tag_id': tag.pk, - 'tag_name': tag.name, - 'slug': tag.slug, + "success": True, + "tag_id": tag.pk, + "tag_name": tag.name, + "slug": tag.slug, } @@ -644,13 +710,13 @@ def delete_tag(request: HttpRequest, tag_id: int): try: tag = get_tag_by_ref(tag_id) except Tag.DoesNotExist as err: - raise HttpError(404, 'Tag not found') from err + raise HttpError(404, "Tag not found") from err deleted_count, _ = delete_tag_record(tag) return { - 'success': True, - 'tag_id': int(tag_id), - 'deleted_count': deleted_count, + "success": True, + "tag_id": int(tag_id), + "deleted_count": deleted_count, } @@ -659,10 +725,10 @@ def tag_urls_export(request: HttpRequest, tag_id: int): try: tag = get_tag_by_ref(tag_id) except Tag.DoesNotExist as err: - raise HttpError(404, 'Tag not found') from err + raise HttpError(404, "Tag not found") from err - response = HttpResponse(export_tag_urls(tag), content_type='text/plain; charset=utf-8') - response['Content-Disposition'] = f'attachment; filename="tag-{tag.slug}-urls.txt"' + response = HttpResponse(export_tag_urls(tag), content_type="text/plain; charset=utf-8") + response["Content-Disposition"] = f'attachment; filename="tag-{tag.slug}-urls.txt"' return response @@ -671,10 +737,10 @@ def tag_snapshots_export(request: HttpRequest, tag_id: int): try: tag = get_tag_by_ref(tag_id) except Tag.DoesNotExist as err: - raise HttpError(404, 'Tag not found') from err + raise HttpError(404, "Tag not found") from err - response = HttpResponse(export_tag_snapshots_jsonl(tag), content_type='application/x-ndjson; charset=utf-8') - response['Content-Disposition'] = f'attachment; filename="tag-{tag.slug}-snapshots.jsonl"' + response = HttpResponse(export_tag_snapshots_jsonl(tag), content_type="application/x-ndjson; charset=utf-8") + response["Content-Disposition"] = f'attachment; filename="tag-{tag.slug}-snapshots.jsonl"' return response @@ -684,16 +750,16 @@ def tags_add_to_snapshot(request: HttpRequest, data: TagSnapshotRequestSchema): # Get the snapshot try: snapshot = Snapshot.objects.get( - Q(id__startswith=data.snapshot_id) | Q(timestamp__startswith=data.snapshot_id) + Q(id__startswith=data.snapshot_id) | Q(timestamp__startswith=data.snapshot_id), ) except Snapshot.DoesNotExist: - raise HttpError(404, 'Snapshot not found') + raise HttpError(404, "Snapshot not found") except Snapshot.MultipleObjectsReturned: snapshot = Snapshot.objects.filter( - Q(id__startswith=data.snapshot_id) | Q(timestamp__startswith=data.snapshot_id) + Q(id__startswith=data.snapshot_id) | Q(timestamp__startswith=data.snapshot_id), ).first() if snapshot is None: - raise HttpError(404, 'Snapshot not found') + raise HttpError(404, "Snapshot not found") # Get or create the tag if data.tag_name: @@ -708,17 +774,17 @@ def tags_add_to_snapshot(request: HttpRequest, data: TagSnapshotRequestSchema): try: tag = get_tag_by_ref(data.tag_id) except Tag.DoesNotExist: - raise HttpError(404, 'Tag not found') + raise HttpError(404, "Tag not found") else: - raise HttpError(400, 'Either tag_name or tag_id is required') + raise HttpError(400, "Either tag_name or tag_id is required") # Add the tag to the snapshot snapshot.tags.add(tag.pk) return { - 'success': True, - 'tag_id': tag.pk, - 'tag_name': tag.name, + "success": True, + "tag_id": tag.pk, + "tag_name": tag.name, } @@ -728,36 +794,36 @@ def tags_remove_from_snapshot(request: HttpRequest, data: TagSnapshotRequestSche # Get the snapshot try: snapshot = Snapshot.objects.get( - Q(id__startswith=data.snapshot_id) | Q(timestamp__startswith=data.snapshot_id) + Q(id__startswith=data.snapshot_id) | Q(timestamp__startswith=data.snapshot_id), ) except Snapshot.DoesNotExist: - raise HttpError(404, 'Snapshot not found') + raise HttpError(404, "Snapshot not found") except Snapshot.MultipleObjectsReturned: snapshot = Snapshot.objects.filter( - Q(id__startswith=data.snapshot_id) | Q(timestamp__startswith=data.snapshot_id) + Q(id__startswith=data.snapshot_id) | Q(timestamp__startswith=data.snapshot_id), ).first() if snapshot is None: - raise HttpError(404, 'Snapshot not found') + raise HttpError(404, "Snapshot not found") # Get the tag if data.tag_id: try: tag = Tag.objects.get(pk=data.tag_id) except Tag.DoesNotExist: - raise HttpError(404, 'Tag not found') + raise HttpError(404, "Tag not found") elif data.tag_name: try: tag = Tag.objects.get(name__iexact=data.tag_name.strip()) except Tag.DoesNotExist: - raise HttpError(404, 'Tag not found') + raise HttpError(404, "Tag not found") else: - raise HttpError(400, 'Either tag_name or tag_id is required') + raise HttpError(400, "Either tag_name or tag_id is required") # Remove the tag from the snapshot snapshot.tags.remove(tag.pk) return { - 'success': True, - 'tag_id': tag.pk, - 'tag_name': tag.name, + "success": True, + "tag_id": tag.pk, + "tag_name": tag.name, } diff --git a/archivebox/api/v1_crawls.py b/archivebox/api/v1_crawls.py index c695fab5..a925ff18 100644 --- a/archivebox/api/v1_crawls.py +++ b/archivebox/api/v1_crawls.py @@ -1,7 +1,6 @@ -__package__ = 'archivebox.api' +__package__ = "archivebox.api" from uuid import UUID -from typing import List, Optional from datetime import datetime from django.http import HttpRequest from django.utils import timezone @@ -17,11 +16,11 @@ from archivebox.crawls.models import Crawl from .auth import API_AUTH_METHODS -router = Router(tags=['Crawl Models'], auth=API_AUTH_METHODS) +router = Router(tags=["Crawl Models"], auth=API_AUTH_METHODS) class CrawlSchema(Schema): - TYPE: str = 'crawls.models.Crawl' + TYPE: str = "crawls.models.Crawl" id: UUID @@ -35,6 +34,8 @@ class CrawlSchema(Schema): urls: str max_depth: int + max_urls: int + max_size: int tags_str: str config: dict @@ -48,12 +49,12 @@ class CrawlSchema(Schema): def resolve_created_by_username(obj): user_model = get_user_model() user = user_model.objects.get(id=obj.created_by_id) - username = getattr(user, 'username', None) + username = getattr(user, "username", None) return username if isinstance(username, str) else str(user) @staticmethod def resolve_snapshots(obj, context): - if bool(getattr(context['request'], 'with_snapshots', False)): + if bool(getattr(context["request"], "with_snapshots", False)): return obj.snapshot_set.all().distinct() return Snapshot.objects.none() @@ -61,17 +62,19 @@ class CrawlSchema(Schema): class CrawlUpdateSchema(Schema): status: str | None = None retry_at: datetime | None = None - tags: Optional[List[str]] = None + tags: list[str] | None = None tags_str: str | None = None class CrawlCreateSchema(Schema): - urls: List[str] + urls: list[str] max_depth: int = 0 - tags: Optional[List[str]] = None - tags_str: str = '' - label: str = '' - notes: str = '' + max_urls: int = 0 + max_size: int = 0 + tags: list[str] | None = None + tags_str: str = "" + label: str = "" + notes: str = "" config: dict = {} @@ -82,13 +85,13 @@ class CrawlDeleteResponseSchema(Schema): deleted_snapshots: int -def normalize_tag_list(tags: Optional[List[str]] = None, tags_str: str = '') -> List[str]: +def normalize_tag_list(tags: list[str] | None = None, tags_str: str = "") -> list[str]: if tags is not None: return [tag.strip() for tag in tags if tag and tag.strip()] - return [tag.strip() for tag in tags_str.split(',') if tag.strip()] + return [tag.strip() for tag in tags_str.split(",") if tag.strip()] -@router.get("/crawls", response=List[CrawlSchema], url_name="get_crawls") +@router.get("/crawls", response=list[CrawlSchema], url_name="get_crawls") def get_crawls(request: HttpRequest): return Crawl.objects.all().distinct() @@ -97,15 +100,21 @@ def get_crawls(request: HttpRequest): def create_crawl(request: HttpRequest, data: CrawlCreateSchema): urls = [url.strip() for url in data.urls if url and url.strip()] if not urls: - raise HttpError(400, 'At least one URL is required') + raise HttpError(400, "At least one URL is required") if data.max_depth not in (0, 1, 2, 3, 4): - raise HttpError(400, 'max_depth must be between 0 and 4') + raise HttpError(400, "max_depth must be between 0 and 4") + if data.max_urls < 0: + raise HttpError(400, "max_urls must be >= 0") + if data.max_size < 0: + raise HttpError(400, "max_size must be >= 0") tags = normalize_tag_list(data.tags, data.tags_str) crawl = Crawl.objects.create( - urls='\n'.join(urls), + urls="\n".join(urls), max_depth=data.max_depth, - tags_str=','.join(tags), + max_urls=data.max_urls, + max_size=data.max_size, + tags_str=",".join(tags), label=data.label, notes=data.notes, config=data.config, @@ -116,25 +125,26 @@ def create_crawl(request: HttpRequest, data: CrawlCreateSchema): crawl.create_snapshots_from_urls() return crawl + @router.get("/crawl/{crawl_id}", response=CrawlSchema | str, url_name="get_crawl") -def get_crawl(request: HttpRequest, crawl_id: str, as_rss: bool=False, with_snapshots: bool=False, with_archiveresults: bool=False): +def get_crawl(request: HttpRequest, crawl_id: str, as_rss: bool = False, with_snapshots: bool = False, with_archiveresults: bool = False): """Get a specific Crawl by id.""" - setattr(request, 'with_snapshots', with_snapshots) - setattr(request, 'with_archiveresults', with_archiveresults) + setattr(request, "with_snapshots", with_snapshots) + setattr(request, "with_archiveresults", with_archiveresults) crawl = Crawl.objects.get(id__icontains=crawl_id) - + if crawl and as_rss: # return snapshots as XML rss feed urls = [ - {'url': snapshot.url, 'title': snapshot.title, 'bookmarked_at': snapshot.bookmarked_at, 'tags': snapshot.tags_str} + {"url": snapshot.url, "title": snapshot.title, "bookmarked_at": snapshot.bookmarked_at, "tags": snapshot.tags_str} for snapshot in crawl.snapshot_set.all() ] xml = '' for url in urls: - xml += f'{url["url"]}{url["title"]}{url["bookmarked_at"]}{url["tags"]}' - xml += '' + xml += f"{url['url']}{url['title']}{url['bookmarked_at']}{url['tags']}" + xml += "" return xml - + return crawl @@ -143,29 +153,29 @@ def patch_crawl(request: HttpRequest, crawl_id: str, data: CrawlUpdateSchema): """Update a crawl (e.g., set status=sealed to cancel queued work).""" crawl = Crawl.objects.get(id__icontains=crawl_id) payload = data.dict(exclude_unset=True) - update_fields = ['modified_at'] + update_fields = ["modified_at"] - tags = payload.pop('tags', None) - tags_str = payload.pop('tags_str', None) + tags = payload.pop("tags", None) + tags_str = payload.pop("tags_str", None) if tags is not None or tags_str is not None: - crawl.tags_str = ','.join(normalize_tag_list(tags, tags_str or '')) - update_fields.append('tags_str') + crawl.tags_str = ",".join(normalize_tag_list(tags, tags_str or "")) + update_fields.append("tags_str") - if 'status' in payload: - if payload['status'] not in Crawl.StatusChoices.values: - raise HttpError(400, f'Invalid status: {payload["status"]}') - crawl.status = payload['status'] - if crawl.status == Crawl.StatusChoices.SEALED and 'retry_at' not in payload: + if "status" in payload: + if payload["status"] not in Crawl.StatusChoices.values: + raise HttpError(400, f"Invalid status: {payload['status']}") + crawl.status = payload["status"] + if crawl.status == Crawl.StatusChoices.SEALED and "retry_at" not in payload: crawl.retry_at = None - update_fields.append('status') + update_fields.append("status") - if 'retry_at' in payload: - crawl.retry_at = payload['retry_at'] - update_fields.append('retry_at') + if "retry_at" in payload: + crawl.retry_at = payload["retry_at"] + update_fields.append("retry_at") crawl.save(update_fields=update_fields) - if payload.get('status') == Crawl.StatusChoices.SEALED: + if payload.get("status") == Crawl.StatusChoices.SEALED: Snapshot.objects.filter( crawl=crawl, status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED], @@ -184,8 +194,8 @@ def delete_crawl(request: HttpRequest, crawl_id: str): snapshot_count = crawl.snapshot_set.count() deleted_count, _ = crawl.delete() return { - 'success': True, - 'crawl_id': crawl_id_str, - 'deleted_count': deleted_count, - 'deleted_snapshots': snapshot_count, + "success": True, + "crawl_id": crawl_id_str, + "deleted_count": deleted_count, + "deleted_snapshots": snapshot_count, } diff --git a/archivebox/api/v1_machine.py b/archivebox/api/v1_machine.py index 2406ef8c..e18dbe48 100644 --- a/archivebox/api/v1_machine.py +++ b/archivebox/api/v1_machine.py @@ -1,7 +1,7 @@ -__package__ = 'archivebox.api' +__package__ = "archivebox.api" from uuid import UUID -from typing import Annotated, List, Optional +from typing import Annotated from datetime import datetime from django.http import HttpRequest @@ -12,16 +12,18 @@ from ninja.pagination import paginate from archivebox.api.v1_core import CustomPagination -router = Router(tags=['Machine and Dependencies']) +router = Router(tags=["Machine and Dependencies"]) # ============================================================================ # Machine Schemas # ============================================================================ + class MachineSchema(Schema): """Schema for Machine model.""" - TYPE: str = 'machine.Machine' + + TYPE: str = "machine.Machine" id: UUID created_at: datetime modified_at: datetime @@ -43,22 +45,24 @@ class MachineSchema(Schema): class MachineFilterSchema(FilterSchema): - id: Annotated[Optional[str], FilterLookup('id__startswith')] = None - hostname: Annotated[Optional[str], FilterLookup('hostname__icontains')] = None - os_platform: Annotated[Optional[str], FilterLookup('os_platform__icontains')] = None - os_arch: Annotated[Optional[str], FilterLookup('os_arch')] = None - hw_in_docker: Annotated[Optional[bool], FilterLookup('hw_in_docker')] = None - hw_in_vm: Annotated[Optional[bool], FilterLookup('hw_in_vm')] = None - bin_providers: Annotated[Optional[str], FilterLookup('bin_providers__icontains')] = None + id: Annotated[str | None, FilterLookup("id__startswith")] = None + hostname: Annotated[str | None, FilterLookup("hostname__icontains")] = None + os_platform: Annotated[str | None, FilterLookup("os_platform__icontains")] = None + os_arch: Annotated[str | None, FilterLookup("os_arch")] = None + hw_in_docker: Annotated[bool | None, FilterLookup("hw_in_docker")] = None + hw_in_vm: Annotated[bool | None, FilterLookup("hw_in_vm")] = None + bin_providers: Annotated[str | None, FilterLookup("bin_providers__icontains")] = None # ============================================================================ # Binary Schemas # ============================================================================ + class BinarySchema(Schema): """Schema for Binary model.""" - TYPE: str = 'machine.Binary' + + TYPE: str = "machine.Binary" id: UUID created_at: datetime modified_at: datetime @@ -85,23 +89,25 @@ class BinarySchema(Schema): class BinaryFilterSchema(FilterSchema): - id: Annotated[Optional[str], FilterLookup('id__startswith')] = None - name: Annotated[Optional[str], FilterLookup('name__icontains')] = None - binprovider: Annotated[Optional[str], FilterLookup('binprovider')] = None - status: Annotated[Optional[str], FilterLookup('status')] = None - machine_id: Annotated[Optional[str], FilterLookup('machine_id__startswith')] = None - version: Annotated[Optional[str], FilterLookup('version__icontains')] = None + id: Annotated[str | None, FilterLookup("id__startswith")] = None + name: Annotated[str | None, FilterLookup("name__icontains")] = None + binprovider: Annotated[str | None, FilterLookup("binprovider")] = None + status: Annotated[str | None, FilterLookup("status")] = None + machine_id: Annotated[str | None, FilterLookup("machine_id__startswith")] = None + version: Annotated[str | None, FilterLookup("version__icontains")] = None # ============================================================================ # Machine Endpoints # ============================================================================ -@router.get("/machines", response=List[MachineSchema], url_name="get_machines") + +@router.get("/machines", response=list[MachineSchema], url_name="get_machines") @paginate(CustomPagination) def get_machines(request: HttpRequest, filters: Query[MachineFilterSchema]): """List all machines.""" from archivebox.machine.models import Machine + return filters.filter(Machine.objects.all()).distinct() @@ -109,6 +115,7 @@ def get_machines(request: HttpRequest, filters: Query[MachineFilterSchema]): def get_current_machine(request: HttpRequest): """Get the current machine.""" from archivebox.machine.models import Machine + return Machine.current() @@ -117,6 +124,7 @@ def get_machine(request: HttpRequest, machine_id: str): """Get a specific machine by ID.""" from archivebox.machine.models import Machine from django.db.models import Q + return Machine.objects.get(Q(id__startswith=machine_id) | Q(hostname__iexact=machine_id)) @@ -127,23 +135,27 @@ def get_machine(request: HttpRequest, machine_id: str): # Binary Endpoints # ============================================================================ -@router.get("/binaries", response=List[BinarySchema], url_name="get_binaries") + +@router.get("/binaries", response=list[BinarySchema], url_name="get_binaries") @paginate(CustomPagination) def get_binaries(request: HttpRequest, filters: Query[BinaryFilterSchema]): """List all binaries.""" from archivebox.machine.models import Binary - return filters.filter(Binary.objects.all().select_related('machine')).distinct() + + return filters.filter(Binary.objects.all().select_related("machine")).distinct() @router.get("/binary/{binary_id}", response=BinarySchema, url_name="get_binary") def get_binary(request: HttpRequest, binary_id: str): """Get a specific binary by ID.""" from archivebox.machine.models import Binary - return Binary.objects.select_related('machine').get(id__startswith=binary_id) + + return Binary.objects.select_related("machine").get(id__startswith=binary_id) -@router.get("/binary/by-name/{name}", response=List[BinarySchema], url_name="get_binaries_by_name") +@router.get("/binary/by-name/{name}", response=list[BinarySchema], url_name="get_binaries_by_name") def get_binaries_by_name(request: HttpRequest, name: str): """Get all binaries with the given name.""" from archivebox.machine.models import Binary - return list(Binary.objects.filter(name__iexact=name).select_related('machine')) + + return list(Binary.objects.filter(name__iexact=name).select_related("machine")) diff --git a/archivebox/base_models/__init__.py b/archivebox/base_models/__init__.py index 8469c859..7c4b6853 100644 --- a/archivebox/base_models/__init__.py +++ b/archivebox/base_models/__init__.py @@ -1 +1 @@ -__package__ = 'archivebox.base_models' +__package__ = "archivebox.base_models" diff --git a/archivebox/base_models/admin.py b/archivebox/base_models/admin.py index 116e3654..d6703b82 100644 --- a/archivebox/base_models/admin.py +++ b/archivebox/base_models/admin.py @@ -1,6 +1,6 @@ """Base admin classes for models using UUIDv7.""" -__package__ = 'archivebox.base_models' +__package__ = "archivebox.base_models" import json from collections.abc import Mapping @@ -32,11 +32,12 @@ class KeyValueWidget(forms.Widget): with + and - buttons to add/remove rows. Includes autocomplete for available config keys from the plugin system. """ + template_name = "" # We render manually class Media: css = { - 'all': [] + "all": [], } js = [] @@ -44,17 +45,18 @@ class KeyValueWidget(forms.Widget): """Get available config options from plugins.""" try: from archivebox.hooks import discover_plugin_configs + plugin_configs = discover_plugin_configs() options: dict[str, ConfigOption] = {} for plugin_name, schema in plugin_configs.items(): - for key, prop in schema.get('properties', {}).items(): + for key, prop in schema.get("properties", {}).items(): option: ConfigOption = { - 'plugin': plugin_name, - 'type': prop.get('type', 'string'), - 'default': prop.get('default', ''), - 'description': prop.get('description', ''), + "plugin": plugin_name, + "type": prop.get("type", "string"), + "default": prop.get("default", ""), + "description": prop.get("description", ""), } - for schema_key in ('enum', 'pattern', 'minimum', 'maximum'): + for schema_key in ("enum", "pattern", "minimum", "maximum"): if schema_key in prop: option[schema_key] = prop[schema_key] options[key] = option @@ -85,11 +87,11 @@ class KeyValueWidget(forms.Widget): ) -> SafeString: data = self._parse_value(value) - widget_id = attrs.get('id', name) if attrs else name + widget_id = attrs.get("id", name) if attrs else name config_options = self._get_config_options() # Build datalist options - datalist_options = '\n'.join( + datalist_options = "\n".join( f'' for key, opt in sorted(config_options.items()) ) @@ -111,7 +113,7 @@ class KeyValueWidget(forms.Widget): html += self._render_row(widget_id, key, val_str) # Always add one empty row for new entries - html += self._render_row(widget_id, '', '') + html += self._render_row(widget_id, "", "") html += f''' @@ -669,8 +671,8 @@ class KeyValueWidget(forms.Widget): def _escape(self, s: object) -> str: """Escape HTML special chars in attribute values.""" if not s: - return '' - return str(s).replace('&', '&').replace('<', '<').replace('>', '>').replace('"', '"') + return "" + return str(s).replace("&", "&").replace("<", "<").replace(">", ">").replace('"', """) def value_from_datadict( self, @@ -678,8 +680,8 @@ class KeyValueWidget(forms.Widget): files: object, name: str, ) -> str: - value = data.get(name, '{}') - return value if isinstance(value, str) else '{}' + value = data.get(name, "{}") + return value if isinstance(value, str) else "{}" class ConfigEditorMixin(admin.ModelAdmin): @@ -696,14 +698,20 @@ class ConfigEditorMixin(admin.ModelAdmin): **kwargs: object, ) -> forms.Field | None: """Use KeyValueWidget for the config JSON field.""" - if db_field.name == 'config': - kwargs['widget'] = KeyValueWidget() + if db_field.name == "config": + kwargs["widget"] = KeyValueWidget() return super().formfield_for_dbfield(db_field, request, **kwargs) class BaseModelAdmin(DjangoObjectActions, admin.ModelAdmin): - list_display = ('id', 'created_at', 'created_by') - readonly_fields = ('id', 'created_at', 'modified_at') + list_display = ("id", "created_at", "created_by") + readonly_fields = ("id", "created_at", "modified_at") + show_search_mode_selector = False + + def get_default_search_mode(self) -> str: + # The shared changelist template always asks every admin for a default + # search mode, even when the search-mode toggle is hidden. + return "meta" def get_form( self, @@ -713,6 +721,6 @@ class BaseModelAdmin(DjangoObjectActions, admin.ModelAdmin): **kwargs: object, ): form = super().get_form(request, obj, change=change, **kwargs) - if 'created_by' in form.base_fields: - form.base_fields['created_by'].initial = request.user + if "created_by" in form.base_fields: + form.base_fields["created_by"].initial = request.user return form diff --git a/archivebox/base_models/models.py b/archivebox/base_models/models.py index ff360b69..e6913a9c 100755 --- a/archivebox/base_models/models.py +++ b/archivebox/base_models/models.py @@ -1,6 +1,6 @@ """Base models using UUIDv7 for all id fields.""" -__package__ = 'archivebox.base_models' +__package__ = "archivebox.base_models" from archivebox.uuid_compat import uuid7 from pathlib import Path @@ -15,22 +15,22 @@ from django.conf import settings from django_stubs_ext.db.models import TypedModelMeta - -def get_or_create_system_user_pk(username='system'): +def get_or_create_system_user_pk(username="system"): User = get_user_model() # If there's exactly one superuser, use that for all system operations if User.objects.filter(is_superuser=True).count() == 1: - return User.objects.filter(is_superuser=True).values_list('pk', flat=True)[0] + return User.objects.filter(is_superuser=True).values_list("pk", flat=True)[0] # Otherwise get or create the system user user, _ = User.objects.get_or_create( username=username, - defaults={'is_staff': True, 'is_superuser': True, 'email': '', 'password': '!'} + defaults={"is_staff": True, "is_superuser": True, "email": "", "password": "!"}, ) return user.pk class AutoDateTimeField(models.DateTimeField): """DateTimeField that automatically updates on save (legacy compatibility).""" + def pre_save(self, model_instance, add): if add or not getattr(model_instance, self.attname): value = timezone.now() @@ -43,13 +43,19 @@ class ModelWithUUID(models.Model): id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True) created_at = models.DateTimeField(default=timezone.now, db_index=True) modified_at = models.DateTimeField(auto_now=True) - created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False, db_index=True) + created_by = models.ForeignKey( + settings.AUTH_USER_MODEL, + on_delete=models.CASCADE, + default=get_or_create_system_user_pk, + null=False, + db_index=True, + ) class Meta(TypedModelMeta): abstract = True def __str__(self) -> str: - return f'[{self.id}] {self.__class__.__name__}' + return f"[{self.id}] {self.__class__.__name__}" @property def admin_change_url(self) -> str: @@ -57,17 +63,17 @@ class ModelWithUUID(models.Model): @property def api_url(self) -> str: - return str(reverse_lazy('api-1:get_any', args=[self.id])) + return str(reverse_lazy("api-1:get_any", args=[self.id])) @property def api_docs_url(self) -> str: - return f'/api/v1/docs#/{self._meta.app_label.title()}%20Models/api_v1_{self._meta.app_label}_get_{self._meta.db_table}' - + return f"/api/v1/docs#/{self._meta.app_label.title()}%20Models/api_v1_{self._meta.app_label}_get_{self._meta.db_table}" class ModelWithNotes(models.Model): """Mixin for models with a notes field.""" - notes = models.TextField(blank=True, null=False, default='') + + notes = models.TextField(blank=True, null=False, default="") class Meta(TypedModelMeta): abstract = True @@ -75,6 +81,7 @@ class ModelWithNotes(models.Model): class ModelWithHealthStats(models.Model): """Mixin for models with health tracking fields.""" + num_uses_failed = models.PositiveIntegerField(default=0) num_uses_succeeded = models.PositiveIntegerField(default=0) @@ -88,12 +95,13 @@ class ModelWithHealthStats(models.Model): def increment_health_stats(self, success: bool): """Atomically increment success or failure counter using F() expression.""" - field = 'num_uses_succeeded' if success else 'num_uses_failed' + field = "num_uses_succeeded" if success else "num_uses_failed" type(self).objects.filter(pk=self.pk).update(**{field: F(field) + 1}) class ModelWithConfig(models.Model): """Mixin for models with a JSON config field.""" + config = models.JSONField(default=dict, null=True, blank=True, editable=True) class Meta(TypedModelMeta): @@ -111,7 +119,7 @@ class ModelWithOutputDir(ModelWithUUID): @property def output_dir_parent(self) -> str: - return f'{self._meta.model_name}s' + return f"{self._meta.model_name}s" @property def output_dir_name(self) -> str: @@ -119,7 +127,7 @@ class ModelWithOutputDir(ModelWithUUID): @property def output_dir_str(self) -> str: - return f'{self.output_dir_parent}/{self.output_dir_name}' + return f"{self.output_dir_parent}/{self.output_dir_name}" @property def output_dir(self) -> Path: diff --git a/archivebox/cli/__init__.py b/archivebox/cli/__init__.py index a1575a28..13a62c4f 100644 --- a/archivebox/cli/__init__.py +++ b/archivebox/cli/__init__.py @@ -1,5 +1,5 @@ -__package__ = 'archivebox.cli' -__command__ = 'archivebox' +__package__ = "archivebox.cli" +__command__ = "archivebox" import os import sys from importlib import import_module @@ -10,55 +10,55 @@ from rich import print from archivebox.config.version import VERSION - -if '--debug' in sys.argv: - os.environ['DEBUG'] = 'True' - sys.argv.remove('--debug') +if "--debug" in sys.argv: + os.environ["DEBUG"] = "True" + sys.argv.remove("--debug") class ArchiveBoxGroup(click.Group): """lazy loading click group for archivebox commands""" + meta_commands = { - 'help': 'archivebox.cli.archivebox_help.main', - 'version': 'archivebox.cli.archivebox_version.main', - 'mcp': 'archivebox.cli.archivebox_mcp.main', + "help": "archivebox.cli.archivebox_help.main", + "version": "archivebox.cli.archivebox_version.main", + "mcp": "archivebox.cli.archivebox_mcp.main", } setup_commands = { - 'init': 'archivebox.cli.archivebox_init.main', - 'install': 'archivebox.cli.archivebox_install.main', + "init": "archivebox.cli.archivebox_init.main", + "install": "archivebox.cli.archivebox_install.main", } # Model commands (CRUD operations via subcommands) model_commands = { - 'crawl': 'archivebox.cli.archivebox_crawl.main', - 'snapshot': 'archivebox.cli.archivebox_snapshot.main', - 'archiveresult': 'archivebox.cli.archivebox_archiveresult.main', - 'tag': 'archivebox.cli.archivebox_tag.main', - 'binary': 'archivebox.cli.archivebox_binary.main', - 'process': 'archivebox.cli.archivebox_process.main', - 'machine': 'archivebox.cli.archivebox_machine.main', - 'persona': 'archivebox.cli.archivebox_persona.main', + "crawl": "archivebox.cli.archivebox_crawl.main", + "snapshot": "archivebox.cli.archivebox_snapshot.main", + "archiveresult": "archivebox.cli.archivebox_archiveresult.main", + "tag": "archivebox.cli.archivebox_tag.main", + "binary": "archivebox.cli.archivebox_binary.main", + "process": "archivebox.cli.archivebox_process.main", + "machine": "archivebox.cli.archivebox_machine.main", + "persona": "archivebox.cli.archivebox_persona.main", } archive_commands = { # High-level commands - 'add': 'archivebox.cli.archivebox_add.main', - 'extract': 'archivebox.cli.archivebox_extract.main', - 'list': 'archivebox.cli.archivebox_list.main', - 'remove': 'archivebox.cli.archivebox_remove.main', - 'run': 'archivebox.cli.archivebox_run.main', - 'update': 'archivebox.cli.archivebox_update.main', - 'status': 'archivebox.cli.archivebox_status.main', - 'search': 'archivebox.cli.archivebox_search.main', - 'config': 'archivebox.cli.archivebox_config.main', - 'schedule': 'archivebox.cli.archivebox_schedule.main', - 'server': 'archivebox.cli.archivebox_server.main', - 'shell': 'archivebox.cli.archivebox_shell.main', - 'manage': 'archivebox.cli.archivebox_manage.main', + "add": "archivebox.cli.archivebox_add.main", + "extract": "archivebox.cli.archivebox_extract.main", + "list": "archivebox.cli.archivebox_list.main", + "remove": "archivebox.cli.archivebox_remove.main", + "run": "archivebox.cli.archivebox_run.main", + "update": "archivebox.cli.archivebox_update.main", + "status": "archivebox.cli.archivebox_status.main", + "search": "archivebox.cli.archivebox_search.main", + "config": "archivebox.cli.archivebox_config.main", + "schedule": "archivebox.cli.archivebox_schedule.main", + "server": "archivebox.cli.archivebox_server.main", + "shell": "archivebox.cli.archivebox_shell.main", + "manage": "archivebox.cli.archivebox_manage.main", # Introspection commands - 'pluginmap': 'archivebox.cli.archivebox_pluginmap.main', + "pluginmap": "archivebox.cli.archivebox_pluginmap.main", } legacy_model_commands = { - 'crawl': 'archivebox.cli.archivebox_crawl_compat.main', - 'snapshot': 'archivebox.cli.archivebox_snapshot_compat.main', + "crawl": "archivebox.cli.archivebox_crawl_compat.main", + "snapshot": "archivebox.cli.archivebox_snapshot_compat.main", } all_subcommands = { **meta_commands, @@ -67,15 +67,15 @@ class ArchiveBoxGroup(click.Group): **archive_commands, } renamed_commands = { - 'setup': 'install', - 'import': 'add', - 'archive': 'add', + "setup": "install", + "import": "add", + "archive": "add", } legacy_model_subcommands = { - 'crawl': {'create', 'list', 'update', 'delete'}, - 'snapshot': {'create', 'list', 'update', 'delete'}, + "crawl": {"create", "list", "update", "delete"}, + "snapshot": {"create", "list", "update", "delete"}, } - + @classmethod def get_canonical_name(cls, cmd_name): return cls.renamed_commands.get(cmd_name, cmd_name) @@ -90,23 +90,22 @@ class ArchiveBoxGroup(click.Group): except ValueError: return False - remaining_args = sys.argv[arg_idx + 1:] + remaining_args = sys.argv[arg_idx + 1 :] if not remaining_args: return False first_arg = remaining_args[0] - if first_arg in ('-h', '--help'): + if first_arg in ("-h", "--help"): return False return first_arg not in cls.legacy_model_subcommands[cmd_name] - def get_command(self, ctx, cmd_name): # handle renamed commands if cmd_name in self.renamed_commands: new_name = self.renamed_commands[cmd_name] print( - f' [violet]Hint:[/violet] `archivebox {cmd_name}` has been renamed to `archivebox {new_name}`', + f" [violet]Hint:[/violet] `archivebox {cmd_name}` has been renamed to `archivebox {new_name}`", file=sys.stderr, ) cmd_name = new_name @@ -114,11 +113,11 @@ class ArchiveBoxGroup(click.Group): if self._should_use_legacy_model_command(cmd_name): return self._lazy_load(self.legacy_model_commands[cmd_name]) - + # handle lazy loading of commands if cmd_name in self.all_subcommands: return self._lazy_load(cmd_name) - + # fall-back to using click's default command lookup return super().get_command(ctx, cmd_name) @@ -127,72 +126,74 @@ class ArchiveBoxGroup(click.Group): import_path = cls.all_subcommands.get(cmd_name_or_path) if import_path is None: import_path = cmd_name_or_path - modname, funcname = import_path.rsplit('.', 1) - + modname, funcname = import_path.rsplit(".", 1) + # print(f'LAZY LOADING {import_path}') mod = import_module(modname) func = getattr(mod, funcname) - - if not hasattr(func, '__doc__'): - raise ValueError(f'lazy loading of {import_path} failed - no docstring found on method') - + + if not hasattr(func, "__doc__"): + raise ValueError(f"lazy loading of {import_path} failed - no docstring found on method") + # if not isinstance(cmd, click.BaseCommand): - # raise ValueError(f'lazy loading of {import_path} failed - not a click command') - + # raise ValueError(f'lazy loading of {import_path} failed - not a click command') + return func @click.group(cls=ArchiveBoxGroup, invoke_without_command=True) -@click.option('--help', '-h', is_flag=True, help='Show help') -@click.version_option(VERSION, '-v', '--version', package_name='archivebox', message='%(version)s') +@click.option("--help", "-h", is_flag=True, help="Show help") +@click.version_option(VERSION, "-v", "--version", package_name="archivebox", message="%(version)s") @click.pass_context def cli(ctx, help=False): """ArchiveBox: The self-hosted internet archive""" - + subcommand = ArchiveBoxGroup.get_canonical_name(ctx.invoked_subcommand) - + # if --help is passed or no subcommand is given, show custom help message if help or ctx.invoked_subcommand is None: - ctx.invoke(ctx.command.get_command(ctx, 'help')) - + ctx.invoke(ctx.command.get_command(ctx, "help")) + # if the subcommand is in archive_commands or model_commands, # then we need to set up the django environment and check that we're in a valid data folder if subcommand in ArchiveBoxGroup.archive_commands or subcommand in ArchiveBoxGroup.model_commands: # print('SETUP DJANGO AND CHECK DATA FOLDER') try: - if subcommand == 'server': - run_in_debug = '--reload' in sys.argv or os.environ.get('DEBUG') in ('1', 'true', 'True', 'TRUE', 'yes') + if subcommand == "server": + run_in_debug = "--reload" in sys.argv or os.environ.get("DEBUG") in ("1", "true", "True", "TRUE", "yes") if run_in_debug: - os.environ['ARCHIVEBOX_RUNSERVER'] = '1' - if '--reload' in sys.argv: - os.environ['ARCHIVEBOX_AUTORELOAD'] = '1' + os.environ["ARCHIVEBOX_RUNSERVER"] = "1" + if "--reload" in sys.argv: + os.environ["ARCHIVEBOX_AUTORELOAD"] = "1" from archivebox.config.common import STORAGE_CONFIG - os.environ['ARCHIVEBOX_RUNSERVER_PIDFILE'] = str(STORAGE_CONFIG.TMP_DIR / 'runserver.pid') + + os.environ["ARCHIVEBOX_RUNSERVER_PIDFILE"] = str(STORAGE_CONFIG.TMP_DIR / "runserver.pid") from archivebox.config.django import setup_django from archivebox.misc.checks import check_data_folder + setup_django() check_data_folder() except Exception as e: - print(f'[red][X] Error setting up Django or checking data folder: {e}[/red]', file=sys.stderr) - if subcommand not in ('manage', 'shell'): # not all management commands need django to be setup beforehand + print(f"[red][X] Error setting up Django or checking data folder: {e}[/red]", file=sys.stderr) + if subcommand not in ("manage", "shell"): # not all management commands need django to be setup beforehand raise - + def main(args=None, prog_name=None, stdin=None): # show `docker run archivebox xyz` in help messages if running in docker - IN_DOCKER = os.environ.get('IN_DOCKER', False) in ('1', 'true', 'True', 'TRUE', 'yes') + IN_DOCKER = os.environ.get("IN_DOCKER", False) in ("1", "true", "True", "TRUE", "yes") IS_TTY = sys.stdin.isatty() - prog_name = prog_name or (f'docker compose run{"" if IS_TTY else " -T"} archivebox' if IN_DOCKER else 'archivebox') - + prog_name = prog_name or (f"docker compose run{'' if IS_TTY else ' -T'} archivebox" if IN_DOCKER else "archivebox") + # stdin param allows passing input data from caller (used by __main__.py) # currently not used by click-based CLI, but kept for backwards compatibility try: cli(args=args, prog_name=prog_name) except KeyboardInterrupt: - print('\n\n[red][X] Got CTRL+C. Exiting...[/red]') + print("\n\n[red][X] Got CTRL+C. Exiting...[/red]") -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py index e38f4155..ae41dae2 100644 --- a/archivebox/cli/archivebox_add.py +++ b/archivebox/cli/archivebox_add.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 -__package__ = 'archivebox.cli' -__command__ = 'archivebox add' +__package__ = "archivebox.cli" +__command__ = "archivebox add" import sys from pathlib import Path @@ -14,6 +14,7 @@ from django.utils import timezone from django.db.models import QuerySet from archivebox.misc.util import enforce_types, docstring +from archivebox.misc.util import parse_filesize_to_bytes from archivebox import CONSTANTS from archivebox.config.common import ARCHIVING_CONFIG, SERVER_CONFIG from archivebox.config.permissions import USER, HOSTNAME @@ -29,34 +30,38 @@ def _collect_input_urls(args: tuple[str, ...]) -> list[str]: urls: list[str] = [] for record in read_args_or_stdin(args): - url = record.get('url') + url = record.get("url") if isinstance(url, str) and url: urls.append(url) - urls_field = record.get('urls') + urls_field = record.get("urls") if isinstance(urls_field, str): for line in urls_field.splitlines(): line = line.strip() - if line and not line.startswith('#'): + if line and not line.startswith("#"): urls.append(line) return urls @enforce_types -def add(urls: str | list[str], - depth: int | str=0, - tag: str='', - url_allowlist: str='', - url_denylist: str='', - parser: str="auto", - plugins: str="", - persona: str='Default', - overwrite: bool=False, - update: bool | None=None, - index_only: bool=False, - bg: bool=False, - created_by_id: int | None=None) -> tuple['Crawl', QuerySet['Snapshot']]: +def add( + urls: str | list[str], + depth: int | str = 0, + max_urls: int = 0, + max_size: int | str = 0, + tag: str = "", + url_allowlist: str = "", + url_denylist: str = "", + parser: str = "auto", + plugins: str = "", + persona: str = "Default", + overwrite: bool = False, + update: bool | None = None, + index_only: bool = False, + bg: bool = False, + created_by_id: int | None = None, +) -> tuple["Crawl", QuerySet["Snapshot"]]: """Add a new URL or list of URLs to your archive. The flow is: @@ -72,8 +77,15 @@ def add(urls: str | list[str], from rich import print depth = int(depth) + max_urls = int(max_urls or 0) + max_size = parse_filesize_to_bytes(max_size) - assert depth in (0, 1, 2, 3, 4), 'Depth must be 0-4' + if depth not in (0, 1, 2, 3, 4): + raise ValueError("Depth must be 0-4") + if max_urls < 0: + raise ValueError("max_urls must be >= 0") + if max_size < 0: + raise ValueError("max_size must be >= 0") # import models once django is set up from archivebox.core.models import Snapshot @@ -91,47 +103,49 @@ def add(urls: str | list[str], update = not ARCHIVING_CONFIG.ONLY_NEW # 1. Save the provided URLs to sources/2024-11-05__23-59-59__cli_add.txt - sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__cli_add.txt' + sources_file = CONSTANTS.SOURCES_DIR / f"{timezone.now().strftime('%Y-%m-%d__%H-%M-%S')}__cli_add.txt" sources_file.parent.mkdir(parents=True, exist_ok=True) - sources_file.write_text(urls if isinstance(urls, str) else '\n'.join(urls)) + sources_file.write_text(urls if isinstance(urls, str) else "\n".join(urls)) # 2. Create a new Crawl with inline URLs cli_args = [*sys.argv] - if cli_args[0].lower().endswith('archivebox'): - cli_args[0] = 'archivebox' - cmd_str = ' '.join(cli_args) + if cli_args[0].lower().endswith("archivebox"): + cli_args[0] = "archivebox" + cmd_str = " ".join(cli_args) timestamp = timezone.now().strftime("%Y-%m-%d__%H-%M-%S") # Read URLs directly into crawl urls_content = sources_file.read_text() - persona_name = (persona or 'Default').strip() or 'Default' - plugins = plugins or str(get_config().get('PLUGINS') or '') + persona_name = (persona or "Default").strip() or "Default" + plugins = plugins or str(get_config().get("PLUGINS") or "") persona_obj, _ = Persona.objects.get_or_create(name=persona_name) persona_obj.ensure_dirs() crawl = Crawl.objects.create( urls=urls_content, max_depth=depth, + max_urls=max_urls, + max_size=max_size, tags_str=tag, persona_id=persona_obj.id, - label=f'{USER}@{HOSTNAME} $ {cmd_str} [{timestamp}]', + label=f"{USER}@{HOSTNAME} $ {cmd_str} [{timestamp}]", created_by_id=created_by_id, config={ - 'ONLY_NEW': not update, - 'INDEX_ONLY': index_only, - 'OVERWRITE': overwrite, - 'PLUGINS': plugins, - 'DEFAULT_PERSONA': persona_name, - 'PARSER': parser, - **({'URL_ALLOWLIST': url_allowlist} if url_allowlist else {}), - **({'URL_DENYLIST': url_denylist} if url_denylist else {}), - } + "ONLY_NEW": not update, + "INDEX_ONLY": index_only, + "OVERWRITE": overwrite, + "PLUGINS": plugins, + "DEFAULT_PERSONA": persona_name, + "PARSER": parser, + **({"URL_ALLOWLIST": url_allowlist} if url_allowlist else {}), + **({"URL_DENYLIST": url_denylist} if url_denylist else {}), + }, ) - print(f'[green]\\[+] Created Crawl {crawl.id} with max_depth={depth}[/green]') - first_url = crawl.get_urls_list()[0] if crawl.get_urls_list() else '' - print(f' [dim]First URL: {first_url}[/dim]') + print(f"[green]\\[+] Created Crawl {crawl.id} with max_depth={depth}[/green]") + first_url = crawl.get_urls_list()[0] if crawl.get_urls_list() else "" + print(f" [dim]First URL: {first_url}[/dim]") # 3. The CrawlMachine will create Snapshots from all URLs when started # Parser extractors run on snapshots and discover more URLs @@ -139,20 +153,21 @@ def add(urls: str | list[str], if index_only: # Just create the crawl but don't start processing - print('[yellow]\\[*] Index-only mode - crawl created but not started[/yellow]') + print("[yellow]\\[*] Index-only mode - crawl created but not started[/yellow]") # Create snapshots for all URLs in the crawl for url in crawl.get_urls_list(): snapshot, _ = Snapshot.objects.update_or_create( - crawl=crawl, url=url, + crawl=crawl, + url=url, defaults={ - 'status': Snapshot.INITIAL_STATE, - 'retry_at': timezone.now(), - 'timestamp': str(timezone.now().timestamp()), - 'depth': 0, + "status": Snapshot.INITIAL_STATE, + "retry_at": timezone.now(), + "timestamp": str(timezone.now().timestamp()), + "depth": 0, }, ) if tag: - snapshot.save_tags(tag.split(',')) + snapshot.save_tags(tag.split(",")) snapshot.ensure_crawl_symlink() return crawl, crawl.snapshot_set.all() @@ -168,10 +183,12 @@ def add(urls: str | list[str], if bg: # Background mode: just queue work and return (background runner via server will pick it up) - print('[yellow]\\[*] URLs queued. The background runner will process them (run `archivebox server` or `archivebox run --daemon` if not already running).[/yellow]') + print( + "[yellow]\\[*] URLs queued. The background runner will process them (run `archivebox server` or `archivebox run --daemon` if not already running).[/yellow]", + ) else: # Foreground mode: run full crawl runner until all work is done - print('[green]\\[*] Starting crawl runner to process crawl...[/green]') + print("[green]\\[*] Starting crawl runner to process crawl...[/green]") run_crawl(str(crawl.id)) # Print summary for foreground runs @@ -179,7 +196,10 @@ def add(urls: str | list[str], crawl.refresh_from_db() snapshots_count = crawl.snapshot_set.count() try: - total_bytes = sum(s.archive_size for s in crawl.snapshot_set.all()) + from django.db.models import Count, Sum + + totals = crawl.snapshot_set.aggregate(snapshot_count=Count("id"), total_bytes=Sum("archiveresult__output_size")) + total_bytes = int(totals["total_bytes"] or 0) if totals["snapshot_count"] else 0 except Exception: total_bytes, _, _ = get_dir_size(crawl.output_dir) total_size = printable_filesize(total_bytes) @@ -197,23 +217,23 @@ def add(urls: str | list[str], # Output dir relative to DATA_DIR try: rel_output = Path(crawl.output_dir).relative_to(CONSTANTS.DATA_DIR) - rel_output_str = f'./{rel_output}' + rel_output_str = f"./{rel_output}" except Exception: rel_output_str = str(crawl.output_dir) - bind_addr = SERVER_CONFIG.BIND_ADDR or '127.0.0.1:8000' - if bind_addr.startswith('http://') or bind_addr.startswith('https://'): + bind_addr = SERVER_CONFIG.BIND_ADDR or "127.0.0.1:8000" + if bind_addr.startswith("http://") or bind_addr.startswith("https://"): base_url = bind_addr else: - base_url = f'http://{bind_addr}' - admin_url = f'{base_url}/admin/crawls/crawl/{crawl.id}/change/' + base_url = f"http://{bind_addr}" + admin_url = f"{base_url}/admin/crawls/crawl/{crawl.id}/change/" - print('\n[bold]crawl output saved to:[/bold]') - print(f' {rel_output_str}') - print(f' {admin_url}') - print(f'\n[bold]total urls snapshotted:[/bold] {snapshots_count}') - print(f'[bold]total size:[/bold] {total_size}') - print(f'[bold]total time:[/bold] {duration_str}') + print("\n[bold]crawl output saved to:[/bold]") + print(f" {rel_output_str}") + print(f" {admin_url}") + print(f"\n[bold]total urls snapshotted:[/bold] {snapshots_count}") + print(f"[bold]total size:[/bold] {total_size}") + print(f"[bold]total time:[/bold] {duration_str}") except Exception: # Summary is best-effort; avoid failing the command if something goes wrong pass @@ -224,29 +244,43 @@ def add(urls: str | list[str], @click.command() -@click.option('--depth', '-d', type=click.Choice([str(i) for i in range(5)]), default='0', help='Recursively archive linked pages up to N hops away') -@click.option('--tag', '-t', default='', help='Comma-separated list of tags to add to each snapshot e.g. tag1,tag2,tag3') -@click.option('--url-allowlist', '--domain-allowlist', default='', help='Comma-separated URL/domain allowlist for this crawl') -@click.option('--url-denylist', '--domain-denylist', default='', help='Comma-separated URL/domain denylist for this crawl') -@click.option('--parser', default='auto', help='Parser for reading input URLs (auto, txt, html, rss, json, jsonl, netscape, ...)') -@click.option('--plugins', '-p', default='', help='Comma-separated list of plugins to run e.g. title,favicon,screenshot,singlefile,...') -@click.option('--persona', default='Default', help='Authentication profile to use when archiving') -@click.option('--overwrite', '-F', is_flag=True, help='Overwrite existing data if URLs have been archived previously') -@click.option('--update', is_flag=True, default=None, help='Retry any previously skipped/failed URLs when re-adding them') -@click.option('--index-only', is_flag=True, help='Just add the URLs to the index without archiving them now') -@click.option('--bg', is_flag=True, help='Run archiving in background (queue work and return immediately)') -@click.argument('urls', nargs=-1, type=click.Path()) +@click.option( + "--depth", + "-d", + type=click.Choice([str(i) for i in range(5)]), + default="0", + help="Recursively archive linked pages up to N hops away", +) +@click.option("--max-urls", type=int, default=0, help="Maximum number of URLs to snapshot for this crawl (0 = unlimited)") +@click.option("--max-size", default="0", help="Maximum total crawl size in bytes or units like 45mb / 1gb (0 = unlimited)") +@click.option("--tag", "-t", default="", help="Comma-separated list of tags to add to each snapshot e.g. tag1,tag2,tag3") +@click.option("--url-allowlist", "--domain-allowlist", default="", help="Comma-separated URL/domain allowlist for this crawl") +@click.option("--url-denylist", "--domain-denylist", default="", help="Comma-separated URL/domain denylist for this crawl") +@click.option("--parser", default="auto", help="Parser for reading input URLs (auto, txt, html, rss, json, jsonl, netscape, ...)") +@click.option("--plugins", "-p", default="", help="Comma-separated list of plugins to run e.g. title,favicon,screenshot,singlefile,...") +@click.option("--persona", default="Default", help="Authentication profile to use when archiving") +@click.option("--overwrite", "-F", is_flag=True, help="Overwrite existing data if URLs have been archived previously") +@click.option("--update", is_flag=True, default=None, help="Retry any previously skipped/failed URLs when re-adding them") +@click.option("--index-only", is_flag=True, help="Just add the URLs to the index without archiving them now") +@click.option("--bg", is_flag=True, help="Run archiving in background (queue work and return immediately)") +@click.argument("urls", nargs=-1, type=click.Path()) @docstring(add.__doc__) def main(**kwargs): """Add a new URL or list of URLs to your archive""" - raw_urls = kwargs.pop('urls') + raw_urls = kwargs.pop("urls") urls = _collect_input_urls(raw_urls) if not urls: - raise click.UsageError('No URLs provided. Pass URLs as arguments or via stdin.') + raise click.UsageError("No URLs provided. Pass URLs as arguments or via stdin.") + if int(kwargs.get("max_urls") or 0) < 0: + raise click.BadParameter("max_urls must be 0 or a positive integer.", param_hint="--max-urls") + try: + kwargs["max_size"] = parse_filesize_to_bytes(kwargs.get("max_size")) + except ValueError as err: + raise click.BadParameter(str(err), param_hint="--max-size") from err add(urls=urls, **kwargs) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/archivebox/cli/archivebox_archiveresult.py b/archivebox/cli/archivebox_archiveresult.py index 6cf0dffc..9c1eaf7b 100644 --- a/archivebox/cli/archivebox_archiveresult.py +++ b/archivebox/cli/archivebox_archiveresult.py @@ -30,11 +30,10 @@ Examples: archivebox archiveresult list --status=failed | archivebox run """ -__package__ = 'archivebox.cli' -__command__ = 'archivebox archiveresult' +__package__ = "archivebox.cli" +__command__ = "archivebox archiveresult" import sys -from typing import Optional import rich_click as click from rich import print as rprint @@ -42,13 +41,13 @@ from rich import print as rprint from archivebox.cli.cli_utils import apply_filters -def build_archiveresult_request(snapshot_id: str, plugin: str, hook_name: str = '', status: str = 'queued') -> dict: +def build_archiveresult_request(snapshot_id: str, plugin: str, hook_name: str = "", status: str = "queued") -> dict: return { - 'type': 'ArchiveResult', - 'snapshot_id': str(snapshot_id), - 'plugin': plugin, - 'hook_name': hook_name, - 'status': status, + "type": "ArchiveResult", + "snapshot_id": str(snapshot_id), + "plugin": plugin, + "hook_name": hook_name, + "status": status, } @@ -56,10 +55,11 @@ def build_archiveresult_request(snapshot_id: str, plugin: str, hook_name: str = # CREATE # ============================================================================= + def create_archiveresults( - snapshot_id: Optional[str] = None, - plugin: Optional[str] = None, - status: str = 'queued', + snapshot_id: str | None = None, + plugin: str | None = None, + status: str = "queued", ) -> int: """ Create ArchiveResult request records for Snapshots. @@ -86,13 +86,13 @@ def create_archiveresults( snapshots = [Snapshot.objects.get(id=snapshot_id)] pass_through_records = [] except Snapshot.DoesNotExist: - rprint(f'[red]Snapshot not found: {snapshot_id}[/red]', file=sys.stderr) + rprint(f"[red]Snapshot not found: {snapshot_id}[/red]", file=sys.stderr) return 1 else: # Read from stdin records = list(read_stdin()) if not records: - rprint('[yellow]No Snapshot records provided via stdin[/yellow]', file=sys.stderr) + rprint("[yellow]No Snapshot records provided via stdin[/yellow]", file=sys.stderr) return 1 # Separate snapshot records from pass-through records @@ -100,17 +100,17 @@ def create_archiveresults( pass_through_records = [] for record in records: - record_type = record.get('type', '') + record_type = record.get("type", "") if record_type == TYPE_SNAPSHOT: # Pass through the Snapshot record itself pass_through_records.append(record) - if record.get('id'): - snapshot_ids.append(record['id']) + if record.get("id"): + snapshot_ids.append(record["id"]) elif record_type == TYPE_ARCHIVERESULT: # ArchiveResult records: pass through if they have an id - if record.get('id'): + if record.get("id"): pass_through_records.append(record) # If no id, we could create it, but for now just pass through else: @@ -120,9 +120,9 @@ def create_archiveresults( # Other typed records (Crawl, Tag, etc): pass through pass_through_records.append(record) - elif record.get('id'): + elif record.get("id"): # Untyped record with id - assume it's a snapshot ID - snapshot_ids.append(record['id']) + snapshot_ids.append(record["id"]) # Output pass-through records first if not is_tty: @@ -131,15 +131,15 @@ def create_archiveresults( if not snapshot_ids: if pass_through_records: - rprint(f'[dim]Passed through {len(pass_through_records)} records, no new snapshots to process[/dim]', file=sys.stderr) + rprint(f"[dim]Passed through {len(pass_through_records)} records, no new snapshots to process[/dim]", file=sys.stderr) return 0 - rprint('[yellow]No valid Snapshot IDs in input[/yellow]', file=sys.stderr) + rprint("[yellow]No valid Snapshot IDs in input[/yellow]", file=sys.stderr) return 1 snapshots = list(Snapshot.objects.filter(id__in=snapshot_ids)) if not snapshots: - rprint('[yellow]No matching snapshots found[/yellow]', file=sys.stderr) + rprint("[yellow]No matching snapshots found[/yellow]", file=sys.stderr) return 0 if pass_through_records else 1 created_count = 0 @@ -150,7 +150,7 @@ def create_archiveresults( created_count += 1 else: config = get_config(crawl=snapshot.crawl, snapshot=snapshot) - hooks = discover_hooks('Snapshot', config=config) + hooks = discover_hooks("Snapshot", config=config) for hook_path in hooks: hook_name = hook_path.name plugin_name = hook_path.parent.name @@ -158,7 +158,7 @@ def create_archiveresults( write_record(build_archiveresult_request(snapshot.id, plugin_name, hook_name=hook_name, status=status)) created_count += 1 - rprint(f'[green]Created {created_count} archive result request records[/green]', file=sys.stderr) + rprint(f"[green]Created {created_count} archive result request records[/green]", file=sys.stderr) return 0 @@ -166,11 +166,12 @@ def create_archiveresults( # LIST # ============================================================================= + def list_archiveresults( - status: Optional[str] = None, - plugin: Optional[str] = None, - snapshot_id: Optional[str] = None, - limit: Optional[int] = None, + status: str | None = None, + plugin: str | None = None, + snapshot_id: str | None = None, + limit: int | None = None, ) -> int: """ List ArchiveResults as JSONL with optional filters. @@ -183,13 +184,13 @@ def list_archiveresults( is_tty = sys.stdout.isatty() - queryset = ArchiveResult.objects.all().order_by('-start_ts') + queryset = ArchiveResult.objects.all().order_by("-start_ts") # Apply filters filter_kwargs = { - 'status': status, - 'plugin': plugin, - 'snapshot_id': snapshot_id, + "status": status, + "plugin": plugin, + "snapshot_id": snapshot_id, } queryset = apply_filters(queryset, filter_kwargs, limit=limit) @@ -197,20 +198,22 @@ def list_archiveresults( for result in queryset: if is_tty: status_color = { - 'queued': 'yellow', - 'started': 'blue', - 'succeeded': 'green', - 'failed': 'red', - 'skipped': 'dim', - 'noresults': 'dim', - 'backoff': 'magenta', - }.get(result.status, 'dim') - rprint(f'[{status_color}]{result.status:10}[/{status_color}] {result.plugin:15} [dim]{result.id}[/dim] {result.snapshot.url[:40]}') + "queued": "yellow", + "started": "blue", + "succeeded": "green", + "failed": "red", + "skipped": "dim", + "noresults": "dim", + "backoff": "magenta", + }.get(result.status, "dim") + rprint( + f"[{status_color}]{result.status:10}[/{status_color}] {result.plugin:15} [dim]{result.id}[/dim] {result.snapshot.url[:40]}", + ) else: write_record(result.to_json()) count += 1 - rprint(f'[dim]Listed {count} archive results[/dim]', file=sys.stderr) + rprint(f"[dim]Listed {count} archive results[/dim]", file=sys.stderr) return 0 @@ -218,8 +221,9 @@ def list_archiveresults( # UPDATE # ============================================================================= + def update_archiveresults( - status: Optional[str] = None, + status: str | None = None, ) -> int: """ Update ArchiveResults from stdin JSONL. @@ -238,12 +242,12 @@ def update_archiveresults( records = list(read_stdin()) if not records: - rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr) + rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr) return 1 updated_count = 0 for record in records: - result_id = record.get('id') + result_id = record.get("id") if not result_id: continue @@ -261,10 +265,10 @@ def update_archiveresults( write_record(result.to_json()) except ArchiveResult.DoesNotExist: - rprint(f'[yellow]ArchiveResult not found: {result_id}[/yellow]', file=sys.stderr) + rprint(f"[yellow]ArchiveResult not found: {result_id}[/yellow]", file=sys.stderr) continue - rprint(f'[green]Updated {updated_count} archive results[/green]', file=sys.stderr) + rprint(f"[green]Updated {updated_count} archive results[/green]", file=sys.stderr) return 0 @@ -272,6 +276,7 @@ def update_archiveresults( # DELETE # ============================================================================= + def delete_archiveresults(yes: bool = False, dry_run: bool = False) -> int: """ Delete ArchiveResults from stdin JSONL. @@ -287,37 +292,37 @@ def delete_archiveresults(yes: bool = False, dry_run: bool = False) -> int: records = list(read_stdin()) if not records: - rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr) + rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr) return 1 - result_ids = [r.get('id') for r in records if r.get('id')] + result_ids = [r.get("id") for r in records if r.get("id")] if not result_ids: - rprint('[yellow]No valid archive result IDs in input[/yellow]', file=sys.stderr) + rprint("[yellow]No valid archive result IDs in input[/yellow]", file=sys.stderr) return 1 results = ArchiveResult.objects.filter(id__in=result_ids) count = results.count() if count == 0: - rprint('[yellow]No matching archive results found[/yellow]', file=sys.stderr) + rprint("[yellow]No matching archive results found[/yellow]", file=sys.stderr) return 0 if dry_run: - rprint(f'[yellow]Would delete {count} archive results (dry run)[/yellow]', file=sys.stderr) + rprint(f"[yellow]Would delete {count} archive results (dry run)[/yellow]", file=sys.stderr) for result in results[:10]: - rprint(f' [dim]{result.id}[/dim] {result.plugin} {result.snapshot.url[:40]}', file=sys.stderr) + rprint(f" [dim]{result.id}[/dim] {result.plugin} {result.snapshot.url[:40]}", file=sys.stderr) if count > 10: - rprint(f' ... and {count - 10} more', file=sys.stderr) + rprint(f" ... and {count - 10} more", file=sys.stderr) return 0 if not yes: - rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr) + rprint("[red]Use --yes to confirm deletion[/red]", file=sys.stderr) return 1 # Perform deletion deleted_count, _ = results.delete() - rprint(f'[green]Deleted {deleted_count} archive results[/green]', file=sys.stderr) + rprint(f"[green]Deleted {deleted_count} archive results[/green]", file=sys.stderr) return 0 @@ -325,51 +330,58 @@ def delete_archiveresults(yes: bool = False, dry_run: bool = False) -> int: # CLI Commands # ============================================================================= + @click.group() def main(): """Manage ArchiveResult records (plugin extraction results).""" pass -@main.command('create') -@click.option('--snapshot-id', help='Snapshot ID to create results for') -@click.option('--plugin', '-p', help='Plugin name (e.g., screenshot, singlefile)') -@click.option('--status', '-s', default='queued', help='Initial status (default: queued)') -def create_cmd(snapshot_id: Optional[str], plugin: Optional[str], status: str): +@main.command("create") +@click.option("--snapshot-id", help="Snapshot ID to create results for") +@click.option("--plugin", "-p", help="Plugin name (e.g., screenshot, singlefile)") +@click.option("--status", "-s", default="queued", help="Initial status (default: queued)") +def create_cmd(snapshot_id: str | None, plugin: str | None, status: str): """Create ArchiveResults for Snapshots from stdin JSONL.""" sys.exit(create_archiveresults(snapshot_id=snapshot_id, plugin=plugin, status=status)) -@main.command('list') -@click.option('--status', '-s', help='Filter by status (queued, started, succeeded, failed, skipped)') -@click.option('--plugin', '-p', help='Filter by plugin name') -@click.option('--snapshot-id', help='Filter by snapshot ID') -@click.option('--limit', '-n', type=int, help='Limit number of results') -def list_cmd(status: Optional[str], plugin: Optional[str], - snapshot_id: Optional[str], limit: Optional[int]): +@main.command("list") +@click.option("--status", "-s", help="Filter by status (queued, started, succeeded, failed, skipped)") +@click.option("--plugin", "-p", help="Filter by plugin name") +@click.option("--snapshot-id", help="Filter by snapshot ID") +@click.option("--limit", "-n", type=int, help="Limit number of results") +def list_cmd( + status: str | None, + plugin: str | None, + snapshot_id: str | None, + limit: int | None, +): """List ArchiveResults as JSONL.""" - sys.exit(list_archiveresults( - status=status, - plugin=plugin, - snapshot_id=snapshot_id, - limit=limit, - )) + sys.exit( + list_archiveresults( + status=status, + plugin=plugin, + snapshot_id=snapshot_id, + limit=limit, + ), + ) -@main.command('update') -@click.option('--status', '-s', help='Set status') -def update_cmd(status: Optional[str]): +@main.command("update") +@click.option("--status", "-s", help="Set status") +def update_cmd(status: str | None): """Update ArchiveResults from stdin JSONL.""" sys.exit(update_archiveresults(status=status)) -@main.command('delete') -@click.option('--yes', '-y', is_flag=True, help='Confirm deletion') -@click.option('--dry-run', is_flag=True, help='Show what would be deleted') +@main.command("delete") +@click.option("--yes", "-y", is_flag=True, help="Confirm deletion") +@click.option("--dry-run", is_flag=True, help="Show what would be deleted") def delete_cmd(yes: bool, dry_run: bool): """Delete ArchiveResults from stdin JSONL.""" sys.exit(delete_archiveresults(yes=yes, dry_run=dry_run)) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/archivebox/cli/archivebox_binary.py b/archivebox/cli/archivebox_binary.py index 7737dd3f..d156d8cc 100644 --- a/archivebox/cli/archivebox_binary.py +++ b/archivebox/cli/archivebox_binary.py @@ -25,11 +25,10 @@ Examples: archivebox binary list --name=chrome | archivebox binary delete --yes """ -__package__ = 'archivebox.cli' -__command__ = 'archivebox binary' +__package__ = "archivebox.cli" +__command__ = "archivebox binary" import sys -from typing import Optional import rich_click as click from rich import print as rprint @@ -41,10 +40,11 @@ from archivebox.cli.cli_utils import apply_filters # CREATE # ============================================================================= + def create_binary( name: str, abspath: str, - version: str = '', + version: str = "", ) -> int: """ Create/register a Binary. @@ -59,7 +59,7 @@ def create_binary( is_tty = sys.stdout.isatty() if not name or not abspath: - rprint('[red]Both --name and --abspath are required[/red]', file=sys.stderr) + rprint("[red]Both --name and --abspath are required[/red]", file=sys.stderr) return 1 try: @@ -76,28 +76,30 @@ def create_binary( # Mirror the Binary model lifecycle used elsewhere in the system so CLI # records are owned by the current machine and can be safely piped into # `archivebox run` without creating invalid rows missing machine_id. - binary = Binary.from_json({ - 'name': name, - 'abspath': abspath, - 'version': version, - 'binproviders': 'env', - 'binprovider': 'env', - }) + binary = Binary.from_json( + { + "name": name, + "abspath": abspath, + "version": version, + "binproviders": "env", + "binprovider": "env", + }, + ) if binary is None: - raise ValueError('failed to create binary record') + raise ValueError("failed to create binary record") if not is_tty: write_record(binary.to_json()) if created: - rprint(f'[green]Created binary: {name} at {abspath}[/green]', file=sys.stderr) + rprint(f"[green]Created binary: {name} at {abspath}[/green]", file=sys.stderr) else: - rprint(f'[dim]Binary already exists: {name} at {abspath}[/dim]', file=sys.stderr) + rprint(f"[dim]Binary already exists: {name} at {abspath}[/dim]", file=sys.stderr) return 0 except Exception as e: - rprint(f'[red]Error creating binary: {e}[/red]', file=sys.stderr) + rprint(f"[red]Error creating binary: {e}[/red]", file=sys.stderr) return 1 @@ -105,11 +107,12 @@ def create_binary( # LIST # ============================================================================= + def list_binaries( - name: Optional[str] = None, - abspath__icontains: Optional[str] = None, - version__icontains: Optional[str] = None, - limit: Optional[int] = None, + name: str | None = None, + abspath__icontains: str | None = None, + version__icontains: str | None = None, + limit: int | None = None, ) -> int: """ List Binaries as JSONL with optional filters. @@ -122,25 +125,25 @@ def list_binaries( is_tty = sys.stdout.isatty() - queryset = Binary.objects.all().order_by('name', '-modified_at', '-created_at') + queryset = Binary.objects.all().order_by("name", "-modified_at", "-created_at") # Apply filters filter_kwargs = { - 'name': name, - 'abspath__icontains': abspath__icontains, - 'version__icontains': version__icontains, + "name": name, + "abspath__icontains": abspath__icontains, + "version__icontains": version__icontains, } queryset = apply_filters(queryset, filter_kwargs, limit=limit) count = 0 for binary in queryset: if is_tty: - rprint(f'[cyan]{binary.name:20}[/cyan] [dim]{binary.version:15}[/dim] {binary.abspath}') + rprint(f"[cyan]{binary.name:20}[/cyan] [dim]{binary.version:15}[/dim] {binary.abspath}") else: write_record(binary.to_json()) count += 1 - rprint(f'[dim]Listed {count} binaries[/dim]', file=sys.stderr) + rprint(f"[dim]Listed {count} binaries[/dim]", file=sys.stderr) return 0 @@ -148,9 +151,10 @@ def list_binaries( # UPDATE # ============================================================================= + def update_binaries( - version: Optional[str] = None, - abspath: Optional[str] = None, + version: str | None = None, + abspath: str | None = None, ) -> int: """ Update Binaries from stdin JSONL. @@ -169,12 +173,12 @@ def update_binaries( records = list(read_stdin()) if not records: - rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr) + rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr) return 1 updated_count = 0 for record in records: - binary_id = record.get('id') + binary_id = record.get("id") if not binary_id: continue @@ -194,10 +198,10 @@ def update_binaries( write_record(binary.to_json()) except Binary.DoesNotExist: - rprint(f'[yellow]Binary not found: {binary_id}[/yellow]', file=sys.stderr) + rprint(f"[yellow]Binary not found: {binary_id}[/yellow]", file=sys.stderr) continue - rprint(f'[green]Updated {updated_count} binaries[/green]', file=sys.stderr) + rprint(f"[green]Updated {updated_count} binaries[/green]", file=sys.stderr) return 0 @@ -205,6 +209,7 @@ def update_binaries( # DELETE # ============================================================================= + def delete_binaries(yes: bool = False, dry_run: bool = False) -> int: """ Delete Binaries from stdin JSONL. @@ -220,35 +225,35 @@ def delete_binaries(yes: bool = False, dry_run: bool = False) -> int: records = list(read_stdin()) if not records: - rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr) + rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr) return 1 - binary_ids = [r.get('id') for r in records if r.get('id')] + binary_ids = [r.get("id") for r in records if r.get("id")] if not binary_ids: - rprint('[yellow]No valid binary IDs in input[/yellow]', file=sys.stderr) + rprint("[yellow]No valid binary IDs in input[/yellow]", file=sys.stderr) return 1 binaries = Binary.objects.filter(id__in=binary_ids) count = binaries.count() if count == 0: - rprint('[yellow]No matching binaries found[/yellow]', file=sys.stderr) + rprint("[yellow]No matching binaries found[/yellow]", file=sys.stderr) return 0 if dry_run: - rprint(f'[yellow]Would delete {count} binaries (dry run)[/yellow]', file=sys.stderr) + rprint(f"[yellow]Would delete {count} binaries (dry run)[/yellow]", file=sys.stderr) for binary in binaries: - rprint(f' {binary.name} {binary.abspath}', file=sys.stderr) + rprint(f" {binary.name} {binary.abspath}", file=sys.stderr) return 0 if not yes: - rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr) + rprint("[red]Use --yes to confirm deletion[/red]", file=sys.stderr) return 1 # Perform deletion deleted_count, _ = binaries.delete() - rprint(f'[green]Deleted {deleted_count} binaries[/green]', file=sys.stderr) + rprint(f"[green]Deleted {deleted_count} binaries[/green]", file=sys.stderr) return 0 @@ -256,52 +261,59 @@ def delete_binaries(yes: bool = False, dry_run: bool = False) -> int: # CLI Commands # ============================================================================= + @click.group() def main(): """Manage Binary records (detected executables).""" pass -@main.command('create') -@click.option('--name', '-n', required=True, help='Binary name (e.g., chrome, wget)') -@click.option('--abspath', '-p', required=True, help='Absolute path to binary') -@click.option('--version', '-v', default='', help='Binary version') +@main.command("create") +@click.option("--name", "-n", required=True, help="Binary name (e.g., chrome, wget)") +@click.option("--abspath", "-p", required=True, help="Absolute path to binary") +@click.option("--version", "-v", default="", help="Binary version") def create_cmd(name: str, abspath: str, version: str): """Create/register a Binary.""" sys.exit(create_binary(name=name, abspath=abspath, version=version)) -@main.command('list') -@click.option('--name', '-n', help='Filter by name') -@click.option('--abspath__icontains', help='Filter by path contains') -@click.option('--version__icontains', help='Filter by version contains') -@click.option('--limit', type=int, help='Limit number of results') -def list_cmd(name: Optional[str], abspath__icontains: Optional[str], - version__icontains: Optional[str], limit: Optional[int]): +@main.command("list") +@click.option("--name", "-n", help="Filter by name") +@click.option("--abspath__icontains", help="Filter by path contains") +@click.option("--version__icontains", help="Filter by version contains") +@click.option("--limit", type=int, help="Limit number of results") +def list_cmd( + name: str | None, + abspath__icontains: str | None, + version__icontains: str | None, + limit: int | None, +): """List Binaries as JSONL.""" - sys.exit(list_binaries( - name=name, - abspath__icontains=abspath__icontains, - version__icontains=version__icontains, - limit=limit, - )) + sys.exit( + list_binaries( + name=name, + abspath__icontains=abspath__icontains, + version__icontains=version__icontains, + limit=limit, + ), + ) -@main.command('update') -@click.option('--version', '-v', help='Set version') -@click.option('--abspath', '-p', help='Set path') -def update_cmd(version: Optional[str], abspath: Optional[str]): +@main.command("update") +@click.option("--version", "-v", help="Set version") +@click.option("--abspath", "-p", help="Set path") +def update_cmd(version: str | None, abspath: str | None): """Update Binaries from stdin JSONL.""" sys.exit(update_binaries(version=version, abspath=abspath)) -@main.command('delete') -@click.option('--yes', '-y', is_flag=True, help='Confirm deletion') -@click.option('--dry-run', is_flag=True, help='Show what would be deleted') +@main.command("delete") +@click.option("--yes", "-y", is_flag=True, help="Confirm deletion") +@click.option("--dry-run", is_flag=True, help="Show what would be deleted") def delete_cmd(yes: bool, dry_run: bool): """Delete Binaries from stdin JSONL.""" sys.exit(delete_binaries(yes=yes, dry_run=dry_run)) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/archivebox/cli/archivebox_config.py b/archivebox/cli/archivebox_config.py index aa576658..f21087af 100644 --- a/archivebox/cli/archivebox_config.py +++ b/archivebox/cli/archivebox_config.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 -__package__ = 'archivebox.cli' +__package__ = "archivebox.cli" import sys import rich_click as click @@ -12,12 +12,14 @@ from archivebox.misc.toml_util import CustomTOMLEncoder @enforce_types -def config(*keys, - get: bool=False, - set: bool=False, - search: bool=False, - reset: bool=False, - **kwargs) -> None: +def config( + *keys, + get: bool = False, + set: bool = False, + search: bool = False, + reset: bool = False, + **kwargs, +) -> None: """Get and set your ArchiveBox project configuration values""" from archivebox.misc.checks import check_data_folder @@ -29,8 +31,8 @@ def config(*keys, FLAT_CONFIG = get_flat_config() CONFIGS = get_all_configs() - - config_options: list[str] = list(kwargs.pop('key=value', []) or keys or [f'{key}={val}' for key, val in kwargs.items()]) + + config_options: list[str] = list(kwargs.pop("key=value", []) or keys or [f"{key}={val}" for key, val in kwargs.items()]) no_args = not (get or set or reset or config_options) matching_config = {} @@ -39,19 +41,19 @@ def config(*keys, config_options = [get_real_name(key) for key in config_options] matching_config = {key: FLAT_CONFIG[key] for key in config_options if key in FLAT_CONFIG} for config_section in CONFIGS.values(): - aliases = getattr(config_section, 'aliases', {}) - + aliases = getattr(config_section, "aliases", {}) + for search_key in config_options: # search all aliases in the section for alias_key, key in aliases.items(): if search_key.lower() in alias_key.lower(): matching_config[key] = dict(config_section)[key] - + # search all keys and values in the section for existing_key, value in dict(config_section).items(): if search_key.lower() in existing_key.lower() or search_key.lower() in str(value).lower(): matching_config[existing_key] = value - + print(printable_config(matching_config)) raise SystemExit(not matching_config) @@ -61,23 +63,23 @@ def config(*keys, matching_config = {key: FLAT_CONFIG[key] for key in config_options if key in FLAT_CONFIG} failed_config = [key for key in config_options if key not in FLAT_CONFIG] if failed_config: - print('\n[red][X] These options failed to get[/red]') - print(' {}'.format('\n '.join(config_options))) + print("\n[red][X] These options failed to get[/red]") + print(" {}".format("\n ".join(config_options))) raise SystemExit(1) else: matching_config = FLAT_CONFIG # Display core config sections for config_section in CONFIGS.values(): - section_header = getattr(config_section, 'toml_section_header', '') + section_header = getattr(config_section, "toml_section_header", "") if isinstance(section_header, str) and section_header: - print(f'[grey53]\\[{section_header}][/grey53]') + print(f"[grey53]\\[{section_header}][/grey53]") else: - print('[grey53]\\[CONSTANTS] # (read-only)[/grey53]') + print("[grey53]\\[CONSTANTS] # (read-only)[/grey53]") kv_in_section = {key: val for key, val in dict(config_section).items() if key in matching_config} - print(benedict(kv_in_section).to_toml(encoder=CustomTOMLEncoder()).strip().replace('\n\n', '\n')) - print('[grey53]################################################################[/grey53]') + print(benedict(kv_in_section).to_toml(encoder=CustomTOMLEncoder()).strip().replace("\n\n", "\n")) + print("[grey53]################################################################[/grey53]") # Display plugin config section from archivebox.hooks import discover_plugin_configs @@ -87,17 +89,17 @@ def config(*keys, # Collect all plugin config keys for plugin_name, schema in plugin_configs.items(): - if 'properties' not in schema: + if "properties" not in schema: continue - for key in schema['properties'].keys(): + for key in schema["properties"].keys(): if key in matching_config: plugin_keys[key] = matching_config[key] # Display all plugin config in single [PLUGINS] section if plugin_keys: - print('[grey53]\\[PLUGINS][/grey53]') - print(benedict(plugin_keys).to_toml(encoder=CustomTOMLEncoder()).strip().replace('\n\n', '\n')) - print('[grey53]################################################################[/grey53]') + print("[grey53]\\[PLUGINS][/grey53]") + print(benedict(plugin_keys).to_toml(encoder=CustomTOMLEncoder()).strip().replace("\n\n", "\n")) + print("[grey53]################################################################[/grey53]") raise SystemExit(not matching_config) @@ -105,18 +107,20 @@ def config(*keys, new_config = {} failed_options = [] for line in config_options: - if line.startswith('#') or not line.strip(): + if line.startswith("#") or not line.strip(): continue - if '=' not in line: - print('[red][X] Config KEY=VALUE must have an = sign in it[/red]') - print(f' {line}') + if "=" not in line: + print("[red][X] Config KEY=VALUE must have an = sign in it[/red]") + print(f" {line}") raise SystemExit(2) - raw_key, val = line.split('=', 1) + raw_key, val = line.split("=", 1) raw_key = raw_key.upper().strip() key = get_real_name(raw_key) if key != raw_key: - print(f'[yellow][i] Note: The config option {raw_key} has been renamed to {key}, please use the new name going forwards.[/yellow]') + print( + f"[yellow][i] Note: The config option {raw_key} has been renamed to {key}, please use the new name going forwards.[/yellow]", + ) if key in FLAT_CONFIG: new_config[key] = val.strip() @@ -136,38 +140,38 @@ def config(*keys, if side_effect_changes: print(file=sys.stderr) - print('[yellow][i] Note: This change also affected these other options that depended on it:[/yellow]', file=sys.stderr) - print(' {}'.format(printable_config(side_effect_changes, prefix=' ')), file=sys.stderr) + print("[yellow][i] Note: This change also affected these other options that depended on it:[/yellow]", file=sys.stderr) + print(" {}".format(printable_config(side_effect_changes, prefix=" ")), file=sys.stderr) if failed_options: print() - print('[red][X] These options failed to set (check for typos):[/red]') - print(' {}'.format('\n '.join(failed_options))) + print("[red][X] These options failed to set (check for typos):[/red]") + print(" {}".format("\n ".join(failed_options))) raise SystemExit(1) elif reset: - print('[red][X] This command is not implemented yet.[/red]') - print(' Please manually remove the relevant lines from your config file:') + print("[red][X] This command is not implemented yet.[/red]") + print(" Please manually remove the relevant lines from your config file:") raise SystemExit(2) else: - print('[red][X] You must pass either --get or --set, or no arguments to get the whole config.[/red]') - print(' archivebox config') - print(' archivebox config --get SOME_KEY') - print(' archivebox config --set SOME_KEY=SOME_VALUE') + print("[red][X] You must pass either --get or --set, or no arguments to get the whole config.[/red]") + print(" archivebox config") + print(" archivebox config --get SOME_KEY") + print(" archivebox config --set SOME_KEY=SOME_VALUE") raise SystemExit(2) @click.command() -@click.option('--search', is_flag=True, help='Search config KEYs, VALUEs, and ALIASES for the given term') -@click.option('--get', is_flag=True, help='Get the value for the given config KEYs') -@click.option('--set', is_flag=True, help='Set the given KEY=VALUE config values') -@click.option('--reset', is_flag=True, help='Reset the given KEY config values to their defaults') -@click.argument('KEY=VALUE', nargs=-1, type=str) +@click.option("--search", is_flag=True, help="Search config KEYs, VALUEs, and ALIASES for the given term") +@click.option("--get", is_flag=True, help="Get the value for the given config KEYs") +@click.option("--set", is_flag=True, help="Set the given KEY=VALUE config values") +@click.option("--reset", is_flag=True, help="Reset the given KEY config values to their defaults") +@click.argument("KEY=VALUE", nargs=-1, type=str) @docstring(config.__doc__) def main(**kwargs) -> None: config(**kwargs) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/archivebox/cli/archivebox_crawl.py b/archivebox/cli/archivebox_crawl.py index 59f176cd..c2b3c901 100644 --- a/archivebox/cli/archivebox_crawl.py +++ b/archivebox/cli/archivebox_crawl.py @@ -30,11 +30,11 @@ Examples: archivebox crawl create https://example.com | archivebox snapshot create | archivebox run """ -__package__ = 'archivebox.cli' -__command__ = 'archivebox crawl' +__package__ = "archivebox.cli" +__command__ = "archivebox crawl" import sys -from typing import Optional, Iterable +from collections.abc import Iterable import rich_click as click from rich import print as rprint @@ -46,12 +46,13 @@ from archivebox.cli.cli_utils import apply_filters # CREATE # ============================================================================= + def create_crawl( urls: Iterable[str], depth: int = 0, - tag: str = '', - status: str = 'queued', - created_by_id: Optional[int] = None, + tag: str = "", + status: str = "queued", + created_by_id: int | None = None, ) -> int: """ Create a Crawl job from URLs. @@ -74,7 +75,7 @@ def create_crawl( records = list(read_args_or_stdin(urls)) if not records: - rprint('[yellow]No URLs provided. Pass URLs as arguments or via stdin.[/yellow]', file=sys.stderr) + rprint("[yellow]No URLs provided. Pass URLs as arguments or via stdin.[/yellow]", file=sys.stderr) return 1 # Separate pass-through records from URL records @@ -82,29 +83,29 @@ def create_crawl( pass_through_records = [] for record in records: - record_type = record.get('type', '') + record_type = record.get("type", "") # Pass-through: output records that aren't URL/Crawl types - if record_type and record_type != TYPE_CRAWL and not record.get('url') and not record.get('urls'): + if record_type and record_type != TYPE_CRAWL and not record.get("url") and not record.get("urls"): pass_through_records.append(record) continue # Handle existing Crawl records (just pass through with id) - if record_type == TYPE_CRAWL and record.get('id'): + if record_type == TYPE_CRAWL and record.get("id"): pass_through_records.append(record) continue # Collect URLs - url = record.get('url') + url = record.get("url") if url: url_list.append(url) # Handle 'urls' field (newline-separated) - urls_field = record.get('urls') + urls_field = record.get("urls") if urls_field: - for line in urls_field.split('\n'): + for line in urls_field.split("\n"): line = line.strip() - if line and not line.startswith('#'): + if line and not line.startswith("#"): url_list.append(line) # Output pass-through records first @@ -115,44 +116,44 @@ def create_crawl( if not url_list: if pass_through_records: # If we had pass-through records but no URLs, that's OK - rprint(f'[dim]Passed through {len(pass_through_records)} records, no new URLs[/dim]', file=sys.stderr) + rprint(f"[dim]Passed through {len(pass_through_records)} records, no new URLs[/dim]", file=sys.stderr) return 0 - rprint('[red]No valid URLs found[/red]', file=sys.stderr) + rprint("[red]No valid URLs found[/red]", file=sys.stderr) return 1 try: # Build crawl record with all URLs as newline-separated string crawl_record = { - 'urls': '\n'.join(url_list), - 'max_depth': depth, - 'tags_str': tag, - 'status': status, - 'label': '', + "urls": "\n".join(url_list), + "max_depth": depth, + "tags_str": tag, + "status": status, + "label": "", } - crawl = Crawl.from_json(crawl_record, overrides={'created_by_id': created_by_id}) + crawl = Crawl.from_json(crawl_record, overrides={"created_by_id": created_by_id}) if not crawl: - rprint('[red]Failed to create crawl[/red]', file=sys.stderr) + rprint("[red]Failed to create crawl[/red]", file=sys.stderr) return 1 # Output JSONL record (only when piped) if not is_tty: write_record(crawl.to_json()) - rprint(f'[green]Created crawl with {len(url_list)} URLs[/green]', file=sys.stderr) + rprint(f"[green]Created crawl with {len(url_list)} URLs[/green]", file=sys.stderr) # If TTY, show human-readable output if is_tty: - rprint(f' [dim]{crawl.id}[/dim]', file=sys.stderr) + rprint(f" [dim]{crawl.id}[/dim]", file=sys.stderr) for url in url_list[:5]: # Show first 5 URLs - rprint(f' {url[:70]}', file=sys.stderr) + rprint(f" {url[:70]}", file=sys.stderr) if len(url_list) > 5: - rprint(f' ... and {len(url_list) - 5} more', file=sys.stderr) + rprint(f" ... and {len(url_list) - 5} more", file=sys.stderr) return 0 except Exception as e: - rprint(f'[red]Error creating crawl: {e}[/red]', file=sys.stderr) + rprint(f"[red]Error creating crawl: {e}[/red]", file=sys.stderr) return 1 @@ -160,11 +161,12 @@ def create_crawl( # LIST # ============================================================================= + def list_crawls( - status: Optional[str] = None, - urls__icontains: Optional[str] = None, - max_depth: Optional[int] = None, - limit: Optional[int] = None, + status: str | None = None, + urls__icontains: str | None = None, + max_depth: int | None = None, + limit: int | None = None, ) -> int: """ List Crawls as JSONL with optional filters. @@ -177,13 +179,13 @@ def list_crawls( is_tty = sys.stdout.isatty() - queryset = Crawl.objects.all().order_by('-created_at') + queryset = Crawl.objects.all().order_by("-created_at") # Apply filters filter_kwargs = { - 'status': status, - 'urls__icontains': urls__icontains, - 'max_depth': max_depth, + "status": status, + "urls__icontains": urls__icontains, + "max_depth": max_depth, } queryset = apply_filters(queryset, filter_kwargs, limit=limit) @@ -191,17 +193,17 @@ def list_crawls( for crawl in queryset: if is_tty: status_color = { - 'queued': 'yellow', - 'started': 'blue', - 'sealed': 'green', - }.get(crawl.status, 'dim') - url_preview = crawl.urls[:50].replace('\n', ' ') - rprint(f'[{status_color}]{crawl.status:8}[/{status_color}] [dim]{crawl.id}[/dim] {url_preview}...') + "queued": "yellow", + "started": "blue", + "sealed": "green", + }.get(crawl.status, "dim") + url_preview = crawl.urls[:50].replace("\n", " ") + rprint(f"[{status_color}]{crawl.status:8}[/{status_color}] [dim]{crawl.id}[/dim] {url_preview}...") else: write_record(crawl.to_json()) count += 1 - rprint(f'[dim]Listed {count} crawls[/dim]', file=sys.stderr) + rprint(f"[dim]Listed {count} crawls[/dim]", file=sys.stderr) return 0 @@ -209,9 +211,10 @@ def list_crawls( # UPDATE # ============================================================================= + def update_crawls( - status: Optional[str] = None, - max_depth: Optional[int] = None, + status: str | None = None, + max_depth: int | None = None, ) -> int: """ Update Crawls from stdin JSONL. @@ -232,12 +235,12 @@ def update_crawls( records = list(read_stdin()) if not records: - rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr) + rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr) return 1 updated_count = 0 for record in records: - crawl_id = record.get('id') + crawl_id = record.get("id") if not crawl_id: continue @@ -258,10 +261,10 @@ def update_crawls( write_record(crawl.to_json()) except Crawl.DoesNotExist: - rprint(f'[yellow]Crawl not found: {crawl_id}[/yellow]', file=sys.stderr) + rprint(f"[yellow]Crawl not found: {crawl_id}[/yellow]", file=sys.stderr) continue - rprint(f'[green]Updated {updated_count} crawls[/green]', file=sys.stderr) + rprint(f"[green]Updated {updated_count} crawls[/green]", file=sys.stderr) return 0 @@ -269,6 +272,7 @@ def update_crawls( # DELETE # ============================================================================= + def delete_crawls(yes: bool = False, dry_run: bool = False) -> int: """ Delete Crawls from stdin JSONL. @@ -284,36 +288,36 @@ def delete_crawls(yes: bool = False, dry_run: bool = False) -> int: records = list(read_stdin()) if not records: - rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr) + rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr) return 1 - crawl_ids = [r.get('id') for r in records if r.get('id')] + crawl_ids = [r.get("id") for r in records if r.get("id")] if not crawl_ids: - rprint('[yellow]No valid crawl IDs in input[/yellow]', file=sys.stderr) + rprint("[yellow]No valid crawl IDs in input[/yellow]", file=sys.stderr) return 1 crawls = Crawl.objects.filter(id__in=crawl_ids) count = crawls.count() if count == 0: - rprint('[yellow]No matching crawls found[/yellow]', file=sys.stderr) + rprint("[yellow]No matching crawls found[/yellow]", file=sys.stderr) return 0 if dry_run: - rprint(f'[yellow]Would delete {count} crawls (dry run)[/yellow]', file=sys.stderr) + rprint(f"[yellow]Would delete {count} crawls (dry run)[/yellow]", file=sys.stderr) for crawl in crawls: - url_preview = crawl.urls[:50].replace('\n', ' ') - rprint(f' [dim]{crawl.id}[/dim] {url_preview}...', file=sys.stderr) + url_preview = crawl.urls[:50].replace("\n", " ") + rprint(f" [dim]{crawl.id}[/dim] {url_preview}...", file=sys.stderr) return 0 if not yes: - rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr) + rprint("[red]Use --yes to confirm deletion[/red]", file=sys.stderr) return 1 # Perform deletion deleted_count, _ = crawls.delete() - rprint(f'[green]Deleted {deleted_count} crawls[/green]', file=sys.stderr) + rprint(f"[green]Deleted {deleted_count} crawls[/green]", file=sys.stderr) return 0 @@ -321,53 +325,60 @@ def delete_crawls(yes: bool = False, dry_run: bool = False) -> int: # CLI Commands # ============================================================================= + @click.group() def main(): """Manage Crawl records.""" pass -@main.command('create') -@click.argument('urls', nargs=-1) -@click.option('--depth', '-d', type=int, default=0, help='Max crawl depth (default: 0)') -@click.option('--tag', '-t', default='', help='Comma-separated tags to add') -@click.option('--status', '-s', default='queued', help='Initial status (default: queued)') +@main.command("create") +@click.argument("urls", nargs=-1) +@click.option("--depth", "-d", type=int, default=0, help="Max crawl depth (default: 0)") +@click.option("--tag", "-t", default="", help="Comma-separated tags to add") +@click.option("--status", "-s", default="queued", help="Initial status (default: queued)") def create_cmd(urls: tuple, depth: int, tag: str, status: str): """Create a Crawl job from URLs or stdin.""" sys.exit(create_crawl(urls, depth=depth, tag=tag, status=status)) -@main.command('list') -@click.option('--status', '-s', help='Filter by status (queued, started, sealed)') -@click.option('--urls__icontains', help='Filter by URLs contains') -@click.option('--max-depth', type=int, help='Filter by max depth') -@click.option('--limit', '-n', type=int, help='Limit number of results') -def list_cmd(status: Optional[str], urls__icontains: Optional[str], - max_depth: Optional[int], limit: Optional[int]): +@main.command("list") +@click.option("--status", "-s", help="Filter by status (queued, started, sealed)") +@click.option("--urls__icontains", help="Filter by URLs contains") +@click.option("--max-depth", type=int, help="Filter by max depth") +@click.option("--limit", "-n", type=int, help="Limit number of results") +def list_cmd( + status: str | None, + urls__icontains: str | None, + max_depth: int | None, + limit: int | None, +): """List Crawls as JSONL.""" - sys.exit(list_crawls( - status=status, - urls__icontains=urls__icontains, - max_depth=max_depth, - limit=limit, - )) + sys.exit( + list_crawls( + status=status, + urls__icontains=urls__icontains, + max_depth=max_depth, + limit=limit, + ), + ) -@main.command('update') -@click.option('--status', '-s', help='Set status') -@click.option('--max-depth', type=int, help='Set max depth') -def update_cmd(status: Optional[str], max_depth: Optional[int]): +@main.command("update") +@click.option("--status", "-s", help="Set status") +@click.option("--max-depth", type=int, help="Set max depth") +def update_cmd(status: str | None, max_depth: int | None): """Update Crawls from stdin JSONL.""" sys.exit(update_crawls(status=status, max_depth=max_depth)) -@main.command('delete') -@click.option('--yes', '-y', is_flag=True, help='Confirm deletion') -@click.option('--dry-run', is_flag=True, help='Show what would be deleted') +@main.command("delete") +@click.option("--yes", "-y", is_flag=True, help="Confirm deletion") +@click.option("--dry-run", is_flag=True, help="Show what would be deleted") def delete_cmd(yes: bool, dry_run: bool): """Delete Crawls from stdin JSONL.""" sys.exit(delete_crawls(yes=yes, dry_run=dry_run)) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/archivebox/cli/archivebox_crawl_compat.py b/archivebox/cli/archivebox_crawl_compat.py index cec7bc39..e7679675 100644 --- a/archivebox/cli/archivebox_crawl_compat.py +++ b/archivebox/cli/archivebox_crawl_compat.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 -__package__ = 'archivebox.cli' -__command__ = 'archivebox crawl' +__package__ = "archivebox.cli" +__command__ = "archivebox crawl" import sys @@ -10,12 +10,12 @@ import rich_click as click from archivebox.cli.archivebox_add import add -@click.command(context_settings={'ignore_unknown_options': True}) -@click.option('--depth', '-d', type=int, default=0, help='Max crawl depth (default: 0)') -@click.option('--tag', '-t', default='', help='Comma-separated tags to add') -@click.option('--status', '-s', default='queued', help='Initial status (default: queued)') -@click.option('--wait/--no-wait', 'wait', default=True, help='Accepted for backwards compatibility') -@click.argument('urls', nargs=-1) +@click.command(context_settings={"ignore_unknown_options": True}) +@click.option("--depth", "-d", type=int, default=0, help="Max crawl depth (default: 0)") +@click.option("--tag", "-t", default="", help="Comma-separated tags to add") +@click.option("--status", "-s", default="queued", help="Initial status (default: queued)") +@click.option("--wait/--no-wait", "wait", default=True, help="Accepted for backwards compatibility") +@click.argument("urls", nargs=-1) def main(depth: int, tag: str, status: str, wait: bool, urls: tuple[str, ...]): """Backwards-compatible `archivebox crawl URL...` entrypoint.""" del status, wait @@ -23,5 +23,5 @@ def main(depth: int, tag: str, status: str, wait: bool, urls: tuple[str, ...]): sys.exit(0) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/archivebox/cli/archivebox_extract.py b/archivebox/cli/archivebox_extract.py index 8f132a58..054382c1 100644 --- a/archivebox/cli/archivebox_extract.py +++ b/archivebox/cli/archivebox_extract.py @@ -27,8 +27,8 @@ Examples: archivebox crawl https://example.com | archivebox snapshot | archivebox extract """ -__package__ = 'archivebox.cli' -__command__ = 'archivebox extract' +__package__ = "archivebox.cli" +__command__ = "archivebox extract" import sys from collections import defaultdict @@ -52,51 +52,52 @@ def process_archiveresult_by_id(archiveresult_id: str) -> int: try: archiveresult = ArchiveResult.objects.get(id=archiveresult_id) except ArchiveResult.DoesNotExist: - rprint(f'[red]ArchiveResult {archiveresult_id} not found[/red]', file=sys.stderr) + rprint(f"[red]ArchiveResult {archiveresult_id} not found[/red]", file=sys.stderr) return 1 - rprint(f'[blue]Extracting {archiveresult.plugin} for {archiveresult.snapshot.url}[/blue]', file=sys.stderr) + rprint(f"[blue]Extracting {archiveresult.plugin} for {archiveresult.snapshot.url}[/blue]", file=sys.stderr) try: archiveresult.reset_for_retry() snapshot = archiveresult.snapshot snapshot.status = snapshot.StatusChoices.QUEUED snapshot.retry_at = timezone.now() - snapshot.save(update_fields=['status', 'retry_at', 'modified_at']) + snapshot.save(update_fields=["status", "retry_at", "modified_at"]) crawl = snapshot.crawl if crawl.status != crawl.StatusChoices.STARTED: crawl.status = crawl.StatusChoices.QUEUED crawl.retry_at = timezone.now() - crawl.save(update_fields=['status', 'retry_at', 'modified_at']) + crawl.save(update_fields=["status", "retry_at", "modified_at"]) run_crawl(str(crawl.id), snapshot_ids=[str(snapshot.id)], selected_plugins=[archiveresult.plugin]) archiveresult.refresh_from_db() if archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED: - print(f'[green]Extraction succeeded: {archiveresult.output_str}[/green]') + print(f"[green]Extraction succeeded: {archiveresult.output_str}[/green]") return 0 elif archiveresult.status == ArchiveResult.StatusChoices.NORESULTS: - print(f'[dim]Extraction completed with no results: {archiveresult.output_str}[/dim]') + print(f"[dim]Extraction completed with no results: {archiveresult.output_str}[/dim]") return 0 elif archiveresult.status == ArchiveResult.StatusChoices.FAILED: - print(f'[red]Extraction failed: {archiveresult.output_str}[/red]', file=sys.stderr) + print(f"[red]Extraction failed: {archiveresult.output_str}[/red]", file=sys.stderr) return 1 else: # Still in progress or backoff - not a failure - print(f'[yellow]Extraction status: {archiveresult.status}[/yellow]') + print(f"[yellow]Extraction status: {archiveresult.status}[/yellow]") return 0 except Exception as e: - print(f'[red]Extraction error: {type(e).__name__}: {e}[/red]', file=sys.stderr) + print(f"[red]Extraction error: {type(e).__name__}: {e}[/red]", file=sys.stderr) return 1 def run_plugins( args: tuple, records: list[dict] | None = None, - plugins: str = '', + plugins: str = "", wait: bool = True, + emit_results: bool = True, ) -> int: """ Run plugins on Snapshots from input. @@ -111,16 +112,18 @@ def run_plugins( from django.utils import timezone from archivebox.misc.jsonl import ( - read_args_or_stdin, write_record, - TYPE_SNAPSHOT, TYPE_ARCHIVERESULT + read_args_or_stdin, + write_record, + TYPE_SNAPSHOT, + TYPE_ARCHIVERESULT, ) - from archivebox.core.models import Snapshot, ArchiveResult + from archivebox.core.models import Snapshot from archivebox.services.runner import run_crawl is_tty = sys.stdout.isatty() # Parse comma-separated plugins list once (reused in creation and filtering) - plugins_list = [p.strip() for p in plugins.split(',') if p.strip()] if plugins else [] + plugins_list = [p.strip() for p in plugins.split(",") if p.strip()] if plugins else [] # Parse stdin/args exactly once per CLI invocation. # `main()` may already have consumed stdin to distinguish Snapshot input from @@ -130,41 +133,41 @@ def run_plugins( records = list(read_args_or_stdin(args)) if not records: - rprint('[yellow]No snapshots provided. Pass snapshot IDs as arguments or via stdin.[/yellow]', file=sys.stderr) + rprint("[yellow]No snapshots provided. Pass snapshot IDs as arguments or via stdin.[/yellow]", file=sys.stderr) return 1 # Gather snapshot IDs and optional plugin constraints to process snapshot_ids = set() requested_plugins_by_snapshot: dict[str, set[str]] = defaultdict(set) for record in records: - record_type = record.get('type') + record_type = record.get("type") if record_type == TYPE_SNAPSHOT: - snapshot_id = record.get('id') + snapshot_id = record.get("id") if snapshot_id: snapshot_ids.add(snapshot_id) - elif record.get('url'): + elif record.get("url"): # Look up by URL (get most recent if multiple exist) - snap = Snapshot.objects.filter(url=record['url']).order_by('-created_at').first() + snap = Snapshot.objects.filter(url=record["url"]).order_by("-created_at").first() if snap: snapshot_ids.add(str(snap.id)) else: - rprint(f'[yellow]Snapshot not found for URL: {record["url"]}[/yellow]', file=sys.stderr) + rprint(f"[yellow]Snapshot not found for URL: {record['url']}[/yellow]", file=sys.stderr) elif record_type == TYPE_ARCHIVERESULT: - snapshot_id = record.get('snapshot_id') + snapshot_id = record.get("snapshot_id") if snapshot_id: snapshot_ids.add(snapshot_id) - plugin_name = record.get('plugin') + plugin_name = record.get("plugin") if plugin_name and not plugins_list: requested_plugins_by_snapshot[str(snapshot_id)].add(str(plugin_name)) - elif 'id' in record: + elif "id" in record: # Assume it's a snapshot ID - snapshot_ids.add(record['id']) + snapshot_ids.add(record["id"]) if not snapshot_ids: - rprint('[red]No valid snapshot IDs found in input[/red]', file=sys.stderr) + rprint("[red]No valid snapshot IDs found in input[/red]", file=sys.stderr) return 1 # Get snapshots and ensure they have pending ArchiveResults @@ -173,17 +176,13 @@ def run_plugins( try: snapshot = Snapshot.objects.get(id=snapshot_id) except Snapshot.DoesNotExist: - rprint(f'[yellow]Snapshot {snapshot_id} not found[/yellow]', file=sys.stderr) + rprint(f"[yellow]Snapshot {snapshot_id} not found[/yellow]", file=sys.stderr) continue - for plugin_name in requested_plugins_by_snapshot.get(str(snapshot.id), set()): - existing_result = snapshot.archiveresult_set.filter(plugin=plugin_name).order_by('-created_at').first() - if existing_result and existing_result.status in [ - ArchiveResult.StatusChoices.FAILED, - ArchiveResult.StatusChoices.SKIPPED, - ArchiveResult.StatusChoices.NORESULTS, - ArchiveResult.StatusChoices.BACKOFF, - ]: + requested_plugin_names = set(plugins_list) | requested_plugins_by_snapshot.get(str(snapshot.id), set()) + for plugin_name in requested_plugin_names: + existing_result = snapshot.archiveresult_set.filter(plugin=plugin_name).order_by("-created_at").first() + if existing_result: existing_result.reset_for_retry() # Reset snapshot status to allow processing @@ -195,34 +194,39 @@ def run_plugins( processed_count += 1 if processed_count == 0: - rprint('[red]No snapshots to process[/red]', file=sys.stderr) + rprint("[red]No snapshots to process[/red]", file=sys.stderr) return 1 - rprint(f'[blue]Queued {processed_count} snapshots for extraction[/blue]', file=sys.stderr) + rprint(f"[blue]Queued {processed_count} snapshots for extraction[/blue]", file=sys.stderr) # Run orchestrator if --wait (default) if wait: - rprint('[blue]Running plugins...[/blue]', file=sys.stderr) + rprint("[blue]Running plugins...[/blue]", file=sys.stderr) snapshot_ids_by_crawl: dict[str, set[str]] = defaultdict(set) for snapshot_id in snapshot_ids: try: - snapshot = Snapshot.objects.only('id', 'crawl_id').get(id=snapshot_id) + snapshot = Snapshot.objects.only("id", "crawl_id").get(id=snapshot_id) except Snapshot.DoesNotExist: continue snapshot_ids_by_crawl[str(snapshot.crawl_id)].add(str(snapshot.id)) for crawl_id, crawl_snapshot_ids in snapshot_ids_by_crawl.items(): - selected_plugins = plugins_list or sorted({ - plugin - for snapshot_id in crawl_snapshot_ids - for plugin in requested_plugins_by_snapshot.get(str(snapshot_id), set()) - }) or None + selected_plugins = ( + plugins_list + or sorted( + {plugin for snapshot_id in crawl_snapshot_ids for plugin in requested_plugins_by_snapshot.get(str(snapshot_id), set())}, + ) + or None + ) run_crawl( crawl_id, snapshot_ids=sorted(crawl_snapshot_ids), selected_plugins=selected_plugins, ) + if not emit_results: + return 0 + # Output results as JSONL (when piped) or human-readable (when TTY) for snapshot_id in snapshot_ids: try: @@ -234,11 +238,14 @@ def run_plugins( for result in results: if is_tty: status_color = { - 'succeeded': 'green', - 'failed': 'red', - 'skipped': 'yellow', - }.get(result.status, 'dim') - rprint(f' [{status_color}]{result.status}[/{status_color}] {result.plugin} → {result.output_str or ""}', file=sys.stderr) + "succeeded": "green", + "failed": "red", + "skipped": "yellow", + }.get(result.status, "dim") + rprint( + f" [{status_color}]{result.status}[/{status_color}] {result.plugin} → {result.output_str or ''}", + file=sys.stderr, + ) else: write_record(result.to_json()) except Snapshot.DoesNotExist: @@ -250,18 +257,20 @@ def run_plugins( def is_archiveresult_id(value: str) -> bool: """Check if value looks like an ArchiveResult UUID.""" import re - uuid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.I) + + uuid_pattern = re.compile(r"^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$", re.I) if not uuid_pattern.match(value): return False # Verify it's actually an ArchiveResult (not a Snapshot or other object) from archivebox.core.models import ArchiveResult + return ArchiveResult.objects.filter(id=value).exists() @click.command() -@click.option('--plugins', '--plugin', '-p', default='', help='Comma-separated list of plugins to run (e.g., screenshot,singlefile)') -@click.option('--wait/--no-wait', default=True, help='Wait for plugins to complete (default: wait)') -@click.argument('args', nargs=-1) +@click.option("--plugins", "--plugin", "-p", default="", help="Comma-separated list of plugins to run (e.g., screenshot,singlefile)") +@click.option("--wait/--no-wait", default=True, help="Wait for plugins to complete (default: wait)") +@click.argument("args", nargs=-1) def main(plugins: str, wait: bool, args: tuple): """Run plugins on Snapshots, or process existing ArchiveResults by ID""" from archivebox.misc.jsonl import read_args_or_stdin @@ -271,14 +280,12 @@ def main(plugins: str, wait: bool, args: tuple): if not records: from rich import print as rprint - rprint('[yellow]No Snapshot IDs or ArchiveResult IDs provided. Pass as arguments or via stdin.[/yellow]', file=sys.stderr) + + rprint("[yellow]No Snapshot IDs or ArchiveResult IDs provided. Pass as arguments or via stdin.[/yellow]", file=sys.stderr) sys.exit(1) # Check if input looks like existing ArchiveResult IDs to process - all_are_archiveresult_ids = all( - is_archiveresult_id(r.get('id') or r.get('url', '')) - for r in records - ) + all_are_archiveresult_ids = all(is_archiveresult_id(r.get("id") or r.get("url", "")) for r in records) if all_are_archiveresult_ids: # Process existing ArchiveResults by ID @@ -286,9 +293,9 @@ def main(plugins: str, wait: bool, args: tuple): exit_code = 0 for record in records: - archiveresult_id = record.get('id') or record.get('url') + archiveresult_id = record.get("id") or record.get("url") if not isinstance(archiveresult_id, str): - rprint(f'[red]Invalid ArchiveResult input: {record}[/red]', file=sys.stderr) + rprint(f"[red]Invalid ArchiveResult input: {record}[/red]", file=sys.stderr) exit_code = 1 continue result = process_archiveresult_by_id(archiveresult_id) @@ -300,5 +307,5 @@ def main(plugins: str, wait: bool, args: tuple): sys.exit(run_plugins(args, records=records, plugins=plugins, wait=wait)) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/archivebox/cli/archivebox_help.py b/archivebox/cli/archivebox_help.py index 4b6d68a2..86d0be86 100755 --- a/archivebox/cli/archivebox_help.py +++ b/archivebox/cli/archivebox_help.py @@ -1,8 +1,8 @@ #!/usr/bin/env python3 -__package__ = 'archivebox.cli' -__command__ = 'archivebox help' +__package__ = "archivebox.cli" +__command__ = "archivebox help" -import os +import os from pathlib import Path import click @@ -17,33 +17,44 @@ def help() -> None: from archivebox.config import CONSTANTS from archivebox.config.permissions import IN_DOCKER from archivebox.misc.logging_util import log_cli_command - - log_cli_command('help', [], None, '.') - - COMMANDS_HELP_TEXT = '\n '.join( - f'[green]{cmd.ljust(20)}[/green] {ArchiveBoxGroup._lazy_load(cmd).__doc__}' - for cmd in ArchiveBoxGroup.meta_commands.keys() - ) + '\n\n ' + '\n '.join( - f'[green]{cmd.ljust(20)}[/green] {ArchiveBoxGroup._lazy_load(cmd).__doc__}' - for cmd in ArchiveBoxGroup.setup_commands.keys() - ) + '\n\n ' + '\n '.join( - f'[green]{cmd.ljust(20)}[/green] {ArchiveBoxGroup._lazy_load(cmd).__doc__}' - for cmd in ArchiveBoxGroup.archive_commands.keys() + + log_cli_command("help", [], None, ".") + + COMMANDS_HELP_TEXT = ( + "\n ".join( + f"[green]{cmd.ljust(20)}[/green] {ArchiveBoxGroup._lazy_load(cmd).__doc__}" for cmd in ArchiveBoxGroup.meta_commands.keys() + ) + + "\n\n " + + "\n ".join( + f"[green]{cmd.ljust(20)}[/green] {ArchiveBoxGroup._lazy_load(cmd).__doc__}" for cmd in ArchiveBoxGroup.setup_commands.keys() + ) + + "\n\n " + + "\n ".join( + f"[green]{cmd.ljust(20)}[/green] {ArchiveBoxGroup._lazy_load(cmd).__doc__}" for cmd in ArchiveBoxGroup.archive_commands.keys() + ) ) - - DOCKER_USAGE = ''' + + DOCKER_USAGE = ( + """ [dodger_blue3]Docker Usage:[/dodger_blue3] [grey53]# using Docker Compose:[/grey53] [blue]docker compose run[/blue] [dark_green]archivebox[/dark_green] [green]\\[command][/green] [green3][...args][/green3] [violet][--help][/violet] [grey53][--version][/grey53] [grey53]# using Docker:[/grey53] [blue]docker run[/blue] -v [light_slate_blue]$PWD:/data[/light_slate_blue] [grey53]-p 8000:8000[/grey53] -it [dark_green]archivebox/archivebox[/dark_green] [green]\\[command][/green] [green3][...args][/green3] [violet][--help][/violet] [grey53][--version][/grey53] -''' if IN_DOCKER else '' - DOCKER_DOCS = '\n [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#usage]https://github.com/ArchiveBox/ArchiveBox/wiki/Docker[/link]' if IN_DOCKER else '' - DOCKER_OUTSIDE_HINT = "\n [grey53]# outside of Docker:[/grey53]" if IN_DOCKER else '' - DOCKER_CMD_PREFIX = "[blue]docker ... [/blue]" if IN_DOCKER else '' +""" + if IN_DOCKER + else "" + ) + DOCKER_DOCS = ( + "\n [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#usage]https://github.com/ArchiveBox/ArchiveBox/wiki/Docker[/link]" + if IN_DOCKER + else "" + ) + DOCKER_OUTSIDE_HINT = "\n [grey53]# outside of Docker:[/grey53]" if IN_DOCKER else "" + DOCKER_CMD_PREFIX = "[blue]docker ... [/blue]" if IN_DOCKER else "" - print(f'''{DOCKER_USAGE} + print(f"""{DOCKER_USAGE} [deep_sky_blue4]Usage:[/deep_sky_blue4]{DOCKER_OUTSIDE_HINT} [dark_green]archivebox[/dark_green] [green]\\[command][/green] [green3][...args][/green3] [violet][--help][/violet] [grey53][--version][/grey53] @@ -54,12 +65,11 @@ def help() -> None: [link=https://github.com/ArchiveBox/ArchiveBox/wiki]https://github.com/ArchiveBox/ArchiveBox/wiki[/link]{DOCKER_DOCS} [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#cli-usage]https://github.com/ArchiveBox/ArchiveBox/wiki/Usage[/link] [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration]https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration[/link] -''') - - +""") + if os.access(CONSTANTS.ARCHIVE_DIR, os.R_OK) and CONSTANTS.ARCHIVE_DIR.is_dir(): - pretty_out_dir = str(CONSTANTS.DATA_DIR).replace(str(Path('~').expanduser()), '~') - EXAMPLE_USAGE = f''' + pretty_out_dir = str(CONSTANTS.DATA_DIR).replace(str(Path("~").expanduser()), "~") + EXAMPLE_USAGE = f""" [light_slate_blue]DATA DIR[/light_slate_blue]: [yellow]{pretty_out_dir}[/yellow] [violet]Hint:[/violet] [i]Common maintenance tasks:[/i] @@ -73,33 +83,49 @@ def help() -> None: [dark_green]archivebox[/dark_green] [green]list[/green] --sort=timestamp --csv=timestamp,downloaded_at,url,title [dark_green]archivebox[/dark_green] [green]schedule[/green] --every=day --depth=1 "https://example.com/some/feed.rss" [dark_green]archivebox[/dark_green] [green]server[/green] [blue]0.0.0.0:8000[/blue] [grey53]# Start the Web UI / API server[/grey53] -''' - print(Panel(EXAMPLE_USAGE, expand=False, border_style='grey53', title='[green3]:white_check_mark: A collection [light_slate_blue]DATA DIR[/light_slate_blue] is currently active[/green3]', subtitle='Commands run inside this dir will only apply to this collection.')) +""" + print( + Panel( + EXAMPLE_USAGE, + expand=False, + border_style="grey53", + title="[green3]:white_check_mark: A collection [light_slate_blue]DATA DIR[/light_slate_blue] is currently active[/green3]", + subtitle="Commands run inside this dir will only apply to this collection.", + ), + ) else: - DATA_SETUP_HELP = '\n' + DATA_SETUP_HELP = "\n" if IN_DOCKER: - DATA_SETUP_HELP += '[violet]Hint:[/violet] When using Docker, you need to mount a volume to use as your data dir:\n' - DATA_SETUP_HELP += ' docker run [violet]-v /some/path/data:/data[/violet] archivebox/archivebox ...\n\n' - DATA_SETUP_HELP += 'To load an [dark_blue]existing[/dark_blue] collection:\n' - DATA_SETUP_HELP += ' 1. [green]cd[/green] ~/archivebox/data [grey53]# go into existing [light_slate_blue]DATA DIR[/light_slate_blue] (can be anywhere)[/grey53]\n' - DATA_SETUP_HELP += f' 2. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]init[/green] [grey53]# migrate to latest version (safe to run multiple times)[/grey53]\n' - DATA_SETUP_HELP += f' 3. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]install[/green] [grey53]# auto-update all plugins (wget, chrome, singlefile, etc.)[/grey53]\n' - DATA_SETUP_HELP += f' 4. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]help[/green] [grey53]# ...get help with next steps... [/grey53]\n\n' - DATA_SETUP_HELP += 'To start a [sea_green1]new[/sea_green1] collection:\n' - DATA_SETUP_HELP += ' 1. [green]mkdir[/green] ~/archivebox/data [grey53]# create a new, empty [light_slate_blue]DATA DIR[/light_slate_blue] (can be anywhere)[/grey53]\n' - DATA_SETUP_HELP += ' 2. [green]cd[/green] ~/archivebox/data [grey53]# cd into the new directory[/grey53]\n' - DATA_SETUP_HELP += f' 3. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]init[/green] [grey53]# initialize ArchiveBox in the new data dir[/grey53]\n' - DATA_SETUP_HELP += f' 4. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]install[/green] [grey53]# auto-install all plugins (wget, chrome, singlefile, etc.)[/grey53]\n' - DATA_SETUP_HELP += f' 5. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]help[/green] [grey53]# ... get help with next steps... [/grey53]\n' - print(Panel(DATA_SETUP_HELP, expand=False, border_style='grey53', title='[red]:cross_mark: No collection is currently active[/red]', subtitle='All archivebox [green]commands[/green] should be run from inside a collection [light_slate_blue]DATA DIR[/light_slate_blue]')) - + DATA_SETUP_HELP += "[violet]Hint:[/violet] When using Docker, you need to mount a volume to use as your data dir:\n" + DATA_SETUP_HELP += " docker run [violet]-v /some/path/data:/data[/violet] archivebox/archivebox ...\n\n" + DATA_SETUP_HELP += "To load an [dark_blue]existing[/dark_blue] collection:\n" + DATA_SETUP_HELP += " 1. [green]cd[/green] ~/archivebox/data [grey53]# go into existing [light_slate_blue]DATA DIR[/light_slate_blue] (can be anywhere)[/grey53]\n" + DATA_SETUP_HELP += f" 2. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]init[/green] [grey53]# migrate to latest version (safe to run multiple times)[/grey53]\n" + DATA_SETUP_HELP += f" 3. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]install[/green] [grey53]# auto-update all plugins (wget, chrome, singlefile, etc.)[/grey53]\n" + DATA_SETUP_HELP += f" 4. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]help[/green] [grey53]# ...get help with next steps... [/grey53]\n\n" + DATA_SETUP_HELP += "To start a [sea_green1]new[/sea_green1] collection:\n" + DATA_SETUP_HELP += " 1. [green]mkdir[/green] ~/archivebox/data [grey53]# create a new, empty [light_slate_blue]DATA DIR[/light_slate_blue] (can be anywhere)[/grey53]\n" + DATA_SETUP_HELP += " 2. [green]cd[/green] ~/archivebox/data [grey53]# cd into the new directory[/grey53]\n" + DATA_SETUP_HELP += f" 3. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]init[/green] [grey53]# initialize ArchiveBox in the new data dir[/grey53]\n" + DATA_SETUP_HELP += f" 4. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]install[/green] [grey53]# auto-install all plugins (wget, chrome, singlefile, etc.)[/grey53]\n" + DATA_SETUP_HELP += f" 5. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]help[/green] [grey53]# ... get help with next steps... [/grey53]\n" + print( + Panel( + DATA_SETUP_HELP, + expand=False, + border_style="grey53", + title="[red]:cross_mark: No collection is currently active[/red]", + subtitle="All archivebox [green]commands[/green] should be run from inside a collection [light_slate_blue]DATA DIR[/light_slate_blue]", + ), + ) @click.command() -@click.option('--help', '-h', is_flag=True, help='Show help') +@click.option("--help", "-h", is_flag=True, help="Show help") def main(**kwargs): """Print the ArchiveBox help message and usage""" return help() -if __name__ == '__main__': + +if __name__ == "__main__": main() diff --git a/archivebox/cli/archivebox_init.py b/archivebox/cli/archivebox_init.py index 90a50fa5..2376e1f3 100755 --- a/archivebox/cli/archivebox_init.py +++ b/archivebox/cli/archivebox_init.py @@ -1,11 +1,11 @@ #!/usr/bin/env python3 -__package__ = 'archivebox.cli' +__package__ = "archivebox.cli" import os import sys from pathlib import Path -from typing import Mapping +from collections.abc import Mapping from rich import print import rich_click as click @@ -14,12 +14,12 @@ from archivebox.misc.util import docstring, enforce_types def _normalize_snapshot_record(link_dict: Mapping[str, object]) -> tuple[str, dict[str, object]] | None: - url = link_dict.get('url') + url = link_dict.get("url") if not isinstance(url, str) or not url: return None - record: dict[str, object] = {'url': url} - for key in ('timestamp', 'title', 'tags', 'sources'): + record: dict[str, object] = {"url": url} + for key in ("timestamp", "title", "tags", "sources"): value = link_dict.get(key) if value is not None: record[key] = value @@ -27,15 +27,15 @@ def _normalize_snapshot_record(link_dict: Mapping[str, object]) -> tuple[str, di @enforce_types -def init(force: bool=False, quick: bool=False, install: bool=False) -> None: +def init(force: bool = False, quick: bool = False, install: bool = False) -> None: """Initialize a new ArchiveBox collection in the current directory""" - + from archivebox.config import CONSTANTS, VERSION, DATA_DIR from archivebox.config.common import SERVER_CONFIG from archivebox.config.collection import write_config_file from archivebox.misc.legacy import parse_json_main_index, parse_json_links_details from archivebox.misc.db import apply_migrations - + # if os.access(out_dir / CONSTANTS.JSON_INDEX_FILENAME, os.F_OK): # print("[red]:warning: This folder contains a JSON index. It is deprecated, and will no longer be kept up to date automatically.[/red]", file=sys.stderr) # print("[red] You can run `archivebox list --json --with-headers > static_index.json` to manually generate it.[/red]", file=sys.stderr) @@ -43,69 +43,71 @@ def init(force: bool=False, quick: bool=False, install: bool=False) -> None: is_empty = not len(set(os.listdir(DATA_DIR)) - CONSTANTS.ALLOWED_IN_DATA_DIR) existing_index = os.path.isfile(CONSTANTS.DATABASE_FILE) if is_empty and not existing_index: - print(f'[turquoise4][+] Initializing a new ArchiveBox v{VERSION} collection...[/turquoise4]') - print('[green]----------------------------------------------------------------------[/green]') + print(f"[turquoise4][+] Initializing a new ArchiveBox v{VERSION} collection...[/turquoise4]") + print("[green]----------------------------------------------------------------------[/green]") elif existing_index: # TODO: properly detect and print the existing version in current index as well - print(f'[green][*] Verifying and updating existing ArchiveBox collection to v{VERSION}...[/green]') - print('[green]----------------------------------------------------------------------[/green]') + print(f"[green][*] Verifying and updating existing ArchiveBox collection to v{VERSION}...[/green]") + print("[green]----------------------------------------------------------------------[/green]") else: if force: - print('[red][!] This folder appears to already have files in it, but no index.sqlite3 is present.[/red]') - print('[red] Because --force was passed, ArchiveBox will initialize anyway (which may overwrite existing files).[/red]') + print("[red][!] This folder appears to already have files in it, but no index.sqlite3 is present.[/red]") + print("[red] Because --force was passed, ArchiveBox will initialize anyway (which may overwrite existing files).[/red]") else: print( - ("[red][X] This folder appears to already have files in it, but no index.sqlite3 present.[/red]\n\n" + "[red][X] This folder appears to already have files in it, but no index.sqlite3 present.[/red]\n\n" " You must run init in a completely empty directory, or an existing data folder.\n\n" " [violet]Hint:[/violet] To import an existing data folder make sure to cd into the folder first, \n" " then run and run 'archivebox init' to pick up where you left off.\n\n" - " (Always make sure your data folder is backed up first before updating ArchiveBox)" - ) + " (Always make sure your data folder is backed up first before updating ArchiveBox)", ) raise SystemExit(2) if existing_index: - print('\n[green][*] Verifying archive folder structure...[/green]') + print("\n[green][*] Verifying archive folder structure...[/green]") else: - print('\n[green][+] Building archive folder structure...[/green]') - - print(f' + ./{CONSTANTS.ARCHIVE_DIR.relative_to(DATA_DIR)}, ./{CONSTANTS.SOURCES_DIR.relative_to(DATA_DIR)}, ./{CONSTANTS.LOGS_DIR.relative_to(DATA_DIR)}...') + print("\n[green][+] Building archive folder structure...[/green]") + + print( + f" + ./{CONSTANTS.ARCHIVE_DIR.relative_to(DATA_DIR)}, ./{CONSTANTS.SOURCES_DIR.relative_to(DATA_DIR)}, ./{CONSTANTS.LOGS_DIR.relative_to(DATA_DIR)}...", + ) Path(CONSTANTS.SOURCES_DIR).mkdir(exist_ok=True) Path(CONSTANTS.ARCHIVE_DIR).mkdir(exist_ok=True) Path(CONSTANTS.LOGS_DIR).mkdir(exist_ok=True) - - print(f' + ./{CONSTANTS.CONFIG_FILE.relative_to(DATA_DIR)}...') - + + print(f" + ./{CONSTANTS.CONFIG_FILE.relative_to(DATA_DIR)}...") + # create the .archivebox_id file with a unique ID for this collection from archivebox.config.paths import _get_collection_id - _get_collection_id(DATA_DIR, force_create=True) - - # create the ArchiveBox.conf file - write_config_file({'SECRET_KEY': SERVER_CONFIG.SECRET_KEY}) + _get_collection_id(DATA_DIR, force_create=True) + + # create the ArchiveBox.conf file + write_config_file({"SECRET_KEY": SERVER_CONFIG.SECRET_KEY}) if os.access(CONSTANTS.DATABASE_FILE, os.F_OK): - print('\n[green][*] Verifying main SQL index and running any migrations needed...[/green]') + print("\n[green][*] Verifying main SQL index and running any migrations needed...[/green]") else: - print('\n[green][+] Building main SQL index and running initial migrations...[/green]') - + print("\n[green][+] Building main SQL index and running initial migrations...[/green]") + from archivebox.config.django import setup_django + setup_django() - + for migration_line in apply_migrations(DATA_DIR): - sys.stdout.write(f' {migration_line}\n') + sys.stdout.write(f" {migration_line}\n") assert os.path.isfile(CONSTANTS.DATABASE_FILE) and os.access(CONSTANTS.DATABASE_FILE, os.R_OK) print() - print(f' √ ./{CONSTANTS.DATABASE_FILE.relative_to(DATA_DIR)}') - + print(f" √ ./{CONSTANTS.DATABASE_FILE.relative_to(DATA_DIR)}") + # from django.contrib.auth.models import User # if SHELL_CONFIG.IS_TTY and not User.objects.filter(is_superuser=True).exclude(username='system').exists(): # print('{green}[+] Creating admin user account...{reset}'.format(**SHELL_CONFIG.ANSI)) # call_command("createsuperuser", interactive=True) print() - print('[dodger_blue3][*] Checking links from indexes and archive folders (safe to Ctrl+C)...[/dodger_blue3]') + print("[dodger_blue3][*] Checking links from indexes and archive folders (safe to Ctrl+C)...[/dodger_blue3]") from archivebox.core.models import Snapshot @@ -114,10 +116,10 @@ def init(force: bool=False, quick: bool=False, install: bool=False) -> None: if existing_index: all_links = Snapshot.objects.all() - print(f' √ Loaded {all_links.count()} links from existing main index.') + print(f" √ Loaded {all_links.count()} links from existing main index.") if quick: - print(' > Skipping orphan snapshot import (quick mode)') + print(" > Skipping orphan snapshot import (quick mode)") else: try: # Import orphaned links from legacy JSON indexes @@ -131,7 +133,7 @@ def init(force: bool=False, quick: bool=False, install: bool=False) -> None: orphaned_json_links[url] = record if orphaned_json_links: pending_links.update(orphaned_json_links) - print(f' [yellow]√ Added {len(orphaned_json_links)} orphaned links from existing JSON index...[/yellow]') + print(f" [yellow]√ Added {len(orphaned_json_links)} orphaned links from existing JSON index...[/yellow]") orphaned_data_dir_links: dict[str, dict[str, object]] = {} for link_dict in parse_json_links_details(DATA_DIR): @@ -143,7 +145,7 @@ def init(force: bool=False, quick: bool=False, install: bool=False) -> None: orphaned_data_dir_links[url] = record if orphaned_data_dir_links: pending_links.update(orphaned_data_dir_links) - print(f' [yellow]√ Added {len(orphaned_data_dir_links)} orphaned links from existing archive directories.[/yellow]') + print(f" [yellow]√ Added {len(orphaned_data_dir_links)} orphaned links from existing archive directories.[/yellow]") if pending_links: for link_dict in pending_links.values(): @@ -151,42 +153,44 @@ def init(force: bool=False, quick: bool=False, install: bool=False) -> None: # Hint for orphaned snapshot directories print() - print(' [violet]Hint:[/violet] To import orphaned snapshot directories and reconcile filesystem state, run:') - print(' archivebox update') + print(" [violet]Hint:[/violet] To import orphaned snapshot directories and reconcile filesystem state, run:") + print(" archivebox update") except (KeyboardInterrupt, SystemExit): print(file=sys.stderr) - print('[yellow]:stop_sign: Stopped checking archive directories due to Ctrl-C/SIGTERM[/yellow]', file=sys.stderr) - print(' Your archive data is safe, but you should re-run `archivebox init` to finish the process later.', file=sys.stderr) + print("[yellow]:stop_sign: Stopped checking archive directories due to Ctrl-C/SIGTERM[/yellow]", file=sys.stderr) + print(" Your archive data is safe, but you should re-run `archivebox init` to finish the process later.", file=sys.stderr) print(file=sys.stderr) - print(' [violet]Hint:[/violet] In the future you can run a quick init without checking dirs like so:', file=sys.stderr) - print(' archivebox init --quick', file=sys.stderr) + print(" [violet]Hint:[/violet] In the future you can run a quick init without checking dirs like so:", file=sys.stderr) + print(" archivebox init --quick", file=sys.stderr) raise SystemExit(1) - print('\n[green]----------------------------------------------------------------------[/green]') + print("\n[green]----------------------------------------------------------------------[/green]") from django.contrib.auth.models import User - if (SERVER_CONFIG.ADMIN_USERNAME and SERVER_CONFIG.ADMIN_PASSWORD) and not User.objects.filter(username=SERVER_CONFIG.ADMIN_USERNAME).exists(): - print('[green][+] Found ADMIN_USERNAME and ADMIN_PASSWORD configuration options, creating new admin user.[/green]') + if (SERVER_CONFIG.ADMIN_USERNAME and SERVER_CONFIG.ADMIN_PASSWORD) and not User.objects.filter( + username=SERVER_CONFIG.ADMIN_USERNAME, + ).exists(): + print("[green][+] Found ADMIN_USERNAME and ADMIN_PASSWORD configuration options, creating new admin user.[/green]") User.objects.create_superuser(username=SERVER_CONFIG.ADMIN_USERNAME, password=SERVER_CONFIG.ADMIN_PASSWORD) if existing_index: - print('[green][√] Done. Verified and updated the existing ArchiveBox collection.[/green]') + print("[green][√] Done. Verified and updated the existing ArchiveBox collection.[/green]") else: - print(f'[green][√] Done. A new ArchiveBox collection was initialized ({len(all_links) + len(pending_links)} links).[/green]') + print(f"[green][√] Done. A new ArchiveBox collection was initialized ({len(all_links) + len(pending_links)} links).[/green]") - CONSTANTS.PERSONAS_DIR.mkdir(parents=True, exist_ok=True) CONSTANTS.DEFAULT_TMP_DIR.mkdir(parents=True, exist_ok=True) CONSTANTS.DEFAULT_LIB_DIR.mkdir(parents=True, exist_ok=True) - (CONSTANTS.DEFAULT_LIB_DIR / 'bin').mkdir(parents=True, exist_ok=True) + (CONSTANTS.DEFAULT_LIB_DIR / "bin").mkdir(parents=True, exist_ok=True) from archivebox.config.common import STORAGE_CONFIG from archivebox.config.paths import get_or_create_working_tmp_dir, get_or_create_working_lib_dir + STORAGE_CONFIG.TMP_DIR.mkdir(parents=True, exist_ok=True) STORAGE_CONFIG.LIB_DIR.mkdir(parents=True, exist_ok=True) - (STORAGE_CONFIG.LIB_DIR / 'bin').mkdir(parents=True, exist_ok=True) + (STORAGE_CONFIG.LIB_DIR / "bin").mkdir(parents=True, exist_ok=True) working_tmp_dir = get_or_create_working_tmp_dir(autofix=True, quiet=True) if working_tmp_dir: @@ -195,33 +199,35 @@ def init(force: bool=False, quick: bool=False, install: bool=False) -> None: working_lib_dir = get_or_create_working_lib_dir(autofix=True, quiet=True) if working_lib_dir: working_lib_dir.mkdir(parents=True, exist_ok=True) - (working_lib_dir / 'bin').mkdir(parents=True, exist_ok=True) - + (working_lib_dir / "bin").mkdir(parents=True, exist_ok=True) + if install: from archivebox.cli.archivebox_install import install as install_method + install_method() - if Snapshot.objects.count() < 25: # hide the hints for experienced users + if Snapshot.objects.count() < 25: # hide the hints for experienced users print() - print(' [violet]Hint:[/violet] To view your archive index, run:') - print(' archivebox server # then visit [deep_sky_blue4][link=http://127.0.0.1:8000]http://127.0.0.1:8000[/link][/deep_sky_blue4]') + print(" [violet]Hint:[/violet] To view your archive index, run:") + print( + " archivebox server # then visit [deep_sky_blue4][link=http://127.0.0.1:8000]http://127.0.0.1:8000[/link][/deep_sky_blue4]", + ) print() - print(' To add new links, you can run:') + print(" To add new links, you can run:") print(" archivebox add < ~/some/path/to/list_of_links.txt") print() - print(' For more usage and examples, run:') - print(' archivebox help') - + print(" For more usage and examples, run:") + print(" archivebox help") @click.command() -@click.option('--force', '-f', is_flag=True, help='Ignore unrecognized files in current directory and initialize anyway') -@click.option('--quick', '-q', is_flag=True, help='Run any updates or migrations without rechecking all snapshot dirs') -@click.option('--install', '-s', is_flag=True, help='Automatically install dependencies and extras used for archiving') +@click.option("--force", "-f", is_flag=True, help="Ignore unrecognized files in current directory and initialize anyway") +@click.option("--quick", "-q", is_flag=True, help="Run any updates or migrations without rechecking all snapshot dirs") +@click.option("--install", "-s", is_flag=True, help="Automatically install dependencies and extras used for archiving") @docstring(init.__doc__) def main(**kwargs) -> None: init(**kwargs) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/archivebox/cli/archivebox_install.py b/archivebox/cli/archivebox_install.py index b51be910..a8f956cb 100755 --- a/archivebox/cli/archivebox_install.py +++ b/archivebox/cli/archivebox_install.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 -__package__ = 'archivebox.cli' +__package__ = "archivebox.cli" import os @@ -11,7 +11,7 @@ from archivebox.misc.util import docstring, enforce_types @enforce_types -def install(binaries: tuple[str, ...] = (), binproviders: str = '*', dry_run: bool = False) -> None: +def install(binaries: tuple[str, ...] = (), binproviders: str = "*", dry_run: bool = False) -> None: """Detect and install ArchiveBox dependencies by running the abx-dl install flow Examples: @@ -31,33 +31,34 @@ def install(binaries: tuple[str, ...] = (), binproviders: str = '*', dry_run: bo # Show what we're installing if binaries: - print(f'\n[green][+] Installing specific binaries: {", ".join(binaries)}[/green]') + print(f"\n[green][+] Installing specific binaries: {', '.join(binaries)}[/green]") else: - print('\n[green][+] Detecting and installing all ArchiveBox dependencies...[/green]') + print("\n[green][+] Detecting and installing all ArchiveBox dependencies...[/green]") - if binproviders != '*': - print(f'[green][+] Using providers: {binproviders}[/green]') + if binproviders != "*": + print(f"[green][+] Using providers: {binproviders}[/green]") if IS_ROOT: EUID = os.geteuid() print() - print(f'[yellow]:warning: Running as UID=[blue]{EUID}[/blue].[/yellow]') - print(f' DATA_DIR will be owned by [blue]{ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/blue].') + print(f"[yellow]:warning: Running as UID=[blue]{EUID}[/blue].[/yellow]") + print(f" DATA_DIR will be owned by [blue]{ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/blue].") print() if dry_run: - print('[dim]Dry run - would run the abx-dl install flow[/dim]') + print("[dim]Dry run - would run the abx-dl install flow[/dim]") return # Set up Django from archivebox.config.django import setup_django + setup_django() plugin_names = list(binaries) - if binproviders != '*': - plugin_names.extend(provider.strip() for provider in binproviders.split(',') if provider.strip()) + if binproviders != "*": + plugin_names.extend(provider.strip() for provider in binproviders.split(",") if provider.strip()) - print('[+] Running installer via abx-dl bus...') + print("[+] Running installer via abx-dl bus...") print() from archivebox.services.runner import run_install @@ -68,28 +69,36 @@ def install(binaries: tuple[str, ...] = (), binproviders: str = '*', dry_run: bo # Check for superuser from django.contrib.auth import get_user_model + User = get_user_model() - if not User.objects.filter(is_superuser=True).exclude(username='system').exists(): - stderr('\n[+] Don\'t forget to create a new admin user for the Web UI...', color='green') - stderr(' archivebox manage createsuperuser') + if not User.objects.filter(is_superuser=True).exclude(username="system").exists(): + stderr("\n[+] Don't forget to create a new admin user for the Web UI...", color="green") + stderr(" archivebox manage createsuperuser") print() # Show version to display full status including installed binaries # Django is already loaded, so just import and call the function directly from archivebox.cli.archivebox_version import version as show_version + show_version(quiet=False) @click.command() -@click.argument('binaries', nargs=-1, type=str, required=False) -@click.option('--binproviders', '-p', default='*', help='Comma-separated list of providers to use (pip,npm,brew,apt,env,custom) or * for all', show_default=True) -@click.option('--dry-run', '-d', is_flag=True, help='Show what would happen without actually running', default=False) +@click.argument("binaries", nargs=-1, type=str, required=False) +@click.option( + "--binproviders", + "-p", + default="*", + help="Comma-separated list of providers to use (pip,npm,brew,apt,env,custom) or * for all", + show_default=True, +) +@click.option("--dry-run", "-d", is_flag=True, help="Show what would happen without actually running", default=False) @docstring(install.__doc__) def main(**kwargs) -> None: install(**kwargs) - -if __name__ == '__main__': + +if __name__ == "__main__": main() diff --git a/archivebox/cli/archivebox_list.py b/archivebox/cli/archivebox_list.py index 6714c537..14359453 100644 --- a/archivebox/cli/archivebox_list.py +++ b/archivebox/cli/archivebox_list.py @@ -1,10 +1,9 @@ #!/usr/bin/env python3 -__package__ = 'archivebox.cli' -__command__ = 'archivebox list' +__package__ = "archivebox.cli" +__command__ = "archivebox list" import sys -from typing import Optional import rich_click as click @@ -12,31 +11,47 @@ from archivebox.cli.archivebox_snapshot import list_snapshots @click.command() -@click.option('--status', '-s', help='Filter by status (queued, started, sealed)') -@click.option('--url__icontains', help='Filter by URL contains') -@click.option('--url__istartswith', help='Filter by URL starts with') -@click.option('--tag', '-t', help='Filter by tag name') -@click.option('--crawl-id', help='Filter by crawl ID') -@click.option('--limit', '-n', type=int, help='Limit number of results') -@click.option('--sort', '-o', type=str, help='Field to sort by, e.g. url, created_at, bookmarked_at, downloaded_at') -@click.option('--csv', '-C', type=str, help='Print output as CSV with the provided fields, e.g.: timestamp,url,title') -@click.option('--with-headers', is_flag=True, help='Include column headers in structured output') -def main(status: Optional[str], url__icontains: Optional[str], url__istartswith: Optional[str], - tag: Optional[str], crawl_id: Optional[str], limit: Optional[int], - sort: Optional[str], csv: Optional[str], with_headers: bool) -> None: +@click.option("--status", "-s", help="Filter by status (queued, started, sealed)") +@click.option("--url__icontains", help="Filter by URL contains") +@click.option("--url__istartswith", help="Filter by URL starts with") +@click.option("--tag", "-t", help="Filter by tag name") +@click.option("--crawl-id", help="Filter by crawl ID") +@click.option("--limit", "-n", type=int, help="Limit number of results") +@click.option("--sort", "-o", type=str, help="Field to sort by, e.g. url, created_at, bookmarked_at, downloaded_at") +@click.option("--csv", "-C", type=str, help="Print output as CSV with the provided fields, e.g.: timestamp,url,title") +@click.option("--with-headers", is_flag=True, help="Include column headers in structured output") +@click.option("--search", type=click.Choice(["meta", "content", "contents", "deep"]), help="Search mode to use for the query") +@click.argument("query", nargs=-1) +def main( + status: str | None, + url__icontains: str | None, + url__istartswith: str | None, + tag: str | None, + crawl_id: str | None, + limit: int | None, + sort: str | None, + csv: str | None, + with_headers: bool, + search: str | None, + query: tuple[str, ...], +) -> None: """List Snapshots.""" - sys.exit(list_snapshots( - status=status, - url__icontains=url__icontains, - url__istartswith=url__istartswith, - tag=tag, - crawl_id=crawl_id, - limit=limit, - sort=sort, - csv=csv, - with_headers=with_headers, - )) + sys.exit( + list_snapshots( + status=status, + url__icontains=url__icontains, + url__istartswith=url__istartswith, + tag=tag, + crawl_id=crawl_id, + limit=limit, + sort=sort, + csv=csv, + with_headers=with_headers, + search=search, + query=" ".join(query), + ), + ) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/archivebox/cli/archivebox_machine.py b/archivebox/cli/archivebox_machine.py index 86d3e219..688216c5 100644 --- a/archivebox/cli/archivebox_machine.py +++ b/archivebox/cli/archivebox_machine.py @@ -19,11 +19,10 @@ Examples: archivebox machine list --hostname__icontains=myserver """ -__package__ = 'archivebox.cli' -__command__ = 'archivebox machine' +__package__ = "archivebox.cli" +__command__ = "archivebox machine" import sys -from typing import Optional import rich_click as click from rich import print as rprint @@ -35,10 +34,11 @@ from archivebox.cli.cli_utils import apply_filters # LIST # ============================================================================= + def list_machines( - hostname__icontains: Optional[str] = None, - os_platform: Optional[str] = None, - limit: Optional[int] = None, + hostname__icontains: str | None = None, + os_platform: str | None = None, + limit: int | None = None, ) -> int: """ List Machines as JSONL with optional filters. @@ -51,24 +51,24 @@ def list_machines( is_tty = sys.stdout.isatty() - queryset = Machine.objects.all().order_by('-created_at') + queryset = Machine.objects.all().order_by("-created_at") # Apply filters filter_kwargs = { - 'hostname__icontains': hostname__icontains, - 'os_platform': os_platform, + "hostname__icontains": hostname__icontains, + "os_platform": os_platform, } queryset = apply_filters(queryset, filter_kwargs, limit=limit) count = 0 for machine in queryset: if is_tty: - rprint(f'[cyan]{machine.hostname:30}[/cyan] [dim]{machine.os_platform:10}[/dim] {machine.id}') + rprint(f"[cyan]{machine.hostname:30}[/cyan] [dim]{machine.os_platform:10}[/dim] {machine.id}") else: write_record(machine.to_json()) count += 1 - rprint(f'[dim]Listed {count} machines[/dim]', file=sys.stderr) + rprint(f"[dim]Listed {count} machines[/dim]", file=sys.stderr) return 0 @@ -76,24 +76,27 @@ def list_machines( # CLI Commands # ============================================================================= + @click.group() def main(): """Manage Machine records (read-only, system-managed).""" pass -@main.command('list') -@click.option('--hostname__icontains', help='Filter by hostname contains') -@click.option('--os-platform', help='Filter by OS platform') -@click.option('--limit', '-n', type=int, help='Limit number of results') -def list_cmd(hostname__icontains: Optional[str], os_platform: Optional[str], limit: Optional[int]): +@main.command("list") +@click.option("--hostname__icontains", help="Filter by hostname contains") +@click.option("--os-platform", help="Filter by OS platform") +@click.option("--limit", "-n", type=int, help="Limit number of results") +def list_cmd(hostname__icontains: str | None, os_platform: str | None, limit: int | None): """List Machines as JSONL.""" - sys.exit(list_machines( - hostname__icontains=hostname__icontains, - os_platform=os_platform, - limit=limit, - )) + sys.exit( + list_machines( + hostname__icontains=hostname__icontains, + os_platform=os_platform, + limit=limit, + ), + ) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/archivebox/cli/archivebox_manage.py b/archivebox/cli/archivebox_manage.py index 0d367042..7105161c 100644 --- a/archivebox/cli/archivebox_manage.py +++ b/archivebox/cli/archivebox_manage.py @@ -1,33 +1,34 @@ #!/usr/bin/env python3 -__package__ = 'archivebox.cli' +__package__ = "archivebox.cli" import rich_click as click from archivebox.misc.util import docstring, enforce_types @enforce_types -def manage(args: list[str] | None=None) -> None: +def manage(args: list[str] | None = None) -> None: """Run an ArchiveBox Django management command""" from archivebox.config.common import SHELL_CONFIG from archivebox.misc.logging import stderr if (args and "createsuperuser" in args) and (SHELL_CONFIG.IN_DOCKER and not SHELL_CONFIG.IS_TTY): - stderr('[!] Warning: you need to pass -it to use interactive commands in docker', color='lightyellow') - stderr(' docker run -it archivebox manage {}'.format(' '.join(args or ['...'])), color='lightyellow') - stderr('') + stderr("[!] Warning: you need to pass -it to use interactive commands in docker", color="lightyellow") + stderr(" docker run -it archivebox manage {}".format(" ".join(args or ["..."])), color="lightyellow") + stderr("") from django.core.management import execute_from_command_line - execute_from_command_line(['manage.py', *(args or ['help'])]) + + execute_from_command_line(["manage.py", *(args or ["help"])]) @click.command(add_help_option=False, context_settings=dict(ignore_unknown_options=True)) -@click.argument('args', nargs=-1) +@click.argument("args", nargs=-1) @docstring(manage.__doc__) -def main(args: list[str] | None=None) -> None: +def main(args: list[str] | None = None) -> None: manage(args=args) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/archivebox/cli/archivebox_mcp.py b/archivebox/cli/archivebox_mcp.py index 07b3825f..cbc2ba19 100644 --- a/archivebox/cli/archivebox_mcp.py +++ b/archivebox/cli/archivebox_mcp.py @@ -6,8 +6,8 @@ Start the Model Context Protocol (MCP) server in stdio mode. Exposes all ArchiveBox CLI commands as MCP tools for AI agents. """ -__package__ = 'archivebox.cli' -__command__ = 'archivebox mcp' +__package__ = "archivebox.cli" +__command__ = "archivebox mcp" import rich_click as click @@ -45,5 +45,5 @@ def main(**kwargs): mcp() -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/archivebox/cli/archivebox_persona.py b/archivebox/cli/archivebox_persona.py index 6ba981f0..7f930665 100644 --- a/archivebox/cli/archivebox_persona.py +++ b/archivebox/cli/archivebox_persona.py @@ -24,8 +24,8 @@ Examples: archivebox persona list --name=old | archivebox persona delete --yes """ -__package__ = 'archivebox.cli' -__command__ = 'archivebox persona' +__package__ = "archivebox.cli" +__command__ = "archivebox persona" import os import sys @@ -35,7 +35,7 @@ import subprocess import tempfile import json from pathlib import Path -from typing import Optional, Iterable +from collections.abc import Iterable from collections import OrderedDict import rich_click as click @@ -49,134 +49,145 @@ from archivebox.personas import importers as persona_importers # Browser Profile Locations # ============================================================================= -def get_chrome_user_data_dir() -> Optional[Path]: + +def get_chrome_user_data_dir() -> Path | None: """Get the default Chrome user data directory for the current platform.""" system = platform.system() home = Path.home() - if system == 'Darwin': # macOS + if system == "Darwin": # macOS candidates = [ - home / 'Library' / 'Application Support' / 'Google' / 'Chrome', - home / 'Library' / 'Application Support' / 'Chromium', + home / "Library" / "Application Support" / "Google" / "Chrome", + home / "Library" / "Application Support" / "Chromium", ] - elif system == 'Linux': + elif system == "Linux": candidates = [ - home / '.config' / 'google-chrome', - home / '.config' / 'chromium', - home / '.config' / 'chrome', - home / 'snap' / 'chromium' / 'common' / 'chromium', + home / ".config" / "google-chrome", + home / ".config" / "chromium", + home / ".config" / "chrome", + home / "snap" / "chromium" / "common" / "chromium", ] - elif system == 'Windows': - local_app_data = Path(os.environ.get('LOCALAPPDATA', home / 'AppData' / 'Local')) + elif system == "Windows": + local_app_data = Path(os.environ.get("LOCALAPPDATA", home / "AppData" / "Local")) candidates = [ - local_app_data / 'Google' / 'Chrome' / 'User Data', - local_app_data / 'Chromium' / 'User Data', + local_app_data / "Google" / "Chrome" / "User Data", + local_app_data / "Chromium" / "User Data", ] else: candidates = [] for candidate in candidates: - if candidate.exists() and (candidate / 'Default').exists(): + if candidate.exists() and (candidate / "Default").exists(): return candidate return None -def get_brave_user_data_dir() -> Optional[Path]: +def get_brave_user_data_dir() -> Path | None: """Get the default Brave user data directory for the current platform.""" system = platform.system() home = Path.home() - if system == 'Darwin': + if system == "Darwin": candidates = [ - home / 'Library' / 'Application Support' / 'BraveSoftware' / 'Brave-Browser', + home / "Library" / "Application Support" / "BraveSoftware" / "Brave-Browser", ] - elif system == 'Linux': + elif system == "Linux": candidates = [ - home / '.config' / 'BraveSoftware' / 'Brave-Browser', + home / ".config" / "BraveSoftware" / "Brave-Browser", ] - elif system == 'Windows': - local_app_data = Path(os.environ.get('LOCALAPPDATA', home / 'AppData' / 'Local')) + elif system == "Windows": + local_app_data = Path(os.environ.get("LOCALAPPDATA", home / "AppData" / "Local")) candidates = [ - local_app_data / 'BraveSoftware' / 'Brave-Browser' / 'User Data', + local_app_data / "BraveSoftware" / "Brave-Browser" / "User Data", ] else: candidates = [] for candidate in candidates: - if candidate.exists() and (candidate / 'Default').exists(): + if candidate.exists() and (candidate / "Default").exists(): return candidate return None -def get_edge_user_data_dir() -> Optional[Path]: +def get_edge_user_data_dir() -> Path | None: """Get the default Edge user data directory for the current platform.""" system = platform.system() home = Path.home() - if system == 'Darwin': + if system == "Darwin": candidates = [ - home / 'Library' / 'Application Support' / 'Microsoft Edge', + home / "Library" / "Application Support" / "Microsoft Edge", ] - elif system == 'Linux': + elif system == "Linux": candidates = [ - home / '.config' / 'microsoft-edge', - home / '.config' / 'microsoft-edge-beta', - home / '.config' / 'microsoft-edge-dev', + home / ".config" / "microsoft-edge", + home / ".config" / "microsoft-edge-beta", + home / ".config" / "microsoft-edge-dev", ] - elif system == 'Windows': - local_app_data = Path(os.environ.get('LOCALAPPDATA', home / 'AppData' / 'Local')) + elif system == "Windows": + local_app_data = Path(os.environ.get("LOCALAPPDATA", home / "AppData" / "Local")) candidates = [ - local_app_data / 'Microsoft' / 'Edge' / 'User Data', + local_app_data / "Microsoft" / "Edge" / "User Data", ] else: candidates = [] for candidate in candidates: - if candidate.exists() and (candidate / 'Default').exists(): + if candidate.exists() and (candidate / "Default").exists(): return candidate return None -def get_browser_binary(browser: str) -> Optional[str]: +def get_browser_binary(browser: str) -> str | None: system = platform.system() home = Path.home() browser = browser.lower() - if system == 'Darwin': + if system == "Darwin": candidates = { - 'chrome': ['/Applications/Google Chrome.app/Contents/MacOS/Google Chrome'], - 'chromium': ['/Applications/Chromium.app/Contents/MacOS/Chromium'], - 'brave': ['/Applications/Brave Browser.app/Contents/MacOS/Brave Browser'], - 'edge': ['/Applications/Microsoft Edge.app/Contents/MacOS/Microsoft Edge'], + "chrome": ["/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"], + "chromium": ["/Applications/Chromium.app/Contents/MacOS/Chromium"], + "brave": ["/Applications/Brave Browser.app/Contents/MacOS/Brave Browser"], + "edge": ["/Applications/Microsoft Edge.app/Contents/MacOS/Microsoft Edge"], }.get(browser, []) - elif system == 'Linux': + elif system == "Linux": candidates = { - 'chrome': ['/usr/bin/google-chrome', '/usr/bin/google-chrome-stable', '/usr/bin/google-chrome-beta', '/usr/bin/google-chrome-unstable'], - 'chromium': ['/usr/bin/chromium', '/usr/bin/chromium-browser'], - 'brave': ['/usr/bin/brave-browser', '/usr/bin/brave-browser-beta', '/usr/bin/brave-browser-nightly'], - 'edge': ['/usr/bin/microsoft-edge', '/usr/bin/microsoft-edge-stable', '/usr/bin/microsoft-edge-beta', '/usr/bin/microsoft-edge-dev'], + "chrome": [ + "/usr/bin/google-chrome", + "/usr/bin/google-chrome-stable", + "/usr/bin/google-chrome-beta", + "/usr/bin/google-chrome-unstable", + ], + "chromium": ["/usr/bin/chromium", "/usr/bin/chromium-browser"], + "brave": ["/usr/bin/brave-browser", "/usr/bin/brave-browser-beta", "/usr/bin/brave-browser-nightly"], + "edge": [ + "/usr/bin/microsoft-edge", + "/usr/bin/microsoft-edge-stable", + "/usr/bin/microsoft-edge-beta", + "/usr/bin/microsoft-edge-dev", + ], }.get(browser, []) - elif system == 'Windows': - local_app_data = Path(os.environ.get('LOCALAPPDATA', home / 'AppData' / 'Local')) + elif system == "Windows": + local_app_data = Path(os.environ.get("LOCALAPPDATA", home / "AppData" / "Local")) candidates = { - 'chrome': [ - str(local_app_data / 'Google' / 'Chrome' / 'Application' / 'chrome.exe'), - 'C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe', - 'C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe', + "chrome": [ + str(local_app_data / "Google" / "Chrome" / "Application" / "chrome.exe"), + "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe", + "C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe", ], - 'chromium': [str(local_app_data / 'Chromium' / 'Application' / 'chrome.exe')], - 'brave': [ - str(local_app_data / 'BraveSoftware' / 'Brave-Browser' / 'Application' / 'brave.exe'), - 'C:\\Program Files\\BraveSoftware\\Brave-Browser\\Application\\brave.exe', - 'C:\\Program Files (x86)\\BraveSoftware\\Brave-Browser\\Application\\brave.exe', + "chromium": [str(local_app_data / "Chromium" / "Application" / "chrome.exe")], + "brave": [ + str(local_app_data / "BraveSoftware" / "Brave-Browser" / "Application" / "brave.exe"), + "C:\\Program Files\\BraveSoftware\\Brave-Browser\\Application\\brave.exe", + "C:\\Program Files (x86)\\BraveSoftware\\Brave-Browser\\Application\\brave.exe", ], - 'edge': [ - str(local_app_data / 'Microsoft' / 'Edge' / 'Application' / 'msedge.exe'), - 'C:\\Program Files\\Microsoft\\Edge\\Application\\msedge.exe', - 'C:\\Program Files (x86)\\Microsoft\\Edge\\Application\\msedge.exe', + "edge": [ + str(local_app_data / "Microsoft" / "Edge" / "Application" / "msedge.exe"), + "C:\\Program Files\\Microsoft\\Edge\\Application\\msedge.exe", + "C:\\Program Files (x86)\\Microsoft\\Edge\\Application\\msedge.exe", ], }.get(browser, []) else: @@ -190,13 +201,13 @@ def get_browser_binary(browser: str) -> Optional[str]: BROWSER_PROFILE_FINDERS = { - 'chrome': get_chrome_user_data_dir, - 'chromium': get_chrome_user_data_dir, # Same locations - 'brave': get_brave_user_data_dir, - 'edge': get_edge_user_data_dir, + "chrome": get_chrome_user_data_dir, + "chromium": get_chrome_user_data_dir, # Same locations + "brave": get_brave_user_data_dir, + "edge": get_edge_user_data_dir, } -CHROMIUM_BROWSERS = {'chrome', 'chromium', 'brave', 'edge'} +CHROMIUM_BROWSERS = {"chrome", "chromium", "brave", "edge"} # ============================================================================= @@ -204,12 +215,12 @@ CHROMIUM_BROWSERS = {'chrome', 'chromium', 'brave', 'edge'} # ============================================================================= NETSCAPE_COOKIE_HEADER = [ - '# Netscape HTTP Cookie File', - '# https://curl.se/docs/http-cookies.html', - '# This file was generated by ArchiveBox persona cookie extraction', - '#', - '# Format: domain\\tincludeSubdomains\\tpath\\tsecure\\texpiry\\tname\\tvalue', - '', + "# Netscape HTTP Cookie File", + "# https://curl.se/docs/http-cookies.html", + "# This file was generated by ArchiveBox persona cookie extraction", + "#", + "# Format: domain\\tincludeSubdomains\\tpath\\tsecure\\texpiry\\tname\\tvalue", + "", ] @@ -219,9 +230,9 @@ def _parse_netscape_cookies(path: Path) -> "OrderedDict[tuple[str, str, str], tu return cookies for line in path.read_text().splitlines(): - if not line or line.startswith('#'): + if not line or line.startswith("#"): continue - parts = line.split('\t') + parts = line.split("\t") if len(parts) < 7: continue domain, include_subdomains, cookie_path, secure, expiry, name, value = parts[:7] @@ -233,8 +244,8 @@ def _parse_netscape_cookies(path: Path) -> "OrderedDict[tuple[str, str, str], tu def _write_netscape_cookies(path: Path, cookies: "OrderedDict[tuple[str, str, str], tuple[str, str, str, str, str, str, str]]") -> None: lines = list(NETSCAPE_COOKIE_HEADER) for cookie in cookies.values(): - lines.append('\t'.join(cookie)) - path.write_text('\n'.join(lines) + '\n') + lines.append("\t".join(cookie)) + path.write_text("\n".join(lines) + "\n") def _merge_netscape_cookies(existing_file: Path, new_file: Path) -> None: @@ -259,52 +270,52 @@ def extract_cookies_via_cdp( from archivebox.config.common import STORAGE_CONFIG # Find the cookie extraction script - chrome_plugin_dir = Path(__file__).parent.parent / 'plugins' / 'chrome' - extract_script = chrome_plugin_dir / 'extract_cookies.js' + chrome_plugin_dir = Path(__file__).parent.parent / "plugins" / "chrome" + extract_script = chrome_plugin_dir / "extract_cookies.js" if not extract_script.exists(): - rprint(f'[yellow]Cookie extraction script not found at {extract_script}[/yellow]', file=sys.stderr) + rprint(f"[yellow]Cookie extraction script not found at {extract_script}[/yellow]", file=sys.stderr) return False # Get node modules dir - node_modules_dir = STORAGE_CONFIG.LIB_DIR / 'npm' / 'node_modules' + node_modules_dir = STORAGE_CONFIG.LIB_DIR / "npm" / "node_modules" # Set up environment env = os.environ.copy() - env['NODE_MODULES_DIR'] = str(node_modules_dir) - env['CHROME_USER_DATA_DIR'] = str(user_data_dir) - env['CHROME_HEADLESS'] = 'true' + env["NODE_MODULES_DIR"] = str(node_modules_dir) + env["CHROME_USER_DATA_DIR"] = str(user_data_dir) + env["CHROME_HEADLESS"] = "true" if chrome_binary: - env['CHROME_BINARY'] = str(chrome_binary) + env["CHROME_BINARY"] = str(chrome_binary) output_path = output_file temp_output = None temp_dir = None if output_file.exists(): - temp_dir = Path(tempfile.mkdtemp(prefix='ab_cookies_')) - temp_output = temp_dir / 'cookies.txt' + temp_dir = Path(tempfile.mkdtemp(prefix="ab_cookies_")) + temp_output = temp_dir / "cookies.txt" output_path = temp_output if profile_dir: - extra_arg = f'--profile-directory={profile_dir}' - existing_extra = env.get('CHROME_ARGS_EXTRA', '').strip() + extra_arg = f"--profile-directory={profile_dir}" + existing_extra = env.get("CHROME_ARGS_EXTRA", "").strip() args_list = [] if existing_extra: - if existing_extra.startswith('['): + if existing_extra.startswith("["): try: parsed = json.loads(existing_extra) if isinstance(parsed, list): args_list.extend(str(x) for x in parsed) except Exception: - args_list.extend([s.strip() for s in existing_extra.split(',') if s.strip()]) + args_list.extend([s.strip() for s in existing_extra.split(",") if s.strip()]) else: - args_list.extend([s.strip() for s in existing_extra.split(',') if s.strip()]) + args_list.extend([s.strip() for s in existing_extra.split(",") if s.strip()]) args_list.append(extra_arg) - env['CHROME_ARGS_EXTRA'] = json.dumps(args_list) + env["CHROME_ARGS_EXTRA"] = json.dumps(args_list) - env['COOKIES_OUTPUT_FILE'] = str(output_path) + env["COOKIES_OUTPUT_FILE"] = str(output_path) try: result = subprocess.run( - ['node', str(extract_script)], + ["node", str(extract_script)], env=env, capture_output=True, text=True, @@ -316,17 +327,17 @@ def extract_cookies_via_cdp( _merge_netscape_cookies(output_file, temp_output) return True else: - rprint(f'[yellow]Cookie extraction failed: {result.stderr}[/yellow]', file=sys.stderr) + rprint(f"[yellow]Cookie extraction failed: {result.stderr}[/yellow]", file=sys.stderr) return False except subprocess.TimeoutExpired: - rprint('[yellow]Cookie extraction timed out[/yellow]', file=sys.stderr) + rprint("[yellow]Cookie extraction timed out[/yellow]", file=sys.stderr) return False except FileNotFoundError: - rprint('[yellow]Node.js not found. Cannot extract cookies.[/yellow]', file=sys.stderr) + rprint("[yellow]Node.js not found. Cannot extract cookies.[/yellow]", file=sys.stderr) return False except Exception as e: - rprint(f'[yellow]Cookie extraction error: {e}[/yellow]', file=sys.stderr) + rprint(f"[yellow]Cookie extraction error: {e}[/yellow]", file=sys.stderr) return False finally: if temp_dir and temp_dir.exists(): @@ -337,6 +348,7 @@ def extract_cookies_via_cdp( # Validation Helpers # ============================================================================= + def validate_persona_name(name: str) -> tuple[bool, str]: """ Validate persona name to prevent path traversal attacks. @@ -348,19 +360,19 @@ def validate_persona_name(name: str) -> tuple[bool, str]: return False, "Persona name cannot be empty" # Check for path separators - if '/' in name or '\\' in name: + if "/" in name or "\\" in name: return False, "Persona name cannot contain path separators (/ or \\)" # Check for parent directory references - if '..' in name: + if ".." in name: return False, "Persona name cannot contain parent directory references (..)" # Check for hidden files/directories - if name.startswith('.'): + if name.startswith("."): return False, "Persona name cannot start with a dot (.)" # Ensure name doesn't contain null bytes or other dangerous chars - if '\x00' in name or '\n' in name or '\r' in name: + if "\x00" in name or "\n" in name or "\r" in name: return False, "Persona name contains invalid characters" return True, "" @@ -394,10 +406,11 @@ def ensure_path_within_personas_dir(persona_path: Path) -> bool: # CREATE # ============================================================================= + def create_personas( names: Iterable[str], - import_from: Optional[str] = None, - profile: Optional[str] = None, + import_from: str | None = None, + profile: str | None = None, ) -> int: """ Create Personas from names. @@ -416,7 +429,7 @@ def create_personas( name_list = list(names) if names else [] if not name_list: - rprint('[yellow]No persona names provided. Pass names as arguments.[/yellow]', file=sys.stderr) + rprint("[yellow]No persona names provided. Pass names as arguments.[/yellow]", file=sys.stderr) return 1 # Validate import source if specified @@ -424,23 +437,23 @@ def create_personas( if import_from: import_from = import_from.lower() if import_from not in BROWSER_PROFILE_FINDERS: - rprint(f'[red]Unknown browser: {import_from}[/red]', file=sys.stderr) - rprint(f'[dim]Supported browsers: {", ".join(BROWSER_PROFILE_FINDERS.keys())}[/dim]', file=sys.stderr) + rprint(f"[red]Unknown browser: {import_from}[/red]", file=sys.stderr) + rprint(f"[dim]Supported browsers: {', '.join(BROWSER_PROFILE_FINDERS.keys())}[/dim]", file=sys.stderr) return 1 source_profile_dir = BROWSER_PROFILE_FINDERS[import_from]() if not source_profile_dir: - rprint(f'[red]Could not find {import_from} profile directory[/red]', file=sys.stderr) + rprint(f"[red]Could not find {import_from} profile directory[/red]", file=sys.stderr) return 1 - rprint(f'[dim]Found {import_from} profile: {source_profile_dir}[/dim]', file=sys.stderr) + rprint(f"[dim]Found {import_from} profile: {source_profile_dir}[/dim]", file=sys.stderr) - if profile is None and (source_profile_dir / 'Default').exists(): - profile = 'Default' + if profile is None and (source_profile_dir / "Default").exists(): + profile = "Default" browser_binary = get_browser_binary(import_from) if browser_binary: - rprint(f'[dim]Using {import_from} binary: {browser_binary}[/dim]', file=sys.stderr) + rprint(f"[dim]Using {import_from} binary: {browser_binary}[/dim]", file=sys.stderr) created_count = 0 for name in name_list: @@ -459,11 +472,11 @@ def create_personas( if created: persona.ensure_dirs() created_count += 1 - rprint(f'[green]Created persona: {name}[/green]', file=sys.stderr) + rprint(f"[green]Created persona: {name}[/green]", file=sys.stderr) else: - rprint(f'[dim]Persona already exists: {name}[/dim]', file=sys.stderr) + rprint(f"[dim]Persona already exists: {name}[/dim]", file=sys.stderr) - cookies_file = Path(persona.path) / 'cookies.txt' + cookies_file = Path(persona.path) / "cookies.txt" # Import browser profile if requested if import_from in CHROMIUM_BROWSERS and source_profile_dir is not None: @@ -477,29 +490,31 @@ def create_personas( capture_storage=False, ) except Exception as e: - rprint(f'[red]Failed to import browser profile: {e}[/red]', file=sys.stderr) + rprint(f"[red]Failed to import browser profile: {e}[/red]", file=sys.stderr) return 1 if import_result.profile_copied: - rprint('[green]Copied browser profile to persona[/green]', file=sys.stderr) + rprint("[green]Copied browser profile to persona[/green]", file=sys.stderr) if import_result.cookies_imported: - rprint(f'[green]Extracted cookies to {cookies_file}[/green]', file=sys.stderr) + rprint(f"[green]Extracted cookies to {cookies_file}[/green]", file=sys.stderr) elif not import_result.profile_copied: - rprint('[yellow]Could not import cookies automatically.[/yellow]', file=sys.stderr) + rprint("[yellow]Could not import cookies automatically.[/yellow]", file=sys.stderr) for warning in import_result.warnings: - rprint(f'[yellow]{warning}[/yellow]', file=sys.stderr) + rprint(f"[yellow]{warning}[/yellow]", file=sys.stderr) if not is_tty: - write_record({ - 'id': str(persona.id) if hasattr(persona, 'id') else None, - 'name': persona.name, - 'path': str(persona.path), - 'CHROME_USER_DATA_DIR': persona.CHROME_USER_DATA_DIR, - 'COOKIES_FILE': persona.COOKIES_FILE, - }) + write_record( + { + "id": str(persona.id) if hasattr(persona, "id") else None, + "name": persona.name, + "path": str(persona.path), + "CHROME_USER_DATA_DIR": persona.CHROME_USER_DATA_DIR, + "COOKIES_FILE": persona.COOKIES_FILE, + }, + ) - rprint(f'[green]Created {created_count} new persona(s)[/green]', file=sys.stderr) + rprint(f"[green]Created {created_count} new persona(s)[/green]", file=sys.stderr) return 0 @@ -507,10 +522,11 @@ def create_personas( # LIST # ============================================================================= + def list_personas( - name: Optional[str] = None, - name__icontains: Optional[str] = None, - limit: Optional[int] = None, + name: str | None = None, + name__icontains: str | None = None, + limit: int | None = None, ) -> int: """ List Personas as JSONL with optional filters. @@ -523,33 +539,35 @@ def list_personas( is_tty = sys.stdout.isatty() - queryset = Persona.objects.all().order_by('name') + queryset = Persona.objects.all().order_by("name") # Apply filters filter_kwargs = { - 'name': name, - 'name__icontains': name__icontains, + "name": name, + "name__icontains": name__icontains, } queryset = apply_filters(queryset, filter_kwargs, limit=limit) count = 0 for persona in queryset: - cookies_status = '[green]✓[/green]' if persona.COOKIES_FILE else '[dim]✗[/dim]' - chrome_status = '[green]✓[/green]' if Path(persona.CHROME_USER_DATA_DIR).exists() else '[dim]✗[/dim]' + cookies_status = "[green]✓[/green]" if persona.COOKIES_FILE else "[dim]✗[/dim]" + chrome_status = "[green]✓[/green]" if Path(persona.CHROME_USER_DATA_DIR).exists() else "[dim]✗[/dim]" if is_tty: - rprint(f'[cyan]{persona.name:20}[/cyan] cookies:{cookies_status} chrome:{chrome_status} [dim]{persona.path}[/dim]') + rprint(f"[cyan]{persona.name:20}[/cyan] cookies:{cookies_status} chrome:{chrome_status} [dim]{persona.path}[/dim]") else: - write_record({ - 'id': str(persona.id) if hasattr(persona, 'id') else None, - 'name': persona.name, - 'path': str(persona.path), - 'CHROME_USER_DATA_DIR': persona.CHROME_USER_DATA_DIR, - 'COOKIES_FILE': persona.COOKIES_FILE, - }) + write_record( + { + "id": str(persona.id) if hasattr(persona, "id") else None, + "name": persona.name, + "path": str(persona.path), + "CHROME_USER_DATA_DIR": persona.CHROME_USER_DATA_DIR, + "COOKIES_FILE": persona.COOKIES_FILE, + }, + ) count += 1 - rprint(f'[dim]Listed {count} persona(s)[/dim]', file=sys.stderr) + rprint(f"[dim]Listed {count} persona(s)[/dim]", file=sys.stderr) return 0 @@ -557,7 +575,8 @@ def list_personas( # UPDATE # ============================================================================= -def update_personas(name: Optional[str] = None) -> int: + +def update_personas(name: str | None = None) -> int: """ Update Personas from stdin JSONL. @@ -575,13 +594,13 @@ def update_personas(name: Optional[str] = None) -> int: records = list(read_stdin()) if not records: - rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr) + rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr) return 1 updated_count = 0 for record in records: - persona_id = record.get('id') - old_name = record.get('name') + persona_id = record.get("id") + old_name = record.get("name") if not persona_id and not old_name: continue @@ -613,17 +632,19 @@ def update_personas(name: Optional[str] = None) -> int: updated_count += 1 if not is_tty: - write_record({ - 'id': str(persona.id) if hasattr(persona, 'id') else None, - 'name': persona.name, - 'path': str(persona.path), - }) + write_record( + { + "id": str(persona.id) if hasattr(persona, "id") else None, + "name": persona.name, + "path": str(persona.path), + }, + ) except Persona.DoesNotExist: - rprint(f'[yellow]Persona not found: {persona_id or old_name}[/yellow]', file=sys.stderr) + rprint(f"[yellow]Persona not found: {persona_id or old_name}[/yellow]", file=sys.stderr) continue - rprint(f'[green]Updated {updated_count} persona(s)[/green]', file=sys.stderr) + rprint(f"[green]Updated {updated_count} persona(s)[/green]", file=sys.stderr) return 0 @@ -631,6 +652,7 @@ def update_personas(name: Optional[str] = None) -> int: # DELETE # ============================================================================= + def delete_personas(yes: bool = False, dry_run: bool = False) -> int: """ Delete Personas from stdin JSONL. @@ -646,23 +668,24 @@ def delete_personas(yes: bool = False, dry_run: bool = False) -> int: records = list(read_stdin()) if not records: - rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr) + rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr) return 1 # Collect persona IDs or names persona_ids = [] persona_names = [] for r in records: - if r.get('id'): - persona_ids.append(r['id']) - elif r.get('name'): - persona_names.append(r['name']) + if r.get("id"): + persona_ids.append(r["id"]) + elif r.get("name"): + persona_names.append(r["name"]) if not persona_ids and not persona_names: - rprint('[yellow]No valid persona IDs or names in input[/yellow]', file=sys.stderr) + rprint("[yellow]No valid persona IDs or names in input[/yellow]", file=sys.stderr) return 1 from django.db.models import Q + query = Q() if persona_ids: query |= Q(id__in=persona_ids) @@ -673,17 +696,17 @@ def delete_personas(yes: bool = False, dry_run: bool = False) -> int: count = personas.count() if count == 0: - rprint('[yellow]No matching personas found[/yellow]', file=sys.stderr) + rprint("[yellow]No matching personas found[/yellow]", file=sys.stderr) return 0 if dry_run: - rprint(f'[yellow]Would delete {count} persona(s) (dry run)[/yellow]', file=sys.stderr) + rprint(f"[yellow]Would delete {count} persona(s) (dry run)[/yellow]", file=sys.stderr) for persona in personas: - rprint(f' {persona.name} ({persona.path})', file=sys.stderr) + rprint(f" {persona.name} ({persona.path})", file=sys.stderr) return 0 if not yes: - rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr) + rprint("[red]Use --yes to confirm deletion[/red]", file=sys.stderr) return 1 # Delete persona directories and database records @@ -701,7 +724,7 @@ def delete_personas(yes: bool = False, dry_run: bool = False) -> int: persona.delete() deleted_count += 1 - rprint(f'[green]Deleted {deleted_count} persona(s)[/green]', file=sys.stderr) + rprint(f"[green]Deleted {deleted_count} persona(s)[/green]", file=sys.stderr) return 0 @@ -709,44 +732,45 @@ def delete_personas(yes: bool = False, dry_run: bool = False) -> int: # CLI Commands # ============================================================================= + @click.group() def main(): """Manage Persona records (browser profiles).""" pass -@main.command('create') -@click.argument('names', nargs=-1) -@click.option('--import', 'import_from', help='Import profile from browser (chrome, chromium, brave, edge)') -@click.option('--profile', help='Profile directory name under the user data dir (e.g. Default, Profile 1)') -def create_cmd(names: tuple, import_from: Optional[str], profile: Optional[str]): +@main.command("create") +@click.argument("names", nargs=-1) +@click.option("--import", "import_from", help="Import profile from browser (chrome, chromium, brave, edge)") +@click.option("--profile", help="Profile directory name under the user data dir (e.g. Default, Profile 1)") +def create_cmd(names: tuple, import_from: str | None, profile: str | None): """Create Personas, optionally importing from a browser profile.""" sys.exit(create_personas(names, import_from=import_from, profile=profile)) -@main.command('list') -@click.option('--name', help='Filter by exact name') -@click.option('--name__icontains', help='Filter by name contains') -@click.option('--limit', '-n', type=int, help='Limit number of results') -def list_cmd(name: Optional[str], name__icontains: Optional[str], limit: Optional[int]): +@main.command("list") +@click.option("--name", help="Filter by exact name") +@click.option("--name__icontains", help="Filter by name contains") +@click.option("--limit", "-n", type=int, help="Limit number of results") +def list_cmd(name: str | None, name__icontains: str | None, limit: int | None): """List Personas as JSONL.""" sys.exit(list_personas(name=name, name__icontains=name__icontains, limit=limit)) -@main.command('update') -@click.option('--name', '-n', help='Set new name') -def update_cmd(name: Optional[str]): +@main.command("update") +@click.option("--name", "-n", help="Set new name") +def update_cmd(name: str | None): """Update Personas from stdin JSONL.""" sys.exit(update_personas(name=name)) -@main.command('delete') -@click.option('--yes', '-y', is_flag=True, help='Confirm deletion') -@click.option('--dry-run', is_flag=True, help='Show what would be deleted') +@main.command("delete") +@click.option("--yes", "-y", is_flag=True, help="Confirm deletion") +@click.option("--dry-run", is_flag=True, help="Show what would be deleted") def delete_cmd(yes: bool, dry_run: bool): """Delete Personas from stdin JSONL.""" sys.exit(delete_personas(yes=yes, dry_run=dry_run)) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/archivebox/cli/archivebox_pluginmap.py b/archivebox/cli/archivebox_pluginmap.py index 41c0724a..8b2d6541 100644 --- a/archivebox/cli/archivebox_pluginmap.py +++ b/archivebox/cli/archivebox_pluginmap.py @@ -1,8 +1,7 @@ #!/usr/bin/env python3 -__package__ = 'archivebox.cli' +__package__ = "archivebox.cli" -from typing import Optional import rich_click as click @@ -137,7 +136,7 @@ BINARY_MACHINE_DIAGRAM = """ @enforce_types def pluginmap( show_disabled: bool = False, - model: Optional[str] = None, + model: str | None = None, quiet: bool = False, ) -> dict: """ @@ -164,25 +163,25 @@ def pluginmap( # Model event types that can have hooks model_events = { - 'Crawl': { - 'description': 'Hooks run when a Crawl starts (QUEUED→STARTED)', - 'machine': 'CrawlMachine', - 'diagram': CRAWL_MACHINE_DIAGRAM, + "Crawl": { + "description": "Hooks run when a Crawl starts (QUEUED→STARTED)", + "machine": "CrawlMachine", + "diagram": CRAWL_MACHINE_DIAGRAM, }, - 'CrawlEnd': { - 'description': 'Hooks run when a Crawl finishes (STARTED→SEALED)', - 'machine': 'CrawlMachine', - 'diagram': None, # Part of CrawlMachine + "CrawlEnd": { + "description": "Hooks run when a Crawl finishes (STARTED→SEALED)", + "machine": "CrawlMachine", + "diagram": None, # Part of CrawlMachine }, - 'Snapshot': { - 'description': 'Hooks run for each Snapshot (creates ArchiveResults)', - 'machine': 'SnapshotMachine', - 'diagram': SNAPSHOT_MACHINE_DIAGRAM, + "Snapshot": { + "description": "Hooks run for each Snapshot (creates ArchiveResults)", + "machine": "SnapshotMachine", + "diagram": SNAPSHOT_MACHINE_DIAGRAM, }, - 'Binary': { - 'description': 'Hooks for installing binary dependencies (providers)', - 'machine': 'BinaryMachine', - 'diagram': BINARY_MACHINE_DIAGRAM, + "Binary": { + "description": "Hooks for installing binary dependencies (providers)", + "machine": "BinaryMachine", + "diagram": BINARY_MACHINE_DIAGRAM, }, } @@ -195,16 +194,16 @@ def pluginmap( model_events = {model: model_events[model]} result = { - 'models': {}, - 'plugins_dir': str(BUILTIN_PLUGINS_DIR), - 'user_plugins_dir': str(USER_PLUGINS_DIR), + "models": {}, + "plugins_dir": str(BUILTIN_PLUGINS_DIR), + "user_plugins_dir": str(USER_PLUGINS_DIR), } if not quiet: prnt() - prnt('[bold cyan]ArchiveBox Plugin Map[/bold cyan]') - prnt(f'[dim]Built-in plugins: {BUILTIN_PLUGINS_DIR}[/dim]') - prnt(f'[dim]User plugins: {USER_PLUGINS_DIR}[/dim]') + prnt("[bold cyan]ArchiveBox Plugin Map[/bold cyan]") + prnt(f"[dim]Built-in plugins: {BUILTIN_PLUGINS_DIR}[/dim]") + prnt(f"[dim]User plugins: {USER_PLUGINS_DIR}[/dim]") prnt() for event_name, info in model_events.items(): @@ -218,88 +217,93 @@ def pluginmap( plugin_name = hook_path.parent.name is_bg = is_background_hook(hook_path.name) - hook_infos.append({ - 'path': str(hook_path), - 'name': hook_path.name, - 'plugin': plugin_name, - 'is_background': is_bg, - 'extension': hook_path.suffix, - }) + hook_infos.append( + { + "path": str(hook_path), + "name": hook_path.name, + "plugin": plugin_name, + "is_background": is_bg, + "extension": hook_path.suffix, + }, + ) - result['models'][event_name] = { - 'description': info['description'], - 'machine': info['machine'], - 'hooks': hook_infos, - 'hook_count': len(hook_infos), + result["models"][event_name] = { + "description": info["description"], + "machine": info["machine"], + "hooks": hook_infos, + "hook_count": len(hook_infos), } if not quiet: # Show diagram if this model has one - if info.get('diagram'): - assert info['diagram'] is not None - prnt(Panel( - info['diagram'], - title=f'[bold green]{info["machine"]}[/bold green]', - border_style='green', - expand=False, - )) + if info.get("diagram"): + assert info["diagram"] is not None + prnt( + Panel( + info["diagram"], + title=f"[bold green]{info['machine']}[/bold green]", + border_style="green", + expand=False, + ), + ) prnt() # Create hooks table table = Table( - title=f'[bold yellow]on_{event_name}__* Hooks[/bold yellow] ({len(hooks)} found)', + title=f"[bold yellow]on_{event_name}__* Hooks[/bold yellow] ({len(hooks)} found)", box=box.ROUNDED, show_header=True, - header_style='bold magenta', + header_style="bold magenta", ) - table.add_column('Plugin', style='cyan', width=20) - table.add_column('Hook Name', style='green') - table.add_column('BG', justify='center', width=4) - table.add_column('Type', justify='center', width=5) + table.add_column("Plugin", style="cyan", width=20) + table.add_column("Hook Name", style="green") + table.add_column("BG", justify="center", width=4) + table.add_column("Type", justify="center", width=5) # Sort lexicographically by hook name - sorted_hooks = sorted(hook_infos, key=lambda h: h['name']) + sorted_hooks = sorted(hook_infos, key=lambda h: h["name"]) for hook in sorted_hooks: - bg_marker = '[yellow]bg[/yellow]' if hook['is_background'] else '' - ext = hook['extension'].lstrip('.') + bg_marker = "[yellow]bg[/yellow]" if hook["is_background"] else "" + ext = hook["extension"].lstrip(".") table.add_row( - hook['plugin'], - hook['name'], + hook["plugin"], + hook["name"], bg_marker, ext, ) prnt(table) prnt() - prnt(f'[dim]{info["description"]}[/dim]') + prnt(f"[dim]{info['description']}[/dim]") prnt() # Summary if not quiet: - total_hooks = sum(m['hook_count'] for m in result['models'].values()) - prnt(f'[bold]Total hooks discovered: {total_hooks}[/bold]') + total_hooks = sum(m["hook_count"] for m in result["models"].values()) + prnt(f"[bold]Total hooks discovered: {total_hooks}[/bold]") prnt() - prnt('[dim]Hook naming convention: on_{Model}__{XX}_{description}[.bg].{ext}[/dim]') - prnt('[dim] - XX: Two-digit lexicographic order (00-99)[/dim]') - prnt('[dim] - .bg: Background hook (non-blocking)[/dim]') - prnt('[dim] - ext: py, sh, or js[/dim]') + prnt("[dim]Hook naming convention: on_{Model}__{XX}_{description}[.bg].{ext}[/dim]") + prnt("[dim] - XX: Two-digit lexicographic order (00-99)[/dim]") + prnt("[dim] - .bg: Background hook (non-blocking)[/dim]") + prnt("[dim] - ext: py, sh, or js[/dim]") prnt() return result @click.command() -@click.option('--show-disabled', '-a', is_flag=True, help='Show hooks from disabled plugins too') -@click.option('--model', '-m', type=str, default=None, help='Filter to specific model (Crawl, Snapshot, Binary, CrawlEnd)') -@click.option('--quiet', '-q', is_flag=True, help='Output JSON only, no ASCII diagrams') +@click.option("--show-disabled", "-a", is_flag=True, help="Show hooks from disabled plugins too") +@click.option("--model", "-m", type=str, default=None, help="Filter to specific model (Crawl, Snapshot, Binary, CrawlEnd)") +@click.option("--quiet", "-q", is_flag=True, help="Output JSON only, no ASCII diagrams") @docstring(pluginmap.__doc__) def main(**kwargs): import json + result = pluginmap(**kwargs) - if kwargs.get('quiet'): + if kwargs.get("quiet"): print(json.dumps(result, indent=2)) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/archivebox/cli/archivebox_process.py b/archivebox/cli/archivebox_process.py index d8b00426..4df39c75 100644 --- a/archivebox/cli/archivebox_process.py +++ b/archivebox/cli/archivebox_process.py @@ -22,11 +22,10 @@ Examples: archivebox process list --limit=10 """ -__package__ = 'archivebox.cli' -__command__ = 'archivebox process' +__package__ = "archivebox.cli" +__command__ = "archivebox process" import sys -from typing import Optional import rich_click as click from rich import print as rprint @@ -38,10 +37,11 @@ from archivebox.cli.cli_utils import apply_filters # LIST # ============================================================================= + def list_processes( - binary_name: Optional[str] = None, - machine_id: Optional[str] = None, - limit: Optional[int] = None, + binary_name: str | None = None, + machine_id: str | None = None, + limit: int | None = None, ) -> int: """ List Processes as JSONL with optional filters. @@ -54,29 +54,29 @@ def list_processes( is_tty = sys.stdout.isatty() - queryset = Process.objects.all().select_related('binary', 'machine').order_by('-start_ts') + queryset = Process.objects.all().select_related("binary", "machine").order_by("-start_ts") # Apply filters filter_kwargs = {} if binary_name: - filter_kwargs['binary__name'] = binary_name + filter_kwargs["binary__name"] = binary_name if machine_id: - filter_kwargs['machine_id'] = machine_id + filter_kwargs["machine_id"] = machine_id queryset = apply_filters(queryset, filter_kwargs, limit=limit) count = 0 for process in queryset: if is_tty: - binary_name_str = process.binary.name if process.binary else 'unknown' - exit_code = process.exit_code if process.exit_code is not None else '?' - status_color = 'green' if process.exit_code == 0 else 'red' if process.exit_code else 'yellow' - rprint(f'[{status_color}]exit={exit_code:3}[/{status_color}] [cyan]{binary_name_str:15}[/cyan] [dim]{process.id}[/dim]') + binary_name_str = process.binary.name if process.binary else "unknown" + exit_code = process.exit_code if process.exit_code is not None else "?" + status_color = "green" if process.exit_code == 0 else "red" if process.exit_code else "yellow" + rprint(f"[{status_color}]exit={exit_code:3}[/{status_color}] [cyan]{binary_name_str:15}[/cyan] [dim]{process.id}[/dim]") else: write_record(process.to_json()) count += 1 - rprint(f'[dim]Listed {count} processes[/dim]', file=sys.stderr) + rprint(f"[dim]Listed {count} processes[/dim]", file=sys.stderr) return 0 @@ -84,24 +84,27 @@ def list_processes( # CLI Commands # ============================================================================= + @click.group() def main(): """Manage Process records (read-only, system-managed).""" pass -@main.command('list') -@click.option('--binary-name', '-b', help='Filter by binary name') -@click.option('--machine-id', '-m', help='Filter by machine ID') -@click.option('--limit', '-n', type=int, help='Limit number of results') -def list_cmd(binary_name: Optional[str], machine_id: Optional[str], limit: Optional[int]): +@main.command("list") +@click.option("--binary-name", "-b", help="Filter by binary name") +@click.option("--machine-id", "-m", help="Filter by machine ID") +@click.option("--limit", "-n", type=int, help="Limit number of results") +def list_cmd(binary_name: str | None, machine_id: str | None, limit: int | None): """List Processes as JSONL.""" - sys.exit(list_processes( - binary_name=binary_name, - machine_id=machine_id, - limit=limit, - )) + sys.exit( + list_processes( + binary_name=binary_name, + machine_id=machine_id, + limit=limit, + ), + ) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/archivebox/cli/archivebox_remove.py b/archivebox/cli/archivebox_remove.py index c8f8aa35..be3efcb4 100644 --- a/archivebox/cli/archivebox_remove.py +++ b/archivebox/cli/archivebox_remove.py @@ -1,11 +1,11 @@ #!/usr/bin/env python3 -__package__ = 'archivebox.cli' -__command__ = 'archivebox remove' +__package__ = "archivebox.cli" +__command__ = "archivebox remove" import shutil from pathlib import Path -from typing import Iterable +from collections.abc import Iterable import rich_click as click @@ -26,25 +26,27 @@ from archivebox.misc.logging_util import ( @enforce_types -def remove(filter_patterns: Iterable[str]=(), - filter_type: str='exact', - snapshots: QuerySet | None=None, - after: float | None=None, - before: float | None=None, - yes: bool=False, - delete: bool=False, - out_dir: Path=DATA_DIR) -> QuerySet: +def remove( + filter_patterns: Iterable[str] = (), + filter_type: str = "exact", + snapshots: QuerySet | None = None, + after: float | None = None, + before: float | None = None, + yes: bool = False, + delete: bool = False, + out_dir: Path = DATA_DIR, +) -> QuerySet: """Remove the specified URLs from the archive""" - + setup_django() check_data_folder() - + from archivebox.cli.archivebox_search import get_snapshots pattern_list = list(filter_patterns) log_list_started(pattern_list or None, filter_type) - timer = TimedProgress(360, prefix=' ') + timer = TimedProgress(360, prefix=" ") try: snapshots = get_snapshots( snapshots=snapshots, @@ -63,7 +65,7 @@ def remove(filter_patterns: Iterable[str]=(), log_list_finished(snapshots) log_removal_started(snapshots, yes=yes, delete=delete) - timer = TimedProgress(360, prefix=' ') + timer = TimedProgress(360, prefix=" ") try: for snapshot in snapshots: if delete: @@ -88,17 +90,23 @@ def remove(filter_patterns: Iterable[str]=(), @click.command() -@click.option('--yes', is_flag=True, help='Remove links instantly without prompting to confirm') -@click.option('--delete', is_flag=True, help='Delete the archived content and metadata folder in addition to removing from index') -@click.option('--before', type=float, help='Remove only URLs bookmarked before timestamp') -@click.option('--after', type=float, help='Remove only URLs bookmarked after timestamp') -@click.option('--filter-type', '-f', type=click.Choice(('exact', 'substring', 'domain', 'regex', 'tag')), default='exact', help='Type of pattern matching to use when filtering URLs') -@click.argument('filter_patterns', nargs=-1) +@click.option("--yes", is_flag=True, help="Remove links instantly without prompting to confirm") +@click.option("--delete", is_flag=True, help="Delete the archived content and metadata folder in addition to removing from index") +@click.option("--before", type=float, help="Remove only URLs bookmarked before timestamp") +@click.option("--after", type=float, help="Remove only URLs bookmarked after timestamp") +@click.option( + "--filter-type", + "-f", + type=click.Choice(("exact", "substring", "domain", "regex", "tag")), + default="exact", + help="Type of pattern matching to use when filtering URLs", +) +@click.argument("filter_patterns", nargs=-1) @docstring(remove.__doc__) def main(**kwargs): """Remove the specified URLs from the archive""" remove(**kwargs) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/archivebox/cli/archivebox_run.py b/archivebox/cli/archivebox_run.py index 292baf87..08346662 100644 --- a/archivebox/cli/archivebox_run.py +++ b/archivebox/cli/archivebox_run.py @@ -37,8 +37,8 @@ Examples: archivebox run --binary-id=019b7e90-5a8e-712c-9877-2c70eebe80ad """ -__package__ = 'archivebox.cli' -__command__ = 'archivebox run' +__package__ = "archivebox.cli" +__command__ = "archivebox run" import sys from collections import defaultdict @@ -87,8 +87,8 @@ def process_stdin_records() -> int: binary_ids: list[str] = [] for record in records: - record_type = record.get('type', '') - record_id = record.get('id') + record_type = record.get("type", "") + record_id = record.get("id") try: if record_type == TYPE_CRAWL: @@ -97,10 +97,10 @@ def process_stdin_records() -> int: try: crawl = Crawl.objects.get(id=record_id) except Crawl.DoesNotExist: - crawl = Crawl.from_json(record, overrides={'created_by_id': created_by_id}) + crawl = Crawl.from_json(record, overrides={"created_by_id": created_by_id}) else: # New crawl - create it - crawl = Crawl.from_json(record, overrides={'created_by_id': created_by_id}) + crawl = Crawl.from_json(record, overrides={"created_by_id": created_by_id}) if crawl: crawl.retry_at = timezone.now() @@ -112,16 +112,16 @@ def process_stdin_records() -> int: output_records.append(crawl.to_json()) queued_count += 1 - elif record_type == TYPE_SNAPSHOT or (record.get('url') and not record_type): + elif record_type == TYPE_SNAPSHOT or (record.get("url") and not record_type): if record_id: # Existing snapshot - re-queue try: snapshot = Snapshot.objects.get(id=record_id) except Snapshot.DoesNotExist: - snapshot = Snapshot.from_json(record, overrides={'created_by_id': created_by_id}) + snapshot = Snapshot.from_json(record, overrides={"created_by_id": created_by_id}) else: # New snapshot - create it - snapshot = Snapshot.from_json(record, overrides={'created_by_id': created_by_id}) + snapshot = Snapshot.from_json(record, overrides={"created_by_id": created_by_id}) if snapshot: snapshot.retry_at = timezone.now() @@ -132,7 +132,7 @@ def process_stdin_records() -> int: crawl.retry_at = timezone.now() if crawl.status != Crawl.StatusChoices.STARTED: crawl.status = Crawl.StatusChoices.QUEUED - crawl.save(update_fields=['status', 'retry_at', 'modified_at']) + crawl.save(update_fields=["status", "retry_at", "modified_at"]) crawl_id = str(snapshot.crawl_id) snapshot_ids_by_crawl[crawl_id].add(str(snapshot.id)) run_all_plugins_for_crawl.add(crawl_id) @@ -149,11 +149,16 @@ def process_stdin_records() -> int: else: archiveresult = None - snapshot_id = record.get('snapshot_id') - plugin_name = record.get('plugin') + snapshot_id = record.get("snapshot_id") + plugin_name = record.get("plugin") snapshot = None if archiveresult: - if archiveresult.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED, ArchiveResult.StatusChoices.NORESULTS, ArchiveResult.StatusChoices.BACKOFF]: + if archiveresult.status in [ + ArchiveResult.StatusChoices.FAILED, + ArchiveResult.StatusChoices.SKIPPED, + ArchiveResult.StatusChoices.NORESULTS, + ArchiveResult.StatusChoices.BACKOFF, + ]: archiveresult.reset_for_retry() snapshot = archiveresult.snapshot plugin_name = plugin_name or archiveresult.plugin @@ -167,12 +172,12 @@ def process_stdin_records() -> int: snapshot.retry_at = timezone.now() if snapshot.status != Snapshot.StatusChoices.STARTED: snapshot.status = Snapshot.StatusChoices.QUEUED - snapshot.save(update_fields=['status', 'retry_at', 'modified_at']) + snapshot.save(update_fields=["status", "retry_at", "modified_at"]) crawl = snapshot.crawl crawl.retry_at = timezone.now() if crawl.status != Crawl.StatusChoices.STARTED: crawl.status = Crawl.StatusChoices.QUEUED - crawl.save(update_fields=['status', 'retry_at', 'modified_at']) + crawl.save(update_fields=["status", "retry_at", "modified_at"]) crawl_id = str(snapshot.crawl_id) snapshot_ids_by_crawl[crawl_id].add(str(snapshot.id)) if plugin_name: @@ -203,7 +208,7 @@ def process_stdin_records() -> int: output_records.append(record) except Exception as e: - rprint(f'[yellow]Error processing record: {e}[/yellow]', file=sys.stderr) + rprint(f"[yellow]Error processing record: {e}[/yellow]", file=sys.stderr) continue # Output all processed records (for chaining) @@ -212,10 +217,10 @@ def process_stdin_records() -> int: write_record(rec) if queued_count == 0: - rprint('[yellow]No records to process[/yellow]', file=sys.stderr) + rprint("[yellow]No records to process[/yellow]", file=sys.stderr) return 0 - rprint(f'[blue]Processing {queued_count} records...[/blue]', file=sys.stderr) + rprint(f"[blue]Processing {queued_count} records...[/blue]", file=sys.stderr) for binary_id in binary_ids: run_binary(binary_id) @@ -245,13 +250,14 @@ def run_runner(daemon: bool = False) -> int: from archivebox.services.runner import recover_orphaned_crawls, recover_orphaned_snapshots, run_pending_crawls Process.cleanup_stale_running() + Process.cleanup_orphaned_workers() recover_orphaned_snapshots() recover_orphaned_crawls() Machine.current() current = Process.current() if current.process_type != Process.TypeChoices.ORCHESTRATOR: current.process_type = Process.TypeChoices.ORCHESTRATOR - current.save(update_fields=['process_type', 'modified_at']) + current.save(update_fields=["process_type", "modified_at"]) try: run_pending_crawls(daemon=daemon) @@ -259,21 +265,21 @@ def run_runner(daemon: bool = False) -> int: except KeyboardInterrupt: return 0 except Exception as e: - rprint(f'[red]Runner error: {type(e).__name__}: {e}[/red]', file=sys.stderr) + rprint(f"[red]Runner error: {type(e).__name__}: {e}[/red]", file=sys.stderr) return 1 finally: current.refresh_from_db() if current.status != Process.StatusChoices.EXITED: current.status = Process.StatusChoices.EXITED current.ended_at = current.ended_at or timezone.now() - current.save(update_fields=['status', 'ended_at', 'modified_at']) + current.save(update_fields=["status", "ended_at", "modified_at"]) @click.command() -@click.option('--daemon', '-d', is_flag=True, help="Run forever (don't exit on idle)") -@click.option('--crawl-id', help="Run the crawl runner for a specific crawl only") -@click.option('--snapshot-id', help="Run one snapshot through its crawl") -@click.option('--binary-id', help="Run one queued binary install directly on the bus") +@click.option("--daemon", "-d", is_flag=True, help="Run forever (don't exit on idle)") +@click.option("--crawl-id", help="Run the crawl runner for a specific crawl only") +@click.option("--snapshot-id", help="Run one snapshot through its crawl") +@click.option("--binary-id", help="Run one queued binary install directly on the bus") def main(daemon: bool, crawl_id: str, snapshot_id: str, binary_id: str): """ Process queued work. @@ -297,21 +303,24 @@ def main(daemon: bool, crawl_id: str, snapshot_id: str, binary_id: str): except KeyboardInterrupt: sys.exit(0) except Exception as e: - rprint(f'[red]Runner error: {type(e).__name__}: {e}[/red]', file=sys.stderr) + rprint(f"[red]Runner error: {type(e).__name__}: {e}[/red]", file=sys.stderr) import traceback + traceback.print_exc() sys.exit(1) if crawl_id: try: from archivebox.services.runner import run_crawl + run_crawl(crawl_id) sys.exit(0) except KeyboardInterrupt: sys.exit(0) except Exception as e: - rprint(f'[red]Runner error: {type(e).__name__}: {e}[/red]', file=sys.stderr) + rprint(f"[red]Runner error: {type(e).__name__}: {e}[/red]", file=sys.stderr) import traceback + traceback.print_exc() sys.exit(1) @@ -333,17 +342,18 @@ def run_snapshot_worker(snapshot_id: str) -> int: from archivebox.services.runner import run_crawl try: - snapshot = Snapshot.objects.select_related('crawl').get(id=snapshot_id) + snapshot = Snapshot.objects.select_related("crawl").get(id=snapshot_id) run_crawl(str(snapshot.crawl_id), snapshot_ids=[str(snapshot.id)]) return 0 except KeyboardInterrupt: return 0 except Exception as e: - rprint(f'[red]Runner error: {type(e).__name__}: {e}[/red]', file=sys.stderr) + rprint(f"[red]Runner error: {type(e).__name__}: {e}[/red]", file=sys.stderr) import traceback + traceback.print_exc() return 1 -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/archivebox/cli/archivebox_schedule.py b/archivebox/cli/archivebox_schedule.py index c420fc6b..bb9c1dac 100644 --- a/archivebox/cli/archivebox_schedule.py +++ b/archivebox/cli/archivebox_schedule.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 -__package__ = 'archivebox.cli' +__package__ = "archivebox.cli" import rich_click as click from rich import print @@ -10,18 +10,20 @@ from archivebox.config.common import ARCHIVING_CONFIG @enforce_types -def schedule(add: bool = False, - show: bool = False, - clear: bool = False, - foreground: bool = False, - run_all: bool = False, - quiet: bool = False, - every: str | None = None, - tag: str = '', - depth: int | str = 0, - overwrite: bool = False, - update: bool = not ARCHIVING_CONFIG.ONLY_NEW, - import_path: str | None = None): +def schedule( + add: bool = False, + show: bool = False, + clear: bool = False, + foreground: bool = False, + run_all: bool = False, + quiet: bool = False, + every: str | None = None, + tag: str = "", + depth: int | str = 0, + overwrite: bool = False, + update: bool = not ARCHIVING_CONFIG.ONLY_NEW, + import_path: str | None = None, +): """Manage database-backed scheduled crawls processed by the crawl runner.""" from django.utils import timezone @@ -33,55 +35,51 @@ def schedule(add: bool = False, depth = int(depth) result: dict[str, object] = { - 'created_schedule_ids': [], - 'disabled_count': 0, - 'run_all_enqueued': 0, - 'active_schedule_ids': [], + "created_schedule_ids": [], + "disabled_count": 0, + "run_all_enqueued": 0, + "active_schedule_ids": [], } def _active_schedules(): - return CrawlSchedule.objects.filter(is_enabled=True).select_related('template').order_by('created_at') + return CrawlSchedule.objects.filter(is_enabled=True).select_related("template").order_by("created_at") if clear: disabled_count = CrawlSchedule.objects.filter(is_enabled=True).update( is_enabled=False, modified_at=timezone.now(), ) - result['disabled_count'] = disabled_count - print(f'[green]\\[√] Disabled {disabled_count} scheduled crawl(s).[/green]') + result["disabled_count"] = disabled_count + print(f"[green]\\[√] Disabled {disabled_count} scheduled crawl(s).[/green]") if every or add: - schedule_str = (every or 'day').strip() + schedule_str = (every or "day").strip() validate_schedule(schedule_str) created_by_id = get_or_create_system_user_pk() is_update_schedule = not import_path - template_urls = import_path or 'archivebox://update' - template_label = ( - f'Scheduled import: {template_urls}' - if import_path else - 'Scheduled ArchiveBox update' - )[:64] + template_urls = import_path or "archivebox://update" + template_label = (f"Scheduled import: {template_urls}" if import_path else "Scheduled ArchiveBox update")[:64] template_notes = ( - f'Created by archivebox schedule for {template_urls}' - if import_path else - 'Created by archivebox schedule to queue recurring archivebox://update maintenance crawls.' + f"Created by archivebox schedule for {template_urls}" + if import_path + else "Created by archivebox schedule to queue recurring archivebox://update maintenance crawls." ) template = Crawl.objects.create( urls=template_urls, max_depth=0 if is_update_schedule else depth, - tags_str='' if is_update_schedule else tag, + tags_str="" if is_update_schedule else tag, label=template_label, notes=template_notes, created_by_id=created_by_id, status=Crawl.StatusChoices.SEALED, retry_at=None, config={ - 'ONLY_NEW': not update, - 'OVERWRITE': overwrite, - 'DEPTH': 0 if is_update_schedule else depth, - 'SCHEDULE_KIND': 'update' if is_update_schedule else 'crawl', + "ONLY_NEW": not update, + "OVERWRITE": overwrite, + "DEPTH": 0 if is_update_schedule else depth, + "SCHEDULE_KIND": "update" if is_update_schedule else "crawl", }, ) crawl_schedule = CrawlSchedule.objects.create( @@ -92,31 +90,31 @@ def schedule(add: bool = False, notes=template_notes, created_by_id=created_by_id, ) - result['created_schedule_ids'] = [str(crawl_schedule.id)] + result["created_schedule_ids"] = [str(crawl_schedule.id)] - schedule_type = 'maintenance update' if is_update_schedule else 'crawl' - print(f'[green]\\[√] Created scheduled {schedule_type}.[/green]') - print(f' id={crawl_schedule.id}') - print(f' every={crawl_schedule.schedule}') - print(f' next_run={crawl_schedule.next_run_at.isoformat()}') + schedule_type = "maintenance update" if is_update_schedule else "crawl" + print(f"[green]\\[√] Created scheduled {schedule_type}.[/green]") + print(f" id={crawl_schedule.id}") + print(f" every={crawl_schedule.schedule}") + print(f" next_run={crawl_schedule.next_run_at.isoformat()}") if import_path: - print(f' source={import_path}') + print(f" source={import_path}") schedules = list(_active_schedules()) - result['active_schedule_ids'] = [str(schedule.id) for schedule in schedules] + result["active_schedule_ids"] = [str(schedule.id) for schedule in schedules] if show: if schedules: - print(f'[green]\\[*] Active scheduled crawls: {len(schedules)}[/green]') + print(f"[green]\\[*] Active scheduled crawls: {len(schedules)}[/green]") for scheduled_crawl in schedules: template = scheduled_crawl.template print( - f' - id={scheduled_crawl.id} every={scheduled_crawl.schedule} ' - f'next_run={scheduled_crawl.next_run_at.isoformat()} ' - f'source={template.urls.splitlines()[0] if template.urls else ""}' + f" - id={scheduled_crawl.id} every={scheduled_crawl.schedule} " + f"next_run={scheduled_crawl.next_run_at.isoformat()} " + f"source={template.urls.splitlines()[0] if template.urls else ''}", ) else: - print('[yellow]\\[*] No scheduled crawls are enabled.[/yellow]') + print("[yellow]\\[*] No scheduled crawls are enabled.[/yellow]") if run_all: enqueued = 0 @@ -124,13 +122,17 @@ def schedule(add: bool = False, for scheduled_crawl in schedules: scheduled_crawl.enqueue(queued_at=now) enqueued += 1 - result['run_all_enqueued'] = enqueued - print(f'[green]\\[*] Enqueued {enqueued} scheduled crawl(s) immediately.[/green]') + result["run_all_enqueued"] = enqueued + print(f"[green]\\[*] Enqueued {enqueued} scheduled crawl(s) immediately.[/green]") if enqueued: - print('[yellow]\\[*] Start `archivebox server`, `archivebox run --daemon`, or `archivebox schedule --foreground` to process the queued crawls.[/yellow]') + print( + "[yellow]\\[*] Start `archivebox server`, `archivebox run --daemon`, or `archivebox schedule --foreground` to process the queued crawls.[/yellow]", + ) if foreground: - print('[green]\\[*] Starting global crawl runner in foreground mode. It will materialize scheduled crawls and process queued work.[/green]') + print( + "[green]\\[*] Starting global crawl runner in foreground mode. It will materialize scheduled crawls and process queued work.[/green]", + ) run_pending_crawls(daemon=True) if quiet: @@ -138,33 +140,38 @@ def schedule(add: bool = False, if not any((every, add, show, clear, foreground, run_all)): if schedules: - print('[green]\\[*] Active scheduled crawls:[/green]') + print("[green]\\[*] Active scheduled crawls:[/green]") for scheduled_crawl in schedules: - print(f' - {scheduled_crawl.id} every={scheduled_crawl.schedule} next_run={scheduled_crawl.next_run_at.isoformat()}') + print(f" - {scheduled_crawl.id} every={scheduled_crawl.schedule} next_run={scheduled_crawl.next_run_at.isoformat()}") else: - print('[yellow]\\[*] No scheduled crawls are enabled.[/yellow]') + print("[yellow]\\[*] No scheduled crawls are enabled.[/yellow]") return result @click.command() -@click.option('--quiet', '-q', is_flag=True, help="Return structured results without extra summary output") -@click.option('--add', is_flag=True, help='Create a new scheduled crawl') -@click.option('--every', type=str, help='Run on an alias like daily/weekly/monthly or a cron expression such as "0 */6 * * *"') -@click.option('--tag', '-t', default='', help='Comma-separated tags to apply to scheduled crawl snapshots') -@click.option('--depth', type=click.Choice([str(i) for i in range(5)]), default='0', help='Recursively archive linked pages up to N hops away') -@click.option('--overwrite', is_flag=True, help='Overwrite existing data if URLs have been archived previously') -@click.option('--update', is_flag=True, help='Retry previously failed/skipped URLs when scheduled crawls run') -@click.option('--clear', is_flag=True, help='Disable all currently enabled schedules') -@click.option('--show', is_flag=True, help='Print all currently enabled schedules') -@click.option('--foreground', '-f', is_flag=True, help='Run the global crawl runner in the foreground (no crontab required)') -@click.option('--run-all', is_flag=True, help='Enqueue all enabled schedules immediately and process them once') -@click.argument('import_path', required=False) +@click.option("--quiet", "-q", is_flag=True, help="Return structured results without extra summary output") +@click.option("--add", is_flag=True, help="Create a new scheduled crawl") +@click.option("--every", type=str, help='Run on an alias like daily/weekly/monthly or a cron expression such as "0 */6 * * *"') +@click.option("--tag", "-t", default="", help="Comma-separated tags to apply to scheduled crawl snapshots") +@click.option( + "--depth", + type=click.Choice([str(i) for i in range(5)]), + default="0", + help="Recursively archive linked pages up to N hops away", +) +@click.option("--overwrite", is_flag=True, help="Overwrite existing data if URLs have been archived previously") +@click.option("--update", is_flag=True, help="Retry previously failed/skipped URLs when scheduled crawls run") +@click.option("--clear", is_flag=True, help="Disable all currently enabled schedules") +@click.option("--show", is_flag=True, help="Print all currently enabled schedules") +@click.option("--foreground", "-f", is_flag=True, help="Run the global crawl runner in the foreground (no crontab required)") +@click.option("--run-all", is_flag=True, help="Enqueue all enabled schedules immediately and process them once") +@click.argument("import_path", required=False) @docstring(schedule.__doc__) def main(**kwargs): """Manage database-backed scheduled crawls processed by the crawl runner.""" schedule(**kwargs) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/archivebox/cli/archivebox_search.py b/archivebox/cli/archivebox_search.py index 1b5ab259..86af83bb 100644 --- a/archivebox/cli/archivebox_search.py +++ b/archivebox/cli/archivebox_search.py @@ -1,11 +1,12 @@ #!/usr/bin/env python3 -__package__ = 'archivebox.cli' -__command__ = 'archivebox search' +__package__ = "archivebox.cli" +__command__ = "archivebox search" import sys from pathlib import Path -from typing import TYPE_CHECKING, Callable +from typing import TYPE_CHECKING +from collections.abc import Callable import rich_click as click @@ -20,30 +21,28 @@ if TYPE_CHECKING: # Filter types for URL matching LINK_FILTERS: dict[str, Callable[[str], Q]] = { - 'exact': lambda pattern: Q(url=pattern), - 'substring': lambda pattern: Q(url__icontains=pattern), - 'regex': lambda pattern: Q(url__iregex=pattern), - 'domain': lambda pattern: ( - Q(url__istartswith=f'http://{pattern}') - | Q(url__istartswith=f'https://{pattern}') - | Q(url__istartswith=f'ftp://{pattern}') + "exact": lambda pattern: Q(url=pattern), + "substring": lambda pattern: Q(url__icontains=pattern), + "regex": lambda pattern: Q(url__iregex=pattern), + "domain": lambda pattern: ( + Q(url__istartswith=f"http://{pattern}") | Q(url__istartswith=f"https://{pattern}") | Q(url__istartswith=f"ftp://{pattern}") ), - 'tag': lambda pattern: Q(tags__name=pattern), - 'timestamp': lambda pattern: Q(timestamp=pattern), + "tag": lambda pattern: Q(tags__name=pattern), + "timestamp": lambda pattern: Q(timestamp=pattern), } -STATUS_CHOICES = ['indexed', 'archived', 'unarchived'] +STATUS_CHOICES = ["indexed", "archived", "unarchived"] def _apply_pattern_filters( - snapshots: QuerySet['Snapshot', 'Snapshot'], + snapshots: QuerySet["Snapshot", "Snapshot"], filter_patterns: list[str], filter_type: str, -) -> QuerySet['Snapshot', 'Snapshot']: +) -> QuerySet["Snapshot", "Snapshot"]: filter_builder = LINK_FILTERS.get(filter_type) if filter_builder is None: stderr() - stderr(f'[X] Got invalid pattern for --filter-type={filter_type}', color='red') + stderr(f"[X] Got invalid pattern for --filter-type={filter_type}", color="red") raise SystemExit(2) query = Q() @@ -53,7 +52,7 @@ def _apply_pattern_filters( def _snapshots_to_json( - snapshots: QuerySet['Snapshot', 'Snapshot'], + snapshots: QuerySet["Snapshot", "Snapshot"], *, with_headers: bool, ) -> str: @@ -63,31 +62,35 @@ def _snapshots_to_json( from archivebox.config.common import SERVER_CONFIG from archivebox.misc.util import to_json - main_index_header = { - 'info': 'This is an index of site data archived by ArchiveBox: The self-hosted web archive.', - 'schema': 'archivebox.index.json', - 'copyright_info': SERVER_CONFIG.FOOTER_INFO, - 'meta': { - 'project': 'ArchiveBox', - 'version': VERSION, - 'git_sha': VERSION, - 'website': 'https://ArchiveBox.io', - 'docs': 'https://github.com/ArchiveBox/ArchiveBox/wiki', - 'source': 'https://github.com/ArchiveBox/ArchiveBox', - 'issues': 'https://github.com/ArchiveBox/ArchiveBox/issues', - 'dependencies': {}, - }, - } if with_headers else {} + main_index_header = ( + { + "info": "This is an index of site data archived by ArchiveBox: The self-hosted web archive.", + "schema": "archivebox.index.json", + "copyright_info": SERVER_CONFIG.FOOTER_INFO, + "meta": { + "project": "ArchiveBox", + "version": VERSION, + "git_sha": VERSION, + "website": "https://ArchiveBox.io", + "docs": "https://github.com/ArchiveBox/ArchiveBox/wiki", + "source": "https://github.com/ArchiveBox/ArchiveBox", + "issues": "https://github.com/ArchiveBox/ArchiveBox/issues", + "dependencies": {}, + }, + } + if with_headers + else {} + ) snapshot_dicts = [snapshot.to_dict(extended=True) for snapshot in snapshots.iterator(chunk_size=500)] output: dict[str, object] | list[dict[str, object]] if with_headers: output = { **main_index_header, - 'num_links': len(snapshot_dicts), - 'updated': datetime.now(tz.utc), - 'last_run_cmd': sys.argv, - 'links': snapshot_dicts, + "num_links": len(snapshot_dicts), + "updated": datetime.now(tz.utc), + "last_run_cmd": sys.argv, + "links": snapshot_dicts, } else: output = snapshot_dicts @@ -96,18 +99,18 @@ def _snapshots_to_json( def _snapshots_to_csv( - snapshots: QuerySet['Snapshot', 'Snapshot'], + snapshots: QuerySet["Snapshot", "Snapshot"], *, cols: list[str], with_headers: bool, ) -> str: - header = ','.join(cols) if with_headers else '' - rows = [snapshot.to_csv(cols=cols, separator=',') for snapshot in snapshots.iterator(chunk_size=500)] - return '\n'.join((header, *rows)) + header = ",".join(cols) if with_headers else "" + rows = [snapshot.to_csv(cols=cols, separator=",") for snapshot in snapshots.iterator(chunk_size=500)] + return "\n".join((header, *rows)) def _snapshots_to_html( - snapshots: QuerySet['Snapshot', 'Snapshot'], + snapshots: QuerySet["Snapshot", "Snapshot"], *, with_headers: bool, ) -> str: @@ -119,26 +122,31 @@ def _snapshots_to_html( from archivebox.config.common import SERVER_CONFIG from archivebox.config.version import get_COMMIT_HASH - template = 'static_index.html' if with_headers else 'minimal_index.html' + template = "static_index.html" if with_headers else "minimal_index.html" snapshot_list = list(snapshots.iterator(chunk_size=500)) - return render_to_string(template, { - 'version': VERSION, - 'git_sha': get_COMMIT_HASH() or VERSION, - 'num_links': str(len(snapshot_list)), - 'date_updated': datetime.now(tz.utc).strftime('%Y-%m-%d'), - 'time_updated': datetime.now(tz.utc).strftime('%Y-%m-%d %H:%M'), - 'links': snapshot_list, - 'FOOTER_INFO': SERVER_CONFIG.FOOTER_INFO, - }) + return render_to_string( + template, + { + "version": VERSION, + "git_sha": get_COMMIT_HASH() or VERSION, + "num_links": str(len(snapshot_list)), + "date_updated": datetime.now(tz.utc).strftime("%Y-%m-%d"), + "time_updated": datetime.now(tz.utc).strftime("%Y-%m-%d %H:%M"), + "links": snapshot_list, + "FOOTER_INFO": SERVER_CONFIG.FOOTER_INFO, + }, + ) -def get_snapshots(snapshots: QuerySet['Snapshot', 'Snapshot'] | None=None, - filter_patterns: list[str] | None=None, - filter_type: str='substring', - after: float | None=None, - before: float | None=None, - out_dir: Path=DATA_DIR) -> QuerySet['Snapshot', 'Snapshot']: +def get_snapshots( + snapshots: QuerySet["Snapshot", "Snapshot"] | None = None, + filter_patterns: list[str] | None = None, + filter_type: str = "substring", + after: float | None = None, + before: float | None = None, + out_dir: Path = DATA_DIR, +) -> QuerySet["Snapshot", "Snapshot"]: """Filter and return Snapshots matching the given criteria.""" from archivebox.core.models import Snapshot @@ -155,29 +163,31 @@ def get_snapshots(snapshots: QuerySet['Snapshot', 'Snapshot'] | None=None, result = _apply_pattern_filters(result, filter_patterns, filter_type) # Prefetch crawl relationship to avoid N+1 queries when accessing output_dir - result = result.select_related('crawl', 'crawl__created_by') + result = result.select_related("crawl", "crawl__created_by") if not result.exists(): - stderr('[!] No Snapshots matched your filters:', filter_patterns, f'({filter_type})', color='lightyellow') + stderr("[!] No Snapshots matched your filters:", filter_patterns, f"({filter_type})", color="lightyellow") return result @enforce_types -def search(filter_patterns: list[str] | None=None, - filter_type: str='substring', - status: str='indexed', - before: float | None=None, - after: float | None=None, - sort: str | None=None, - json: bool=False, - html: bool=False, - csv: str | None=None, - with_headers: bool=False): +def search( + filter_patterns: list[str] | None = None, + filter_type: str = "substring", + status: str = "indexed", + before: float | None = None, + after: float | None = None, + sort: str | None = None, + json: bool = False, + html: bool = False, + csv: str | None = None, + with_headers: bool = False, +): """List, filter, and export information about archive entries""" if with_headers and not (json or html or csv): - stderr('[X] --with-headers requires --json, --html or --csv\n', color='red') + stderr("[X] --with-headers requires --json, --html or --csv\n", color="red") raise SystemExit(2) # Query DB directly - no filesystem scanning @@ -189,9 +199,9 @@ def search(filter_patterns: list[str] | None=None, ) # Apply status filter - if status == 'archived': + if status == "archived": snapshots = snapshots.filter(downloaded_at__isnull=False) - elif status == 'unarchived': + elif status == "unarchived": snapshots = snapshots.filter(downloaded_at__isnull=True) # 'indexed' = all snapshots (no filter) @@ -204,9 +214,10 @@ def search(filter_patterns: list[str] | None=None, elif html: output = _snapshots_to_html(snapshots, with_headers=with_headers) elif csv: - output = _snapshots_to_csv(snapshots, cols=csv.split(','), with_headers=with_headers) + output = _snapshots_to_csv(snapshots, cols=csv.split(","), with_headers=with_headers) else: from archivebox.misc.logging_util import printable_folders + # Convert to dict for printable_folders folders: dict[str, Snapshot | None] = {str(snapshot.output_dir): snapshot for snapshot in snapshots} output = printable_folders(folders, with_headers) @@ -214,28 +225,33 @@ def search(filter_patterns: list[str] | None=None, # Structured exports must be written directly to stdout. # rich.print() reflows long lines to console width, which corrupts JSON/CSV/HTML output. sys.stdout.write(output) - if not output.endswith('\n'): - sys.stdout.write('\n') + if not output.endswith("\n"): + sys.stdout.write("\n") return output @click.command() -@click.option('--filter-type', '-f', type=click.Choice(['search', *LINK_FILTERS.keys()]), default='substring', help='Pattern matching type for filtering URLs') -@click.option('--status', '-s', type=click.Choice(STATUS_CHOICES), default='indexed', help='List snapshots with the given status') -@click.option('--before', '-b', type=float, help='List snapshots bookmarked before the given UNIX timestamp') -@click.option('--after', '-a', type=float, help='List snapshots bookmarked after the given UNIX timestamp') -@click.option('--sort', '-o', type=str, help='Field to sort by, e.g. url, created_at, bookmarked_at, downloaded_at') -@click.option('--json', '-J', is_flag=True, help='Print output in JSON format') -@click.option('--html', '-M', is_flag=True, help='Print output in HTML format (suitable for viewing statically without a server)') -@click.option('--csv', '-C', type=str, help='Print output as CSV with the provided fields, e.g.: created_at,url,title') -@click.option('--with-headers', '-H', is_flag=True, help='Include extra CSV/HTML headers in the output') -@click.help_option('--help', '-h') -@click.argument('filter_patterns', nargs=-1) +@click.option( + "--filter-type", + "-f", + type=click.Choice(["search", *LINK_FILTERS.keys()]), + default="substring", + help="Pattern matching type for filtering URLs", +) +@click.option("--status", "-s", type=click.Choice(STATUS_CHOICES), default="indexed", help="List snapshots with the given status") +@click.option("--before", "-b", type=float, help="List snapshots bookmarked before the given UNIX timestamp") +@click.option("--after", "-a", type=float, help="List snapshots bookmarked after the given UNIX timestamp") +@click.option("--sort", "-o", type=str, help="Field to sort by, e.g. url, created_at, bookmarked_at, downloaded_at") +@click.option("--json", "-J", is_flag=True, help="Print output in JSON format") +@click.option("--html", "-M", is_flag=True, help="Print output in HTML format (suitable for viewing statically without a server)") +@click.option("--csv", "-C", type=str, help="Print output as CSV with the provided fields, e.g.: created_at,url,title") +@click.option("--with-headers", "-H", is_flag=True, help="Include extra CSV/HTML headers in the output") +@click.help_option("--help", "-h") +@click.argument("filter_patterns", nargs=-1) @docstring(search.__doc__) def main(**kwargs): return search(**kwargs) - -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/archivebox/cli/archivebox_server.py b/archivebox/cli/archivebox_server.py index cbd7a9ce..861ce775 100644 --- a/archivebox/cli/archivebox_server.py +++ b/archivebox/cli/archivebox_server.py @@ -1,8 +1,8 @@ #!/usr/bin/env python3 -__package__ = 'archivebox.cli' +__package__ = "archivebox.cli" -from typing import Iterable +from collections.abc import Iterable import sys import rich_click as click @@ -15,20 +15,23 @@ from archivebox.config.common import SERVER_CONFIG def stop_existing_background_runner(*, machine, process_model, supervisor=None, stop_worker_fn=None, log=print) -> int: """Stop any existing orchestrator process so the server can take ownership.""" process_model.cleanup_stale_running(machine=machine) + process_model.cleanup_orphaned_workers() - running_runners = list(process_model.objects.filter( - machine=machine, - status=process_model.StatusChoices.RUNNING, - process_type=process_model.TypeChoices.ORCHESTRATOR, - ).order_by('created_at')) + running_runners = list( + process_model.objects.filter( + machine=machine, + status=process_model.StatusChoices.RUNNING, + process_type=process_model.TypeChoices.ORCHESTRATOR, + ).order_by("created_at"), + ) if not running_runners: return 0 - log('[yellow][*] Stopping existing ArchiveBox background runner...[/yellow]') + log("[yellow][*] Stopping existing ArchiveBox background runner...[/yellow]") if supervisor is not None and stop_worker_fn is not None: - for worker_name in ('worker_runner', 'worker_runner_watch'): + for worker_name in ("worker_runner", "worker_runner_watch"): try: stop_worker_fn(supervisor, worker_name) except Exception: @@ -47,23 +50,70 @@ def stop_existing_background_runner(*, machine, process_model, supervisor=None, return len(running_runners) +def _read_supervisor_worker_command(worker_name: str) -> str: + from archivebox.workers.supervisord_util import WORKERS_DIR_NAME, get_sock_file + + worker_conf = get_sock_file().parent / WORKERS_DIR_NAME / f"{worker_name}.conf" + if not worker_conf.exists(): + return "" + + for line in worker_conf.read_text().splitlines(): + if line.startswith("command="): + return line.removeprefix("command=").strip() + return "" + + +def _worker_command_matches_bind(command: str, host: str, port: str) -> bool: + if not command: + return False + return f"{host}:{port}" in command or (f"--bind={host}" in command and f"--port={port}" in command) + + +def stop_existing_server_workers(*, supervisor, stop_worker_fn, host: str, port: str, log=print) -> int: + """Stop existing ArchiveBox web workers if they already own the requested bind.""" + stopped = 0 + + for worker_name in ("worker_runserver", "worker_daphne"): + try: + proc = supervisor.getProcessInfo(worker_name) if supervisor else None + except Exception: + proc = None + if not isinstance(proc, dict) or proc.get("statename") != "RUNNING": + continue + + command = _read_supervisor_worker_command(worker_name) + if not _worker_command_matches_bind(command, host, port): + continue + + if stopped == 0: + log("[yellow][*] Taking over existing ArchiveBox web server on same port...[/yellow]") + stop_worker_fn(supervisor, worker_name) + stopped += 1 + + return stopped + + @enforce_types -def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,), - reload: bool=False, - init: bool=False, - debug: bool=False, - daemonize: bool=False, - nothreading: bool=False) -> None: +def server( + runserver_args: Iterable[str] = (SERVER_CONFIG.BIND_ADDR,), + reload: bool = False, + init: bool = False, + debug: bool = False, + daemonize: bool = False, + nothreading: bool = False, +) -> None: """Run the ArchiveBox HTTP server""" runserver_args = list(runserver_args) - + if init: from archivebox.cli.archivebox_init import init as archivebox_init + archivebox_init(quick=True) print() from archivebox.misc.checks import check_data_folder + check_data_folder() from archivebox.config.common import SHELL_CONFIG @@ -73,22 +123,24 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,), SHELL_CONFIG.DEBUG = True from django.contrib.auth.models import User - - if not User.objects.filter(is_superuser=True).exclude(username='system').exists(): + + if not User.objects.filter(is_superuser=True).exclude(username="system").exists(): print() - print('[violet]Hint:[/violet] To create an [bold]admin username & password[/bold] for the [deep_sky_blue3][underline][link=http://{host}:{port}/admin]Admin UI[/link][/underline][/deep_sky_blue3], run:') - print(' [green]archivebox manage createsuperuser[/green]') + print( + "[violet]Hint:[/violet] To create an [bold]admin username & password[/bold] for the [deep_sky_blue3][underline][link=http://{host}:{port}/admin]Admin UI[/link][/underline][/deep_sky_blue3], run:", + ) + print(" [green]archivebox manage createsuperuser[/green]") print() - host = '127.0.0.1' - port = '8000' - + host = "127.0.0.1" + port = "8000" + try: - host_and_port = [arg for arg in runserver_args if arg.replace('.', '').replace(':', '').isdigit()][0] - if ':' in host_and_port: - host, port = host_and_port.split(':') + host_and_port = [arg for arg in runserver_args if arg.replace(".", "").replace(":", "").isdigit()][0] + if ":" in host_and_port: + host, port = host_and_port.split(":") else: - if '.' in host_and_port: + if "." in host_and_port: host = host_and_port else: port = host_and_port @@ -104,66 +156,80 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,), ) from archivebox.machine.models import Machine, Process - # Check if port is already in use - if is_port_in_use(host, int(port)): - print(f'[red][X] Error: Port {port} is already in use[/red]') - print(f' Another process (possibly daphne or runserver) is already listening on {host}:{port}') - print(' Stop the conflicting process or choose a different port') - sys.exit(1) - machine = Machine.current() + supervisor = get_existing_supervisord_process() stop_existing_background_runner( machine=machine, process_model=Process, - supervisor=get_existing_supervisord_process(), + supervisor=supervisor, stop_worker_fn=stop_worker, ) + if supervisor: + stop_existing_server_workers( + supervisor=supervisor, + stop_worker_fn=stop_worker, + host=host, + port=port, + ) + + # Check if port is already in use + if is_port_in_use(host, int(port)): + print(f"[red][X] Error: Port {port} is already in use[/red]") + print(f" Another process (possibly daphne or runserver) is already listening on {host}:{port}") + print(" Stop the conflicting process or choose a different port") + sys.exit(1) supervisor = get_existing_supervisord_process() if supervisor: - server_worker_name = 'worker_runserver' if run_in_debug else 'worker_daphne' + server_worker_name = "worker_runserver" if run_in_debug else "worker_daphne" server_proc = get_worker(supervisor, server_worker_name) - server_state = server_proc.get('statename') if isinstance(server_proc, dict) else None - if server_state == 'RUNNING': - runner_proc = get_worker(supervisor, 'worker_runner') - runner_watch_proc = get_worker(supervisor, 'worker_runner_watch') - runner_state = runner_proc.get('statename') if isinstance(runner_proc, dict) else None - runner_watch_state = runner_watch_proc.get('statename') if isinstance(runner_watch_proc, dict) else None - print('[red][X] Error: ArchiveBox server is already running[/red]') - print(f' [green]√[/green] Web server ({server_worker_name}) is RUNNING on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]') - if runner_state == 'RUNNING': - print(' [green]√[/green] Background runner (worker_runner) is RUNNING') - if runner_watch_state == 'RUNNING': - print(' [green]√[/green] Reload watcher (worker_runner_watch) is RUNNING') + server_state = server_proc.get("statename") if isinstance(server_proc, dict) else None + if server_state == "RUNNING": + runner_proc = get_worker(supervisor, "worker_runner") + runner_watch_proc = get_worker(supervisor, "worker_runner_watch") + runner_state = runner_proc.get("statename") if isinstance(runner_proc, dict) else None + runner_watch_state = runner_watch_proc.get("statename") if isinstance(runner_watch_proc, dict) else None + print("[red][X] Error: ArchiveBox server is already running[/red]") + print( + f" [green]√[/green] Web server ({server_worker_name}) is RUNNING on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]", + ) + if runner_state == "RUNNING": + print(" [green]√[/green] Background runner (worker_runner) is RUNNING") + if runner_watch_state == "RUNNING": + print(" [green]√[/green] Reload watcher (worker_runner_watch) is RUNNING") print() - print('[yellow]To stop the existing server, run:[/yellow]') + print("[yellow]To stop the existing server, run:[/yellow]") print(' pkill -f "archivebox server"') - print(' pkill -f supervisord') + print(" pkill -f supervisord") sys.exit(1) if run_in_debug: - print('[green][+] Starting ArchiveBox webserver in DEBUG mode...[/green]') + print("[green][+] Starting ArchiveBox webserver in DEBUG mode...[/green]") else: - print('[green][+] Starting ArchiveBox webserver...[/green]') - print(f' [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]') - print(f' [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]') - print(' > Writing ArchiveBox error log to ./logs/errors.log') + print("[green][+] Starting ArchiveBox webserver...[/green]") + print( + f" [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]", + ) + print( + f" [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]", + ) + print(" > Writing ArchiveBox error log to ./logs/errors.log") print() start_server_workers(host=host, port=port, daemonize=daemonize, debug=run_in_debug, reload=reload, nothreading=nothreading) print("\n[i][green][🟩] ArchiveBox server shut down gracefully.[/green][/i]") @click.command() -@click.argument('runserver_args', nargs=-1) -@click.option('--reload', is_flag=True, help='Enable auto-reloading when code or templates change') -@click.option('--debug', is_flag=True, help='Enable DEBUG=True mode with more verbose errors') -@click.option('--nothreading', is_flag=True, help='Force runserver to run in single-threaded mode') -@click.option('--init', is_flag=True, help='Run a full archivebox init/upgrade before starting the server') -@click.option('--daemonize', is_flag=True, help='Run the server in the background as a daemon') +@click.argument("runserver_args", nargs=-1) +@click.option("--reload", is_flag=True, help="Enable auto-reloading when code or templates change") +@click.option("--debug", is_flag=True, help="Enable DEBUG=True mode with more verbose errors") +@click.option("--nothreading", is_flag=True, help="Force runserver to run in single-threaded mode") +@click.option("--init", is_flag=True, help="Run a full archivebox init/upgrade before starting the server") +@click.option("--daemonize", is_flag=True, help="Run the server in the background as a daemon") @docstring(server.__doc__) def main(**kwargs): server(**kwargs) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/archivebox/cli/archivebox_shell.py b/archivebox/cli/archivebox_shell.py index d51e8aba..26943d24 100644 --- a/archivebox/cli/archivebox_shell.py +++ b/archivebox/cli/archivebox_shell.py @@ -1,27 +1,28 @@ #!/usr/bin/env python3 -__package__ = 'archivebox.cli' +__package__ = "archivebox.cli" -from typing import Iterable +from collections.abc import Iterable import rich_click as click from archivebox.misc.util import docstring -def shell(args: Iterable[str]=()) -> None: +def shell(args: Iterable[str] = ()) -> None: """Enter an interactive ArchiveBox Django shell""" from django.core.management import call_command + call_command("shell_plus", *args) @click.command(add_help_option=False, context_settings=dict(ignore_unknown_options=True)) -@click.argument('args', nargs=-1) +@click.argument("args", nargs=-1) @docstring(shell.__doc__) -def main(args: Iterable[str]=()) -> None: +def main(args: Iterable[str] = ()) -> None: shell(args=args) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/archivebox/cli/archivebox_snapshot.py b/archivebox/cli/archivebox_snapshot.py index ae65fdab..e9126549 100644 --- a/archivebox/cli/archivebox_snapshot.py +++ b/archivebox/cli/archivebox_snapshot.py @@ -27,14 +27,16 @@ Examples: archivebox snapshot list --url__icontains=spam.com | archivebox snapshot delete --yes """ -__package__ = 'archivebox.cli' -__command__ = 'archivebox snapshot' +__package__ = "archivebox.cli" +__command__ = "archivebox snapshot" import sys -from typing import Optional, Iterable +from collections.abc import Iterable import rich_click as click from rich import print as rprint +from django.db.models import Q, Sum +from django.db.models.functions import Coalesce from archivebox.cli.cli_utils import apply_filters @@ -43,12 +45,13 @@ from archivebox.cli.cli_utils import apply_filters # CREATE # ============================================================================= + def create_snapshots( urls: Iterable[str], - tag: str = '', - status: str = 'queued', + tag: str = "", + status: str = "queued", depth: int = 0, - created_by_id: Optional[int] = None, + created_by_id: int | None = None, ) -> int: """ Create Snapshots from URLs or stdin JSONL (Crawl or Snapshot records). @@ -59,8 +62,10 @@ def create_snapshots( 1: Failure """ from archivebox.misc.jsonl import ( - read_args_or_stdin, write_record, - TYPE_SNAPSHOT, TYPE_CRAWL + read_args_or_stdin, + write_record, + TYPE_SNAPSHOT, + TYPE_CRAWL, ) from archivebox.base_models.models import get_or_create_system_user_pk from archivebox.core.models import Snapshot @@ -73,7 +78,7 @@ def create_snapshots( records = list(read_args_or_stdin(urls)) if not records: - rprint('[yellow]No URLs or Crawls provided. Pass URLs as arguments or via stdin.[/yellow]', file=sys.stderr) + rprint("[yellow]No URLs or Crawls provided. Pass URLs as arguments or via stdin.[/yellow]", file=sys.stderr) return 1 # Process each record - handle Crawls and plain URLs/Snapshots @@ -81,7 +86,7 @@ def create_snapshots( pass_through_count = 0 for record in records: - record_type = record.get('type', '') + record_type = record.get("type", "") try: if record_type == TYPE_CRAWL: @@ -91,14 +96,14 @@ def create_snapshots( # Input is a Crawl - get or create it, then create Snapshots for its URLs crawl = None - crawl_id = record.get('id') + crawl_id = record.get("id") if crawl_id: try: crawl = Crawl.objects.get(id=crawl_id) except Crawl.DoesNotExist: - crawl = Crawl.from_json(record, overrides={'created_by_id': created_by_id}) + crawl = Crawl.from_json(record, overrides={"created_by_id": created_by_id}) else: - crawl = Crawl.from_json(record, overrides={'created_by_id': created_by_id}) + crawl = Crawl.from_json(record, overrides={"created_by_id": created_by_id}) if not crawl: continue @@ -109,27 +114,27 @@ def create_snapshots( if tag: merged_tags = f"{merged_tags},{tag}" if merged_tags else tag snapshot_record = { - 'url': url, - 'tags': merged_tags, - 'crawl_id': str(crawl.id), - 'depth': depth, - 'status': status, + "url": url, + "tags": merged_tags, + "crawl_id": str(crawl.id), + "depth": depth, + "status": status, } - snapshot = Snapshot.from_json(snapshot_record, overrides={'created_by_id': created_by_id}) + snapshot = Snapshot.from_json(snapshot_record, overrides={"created_by_id": created_by_id}) if snapshot: created_snapshots.append(snapshot) if not is_tty: write_record(snapshot.to_json()) - elif record_type == TYPE_SNAPSHOT or record.get('url'): + elif record_type == TYPE_SNAPSHOT or record.get("url"): # Input is a Snapshot or plain URL - if tag and not record.get('tags'): - record['tags'] = tag + if tag and not record.get("tags"): + record["tags"] = tag if status: - record['status'] = status - record['depth'] = record.get('depth', depth) + record["status"] = status + record["depth"] = record.get("depth", depth) - snapshot = Snapshot.from_json(record, overrides={'created_by_id': created_by_id}) + snapshot = Snapshot.from_json(record, overrides={"created_by_id": created_by_id}) if snapshot: created_snapshots.append(snapshot) if not is_tty: @@ -142,21 +147,21 @@ def create_snapshots( pass_through_count += 1 except Exception as e: - rprint(f'[red]Error creating snapshot: {e}[/red]', file=sys.stderr) + rprint(f"[red]Error creating snapshot: {e}[/red]", file=sys.stderr) continue if not created_snapshots: if pass_through_count > 0: - rprint(f'[dim]Passed through {pass_through_count} records, no new snapshots[/dim]', file=sys.stderr) + rprint(f"[dim]Passed through {pass_through_count} records, no new snapshots[/dim]", file=sys.stderr) return 0 - rprint('[red]No snapshots created[/red]', file=sys.stderr) + rprint("[red]No snapshots created[/red]", file=sys.stderr) return 1 - rprint(f'[green]Created {len(created_snapshots)} snapshots[/green]', file=sys.stderr) + rprint(f"[green]Created {len(created_snapshots)} snapshots[/green]", file=sys.stderr) if is_tty: for snapshot in created_snapshots: - rprint(f' [dim]{snapshot.id}[/dim] {snapshot.url[:60]}', file=sys.stderr) + rprint(f" [dim]{snapshot.id}[/dim] {snapshot.url[:60]}", file=sys.stderr) return 0 @@ -165,16 +170,19 @@ def create_snapshots( # LIST # ============================================================================= + def list_snapshots( - status: Optional[str] = None, - url__icontains: Optional[str] = None, - url__istartswith: Optional[str] = None, - tag: Optional[str] = None, - crawl_id: Optional[str] = None, - limit: Optional[int] = None, - sort: Optional[str] = None, - csv: Optional[str] = None, + status: str | None = None, + url__icontains: str | None = None, + url__istartswith: str | None = None, + tag: str | None = None, + crawl_id: str | None = None, + limit: int | None = None, + sort: str | None = None, + csv: str | None = None, with_headers: bool = False, + search: str | None = None, + query: str | None = None, ) -> int: """ List Snapshots as JSONL with optional filters. @@ -184,64 +192,106 @@ def list_snapshots( """ from archivebox.misc.jsonl import write_record from archivebox.core.models import Snapshot + from archivebox.search import ( + get_default_search_mode, + get_search_mode, + prioritize_metadata_matches, + query_search_index, + ) if with_headers and not csv: - rprint('[red]--with-headers requires --csv[/red]', file=sys.stderr) + rprint("[red]--with-headers requires --csv[/red]", file=sys.stderr) return 2 is_tty = sys.stdout.isatty() and not csv - queryset = Snapshot.objects.all().order_by('-created_at') + queryset = Snapshot.objects.annotate(output_size_sum=Coalesce(Sum("archiveresult__output_size"), 0)).order_by("-created_at") # Apply filters filter_kwargs = { - 'status': status, - 'url__icontains': url__icontains, - 'url__istartswith': url__istartswith, - 'crawl_id': crawl_id, + "status": status, + "url__icontains": url__icontains, + "url__istartswith": url__istartswith, + "crawl_id": crawl_id, } - queryset = apply_filters(queryset, filter_kwargs, limit=limit) + queryset = apply_filters(queryset, filter_kwargs) # Tag filter requires special handling (M2M) if tag: queryset = queryset.filter(tags__name__iexact=tag) + query = (query or "").strip() + if query: + metadata_qs = queryset.filter( + Q(title__icontains=query) | Q(url__icontains=query) | Q(timestamp__icontains=query) | Q(tags__name__icontains=query), + ) + requested_search_mode = (search or "").strip().lower() + if requested_search_mode == "content": + requested_search_mode = "contents" + search_mode = get_default_search_mode() if not requested_search_mode else get_search_mode(requested_search_mode) + + if search_mode == "meta": + queryset = metadata_qs + else: + try: + deep_qsearch = None + if search_mode == "deep": + qsearch = query_search_index(query, search_mode="contents") + deep_qsearch = query_search_index(query, search_mode="deep") + else: + qsearch = query_search_index(query, search_mode=search_mode) + queryset = prioritize_metadata_matches( + queryset, + metadata_qs, + qsearch, + deep_queryset=deep_qsearch, + ordering=("-created_at",) if not sort else None, + ) + except Exception as err: + rprint( + f"[yellow]Search backend error, falling back to metadata search: {err}[/yellow]", + file=sys.stderr, + ) + queryset = metadata_qs + if sort: queryset = queryset.order_by(sort) + if limit: + queryset = queryset[:limit] count = 0 if csv: - cols = [col.strip() for col in csv.split(',') if col.strip()] + cols = [col.strip() for col in csv.split(",") if col.strip()] if not cols: - rprint('[red]No CSV columns provided[/red]', file=sys.stderr) + rprint("[red]No CSV columns provided[/red]", file=sys.stderr) return 2 rows: list[str] = [] if with_headers: - rows.append(','.join(cols)) + rows.append(",".join(cols)) for snapshot in queryset.iterator(chunk_size=500): - rows.append(snapshot.to_csv(cols=cols, separator=',')) + rows.append(snapshot.to_csv(cols=cols, separator=",")) count += 1 - output = '\n'.join(rows) + output = "\n".join(rows) if output: sys.stdout.write(output) - if not output.endswith('\n'): - sys.stdout.write('\n') - rprint(f'[dim]Listed {count} snapshots[/dim]', file=sys.stderr) + if not output.endswith("\n"): + sys.stdout.write("\n") + rprint(f"[dim]Listed {count} snapshots[/dim]", file=sys.stderr) return 0 for snapshot in queryset: if is_tty: status_color = { - 'queued': 'yellow', - 'started': 'blue', - 'sealed': 'green', - }.get(snapshot.status, 'dim') - rprint(f'[{status_color}]{snapshot.status:8}[/{status_color}] [dim]{snapshot.id}[/dim] {snapshot.url[:60]}') + "queued": "yellow", + "started": "blue", + "sealed": "green", + }.get(snapshot.status, "dim") + rprint(f"[{status_color}]{snapshot.status:8}[/{status_color}] [dim]{snapshot.id}[/dim] {snapshot.url[:60]}") else: write_record(snapshot.to_json()) count += 1 - rprint(f'[dim]Listed {count} snapshots[/dim]', file=sys.stderr) + rprint(f"[dim]Listed {count} snapshots[/dim]", file=sys.stderr) return 0 @@ -249,9 +299,10 @@ def list_snapshots( # UPDATE # ============================================================================= + def update_snapshots( - status: Optional[str] = None, - tag: Optional[str] = None, + status: str | None = None, + tag: str | None = None, ) -> int: """ Update Snapshots from stdin JSONL. @@ -272,12 +323,12 @@ def update_snapshots( records = list(read_stdin()) if not records: - rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr) + rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr) return 1 updated_count = 0 for record in records: - snapshot_id = record.get('id') + snapshot_id = record.get("id") if not snapshot_id: continue @@ -292,6 +343,7 @@ def update_snapshots( # Add tag to existing tags snapshot.save() # Ensure saved before M2M from archivebox.core.models import Tag + tag_obj, _ = Tag.objects.get_or_create(name=tag) snapshot.tags.add(tag_obj) @@ -302,10 +354,10 @@ def update_snapshots( write_record(snapshot.to_json()) except Snapshot.DoesNotExist: - rprint(f'[yellow]Snapshot not found: {snapshot_id}[/yellow]', file=sys.stderr) + rprint(f"[yellow]Snapshot not found: {snapshot_id}[/yellow]", file=sys.stderr) continue - rprint(f'[green]Updated {updated_count} snapshots[/green]', file=sys.stderr) + rprint(f"[green]Updated {updated_count} snapshots[/green]", file=sys.stderr) return 0 @@ -313,6 +365,7 @@ def update_snapshots( # DELETE # ============================================================================= + def delete_snapshots(yes: bool = False, dry_run: bool = False) -> int: """ Delete Snapshots from stdin JSONL. @@ -328,35 +381,35 @@ def delete_snapshots(yes: bool = False, dry_run: bool = False) -> int: records = list(read_stdin()) if not records: - rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr) + rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr) return 1 - snapshot_ids = [r.get('id') for r in records if r.get('id')] + snapshot_ids = [r.get("id") for r in records if r.get("id")] if not snapshot_ids: - rprint('[yellow]No valid snapshot IDs in input[/yellow]', file=sys.stderr) + rprint("[yellow]No valid snapshot IDs in input[/yellow]", file=sys.stderr) return 1 snapshots = Snapshot.objects.filter(id__in=snapshot_ids) count = snapshots.count() if count == 0: - rprint('[yellow]No matching snapshots found[/yellow]', file=sys.stderr) + rprint("[yellow]No matching snapshots found[/yellow]", file=sys.stderr) return 0 if dry_run: - rprint(f'[yellow]Would delete {count} snapshots (dry run)[/yellow]', file=sys.stderr) + rprint(f"[yellow]Would delete {count} snapshots (dry run)[/yellow]", file=sys.stderr) for snapshot in snapshots: - rprint(f' [dim]{snapshot.id}[/dim] {snapshot.url[:60]}', file=sys.stderr) + rprint(f" [dim]{snapshot.id}[/dim] {snapshot.url[:60]}", file=sys.stderr) return 0 if not yes: - rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr) + rprint("[red]Use --yes to confirm deletion[/red]", file=sys.stderr) return 1 # Perform deletion deleted_count, _ = snapshots.delete() - rprint(f'[green]Deleted {deleted_count} snapshots[/green]', file=sys.stderr) + rprint(f"[green]Deleted {deleted_count} snapshots[/green]", file=sys.stderr) return 0 @@ -364,57 +417,81 @@ def delete_snapshots(yes: bool = False, dry_run: bool = False) -> int: # CLI Commands # ============================================================================= + @click.group() def main(): """Manage Snapshot records.""" pass -@main.command('create') -@click.argument('urls', nargs=-1) -@click.option('--tag', '-t', default='', help='Comma-separated tags to add') -@click.option('--status', '-s', default='queued', help='Initial status (default: queued)') -@click.option('--depth', '-d', type=int, default=0, help='Crawl depth (default: 0)') +@main.command("create") +@click.argument("urls", nargs=-1) +@click.option("--tag", "-t", default="", help="Comma-separated tags to add") +@click.option("--status", "-s", default="queued", help="Initial status (default: queued)") +@click.option("--depth", "-d", type=int, default=0, help="Crawl depth (default: 0)") def create_cmd(urls: tuple, tag: str, status: str, depth: int): """Create Snapshots from URLs or stdin JSONL.""" sys.exit(create_snapshots(urls, tag=tag, status=status, depth=depth)) -@main.command('list') -@click.option('--status', '-s', help='Filter by status (queued, started, sealed)') -@click.option('--url__icontains', help='Filter by URL contains') -@click.option('--url__istartswith', help='Filter by URL starts with') -@click.option('--tag', '-t', help='Filter by tag name') -@click.option('--crawl-id', help='Filter by crawl ID') -@click.option('--limit', '-n', type=int, help='Limit number of results') -def list_cmd(status: Optional[str], url__icontains: Optional[str], url__istartswith: Optional[str], - tag: Optional[str], crawl_id: Optional[str], limit: Optional[int]): +@main.command("list") +@click.option("--status", "-s", help="Filter by status (queued, started, sealed)") +@click.option("--url__icontains", help="Filter by URL contains") +@click.option("--url__istartswith", help="Filter by URL starts with") +@click.option("--tag", "-t", help="Filter by tag name") +@click.option("--crawl-id", help="Filter by crawl ID") +@click.option("--limit", "-n", type=int, help="Limit number of results") +@click.option("--sort", "-o", type=str, help="Field to sort by, e.g. url, created_at, bookmarked_at, downloaded_at") +@click.option("--csv", "-C", type=str, help="Print output as CSV with the provided fields, e.g.: timestamp,url,title") +@click.option("--with-headers", is_flag=True, help="Include column headers in structured output") +@click.option("--search", type=click.Choice(["meta", "content", "contents", "deep"]), help="Search mode to use for the query") +@click.argument("query", nargs=-1) +def list_cmd( + status: str | None, + url__icontains: str | None, + url__istartswith: str | None, + tag: str | None, + crawl_id: str | None, + limit: int | None, + sort: str | None, + csv: str | None, + with_headers: bool, + search: str | None, + query: tuple[str, ...], +): """List Snapshots as JSONL.""" - sys.exit(list_snapshots( - status=status, - url__icontains=url__icontains, - url__istartswith=url__istartswith, - tag=tag, - crawl_id=crawl_id, - limit=limit, - )) + sys.exit( + list_snapshots( + status=status, + url__icontains=url__icontains, + url__istartswith=url__istartswith, + tag=tag, + crawl_id=crawl_id, + limit=limit, + sort=sort, + csv=csv, + with_headers=with_headers, + search=search, + query=" ".join(query), + ), + ) -@main.command('update') -@click.option('--status', '-s', help='Set status') -@click.option('--tag', '-t', help='Add tag') -def update_cmd(status: Optional[str], tag: Optional[str]): +@main.command("update") +@click.option("--status", "-s", help="Set status") +@click.option("--tag", "-t", help="Add tag") +def update_cmd(status: str | None, tag: str | None): """Update Snapshots from stdin JSONL.""" sys.exit(update_snapshots(status=status, tag=tag)) -@main.command('delete') -@click.option('--yes', '-y', is_flag=True, help='Confirm deletion') -@click.option('--dry-run', is_flag=True, help='Show what would be deleted') +@main.command("delete") +@click.option("--yes", "-y", is_flag=True, help="Confirm deletion") +@click.option("--dry-run", is_flag=True, help="Show what would be deleted") def delete_cmd(yes: bool, dry_run: bool): """Delete Snapshots from stdin JSONL.""" sys.exit(delete_snapshots(yes=yes, dry_run=dry_run)) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/archivebox/cli/archivebox_snapshot_compat.py b/archivebox/cli/archivebox_snapshot_compat.py index bb2e95c2..62f684e0 100644 --- a/archivebox/cli/archivebox_snapshot_compat.py +++ b/archivebox/cli/archivebox_snapshot_compat.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 -__package__ = 'archivebox.cli' -__command__ = 'archivebox snapshot' +__package__ = "archivebox.cli" +__command__ = "archivebox snapshot" import sys @@ -10,15 +10,15 @@ import rich_click as click from archivebox.cli.archivebox_snapshot import create_snapshots -@click.command(context_settings={'ignore_unknown_options': True}) -@click.option('--tag', '-t', default='', help='Comma-separated tags to add') -@click.option('--status', '-s', default='queued', help='Initial status (default: queued)') -@click.option('--depth', '-d', type=int, default=0, help='Crawl depth (default: 0)') -@click.argument('urls', nargs=-1) +@click.command(context_settings={"ignore_unknown_options": True}) +@click.option("--tag", "-t", default="", help="Comma-separated tags to add") +@click.option("--status", "-s", default="queued", help="Initial status (default: queued)") +@click.option("--depth", "-d", type=int, default=0, help="Crawl depth (default: 0)") +@click.argument("urls", nargs=-1) def main(tag: str, status: str, depth: int, urls: tuple[str, ...]): """Backwards-compatible `archivebox snapshot URL...` entrypoint.""" sys.exit(create_snapshots(urls, tag=tag, status=status, depth=depth)) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/archivebox/cli/archivebox_status.py b/archivebox/cli/archivebox_status.py index 0c736ebc..b64ecddb 100644 --- a/archivebox/cli/archivebox_status.py +++ b/archivebox/cli/archivebox_status.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 -__package__ = 'archivebox.cli' +__package__ = "archivebox.cli" from pathlib import Path @@ -16,31 +16,34 @@ from archivebox.misc.logging_util import printable_filesize @enforce_types -def status(out_dir: Path=DATA_DIR) -> None: +def status(out_dir: Path = DATA_DIR) -> None: """Print out some info and statistics about the archive collection""" from django.contrib.auth import get_user_model + from django.db.models import Sum + from django.db.models.functions import Coalesce from archivebox.core.models import Snapshot + User = get_user_model() - print('[green]\\[*] Scanning archive main index...[/green]') - print(f'[yellow] {out_dir}/*[/yellow]') - num_bytes, num_dirs, num_files = get_dir_size(out_dir, recursive=False, pattern='index.') + print("[green]\\[*] Scanning archive main index...[/green]") + print(f"[yellow] {out_dir}/*[/yellow]") + num_bytes, num_dirs, num_files = get_dir_size(out_dir, recursive=False, pattern="index.") size = printable_filesize(num_bytes) - print(f' Index size: {size} across {num_files} files') + print(f" Index size: {size} across {num_files} files") print() - links = list(Snapshot.objects.all()) + links = list(Snapshot.objects.annotate(output_size_sum=Coalesce(Sum("archiveresult__output_size"), 0))) num_sql_links = len(links) num_link_details = sum(1 for link in parse_json_links_details(out_dir=out_dir)) - print(f' > SQL Main Index: {num_sql_links} links'.ljust(36), f'(found in {CONSTANTS.SQL_INDEX_FILENAME})') - print(f' > JSON Link Details: {num_link_details} links'.ljust(36), f'(found in {ARCHIVE_DIR.name}/*/index.json)') + print(f" > SQL Main Index: {num_sql_links} links".ljust(36), f"(found in {CONSTANTS.SQL_INDEX_FILENAME})") + print(f" > JSON Link Details: {num_link_details} links".ljust(36), f"(found in {ARCHIVE_DIR.name}/*/index.json)") print() - print('[green]\\[*] Scanning archive data directories...[/green]') - users_dir = out_dir / 'users' + print("[green]\\[*] Scanning archive data directories...[/green]") + users_dir = out_dir / "users" scan_roots = [root for root in (ARCHIVE_DIR, users_dir) if root.exists()] - scan_roots_display = ', '.join(str(root) for root in scan_roots) if scan_roots else str(ARCHIVE_DIR) - print(f'[yellow] {scan_roots_display}[/yellow]') + scan_roots_display = ", ".join(str(root) for root in scan_roots) if scan_roots else str(ARCHIVE_DIR) + print(f"[yellow] {scan_roots_display}[/yellow]") num_bytes = num_dirs = num_files = 0 for root in scan_roots: root_bytes, root_dirs, root_files = get_dir_size(root) @@ -48,80 +51,66 @@ def status(out_dir: Path=DATA_DIR) -> None: num_dirs += root_dirs num_files += root_files size = printable_filesize(num_bytes) - print(f' Size: {size} across {num_files} files in {num_dirs} directories') + print(f" Size: {size} across {num_files} files in {num_dirs} directories") # Use DB as source of truth for snapshot status num_indexed = len(links) num_archived = sum(1 for snapshot in links if snapshot.is_archived) num_unarchived = max(num_indexed - num_archived, 0) - print(f' > indexed: {num_indexed}'.ljust(36), '(total snapshots in DB)') - print(f' > archived: {num_archived}'.ljust(36), '(snapshots with archived content)') - print(f' > unarchived: {num_unarchived}'.ljust(36), '(snapshots pending archiving)') + print(f" > indexed: {num_indexed}".ljust(36), "(total snapshots in DB)") + print(f" > archived: {num_archived}".ljust(36), "(snapshots with archived content)") + print(f" > unarchived: {num_unarchived}".ljust(36), "(snapshots pending archiving)") # Count snapshot directories on filesystem across both legacy and current layouts. - expected_snapshot_dirs = { - str(Path(snapshot.output_dir).resolve()) - for snapshot in links - if Path(snapshot.output_dir).exists() - } + expected_snapshot_dirs = {str(Path(snapshot.output_dir).resolve()) for snapshot in links if Path(snapshot.output_dir).exists()} discovered_snapshot_dirs = set() if ARCHIVE_DIR.exists(): - discovered_snapshot_dirs.update( - str(entry.resolve()) - for entry in ARCHIVE_DIR.iterdir() - if entry.is_dir() - ) + discovered_snapshot_dirs.update(str(entry.resolve()) for entry in ARCHIVE_DIR.iterdir() if entry.is_dir()) if users_dir.exists(): - discovered_snapshot_dirs.update( - str(entry.resolve()) - for entry in users_dir.glob('*/snapshots/*/*/*') - if entry.is_dir() - ) + discovered_snapshot_dirs.update(str(entry.resolve()) for entry in users_dir.glob("*/snapshots/*/*/*") if entry.is_dir()) orphaned_dirs = sorted(discovered_snapshot_dirs - expected_snapshot_dirs) num_present = len(discovered_snapshot_dirs) num_valid = len(discovered_snapshot_dirs & expected_snapshot_dirs) print() - print(f' > present: {num_present}'.ljust(36), '(snapshot directories on disk)') - print(f' > [green]valid:[/green] {num_valid}'.ljust(36), ' (directories with matching DB entry)') + print(f" > present: {num_present}".ljust(36), "(snapshot directories on disk)") + print(f" > [green]valid:[/green] {num_valid}".ljust(36), " (directories with matching DB entry)") num_orphaned = len(orphaned_dirs) - print(f' > [red]orphaned:[/red] {num_orphaned}'.ljust(36), ' (directories without matching DB entry)') + print(f" > [red]orphaned:[/red] {num_orphaned}".ljust(36), " (directories without matching DB entry)") if num_indexed: - print(' [violet]Hint:[/violet] You can list snapshots by status like so:') - print(' [green]archivebox list --status= (e.g. archived, queued, etc.)[/green]') + print(" [violet]Hint:[/violet] You can list snapshots by status like so:") + print(" [green]archivebox list --status= (e.g. archived, queued, etc.)[/green]") if orphaned_dirs: - print(' [violet]Hint:[/violet] To automatically import orphaned data directories into the main index, run:') - print(' [green]archivebox init[/green]') + print(" [violet]Hint:[/violet] To automatically import orphaned data directories into the main index, run:") + print(" [green]archivebox init[/green]") print() - print('[green]\\[*] Scanning recent archive changes and user logins:[/green]') - print(f'[yellow] {CONSTANTS.LOGS_DIR}/*[/yellow]') - admin_users = User.objects.filter(is_superuser=True).exclude(username='system') + print("[green]\\[*] Scanning recent archive changes and user logins:[/green]") + print(f"[yellow] {CONSTANTS.LOGS_DIR}/*[/yellow]") + admin_users = User.objects.filter(is_superuser=True).exclude(username="system") users = [user.get_username() for user in admin_users] - print(f' UI users {len(users)}: {", ".join(users)}') - last_login = admin_users.order_by('last_login').last() + print(f" UI users {len(users)}: {', '.join(users)}") + last_login = admin_users.order_by("last_login").last() if last_login: - print(f' Last UI login: {last_login.get_username()} @ {str(last_login.last_login)[:16]}') - last_downloaded = Snapshot.objects.order_by('downloaded_at').last() + print(f" Last UI login: {last_login.get_username()} @ {str(last_login.last_login)[:16]}") + last_downloaded = Snapshot.objects.order_by("downloaded_at").last() if last_downloaded: - print(f' Last changes: {str(last_downloaded.downloaded_at)[:16]}') + print(f" Last changes: {str(last_downloaded.downloaded_at)[:16]}") if not users: print() - print(' [violet]Hint:[/violet] You can create an admin user by running:') - print(' [green]archivebox manage createsuperuser[/green]') + print(" [violet]Hint:[/violet] You can create an admin user by running:") + print(" [green]archivebox manage createsuperuser[/green]") print() recent_snapshots = sorted( links, - key=lambda snapshot: ( - snapshot.downloaded_at or snapshot.modified_at or snapshot.created_at - ), + key=lambda snapshot: snapshot.downloaded_at or snapshot.modified_at or snapshot.created_at, reverse=True, )[:10] for snapshot in recent_snapshots: @@ -129,14 +118,14 @@ def status(out_dir: Path=DATA_DIR) -> None: continue print( ( - '[grey53] ' - f' > {str(snapshot.downloaded_at)[:16]} ' - f'[{snapshot.num_outputs} {("X", "√")[snapshot.is_archived]} {printable_filesize(snapshot.archive_size)}] ' + "[grey53] " + f" > {str(snapshot.downloaded_at)[:16]} " + f"[{snapshot.num_outputs} {('X', '√')[snapshot.is_archived]} {printable_filesize(snapshot.archive_size)}] " f'"{snapshot.title}": {snapshot.url}' - '[/grey53]' - )[:SHELL_CONFIG.TERM_WIDTH], + "[/grey53]" + )[: SHELL_CONFIG.TERM_WIDTH], ) - print('[grey53] ...') + print("[grey53] ...") @click.command() @@ -146,5 +135,5 @@ def main(**kwargs): status(**kwargs) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/archivebox/cli/archivebox_tag.py b/archivebox/cli/archivebox_tag.py index bf72ef97..73352d5d 100644 --- a/archivebox/cli/archivebox_tag.py +++ b/archivebox/cli/archivebox_tag.py @@ -27,11 +27,11 @@ Examples: archivebox tag list --name=unused | archivebox tag delete --yes """ -__package__ = 'archivebox.cli' -__command__ = 'archivebox tag' +__package__ = "archivebox.cli" +__command__ = "archivebox tag" import sys -from typing import Optional, Iterable +from collections.abc import Iterable import rich_click as click from rich import print as rprint @@ -43,6 +43,7 @@ from archivebox.cli.cli_utils import apply_filters # CREATE # ============================================================================= + def create_tags(names: Iterable[str]) -> int: """ Create Tags from names. @@ -60,7 +61,7 @@ def create_tags(names: Iterable[str]) -> int: name_list = list(names) if names else [] if not name_list: - rprint('[yellow]No tag names provided. Pass names as arguments.[/yellow]', file=sys.stderr) + rprint("[yellow]No tag names provided. Pass names as arguments.[/yellow]", file=sys.stderr) return 1 created_count = 0 @@ -76,11 +77,11 @@ def create_tags(names: Iterable[str]) -> int: if created: created_count += 1 - rprint(f'[green]Created tag: {name}[/green]', file=sys.stderr) + rprint(f"[green]Created tag: {name}[/green]", file=sys.stderr) else: - rprint(f'[dim]Tag already exists: {name}[/dim]', file=sys.stderr) + rprint(f"[dim]Tag already exists: {name}[/dim]", file=sys.stderr) - rprint(f'[green]Created {created_count} new tags[/green]', file=sys.stderr) + rprint(f"[green]Created {created_count} new tags[/green]", file=sys.stderr) return 0 @@ -88,10 +89,11 @@ def create_tags(names: Iterable[str]) -> int: # LIST # ============================================================================= + def list_tags( - name: Optional[str] = None, - name__icontains: Optional[str] = None, - limit: Optional[int] = None, + name: str | None = None, + name__icontains: str | None = None, + limit: int | None = None, ) -> int: """ List Tags as JSONL with optional filters. @@ -104,12 +106,12 @@ def list_tags( is_tty = sys.stdout.isatty() - queryset = Tag.objects.all().order_by('name') + queryset = Tag.objects.all().order_by("name") # Apply filters filter_kwargs = { - 'name': name, - 'name__icontains': name__icontains, + "name": name, + "name__icontains": name__icontains, } queryset = apply_filters(queryset, filter_kwargs, limit=limit) @@ -117,12 +119,12 @@ def list_tags( for tag in queryset: snapshot_count = tag.snapshot_set.count() if is_tty: - rprint(f'[cyan]{tag.name:30}[/cyan] [dim]({snapshot_count} snapshots)[/dim]') + rprint(f"[cyan]{tag.name:30}[/cyan] [dim]({snapshot_count} snapshots)[/dim]") else: write_record(tag.to_json()) count += 1 - rprint(f'[dim]Listed {count} tags[/dim]', file=sys.stderr) + rprint(f"[dim]Listed {count} tags[/dim]", file=sys.stderr) return 0 @@ -130,7 +132,8 @@ def list_tags( # UPDATE # ============================================================================= -def update_tags(name: Optional[str] = None) -> int: + +def update_tags(name: str | None = None) -> int: """ Update Tags from stdin JSONL. @@ -148,13 +151,13 @@ def update_tags(name: Optional[str] = None) -> int: records = list(read_stdin()) if not records: - rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr) + rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr) return 1 updated_count = 0 for record in records: - tag_id = record.get('id') - old_name = record.get('name') + tag_id = record.get("id") + old_name = record.get("name") if not tag_id and not old_name: continue @@ -176,10 +179,10 @@ def update_tags(name: Optional[str] = None) -> int: write_record(tag.to_json()) except Tag.DoesNotExist: - rprint(f'[yellow]Tag not found: {tag_id or old_name}[/yellow]', file=sys.stderr) + rprint(f"[yellow]Tag not found: {tag_id or old_name}[/yellow]", file=sys.stderr) continue - rprint(f'[green]Updated {updated_count} tags[/green]', file=sys.stderr) + rprint(f"[green]Updated {updated_count} tags[/green]", file=sys.stderr) return 0 @@ -187,6 +190,7 @@ def update_tags(name: Optional[str] = None) -> int: # DELETE # ============================================================================= + def delete_tags(yes: bool = False, dry_run: bool = False) -> int: """ Delete Tags from stdin JSONL. @@ -202,23 +206,24 @@ def delete_tags(yes: bool = False, dry_run: bool = False) -> int: records = list(read_stdin()) if not records: - rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr) + rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr) return 1 # Collect tag IDs or names tag_ids = [] tag_names = [] for r in records: - if r.get('id'): - tag_ids.append(r['id']) - elif r.get('name'): - tag_names.append(r['name']) + if r.get("id"): + tag_ids.append(r["id"]) + elif r.get("name"): + tag_names.append(r["name"]) if not tag_ids and not tag_names: - rprint('[yellow]No valid tag IDs or names in input[/yellow]', file=sys.stderr) + rprint("[yellow]No valid tag IDs or names in input[/yellow]", file=sys.stderr) return 1 from django.db.models import Q + query = Q() if tag_ids: query |= Q(id__in=tag_ids) @@ -229,22 +234,22 @@ def delete_tags(yes: bool = False, dry_run: bool = False) -> int: count = tags.count() if count == 0: - rprint('[yellow]No matching tags found[/yellow]', file=sys.stderr) + rprint("[yellow]No matching tags found[/yellow]", file=sys.stderr) return 0 if dry_run: - rprint(f'[yellow]Would delete {count} tags (dry run)[/yellow]', file=sys.stderr) + rprint(f"[yellow]Would delete {count} tags (dry run)[/yellow]", file=sys.stderr) for tag in tags: - rprint(f' {tag.name}', file=sys.stderr) + rprint(f" {tag.name}", file=sys.stderr) return 0 if not yes: - rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr) + rprint("[red]Use --yes to confirm deletion[/red]", file=sys.stderr) return 1 # Perform deletion deleted_count, _ = tags.delete() - rprint(f'[green]Deleted {deleted_count} tags[/green]', file=sys.stderr) + rprint(f"[green]Deleted {deleted_count} tags[/green]", file=sys.stderr) return 0 @@ -252,42 +257,43 @@ def delete_tags(yes: bool = False, dry_run: bool = False) -> int: # CLI Commands # ============================================================================= + @click.group() def main(): """Manage Tag records.""" pass -@main.command('create') -@click.argument('names', nargs=-1) +@main.command("create") +@click.argument("names", nargs=-1) def create_cmd(names: tuple): """Create Tags from names.""" sys.exit(create_tags(names)) -@main.command('list') -@click.option('--name', help='Filter by exact name') -@click.option('--name__icontains', help='Filter by name contains') -@click.option('--limit', '-n', type=int, help='Limit number of results') -def list_cmd(name: Optional[str], name__icontains: Optional[str], limit: Optional[int]): +@main.command("list") +@click.option("--name", help="Filter by exact name") +@click.option("--name__icontains", help="Filter by name contains") +@click.option("--limit", "-n", type=int, help="Limit number of results") +def list_cmd(name: str | None, name__icontains: str | None, limit: int | None): """List Tags as JSONL.""" sys.exit(list_tags(name=name, name__icontains=name__icontains, limit=limit)) -@main.command('update') -@click.option('--name', '-n', help='Set new name') -def update_cmd(name: Optional[str]): +@main.command("update") +@click.option("--name", "-n", help="Set new name") +def update_cmd(name: str | None): """Update Tags from stdin JSONL.""" sys.exit(update_tags(name=name)) -@main.command('delete') -@click.option('--yes', '-y', is_flag=True, help='Confirm deletion') -@click.option('--dry-run', is_flag=True, help='Show what would be deleted') +@main.command("delete") +@click.option("--yes", "-y", is_flag=True, help="Confirm deletion") +@click.option("--dry-run", is_flag=True, help="Show what would be deleted") def delete_cmd(yes: bool, dry_run: bool): """Delete Tags from stdin JSONL.""" sys.exit(delete_tags(yes=yes, dry_run=dry_run)) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/archivebox/cli/archivebox_update.py b/archivebox/cli/archivebox_update.py index 2019fbd5..659fcb97 100644 --- a/archivebox/cli/archivebox_update.py +++ b/archivebox/cli/archivebox_update.py @@ -1,11 +1,12 @@ #!/usr/bin/env python3 -__package__ = 'archivebox.cli' +__package__ = "archivebox.cli" import os import time -from typing import TYPE_CHECKING, Callable, Iterable +from typing import TYPE_CHECKING, Any +from collections.abc import Callable, Iterable from pathlib import Path import rich_click as click @@ -20,24 +21,22 @@ if TYPE_CHECKING: LINK_FILTERS: dict[str, Callable[[str], Q]] = { - 'exact': lambda pattern: Q(url=pattern), - 'substring': lambda pattern: Q(url__icontains=pattern), - 'regex': lambda pattern: Q(url__iregex=pattern), - 'domain': lambda pattern: ( - Q(url__istartswith=f'http://{pattern}') - | Q(url__istartswith=f'https://{pattern}') - | Q(url__istartswith=f'ftp://{pattern}') + "exact": lambda pattern: Q(url=pattern), + "substring": lambda pattern: Q(url__icontains=pattern), + "regex": lambda pattern: Q(url__iregex=pattern), + "domain": lambda pattern: ( + Q(url__istartswith=f"http://{pattern}") | Q(url__istartswith=f"https://{pattern}") | Q(url__istartswith=f"ftp://{pattern}") ), - 'tag': lambda pattern: Q(tags__name=pattern), - 'timestamp': lambda pattern: Q(timestamp=pattern), + "tag": lambda pattern: Q(tags__name=pattern), + "timestamp": lambda pattern: Q(timestamp=pattern), } def _apply_pattern_filters( - snapshots: QuerySet['Snapshot', 'Snapshot'], + snapshots: QuerySet["Snapshot", "Snapshot"], filter_patterns: list[str], filter_type: str, -) -> QuerySet['Snapshot', 'Snapshot']: +) -> QuerySet["Snapshot", "Snapshot"]: filter_builder = LINK_FILTERS.get(filter_type) if filter_builder is None: raise SystemExit(2) @@ -48,21 +47,120 @@ def _apply_pattern_filters( return snapshots.filter(query) -def _get_snapshot_crawl(snapshot: 'Snapshot') -> 'Crawl | None': +def _get_snapshot_crawl(snapshot: "Snapshot") -> "Crawl | None": try: return snapshot.crawl except ObjectDoesNotExist: return None +def _get_search_indexing_plugins() -> list[str]: + from abx_dl.models import discover_plugins + from archivebox.hooks import get_search_backends + + available_backends = set(get_search_backends()) + plugins = discover_plugins() + return sorted( + plugin_name + for plugin_name, plugin in plugins.items() + if plugin_name.startswith("search_backend_") + and plugin_name.removeprefix("search_backend_") in available_backends + and any("Snapshot" in hook.name and "index" in hook.name.lower() for hook in plugin.hooks) + ) + + +def _build_filtered_snapshots_queryset( + *, + filter_patterns: Iterable[str], + filter_type: str, + before: float | None, + after: float | None, + resume: str | None = None, +): + from archivebox.core.models import Snapshot + from datetime import datetime + + snapshots = Snapshot.objects.all() + + if filter_patterns: + snapshots = _apply_pattern_filters(snapshots, list(filter_patterns), filter_type) + + if before: + snapshots = snapshots.filter(bookmarked_at__lt=datetime.fromtimestamp(before)) + if after: + snapshots = snapshots.filter(bookmarked_at__gt=datetime.fromtimestamp(after)) + if resume: + snapshots = snapshots.filter(timestamp__lte=resume) + + return snapshots.select_related("crawl").order_by("-bookmarked_at") + + +def reindex_snapshots( + snapshots: QuerySet["Snapshot", "Snapshot"], + *, + search_plugins: list[str], + batch_size: int, +) -> dict[str, int]: + from archivebox.cli.archivebox_extract import run_plugins + + stats = {"processed": 0, "reconciled": 0, "queued": 0, "reindexed": 0} + records: list[dict[str, str]] = [] + + total = snapshots.count() + print(f"[*] Reindexing {total} snapshots with search plugins: {', '.join(search_plugins)}") + + for snapshot in snapshots.iterator(chunk_size=batch_size): + stats["processed"] += 1 + + if _get_snapshot_crawl(snapshot) is None: + continue + + output_dir = Path(snapshot.output_dir) + has_directory = output_dir.exists() and output_dir.is_dir() + if has_directory: + snapshot.reconcile_with_index_json() + stats["reconciled"] += 1 + + for plugin_name in search_plugins: + existing_result = snapshot.archiveresult_set.filter(plugin=plugin_name).order_by("-created_at").first() + if existing_result: + existing_result.reset_for_retry() + records.append( + { + "type": "ArchiveResult", + "snapshot_id": str(snapshot.id), + "plugin": plugin_name, + }, + ) + stats["queued"] += 1 + + if not records: + return stats + + exit_code = run_plugins( + args=(), + records=records, + wait=True, + emit_results=False, + ) + if exit_code != 0: + raise SystemExit(exit_code) + + stats["reindexed"] = len(records) + return stats + + @enforce_types -def update(filter_patterns: Iterable[str] = (), - filter_type: str = 'exact', - before: float | None = None, - after: float | None = None, - resume: str | None = None, - batch_size: int = 100, - continuous: bool = False) -> None: +def update( + filter_patterns: Iterable[str] = (), + filter_type: str = "exact", + before: float | None = None, + after: float | None = None, + resume: str | None = None, + batch_size: int = 100, + continuous: bool = False, + index_only: bool = False, +) -> None: """ Update snapshots: migrate old dirs, reconcile DB, and re-queue for archiving. @@ -77,41 +175,69 @@ def update(filter_patterns: Iterable[str] = (), from rich import print from archivebox.config.django import setup_django + setup_django() from django.core.management import call_command # Run migrations first to ensure DB schema is up-to-date - print('[*] Checking for pending migrations...') + print("[*] Checking for pending migrations...") try: - call_command('migrate', '--no-input', verbosity=0) + call_command("migrate", "--no-input", verbosity=0) except Exception as e: - print(f'[!] Warning: Migration check failed: {e}') + print(f"[!] Warning: Migration check failed: {e}") while True: - if filter_patterns or before or after: + if index_only: + search_plugins = _get_search_indexing_plugins() + if not search_plugins: + print("[*] No search indexing plugins are available, nothing to backfill.") + break + + if not (filter_patterns or before or after): + print("[*] Phase 1: Draining old archive/ directories (0.8.x → 0.9.x migration)...") + drain_old_archive_dirs( + resume_from=resume, + batch_size=batch_size, + ) + + snapshots = _build_filtered_snapshots_queryset( + filter_patterns=filter_patterns, + filter_type=filter_type, + before=before, + after=after, + resume=resume, + ) + stats = reindex_snapshots( + snapshots, + search_plugins=search_plugins, + batch_size=batch_size, + ) + print_index_stats(stats) + elif filter_patterns or before or after: # Filtered mode: query DB only - print('[*] Processing filtered snapshots from database...') + print("[*] Processing filtered snapshots from database...") stats = process_filtered_snapshots( filter_patterns=filter_patterns, filter_type=filter_type, before=before, after=after, - batch_size=batch_size + resume=resume, + batch_size=batch_size, ) print_stats(stats) else: # Full mode: drain old dirs + process DB - stats_combined = {'phase1': {}, 'phase2': {}} + stats_combined = {"phase1": {}, "phase2": {}} - print('[*] Phase 1: Draining old archive/ directories (0.8.x → 0.9.x migration)...') - stats_combined['phase1'] = drain_old_archive_dirs( + print("[*] Phase 1: Draining old archive/ directories (0.8.x → 0.9.x migration)...") + stats_combined["phase1"] = drain_old_archive_dirs( resume_from=resume, - batch_size=batch_size + batch_size=batch_size, ) - print('[*] Phase 2: Processing all database snapshots (most recent first)...') - stats_combined['phase2'] = process_all_db_snapshots(batch_size=batch_size) + print("[*] Phase 2: Processing all database snapshots (most recent first)...") + stats_combined["phase2"] = process_all_db_snapshots(batch_size=batch_size, resume=resume) # Phase 3: Deduplication (disabled for now) # print('[*] Phase 3: Deduplicating...') @@ -122,7 +248,7 @@ def update(filter_patterns: Iterable[str] = (), if not continuous: break - print('[yellow]Sleeping 60s before next pass...[/yellow]') + print("[yellow]Sleeping 60s before next pass...[/yellow]") time.sleep(60) resume = None @@ -144,34 +270,34 @@ def drain_old_archive_dirs(resume_from: str | None = None, batch_size: int = 100 from archivebox.config import CONSTANTS from django.db import transaction - stats = {'processed': 0, 'migrated': 0, 'skipped': 0, 'invalid': 0} + stats = {"processed": 0, "migrated": 0, "skipped": 0, "invalid": 0} archive_dir = CONSTANTS.ARCHIVE_DIR if not archive_dir.exists(): return stats - print('[DEBUG Phase1] Scanning for old directories in archive/...') + print("[DEBUG Phase1] Scanning for old directories in archive/...") # Scan for real directories only (skip symlinks - they're already migrated) all_entries = list(os.scandir(archive_dir)) - print(f'[DEBUG Phase1] Total entries in archive/: {len(all_entries)}') + print(f"[DEBUG Phase1] Total entries in archive/: {len(all_entries)}") entries = [ (e.stat().st_mtime, e.path) for e in all_entries if e.is_dir(follow_symlinks=False) # Skip symlinks ] entries.sort(reverse=True) # Newest first - print(f'[DEBUG Phase1] Real directories (not symlinks): {len(entries)}') - print(f'[*] Found {len(entries)} old directories to drain') + print(f"[DEBUG Phase1] Real directories (not symlinks): {len(entries)}") + print(f"[*] Found {len(entries)} old directories to drain") for mtime, entry_path in entries: entry_path = Path(entry_path) # Resume from timestamp if specified - if resume_from and entry_path.name < resume_from: + if resume_from and entry_path.name > resume_from: continue - stats['processed'] += 1 + stats["processed"] += 1 # Try to load existing snapshot from DB snapshot = Snapshot.load_from_directory(entry_path) @@ -182,16 +308,16 @@ def drain_old_archive_dirs(resume_from: str | None = None, batch_size: int = 100 if not snapshot: # Invalid directory - move to invalid/ Snapshot.move_directory_to_invalid(entry_path) - stats['invalid'] += 1 + stats["invalid"] += 1 print(f" [{stats['processed']}] Invalid: {entry_path.name}") continue try: snapshot.save() - stats['migrated'] += 1 + stats["migrated"] += 1 print(f" [{stats['processed']}] Imported orphaned snapshot: {entry_path.name}") except Exception as e: - stats['skipped'] += 1 + stats["skipped"] += 1 print(f" [{stats['processed']}] Skipped (error: {e}): {entry_path.name}") continue @@ -201,30 +327,35 @@ def drain_old_archive_dirs(resume_from: str | None = None, batch_size: int = 100 if not has_valid_crawl: # Create a new crawl (created_by will default to system user) from archivebox.crawls.models import Crawl + crawl = Crawl.objects.create(urls=snapshot.url) # Use queryset update to avoid triggering save() hooks from archivebox.core.models import Snapshot as SnapshotModel + SnapshotModel.objects.filter(pk=snapshot.pk).update(crawl=crawl) # Refresh the instance snapshot.crawl = crawl print(f"[DEBUG Phase1] Created missing crawl for snapshot {str(snapshot.id)[:8]}") # Check if needs migration (0.8.x → 0.9.x) - print(f"[DEBUG Phase1] Snapshot {str(snapshot.id)[:8]}: fs_version={snapshot.fs_version}, needs_migration={snapshot.fs_migration_needed}") + print( + f"[DEBUG Phase1] Snapshot {str(snapshot.id)[:8]}: fs_version={snapshot.fs_version}, needs_migration={snapshot.fs_migration_needed}", + ) if snapshot.fs_migration_needed: try: # Calculate paths using actual directory (entry_path), not snapshot.timestamp # because snapshot.timestamp might be truncated old_dir = entry_path - new_dir = snapshot.get_storage_path_for_version('0.9.0') + new_dir = snapshot.get_storage_path_for_version("0.9.0") print(f"[DEBUG Phase1] Migrating {old_dir.name} → {new_dir}") # Manually migrate files if not new_dir.exists() and old_dir.exists(): new_dir.mkdir(parents=True, exist_ok=True) import shutil + file_count = 0 - for old_file in old_dir.rglob('*'): + for old_file in old_dir.rglob("*"): if old_file.is_file(): rel_path = old_file.relative_to(old_dir) new_file = new_dir / rel_path @@ -236,7 +367,8 @@ def drain_old_archive_dirs(resume_from: str | None = None, batch_size: int = 100 # Update only fs_version field using queryset update (bypasses validation) from archivebox.core.models import Snapshot as SnapshotModel - SnapshotModel.objects.filter(pk=snapshot.pk).update(fs_version='0.9.0') + + SnapshotModel.objects.filter(pk=snapshot.pk).update(fs_version="0.9.0") # Commit the transaction transaction.commit() @@ -245,22 +377,22 @@ def drain_old_archive_dirs(resume_from: str | None = None, batch_size: int = 100 if old_dir.exists() and old_dir != new_dir: snapshot._cleanup_old_migration_dir(old_dir, new_dir) - stats['migrated'] += 1 + stats["migrated"] += 1 print(f" [{stats['processed']}] Migrated: {entry_path.name}") except Exception as e: - stats['skipped'] += 1 + stats["skipped"] += 1 print(f" [{stats['processed']}] Skipped (error: {e}): {entry_path.name}") else: - stats['skipped'] += 1 + stats["skipped"] += 1 - if stats['processed'] % batch_size == 0: + if stats["processed"] % batch_size == 0: transaction.commit() transaction.commit() return stats -def process_all_db_snapshots(batch_size: int = 100) -> dict[str, int]: +def process_all_db_snapshots(batch_size: int = 100, resume: str | None = None) -> dict[str, int]: """ O(n) scan over entire DB from most recent to least recent. @@ -275,24 +407,30 @@ def process_all_db_snapshots(batch_size: int = 100) -> dict[str, int]: from django.db import transaction from django.utils import timezone - stats = {'processed': 0, 'reconciled': 0, 'queued': 0} + stats = {"processed": 0, "reconciled": 0, "queued": 0} - total = Snapshot.objects.count() - print(f'[*] Processing {total} snapshots from database (most recent first)...') + queryset = Snapshot.objects.all() + if resume: + queryset = queryset.filter(timestamp__lte=resume) + total = queryset.count() + print(f"[*] Processing {total} snapshots from database (most recent first)...") # Process from most recent to least recent - for snapshot in Snapshot.objects.select_related('crawl').order_by('-bookmarked_at').iterator(chunk_size=batch_size): - stats['processed'] += 1 + for snapshot in queryset.select_related("crawl").order_by("-bookmarked_at").iterator(chunk_size=batch_size): + stats["processed"] += 1 # Skip snapshots with missing crawl references (orphaned by migration errors) if _get_snapshot_crawl(snapshot) is None: continue try: - print(f"[DEBUG Phase2] Snapshot {str(snapshot.id)[:8]}: fs_version={snapshot.fs_version}, needs_migration={snapshot.fs_migration_needed}") + print( + f"[DEBUG Phase2] Snapshot {str(snapshot.id)[:8]}: fs_version={snapshot.fs_version}, needs_migration={snapshot.fs_migration_needed}", + ) # Check if snapshot has a directory on disk from pathlib import Path + output_dir = Path(snapshot.output_dir) has_directory = output_dir.exists() and output_dir.is_dir() @@ -313,22 +451,23 @@ def process_all_db_snapshots(batch_size: int = 100) -> dict[str, int]: print(f"[DEBUG Phase2] Orphan snapshot {str(snapshot.id)[:8]} - marking as migrated without filesystem operation") # Use queryset update to set fs_version without triggering save() hooks from archivebox.core.models import Snapshot as SnapshotModel - SnapshotModel.objects.filter(pk=snapshot.pk).update(fs_version='0.9.0') - snapshot.fs_version = '0.9.0' + + SnapshotModel.objects.filter(pk=snapshot.pk).update(fs_version="0.9.0") + snapshot.fs_version = "0.9.0" # Queue for archiving (state machine will handle it) snapshot.status = Snapshot.StatusChoices.QUEUED snapshot.retry_at = timezone.now() snapshot.save() - stats['reconciled'] += 1 if has_directory else 0 - stats['queued'] += 1 + stats["reconciled"] += 1 if has_directory else 0 + stats["queued"] += 1 except Exception as e: # Skip snapshots that can't be processed (e.g., missing crawl) print(f" [!] Skipping snapshot {snapshot.id}: {e}") continue - if stats['processed'] % batch_size == 0: + if stats["processed"] % batch_size == 0: transaction.commit() print(f" [{stats['processed']}/{total}] Processed...") @@ -341,31 +480,28 @@ def process_filtered_snapshots( filter_type: str, before: float | None, after: float | None, - batch_size: int + resume: str | None, + batch_size: int, ) -> dict[str, int]: """Process snapshots matching filters (DB query only).""" - from archivebox.core.models import Snapshot from django.db import transaction from django.utils import timezone - from datetime import datetime - stats = {'processed': 0, 'reconciled': 0, 'queued': 0} + stats = {"processed": 0, "reconciled": 0, "queued": 0} - snapshots = Snapshot.objects.all() - - if filter_patterns: - snapshots = _apply_pattern_filters(snapshots, list(filter_patterns), filter_type) - - if before: - snapshots = snapshots.filter(bookmarked_at__lt=datetime.fromtimestamp(before)) - if after: - snapshots = snapshots.filter(bookmarked_at__gt=datetime.fromtimestamp(after)) + snapshots = _build_filtered_snapshots_queryset( + filter_patterns=filter_patterns, + filter_type=filter_type, + before=before, + after=after, + resume=resume, + ) total = snapshots.count() - print(f'[*] Found {total} matching snapshots') + print(f"[*] Found {total} matching snapshots") - for snapshot in snapshots.select_related('crawl').iterator(chunk_size=batch_size): - stats['processed'] += 1 + for snapshot in snapshots.select_related("crawl").iterator(chunk_size=batch_size): + stats["processed"] += 1 # Skip snapshots with missing crawl references if _get_snapshot_crawl(snapshot) is None: @@ -384,14 +520,14 @@ def process_filtered_snapshots( snapshot.retry_at = timezone.now() snapshot.save() - stats['reconciled'] += 1 - stats['queued'] += 1 + stats["reconciled"] += 1 + stats["queued"] += 1 except Exception as e: # Skip snapshots that can't be processed print(f" [!] Skipping snapshot {snapshot.id}: {e}") continue - if stats['processed'] % batch_size == 0: + if stats["processed"] % batch_size == 0: transaction.commit() print(f" [{stats['processed']}/{total}] Processed...") @@ -405,9 +541,9 @@ def print_stats(stats: dict): print(f""" [green]Update Complete[/green] - Processed: {stats['processed']} - Reconciled: {stats['reconciled']} - Queued: {stats['queued']} + Processed: {stats["processed"]} + Reconciled: {stats["reconciled"]} + Queued: {stats["queued"]} """) @@ -415,37 +551,50 @@ def print_combined_stats(stats_combined: dict): """Print statistics for full mode.""" from rich import print - s1 = stats_combined['phase1'] - s2 = stats_combined['phase2'] + s1 = stats_combined["phase1"] + s2 = stats_combined["phase2"] print(f""" [green]Archive Update Complete[/green] Phase 1 (Drain Old Dirs): - Checked: {s1.get('processed', 0)} - Migrated: {s1.get('migrated', 0)} - Skipped: {s1.get('skipped', 0)} - Invalid: {s1.get('invalid', 0)} + Checked: {s1.get("processed", 0)} + Migrated: {s1.get("migrated", 0)} + Skipped: {s1.get("skipped", 0)} + Invalid: {s1.get("invalid", 0)} Phase 2 (Process DB): - Processed: {s2.get('processed', 0)} - Reconciled: {s2.get('reconciled', 0)} - Queued: {s2.get('queued', 0)} + Processed: {s2.get("processed", 0)} + Reconciled: {s2.get("reconciled", 0)} + Queued: {s2.get("queued", 0)} +""") + + +def print_index_stats(stats: dict[str, Any]) -> None: + from rich import print + + print(f""" +[green]Search Reindex Complete[/green] + Processed: {stats["processed"]} + Reconciled: {stats["reconciled"]} + Queued: {stats["queued"]} + Reindexed: {stats["reindexed"]} """) @click.command() -@click.option('--resume', type=str, help='Resume from timestamp') -@click.option('--before', type=float, help='Only snapshots before timestamp') -@click.option('--after', type=float, help='Only snapshots after timestamp') -@click.option('--filter-type', '-t', type=click.Choice(['exact', 'substring', 'regex', 'domain', 'tag', 'timestamp']), default='exact') -@click.option('--batch-size', type=int, default=100, help='Commit every N snapshots') -@click.option('--continuous', is_flag=True, help='Run continuously as background worker') -@click.argument('filter_patterns', nargs=-1) +@click.option("--resume", type=str, help="Resume from timestamp") +@click.option("--before", type=float, help="Only snapshots before timestamp") +@click.option("--after", type=float, help="Only snapshots after timestamp") +@click.option("--filter-type", "-t", type=click.Choice(["exact", "substring", "regex", "domain", "tag", "timestamp"]), default="exact") +@click.option("--batch-size", type=int, default=100, help="Commit every N snapshots") +@click.option("--continuous", is_flag=True, help="Run continuously as background worker") +@click.option("--index-only", is_flag=True, help="Backfill available search indexes from existing archived content") +@click.argument("filter_patterns", nargs=-1) @docstring(update.__doc__) def main(**kwargs): update(**kwargs) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/archivebox/cli/archivebox_version.py b/archivebox/cli/archivebox_version.py index 1015111d..7d293c5a 100755 --- a/archivebox/cli/archivebox_version.py +++ b/archivebox/cli/archivebox_version.py @@ -1,12 +1,12 @@ #!/usr/bin/env python3 -__package__ = 'archivebox.cli' +__package__ = "archivebox.cli" import sys import os import platform from pathlib import Path -from typing import Iterable +from collections.abc import Iterable import rich_click as click @@ -14,19 +14,22 @@ from archivebox.misc.util import docstring, enforce_types @enforce_types -def version(quiet: bool=False, - binaries: Iterable[str]=()) -> list[str]: +def version( + quiet: bool = False, + binaries: Iterable[str] = (), +) -> list[str]: """Print the ArchiveBox version, debug metadata, and installed dependency versions""" - + # fast path for just getting the version and exiting, dont do any slower imports from archivebox.config.version import VERSION + print(VERSION) - if quiet or '--version' in sys.argv: + if quiet or "--version" in sys.argv: return [] - + from rich.panel import Panel from rich.console import Console - + from archivebox.config import CONSTANTS, DATA_DIR from archivebox.config.version import get_COMMIT_HASH, get_BUILD_TIME from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, RUNNING_AS_UID, RUNNING_AS_GID, IN_DOCKER @@ -34,78 +37,89 @@ def version(quiet: bool=False, from archivebox.config.common import SHELL_CONFIG, STORAGE_CONFIG, SEARCH_BACKEND_CONFIG from archivebox.misc.logging_util import printable_folder_status from archivebox.config.configset import get_config - + console = Console() prnt = console.print - + # Check if LDAP is enabled (simple config lookup) config = get_config() - LDAP_ENABLED = config.get('LDAP_ENABLED', False) + LDAP_ENABLED = config.get("LDAP_ENABLED", False) p = platform.uname() COMMIT_HASH = get_COMMIT_HASH() prnt( - '[dark_green]ArchiveBox[/dark_green] [dark_goldenrod]v{}[/dark_goldenrod]'.format(CONSTANTS.VERSION), - f'COMMIT_HASH={COMMIT_HASH[:7] if COMMIT_HASH else "unknown"}', - f'BUILD_TIME={get_BUILD_TIME()}', + f"[dark_green]ArchiveBox[/dark_green] [dark_goldenrod]v{CONSTANTS.VERSION}[/dark_goldenrod]", + f"COMMIT_HASH={COMMIT_HASH[:7] if COMMIT_HASH else 'unknown'}", + f"BUILD_TIME={get_BUILD_TIME()}", ) prnt( - f'IN_DOCKER={IN_DOCKER}', - f'IN_QEMU={SHELL_CONFIG.IN_QEMU}', - f'ARCH={p.machine}', - f'OS={p.system}', - f'PLATFORM={platform.platform()}', - f'PYTHON={sys.implementation.name.title()}' + (' (venv)' if CONSTANTS.IS_INSIDE_VENV else ''), + f"IN_DOCKER={IN_DOCKER}", + f"IN_QEMU={SHELL_CONFIG.IN_QEMU}", + f"ARCH={p.machine}", + f"OS={p.system}", + f"PLATFORM={platform.platform()}", + f"PYTHON={sys.implementation.name.title()}" + (" (venv)" if CONSTANTS.IS_INSIDE_VENV else ""), ) - + try: OUTPUT_IS_REMOTE_FS = get_data_locations().DATA_DIR.is_mount or get_data_locations().ARCHIVE_DIR.is_mount except Exception: OUTPUT_IS_REMOTE_FS = False - + try: DATA_DIR_STAT = CONSTANTS.DATA_DIR.stat() prnt( - f'EUID={os.geteuid()}:{os.getegid()} UID={RUNNING_AS_UID}:{RUNNING_AS_GID} PUID={ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}', - f'FS_UID={DATA_DIR_STAT.st_uid}:{DATA_DIR_STAT.st_gid}', - f'FS_PERMS={STORAGE_CONFIG.OUTPUT_PERMISSIONS}', - f'FS_ATOMIC={STORAGE_CONFIG.ENFORCE_ATOMIC_WRITES}', - f'FS_REMOTE={OUTPUT_IS_REMOTE_FS}', + f"EUID={os.geteuid()}:{os.getegid()} UID={RUNNING_AS_UID}:{RUNNING_AS_GID} PUID={ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}", + f"FS_UID={DATA_DIR_STAT.st_uid}:{DATA_DIR_STAT.st_gid}", + f"FS_PERMS={STORAGE_CONFIG.OUTPUT_PERMISSIONS}", + f"FS_ATOMIC={STORAGE_CONFIG.ENFORCE_ATOMIC_WRITES}", + f"FS_REMOTE={OUTPUT_IS_REMOTE_FS}", ) except Exception: prnt( - f'EUID={os.geteuid()}:{os.getegid()} UID={RUNNING_AS_UID}:{RUNNING_AS_GID} PUID={ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}', + f"EUID={os.geteuid()}:{os.getegid()} UID={RUNNING_AS_UID}:{RUNNING_AS_GID} PUID={ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}", ) - + prnt( - f'DEBUG={SHELL_CONFIG.DEBUG}', - f'IS_TTY={SHELL_CONFIG.IS_TTY}', - f'SUDO={CONSTANTS.IS_ROOT}', - f'ID={CONSTANTS.MACHINE_ID}:{CONSTANTS.COLLECTION_ID}', - f'SEARCH_BACKEND={SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE}', - f'LDAP={LDAP_ENABLED}', + f"DEBUG={SHELL_CONFIG.DEBUG}", + f"IS_TTY={SHELL_CONFIG.IS_TTY}", + f"SUDO={CONSTANTS.IS_ROOT}", + f"ID={CONSTANTS.MACHINE_ID}:{CONSTANTS.COLLECTION_ID}", + f"SEARCH_BACKEND={SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE}", + f"LDAP={LDAP_ENABLED}", ) prnt() - + if not (os.access(CONSTANTS.ARCHIVE_DIR, os.R_OK) and os.access(CONSTANTS.CONFIG_FILE, os.R_OK)): - PANEL_TEXT = '\n'.join(( - '', - '[violet]Hint:[/violet] [green]cd[/green] into a collection [blue]DATA_DIR[/blue] and run [green]archivebox version[/green] again...', - ' [grey53]OR[/grey53] run [green]archivebox init[/green] to create a new collection in the current dir.', - '', - ' [i][grey53](this is [red]REQUIRED[/red] if you are opening a Github Issue to get help)[/grey53][/i]', - '', - )) - prnt(Panel(PANEL_TEXT, expand=False, border_style='grey53', title='[red]:exclamation: No collection [blue]DATA_DIR[/blue] is currently active[/red]', subtitle='Full version info is only available when inside a collection [light_slate_blue]DATA DIR[/light_slate_blue]')) + PANEL_TEXT = "\n".join( + ( + "", + "[violet]Hint:[/violet] [green]cd[/green] into a collection [blue]DATA_DIR[/blue] and run [green]archivebox version[/green] again...", + " [grey53]OR[/grey53] run [green]archivebox init[/green] to create a new collection in the current dir.", + "", + " [i][grey53](this is [red]REQUIRED[/red] if you are opening a Github Issue to get help)[/grey53][/i]", + "", + ), + ) + prnt( + Panel( + PANEL_TEXT, + expand=False, + border_style="grey53", + title="[red]:exclamation: No collection [blue]DATA_DIR[/blue] is currently active[/red]", + subtitle="Full version info is only available when inside a collection [light_slate_blue]DATA DIR[/light_slate_blue]", + ), + ) prnt() return [] - prnt('[pale_green1][i] Binary Dependencies:[/pale_green1]') + prnt("[pale_green1][i] Binary Dependencies:[/pale_green1]") failures = [] # Setup Django before importing models try: from archivebox.config.django import setup_django + setup_django() from archivebox.machine.models import Machine, Binary @@ -113,12 +127,17 @@ def version(quiet: bool=False, machine = Machine.current() # Get all binaries from the database with timeout protection - all_installed = Binary.objects.filter( - machine=machine - ).exclude(abspath='').exclude(abspath__isnull=True).order_by('name') + all_installed = ( + Binary.objects.filter( + machine=machine, + ) + .exclude(abspath="") + .exclude(abspath__isnull=True) + .order_by("name") + ) if not all_installed.exists(): - prnt('', '[grey53]No binaries detected. Run [green]archivebox install[/green] to detect dependencies.[/grey53]') + prnt("", "[grey53]No binaries detected. Run [green]archivebox install[/green] to detect dependencies.[/grey53]") else: for installed in all_installed: # Skip if user specified specific binaries and this isn't one @@ -126,71 +145,91 @@ def version(quiet: bool=False, continue if installed.is_valid: - display_path = installed.abspath.replace(str(DATA_DIR), '.').replace(str(Path('~').expanduser()), '~') - version_str = (installed.version or 'unknown')[:15] - provider = (installed.binprovider or 'env')[:8] - prnt('', '[green]√[/green]', '', installed.name.ljust(18), version_str.ljust(16), provider.ljust(8), display_path, overflow='ignore', crop=False) + display_path = installed.abspath.replace(str(DATA_DIR), ".").replace(str(Path("~").expanduser()), "~") + version_str = (installed.version or "unknown")[:15] + provider = (installed.binprovider or "env")[:8] + prnt( + "", + "[green]√[/green]", + "", + installed.name.ljust(18), + version_str.ljust(16), + provider.ljust(8), + display_path, + overflow="ignore", + crop=False, + ) else: - prnt('', '[red]X[/red]', '', installed.name.ljust(18), '[grey53]not installed[/grey53]', overflow='ignore', crop=False) + prnt("", "[red]X[/red]", "", installed.name.ljust(18), "[grey53]not installed[/grey53]", overflow="ignore", crop=False) failures.append(installed.name) # Show hint if no binaries are installed yet - has_any_installed = Binary.objects.filter(machine=machine).exclude(abspath='').exists() + has_any_installed = Binary.objects.filter(machine=machine).exclude(abspath="").exists() if not has_any_installed: prnt() - prnt('', '[grey53]Run [green]archivebox install[/green] to detect and install dependencies.[/grey53]') + prnt("", "[grey53]Run [green]archivebox install[/green] to detect and install dependencies.[/grey53]") except Exception as e: # Handle database errors gracefully (locked, missing, etc.) prnt() - prnt('', f'[yellow]Warning: Could not query binaries from database: {e}[/yellow]') - prnt('', '[grey53]Run [green]archivebox init[/green] and [green]archivebox install[/green] to set up dependencies.[/grey53]') + prnt("", f"[yellow]Warning: Could not query binaries from database: {e}[/yellow]") + prnt("", "[grey53]Run [green]archivebox init[/green] and [green]archivebox install[/green] to set up dependencies.[/grey53]") if not binaries: # Show code and data locations prnt() - prnt('[deep_sky_blue3][i] Code locations:[/deep_sky_blue3]') + prnt("[deep_sky_blue3][i] Code locations:[/deep_sky_blue3]") try: for name, path in get_code_locations().items(): if isinstance(name, str) and isinstance(path, dict): - prnt(printable_folder_status(name, path), overflow='ignore', crop=False) + prnt(printable_folder_status(name, path), overflow="ignore", crop=False) except Exception as e: - prnt(f' [red]Error getting code locations: {e}[/red]') + prnt(f" [red]Error getting code locations: {e}[/red]") prnt() if os.access(CONSTANTS.ARCHIVE_DIR, os.R_OK) or os.access(CONSTANTS.CONFIG_FILE, os.R_OK): - prnt('[bright_yellow][i] Data locations:[/bright_yellow]') + prnt("[bright_yellow][i] Data locations:[/bright_yellow]") try: for name, path in get_data_locations().items(): if isinstance(name, str) and isinstance(path, dict): - prnt(printable_folder_status(name, path), overflow='ignore', crop=False) + prnt(printable_folder_status(name, path), overflow="ignore", crop=False) except Exception as e: - prnt(f' [red]Error getting data locations: {e}[/red]') - + prnt(f" [red]Error getting data locations: {e}[/red]") + try: from archivebox.misc.checks import check_data_dir_permissions + check_data_dir_permissions() except Exception: pass else: prnt() - prnt('[red][i] Data locations:[/red] (not in a data directory)') - + prnt("[red][i] Data locations:[/red] (not in a data directory)") + prnt() - + if failures: - prnt('[red]Error:[/red] [yellow]Failed to detect the following binaries:[/yellow]') - prnt(f' [red]{", ".join(failures)}[/red]') + prnt("[red]Error:[/red] [yellow]Failed to detect the following binaries:[/yellow]") + prnt(f" [red]{', '.join(failures)}[/red]") prnt() - prnt('[violet]Hint:[/violet] To install missing binaries automatically, run:') - prnt(' [green]archivebox install[/green]') + prnt("[violet]Hint:[/violet] To install missing binaries automatically, run:") + prnt(" [green]archivebox install[/green]") prnt() return failures @click.command() -@click.option('--quiet', '-q', is_flag=True, help='Only print ArchiveBox version number and nothing else. (equivalent to archivebox --version)') -@click.option('--binaries', '-b', help='Select binaries to detect DEFAULT=curl,wget,git,yt-dlp,chrome,single-file,readability-extractor,postlight-parser,... (all)') +@click.option( + "--quiet", + "-q", + is_flag=True, + help="Only print ArchiveBox version number and nothing else. (equivalent to archivebox --version)", +) +@click.option( + "--binaries", + "-b", + help="Select binaries to detect DEFAULT=curl,wget,git,yt-dlp,chrome,single-file,readability-extractor,postlight-parser,... (all)", +) @docstring(version.__doc__) def main(**kwargs): failures = version(**kwargs) @@ -198,5 +237,5 @@ def main(**kwargs): raise SystemExit(1) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/archivebox/cli/cli_utils.py b/archivebox/cli/cli_utils.py index 8bb7f66d..799624e2 100644 --- a/archivebox/cli/cli_utils.py +++ b/archivebox/cli/cli_utils.py @@ -5,12 +5,10 @@ This module contains common utilities used across multiple CLI commands, extracted to avoid code duplication. """ -__package__ = 'archivebox.cli' - -from typing import Optional +__package__ = "archivebox.cli" -def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None): +def apply_filters(queryset, filter_kwargs: dict, limit: int | None = None): """ Apply Django-style filters from CLI kwargs to a QuerySet. @@ -31,11 +29,11 @@ def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None): """ filters = {} for key, value in filter_kwargs.items(): - if value is None or key in ('limit', 'offset'): + if value is None or key in ("limit", "offset"): continue # Handle CSV lists for __in filters - if key.endswith('__in') and isinstance(value, str): - value = [v.strip() for v in value.split(',')] + if key.endswith("__in") and isinstance(value, str): + value = [v.strip() for v in value.split(",")] filters[key] = value if filters: diff --git a/archivebox/config/__init__.py b/archivebox/config/__init__.py index 0033269c..5f4f9032 100644 --- a/archivebox/config/__init__.py +++ b/archivebox/config/__init__.py @@ -5,16 +5,16 @@ This module provides backwards-compatible config exports for extractors and other modules that expect to import config values directly. """ -__package__ = 'archivebox.config' +__package__ = "archivebox.config" __order__ = 200 from .paths import ( - PACKAGE_DIR, # noqa - DATA_DIR, # noqa - ARCHIVE_DIR, # noqa + PACKAGE_DIR, + DATA_DIR, + ARCHIVE_DIR, ) -from .constants import CONSTANTS, CONSTANTS_CONFIG, PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR # noqa -from .version import VERSION # noqa +from .constants import CONSTANTS, CONSTANTS_CONFIG, PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR # noqa +from .version import VERSION # noqa ############################################################################### @@ -22,15 +22,18 @@ from .version import VERSION # noqa # These provide backwards compatibility with extractors that import from ..config ############################################################################### + def _get_config(): """Lazy import to avoid circular imports.""" from .common import ARCHIVING_CONFIG, STORAGE_CONFIG + return ARCHIVING_CONFIG, STORAGE_CONFIG # Direct exports (evaluated at import time for backwards compat) # These are recalculated each time the module attribute is accessed + def __getattr__(name: str): """ Module-level __getattr__ for lazy config loading. @@ -40,38 +43,38 @@ def __getattr__(name: str): """ # Generic timeout settings (used by multiple plugins) - if name == 'TIMEOUT': + if name == "TIMEOUT": cfg, _ = _get_config() return cfg.TIMEOUT # Generic SSL/Security settings (used by multiple plugins) - if name == 'CHECK_SSL_VALIDITY': + if name == "CHECK_SSL_VALIDITY": cfg, _ = _get_config() return cfg.CHECK_SSL_VALIDITY # Generic storage settings (used by multiple plugins) - if name == 'RESTRICT_FILE_NAMES': + if name == "RESTRICT_FILE_NAMES": _, storage = _get_config() return storage.RESTRICT_FILE_NAMES # Generic user agent / cookies (used by multiple plugins) - if name == 'COOKIES_FILE': + if name == "COOKIES_FILE": cfg, _ = _get_config() return cfg.COOKIES_FILE - if name == 'USER_AGENT': + if name == "USER_AGENT": cfg, _ = _get_config() return cfg.USER_AGENT # Generic resolution settings (used by multiple plugins) - if name == 'RESOLUTION': + if name == "RESOLUTION": cfg, _ = _get_config() return cfg.RESOLUTION # Allowlist/Denylist patterns (compiled regexes) - if name == 'SAVE_ALLOWLIST_PTN': + if name == "SAVE_ALLOWLIST_PTN": cfg, _ = _get_config() return cfg.SAVE_ALLOWLIST_PTNS - if name == 'SAVE_DENYLIST_PTN': + if name == "SAVE_DENYLIST_PTN": cfg, _ = _get_config() return cfg.SAVE_DENYLIST_PTNS @@ -90,12 +93,13 @@ def get_CONFIG(): SEARCH_BACKEND_CONFIG, ) from .ldap import LDAP_CONFIG + return { - 'SHELL_CONFIG': SHELL_CONFIG, - 'STORAGE_CONFIG': STORAGE_CONFIG, - 'GENERAL_CONFIG': GENERAL_CONFIG, - 'SERVER_CONFIG': SERVER_CONFIG, - 'ARCHIVING_CONFIG': ARCHIVING_CONFIG, - 'SEARCHBACKEND_CONFIG': SEARCH_BACKEND_CONFIG, - 'LDAP_CONFIG': LDAP_CONFIG, + "SHELL_CONFIG": SHELL_CONFIG, + "STORAGE_CONFIG": STORAGE_CONFIG, + "GENERAL_CONFIG": GENERAL_CONFIG, + "SERVER_CONFIG": SERVER_CONFIG, + "ARCHIVING_CONFIG": ARCHIVING_CONFIG, + "SEARCHBACKEND_CONFIG": SEARCH_BACKEND_CONFIG, + "LDAP_CONFIG": LDAP_CONFIG, } diff --git a/archivebox/config/collection.py b/archivebox/config/collection.py index 74392b91..215bafc4 100644 --- a/archivebox/config/collection.py +++ b/archivebox/config/collection.py @@ -1,8 +1,8 @@ -__package__ = 'archivebox.config' +__package__ = "archivebox.config" import os import json -from typing import Any, Optional, Type, Tuple, Dict +from typing import Any from pathlib import Path from configparser import ConfigParser @@ -27,13 +27,15 @@ def get_real_name(key: str) -> str: return key -def load_config_val(key: str, - default: Any=None, - type: Optional[Type]=None, - aliases: Optional[Tuple[str, ...]]=None, - config: Optional[benedict]=None, - env_vars: Optional[os._Environ]=None, - config_file_vars: Optional[Dict[str, str]]=None) -> Any: +def load_config_val( + key: str, + default: Any = None, + type: type | None = None, + aliases: tuple[str, ...] | None = None, + config: benedict | None = None, + env_vars: os._Environ | None = None, + config_file_vars: dict[str, str] | None = None, +) -> Any: """parse bool, int, and str key=value pairs from env""" assert isinstance(config, dict) @@ -67,8 +69,8 @@ def load_config_val(key: str, assert isinstance(val, str) # calculate value based on expected type - BOOL_TRUEIES = ('true', 'yes', '1') - BOOL_FALSEIES = ('false', 'no', '0') + BOOL_TRUEIES = ("true", "yes", "1") + BOOL_FALSEIES = ("false", "no", "0") if type is bool: if val.lower() in BOOL_TRUEIES: @@ -76,28 +78,28 @@ def load_config_val(key: str, elif val.lower() in BOOL_FALSEIES: return False else: - raise ValueError(f'Invalid configuration option {key}={val} (expected a boolean: True/False)') + raise ValueError(f"Invalid configuration option {key}={val} (expected a boolean: True/False)") elif type is str: if val.lower() in (*BOOL_TRUEIES, *BOOL_FALSEIES): - raise ValueError(f'Invalid configuration option {key}={val} (expected a string, but value looks like a boolean)') + raise ValueError(f"Invalid configuration option {key}={val} (expected a string, but value looks like a boolean)") return val.strip() elif type is int: if not val.strip().isdigit(): - raise ValueError(f'Invalid configuration option {key}={val} (expected an integer)') + raise ValueError(f"Invalid configuration option {key}={val} (expected an integer)") return int(val.strip()) elif type is list or type is dict: return json.loads(val) - + elif type is Path: return Path(val) - raise Exception('Config values can only be str, bool, int, or json') + raise Exception("Config values can only be str, bool, int, or json") -def load_config_file() -> Optional[benedict]: +def load_config_file() -> benedict | None: """load the ini-formatted config file from DATA_DIR/Archivebox.conf""" config_path = CONSTANTS.CONFIG_FILE @@ -105,17 +107,16 @@ def load_config_file() -> Optional[benedict]: config_file = CaseConfigParser() config_file.read(config_path) # flatten into one namespace - config_file_vars = benedict({ - key.upper(): val - for section, options in config_file.items() - for key, val in options.items() - }) + config_file_vars = benedict({key.upper(): val for section, options in config_file.items() for key, val in options.items()}) # print('[i] Loaded config file', os.path.abspath(config_path)) # print(config_file_vars) return config_file_vars return None + + class PluginConfigSection: """Pseudo-section for all plugin config keys written to [PLUGINS] section in ArchiveBox.conf""" + toml_section_header = "PLUGINS" def __init__(self, key: str): @@ -144,8 +145,14 @@ def section_for_key(key: str) -> Any: ) # First check core config sections - for section in [SHELL_CONFIG, STORAGE_CONFIG, GENERAL_CONFIG, - SERVER_CONFIG, ARCHIVING_CONFIG, SEARCH_BACKEND_CONFIG]: + for section in [ + SHELL_CONFIG, + STORAGE_CONFIG, + GENERAL_CONFIG, + SERVER_CONFIG, + ARCHIVING_CONFIG, + SEARCH_BACKEND_CONFIG, + ]: if hasattr(section, key): return section @@ -154,20 +161,19 @@ def section_for_key(key: str) -> Any: plugin_configs = discover_plugin_configs() for plugin_name, schema in plugin_configs.items(): - if 'properties' in schema and key in schema['properties']: + if "properties" in schema and key in schema["properties"]: # All plugin config goes to [PLUGINS] section return PluginConfigSection(key) - raise ValueError(f'No config section found for key: {key}') + raise ValueError(f"No config section found for key: {key}") -def write_config_file(config: Dict[str, str]) -> benedict: +def write_config_file(config: dict[str, str]) -> benedict: """load the ini-formatted config file from DATA_DIR/Archivebox.conf""" from archivebox.misc.system import atomic_write - CONFIG_HEADER = ( - """# This is the config file for your ArchiveBox collection. + CONFIG_HEADER = """# This is the config file for your ArchiveBox collection. # # You can add options here manually in INI format, or automatically by running: # archivebox config --set KEY=VALUE @@ -178,7 +184,7 @@ def write_config_file(config: Dict[str, str]) -> benedict: # A list of all possible config with documentation and examples can be found here: # https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration - """) + """ config_path = CONSTANTS.CONFIG_FILE @@ -188,57 +194,56 @@ def write_config_file(config: Dict[str, str]) -> benedict: config_file = CaseConfigParser() config_file.read(config_path) - with open(config_path, 'r', encoding='utf-8') as old: - atomic_write(f'{config_path}.bak', old.read()) + with open(config_path, encoding="utf-8") as old: + atomic_write(f"{config_path}.bak", old.read()) # Set up sections in empty config file for key, val in config.items(): section = section_for_key(key) assert section is not None - - if not hasattr(section, 'toml_section_header'): - raise ValueError(f'{key} is read-only (defined in {type(section).__module__}.{type(section).__name__}). Refusing to set.') - + + if not hasattr(section, "toml_section_header"): + raise ValueError(f"{key} is read-only (defined in {type(section).__module__}.{type(section).__name__}). Refusing to set.") + section_name = section.toml_section_header - + if section_name in config_file: existing_config = dict(config_file[section_name]) else: existing_config = {} - + config_file[section_name] = benedict({**existing_config, key: val}) section.update_in_place(warn=False, persist=False, **{key: val}) - with open(config_path, 'w+', encoding='utf-8') as new: + with open(config_path, "w+", encoding="utf-8") as new: config_file.write(new) updated_config = {} try: # validate the updated_config by attempting to re-parse it from archivebox.config.configset import get_flat_config + updated_config = {**load_all_config(), **get_flat_config()} - except BaseException: # lgtm [py/catch-base-exception] + except BaseException: # lgtm [py/catch-base-exception] # something went horribly wrong, revert to the previous version - with open(f'{config_path}.bak', 'r', encoding='utf-8') as old: + with open(f"{config_path}.bak", encoding="utf-8") as old: atomic_write(config_path, old.read()) raise - if os.access(f'{config_path}.bak', os.F_OK): - os.remove(f'{config_path}.bak') + if os.access(f"{config_path}.bak", os.F_OK): + os.remove(f"{config_path}.bak") - return benedict({ - key.upper(): updated_config.get(key.upper()) - for key in config.keys() - }) + return benedict({key.upper(): updated_config.get(key.upper()) for key in config.keys()}) - -def load_config(defaults: Dict[str, Any], - config: Optional[benedict]=None, - out_dir: Optional[str]=None, - env_vars: Optional[os._Environ]=None, - config_file_vars: Optional[Dict[str, str]]=None) -> benedict: +def load_config( + defaults: dict[str, Any], + config: benedict | None = None, + out_dir: str | None = None, + env_vars: os._Environ | None = None, + config_file_vars: dict[str, str] | None = None, +) -> benedict: env_vars = env_vars or os.environ config_file_vars = config_file_vars or load_config_file() @@ -249,9 +254,9 @@ def load_config(defaults: Dict[str, Any], # print('LOADING CONFIG KEY:', key, 'DEFAULT=', default) extended_config[key] = load_config_val( key, - default=default['default'], - type=default.get('type'), - aliases=default.get('aliases'), + default=default["default"], + type=default.get("type"), + aliases=default.get("aliases"), config=extended_config, env_vars=env_vars, config_file_vars=config_file_vars, @@ -260,19 +265,20 @@ def load_config(defaults: Dict[str, Any], raise SystemExit(0) except Exception as e: stderr() - stderr(f'[X] Error while loading configuration value: {key}', color='red', config=extended_config) - stderr(' {}: {}'.format(e.__class__.__name__, e)) + stderr(f"[X] Error while loading configuration value: {key}", color="red", config=extended_config) + stderr(f" {e.__class__.__name__}: {e}") stderr() - stderr(' Check your config for mistakes and try again (your archive data is unaffected).') + stderr(" Check your config for mistakes and try again (your archive data is unaffected).") stderr() - stderr(' For config documentation and examples see:') - stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration') + stderr(" For config documentation and examples see:") + stderr(" https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration") stderr() # raise # raise SystemExit(2) return benedict(extended_config) + def load_all_config(): """Load all config sections and return as a flat dict.""" from archivebox.config.common import ( @@ -283,11 +289,17 @@ def load_all_config(): ARCHIVING_CONFIG, SEARCH_BACKEND_CONFIG, ) - + flat_config = benedict() - - for config_section in [SHELL_CONFIG, STORAGE_CONFIG, GENERAL_CONFIG, - SERVER_CONFIG, ARCHIVING_CONFIG, SEARCH_BACKEND_CONFIG]: + + for config_section in [ + SHELL_CONFIG, + STORAGE_CONFIG, + GENERAL_CONFIG, + SERVER_CONFIG, + ARCHIVING_CONFIG, + SEARCH_BACKEND_CONFIG, + ]: flat_config.update(dict(config_section)) - + return flat_config diff --git a/archivebox/config/common.py b/archivebox/config/common.py index e01b1931..2be64d9b 100644 --- a/archivebox/config/common.py +++ b/archivebox/config/common.py @@ -4,7 +4,7 @@ import re import secrets import sys import shutil -from typing import ClassVar, Dict, Optional, List +from typing import ClassVar from pathlib import Path from rich.console import Console @@ -39,8 +39,8 @@ class ShellConfig(BaseConfigSet): IN_DOCKER: bool = Field(default=IN_DOCKER) IN_QEMU: bool = Field(default=False) - ANSI: Dict[str, str] = Field( - default_factory=lambda: CONSTANTS.DEFAULT_CLI_COLORS if sys.stdout.isatty() else CONSTANTS.DISABLED_CLI_COLORS + ANSI: dict[str, str] = Field( + default_factory=lambda: CONSTANTS.DEFAULT_CLI_COLORS if sys.stdout.isatty() else CONSTANTS.DISABLED_CLI_COLORS, ) @property @@ -50,7 +50,7 @@ class ShellConfig(BaseConfigSet): return shutil.get_terminal_size((140, 10)).columns @property - def COMMIT_HASH(self) -> Optional[str]: + def COMMIT_HASH(self) -> str | None: return get_COMMIT_HASH() @property @@ -112,7 +112,7 @@ class ServerConfig(BaseConfigSet): "danger-onedomain-fullreplay", ) - SECRET_KEY: str = Field(default_factory=lambda: ''.join(secrets.choice("abcdefghijklmnopqrstuvwxyz0123456789_") for _ in range(50))) + SECRET_KEY: str = Field(default_factory=lambda: "".join(secrets.choice("abcdefghijklmnopqrstuvwxyz0123456789_") for _ in range(50))) BIND_ADDR: str = Field(default="127.0.0.1:8000") LISTEN_HOST: str = Field(default="archivebox.localhost:8000") ADMIN_BASE_URL: str = Field(default="") @@ -124,7 +124,7 @@ class ServerConfig(BaseConfigSet): SNAPSHOTS_PER_PAGE: int = Field(default=40) PREVIEW_ORIGINALS: bool = Field(default=True) FOOTER_INFO: str = Field( - default="Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests." + default="Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.", ) # CUSTOM_TEMPLATES_DIR: Path = Field(default=None) # this is now a constant @@ -132,8 +132,8 @@ class ServerConfig(BaseConfigSet): PUBLIC_SNAPSHOTS: bool = Field(default=True) PUBLIC_ADD_VIEW: bool = Field(default=False) - ADMIN_USERNAME: Optional[str] = Field(default=None) - ADMIN_PASSWORD: Optional[str] = Field(default=None) + ADMIN_USERNAME: str | None = Field(default=None) + ADMIN_PASSWORD: str | None = Field(default=None) REVERSE_PROXY_USER_HEADER: str = Field(default="Remote-User") REVERSE_PROXY_WHITELIST: str = Field(default="") @@ -234,22 +234,22 @@ class ArchivingConfig(BaseConfigSet): RESOLUTION: str = Field(default="1440,2000") CHECK_SSL_VALIDITY: bool = Field(default=True) USER_AGENT: str = Field( - default=f"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)" + default=f"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)", ) COOKIES_FILE: Path | None = Field(default=None) URL_DENYLIST: str = Field(default=r"\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$", alias="URL_BLACKLIST") URL_ALLOWLIST: str | None = Field(default=None, alias="URL_WHITELIST") - SAVE_ALLOWLIST: Dict[str, List[str]] = Field(default={}) # mapping of regex patterns to list of archive methods - SAVE_DENYLIST: Dict[str, List[str]] = Field(default={}) + SAVE_ALLOWLIST: dict[str, list[str]] = Field(default={}) # mapping of regex patterns to list of archive methods + SAVE_DENYLIST: dict[str, list[str]] = Field(default={}) DEFAULT_PERSONA: str = Field(default="Default") def warn_if_invalid(self) -> None: if int(self.TIMEOUT) < 5: rprint(f"[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.TIMEOUT} seconds)[/red]", file=sys.stderr) - rprint(" You must allow *at least* 5 seconds for indexing and archive methods to run succesfully.", file=sys.stderr) + rprint(" You must allow *at least* 5 seconds for indexing and archive methods to run successfully.", file=sys.stderr) rprint(" (Setting it to somewhere between 30 and 3000 seconds is recommended)", file=sys.stderr) rprint(file=sys.stderr) rprint(" If you want to make ArchiveBox run faster, disable specific archive methods instead:", file=sys.stderr) @@ -274,7 +274,7 @@ class ArchivingConfig(BaseConfigSet): return re.compile(self.URL_DENYLIST, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS) @property - def SAVE_ALLOWLIST_PTNS(self) -> Dict[re.Pattern, List[str]]: + def SAVE_ALLOWLIST_PTNS(self) -> dict[re.Pattern, list[str]]: return ( { # regexp: methods list @@ -286,7 +286,7 @@ class ArchivingConfig(BaseConfigSet): ) @property - def SAVE_DENYLIST_PTNS(self) -> Dict[re.Pattern, List[str]]: + def SAVE_DENYLIST_PTNS(self) -> dict[re.Pattern, list[str]]: return ( { # regexp: methods list diff --git a/archivebox/config/configset.py b/archivebox/config/configset.py index ce5b5646..869d1cf0 100644 --- a/archivebox/config/configset.py +++ b/archivebox/config/configset.py @@ -11,7 +11,7 @@ __package__ = "archivebox.config" import os import json from pathlib import Path -from typing import Any, Dict, Optional, Type, Tuple +from typing import Any from configparser import ConfigParser from pydantic_settings import BaseSettings, PydanticBaseSettingsSource, SettingsConfigDict @@ -28,17 +28,18 @@ class IniConfigSettingsSource(PydanticBaseSettingsSource): Flattens all sections into a single namespace. """ - def get_field_value(self, field: Any, field_name: str) -> Tuple[Any, str, bool]: + def get_field_value(self, field: Any, field_name: str) -> tuple[Any, str, bool]: config_vals = self._load_config_file() field_value = config_vals.get(field_name.upper()) return field_value, field_name, False - def __call__(self) -> Dict[str, Any]: + def __call__(self) -> dict[str, Any]: return self._load_config_file() - def _load_config_file(self) -> Dict[str, Any]: + def _load_config_file(self) -> dict[str, Any]: try: from archivebox.config.constants import CONSTANTS + config_path = CONSTANTS.CONFIG_FILE except ImportError: return {} @@ -78,25 +79,25 @@ class BaseConfigSet(BaseSettings): @classmethod def settings_customise_sources( cls, - settings_cls: Type[BaseSettings], + settings_cls: type[BaseSettings], init_settings: PydanticBaseSettingsSource, env_settings: PydanticBaseSettingsSource, dotenv_settings: PydanticBaseSettingsSource, file_secret_settings: PydanticBaseSettingsSource, - ) -> Tuple[PydanticBaseSettingsSource, ...]: + ) -> tuple[PydanticBaseSettingsSource, ...]: """ Define the order of settings sources (first = highest priority). """ return ( - init_settings, # 1. Passed to __init__ - env_settings, # 2. Environment variables + init_settings, # 1. Passed to __init__ + env_settings, # 2. Environment variables IniConfigSettingsSource(settings_cls), # 3. ArchiveBox.conf file # dotenv_settings, # Skip .env files # file_secret_settings, # Skip secrets files ) @classmethod - def load_from_file(cls, config_path: Path) -> Dict[str, str]: + def load_from_file(cls, config_path: Path) -> dict[str, str]: """Load config values from INI file.""" if not config_path.exists(): return {} @@ -120,14 +121,14 @@ class BaseConfigSet(BaseSettings): def get_config( - defaults: Optional[Dict] = None, + defaults: dict | None = None, persona: Any = None, user: Any = None, crawl: Any = None, snapshot: Any = None, archiveresult: Any = None, machine: Any = None, -) -> Dict[str, Any]: +) -> dict[str, Any]: """ Get merged config from all sources. @@ -176,7 +177,7 @@ def get_config( if persona_id: persona = Persona.objects.filter(id=persona_id).first() if persona is None: - raise Persona.DoesNotExist(f'Crawl {getattr(crawl, "id", None)} references missing Persona {persona_id}') + raise Persona.DoesNotExist(f"Crawl {getattr(crawl, 'id', None)} references missing Persona {persona_id}") if persona is None: crawl_config = getattr(crawl, "config", None) or {} @@ -200,6 +201,7 @@ def get_config( # Add plugin config defaults from JSONSchema config.json files try: from archivebox.hooks import get_config_defaults_from_plugins + plugin_defaults = get_config_defaults_from_plugins() config.update(plugin_defaults) except ImportError: @@ -224,6 +226,7 @@ def get_config( # Default to current machine if not provided try: from archivebox.machine.models import Machine + machine = Machine.current() except Exception: pass # Machine might not be available during early init @@ -246,16 +249,17 @@ def get_config( # Also check plugin config aliases in environment try: from archivebox.hooks import discover_plugin_configs + plugin_configs = discover_plugin_configs() for plugin_name, schema in plugin_configs.items(): - for key, prop_schema in schema.get('properties', {}).items(): + for key, prop_schema in schema.get("properties", {}).items(): # Check x-aliases - for alias in prop_schema.get('x-aliases', []): + for alias in prop_schema.get("x-aliases", []): if alias in os.environ and key not in os.environ: config[key] = _parse_env_value(os.environ[alias], config.get(key)) break # Check x-fallback - fallback = prop_schema.get('x-fallback') + fallback = prop_schema.get("x-fallback") if fallback and fallback in config and key not in config: config[key] = config[fallback] except ImportError: @@ -275,33 +279,34 @@ def get_config( # Add crawl path aliases for hooks that need shared crawl state. if crawl and hasattr(crawl, "output_dir"): - config['CRAWL_OUTPUT_DIR'] = str(crawl.output_dir) - config['CRAWL_DIR'] = str(crawl.output_dir) - config['CRAWL_ID'] = str(getattr(crawl, "id", "")) if getattr(crawl, "id", None) else config.get('CRAWL_ID') + config["CRAWL_OUTPUT_DIR"] = str(crawl.output_dir) + config["CRAWL_DIR"] = str(crawl.output_dir) + config["CRAWL_ID"] = str(getattr(crawl, "id", "")) if getattr(crawl, "id", None) else config.get("CRAWL_ID") # Apply snapshot config overrides (highest priority) if snapshot and hasattr(snapshot, "config") and snapshot.config: config.update(snapshot.config) if snapshot: - config['SNAPSHOT_ID'] = str(getattr(snapshot, "id", "")) if getattr(snapshot, "id", None) else config.get('SNAPSHOT_ID') - config['SNAPSHOT_DEPTH'] = int(getattr(snapshot, "depth", 0) or 0) + config["SNAPSHOT_ID"] = str(getattr(snapshot, "id", "")) if getattr(snapshot, "id", None) else config.get("SNAPSHOT_ID") + config["SNAPSHOT_DEPTH"] = int(getattr(snapshot, "depth", 0) or 0) if hasattr(snapshot, "output_dir"): - config['SNAP_DIR'] = str(snapshot.output_dir) + config["SNAP_DIR"] = str(snapshot.output_dir) if getattr(snapshot, "crawl_id", None): - config['CRAWL_ID'] = str(snapshot.crawl_id) + config["CRAWL_ID"] = str(snapshot.crawl_id) # Normalize all aliases to canonical names (after all sources merged) # This handles aliases that came from user/crawl/snapshot configs, not just env try: from archivebox.hooks import discover_plugin_configs + plugin_configs = discover_plugin_configs() aliases_to_normalize = {} # {alias_key: canonical_key} # Build alias mapping from all plugin schemas for plugin_name, schema in plugin_configs.items(): - for canonical_key, prop_schema in schema.get('properties', {}).items(): - for alias in prop_schema.get('x-aliases', []): + for canonical_key, prop_schema in schema.get("properties", {}).items(): + for alias in prop_schema.get("x-aliases", []): aliases_to_normalize[alias] = canonical_key # Normalize: copy alias values to canonical keys (aliases take precedence) @@ -314,10 +319,14 @@ def get_config( except ImportError: pass + if not config.get("DATA_DIR"): + config["DATA_DIR"] = str(CONSTANTS.DATA_DIR) + config["ABX_RUNTIME"] = "archivebox" + return config -def get_flat_config() -> Dict[str, Any]: +def get_flat_config() -> dict[str, Any]: """ Get a flat dictionary of all config values. @@ -326,20 +335,24 @@ def get_flat_config() -> Dict[str, Any]: return get_config() -def get_all_configs() -> Dict[str, BaseConfigSet]: +def get_all_configs() -> dict[str, BaseConfigSet]: """ Get all config section objects as a dictionary. Replaces abx.pm.hook.get_CONFIGS() """ from archivebox.config.common import ( - SHELL_CONFIG, SERVER_CONFIG, ARCHIVING_CONFIG, SEARCH_BACKEND_CONFIG + SHELL_CONFIG, + SERVER_CONFIG, + ARCHIVING_CONFIG, + SEARCH_BACKEND_CONFIG, ) + return { - 'SHELL_CONFIG': SHELL_CONFIG, - 'SERVER_CONFIG': SERVER_CONFIG, - 'ARCHIVING_CONFIG': ARCHIVING_CONFIG, - 'SEARCH_BACKEND_CONFIG': SEARCH_BACKEND_CONFIG, + "SHELL_CONFIG": SHELL_CONFIG, + "SERVER_CONFIG": SERVER_CONFIG, + "ARCHIVING_CONFIG": ARCHIVING_CONFIG, + "SEARCH_BACKEND_CONFIG": SEARCH_BACKEND_CONFIG, } @@ -394,7 +407,7 @@ DEFAULT_WORKER_CONCURRENCY = { } -def get_worker_concurrency() -> Dict[str, int]: +def get_worker_concurrency() -> dict[str, int]: """ Get worker concurrency settings. diff --git a/archivebox/config/constants.py b/archivebox/config/constants.py index ffcaf775..40fc11d9 100644 --- a/archivebox/config/constants.py +++ b/archivebox/config/constants.py @@ -5,17 +5,16 @@ Constants are for things that never change at runtime. DATA_DIR will never change at runtime, but you can run archivebox from inside a different DATA_DIR on the same machine. -This is loaded very early in the archivebox startup flow, so nothing in this file -or imported from this file should import anything from archivebox.config.common, +This is loaded very early in the archivebox startup flow, so nothing in this file +or imported from this file should import anything from archivebox.config.common, django, other INSTALLED_APPS, or anything else that is not in a standard library. """ -__package__ = 'archivebox.config' +__package__ = "archivebox.config" import re import sys -from typing import Dict from pathlib import Path from benedict import benedict @@ -46,184 +45,235 @@ from .version import detect_installed_version class ConstantsDict: - PACKAGE_DIR: Path = PACKAGE_DIR - DATA_DIR: Path = DATA_DIR - ARCHIVE_DIR: Path = ARCHIVE_DIR - - MACHINE_TYPE: str = get_machine_type() - MACHINE_ID: str = get_machine_id() - COLLECTION_ID: str = get_collection_id(DATA_DIR) - + PACKAGE_DIR: Path = PACKAGE_DIR + DATA_DIR: Path = DATA_DIR + ARCHIVE_DIR: Path = ARCHIVE_DIR + + MACHINE_TYPE: str = get_machine_type() + MACHINE_ID: str = get_machine_id() + COLLECTION_ID: str = get_collection_id(DATA_DIR) + # Host system - VERSION: str = detect_installed_version(PACKAGE_DIR) - IN_DOCKER: bool = IN_DOCKER - + VERSION: str = detect_installed_version(PACKAGE_DIR) + IN_DOCKER: bool = IN_DOCKER + # Permissions - IS_ROOT: bool = IS_ROOT - ARCHIVEBOX_USER: int = ARCHIVEBOX_USER - ARCHIVEBOX_GROUP: int = ARCHIVEBOX_GROUP - RUNNING_AS_UID: int = RUNNING_AS_UID - RUNNING_AS_GID: int = RUNNING_AS_GID - DEFAULT_PUID: int = DEFAULT_PUID - DEFAULT_PGID: int = DEFAULT_PGID - IS_INSIDE_VENV: bool = sys.prefix != sys.base_prefix - + IS_ROOT: bool = IS_ROOT + ARCHIVEBOX_USER: int = ARCHIVEBOX_USER + ARCHIVEBOX_GROUP: int = ARCHIVEBOX_GROUP + RUNNING_AS_UID: int = RUNNING_AS_UID + RUNNING_AS_GID: int = RUNNING_AS_GID + DEFAULT_PUID: int = DEFAULT_PUID + DEFAULT_PGID: int = DEFAULT_PGID + IS_INSIDE_VENV: bool = sys.prefix != sys.base_prefix + # Source code dirs - PACKAGE_DIR_NAME: str = PACKAGE_DIR.name - TEMPLATES_DIR_NAME: str = 'templates' - TEMPLATES_DIR: Path = PACKAGE_DIR / TEMPLATES_DIR_NAME - STATIC_DIR_NAME: str = 'static' - STATIC_DIR: Path = TEMPLATES_DIR / STATIC_DIR_NAME + PACKAGE_DIR_NAME: str = PACKAGE_DIR.name + TEMPLATES_DIR_NAME: str = "templates" + TEMPLATES_DIR: Path = PACKAGE_DIR / TEMPLATES_DIR_NAME + STATIC_DIR_NAME: str = "static" + STATIC_DIR: Path = TEMPLATES_DIR / STATIC_DIR_NAME # Data dirs - ARCHIVE_DIR_NAME: str = 'archive' - SOURCES_DIR_NAME: str = 'sources' - PERSONAS_DIR_NAME: str = 'personas' - CACHE_DIR_NAME: str = 'cache' - LOGS_DIR_NAME: str = 'logs' - CUSTOM_PLUGINS_DIR_NAME: str = 'custom_plugins' - CUSTOM_TEMPLATES_DIR_NAME: str = 'custom_templates' - ARCHIVE_DIR: Path = DATA_DIR / ARCHIVE_DIR_NAME - SOURCES_DIR: Path = DATA_DIR / SOURCES_DIR_NAME - PERSONAS_DIR: Path = DATA_DIR / PERSONAS_DIR_NAME - LOGS_DIR: Path = DATA_DIR / LOGS_DIR_NAME - CACHE_DIR: Path = DATA_DIR / CACHE_DIR_NAME - CUSTOM_TEMPLATES_DIR: Path = DATA_DIR / CUSTOM_TEMPLATES_DIR_NAME - USER_PLUGINS_DIR: Path = DATA_DIR / CUSTOM_PLUGINS_DIR_NAME + ARCHIVE_DIR_NAME: str = "archive" + SOURCES_DIR_NAME: str = "sources" + PERSONAS_DIR_NAME: str = "personas" + CACHE_DIR_NAME: str = "cache" + LOGS_DIR_NAME: str = "logs" + CUSTOM_PLUGINS_DIR_NAME: str = "custom_plugins" + CUSTOM_TEMPLATES_DIR_NAME: str = "custom_templates" + ARCHIVE_DIR: Path = DATA_DIR / ARCHIVE_DIR_NAME + SOURCES_DIR: Path = DATA_DIR / SOURCES_DIR_NAME + PERSONAS_DIR: Path = DATA_DIR / PERSONAS_DIR_NAME + LOGS_DIR: Path = DATA_DIR / LOGS_DIR_NAME + CACHE_DIR: Path = DATA_DIR / CACHE_DIR_NAME + CUSTOM_TEMPLATES_DIR: Path = DATA_DIR / CUSTOM_TEMPLATES_DIR_NAME + USER_PLUGINS_DIR: Path = DATA_DIR / CUSTOM_PLUGINS_DIR_NAME # Data dir files - CONFIG_FILENAME: str = 'ArchiveBox.conf' - SQL_INDEX_FILENAME: str = 'index.sqlite3' - CONFIG_FILE: Path = DATA_DIR / CONFIG_FILENAME - DATABASE_FILE: Path = DATA_DIR / SQL_INDEX_FILENAME - - JSON_INDEX_FILENAME: str = 'index.json' - JSONL_INDEX_FILENAME: str = 'index.jsonl' - HTML_INDEX_FILENAME: str = 'index.html' - ROBOTS_TXT_FILENAME: str = 'robots.txt' - FAVICON_FILENAME: str = 'favicon.ico' - - # Runtime dirs - TMP_DIR_NAME: str = 'tmp' - DEFAULT_TMP_DIR: Path = DATA_DIR / TMP_DIR_NAME / MACHINE_ID # ./data/tmp/abc3244323 + CONFIG_FILENAME: str = "ArchiveBox.conf" + SQL_INDEX_FILENAME: str = "index.sqlite3" + CONFIG_FILE: Path = DATA_DIR / CONFIG_FILENAME + DATABASE_FILE: Path = DATA_DIR / SQL_INDEX_FILENAME - LIB_DIR_NAME: str = 'lib' - DEFAULT_LIB_DIR: Path = DATA_DIR / LIB_DIR_NAME / MACHINE_TYPE # ./data/lib/arm64-linux-docker - DEFAULT_LIB_BIN_DIR: Path = DEFAULT_LIB_DIR / 'bin' # ./data/lib/arm64-linux-docker/bin + JSON_INDEX_FILENAME: str = "index.json" + JSONL_INDEX_FILENAME: str = "index.jsonl" + HTML_INDEX_FILENAME: str = "index.html" + ROBOTS_TXT_FILENAME: str = "robots.txt" + FAVICON_FILENAME: str = "favicon.ico" + + # Runtime dirs + TMP_DIR_NAME: str = "tmp" + DEFAULT_TMP_DIR: Path = DATA_DIR / TMP_DIR_NAME / MACHINE_ID # ./data/tmp/abc3244323 + + LIB_DIR_NAME: str = "lib" + DEFAULT_LIB_DIR: Path = DATA_DIR / LIB_DIR_NAME / MACHINE_TYPE # ./data/lib/arm64-linux-docker + DEFAULT_LIB_BIN_DIR: Path = DEFAULT_LIB_DIR / "bin" # ./data/lib/arm64-linux-docker/bin # Config constants - TIMEZONE: str = 'UTC' - DEFAULT_CLI_COLORS: Dict[str, str] = DEFAULT_CLI_COLORS - DISABLED_CLI_COLORS: Dict[str, str] = benedict({k: '' for k in DEFAULT_CLI_COLORS}) + TIMEZONE: str = "UTC" + DEFAULT_CLI_COLORS: dict[str, str] = DEFAULT_CLI_COLORS + DISABLED_CLI_COLORS: dict[str, str] = benedict({k: "" for k in DEFAULT_CLI_COLORS}) # Hard safety limits (seconds) - MAX_HOOK_RUNTIME_SECONDS: int = 60 * 60 * 12 # 12 hours - MAX_SNAPSHOT_RUNTIME_SECONDS: int = 60 * 60 * 12 # 12 hours + MAX_HOOK_RUNTIME_SECONDS: int = 60 * 60 * 12 # 12 hours + MAX_SNAPSHOT_RUNTIME_SECONDS: int = 60 * 60 * 12 # 12 hours - ALLOWDENYLIST_REGEX_FLAGS: int = re.IGNORECASE | re.UNICODE | re.MULTILINE + ALLOWDENYLIST_REGEX_FLAGS: int = re.IGNORECASE | re.UNICODE | re.MULTILINE - STATICFILE_EXTENSIONS: frozenset[str] = frozenset(( - # 99.999% of the time, URLs ending in these extensions are static files - # that can be downloaded as-is, not html pages that need to be rendered - 'gif', 'jpeg', 'jpg', 'png', 'tif', 'tiff', 'wbmp', 'ico', 'jng', 'bmp', - 'svg', 'svgz', 'webp', 'ps', 'eps', 'ai', - 'mp3', 'mp4', 'm4a', 'mpeg', 'mpg', 'mkv', 'mov', 'webm', 'm4v', - 'flv', 'wmv', 'avi', 'ogg', 'ts', 'm3u8', - 'pdf', 'txt', 'rtf', 'rtfd', 'doc', 'docx', 'ppt', 'pptx', 'xls', 'xlsx', - 'atom', 'rss', 'css', 'js', 'json', - 'dmg', 'iso', 'img', - 'rar', 'war', 'hqx', 'zip', 'gz', 'bz2', '7z', + STATICFILE_EXTENSIONS: frozenset[str] = frozenset( + ( + # 99.999% of the time, URLs ending in these extensions are static files + # that can be downloaded as-is, not html pages that need to be rendered + "gif", + "jpeg", + "jpg", + "png", + "tif", + "tiff", + "wbmp", + "ico", + "jng", + "bmp", + "svg", + "svgz", + "webp", + "ps", + "eps", + "ai", + "mp3", + "mp4", + "m4a", + "mpeg", + "mpg", + "mkv", + "mov", + "webm", + "m4v", + "flv", + "wmv", + "avi", + "ogg", + "ts", + "m3u8", + "pdf", + "txt", + "rtf", + "rtfd", + "doc", + "docx", + "ppt", + "pptx", + "xls", + "xlsx", + "atom", + "rss", + "css", + "js", + "json", + "dmg", + "iso", + "img", + "rar", + "war", + "hqx", + "zip", + "gz", + "bz2", + "7z", + # Less common extensions to consider adding later + # jar, swf, bin, com, exe, dll, deb + # ear, hqx, eot, wmlc, kml, kmz, cco, jardiff, jnlp, run, msi, msp, msm, + # pl pm, prc pdb, rar, rpm, sea, sit, tcl tk, der, pem, crt, xpi, xspf, + # ra, mng, asx, asf, 3gpp, 3gp, mid, midi, kar, jad, wml, htc, mml + # These are always treated as pages, not as static files, never add them: + # html, htm, shtml, xhtml, xml, aspx, php, cgi + ), + ) - # Less common extensions to consider adding later - # jar, swf, bin, com, exe, dll, deb - # ear, hqx, eot, wmlc, kml, kmz, cco, jardiff, jnlp, run, msi, msp, msm, - # pl pm, prc pdb, rar, rpm, sea, sit, tcl tk, der, pem, crt, xpi, xspf, - # ra, mng, asx, asf, 3gpp, 3gp, mid, midi, kar, jad, wml, htc, mml - - # These are always treated as pages, not as static files, never add them: - # html, htm, shtml, xhtml, xml, aspx, php, cgi - )) - - PIP_RELATED_NAMES: frozenset[str] = frozenset(( - ".venv", - "venv", - "virtualenv", - ".virtualenv", - )) - NPM_RELATED_NAMES: frozenset[str] = frozenset(( - "node_modules", - "package.json", - "package-lock.json", - "yarn.lock", - )) + PIP_RELATED_NAMES: frozenset[str] = frozenset( + ( + ".venv", + "venv", + "virtualenv", + ".virtualenv", + ), + ) + NPM_RELATED_NAMES: frozenset[str] = frozenset( + ( + "node_modules", + "package.json", + "package-lock.json", + "yarn.lock", + ), + ) # When initializing archivebox in a new directory, we check to make sure the dir is # actually empty so that we dont clobber someone's home directory or desktop by accident. # These files are exceptions to the is_empty check when we're trying to init a new dir, # as they could be from a previous archivebox version, system artifacts, dependencies, etc. - ALLOWED_IN_DATA_DIR: frozenset[str] = frozenset(( - *PIP_RELATED_NAMES, - *NPM_RELATED_NAMES, - - ### Dirs: - ARCHIVE_DIR_NAME, - SOURCES_DIR_NAME, - LOGS_DIR_NAME, - CACHE_DIR_NAME, - LIB_DIR_NAME, - TMP_DIR_NAME, - PERSONAS_DIR_NAME, - CUSTOM_TEMPLATES_DIR_NAME, - CUSTOM_PLUGINS_DIR_NAME, - "invalid", - "users", - "machine", - # Backwards compatibility with old directory names - "user_plugins", # old name for USER_PLUGINS_DIR (now 'plugins') - "user_templates", # old name for CUSTOM_TEMPLATES_DIR (now 'templates') - "static", # created by old static exports None: # This warning can be triggered during django.setup() but is safe to ignore # since we're doing intentional setup operations import warnings - warnings.filterwarnings('ignore', - message='.*Accessing the database during app initialization.*', - category=RuntimeWarning) + + warnings.filterwarnings( + "ignore", + message=".*Accessing the database during app initialization.*", + category=RuntimeWarning, + ) try: from django.core.management import call_command if in_memory_db: - raise Exception('dont use this anymore') + raise Exception("dont use this anymore") # some commands dont store a long-lived sqlite3 db file on disk. # in those cases we create a temporary in-memory db and run the migrations @@ -84,19 +88,22 @@ def setup_django(check_db=False, in_memory_db=False) -> None: try: django.setup() except Exception as e: - is_using_meta_cmd = any(ignored_subcommand in sys.argv for ignored_subcommand in ('help', 'version', '--help', '--version')) + is_using_meta_cmd = any(ignored_subcommand in sys.argv for ignored_subcommand in ("help", "version", "--help", "--version")) if not is_using_meta_cmd: # show error message to user only if they're not running a meta command / just trying to get help STDERR.print() - STDERR.print(Panel( - f'\n[red]{e.__class__.__name__}[/red]: [yellow]{e}[/yellow]\nPlease check your config and [blue]DATA_DIR[/blue] permissions.\n', - title='\n\n[red][X] Error while trying to load database![/red]', - subtitle='[grey53]NO WRITES CAN BE PERFORMED[/grey53]', - expand=False, - style='bold red', - )) + STDERR.print( + Panel( + f"\n[red]{e.__class__.__name__}[/red]: [yellow]{e}[/yellow]\nPlease check your config and [blue]DATA_DIR[/blue] permissions.\n", + title="\n\n[red][X] Error while trying to load database![/red]", + subtitle="[grey53]NO WRITES CAN BE PERFORMED[/grey53]", + expand=False, + style="bold red", + ), + ) STDERR.print() import traceback + traceback.print_exc() return @@ -104,28 +111,29 @@ def setup_django(check_db=False, in_memory_db=False) -> None: from archivebox.core.settings_logging import ERROR_LOG as DEFAULT_ERROR_LOG # log startup message to the error log - error_log = getattr(settings, 'ERROR_LOG', DEFAULT_ERROR_LOG) - with open(error_log, "a", encoding='utf-8') as f: - command = ' '.join(sys.argv) - ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S') + error_log = getattr(settings, "ERROR_LOG", DEFAULT_ERROR_LOG) + with open(error_log, "a", encoding="utf-8") as f: + command = " ".join(sys.argv) + ts = datetime.now(timezone.utc).strftime("%Y-%m-%d__%H:%M:%S") f.write(f"\n> {command}; TS={ts} VERSION={CONSTANTS.VERSION} IN_DOCKER={SHELL_CONFIG.IN_DOCKER} IS_TTY={SHELL_CONFIG.IS_TTY}\n") if check_db: # make sure the data dir is owned by a non-root user if CONSTANTS.DATA_DIR.stat().st_uid == 0: - STDERR.print('[red][X] Error: ArchiveBox DATA_DIR cannot be owned by root![/red]') - STDERR.print(f' {CONSTANTS.DATA_DIR}') + STDERR.print("[red][X] Error: ArchiveBox DATA_DIR cannot be owned by root![/red]") + STDERR.print(f" {CONSTANTS.DATA_DIR}") STDERR.print() - STDERR.print('[violet]Hint:[/violet] Are you running archivebox in the right folder? (and as a non-root user?)') - STDERR.print(' cd path/to/your/archive/data') - STDERR.print(' archivebox [command]') + STDERR.print("[violet]Hint:[/violet] Are you running archivebox in the right folder? (and as a non-root user?)") + STDERR.print(" cd path/to/your/archive/data") + STDERR.print(" archivebox [command]") STDERR.print() raise SystemExit(9) # Create cache table in DB if needed try: from django.core.cache import cache - cache.get('test', None) + + cache.get("test", None) except django.db.utils.OperationalError: call_command("createcachetable", verbosity=0) @@ -133,12 +141,14 @@ def setup_django(check_db=False, in_memory_db=False) -> None: # the sqlite3 whenever we init from scratch to avoid multiple threads # sharing the same connection by accident from django.db import connections + for conn in connections.all(): conn.close_if_unusable_or_obsolete() sql_index_path = CONSTANTS.DATABASE_FILE assert os.access(sql_index_path, os.F_OK), ( - f'No database file {sql_index_path} found in: {CONSTANTS.DATA_DIR} (Are you in an ArchiveBox collection directory?)') + f"No database file {sql_index_path} found in: {CONSTANTS.DATA_DIR} (Are you in an ArchiveBox collection directory?)" + ) # https://docs.pydantic.dev/logfire/integrations/django/ Logfire Debugging # if settings.DEBUG_LOGFIRE: diff --git a/archivebox/config/ldap.py b/archivebox/config/ldap.py index 2fe146a1..a2eadee3 100644 --- a/archivebox/config/ldap.py +++ b/archivebox/config/ldap.py @@ -1,6 +1,5 @@ __package__ = "archivebox.config" -from typing import Optional from pydantic import Field from archivebox.config.configset import BaseConfigSet @@ -13,13 +12,14 @@ class LDAPConfig(BaseConfigSet): Only loads and validates if django-auth-ldap is installed. These settings integrate with Django's LDAP authentication backend. """ + toml_section_header: str = "LDAP_CONFIG" LDAP_ENABLED: bool = Field(default=False) - LDAP_SERVER_URI: Optional[str] = Field(default=None) - LDAP_BIND_DN: Optional[str] = Field(default=None) - LDAP_BIND_PASSWORD: Optional[str] = Field(default=None) - LDAP_USER_BASE: Optional[str] = Field(default=None) + LDAP_SERVER_URI: str | None = Field(default=None) + LDAP_BIND_DN: str | None = Field(default=None) + LDAP_BIND_PASSWORD: str | None = Field(default=None) + LDAP_USER_BASE: str | None = Field(default=None) LDAP_USER_FILTER: str = Field(default="(uid=%(user)s)") LDAP_USERNAME_ATTR: str = Field(default="username") LDAP_FIRSTNAME_ATTR: str = Field(default="givenName") diff --git a/archivebox/config/paths.py b/archivebox/config/paths.py index 8a4d81fe..59885dcc 100644 --- a/archivebox/config/paths.py +++ b/archivebox/config/paths.py @@ -1,4 +1,4 @@ -__package__ = 'archivebox.config' +__package__ = "archivebox.config" import os import socket @@ -15,24 +15,25 @@ from .permissions import SudoPermission, IS_ROOT, ARCHIVEBOX_USER ############################################################################################# -PACKAGE_DIR: Path = Path(__file__).resolve().parent.parent # archivebox source code dir -DATA_DIR: Path = Path(os.environ.get('DATA_DIR', os.getcwd())).resolve() # archivebox user data dir -ARCHIVE_DIR: Path = DATA_DIR / 'archive' # archivebox snapshot data dir +PACKAGE_DIR: Path = Path(__file__).resolve().parent.parent # archivebox source code dir +DATA_DIR: Path = Path(os.environ.get("DATA_DIR", os.getcwd())).resolve() # archivebox user data dir +ARCHIVE_DIR: Path = DATA_DIR / "archive" # archivebox snapshot data dir -IN_DOCKER = os.environ.get('IN_DOCKER', False) in ('1', 'true', 'True', 'TRUE', 'yes') +IN_DOCKER = os.environ.get("IN_DOCKER", False) in ("1", "true", "True", "TRUE", "yes") -DATABASE_FILE = DATA_DIR / 'index.sqlite3' +DATABASE_FILE = DATA_DIR / "index.sqlite3" ############################################################################################# + def _get_collection_id(DATA_DIR=DATA_DIR, force_create=False) -> str: - collection_id_file = DATA_DIR / '.archivebox_id' - + collection_id_file = DATA_DIR / ".archivebox_id" + try: return collection_id_file.read_text().strip() except (OSError, FileNotFoundError, PermissionError): pass - + # hash the machine_id + collection dir path + creation time to get a unique collection_id machine_id = get_machine_id() collection_path = DATA_DIR.resolve() @@ -40,55 +41,60 @@ def _get_collection_id(DATA_DIR=DATA_DIR, force_create=False) -> str: creation_date = DATA_DIR.stat().st_ctime except Exception: creation_date = datetime.now().isoformat() - collection_id = hashlib.sha256(f'{machine_id}:{collection_path}@{creation_date}'.encode()).hexdigest()[:8] - + collection_id = hashlib.sha256(f"{machine_id}:{collection_path}@{creation_date}".encode()).hexdigest()[:8] + try: # only persist collection_id file if we already have an index.sqlite3 file present # otherwise we might be running in a directory that is not a collection, no point creating cruft files collection_is_active = os.path.isfile(DATABASE_FILE) and os.path.isdir(ARCHIVE_DIR) and os.access(DATA_DIR, os.W_OK) if collection_is_active or force_create: collection_id_file.write_text(collection_id) - + # if we're running as root right now, make sure the collection_id file is owned by the archivebox user if IS_ROOT: with SudoPermission(uid=0): if ARCHIVEBOX_USER == 0: os.system(f'chmod 777 "{collection_id_file}"') - else: + else: os.system(f'chown {ARCHIVEBOX_USER} "{collection_id_file}"') except (OSError, FileNotFoundError, PermissionError): pass return collection_id + @cache def get_collection_id(DATA_DIR=DATA_DIR) -> str: """Get a short, stable, unique ID for the current collection (e.g. abc45678)""" return _get_collection_id(DATA_DIR=DATA_DIR) + @cache def get_machine_id() -> str: """Get a short, stable, unique ID for the current machine (e.g. abc45678)""" - - MACHINE_ID = 'unknown' + + MACHINE_ID = "unknown" try: import machineid - MACHINE_ID = machineid.hashed_id('archivebox')[:8] + + MACHINE_ID = machineid.hashed_id("archivebox")[:8] except Exception: try: import uuid import hashlib + MACHINE_ID = hashlib.sha256(str(uuid.getnode()).encode()).hexdigest()[:8] except Exception: pass return MACHINE_ID + @cache def get_machine_type() -> str: """Get a short, stable, unique type identifier for the current machine (e.g. linux-x86_64-docker)""" - - OS: str = platform.system().lower() # darwin, linux, etc. - ARCH: str = platform.machine().lower() # arm64, x86_64, aarch64, etc. - LIB_DIR_SCOPE: str = f'{ARCH}-{OS}-docker' if IN_DOCKER else f'{ARCH}-{OS}' + + OS: str = platform.system().lower() # darwin, linux, etc. + ARCH: str = platform.machine().lower() # arm64, x86_64, aarch64, etc. + LIB_DIR_SCOPE: str = f"{ARCH}-{OS}-docker" if IN_DOCKER else f"{ARCH}-{OS}" return LIB_DIR_SCOPE @@ -97,27 +103,28 @@ def dir_is_writable(dir_path: Path, uid: int | None = None, gid: int | None = No current_uid, current_gid = os.geteuid(), os.getegid() uid, gid = uid or current_uid, gid or current_gid - test_file = dir_path / '.permissions_test' + test_file = dir_path / ".permissions_test" try: with SudoPermission(uid=uid, fallback=fallback): test_file.exists() - test_file.write_text(f'Checking if PUID={uid} PGID={gid} can write to dir') + test_file.write_text(f"Checking if PUID={uid} PGID={gid} can write to dir") test_file.unlink() return True - except (IOError, OSError, PermissionError): - if chown: + except (OSError, PermissionError): + if chown: # try fixing it using sudo permissions with SudoPermission(uid=uid, fallback=fallback): os.system(f'chown {uid}:{gid} "{dir_path}" 2>/dev/null') return dir_is_writable(dir_path, uid=uid, gid=gid, fallback=fallback, chown=False) return False + def assert_dir_can_contain_unix_sockets(dir_path: Path) -> bool: """Check if a given directory can contain unix sockets (e.g. /tmp/supervisord.sock)""" from archivebox.misc.logging_util import pretty_path - + try: - socket_path = str(dir_path / '.test_socket.sock') + socket_path = str(dir_path / ".test_socket.sock") s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) try: os.remove(socket_path) @@ -130,8 +137,8 @@ def assert_dir_can_contain_unix_sockets(dir_path: Path) -> bool: except OSError: pass except Exception as e: - raise Exception(f'ArchiveBox failed to create a test UNIX socket file in {pretty_path(dir_path, color=False)}') from e - + raise Exception(f"ArchiveBox failed to create a test UNIX socket file in {pretty_path(dir_path, color=False)}") from e + return True @@ -143,8 +150,9 @@ def create_and_chown_dir(dir_path: Path) -> None: def tmp_dir_socket_path_is_short_enough(dir_path: Path) -> bool: - socket_file = dir_path.absolute().resolve() / 'supervisord.sock' - return len(f'file://{socket_file}') <= 96 + socket_file = dir_path.absolute().resolve() / "supervisord.sock" + return len(f"file://{socket_file}") <= 96 + @cache def get_or_create_working_tmp_dir(autofix=True, quiet=True): @@ -154,14 +162,18 @@ def get_or_create_working_tmp_dir(autofix=True, quiet=True): # try a few potential directories in order of preference CANDIDATES = [ - STORAGE_CONFIG.TMP_DIR, # - CONSTANTS.DEFAULT_TMP_DIR, # ./data/tmp/ - Path('/var/run/archivebox') / get_collection_id(), # /var/run/archivebox/abc5d8512 - Path('/tmp') / 'archivebox' / get_collection_id(), # /tmp/archivebox/abc5d8512 - Path('~/.tmp/archivebox').expanduser() / get_collection_id(), # ~/.tmp/archivebox/abc5d8512 - Path(tempfile.gettempdir()) / 'archivebox' / get_collection_id(), # /var/folders/qy/6tpfrpx100j1t4l312nz683m0000gn/T/archivebox/abc5d8512 - Path(tempfile.gettempdir()) / 'archivebox' / get_collection_id()[:4], # /var/folders/qy/6tpfrpx100j1t4l312nz683m0000gn/T/archivebox/abc5d - Path(tempfile.gettempdir()) / 'abx' / get_collection_id()[:4], # /var/folders/qy/6tpfrpx100j1t4l312nz683m0000gn/T/abx/abc5 + STORAGE_CONFIG.TMP_DIR, # + CONSTANTS.DEFAULT_TMP_DIR, # ./data/tmp/ + Path("/var/run/archivebox") / get_collection_id(), # /var/run/archivebox/abc5d8512 + Path("/tmp") / "archivebox" / get_collection_id(), # /tmp/archivebox/abc5d8512 + Path("~/.tmp/archivebox").expanduser() / get_collection_id(), # ~/.tmp/archivebox/abc5d8512 + Path(tempfile.gettempdir()) + / "archivebox" + / get_collection_id(), # /var/folders/qy/6tpfrpx100j1t4l312nz683m0000gn/T/archivebox/abc5d8512 + Path(tempfile.gettempdir()) + / "archivebox" + / get_collection_id()[:4], # /var/folders/qy/6tpfrpx100j1t4l312nz683m0000gn/T/archivebox/abc5d + Path(tempfile.gettempdir()) / "abx" / get_collection_id()[:4], # /var/folders/qy/6tpfrpx100j1t4l312nz683m0000gn/T/abx/abc5 ] fallback_candidate = None for candidate in CANDIDATES: @@ -174,7 +186,12 @@ def get_or_create_working_tmp_dir(autofix=True, quiet=True): STORAGE_CONFIG.update_in_place(TMP_DIR=candidate) return candidate try: - if fallback_candidate is None and candidate.exists() and dir_is_writable(candidate) and tmp_dir_socket_path_is_short_enough(candidate): + if ( + fallback_candidate is None + and candidate.exists() + and dir_is_writable(candidate) + and tmp_dir_socket_path_is_short_enough(candidate) + ): fallback_candidate = candidate except Exception: pass @@ -186,25 +203,28 @@ def get_or_create_working_tmp_dir(autofix=True, quiet=True): if autofix and STORAGE_CONFIG.TMP_DIR != fallback_candidate: STORAGE_CONFIG.update_in_place(TMP_DIR=fallback_candidate) return fallback_candidate - + if not quiet: - raise OSError(f'ArchiveBox is unable to find a writable TMP_DIR, tried {CANDIDATES}!') + raise OSError(f"ArchiveBox is unable to find a writable TMP_DIR, tried {CANDIDATES}!") + @cache def get_or_create_working_lib_dir(autofix=True, quiet=False): from archivebox import CONSTANTS from archivebox.config.common import STORAGE_CONFIG from archivebox.misc.checks import check_lib_dir - + # try a few potential directories in order of preference CANDIDATES = [ - STORAGE_CONFIG.LIB_DIR, # - CONSTANTS.DEFAULT_LIB_DIR, # ./data/lib/arm64-linux-docker - Path('/usr/local/share/archivebox') / get_collection_id(), # /usr/local/share/archivebox/abc5 - *([Path('/opt/homebrew/share/archivebox') / get_collection_id()] if os.path.isfile('/opt/homebrew/bin/archivebox') else []), # /opt/homebrew/share/archivebox/abc5 - Path('~/.local/share/archivebox').expanduser() / get_collection_id(), # ~/.local/share/archivebox/abc5 + STORAGE_CONFIG.LIB_DIR, # + CONSTANTS.DEFAULT_LIB_DIR, # ./data/lib/arm64-linux-docker + Path("/usr/local/share/archivebox") / get_collection_id(), # /usr/local/share/archivebox/abc5 + *( + [Path("/opt/homebrew/share/archivebox") / get_collection_id()] if os.path.isfile("/opt/homebrew/bin/archivebox") else [] + ), # /opt/homebrew/share/archivebox/abc5 + Path("~/.local/share/archivebox").expanduser() / get_collection_id(), # ~/.local/share/archivebox/abc5 ] - + for candidate in CANDIDATES: try: create_and_chown_dir(candidate) @@ -214,10 +234,9 @@ def get_or_create_working_lib_dir(autofix=True, quiet=False): if autofix and STORAGE_CONFIG.LIB_DIR != candidate: STORAGE_CONFIG.update_in_place(LIB_DIR=candidate) return candidate - - if not quiet: - raise OSError(f'ArchiveBox is unable to find a writable LIB_DIR, tried {CANDIDATES}!') + if not quiet: + raise OSError(f"ArchiveBox is unable to find a writable LIB_DIR, tried {CANDIDATES}!") @cache @@ -229,57 +248,68 @@ def get_data_locations(): tmp_dir = get_or_create_working_tmp_dir(autofix=True, quiet=True) or STORAGE_CONFIG.TMP_DIR except Exception: tmp_dir = STORAGE_CONFIG.TMP_DIR - - return benedict({ - "DATA_DIR": { - "path": DATA_DIR.resolve(), - "enabled": True, - "is_valid": os.path.isdir(DATA_DIR) and os.access(DATA_DIR, os.R_OK) and os.access(DATA_DIR, os.W_OK), - "is_mount": os.path.ismount(DATA_DIR.resolve()), + + return benedict( + { + "DATA_DIR": { + "path": DATA_DIR.resolve(), + "enabled": True, + "is_valid": os.path.isdir(DATA_DIR) and os.access(DATA_DIR, os.R_OK) and os.access(DATA_DIR, os.W_OK), + "is_mount": os.path.ismount(DATA_DIR.resolve()), + }, + "CONFIG_FILE": { + "path": CONSTANTS.CONFIG_FILE.resolve(), + "enabled": True, + "is_valid": os.path.isfile(CONSTANTS.CONFIG_FILE) + and os.access(CONSTANTS.CONFIG_FILE, os.R_OK) + and os.access(CONSTANTS.CONFIG_FILE, os.W_OK), + }, + "SQL_INDEX": { + "path": DATABASE_FILE.resolve(), + "enabled": True, + "is_valid": os.path.isfile(DATABASE_FILE) and os.access(DATABASE_FILE, os.R_OK) and os.access(DATABASE_FILE, os.W_OK), + "is_mount": os.path.ismount(DATABASE_FILE.resolve()), + }, + "ARCHIVE_DIR": { + "path": ARCHIVE_DIR.resolve(), + "enabled": True, + "is_valid": os.path.isdir(ARCHIVE_DIR) and os.access(ARCHIVE_DIR, os.R_OK) and os.access(ARCHIVE_DIR, os.W_OK), + "is_mount": os.path.ismount(ARCHIVE_DIR.resolve()), + }, + "SOURCES_DIR": { + "path": CONSTANTS.SOURCES_DIR.resolve(), + "enabled": True, + "is_valid": os.path.isdir(CONSTANTS.SOURCES_DIR) + and os.access(CONSTANTS.SOURCES_DIR, os.R_OK) + and os.access(CONSTANTS.SOURCES_DIR, os.W_OK), + }, + "PERSONAS_DIR": { + "path": CONSTANTS.PERSONAS_DIR.resolve(), + "enabled": os.path.isdir(CONSTANTS.PERSONAS_DIR), + "is_valid": os.path.isdir(CONSTANTS.PERSONAS_DIR) + and os.access(CONSTANTS.PERSONAS_DIR, os.R_OK) + and os.access(CONSTANTS.PERSONAS_DIR, os.W_OK), # read + write + }, + "LOGS_DIR": { + "path": CONSTANTS.LOGS_DIR.resolve(), + "enabled": True, + "is_valid": os.path.isdir(CONSTANTS.LOGS_DIR) + and os.access(CONSTANTS.LOGS_DIR, os.R_OK) + and os.access(CONSTANTS.LOGS_DIR, os.W_OK), # read + write + }, + "TMP_DIR": { + "path": tmp_dir.resolve(), + "enabled": True, + "is_valid": os.path.isdir(tmp_dir) and os.access(tmp_dir, os.R_OK) and os.access(tmp_dir, os.W_OK), # read + write + }, + # "CACHE_DIR": { + # "path": CACHE_DIR.resolve(), + # "enabled": True, + # "is_valid": os.access(CACHE_DIR, os.R_OK) and os.access(CACHE_DIR, os.W_OK), # read + write + # }, }, - "CONFIG_FILE": { - "path": CONSTANTS.CONFIG_FILE.resolve(), - "enabled": True, - "is_valid": os.path.isfile(CONSTANTS.CONFIG_FILE) and os.access(CONSTANTS.CONFIG_FILE, os.R_OK) and os.access(CONSTANTS.CONFIG_FILE, os.W_OK), - }, - "SQL_INDEX": { - "path": DATABASE_FILE.resolve(), - "enabled": True, - "is_valid": os.path.isfile(DATABASE_FILE) and os.access(DATABASE_FILE, os.R_OK) and os.access(DATABASE_FILE, os.W_OK), - "is_mount": os.path.ismount(DATABASE_FILE.resolve()), - }, - "ARCHIVE_DIR": { - "path": ARCHIVE_DIR.resolve(), - "enabled": True, - "is_valid": os.path.isdir(ARCHIVE_DIR) and os.access(ARCHIVE_DIR, os.R_OK) and os.access(ARCHIVE_DIR, os.W_OK), - "is_mount": os.path.ismount(ARCHIVE_DIR.resolve()), - }, - "SOURCES_DIR": { - "path": CONSTANTS.SOURCES_DIR.resolve(), - "enabled": True, - "is_valid": os.path.isdir(CONSTANTS.SOURCES_DIR) and os.access(CONSTANTS.SOURCES_DIR, os.R_OK) and os.access(CONSTANTS.SOURCES_DIR, os.W_OK), - }, - "PERSONAS_DIR": { - "path": CONSTANTS.PERSONAS_DIR.resolve(), - "enabled": os.path.isdir(CONSTANTS.PERSONAS_DIR), - "is_valid": os.path.isdir(CONSTANTS.PERSONAS_DIR) and os.access(CONSTANTS.PERSONAS_DIR, os.R_OK) and os.access(CONSTANTS.PERSONAS_DIR, os.W_OK), # read + write - }, - "LOGS_DIR": { - "path": CONSTANTS.LOGS_DIR.resolve(), - "enabled": True, - "is_valid": os.path.isdir(CONSTANTS.LOGS_DIR) and os.access(CONSTANTS.LOGS_DIR, os.R_OK) and os.access(CONSTANTS.LOGS_DIR, os.W_OK), # read + write - }, - 'TMP_DIR': { - 'path': tmp_dir.resolve(), - 'enabled': True, - 'is_valid': os.path.isdir(tmp_dir) and os.access(tmp_dir, os.R_OK) and os.access(tmp_dir, os.W_OK), # read + write - }, - # "CACHE_DIR": { - # "path": CACHE_DIR.resolve(), - # "enabled": True, - # "is_valid": os.access(CACHE_DIR, os.R_OK) and os.access(CACHE_DIR, os.W_OK), # read + write - # }, - }) + ) + @cache def get_code_locations(): @@ -291,41 +321,45 @@ def get_code_locations(): except Exception: lib_dir = STORAGE_CONFIG.LIB_DIR - lib_bin_dir = lib_dir / 'bin' - - return benedict({ - 'PACKAGE_DIR': { - 'path': (PACKAGE_DIR).resolve(), - 'enabled': True, - 'is_valid': os.access(PACKAGE_DIR / '__main__.py', os.X_OK), # executable - }, - 'TEMPLATES_DIR': { - 'path': CONSTANTS.TEMPLATES_DIR.resolve(), - 'enabled': True, - 'is_valid': os.access(CONSTANTS.STATIC_DIR, os.R_OK) and os.access(CONSTANTS.STATIC_DIR, os.X_OK), # read + list - }, - 'CUSTOM_TEMPLATES_DIR': { - 'path': STORAGE_CONFIG.CUSTOM_TEMPLATES_DIR.resolve(), - 'enabled': os.path.isdir(STORAGE_CONFIG.CUSTOM_TEMPLATES_DIR), - 'is_valid': os.path.isdir(STORAGE_CONFIG.CUSTOM_TEMPLATES_DIR) and os.access(STORAGE_CONFIG.CUSTOM_TEMPLATES_DIR, os.R_OK), # read - }, - 'USER_PLUGINS_DIR': { - 'path': CONSTANTS.USER_PLUGINS_DIR.resolve(), - 'enabled': os.path.isdir(CONSTANTS.USER_PLUGINS_DIR), - 'is_valid': os.path.isdir(CONSTANTS.USER_PLUGINS_DIR) and os.access(CONSTANTS.USER_PLUGINS_DIR, os.R_OK), # read - }, - 'LIB_DIR': { - 'path': lib_dir.resolve(), - 'enabled': True, - 'is_valid': os.path.isdir(lib_dir) and os.access(lib_dir, os.R_OK) and os.access(lib_dir, os.W_OK), # read + write - }, - 'LIB_BIN_DIR': { - 'path': lib_bin_dir.resolve(), - 'enabled': True, - 'is_valid': os.path.isdir(lib_bin_dir) and os.access(lib_bin_dir, os.R_OK) and os.access(lib_bin_dir, os.W_OK), # read + write - }, - }) + lib_bin_dir = lib_dir / "bin" + return benedict( + { + "PACKAGE_DIR": { + "path": (PACKAGE_DIR).resolve(), + "enabled": True, + "is_valid": os.access(PACKAGE_DIR / "__main__.py", os.X_OK), # executable + }, + "TEMPLATES_DIR": { + "path": CONSTANTS.TEMPLATES_DIR.resolve(), + "enabled": True, + "is_valid": os.access(CONSTANTS.STATIC_DIR, os.R_OK) and os.access(CONSTANTS.STATIC_DIR, os.X_OK), # read + list + }, + "CUSTOM_TEMPLATES_DIR": { + "path": STORAGE_CONFIG.CUSTOM_TEMPLATES_DIR.resolve(), + "enabled": os.path.isdir(STORAGE_CONFIG.CUSTOM_TEMPLATES_DIR), + "is_valid": os.path.isdir(STORAGE_CONFIG.CUSTOM_TEMPLATES_DIR) + and os.access(STORAGE_CONFIG.CUSTOM_TEMPLATES_DIR, os.R_OK), # read + }, + "USER_PLUGINS_DIR": { + "path": CONSTANTS.USER_PLUGINS_DIR.resolve(), + "enabled": os.path.isdir(CONSTANTS.USER_PLUGINS_DIR), + "is_valid": os.path.isdir(CONSTANTS.USER_PLUGINS_DIR) and os.access(CONSTANTS.USER_PLUGINS_DIR, os.R_OK), # read + }, + "LIB_DIR": { + "path": lib_dir.resolve(), + "enabled": True, + "is_valid": os.path.isdir(lib_dir) and os.access(lib_dir, os.R_OK) and os.access(lib_dir, os.W_OK), # read + write + }, + "LIB_BIN_DIR": { + "path": lib_bin_dir.resolve(), + "enabled": True, + "is_valid": os.path.isdir(lib_bin_dir) + and os.access(lib_bin_dir, os.R_OK) + and os.access(lib_bin_dir, os.W_OK), # read + write + }, + }, + ) # @cache @@ -340,9 +374,9 @@ def get_code_locations(): # - ok to have a long path (doesnt contain SOCKETS) # """ # from .version import detect_installed_version - + # HOST_DIRS = PlatformDirs(appname='archivebox', appauthor='ArchiveBox', version=detect_installed_version(), opinion=True, ensure_exists=False) - + # lib_dir = tempfile.gettempdir() # try: # if 'SYSTEM_LIB_DIR' in os.environ: @@ -350,7 +384,7 @@ def get_code_locations(): # else: # with SudoPermission(uid=ARCHIVEBOX_USER, fallback=True): # lib_dir = HOST_DIRS.site_data_path - + # # Docker: /usr/local/share/archivebox/0.8.5 # # Ubuntu: /usr/local/share/archivebox/0.8.5 # # macOS: /Library/Application Support/archivebox @@ -358,16 +392,16 @@ def get_code_locations(): # with SudoPermission(uid=0, fallback=True): # lib_dir.mkdir(parents=True, exist_ok=True) # except PermissionError: -# # our user cannot +# # our user cannot # lib_dir = HOST_DIRS.user_data_path # lib_dir.mkdir(parents=True, exist_ok=True) - + # if IS_ROOT or not dir_is_writable(lib_dir, uid=ARCHIVEBOX_USER): # if IS_ROOT: # # make sure lib dir is owned by the archivebox user, not root # with SudoPermission(uid=0): # if ARCHIVEBOX_USER == 0: -# # print(f'[yellow]:warning: Waring: Creating SYSTEM_LIB_DIR {lib_dir} with mode 777 so that non-root archivebox users can share it.[/yellow] (caches shared libs used by archivebox for performance)', file=sys.stderr) +# # print(f'[yellow]:warning: Warning: Creating SYSTEM_LIB_DIR {lib_dir} with mode 777 so that non-root archivebox users can share it.[/yellow] (caches shared libs used by archivebox for performance)', file=sys.stderr) # os.system(f'chmod -R 777 "{lib_dir}"') # else: # os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{lib_dir}"') @@ -376,9 +410,9 @@ def get_code_locations(): # except (PermissionError, AssertionError): # # raise PermissionError(f'SYSTEM_LIB_DIR {lib_dir} is not writable by archivebox user {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}') # print(f'[red]:cross_mark: ERROR: SYSTEM_LIB_DIR {lib_dir} is not writable by archivebox user {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/red]', file=sys.stderr) - + # return lib_dir - + # @cache # def get_TMP_DIR(): # """ @@ -390,9 +424,9 @@ def get_code_locations(): # - must be cleared on every archivebox version upgrade # """ # from .version import detect_installed_version - + # HOST_DIRS = PlatformDirs(appname='archivebox', appauthor='ArchiveBox', version=detect_installed_version(), opinion=True, ensure_exists=False) - + # # print('DATA_DIR OWNED BY:', ARCHIVEBOX_USER, ARCHIVEBOX_GROUP) # # print('RUNNING AS:', self.PUID, self.PGID) # run_dir = tempfile.gettempdir() @@ -405,7 +439,7 @@ def get_code_locations(): # if IS_ROOT: # with SudoPermission(uid=0, fallback=False): # if ARCHIVEBOX_USER == 0: -# # print(f'[yellow]:warning: Waring: Creating SYSTEM_TMP_DIR {run_dir} with mode 777 so that non-root archivebox users can access it.[/yellow]', file=sys.stderr) +# # print(f'[yellow]:warning: Warning: Creating SYSTEM_TMP_DIR {run_dir} with mode 777 so that non-root archivebox users can access it.[/yellow]', file=sys.stderr) # os.system(f'chmod -R 777 "{run_dir}"') # else: # os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{run_dir}"') @@ -413,30 +447,30 @@ def get_code_locations(): # raise PermissionError() # assert len(str(run_dir / 'supervisord.conf')) < 95, 'SYSTEM_TMP_DIR path is too long, please set SYSTEM_TMP_DIR env variable to a shorter path (unfortunately unix requires socket paths be < 108 chars)' # return run_dir - + # run_dir = (HOST_DIRS.site_runtime_path / get_collection_id(DATA_DIR=DATA_DIR)).resolve() # try: # assert len(str(run_dir)) + len('/supervisord.sock') < 95 # except AssertionError: # run_dir = Path(tempfile.gettempdir()).resolve() / 'archivebox' / get_collection_id(DATA_DIR=DATA_DIR) # assert len(str(run_dir)) + len('/supervisord.sock') < 95, 'SYSTEM_TMP_DIR path is too long, please set SYSTEM_TMP_DIR env variable to a shorter path (unfortunately unix requires socket paths be < 108 chars)' - + # with SudoPermission(uid=0, fallback=True): # run_dir.mkdir(parents=True, exist_ok=True) - + # if IS_ROOT or not dir_is_writable(run_dir, uid=ARCHIVEBOX_USER): # if IS_ROOT: # with SudoPermission(uid=0): # if ARCHIVEBOX_USER == 0: -# # print(f'[yellow]:warning: Waring: Creating SYSTEM_TMP_DIR {run_dir} with mode 777 so that non-root archivebox users can access it.[/yellow]', file=sys.stderr) +# # print(f'[yellow]:warning: Warning: Creating SYSTEM_TMP_DIR {run_dir} with mode 777 so that non-root archivebox users can access it.[/yellow]', file=sys.stderr) # os.system(f'chmod -R 777 "{run_dir}"') # else: # os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{run_dir}"') # else: # raise PermissionError() - + # except (PermissionError, AssertionError): # # raise PermissionError(f'SYSTEM_TMP_DIR {run_dir} is not writable by archivebox user {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}') # print(f'[red]:cross_mark: ERROR: SYSTEM_TMP_DIR {run_dir} is not writable by archivebox user {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/red]', file=sys.stderr) - + # return run_dir diff --git a/archivebox/config/permissions.py b/archivebox/config/permissions.py index 1207ee38..45afb3ca 100644 --- a/archivebox/config/permissions.py +++ b/archivebox/config/permissions.py @@ -1,4 +1,4 @@ -__package__ = 'archivebox.config' +__package__ = "archivebox.config" import os import pwd @@ -17,26 +17,26 @@ from contextlib import contextmanager DATA_DIR = Path(os.getcwd()) try: - DATA_DIR_STAT = DATA_DIR.stat() - DATA_DIR_UID = DATA_DIR_STAT.st_uid - DATA_DIR_GID = DATA_DIR_STAT.st_gid + DATA_DIR_STAT = DATA_DIR.stat() + DATA_DIR_UID = DATA_DIR_STAT.st_uid + DATA_DIR_GID = DATA_DIR_STAT.st_gid except PermissionError: - DATA_DIR_UID = 0 - DATA_DIR_GID = 0 + DATA_DIR_UID = 0 + DATA_DIR_GID = 0 -DEFAULT_PUID = 911 -DEFAULT_PGID = 911 -RUNNING_AS_UID = os.getuid() -RUNNING_AS_GID = os.getgid() -EUID = os.geteuid() -EGID = os.getegid() -SUDO_UID = int(os.environ.get('SUDO_UID', 0)) -SUDO_GID = int(os.environ.get('SUDO_GID', 0)) -USER: str = Path('~').expanduser().resolve().name -HOSTNAME: str = cast(str, max([socket.gethostname(), platform.node()], key=len)) +DEFAULT_PUID = 911 +DEFAULT_PGID = 911 +RUNNING_AS_UID = os.getuid() +RUNNING_AS_GID = os.getgid() +EUID = os.geteuid() +EGID = os.getegid() +SUDO_UID = int(os.environ.get("SUDO_UID", 0)) +SUDO_GID = int(os.environ.get("SUDO_GID", 0)) +USER: str = Path("~").expanduser().resolve().name +HOSTNAME: str = cast(str, max([socket.gethostname(), platform.node()], key=len)) IS_ROOT = RUNNING_AS_UID == 0 -IN_DOCKER = os.environ.get('IN_DOCKER', False) in ('1', 'true', 'True', 'TRUE', 'yes') +IN_DOCKER = os.environ.get("IN_DOCKER", False) in ("1", "true", "True", "TRUE", "yes") # IN_DOCKER_COMPOSE = # TODO: figure out a way to detect if running in docker compose @@ -47,74 +47,79 @@ if RUNNING_AS_UID == 0: # if we are running as root it's really hard to figure out what the correct archivebox user should be # as a last resort instead of setting DATA_DIR ownership to 0:0 (which breaks it for non-root users) # check if 911:911 archivebox user exists on host system, and use it instead of 0 - if pwd.getpwuid(DEFAULT_PUID).pw_name == 'archivebox': + if pwd.getpwuid(DEFAULT_PUID).pw_name == "archivebox": FALLBACK_UID = DEFAULT_PUID FALLBACK_GID = DEFAULT_PGID except Exception: pass -os.environ.setdefault('PUID', str(DATA_DIR_UID or EUID or RUNNING_AS_UID or FALLBACK_UID)) -os.environ.setdefault('PGID', str(DATA_DIR_GID or EGID or RUNNING_AS_GID or FALLBACK_GID)) +os.environ.setdefault("PUID", str(DATA_DIR_UID or EUID or RUNNING_AS_UID or FALLBACK_UID)) +os.environ.setdefault("PGID", str(DATA_DIR_GID or EGID or RUNNING_AS_GID or FALLBACK_GID)) -ARCHIVEBOX_USER = int(os.environ['PUID']) -ARCHIVEBOX_GROUP = int(os.environ['PGID']) +ARCHIVEBOX_USER = int(os.environ["PUID"]) +ARCHIVEBOX_GROUP = int(os.environ["PGID"]) if not USER: try: # alternative method 1 to get username USER = pwd.getpwuid(ARCHIVEBOX_USER).pw_name except Exception: pass - + if not USER: try: # alternative method 2 to get username import getpass + USER = getpass.getuser() except Exception: pass - + if not USER: try: # alternative method 3 to get username - USER = os.getlogin() or 'archivebox' + USER = os.getlogin() or "archivebox" except Exception: - USER = 'archivebox' - + USER = "archivebox" + ARCHIVEBOX_USER_EXISTS = False try: pwd.getpwuid(ARCHIVEBOX_USER) ARCHIVEBOX_USER_EXISTS = True except Exception: ARCHIVEBOX_USER_EXISTS = False - + ############################################################################################# + def drop_privileges(): """If running as root, drop privileges to the user that owns the data dir (or PUID)""" - + # always run archivebox as the user that owns the data dir, never as root if os.getuid() == 0: # drop permissions to the user that owns the data dir / provided PUID if os.geteuid() != ARCHIVEBOX_USER and ARCHIVEBOX_USER != 0 and ARCHIVEBOX_USER_EXISTS: # drop our effective UID to the archivebox user's UID os.seteuid(ARCHIVEBOX_USER) - + # update environment variables so that subprocesses dont try to write to /root pw_record = pwd.getpwuid(ARCHIVEBOX_USER) - os.environ['HOME'] = pw_record.pw_dir - os.environ['LOGNAME'] = pw_record.pw_name - os.environ['USER'] = pw_record.pw_name + os.environ["HOME"] = pw_record.pw_dir + os.environ["LOGNAME"] = pw_record.pw_name + os.environ["USER"] = pw_record.pw_name if ARCHIVEBOX_USER == 0 or not ARCHIVEBOX_USER_EXISTS: - print('[yellow]:warning: Running as [red]root[/red] is not recommended and may make your [blue]DATA_DIR[/blue] inaccessible to other users on your system.[/yellow] (use [blue]sudo[/blue] instead)', file=sys.stderr) + print( + "[yellow]:warning: Running as [red]root[/red] is not recommended and may make your [blue]DATA_DIR[/blue] inaccessible to other users on your system.[/yellow] (use [blue]sudo[/blue] instead)", + file=sys.stderr, + ) @contextmanager def SudoPermission(uid=0, fallback=False): """Attempt to run code with sudo permissions for a given user (or root)""" - + if os.geteuid() == uid: # no need to change effective UID, we are already that user yield @@ -125,7 +130,7 @@ def SudoPermission(uid=0, fallback=False): os.seteuid(uid) except PermissionError as err: if not fallback: - raise PermissionError(f'Not enough permissions to run code as uid={uid}, please retry with sudo') from err + raise PermissionError(f"Not enough permissions to run code as uid={uid}, please retry with sudo") from err try: # yield back to the caller so they can run code inside context as root yield @@ -135,4 +140,4 @@ def SudoPermission(uid=0, fallback=False): os.seteuid(ARCHIVEBOX_USER) except PermissionError as err: if not fallback: - raise PermissionError(f'Failed to revert uid={uid} back to {ARCHIVEBOX_USER} after running code with sudo') from err + raise PermissionError(f"Failed to revert uid={uid} back to {ARCHIVEBOX_USER} after running code with sudo") from err diff --git a/archivebox/config/version.py b/archivebox/config/version.py index 415bf81b..fde5533d 100644 --- a/archivebox/config/version.py +++ b/archivebox/config/version.py @@ -1,4 +1,4 @@ -__package__ = 'archivebox.config' +__package__ = "archivebox.config" import os import importlib.metadata @@ -6,71 +6,71 @@ import importlib.metadata from pathlib import Path from functools import cache from datetime import datetime -from typing import Optional ############################################################################################# -IN_DOCKER = os.environ.get('IN_DOCKER', False) in ('1', 'true', 'True', 'TRUE', 'yes') +IN_DOCKER = os.environ.get("IN_DOCKER", False) in ("1", "true", "True", "TRUE", "yes") -PACKAGE_DIR: Path = Path(__file__).resolve().parent.parent # archivebox source code dir -DATA_DIR: Path = Path(os.environ.get('DATA_DIR', os.getcwd())).resolve() # archivebox user data dir -ARCHIVE_DIR: Path = DATA_DIR / 'archive' # archivebox snapshot data dir +PACKAGE_DIR: Path = Path(__file__).resolve().parent.parent # archivebox source code dir +DATA_DIR: Path = Path(os.environ.get("DATA_DIR", os.getcwd())).resolve() # archivebox user data dir +ARCHIVE_DIR: Path = DATA_DIR / "archive" # archivebox snapshot data dir ############################################################################################# @cache -def detect_installed_version(PACKAGE_DIR: Path=PACKAGE_DIR): +def detect_installed_version(PACKAGE_DIR: Path = PACKAGE_DIR): """Autodetect the installed archivebox version by using pip package metadata, pyproject.toml file, or package.json file""" try: # if in production install, use pip-installed package metadata - return importlib.metadata.version('archivebox').strip() + return importlib.metadata.version("archivebox").strip() except importlib.metadata.PackageNotFoundError: pass try: # if in dev Git repo dir, use pyproject.toml file - pyproject_config = (PACKAGE_DIR.parent / 'pyproject.toml').read_text().split('\n') + pyproject_config = (PACKAGE_DIR.parent / "pyproject.toml").read_text().split("\n") for line in pyproject_config: - if line.startswith('version = '): - return line.split(' = ', 1)[-1].strip('"').strip() + if line.startswith("version = "): + return line.split(" = ", 1)[-1].strip('"').strip() except FileNotFoundError: # building docs, pyproject.toml is not available pass # raise Exception('Failed to detect installed archivebox version!') - return 'dev' + return "dev" @cache -def get_COMMIT_HASH() -> Optional[str]: +def get_COMMIT_HASH() -> str | None: try: - git_dir = PACKAGE_DIR.parent / '.git' - ref = (git_dir / 'HEAD').read_text().strip().split(' ')[-1] + git_dir = PACKAGE_DIR.parent / ".git" + ref = (git_dir / "HEAD").read_text().strip().split(" ")[-1] commit_hash = git_dir.joinpath(ref).read_text().strip() return commit_hash except Exception: pass try: - return list((PACKAGE_DIR.parent / '.git/refs/heads/').glob('*'))[0].read_text().strip() + return list((PACKAGE_DIR.parent / ".git/refs/heads/").glob("*"))[0].read_text().strip() except Exception: pass - + return None - + + @cache def get_BUILD_TIME() -> str: if IN_DOCKER: try: # if we're in the archivebox official docker image, /VERSION.txt will contain the build time - docker_build_end_time = Path('/VERSION.txt').read_text().rsplit('BUILD_END_TIME=')[-1].split('\n', 1)[0] + docker_build_end_time = Path("/VERSION.txt").read_text().rsplit("BUILD_END_TIME=")[-1].split("\n", 1)[0] return docker_build_end_time except Exception: pass - src_last_modified_unix_timestamp = (PACKAGE_DIR / 'README.md').stat().st_mtime - return datetime.fromtimestamp(src_last_modified_unix_timestamp).strftime('%Y-%m-%d %H:%M:%S %s') + src_last_modified_unix_timestamp = (PACKAGE_DIR / "README.md").stat().st_mtime + return datetime.fromtimestamp(src_last_modified_unix_timestamp).strftime("%Y-%m-%d %H:%M:%S %s") # def get_versions_available_on_github(config): @@ -78,14 +78,14 @@ def get_BUILD_TIME() -> str: # returns a dictionary containing the ArchiveBox GitHub release info for # the recommended upgrade version and the currently installed version # """ - + # # we only want to perform the (relatively expensive) check for new versions # # when its most relevant, e.g. when the user runs a long-running command # subcommand_run_by_user = sys.argv[3] if len(sys.argv) > 3 else 'help' # long_running_commands = ('add', 'schedule', 'update', 'status', 'server') # if subcommand_run_by_user not in long_running_commands: # return None - + # github_releases_api = "https://api.github.com/repos/ArchiveBox/ArchiveBox/releases" # response = requests.get(github_releases_api) # if response.status_code != 200: @@ -104,7 +104,7 @@ def get_BUILD_TIME() -> str: # break # current_version = current_version or all_releases[-1] - + # # recommended version is whatever comes after current_version in the release list # # (perhaps too conservative to only recommend upgrading one version at a time, but it's safest) # try: diff --git a/archivebox/config/views.py b/archivebox/config/views.py index df7a83d6..a6f821c8 100644 --- a/archivebox/config/views.py +++ b/archivebox/config/views.py @@ -1,4 +1,4 @@ -__package__ = 'archivebox.config' +__package__ = "archivebox.config" import html import json @@ -6,7 +6,8 @@ import os import inspect import re from pathlib import Path -from typing import Any, Callable, Dict +from typing import Any +from collections.abc import Callable from urllib.parse import quote, urlencode from django.http import HttpRequest from django.utils import timezone @@ -21,30 +22,48 @@ from archivebox.misc.util import parse_date from archivebox.machine.models import Binary -ABX_PLUGINS_DOCS_BASE_URL = 'https://archivebox.github.io/abx-plugins/' -ABX_PLUGINS_GITHUB_BASE_URL = 'https://github.com/ArchiveBox/abx-plugins/tree/main/abx_plugins/plugins/' -LIVE_CONFIG_BASE_URL = '/admin/environment/config/' -ENVIRONMENT_BINARIES_BASE_URL = '/admin/environment/binaries/' -INSTALLED_BINARIES_BASE_URL = '/admin/machine/binary/' +ABX_PLUGINS_DOCS_BASE_URL = "https://archivebox.github.io/abx-plugins/" +ABX_PLUGINS_GITHUB_BASE_URL = "https://github.com/ArchiveBox/abx-plugins/tree/main/abx_plugins/plugins/" +LIVE_CONFIG_BASE_URL = "/admin/environment/config/" +ENVIRONMENT_BINARIES_BASE_URL = "/admin/environment/binaries/" +INSTALLED_BINARIES_BASE_URL = "/admin/machine/binary/" # Common binaries to check for KNOWN_BINARIES = [ - 'wget', 'curl', 'chromium', 'chrome', 'google-chrome', 'google-chrome-stable', - 'node', 'npm', 'npx', 'yt-dlp', - 'git', 'singlefile', 'readability-extractor', 'mercury-parser', - 'python3', 'python', 'bash', 'zsh', - 'ffmpeg', 'ripgrep', 'rg', 'sonic', 'archivebox', + "wget", + "curl", + "chromium", + "chrome", + "google-chrome", + "google-chrome-stable", + "node", + "npm", + "npx", + "yt-dlp", + "git", + "singlefile", + "readability-extractor", + "mercury-parser", + "python3", + "python", + "bash", + "zsh", + "ffmpeg", + "ripgrep", + "rg", + "sonic", + "archivebox", ] CANONICAL_BINARY_ALIASES = { - 'youtube-dl': 'yt-dlp', - 'ytdlp': 'yt-dlp', + "youtube-dl": "yt-dlp", + "ytdlp": "yt-dlp", } def is_superuser(request: HttpRequest) -> bool: - return bool(getattr(request.user, 'is_superuser', False)) + return bool(getattr(request.user, "is_superuser", False)) def format_parsed_datetime(value: object) -> str: @@ -55,9 +74,9 @@ def format_parsed_datetime(value: object) -> str: JSON_TOKEN_RE = re.compile( r'(?P"(?:\\u[a-fA-F0-9]{4}|\\[^u]|[^\\"])*")(?=\s*:)' r'|(?P"(?:\\u[a-fA-F0-9]{4}|\\[^u]|[^\\"])*")' - r'|(?P\btrue\b|\bfalse\b)' - r'|(?P\bnull\b)' - r'|(?P-?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?)' + r"|(?P\btrue\b|\bfalse\b)" + r"|(?P\bnull\b)" + r"|(?P-?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?)", ) @@ -65,13 +84,14 @@ def render_code_block(text: str, *, highlighted: bool = False) -> str: code = html.escape(text, quote=False) if highlighted: + def _wrap_token(match: re.Match[str]) -> str: styles = { - 'key': 'color: #0550ae;', - 'string': 'color: #0a7f45;', - 'boolean': 'color: #8250df; font-weight: 600;', - 'null': 'color: #6e7781; font-style: italic;', - 'number': 'color: #b35900;', + "key": "color: #0550ae;", + "string": "color: #0a7f45;", + "boolean": "color: #8250df; font-weight: 600;", + "null": "color: #6e7781; font-style: italic;", + "number": "color: #b35900;", } token_type = next(name for name, value in match.groupdict().items() if value is not None) return f'{match.group(0)}' @@ -82,9 +102,9 @@ def render_code_block(text: str, *, highlighted: bool = False) -> str: '
'
         ''
-        f'{code}'
-        '
' + "'Liberation Mono', monospace; white-space: pre; line-height: 1.5;\">" + f"{code}" + "" ) @@ -93,34 +113,35 @@ def render_highlighted_json_block(value: Any) -> str: def get_plugin_docs_url(plugin_name: str) -> str: - return f'{ABX_PLUGINS_DOCS_BASE_URL}#{plugin_name}' + return f"{ABX_PLUGINS_DOCS_BASE_URL}#{plugin_name}" def get_plugin_hook_source_url(plugin_name: str, hook_name: str) -> str: - return f'{ABX_PLUGINS_GITHUB_BASE_URL}{quote(plugin_name)}/{quote(hook_name)}' + return f"{ABX_PLUGINS_GITHUB_BASE_URL}{quote(plugin_name)}/{quote(hook_name)}" def get_live_config_url(key: str) -> str: - return f'{LIVE_CONFIG_BASE_URL}{quote(key)}/' + return f"{LIVE_CONFIG_BASE_URL}{quote(key)}/" def get_environment_binary_url(name: str) -> str: - return f'{ENVIRONMENT_BINARIES_BASE_URL}{quote(name)}/' + return f"{ENVIRONMENT_BINARIES_BASE_URL}{quote(name)}/" def get_installed_binary_change_url(name: str, binary: Any) -> str | None: - binary_id = getattr(binary, 'id', None) + binary_id = getattr(binary, "id", None) if not binary_id: return None - base_url = getattr(binary, 'admin_change_url', None) or f'{INSTALLED_BINARIES_BASE_URL}{binary_id}/change/' - changelist_filters = urlencode({'q': canonical_binary_name(name)}) - return f'{base_url}?{urlencode({"_changelist_filters": changelist_filters})}' + base_url = getattr(binary, "admin_change_url", None) or f"{INSTALLED_BINARIES_BASE_URL}{binary_id}/change/" + changelist_filters = urlencode({"q": canonical_binary_name(name)}) + return f"{base_url}?{urlencode({'_changelist_filters': changelist_filters})}" def get_machine_admin_url() -> str | None: try: from archivebox.machine.models import Machine + return Machine.current().admin_change_url except Exception: return None @@ -130,12 +151,14 @@ def render_code_tag_list(values: list[str]) -> str: if not values: return '(none)' - tags = ''.join( - str(format_html( - '{}', - value, - )) + tags = "".join( + str( + format_html( + '{}', + value, + ), + ) for value in values ) return f'
{tags}
' @@ -143,22 +166,21 @@ def render_code_tag_list(values: list[str]) -> str: def render_plugin_metadata_html(config: dict[str, Any]) -> str: rows = ( - ('Title', config.get('title') or '(none)'), - ('Description', config.get('description') or '(none)'), - ('Required Plugins', mark_safe(render_link_tag_list(config.get('required_plugins') or [], get_plugin_docs_url))), - ('Required Binaries', mark_safe(render_link_tag_list(config.get('required_binaries') or [], get_environment_binary_url))), - ('Output MIME Types', mark_safe(render_code_tag_list(config.get('output_mimetypes') or []))), + ("Title", config.get("title") or "(none)"), + ("Description", config.get("description") or "(none)"), + ("Required Plugins", mark_safe(render_link_tag_list(config.get("required_plugins") or [], get_plugin_docs_url))), + ("Required Binaries", mark_safe(render_link_tag_list(config.get("required_binaries") or [], get_environment_binary_url))), + ("Output MIME Types", mark_safe(render_code_tag_list(config.get("output_mimetypes") or []))), ) - rendered_rows = ''.join( - str(format_html( - '
' - '
{}
' - '
{}
' - '
', - label, - value, - )) + rendered_rows = "".join( + str( + format_html( + '
{}
{}
', + label, + value, + ), + ) for label, value in rows ) return f'
{rendered_rows}
' @@ -171,20 +193,28 @@ def render_link_tag_list(values: list[str], url_resolver: Callable[[str], str] | tags = [] for value in values: if url_resolver is None: - tags.append(str(format_html( - '{}', - value, - ))) + tags.append( + str( + format_html( + '{}', + value, + ), + ), + ) else: - tags.append(str(format_html( - '' - '{}' - '', - url_resolver(value), - value, - ))) + tags.append( + str( + format_html( + '' + '{}' + "", + url_resolver(value), + value, + ), + ), + ) return f'
{"".join(tags)}
' @@ -195,21 +225,21 @@ def render_property_links(prop_name: str, prop_info: dict[str, Any], machine_adm if machine_admin_url: links.append(str(format_html('Edit override', machine_admin_url))) - fallback = prop_info.get('x-fallback') + fallback = prop_info.get("x-fallback") if isinstance(fallback, str) and fallback: links.append(str(format_html('Fallback: {}', get_live_config_url(fallback), fallback))) - aliases = prop_info.get('x-aliases') or [] + aliases = prop_info.get("x-aliases") or [] if isinstance(aliases, list): for alias in aliases: if isinstance(alias, str) and alias: links.append(str(format_html('Alias: {}', get_live_config_url(alias), alias))) - default = prop_info.get('default') - if prop_name.endswith('_BINARY') and isinstance(default, str) and default: + default = prop_info.get("default") + if prop_name.endswith("_BINARY") and isinstance(default, str) and default: links.append(str(format_html('Binary: {}', get_environment_binary_url(default), default))) - return '   '.join(links) + return "   ".join(links) def render_config_properties_html(properties: dict[str, Any], machine_admin_url: str | None) -> str: @@ -221,42 +251,48 @@ def render_config_properties_html(properties: dict[str, Any], machine_admin_url: header_links.insert(0, str(format_html('Machine Config Editor', machine_admin_url))) cards = [ - f'
{"   |   ".join(header_links)}
' + f'
{"   |   ".join(header_links)}
', ] for prop_name, prop_info in properties.items(): - prop_type = prop_info.get('type', 'unknown') + prop_type = prop_info.get("type", "unknown") if isinstance(prop_type, list): - prop_type = ' | '.join(str(type_name) for type_name in prop_type) - prop_desc = prop_info.get('description', '') + prop_type = " | ".join(str(type_name) for type_name in prop_type) + prop_desc = prop_info.get("description", "") - default_html = '' - if 'default' in prop_info: - default_html = str(format_html( - '
Default: {}
', - prop_info['default'], - )) + default_html = "" + if "default" in prop_info: + default_html = str( + format_html( + '
Default: {}
', + prop_info["default"], + ), + ) description_html = prop_desc or mark_safe('(no description)') - cards.append(str(format_html( - '
' - '
' - '{}' - ' ({})' - '
' - '
{}
' - '
{}
' - '{}' - '
', - get_live_config_url(prop_name), - prop_name, - prop_type, - description_html, - mark_safe(render_property_links(prop_name, prop_info, machine_admin_url)), - mark_safe(default_html), - ))) + cards.append( + str( + format_html( + '
' + '
' + '{}' + ' ({})' + "
" + '
{}
' + '
{}
' + "{}" + "
", + get_live_config_url(prop_name), + prop_name, + prop_type, + description_html, + mark_safe(render_property_links(prop_name, prop_info, machine_admin_url)), + mark_safe(default_html), + ), + ), + ) - return ''.join(cards) + return "".join(cards) def render_hook_links_html(plugin_name: str, hooks: list[str], source: str) -> str: @@ -265,40 +301,47 @@ def render_hook_links_html(plugin_name: str, hooks: list[str], source: str) -> s items = [] for hook_name in hooks: - if source == 'builtin': - items.append(str(format_html( - '
' - '{}' - '
', - get_plugin_hook_source_url(plugin_name, hook_name), - hook_name, - ))) + if source == "builtin": + items.append( + str( + format_html( + '', + get_plugin_hook_source_url(plugin_name, hook_name), + hook_name, + ), + ), + ) else: - items.append(str(format_html( - '
{}
', - hook_name, - ))) - return ''.join(items) + items.append( + str( + format_html( + '
{}
', + hook_name, + ), + ), + ) + return "".join(items) def render_binary_detail_description(name: str, merged: dict[str, Any], db_binary: Any) -> str: installed_binary_url = get_installed_binary_change_url(name, db_binary) if installed_binary_url: - return str(format_html( - '{}
' - 'View Installed Binary Record', - merged['abspath'], - installed_binary_url, - )) + return str( + format_html( + '{}
View Installed Binary Record', + merged["abspath"], + installed_binary_url, + ), + ) - return str(format_html('{}', merged['abspath'])) + return str(format_html("{}", merged["abspath"])) def obj_to_yaml(obj: Any, indent: int = 0) -> str: indent_str = " " * indent if indent == 0: - indent_str = '\n' # put extra newline between top-level entries + indent_str = "\n" # put extra newline between top-level entries if isinstance(obj, dict): if not obj: @@ -326,11 +369,11 @@ def obj_to_yaml(obj: Any, indent: int = 0) -> str: return f" {str(obj)}" elif callable(obj): - source = '\n'.join( - '' if 'def ' in line else line - for line in inspect.getsource(obj).split('\n') - if line.strip() - ).split('lambda: ')[-1].rstrip(',') + source = ( + "\n".join("" if "def " in line else line for line in inspect.getsource(obj).split("\n") if line.strip()) + .split("lambda: ")[-1] + .rstrip(",") + ) return f" {indent_str} " + source.replace("\n", f"\n{indent_str} ") else: @@ -350,67 +393,64 @@ def _binary_sort_key(binary: Binary) -> tuple[int, int, int, Any]: ) -def get_db_binaries_by_name() -> Dict[str, Binary]: - grouped: Dict[str, list[Binary]] = {} +def get_db_binaries_by_name() -> dict[str, Binary]: + grouped: dict[str, list[Binary]] = {} for binary in Binary.objects.all(): grouped.setdefault(canonical_binary_name(binary.name), []).append(binary) - return { - name: max(records, key=_binary_sort_key) - for name, records in grouped.items() - } + return {name: max(records, key=_binary_sort_key) for name, records in grouped.items()} -def serialize_binary_record(name: str, binary: Binary | None) -> Dict[str, Any]: +def serialize_binary_record(name: str, binary: Binary | None) -> dict[str, Any]: is_installed = bool(binary and binary.status == Binary.StatusChoices.INSTALLED) return { - 'name': canonical_binary_name(name), - 'version': str(getattr(binary, 'version', '') or ''), - 'binprovider': str(getattr(binary, 'binprovider', '') or ''), - 'abspath': str(getattr(binary, 'abspath', '') or ''), - 'sha256': str(getattr(binary, 'sha256', '') or ''), - 'status': str(getattr(binary, 'status', '') or ''), - 'is_available': is_installed and bool(getattr(binary, 'abspath', '') or ''), + "name": canonical_binary_name(name), + "version": str(getattr(binary, "version", "") or ""), + "binprovider": str(getattr(binary, "binprovider", "") or ""), + "abspath": str(getattr(binary, "abspath", "") or ""), + "sha256": str(getattr(binary, "sha256", "") or ""), + "status": str(getattr(binary, "status", "") or ""), + "is_available": is_installed and bool(getattr(binary, "abspath", "") or ""), } -def get_filesystem_plugins() -> Dict[str, Dict[str, Any]]: +def get_filesystem_plugins() -> dict[str, dict[str, Any]]: """Discover plugins from filesystem directories.""" import json from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR plugins = {} - for base_dir, source in [(BUILTIN_PLUGINS_DIR, 'builtin'), (USER_PLUGINS_DIR, 'user')]: + for base_dir, source in [(BUILTIN_PLUGINS_DIR, "builtin"), (USER_PLUGINS_DIR, "user")]: if not base_dir.exists(): continue for plugin_dir in base_dir.iterdir(): - if plugin_dir.is_dir() and not plugin_dir.name.startswith('_'): - plugin_id = f'{source}.{plugin_dir.name}' + if plugin_dir.is_dir() and not plugin_dir.name.startswith("_"): + plugin_id = f"{source}.{plugin_dir.name}" # Find hook scripts hooks = [] - for ext in ('sh', 'py', 'js'): - hooks.extend(plugin_dir.glob(f'on_*__*.{ext}')) + for ext in ("sh", "py", "js"): + hooks.extend(plugin_dir.glob(f"on_*__*.{ext}")) # Load config.json if it exists - config_file = plugin_dir / 'config.json' + config_file = plugin_dir / "config.json" config_data = None if config_file.exists(): try: - with open(config_file, 'r') as f: + with open(config_file) as f: config_data = json.load(f) - except (json.JSONDecodeError, IOError): + except (json.JSONDecodeError, OSError): config_data = None plugins[plugin_id] = { - 'id': plugin_id, - 'name': plugin_dir.name, - 'path': str(plugin_dir), - 'source': source, - 'hooks': [str(h.name) for h in hooks], - 'config': config_data, + "id": plugin_id, + "name": plugin_dir.name, + "path": str(plugin_dir), + "source": source, + "hooks": [str(h.name) for h in hooks], + "config": config_data, } return plugins @@ -418,7 +458,7 @@ def get_filesystem_plugins() -> Dict[str, Dict[str, Any]]: @render_with_table_view def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext: - assert is_superuser(request), 'Must be a superuser to view configuration settings.' + assert is_superuser(request), "Must be a superuser to view configuration settings." rows = { "Binary Name": [], @@ -433,16 +473,16 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext: for name in all_binary_names: merged = serialize_binary_record(name, db_binaries.get(name)) - rows['Binary Name'].append(ItemLink(name, key=name)) + rows["Binary Name"].append(ItemLink(name, key=name)) - if merged['is_available']: - rows['Found Version'].append(f"✅ {merged['version']}" if merged['version'] else '✅ found') - rows['Provided By'].append(merged['binprovider'] or '-') - rows['Found Abspath'].append(merged['abspath'] or '-') + if merged["is_available"]: + rows["Found Version"].append(f"✅ {merged['version']}" if merged["version"] else "✅ found") + rows["Provided By"].append(merged["binprovider"] or "-") + rows["Found Abspath"].append(merged["abspath"] or "-") else: - rows['Found Version'].append('❌ missing') - rows['Provided By'].append('-') - rows['Found Abspath'].append('-') + rows["Found Version"].append("❌ missing") + rows["Provided By"].append("-") + rows["Found Abspath"].append("-") return TableContext( title="Binaries", @@ -452,23 +492,23 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext: @render_with_item_view def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext: - assert is_superuser(request), 'Must be a superuser to view configuration settings.' + assert is_superuser(request), "Must be a superuser to view configuration settings." key = canonical_binary_name(key) db_binary = get_db_binaries_by_name().get(key) merged = serialize_binary_record(key, db_binary) - if merged['is_available']: + if merged["is_available"]: section: SectionData = { "name": key, "description": mark_safe(render_binary_detail_description(key, merged, db_binary)), "fields": { - 'name': key, - 'binprovider': merged['binprovider'] or '-', - 'abspath': merged['abspath'] or 'not found', - 'version': merged['version'] or 'unknown', - 'sha256': merged['sha256'], - 'status': merged['status'], + "name": key, + "binprovider": merged["binprovider"] or "-", + "abspath": merged["abspath"] or "not found", + "version": merged["version"] or "unknown", + "sha256": merged["sha256"], + "status": merged["status"], }, "help_texts": {}, } @@ -482,11 +522,11 @@ def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext: "name": key, "description": "No persisted Binary record found", "fields": { - 'name': key, - 'binprovider': merged['binprovider'] or 'not recorded', - 'abspath': merged['abspath'] or 'not recorded', - 'version': merged['version'] or 'N/A', - 'status': merged['status'] or 'unrecorded', + "name": key, + "binprovider": merged["binprovider"] or "not recorded", + "abspath": merged["abspath"] or "not recorded", + "version": merged["version"] or "N/A", + "status": merged["status"] or "unrecorded", }, "help_texts": {}, } @@ -499,7 +539,7 @@ def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext: @render_with_table_view def plugins_list_view(request: HttpRequest, **kwargs) -> TableContext: - assert is_superuser(request), 'Must be a superuser to view configuration settings.' + assert is_superuser(request), "Must be a superuser to view configuration settings." rows = { "Name": [], @@ -512,26 +552,26 @@ def plugins_list_view(request: HttpRequest, **kwargs) -> TableContext: plugins = get_filesystem_plugins() for plugin_id, plugin in plugins.items(): - rows['Name'].append(ItemLink(plugin['name'], key=plugin_id)) - rows['Source'].append(plugin['source']) - rows['Path'].append(format_html('{}', plugin['path'])) - rows['Hooks'].append(', '.join(plugin['hooks']) or '(none)') + rows["Name"].append(ItemLink(plugin["name"], key=plugin_id)) + rows["Source"].append(plugin["source"]) + rows["Path"].append(format_html("{}", plugin["path"])) + rows["Hooks"].append(", ".join(plugin["hooks"]) or "(none)") # Show config status - if plugin.get('config'): - config_properties = plugin['config'].get('properties', {}) + if plugin.get("config"): + config_properties = plugin["config"].get("properties", {}) config_count = len(config_properties) - rows['Config'].append(f'✅ {config_count} properties' if config_count > 0 else '✅ present') + rows["Config"].append(f"✅ {config_count} properties" if config_count > 0 else "✅ present") else: - rows['Config'].append('❌ none') + rows["Config"].append("❌ none") if not plugins: # Show a helpful message when no plugins found - rows['Name'].append('(no plugins found)') - rows['Source'].append('-') - rows['Path'].append(mark_safe('abx_plugins/plugins/ or data/custom_plugins/')) - rows['Hooks'].append('-') - rows['Config'].append('-') + rows["Name"].append("(no plugins found)") + rows["Source"].append("-") + rows["Path"].append(mark_safe("abx_plugins/plugins/ or data/custom_plugins/")) + rows["Hooks"].append("-") + rows["Config"].append("-") return TableContext( title="Installed plugins", @@ -541,7 +581,7 @@ def plugins_list_view(request: HttpRequest, **kwargs) -> TableContext: @render_with_item_view def plugin_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext: - assert is_superuser(request), 'Must be a superuser to view configuration settings.' + assert is_superuser(request), "Must be a superuser to view configuration settings." plugins = get_filesystem_plugins() @@ -549,65 +589,75 @@ def plugin_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext: if not plugin: return ItemContext( slug=key, - title=f'Plugin not found: {key}', + title=f"Plugin not found: {key}", data=[], ) # Base fields that all plugins have - docs_url = get_plugin_docs_url(plugin['name']) + docs_url = get_plugin_docs_url(plugin["name"]) machine_admin_url = get_machine_admin_url() fields = { - "id": plugin['id'], - "name": plugin['name'], - "source": plugin['source'], + "id": plugin["id"], + "name": plugin["name"], + "source": plugin["source"], } - sections: list[SectionData] = [{ - "name": plugin['name'], - "description": format_html( - '{}
ABX Plugin Docs', - plugin['path'], - docs_url, - ), - "fields": fields, - "help_texts": {}, - }] - - if plugin['hooks']: - sections.append({ - "name": "Hooks", - "description": mark_safe(render_hook_links_html(plugin['name'], plugin['hooks'], plugin['source'])), - "fields": {}, + sections: list[SectionData] = [ + { + "name": plugin["name"], + "description": format_html( + '{}
ABX Plugin Docs', + plugin["path"], + docs_url, + ), + "fields": fields, "help_texts": {}, - }) + }, + ] - if plugin.get('config'): - sections.append({ - "name": "Plugin Metadata", - "description": mark_safe(render_plugin_metadata_html(plugin['config'])), - "fields": {}, - "help_texts": {}, - }) - - sections.append({ - "name": "config.json", - "description": mark_safe(render_highlighted_json_block(plugin['config'])), - "fields": {}, - "help_texts": {}, - }) - - config_properties = plugin['config'].get('properties', {}) - if config_properties: - sections.append({ - "name": "Config Properties", - "description": mark_safe(render_config_properties_html(config_properties, machine_admin_url)), + if plugin["hooks"]: + sections.append( + { + "name": "Hooks", + "description": mark_safe(render_hook_links_html(plugin["name"], plugin["hooks"], plugin["source"])), "fields": {}, "help_texts": {}, - }) + }, + ) + + if plugin.get("config"): + sections.append( + { + "name": "Plugin Metadata", + "description": mark_safe(render_plugin_metadata_html(plugin["config"])), + "fields": {}, + "help_texts": {}, + }, + ) + + sections.append( + { + "name": "config.json", + "description": mark_safe(render_highlighted_json_block(plugin["config"])), + "fields": {}, + "help_texts": {}, + }, + ) + + config_properties = plugin["config"].get("properties", {}) + if config_properties: + sections.append( + { + "name": "Config Properties", + "description": mark_safe(render_config_properties_html(config_properties, machine_admin_url)), + "fields": {}, + "help_texts": {}, + }, + ) return ItemContext( slug=key, - title=plugin['name'], + title=plugin["name"], data=sections, ) @@ -648,20 +698,20 @@ def worker_list_view(request: HttpRequest, **kwargs) -> TableContext: all_config[config_name] = config_data # Add top row for supervisord process manager - rows["Name"].append(ItemLink('supervisord', key='supervisord')) + rows["Name"].append(ItemLink("supervisord", key="supervisord")) supervisor_state = supervisor.getState() - rows["State"].append(str(supervisor_state.get('statename') if isinstance(supervisor_state, dict) else '')) - rows['PID'].append(str(supervisor.getPID())) - rows["Started"].append('-') - rows["Command"].append('supervisord --configuration=tmp/supervisord.conf') + rows["State"].append(str(supervisor_state.get("statename") if isinstance(supervisor_state, dict) else "")) + rows["PID"].append(str(supervisor.getPID())) + rows["Started"].append("-") + rows["Command"].append("supervisord --configuration=tmp/supervisord.conf") rows["Logfile"].append( format_html( '{}', - 'supervisord', - 'logs/supervisord.log', - ) + "supervisord", + "logs/supervisord.log", + ), ) - rows['Exit Status'].append('0') + rows["Exit Status"].append("0") # Add a row for each worker process managed by supervisord process_items = supervisor.getAllProcessInfo() @@ -678,15 +728,15 @@ def worker_list_view(request: HttpRequest, **kwargs) -> TableContext: rows["Name"].append(ItemLink(proc_name, key=proc_name)) rows["State"].append(str(proc_data.get("statename") or "")) - rows['PID'].append(proc_description.replace('pid ', '')) + rows["PID"].append(proc_description.replace("pid ", "")) rows["Started"].append(format_parsed_datetime(proc_start)) rows["Command"].append(str(proc_config.get("command") or "")) rows["Logfile"].append( format_html( '{}', - proc_logfile.split("/")[-1].split('.')[0], + proc_logfile.split("/")[-1].split(".")[0], proc_logfile, - ) + ), ) rows["Exit Status"].append(str(proc_data.get("exitstatus") or "")) @@ -708,8 +758,8 @@ def worker_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext: supervisor = get_existing_supervisord_process() if supervisor is None: return ItemContext( - slug='none', - title='error: No running supervisord process.', + slug="none", + title="error: No running supervisord process.", data=[], ) @@ -721,7 +771,7 @@ def worker_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext: if isinstance(config_data, dict): all_config.append(config_data) - if key == 'supervisord': + if key == "supervisord": relevant_config = CONFIG_FILE.read_text() relevant_logs = str(supervisor.readLog(0, 10_000_000)) start_ts = [line for line in relevant_logs.split("\n") if "RPC interface 'supervisor' initialized" in line][-1].split(",", 1)[0] @@ -729,7 +779,7 @@ def worker_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext: uptime = str(timezone.now() - start_dt).split(".")[0] if start_dt else "" supervisor_state = supervisor.getState() - proc: Dict[str, object] = { + proc: dict[str, object] = { "name": "supervisord", "pid": supervisor.getPID(), "statename": str(supervisor_state.get("statename") if isinstance(supervisor_state, dict) else ""), @@ -737,12 +787,12 @@ def worker_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext: "stop": None, "exitstatus": "", "stdout_logfile": "logs/supervisord.log", - "description": f'pid 000, uptime {uptime}', + "description": f"pid 000, uptime {uptime}", } else: worker_data = get_worker(supervisor, key) proc = worker_data if isinstance(worker_data, dict) else {} - relevant_config = next((config for config in all_config if config.get('name') == key), {}) + relevant_config = next((config for config in all_config if config.get("name") == key), {}) log_result = supervisor.tailProcessStdoutLog(key, 0, 10_000_000) relevant_logs = str(log_result[0] if isinstance(log_result, tuple) else log_result) @@ -775,7 +825,6 @@ def worker_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext: def log_list_view(request: HttpRequest, **kwargs) -> TableContext: assert is_superuser(request), "Must be a superuser to view configuration settings." - log_files: list[Path] = [] for logfile in sorted(CONSTANTS.LOGS_DIR.glob("*.log"), key=os.path.getmtime)[::-1]: if isinstance(logfile, Path): @@ -793,14 +842,14 @@ def log_list_view(request: HttpRequest, **kwargs) -> TableContext: st = logfile.stat() rows["Name"].append(ItemLink("logs" + str(logfile).rsplit("/logs", 1)[-1], key=logfile.name)) rows["Last Updated"].append(format_parsed_datetime(st.st_mtime)) - rows["Size"].append(f'{st.st_size//1000} kb') + rows["Size"].append(f"{st.st_size // 1000} kb") - with open(logfile, 'rb') as f: + with open(logfile, "rb") as f: try: f.seek(-1024, os.SEEK_END) except OSError: f.seek(0) - last_lines = f.read().decode('utf-8', errors='replace').split("\n") + last_lines = f.read().decode("utf-8", errors="replace").split("\n") non_empty_lines = [line for line in last_lines if line.strip()] rows["Most Recent Lines"].append(non_empty_lines[-1]) @@ -814,7 +863,7 @@ def log_list_view(request: HttpRequest, **kwargs) -> TableContext: def log_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext: assert is_superuser(request), "Must be a superuser to view configuration settings." - log_file = [logfile for logfile in CONSTANTS.LOGS_DIR.glob('*.log') if key in logfile.name][0] + log_file = [logfile for logfile in CONSTANTS.LOGS_DIR.glob("*.log") if key in logfile.name][0] log_text = log_file.read_text() log_stat = log_file.stat() @@ -824,7 +873,7 @@ def log_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext: "description": key, "fields": { "Path": str(log_file), - "Size": f"{log_stat.st_size//1000} kb", + "Size": f"{log_stat.st_size // 1000} kb", "Last Updated": format_parsed_datetime(log_stat.st_mtime), "Tail": "\n".join(log_text[-10_000:].split("\n")[-20:]), "Full Log": log_text, diff --git a/archivebox/core/__init__.py b/archivebox/core/__init__.py index 3501e3b0..f50f21bf 100644 --- a/archivebox/core/__init__.py +++ b/archivebox/core/__init__.py @@ -1,10 +1,11 @@ -__package__ = 'archivebox.core' +__package__ = "archivebox.core" __order__ = 100 def register_admin(admin_site): """Register the core.models views (Snapshot, ArchiveResult, Tag, etc.) with the admin site""" from archivebox.core.admin import register_admin as do_register + do_register(admin_site) @@ -17,11 +18,12 @@ def get_CONFIG(): ARCHIVING_CONFIG, SEARCH_BACKEND_CONFIG, ) + return { - 'SHELL_CONFIG': SHELL_CONFIG, - 'STORAGE_CONFIG': STORAGE_CONFIG, - 'GENERAL_CONFIG': GENERAL_CONFIG, - 'SERVER_CONFIG': SERVER_CONFIG, - 'ARCHIVING_CONFIG': ARCHIVING_CONFIG, - 'SEARCHBACKEND_CONFIG': SEARCH_BACKEND_CONFIG, + "SHELL_CONFIG": SHELL_CONFIG, + "STORAGE_CONFIG": STORAGE_CONFIG, + "GENERAL_CONFIG": GENERAL_CONFIG, + "SERVER_CONFIG": SERVER_CONFIG, + "ARCHIVING_CONFIG": ARCHIVING_CONFIG, + "SEARCHBACKEND_CONFIG": SEARCH_BACKEND_CONFIG, } diff --git a/archivebox/core/admin.py b/archivebox/core/admin.py index 24f5e5c6..9c954183 100644 --- a/archivebox/core/admin.py +++ b/archivebox/core/admin.py @@ -1,4 +1,4 @@ -__package__ = 'archivebox.core' +__package__ = "archivebox.core" from django.contrib.auth import get_user_model diff --git a/archivebox/core/admin_archiveresults.py b/archivebox/core/admin_archiveresults.py index 6f5f3765..8c4bc602 100644 --- a/archivebox/core/admin_archiveresults.py +++ b/archivebox/core/admin_archiveresults.py @@ -1,4 +1,4 @@ -__package__ = 'archivebox.core' +__package__ = "archivebox.core" import html import json @@ -21,57 +21,45 @@ from django.utils.text import smart_split from archivebox.config import DATA_DIR from archivebox.config.common import SERVER_CONFIG -from archivebox.misc.paginators import AccelleratedPaginator +from archivebox.misc.paginators import AcceleratedPaginator from archivebox.base_models.admin import BaseModelAdmin from archivebox.hooks import get_plugin_icon from archivebox.core.host_utils import build_snapshot_url from archivebox.core.widgets import InlineTagEditorWidget from archivebox.core.views import LIVE_PLUGIN_BASE_URL +from archivebox.machine.env_utils import env_to_shell_exports from archivebox.core.models import ArchiveResult, Snapshot -def _stringify_env_value(value) -> str: - if value is None: - return '' - if isinstance(value, str): - return value - return json.dumps(value, separators=(',', ':')) - - def _quote_shell_string(value: str) -> str: return "'" + str(value).replace("'", "'\"'\"'") + "'" def _get_replay_source_url(result: ArchiveResult) -> str: - process_env = getattr(getattr(result, 'process', None), 'env', None) or {} - return str(process_env.get('SOURCE_URL') or result.snapshot.url or '') + process_env = getattr(getattr(result, "process", None), "env", None) or {} + return str(process_env.get("SOURCE_URL") or result.snapshot.url or "") def build_abx_dl_display_command(result: ArchiveResult) -> str: source_url = _get_replay_source_url(result) - plugin_name = str(result.plugin or '').strip() + plugin_name = str(result.plugin or "").strip() if not plugin_name and not source_url: - return 'abx-dl' + return "abx-dl" if not source_url: - return f'abx-dl --plugins={plugin_name}' - return f'abx-dl --plugins={plugin_name} {_quote_shell_string(source_url)}' + return f"abx-dl --plugins={plugin_name}" + return f"abx-dl --plugins={plugin_name} {_quote_shell_string(source_url)}" def build_abx_dl_replay_command(result: ArchiveResult) -> str: display_command = build_abx_dl_display_command(result) - process = getattr(result, 'process', None) - env = getattr(process, 'env', None) or {} - env_items = ' '.join( - f'{key}={shlex.quote(_stringify_env_value(value))}' - for key, value in sorted(env.items()) - if value is not None - ) + process = getattr(result, "process", None) + env_items = env_to_shell_exports(getattr(process, "env", None) or {}) snapshot_dir = shlex.quote(str(result.snapshot_dir)) if env_items: - return f'cd {snapshot_dir}; env {env_items} {display_command}' - return f'cd {snapshot_dir}; {display_command}' + return f"cd {snapshot_dir}; env {env_items} {display_command}" + return f"cd {snapshot_dir}; {display_command}" def get_plugin_admin_url(plugin_name: str) -> str: @@ -81,50 +69,87 @@ def get_plugin_admin_url(plugin_name: str) -> str: if plugin_dir: builtin_root = BUILTIN_PLUGINS_DIR.resolve() if plugin_dir.is_relative_to(builtin_root): - return f'{LIVE_PLUGIN_BASE_URL}builtin.{quote(plugin_name)}/' + return f"{LIVE_PLUGIN_BASE_URL}builtin.{quote(plugin_name)}/" user_root = USER_PLUGINS_DIR.resolve() if plugin_dir.is_relative_to(user_root): - return f'{LIVE_PLUGIN_BASE_URL}user.{quote(plugin_name)}/' + return f"{LIVE_PLUGIN_BASE_URL}user.{quote(plugin_name)}/" - return f'{LIVE_PLUGIN_BASE_URL}builtin.{quote(plugin_name)}/' + return f"{LIVE_PLUGIN_BASE_URL}builtin.{quote(plugin_name)}/" def render_archiveresults_list(archiveresults_qs, limit=50): """Render a nice inline list view of archive results with status, plugin, output, and actions.""" - results = list(archiveresults_qs.order_by('plugin').select_related('snapshot')[:limit]) + result_ids = list(archiveresults_qs.order_by("plugin").values_list("pk", flat=True)[:limit]) + if not result_ids: + return mark_safe('
No Archive Results yet...
') + + results_by_id = { + result.pk: result + for result in ArchiveResult.objects.filter(pk__in=result_ids).select_related("snapshot", "process", "process__machine") + } + results = [results_by_id[result_id] for result_id in result_ids if result_id in results_by_id] if not results: return mark_safe('
No Archive Results yet...
') # Status colors status_colors = { - 'succeeded': ('#166534', '#dcfce7'), # green - 'failed': ('#991b1b', '#fee2e2'), # red - 'queued': ('#6b7280', '#f3f4f6'), # gray - 'started': ('#92400e', '#fef3c7'), # amber - 'backoff': ('#92400e', '#fef3c7'), - 'skipped': ('#475569', '#f1f5f9'), - 'noresults': ('#475569', '#f1f5f9'), + "succeeded": ("#166534", "#dcfce7"), # green + "failed": ("#991b1b", "#fee2e2"), # red + "queued": ("#6b7280", "#f3f4f6"), # gray + "started": ("#92400e", "#fef3c7"), # amber + "backoff": ("#92400e", "#fef3c7"), + "skipped": ("#475569", "#f1f5f9"), + "noresults": ("#475569", "#f1f5f9"), } rows = [] for idx, result in enumerate(results): - status = result.status or 'queued' - color, bg = status_colors.get(status, ('#6b7280', '#f3f4f6')) + status = result.status or "queued" + color, bg = status_colors.get(status, ("#6b7280", "#f3f4f6")) + output_files = result.output_files or {} + if isinstance(output_files, dict): + output_file_count = len(output_files) + elif isinstance(output_files, (list, tuple, set)): + output_file_count = len(output_files) + elif isinstance(output_files, str): + try: + parsed = json.loads(output_files) + output_file_count = len(parsed) if isinstance(parsed, (dict, list, tuple, set)) else 0 + except Exception: + output_file_count = 0 + else: + output_file_count = 0 # Get plugin icon icon = get_plugin_icon(result.plugin) # Format timestamp - end_time = result.end_ts.strftime('%Y-%m-%d %H:%M:%S') if result.end_ts else '-' + end_time = result.end_ts.strftime("%Y-%m-%d %H:%M:%S") if result.end_ts else "-" + + process_display = "-" + if result.process_id and result.process: + process_display = f''' + {result.process.pid or "-"} + ''' + + machine_display = "-" + if result.process_id and result.process and result.process.machine_id: + machine_display = f''' + {result.process.machine.hostname} + ''' # Truncate output for display - full_output = result.output_str or '-' + full_output = result.output_str or "-" output_display = full_output[:60] if len(full_output) > 60: - output_display += '...' + output_display += "..." display_cmd = build_abx_dl_display_command(result) replay_cmd = build_abx_dl_replay_command(result) @@ -132,23 +157,23 @@ def render_archiveresults_list(archiveresults_qs, limit=50): cmd_attr = html.escape(replay_cmd, quote=True) # Build output link - use embed_path() which checks output_files first - embed_path = result.embed_path() if hasattr(result, 'embed_path') else None - snapshot_id = str(getattr(result, 'snapshot_id', '')) - if embed_path and result.status == 'succeeded': + embed_path = result.embed_path() if hasattr(result, "embed_path") else None + snapshot_id = str(getattr(result, "snapshot_id", "")) + if embed_path and result.status == "succeeded": output_link = build_snapshot_url(snapshot_id, embed_path) else: - output_link = build_snapshot_url(snapshot_id, '') + output_link = build_snapshot_url(snapshot_id, "") # Get version - try cmd_version field - version = result.cmd_version if result.cmd_version else '-' + version = result.cmd_version if result.cmd_version else "-" # Unique ID for this row's expandable output - row_id = f'output_{idx}_{str(result.id)[:8]}' + row_id = f"output_{idx}_{str(result.id)[:8]}" rows.append(f''' - {str(result.id)[-8:]} @@ -178,9 +203,18 @@ def render_archiveresults_list(archiveresults_qs, limit=50): {output_display} + + {output_file_count} + {end_time} + + {process_display} + + + {machine_display} + {version} @@ -189,14 +223,14 @@ def render_archiveresults_list(archiveresults_qs, limit=50): 📄 - ✏️ - +
Details & Output @@ -205,7 +239,7 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
ID: {str(result.id)} Version: {version} - PWD: {result.pwd or '-'} + PWD: {result.pwd or "-"}
Output: @@ -230,19 +264,19 @@ def render_archiveresults_list(archiveresults_qs, limit=50): ''') total_count = archiveresults_qs.count() - footer = '' + footer = "" if total_count > limit: - footer = f''' + footer = f""" - + Showing {limit} of {total_count} results   - View all → - ''' + """ - return mark_safe(f''' + return mark_safe(f"""
@@ -252,86 +286,92 @@ def render_archiveresults_list(archiveresults_qs, limit=50): + + + - {''.join(rows)} + {"".join(rows)} {footer}
Plugin OutputFiles CompletedProcessMachine Version Actions
- ''') - + """) class ArchiveResultInline(admin.TabularInline): - name = 'Archive Results Log' + name = "Archive Results Log" model = ArchiveResult parent_model = Snapshot # fk_name = 'snapshot' extra = 0 - sort_fields = ('end_ts', 'plugin', 'output_str', 'status', 'cmd_version') - readonly_fields = ('id', 'result_id', 'completed', 'command', 'version') - fields = ('start_ts', 'end_ts', *readonly_fields, 'plugin', 'cmd', 'cmd_version', 'pwd', 'status', 'output_str') + sort_fields = ("end_ts", "plugin", "output_str", "status", "cmd_version") + readonly_fields = ("id", "result_id", "completed", "command", "version") + fields = ("start_ts", "end_ts", *readonly_fields, "plugin", "cmd", "cmd_version", "pwd", "status", "output_str") # exclude = ('id',) - ordering = ('end_ts',) + ordering = ("end_ts",) show_change_link = True # # classes = ['collapse'] def get_parent_object_from_request(self, request): resolved = resolve(request.path_info) try: - return self.parent_model.objects.get(pk=resolved.kwargs['object_id']) + return self.parent_model.objects.get(pk=resolved.kwargs["object_id"]) except (self.parent_model.DoesNotExist, ValidationError): return None @admin.display( - description='Completed', - ordering='end_ts', + description="Completed", + ordering="end_ts", ) def completed(self, obj): - return format_html('

{}

', obj.end_ts.strftime('%Y-%m-%d %H:%M:%S')) + return format_html('

{}

', obj.end_ts.strftime("%Y-%m-%d %H:%M:%S")) def result_id(self, obj): - return format_html('[{}]', reverse('admin:core_archiveresult_change', args=(obj.id,)), str(obj.id)[:8]) - + return format_html( + '[{}]', + reverse("admin:core_archiveresult_change", args=(obj.id,)), + str(obj.id)[:8], + ) + def command(self, obj): - return format_html('{}', " ".join(obj.cmd or [])) - + return format_html("{}", " ".join(obj.cmd or [])) + def version(self, obj): - return format_html('{}', obj.cmd_version or '-') - + return format_html("{}", obj.cmd_version or "-") + def get_formset(self, request, obj=None, **kwargs): formset = super().get_formset(request, obj, **kwargs) snapshot = self.get_parent_object_from_request(request) - form_class = getattr(formset, 'form', None) - base_fields = getattr(form_class, 'base_fields', {}) - snapshot_output_dir = str(snapshot.output_dir) if snapshot else '' + form_class = getattr(formset, "form", None) + base_fields = getattr(form_class, "base_fields", {}) + snapshot_output_dir = str(snapshot.output_dir) if snapshot else "" # import ipdb; ipdb.set_trace() # formset.form.base_fields['id'].widget = formset.form.base_fields['id'].hidden_widget() - + # default values for new entries - base_fields['status'].initial = 'succeeded' - base_fields['start_ts'].initial = timezone.now() - base_fields['end_ts'].initial = timezone.now() - base_fields['cmd_version'].initial = '-' - base_fields['pwd'].initial = snapshot_output_dir - base_fields['cmd'].initial = '["-"]' - base_fields['output_str'].initial = 'Manually recorded cmd output...' + base_fields["status"].initial = "succeeded" + base_fields["start_ts"].initial = timezone.now() + base_fields["end_ts"].initial = timezone.now() + base_fields["cmd_version"].initial = "-" + base_fields["pwd"].initial = snapshot_output_dir + base_fields["cmd"].initial = '["-"]' + base_fields["output_str"].initial = "Manually recorded cmd output..." if obj is not None: # hidden values for existing entries and new entries - base_fields['start_ts'].widget = base_fields['start_ts'].hidden_widget() - base_fields['end_ts'].widget = base_fields['end_ts'].hidden_widget() - base_fields['cmd'].widget = base_fields['cmd'].hidden_widget() - base_fields['pwd'].widget = base_fields['pwd'].hidden_widget() - base_fields['cmd_version'].widget = base_fields['cmd_version'].hidden_widget() + base_fields["start_ts"].widget = base_fields["start_ts"].hidden_widget() + base_fields["end_ts"].widget = base_fields["end_ts"].hidden_widget() + base_fields["cmd"].widget = base_fields["cmd"].hidden_widget() + base_fields["pwd"].widget = base_fields["pwd"].hidden_widget() + base_fields["cmd_version"].widget = base_fields["cmd_version"].hidden_widget() return formset - + def get_readonly_fields(self, request, obj=None): if obj is not None: return self.readonly_fields @@ -339,62 +379,122 @@ class ArchiveResultInline(admin.TabularInline): return [] - class ArchiveResultAdmin(BaseModelAdmin): - list_display = ('details_link', 'created_at', 'snapshot_info', 'tags_inline', 'status_badge', 'plugin_with_icon', 'process_link', 'machine_link', 'cmd_str', 'output_str_display') + list_display = ( + "details_link", + "zip_link", + "created_at", + "snapshot_info", + "tags_inline", + "status_badge", + "plugin_with_icon", + "process_link", + "machine_link", + "cmd_str", + "output_str_display", + ) list_display_links = None - sort_fields = ('id', 'created_at', 'plugin', 'status') - readonly_fields = ('cmd', 'cmd_version', 'pwd', 'cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'plugin_with_icon', 'process_link') - search_fields = () - autocomplete_fields = ['snapshot'] + sort_fields = ("id", "created_at", "plugin", "status") + readonly_fields = ( + "admin_actions", + "cmd", + "cmd_version", + "pwd", + "cmd_str", + "snapshot_info", + "tags_str", + "created_at", + "modified_at", + "output_summary", + "plugin_with_icon", + "process_link", + ) + search_fields = ( + "snapshot__id", + "snapshot__url", + "snapshot__tags__name", + "snapshot__crawl_id", + "plugin", + "hook_name", + "output_str", + "output_json", + "process__cmd", + ) + autocomplete_fields = ["snapshot"] fieldsets = ( - ('Snapshot', { - 'fields': ('snapshot', 'snapshot_info', 'tags_str'), - 'classes': ('card', 'wide'), - }), - ('Plugin', { - 'fields': ('plugin_with_icon', 'process_link', 'status'), - 'classes': ('card',), - }), - ('Timing', { - 'fields': ('start_ts', 'end_ts', 'created_at', 'modified_at'), - 'classes': ('card',), - }), - ('Command', { - 'fields': ('cmd', 'cmd_str', 'cmd_version', 'pwd'), - 'classes': ('card',), - }), - ('Output', { - 'fields': ('output_str', 'output_json', 'output_files', 'output_size', 'output_mimetypes', 'output_summary'), - 'classes': ('card', 'wide'), - }), + ( + "Actions", + { + "fields": ("admin_actions",), + "classes": ("card", "wide"), + }, + ), + ( + "Snapshot", + { + "fields": ("snapshot", "snapshot_info", "tags_str"), + "classes": ("card", "wide"), + }, + ), + ( + "Plugin", + { + "fields": ("plugin_with_icon", "process_link", "status"), + "classes": ("card",), + }, + ), + ( + "Timing", + { + "fields": ("start_ts", "end_ts", "created_at", "modified_at"), + "classes": ("card",), + }, + ), + ( + "Command", + { + "fields": ("cmd", "cmd_str", "cmd_version", "pwd"), + "classes": ("card",), + }, + ), + ( + "Output", + { + "fields": ("output_str", "output_json", "output_files", "output_size", "output_mimetypes", "output_summary"), + "classes": ("card", "wide"), + }, + ), ) - list_filter = ('status', 'plugin', 'start_ts') - ordering = ['-start_ts'] + list_filter = ("status", "plugin", "start_ts") + ordering = ["-start_ts"] list_per_page = SERVER_CONFIG.SNAPSHOTS_PER_PAGE - paginator = AccelleratedPaginator + paginator = AcceleratedPaginator save_on_top = True - actions = ['delete_selected'] + actions = ["delete_selected"] class Meta: - verbose_name = 'Archive Result' - verbose_name_plural = 'Archive Results' + verbose_name = "Archive Result" + verbose_name_plural = "Archive Results" def change_view(self, request, object_id, form_url="", extra_context=None): self.request = request return super().change_view(request, object_id, form_url, extra_context) + def changelist_view(self, request, extra_context=None): + self.request = request + return super().changelist_view(request, extra_context) + def get_queryset(self, request): return ( super() .get_queryset(request) - .select_related('snapshot', 'process') - .prefetch_related('snapshot__tags') - .annotate(snapshot_first_tag=Min('snapshot__tags__name')) + .select_related("snapshot", "process") + .prefetch_related("snapshot__tags") + .annotate(snapshot_first_tag=Min("snapshot__tags__name")) ) def get_search_results(self, request, queryset, search_term): @@ -402,15 +502,14 @@ class ArchiveResultAdmin(BaseModelAdmin): return queryset, False queryset = queryset.annotate( - snapshot_id_text=Cast('snapshot__id', output_field=TextField()), - snapshot_crawl_id_text=Cast('snapshot__crawl_id', output_field=TextField()), - output_json_text=Cast('output_json', output_field=TextField()), - cmd_text=Cast('process__cmd', output_field=TextField()), + snapshot_id_text=Cast("snapshot__id", output_field=TextField()), + snapshot_crawl_id_text=Cast("snapshot__crawl_id", output_field=TextField()), + output_json_text=Cast("output_json", output_field=TextField()), + cmd_text=Cast("process__cmd", output_field=TextField()), ) search_bits = [ - bit[1:-1] if len(bit) >= 2 and bit[0] == bit[-1] and bit[0] in {'"', "'"} else bit - for bit in smart_split(search_term) + bit[1:-1] if len(bit) >= 2 and bit[0] == bit[-1] and bit[0] in {'"', "'"} else bit for bit in smart_split(search_term) ] search_bits = [bit.strip() for bit in search_bits if bit.strip()] if not search_bits: @@ -427,22 +526,44 @@ class ArchiveResultAdmin(BaseModelAdmin): | Q(hook_name__icontains=bit) | Q(output_str__icontains=bit) | Q(output_json_text__icontains=bit) - | Q(cmd_text__icontains=bit) + | Q(cmd_text__icontains=bit), ) return queryset.filter(reduce(and_, filters)).distinct(), True - @admin.display(description='Details', ordering='id') + def get_snapshot_view_url(self, result: ArchiveResult) -> str: + return build_snapshot_url(str(result.snapshot_id), request=getattr(self, "request", None)) + + def get_output_view_url(self, result: ArchiveResult) -> str: + output_path = result.embed_path() if hasattr(result, "embed_path") else None + if not output_path: + output_path = result.plugin or "" + return build_snapshot_url(str(result.snapshot_id), output_path, request=getattr(self, "request", None)) + + def get_output_files_url(self, result: ArchiveResult) -> str: + return f"{build_snapshot_url(str(result.snapshot_id), result.plugin, request=getattr(self, 'request', None))}/?files=1" + + def get_output_zip_url(self, result: ArchiveResult) -> str: + return f"{self.get_output_files_url(result)}&download=zip" + + @admin.display(description="Details", ordering="id") def details_link(self, result): return format_html( '{}', - reverse('admin:core_archiveresult_change', args=[result.id]), + reverse("admin:core_archiveresult_change", args=[result.id]), str(result.id)[-8:], ) + @admin.display(description="Zip") + def zip_link(self, result): + return format_html( + '⬇ ZIP', + self.get_output_zip_url(result), + ) + @admin.display( - description='Snapshot', - ordering='snapshot__url', + description="Snapshot", + ordering="snapshot__url", ) def snapshot_info(self, result): snapshot_id = str(result.snapshot_id) @@ -450,29 +571,28 @@ class ArchiveResultAdmin(BaseModelAdmin): '[{}]   {}   {}
', build_snapshot_url(snapshot_id, "index.html"), snapshot_id[:8], - result.snapshot.bookmarked_at.strftime('%Y-%m-%d %H:%M'), + result.snapshot.bookmarked_at.strftime("%Y-%m-%d %H:%M"), result.snapshot.url[:128], ) - @admin.display( - description='Snapshot Tags' + description="Snapshot Tags", ) def tags_str(self, result): return result.snapshot.tags_str() - @admin.display(description='Tags', ordering='snapshot_first_tag') + @admin.display(description="Tags", ordering="snapshot_first_tag") def tags_inline(self, result): widget = InlineTagEditorWidget(snapshot_id=str(result.snapshot_id), editable=False) tags_html = widget.render( - name=f'tags_{result.snapshot_id}', + name=f"tags_{result.snapshot_id}", value=result.snapshot.tags.all(), - attrs={'id': f'tags_{result.snapshot_id}'}, + attrs={"id": f"tags_{result.snapshot_id}"}, snapshot_id=str(result.snapshot_id), ) return mark_safe(f'{tags_html}') - @admin.display(description='Status', ordering='status') + @admin.display(description="Status", ordering="status") def status_badge(self, result): status = result.status or ArchiveResult.StatusChoices.QUEUED return format_html( @@ -482,7 +602,7 @@ class ArchiveResultAdmin(BaseModelAdmin): result.get_status_display() or status, ) - @admin.display(description='Plugin', ordering='plugin') + @admin.display(description="Plugin", ordering="plugin") def plugin_with_icon(self, result): icon = get_plugin_icon(result.plugin) return format_html( @@ -494,36 +614,36 @@ class ArchiveResultAdmin(BaseModelAdmin): result.plugin, ) - @admin.display(description='Process', ordering='process__pid') + @admin.display(description="Process", ordering="process__pid") def process_link(self, result): if not result.process_id: - return '-' - process_label = result.process.pid if result.process and result.process.pid else '-' + return "-" + process_label = result.process.pid if result.process and result.process.pid else "-" return format_html( '{}', - reverse('admin:machine_process_change', args=[result.process_id]), + reverse("admin:machine_process_change", args=[result.process_id]), process_label, ) - @admin.display(description='Machine', ordering='process__machine__hostname') + @admin.display(description="Machine", ordering="process__machine__hostname") def machine_link(self, result): if not result.process_id or not result.process or not result.process.machine_id: - return '-' + return "-" machine = result.process.machine return format_html( '{} {}', - reverse('admin:machine_machine_change', args=[machine.id]), + reverse("admin:machine_machine_change", args=[machine.id]), str(machine.id)[:8], machine.hostname, ) - @admin.display(description='Command') + @admin.display(description="Command") def cmd_str(self, result): display_cmd = build_abx_dl_display_command(result) replay_cmd = build_abx_dl_replay_command(result) return format_html( - ''' -
+ """ +
- ''', + """, replay_cmd, replay_cmd, display_cmd, @@ -542,8 +662,8 @@ class ArchiveResultAdmin(BaseModelAdmin): def output_display(self, result): # Determine output link path - use embed_path() which checks output_files - embed_path = result.embed_path() if hasattr(result, 'embed_path') else None - output_path = embed_path if (result.status == 'succeeded' and embed_path) else 'index.html' + embed_path = result.embed_path() if hasattr(result, "embed_path") else None + output_path = embed_path if (result.status == "succeeded" and embed_path) else "index.html" snapshot_id = str(result.snapshot_id) return format_html( '↗️
{}
', @@ -551,13 +671,13 @@ class ArchiveResultAdmin(BaseModelAdmin): result.output_str, ) - @admin.display(description='Output', ordering='output_str') + @admin.display(description="Output", ordering="output_str") def output_str_display(self, result): - output_text = str(result.output_str or '').strip() + output_text = str(result.output_str or "").strip() if not output_text: - return '-' + return "-" - live_path = result.embed_path() if hasattr(result, 'embed_path') else None + live_path = result.embed_path() if hasattr(result, "embed_path") else None if live_path: return format_html( '{}', @@ -572,8 +692,48 @@ class ArchiveResultAdmin(BaseModelAdmin): output_text, ) + @admin.display(description="") + def admin_actions(self, result): + return format_html( + """ + + """, + self.get_output_view_url(result), + self.get_output_files_url(result), + self.get_output_zip_url(result), + self.get_snapshot_view_url(result), + ) + def output_summary(self, result): - snapshot_dir = Path(DATA_DIR) / str(result.pwd).split('data/', 1)[-1] + snapshot_dir = Path(DATA_DIR) / str(result.pwd).split("data/", 1)[-1] output_html = format_html( '
{}

', result.output_str, @@ -583,9 +743,13 @@ class ArchiveResultAdmin(BaseModelAdmin): 'See result files ...
',
             build_snapshot_url(snapshot_id, "index.html"),
         )
-        embed_path = result.embed_path() if hasattr(result, 'embed_path') else ''
-        path_from_embed = (snapshot_dir / (embed_path or ''))
-        output_html += format_html('{}/{}

', str(snapshot_dir), str(embed_path)) + embed_path = result.embed_path() if hasattr(result, "embed_path") else "" + path_from_embed = snapshot_dir / (embed_path or "") + output_html += format_html( + '{}/{}

', + str(snapshot_dir), + str(embed_path), + ) if os.access(path_from_embed, os.R_OK): root_dir = str(path_from_embed) else: @@ -594,19 +758,22 @@ class ArchiveResultAdmin(BaseModelAdmin): # print(root_dir, str(list(os.walk(root_dir)))) for root, dirs, files in os.walk(root_dir): - depth = root.replace(root_dir, '').count(os.sep) + 1 + depth = root.replace(root_dir, "").count(os.sep) + 1 if depth > 2: continue - indent = ' ' * 4 * (depth) + indent = " " * 4 * (depth) output_html += format_html('{}{}/
', indent, os.path.basename(root)) - indentation_str = ' ' * 4 * (depth + 1) + indentation_str = " " * 4 * (depth + 1) for filename in sorted(files): - is_hidden = filename.startswith('.') - output_html += format_html('{}{}
', int(not is_hidden), indentation_str, filename.strip()) - - return output_html + mark_safe('
') - + is_hidden = filename.startswith(".") + output_html += format_html( + '{}{}
', + int(not is_hidden), + indentation_str, + filename.strip(), + ) + return output_html + mark_safe("") def register_admin(admin_site): diff --git a/archivebox/core/admin_site.py b/archivebox/core/admin_site.py index ae6be452..770a1d2a 100644 --- a/archivebox/core/admin_site.py +++ b/archivebox/core/admin_site.py @@ -1,4 +1,4 @@ -__package__ = 'archivebox.core' +__package__ = "archivebox.core" from typing import TYPE_CHECKING, Any @@ -18,23 +18,23 @@ if TYPE_CHECKING: class ArchiveBoxAdmin(admin.AdminSite): - site_header = 'ArchiveBox' - index_title = 'Admin Views' - site_title = 'Admin' - namespace = 'admin' + site_header = "ArchiveBox" + index_title = "Admin Views" + site_title = "Admin" + namespace = "admin" - def get_app_list(self, request: 'HttpRequest', app_label: str | None = None) -> list['AppDict']: + def get_app_list(self, request: "HttpRequest", app_label: str | None = None) -> list["AppDict"]: if app_label is None: return adv_get_app_list(self, request) return adv_get_app_list(self, request, app_label) - def admin_data_index_view(self, request: 'HttpRequest', **kwargs: Any) -> 'TemplateResponse': + def admin_data_index_view(self, request: "HttpRequest", **kwargs: Any) -> "TemplateResponse": return adv_admin_data_index_view(self, request, **kwargs) - def get_admin_data_urls(self) -> list['URLResolver | URLPattern']: + def get_admin_data_urls(self) -> list["URLResolver | URLPattern"]: return adv_get_admin_data_urls(self) - def get_urls(self) -> list['URLResolver | URLPattern']: + def get_urls(self) -> list["URLResolver | URLPattern"]: return self.get_admin_data_urls() + super().get_urls() @@ -43,7 +43,6 @@ archivebox_admin = ArchiveBoxAdmin() # TODO: https://stackoverflow.com/questions/40760880/add-custom-button-to-django-admin-panel - ############### Admin Data View sections are defined in settings.ADMIN_DATA_VIEWS ######### diff --git a/archivebox/core/admin_snapshots.py b/archivebox/core/admin_snapshots.py index 0202e62c..266ed974 100644 --- a/archivebox/core/admin_snapshots.py +++ b/archivebox/core/admin_snapshots.py @@ -1,32 +1,30 @@ +__package__ = "archivebox.core" -__package__ = 'archivebox.core' - -import os -from pathlib import Path +import json +from functools import lru_cache from django.contrib import admin, messages from django.urls import path from django.shortcuts import get_object_or_404, redirect from django.utils.html import format_html from django.utils.safestring import mark_safe -from django.utils import timezone from django.db.models import Q, Sum, Count, Prefetch from django.db.models.functions import Coalesce from django import forms from django.template import Template, RequestContext from django.contrib.admin.helpers import ActionForm -from django.middleware.csrf import get_token from archivebox.config import DATA_DIR from archivebox.config.common import SERVER_CONFIG from archivebox.misc.util import htmldecode, urldecode -from archivebox.misc.paginators import AccelleratedPaginator +from archivebox.misc.paginators import AcceleratedPaginator from archivebox.misc.logging_util import printable_filesize from archivebox.search.admin import SearchResultsAdminMixin from archivebox.core.host_utils import build_snapshot_url, build_web_url +from archivebox.hooks import get_plugin_icon, get_plugin_name, get_plugins from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin -from archivebox.workers.tasks import bg_archive_snapshot, bg_archive_snapshots, bg_add +from archivebox.workers.tasks import bg_archive_snapshots, bg_add from archivebox.core.models import Tag, Snapshot, ArchiveResult from archivebox.core.admin_archiveresults import render_archiveresults_list @@ -37,28 +35,45 @@ from archivebox.core.widgets import TagEditorWidget, InlineTagEditorWidget GLOBAL_CONTEXT = {} +@lru_cache(maxsize=1) +def _plugin_sort_order() -> dict[str, int]: + return {get_plugin_name(plugin): idx for idx, plugin in enumerate(get_plugins())} + + +@lru_cache(maxsize=256) +def _expected_snapshot_hook_total(config_json: str) -> int: + from archivebox.hooks import discover_hooks + + try: + config = json.loads(config_json) if config_json else {} + except Exception: + return 0 + + return len(discover_hooks("Snapshot", config=config)) + + class SnapshotActionForm(ActionForm): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) # Define tags field in __init__ to avoid database access during app initialization - self.fields['tags'] = forms.CharField( - label='', + self.fields["tags"] = forms.CharField( + label="", required=False, widget=TagEditorWidget(), ) def clean_tags(self): """Parse comma-separated tag names into Tag objects.""" - tags_str = self.cleaned_data.get('tags', '') + tags_str = self.cleaned_data.get("tags", "") if not tags_str: return [] - tag_names = [name.strip() for name in tags_str.split(',') if name.strip()] + tag_names = [name.strip() for name in tags_str.split(",") if name.strip()] tags = [] for name in tag_names: tag, _ = Tag.objects.get_or_create( name__iexact=name, - defaults={'name': name} + defaults={"name": name}, ) # Use the existing tag if found by case-insensitive match tag = Tag.objects.filter(name__iexact=name).first() or tag @@ -74,11 +89,11 @@ class SnapshotActionForm(ActionForm): class TagNameListFilter(admin.SimpleListFilter): - title = 'By tag name' - parameter_name = 'tag' + title = "By tag name" + parameter_name = "tag" def lookups(self, request, model_admin): - return [(str(tag.pk), tag.name) for tag in Tag.objects.order_by('name')] + return [(str(tag.pk), tag.name) for tag in Tag.objects.order_by("name")] def queryset(self, request, queryset): if self.value(): @@ -88,23 +103,24 @@ class TagNameListFilter(admin.SimpleListFilter): class SnapshotAdminForm(forms.ModelForm): """Custom form for Snapshot admin with tag editor widget.""" + tags_editor = forms.CharField( - label='Tags', + label="Tags", required=False, widget=TagEditorWidget(), - help_text='Type tag names and press Enter or Space to add. Click × to remove.', + help_text="Type tag names and press Enter or Space to add. Click × to remove.", ) class Meta: model = Snapshot - fields = '__all__' + fields = "__all__" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) # Initialize tags_editor with current tags if self.instance and self.instance.pk: - self.initial['tags_editor'] = ','.join( - sorted(tag.name for tag in self.instance.tags.all()) + self.initial["tags_editor"] = ",".join( + sorted(tag.name for tag in self.instance.tags.all()), ) def save(self, commit=True): @@ -113,19 +129,19 @@ class SnapshotAdminForm(forms.ModelForm): # Handle tags_editor field if commit: instance.save() - save_m2m = getattr(self, '_save_m2m', None) + save_m2m = getattr(self, "_save_m2m", None) if callable(save_m2m): save_m2m() # Parse and save tags from tags_editor - tags_str = self.cleaned_data.get('tags_editor', '') + tags_str = self.cleaned_data.get("tags_editor", "") if tags_str: - tag_names = [name.strip() for name in tags_str.split(',') if name.strip()] + tag_names = [name.strip() for name in tags_str.split(",") if name.strip()] tags = [] for name in tag_names: tag, _ = Tag.objects.get_or_create( name__iexact=name, - defaults={'name': name} + defaults={"name": name}, ) tag = Tag.objects.filter(name__iexact=name).first() or tag tags.append(tag) @@ -138,58 +154,104 @@ class SnapshotAdminForm(forms.ModelForm): class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin): form = SnapshotAdminForm - list_display = ('created_at', 'preview_icon', 'title_str', 'tags_inline', 'status_with_progress', 'files', 'size_with_stats') - sort_fields = ('title_str', 'created_at', 'status', 'crawl') - readonly_fields = ('admin_actions', 'status_info', 'imported_timestamp', 'created_at', 'modified_at', 'downloaded_at', 'output_dir', 'archiveresults_list') - search_fields = ('id', 'url', 'timestamp', 'title', 'tags__name') - list_filter = ('created_at', 'downloaded_at', 'archiveresult__status', 'crawl__created_by', TagNameListFilter) + list_display = ("created_at", "preview_icon", "title_str", "tags_inline", "status_with_progress", "files", "size_with_stats") + sort_fields = ("title_str", "created_at", "status", "crawl") + readonly_fields = ( + "admin_actions", + "snapshot_summary", + "url_favicon", + "tags_badges", + "imported_timestamp", + "created_at", + "modified_at", + "downloaded_at", + "output_dir", + "archiveresults_list", + ) + search_fields = ("id", "url", "timestamp", "title", "tags__name") + list_filter = ("created_at", "downloaded_at", "archiveresult__status", "crawl__created_by", TagNameListFilter) fieldsets = ( - ('Actions', { - 'fields': ('admin_actions',), - 'classes': ('card', 'wide', 'actions-card'), - }), - ('URL', { - 'fields': ('url', 'title'), - 'classes': ('card', 'wide'), - }), - ('Tags', { - 'fields': ('tags_editor',), - 'classes': ('card',), - }), - ('Status', { - 'fields': ('status', 'retry_at', 'status_info'), - 'classes': ('card',), - }), - ('Timestamps', { - 'fields': ('bookmarked_at', 'created_at', 'modified_at', 'downloaded_at'), - 'classes': ('card',), - }), - ('Relations', { - 'fields': ('crawl',), - 'classes': ('card',), - }), - ('Config', { - 'fields': ('config',), - 'classes': ('card',), - }), - ('Files', { - 'fields': ('output_dir',), - 'classes': ('card',), - }), - ('Archive Results', { - 'fields': ('archiveresults_list',), - 'classes': ('card', 'wide'), - }), + ( + "Actions", + { + "fields": ("admin_actions",), + "classes": ("card", "actions-card"), + }, + ), + ( + "Snapshot", + { + "fields": ("snapshot_summary",), + "classes": ("card",), + }, + ), + ( + "URL", + { + "fields": (("url_favicon", "url"), ("title", "tags_badges")), + "classes": ("card", "wide"), + }, + ), + ( + "Tags", + { + "fields": ("tags_editor",), + "classes": ("card",), + }, + ), + ( + "Status", + { + "fields": ("status", "retry_at"), + "classes": ("card",), + }, + ), + ( + "Timestamps", + { + "fields": ("bookmarked_at", "created_at", "modified_at", "downloaded_at"), + "classes": ("card",), + }, + ), + ( + "Relations", + { + "fields": ("crawl",), + "classes": ("card",), + }, + ), + ( + "Config", + { + "fields": ("config",), + "description": 'Uses Crawl.config by default. Only set per-snapshot overrides here when needed.', + "classes": ("card",), + }, + ), + ( + "Files", + { + "fields": ("output_dir",), + "classes": ("card",), + }, + ), + ( + "Archive Results", + { + "fields": ("archiveresults_list",), + "classes": ("card", "wide"), + }, + ), ) - ordering = ['-created_at'] - actions = ['add_tags', 'remove_tags', 'resnapshot_snapshot', 'update_snapshots', 'overwrite_snapshots', 'delete_snapshots'] + ordering = ["-created_at"] + actions = ["add_tags", "remove_tags", "resnapshot_snapshot", "update_snapshots", "overwrite_snapshots", "delete_snapshots"] inlines = [] # Removed TagInline, using TagEditorWidget instead list_per_page = min(max(5, SERVER_CONFIG.SNAPSHOTS_PER_PAGE), 5000) action_form = SnapshotActionForm - paginator = AccelleratedPaginator + paginator = AcceleratedPaginator save_on_top = True show_full_result_count = False @@ -200,37 +262,48 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin): try: return super().changelist_view(request, extra_context | GLOBAL_CONTEXT) except Exception as e: - self.message_user(request, f'Error occurred while loading the page: {str(e)} {request.GET} {request.POST}') + self.message_user(request, f"Error occurred while loading the page: {str(e)} {request.GET} {request.POST}") return super().changelist_view(request, GLOBAL_CONTEXT) def get_actions(self, request): actions = super().get_actions(request) if not actions: return {} - delete_selected = actions.get('delete_selected') - if delete_selected: - func, name, _desc = delete_selected - actions['delete_selected'] = (func, name, 'Delete') + actions.pop("delete_selected", None) return actions + def get_snapshot_view_url(self, obj: Snapshot) -> str: + return build_snapshot_url(str(obj.id), request=getattr(self, "request", None)) + + def get_snapshot_files_url(self, obj: Snapshot) -> str: + return f"{build_snapshot_url(str(obj.id), request=getattr(self, 'request', None))}/?files=1" + + def get_snapshot_zip_url(self, obj: Snapshot) -> str: + return f"{self.get_snapshot_files_url(obj)}&download=zip" def get_urls(self): urls = super().get_urls() custom_urls = [ - path('grid/', self.admin_site.admin_view(self.grid_view), name='grid'), - path('/redo-failed/', self.admin_site.admin_view(self.redo_failed_view), name='core_snapshot_redo_failed'), + path("grid/", self.admin_site.admin_view(self.grid_view), name="grid"), + path("/redo-failed/", self.admin_site.admin_view(self.redo_failed_view), name="core_snapshot_redo_failed"), ] return custom_urls + urls def redo_failed_view(self, request, object_id): snapshot = get_object_or_404(Snapshot, pk=object_id) - if request.method == 'POST': - queued = bg_archive_snapshot(snapshot, overwrite=False) - messages.success( - request, - f"Queued {queued} snapshot for re-archiving. The background runner will process it.", - ) + if request.method == "POST": + retried = snapshot.retry_failed_archiveresults() + if retried: + messages.success( + request, + f"Queued {retried} failed/skipped extractors for retry on this snapshot.", + ) + else: + messages.info( + request, + "No failed/skipped extractors were found on this snapshot.", + ) return redirect(snapshot.admin_change_url) @@ -243,61 +316,65 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin): def get_queryset(self, request): self.request = request ordering_fields = self._get_ordering_fields(request) - needs_size_sort = 'size_with_stats' in ordering_fields - needs_files_sort = 'files' in ordering_fields - needs_tags_sort = 'tags_inline' in ordering_fields + needs_size_sort = "size_with_stats" in ordering_fields + needs_files_sort = "files" in ordering_fields + needs_tags_sort = "tags_inline" in ordering_fields + is_change_view = getattr(getattr(request, "resolver_match", None), "url_name", "") == "core_snapshot_change" - prefetch_qs = ArchiveResult.objects.filter( - Q(status='succeeded') - ).only( - 'id', - 'snapshot_id', - 'plugin', - 'status', - 'output_size', - 'output_files', - 'output_str', + prefetch_qs = ArchiveResult.objects.only( + "id", + "snapshot_id", + "plugin", + "status", + "output_size", + "output_files", + "output_str", ) + if not is_change_view: + prefetch_qs = prefetch_qs.filter(Q(status="succeeded")) qs = ( super() .get_queryset(request) - .select_related('crawl__created_by') - .defer('config', 'notes') - .prefetch_related('tags') - .prefetch_related(Prefetch('archiveresult_set', queryset=prefetch_qs)) + .select_related("crawl__created_by") + .defer("config", "notes") + .prefetch_related("tags") + .prefetch_related(Prefetch("archiveresult_set", queryset=prefetch_qs)) ) if needs_size_sort: qs = qs.annotate( - output_size_sum=Coalesce(Sum( - 'archiveresult__output_size', - filter=Q(archiveresult__status='succeeded'), - ), 0), + output_size_sum=Coalesce( + Sum("archiveresult__output_size"), + 0, + ), ) if needs_files_sort: qs = qs.annotate( ar_succeeded_count=Count( - 'archiveresult', - filter=Q(archiveresult__status='succeeded'), + "archiveresult", + filter=Q(archiveresult__status="succeeded"), ), ) if needs_tags_sort: - qs = qs.annotate(tag_count=Count('tags', distinct=True)) + qs = qs.annotate(tag_count=Count("tags", distinct=True)) return qs @admin.display(description="Imported Timestamp") def imported_timestamp(self, obj): - context = RequestContext(self.request, { - 'bookmarked_date': obj.bookmarked_at, - 'timestamp': obj.timestamp, - }) + context = RequestContext( + self.request, + { + "bookmarked_date": obj.bookmarked_at, + "timestamp": obj.timestamp, + }, + ) html = Template("""{{bookmarked_date}} ({{timestamp}})""") return mark_safe(html.render(context)) - + # pretty_time = obj.bookmarked.strftime('%Y-%m-%d %H:%M:%S') # return f'{pretty_time} ({obj.timestamp})' @@ -323,14 +400,14 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin): # obj.pk, # ) - @admin.display(description='') + @admin.display(description="") def admin_actions(self, obj): - summary_url = build_web_url(f'/{obj.archive_path}') - results_url = build_web_url(f'/{obj.archive_path}/index.html#all') - redo_failed_url = f'/admin/core/snapshot/{obj.pk}/redo-failed/' - csrf_token = get_token(self.request) + summary_url = self.get_snapshot_view_url(obj) + files_url = self.get_snapshot_files_url(obj) + zip_url = self.get_snapshot_zip_url(obj) + redo_failed_url = f"/admin/core/snapshot/{obj.pk}/redo-failed/" return format_html( - ''' + """ -

- Tip: Redo Failed runs immediately. The other action buttons link to the list view with this snapshot pre-selected. -

- ''', + """, summary_url, - results_url, + files_url, + zip_url, obj.url, obj.pk, redo_failed_url, - csrf_token, obj.pk, obj.pk, ) def status_info(self, obj): - favicon_url = build_snapshot_url(str(obj.id), 'favicon.ico') + favicon_url = build_snapshot_url(str(obj.id), "favicon.ico") return format_html( - ''' + """ Archived: {} ({} files {})     Favicon:     Extension: {}     - ''', - '✅' if obj.is_archived else '❌', + """, + "✅" if obj.is_archived else "❌", obj.num_outputs, - self.size(obj) or '0kb', + self.size(obj) or "0kb", favicon_url, - obj.extension or '-', + obj.extension or "-", ) - @admin.display(description='Archive Results') + @admin.display(description="Archive Results") def archiveresults_list(self, obj): return render_archiveresults_list(obj.archiveresult_set.all()) @admin.display( - description='Title', - ordering='title', + description="Title", + ordering="title", ) def title_str(self, obj): - title_raw = (obj.title or '').strip() - url_raw = (obj.url or '').strip() + title_raw = (obj.title or "").strip() + url_raw = (obj.url or "").strip() title_normalized = title_raw.lower() url_normalized = url_raw.lower() - show_title = bool(title_raw) and title_normalized != 'pending...' and title_normalized != url_normalized - css_class = 'fetched' if show_title else 'pending' + show_title = bool(title_raw) and title_normalized != "pending..." and title_normalized != url_normalized + css_class = "fetched" if show_title else "pending" - detail_url = build_web_url(f'/{obj.archive_path_from_db}/index.html') - title_html = '' + detail_url = build_web_url(f"/{obj.archive_path_from_db}/index.html") + title_html = "" if show_title: title_html = format_html( - '
' - '{}' - '', + '{}', detail_url, css_class, urldecode(htmldecode(title_raw))[:128], ) return format_html( - '{}' + "{}" '
' - '{}' - '
', + '{}' + "
", title_html, url_raw or obj.url, (url_raw or obj.url)[:128], ) - @admin.display(description='Tags', ordering='tag_count') + @admin.display(description="Tags", ordering="tag_count") def tags_inline(self, obj): widget = InlineTagEditorWidget(snapshot_id=str(obj.pk)) + tags = self._get_prefetched_tags(obj) tags_html = widget.render( - name=f'tags_{obj.pk}', - value=obj.tags.all(), - attrs={'id': f'tags_{obj.pk}'}, + name=f"tags_{obj.pk}", + value=tags if tags is not None else obj.tags.all(), + attrs={"id": f"tags_{obj.pk}"}, snapshot_id=str(obj.pk), ) return mark_safe(f'{tags_html}') - @admin.display(description='Preview', empty_value='') - def preview_icon(self, obj): + @admin.display(description="Tags") + def tags_badges(self, obj): + widget = InlineTagEditorWidget(snapshot_id=str(obj.pk), editable=False) + tags = self._get_prefetched_tags(obj) + tags_html = widget.render( + name=f"tags_readonly_{obj.pk}", + value=tags if tags is not None else obj.tags.all(), + attrs={"id": f"tags_readonly_{obj.pk}"}, + snapshot_id=str(obj.pk), + ) + return mark_safe(f'{tags_html}') + + def _get_preview_data(self, obj): results = self._get_prefetched_results(obj) - has_screenshot = False - has_favicon = False if results is not None: - has_screenshot = any(r.plugin == 'screenshot' for r in results) - has_favicon = any(r.plugin == 'favicon' for r in results) + has_screenshot = any(r.plugin == "screenshot" for r in results) + has_favicon = any(r.plugin == "favicon" for r in results) + else: + available_plugins = set(obj.archiveresult_set.filter(plugin__in=("screenshot", "favicon")).values_list("plugin", flat=True)) + has_screenshot = "screenshot" in available_plugins + has_favicon = "favicon" in available_plugins if not has_screenshot and not has_favicon: return None if has_screenshot: - img_url = build_snapshot_url(str(obj.id), 'screenshot/screenshot.png') + img_url = build_snapshot_url(str(obj.id), "screenshot/screenshot.png") fallbacks = [ - build_snapshot_url(str(obj.id), 'screenshot.png'), - build_snapshot_url(str(obj.id), 'favicon/favicon.ico'), - build_snapshot_url(str(obj.id), 'favicon.ico'), + build_snapshot_url(str(obj.id), "screenshot.png"), + build_snapshot_url(str(obj.id), "favicon/favicon.ico"), + build_snapshot_url(str(obj.id), "favicon.ico"), ] - img_alt = 'Screenshot' - preview_class = 'screenshot' + img_alt = "Screenshot" + preview_class = "screenshot" else: - img_url = build_snapshot_url(str(obj.id), 'favicon/favicon.ico') + img_url = build_snapshot_url(str(obj.id), "favicon/favicon.ico") fallbacks = [ - build_snapshot_url(str(obj.id), 'favicon.ico'), + build_snapshot_url(str(obj.id), "favicon.ico"), ] - img_alt = 'Favicon' - preview_class = 'favicon' + img_alt = "Favicon" + preview_class = "favicon" - fallback_list = ','.join(fallbacks) + fallback_list = ",".join(fallbacks) onerror_js = ( "this.dataset.fallbacks && this.dataset.fallbacks.length ? " "(this.src=this.dataset.fallbacks.split(',').shift(), " @@ -500,45 +597,153 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin): "this.remove()" ) + return { + "img_url": img_url, + "img_alt": img_alt, + "preview_class": preview_class, + "onerror_js": onerror_js, + "fallback_list": fallback_list, + } + + @admin.display(description="", empty_value="") + def url_favicon(self, obj): + preview = self._get_preview_data(obj) + if not preview: + return "" + + favicon_url = build_snapshot_url(str(obj.id), "favicon/favicon.ico") + fallback_list = ",".join([build_snapshot_url(str(obj.id), "favicon.ico")]) + onerror_js = ( + "this.dataset.fallbacks && this.dataset.fallbacks.length ? " + "(this.src=this.dataset.fallbacks.split(',').shift(), " + "this.dataset.fallbacks=this.dataset.fallbacks.split(',').slice(1).join(',')) : " + "this.closest('a') && this.closest('a').remove()" + ) + return format_html( - '{}', - img_url, - img_alt, - preview_class, + '' + 'Favicon' + "", + favicon_url, + favicon_url, onerror_js, fallback_list, ) + @admin.display(description="Preview", empty_value="") + def preview_icon(self, obj): + preview = self._get_preview_data(obj) + if not preview: + return None + + return format_html( + '{}', + preview["img_url"], + preview["img_alt"], + preview["preview_class"], + preview["onerror_js"], + preview["fallback_list"], + ) + + @admin.display(description=" ", empty_value="") + def snapshot_summary(self, obj): + preview = self._get_preview_data(obj) + stats = self._get_progress_stats(obj) + archive_size = stats["output_size"] or 0 + size_txt = printable_filesize(archive_size) if archive_size else "pending" + screenshot_html = "" + + if preview: + screenshot_html = format_html( + '' + '{alt}' + "", + href=build_web_url(f"/{obj.archive_path}"), + src=preview["img_url"], + alt=preview["img_alt"], + onerror=preview["onerror_js"], + fallbacks=preview["fallback_list"], + ) + + return format_html( + '
' + "{}" + '
' + '
snap_dir size
' + '
{}
' + '
' + 'Open {} to inspect files.' + "
" + "
" + "
", + screenshot_html, + size_txt, + build_web_url(f"/{obj.archive_path}"), + obj.archive_path, + ) + @admin.display( - description='Files Saved', - ordering='ar_succeeded_count', + description="Files Saved", + ordering="ar_succeeded_count", ) def files(self, obj): - # return '-' - return obj.icons(path=obj.archive_path_from_db) + results = self._get_prefetched_results(obj) + if results is None: + results = obj.archiveresult_set.only("plugin", "status", "output_files", "output_str") + plugins_with_output: dict[str, ArchiveResult] = {} + for result in results: + if result.status != ArchiveResult.StatusChoices.SUCCEEDED: + continue + if not (result.output_files or str(result.output_str or "").strip()): + continue + plugins_with_output.setdefault(result.plugin, result) + + if not plugins_with_output: + return mark_safe('...') + + sorted_results = sorted( + plugins_with_output.values(), + key=lambda result: (_plugin_sort_order().get(result.plugin, 9999), result.plugin), + ) + output = [ + format_html( + '{}', + self._result_output_href(obj, result), + result.plugin, + get_plugin_icon(result.plugin), + ) + for result in sorted_results + ] + + return format_html( + '{}', + mark_safe("".join(output)), + ) @admin.display( # ordering='archiveresult_count' ) def size(self, obj): - archive_size = os.access(Path(obj.output_dir) / 'index.html', os.F_OK) and obj.archive_size + archive_size = self._get_progress_stats(obj)["output_size"] or 0 if archive_size: size_txt = printable_filesize(archive_size) if archive_size > 52428800: - size_txt = mark_safe(f'{size_txt}') + size_txt = mark_safe(f"{size_txt}") else: size_txt = mark_safe('...') return format_html( '{}', - build_web_url(f'/{obj.archive_path}'), + build_web_url(f"/{obj.archive_path}"), size_txt, ) @admin.display( - description='Status', - ordering='status', + description="Status", + ordering="status", ) def status_with_progress(self, obj): """Show status with progress bar for in-progress snapshots.""" @@ -546,25 +751,25 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin): # Status badge colors status_colors = { - 'queued': ('#f59e0b', '#fef3c7'), # amber - 'started': ('#3b82f6', '#dbeafe'), # blue - 'sealed': ('#10b981', '#d1fae5'), # green - 'succeeded': ('#10b981', '#d1fae5'), # green - 'failed': ('#ef4444', '#fee2e2'), # red - 'backoff': ('#f59e0b', '#fef3c7'), # amber - 'skipped': ('#6b7280', '#f3f4f6'), # gray + "queued": ("#f59e0b", "#fef3c7"), # amber + "started": ("#3b82f6", "#dbeafe"), # blue + "sealed": ("#10b981", "#d1fae5"), # green + "succeeded": ("#10b981", "#d1fae5"), # green + "failed": ("#ef4444", "#fee2e2"), # red + "backoff": ("#f59e0b", "#fef3c7"), # amber + "skipped": ("#6b7280", "#f3f4f6"), # gray } - fg_color, bg_color = status_colors.get(obj.status, ('#6b7280', '#f3f4f6')) + fg_color, bg_color = status_colors.get(obj.status, ("#6b7280", "#f3f4f6")) # For started snapshots, show progress bar - if obj.status == 'started' and stats['total'] > 0: - percent = stats['percent'] - running = stats['running'] - succeeded = stats['succeeded'] - failed = stats['failed'] + if obj.status == "started" and stats["total"] > 0: + percent = stats["percent"] + running = stats["running"] + succeeded = stats["succeeded"] + failed = stats["failed"] return format_html( - '''
+ """
{}/{} hooks @@ -576,13 +781,13 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
✓{} ✗{} ⏳{}
-
''', - succeeded + failed + stats['skipped'], - stats['total'], - int(succeeded / stats['total'] * 100) if stats['total'] else 0, - int(succeeded / stats['total'] * 100) if stats['total'] else 0, - int((succeeded + failed) / stats['total'] * 100) if stats['total'] else 0, - int((succeeded + failed) / stats['total'] * 100) if stats['total'] else 0, +
""", + succeeded + failed + stats["skipped"], + stats["total"], + int(succeeded / stats["total"] * 100) if stats["total"] else 0, + int(succeeded / stats["total"] * 100) if stats["total"] else 0, + int((succeeded + failed) / stats["total"] * 100) if stats["total"] else 0, + int((succeeded + failed) / stats["total"] * 100) if stats["total"] else 0, percent, succeeded, failed, @@ -599,85 +804,139 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin): ) @admin.display( - description='Size', - ordering='output_size_sum', + description="Size", + ordering="output_size_sum", ) def size_with_stats(self, obj): """Show archive size with output size from archive results.""" stats = self._get_progress_stats(obj) - output_size = stats['output_size'] + output_size = stats["output_size"] size_bytes = output_size or 0 + zip_url = self.get_snapshot_zip_url(obj) + zip_link = format_html( + '⬇ ZIP', + zip_url, + ) if size_bytes: size_txt = printable_filesize(size_bytes) if size_bytes > 52428800: # 50MB - size_txt = mark_safe(f'{size_txt}') + size_txt = mark_safe(f"{size_txt}") else: size_txt = mark_safe('...') # Show hook statistics - if stats['total'] > 0: + if stats["total"] > 0: return format_html( '' - '{}' + "{}" '
' - '{}/{} hooks
', - build_web_url(f'/{obj.archive_path_from_db}'), + "{}/{} hooks
" + "{}", + build_web_url(f"/{obj.archive_path_from_db}"), size_txt, - stats['succeeded'], - stats['total'], + stats["succeeded"], + stats["total"], + zip_link, ) return format_html( - '{}', - build_web_url(f'/{obj.archive_path_from_db}'), + '{}{}', + build_web_url(f"/{obj.archive_path_from_db}"), size_txt, + zip_link, ) def _get_progress_stats(self, obj): results = self._get_prefetched_results(obj) if results is None: - return obj.get_progress_stats() + stats = obj.get_progress_stats() + expected_total = self._get_expected_hook_total(obj) + total = max(stats["total"], expected_total) + completed = stats["succeeded"] + stats["failed"] + stats.get("skipped", 0) + stats.get("noresults", 0) + stats["total"] = total + stats["pending"] = max(total - completed - stats["running"], 0) + stats["percent"] = int((completed / total * 100) if total > 0 else 0) + return stats - total = len(results) - succeeded = sum(1 for r in results if r.status == 'succeeded') - failed = sum(1 for r in results if r.status == 'failed') - running = sum(1 for r in results if r.status == 'started') - skipped = sum(1 for r in results if r.status == 'skipped') - pending = max(total - succeeded - failed - running - skipped, 0) - completed = succeeded + failed + skipped + expected_total = self._get_expected_hook_total(obj) + observed_total = len(results) + total = max(observed_total, expected_total) + succeeded = sum(1 for r in results if r.status == "succeeded") + failed = sum(1 for r in results if r.status == "failed") + running = sum(1 for r in results if r.status == "started") + skipped = sum(1 for r in results if r.status == "skipped") + noresults = sum(1 for r in results if r.status == "noresults") + pending = max(total - succeeded - failed - running - skipped - noresults, 0) + completed = succeeded + failed + skipped + noresults percent = int((completed / total * 100) if total > 0 else 0) is_sealed = obj.status not in (obj.StatusChoices.QUEUED, obj.StatusChoices.STARTED) output_size = None - if hasattr(obj, 'output_size_sum'): + if hasattr(obj, "output_size_sum"): output_size = obj.output_size_sum or 0 else: - output_size = sum(r.output_size or 0 for r in results if r.status == 'succeeded') + output_size = sum(r.output_size or 0 for r in results) return { - 'total': total, - 'succeeded': succeeded, - 'failed': failed, - 'running': running, - 'pending': pending, - 'skipped': skipped, - 'percent': percent, - 'output_size': output_size or 0, - 'is_sealed': is_sealed, + "total": total, + "succeeded": succeeded, + "failed": failed, + "running": running, + "pending": pending, + "skipped": skipped, + "noresults": noresults, + "percent": percent, + "output_size": output_size or 0, + "is_sealed": is_sealed, } def _get_prefetched_results(self, obj): - if hasattr(obj, '_prefetched_objects_cache') and 'archiveresult_set' in obj._prefetched_objects_cache: + if hasattr(obj, "_prefetched_objects_cache") and "archiveresult_set" in obj._prefetched_objects_cache: return obj.archiveresult_set.all() return None + def _get_expected_hook_total(self, obj) -> int: + from archivebox.config.configset import get_config + + try: + config = get_config(crawl=obj.crawl, snapshot=obj) + config_json = json.dumps(config, sort_keys=True, default=str, separators=(",", ":")) + return _expected_snapshot_hook_total(config_json) + except Exception: + return 0 + + def _get_prefetched_tags(self, obj): + if hasattr(obj, "_prefetched_objects_cache") and "tags" in obj._prefetched_objects_cache: + return list(obj._prefetched_objects_cache["tags"]) + return None + + def _result_output_href(self, obj, result: ArchiveResult) -> str: + ignored = {"stdout.log", "stderr.log", "hook.pid", "listener.pid", "cmd.sh"} + + for rel_path in result.output_file_paths(): + raw_path = str(rel_path or "").strip().lstrip("/") + if not raw_path: + continue + basename = raw_path.rsplit("/", 1)[-1] + if basename in ignored or raw_path.endswith((".pid", ".log", ".sh")): + continue + relative_path = raw_path if raw_path.startswith(f"{result.plugin}/") else f"{result.plugin}/{raw_path}" + return f"/{obj.archive_path_from_db}/{relative_path}" + + raw_output = str(result.output_str or "").strip().lstrip("/") + if raw_output and raw_output not in {".", "./"} and "://" not in raw_output and not raw_output.startswith("/"): + relative_path = raw_output if raw_output.startswith(f"{result.plugin}/") else f"{result.plugin}/{raw_output}" + return f"/{obj.archive_path_from_db}/{relative_path}" + + return f"/{obj.archive_path_from_db}/{result.plugin}/" + def _get_ordering_fields(self, request): - ordering = request.GET.get('o') + ordering = request.GET.get("o") if not ordering: return set() fields = set() - for part in ordering.split('.'): + for part in ordering.split("."): if not part: continue try: @@ -689,8 +948,8 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin): return fields @admin.display( - description='Original URL', - ordering='url', + description="Original URL", + ordering="url", ) def url_str(self, obj): return format_html( @@ -699,10 +958,10 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin): obj.url[:128], ) - @admin.display(description='Health', ordering='health') + @admin.display(description="Health", ordering="health") def health_display(self, obj): h = obj.health - color = 'green' if h >= 80 else 'orange' if h >= 50 else 'red' + color = "green" if h >= 80 else "orange" if h >= 50 else "red" return format_html('{}', color, h) def grid_view(self, request, extra_context=None): @@ -716,7 +975,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin): saved_list_max_show_all = admin_cls.list_max_show_all # Monkey patch here plus core_tags.py - admin_cls.change_list_template = 'private_index_grid.html' + admin_cls.change_list_template = "private_index_grid.html" admin_cls.list_per_page = SERVER_CONFIG.SNAPSHOTS_PER_PAGE admin_cls.list_max_show_all = admin_cls.list_per_page @@ -736,7 +995,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin): # return super().changelist_view(request, extra_context=None) @admin.action( - description="🔁 Redo Failed" + description="🔁 Redo Failed", ) def update_snapshots(self, request, queryset): queued = bg_archive_snapshots(queryset, kwargs={"overwrite": False, "out_dir": DATA_DIR}) @@ -746,24 +1005,29 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin): f"Queued {queued} snapshots for re-archiving. The background runner will process them.", ) - @admin.action( - description="🆕 Archive Now" + description="🆕 Archive Now", ) def resnapshot_snapshot(self, request, queryset): - for snapshot in queryset: - timestamp = timezone.now().isoformat('T', 'seconds') - new_url = snapshot.url.split('#')[0] + f'#{timestamp}' + snapshots = list(queryset) + if not snapshots: + messages.info(request, "No snapshots selected.") + return - bg_add({'urls': new_url, 'tag': snapshot.tags_str()}) + urls = "\n".join(snapshot.url for snapshot in snapshots if snapshot.url) + if not urls: + messages.info(request, "No valid snapshot URLs were found to archive.") + return + + bg_add({"urls": urls}) messages.success( request, - f"Creating {queryset.count()} new fresh snapshots. The background runner will process them.", + f"Creating 1 new crawl with {len(snapshots)} fresh snapshots. The background runner will process them.", ) @admin.action( - description="🔄 Redo" + description="🔄 Redo", ) def overwrite_snapshots(self, request, queryset): queued = bg_archive_snapshots(queryset, kwargs={"overwrite": True, "out_dir": DATA_DIR}) @@ -774,7 +1038,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin): ) @admin.action( - description="🗑️ Delete" + description="🗑️ Delete", ) def delete_snapshots(self, request, queryset): """Delete snapshots in a single transaction to avoid SQLite concurrency issues.""" @@ -783,7 +1047,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin): total = queryset.count() # Get list of IDs to delete first (outside transaction) - ids_to_delete = list(queryset.values_list('pk', flat=True)) + ids_to_delete = list(queryset.values_list("pk", flat=True)) # Delete everything in a single atomic transaction with transaction.atomic(): @@ -791,44 +1055,45 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin): messages.success( request, - mark_safe(f"Successfully deleted {total} Snapshots ({deleted_count} total objects including related records). Don't forget to scrub URLs from import logs (data/sources) and error logs (data/logs) if needed."), + mark_safe( + f"Successfully deleted {total} Snapshots ({deleted_count} total objects including related records). Don't forget to scrub URLs from import logs (data/sources) and error logs (data/logs) if needed.", + ), ) - @admin.action( - description="+" + description="+", ) def add_tags(self, request, queryset): from archivebox.core.models import SnapshotTag # Get tags from the form - now comma-separated string - tags_str = request.POST.get('tags', '') + tags_str = request.POST.get("tags", "") if not tags_str: messages.warning(request, "No tags specified.") return # Parse comma-separated tag names and get/create Tag objects - tag_names = [name.strip() for name in tags_str.split(',') if name.strip()] + tag_names = [name.strip() for name in tags_str.split(",") if name.strip()] tags = [] for name in tag_names: tag, _ = Tag.objects.get_or_create( name__iexact=name, - defaults={'name': name} + defaults={"name": name}, ) tag = Tag.objects.filter(name__iexact=name).first() or tag tags.append(tag) # Get snapshot IDs efficiently (works with select_across for all pages) - snapshot_ids = list(queryset.values_list('id', flat=True)) + snapshot_ids = list(queryset.values_list("id", flat=True)) num_snapshots = len(snapshot_ids) - print('[+] Adding tags', [t.name for t in tags], 'to', num_snapshots, 'Snapshots') + print("[+] Adding tags", [t.name for t in tags], "to", num_snapshots, "Snapshots") # Bulk create M2M relationships (1 query per tag, not per snapshot) for tag in tags: SnapshotTag.objects.bulk_create( [SnapshotTag(snapshot_id=sid, tag=tag) for sid in snapshot_ids], - ignore_conflicts=True # Skip if relationship already exists + ignore_conflicts=True, # Skip if relationship already exists ) messages.success( @@ -836,21 +1101,20 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin): f"Added {len(tags)} tag(s) to {num_snapshots} Snapshot(s).", ) - @admin.action( - description="–" + description="–", ) def remove_tags(self, request, queryset): from archivebox.core.models import SnapshotTag # Get tags from the form - now comma-separated string - tags_str = request.POST.get('tags', '') + tags_str = request.POST.get("tags", "") if not tags_str: messages.warning(request, "No tags specified.") return # Parse comma-separated tag names and find matching Tag objects (case-insensitive) - tag_names = [name.strip() for name in tags_str.split(',') if name.strip()] + tag_names = [name.strip() for name in tags_str.split(",") if name.strip()] tags = [] for name in tag_names: tag = Tag.objects.filter(name__iexact=name).first() @@ -862,16 +1126,16 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin): return # Get snapshot IDs efficiently (works with select_across for all pages) - snapshot_ids = list(queryset.values_list('id', flat=True)) + snapshot_ids = list(queryset.values_list("id", flat=True)) num_snapshots = len(snapshot_ids) tag_ids = [t.pk for t in tags] - print('[-] Removing tags', [t.name for t in tags], 'from', num_snapshots, 'Snapshots') + print("[-] Removing tags", [t.name for t in tags], "from", num_snapshots, "Snapshots") # Bulk delete M2M relationships (1 query total, not per snapshot) deleted_count, _ = SnapshotTag.objects.filter( snapshot_id__in=snapshot_ids, - tag_id__in=tag_ids + tag_id__in=tag_ids, ).delete() messages.success( diff --git a/archivebox/core/admin_tags.py b/archivebox/core/admin_tags.py index 3658badc..dfa456bb 100644 --- a/archivebox/core/admin_tags.py +++ b/archivebox/core/admin_tags.py @@ -1,4 +1,4 @@ -__package__ = 'archivebox.core' +__package__ = "archivebox.core" from urllib.parse import quote @@ -28,92 +28,107 @@ from archivebox.core.host_utils import build_snapshot_url class TagInline(admin.TabularInline): model = SnapshotTag - fields = ('id', 'tag') + fields = ("id", "tag") extra = 1 max_num = 1000 - autocomplete_fields = ( - 'tag', - ) + autocomplete_fields = ("tag",) class TagAdminForm(forms.ModelForm): class Meta: model = Tag - fields = '__all__' + fields = "__all__" widgets = { - 'name': forms.TextInput(attrs={ - 'placeholder': 'research, receipts, product-design...', - 'autocomplete': 'off', - 'spellcheck': 'false', - 'data-tag-name-input': '1', - }), + "name": forms.TextInput( + attrs={ + "placeholder": "research, receipts, product-design...", + "autocomplete": "off", + "spellcheck": "false", + "data-tag-name-input": "1", + }, + ), } def clean_name(self): - name = (self.cleaned_data.get('name') or '').strip() + name = (self.cleaned_data.get("name") or "").strip() if not name: - raise forms.ValidationError('Tag name is required.') + raise forms.ValidationError("Tag name is required.") return name class TagAdmin(BaseModelAdmin): form = TagAdminForm - change_list_template = 'admin/core/tag/change_list.html' - change_form_template = 'admin/core/tag/change_form.html' - list_display = ('name', 'num_snapshots', 'created_at', 'created_by') - list_filter = ('created_at', 'created_by') - search_fields = ('id', 'name', 'slug') - readonly_fields = ('slug', 'id', 'created_at', 'modified_at', 'snapshots') - actions = ['delete_selected'] - ordering = ['name', 'id'] + change_list_template = "admin/core/tag/change_list.html" + change_form_template = "admin/core/tag/change_form.html" + list_display = ("name", "num_snapshots", "created_at", "created_by") + list_filter = ("created_at", "created_by") + search_fields = ("id", "name", "slug") + readonly_fields = ("slug", "id", "created_at", "modified_at", "snapshots") + actions = ["delete_selected"] + ordering = ["name", "id"] fieldsets = ( - ('Tag', { - 'fields': ('name', 'slug'), - 'classes': ('card',), - }), - ('Metadata', { - 'fields': ('id', 'created_by', 'created_at', 'modified_at'), - 'classes': ('card',), - }), - ('Recent Snapshots', { - 'fields': ('snapshots',), - 'classes': ('card', 'wide'), - }), + ( + "Tag", + { + "fields": ("name", "slug"), + "classes": ("card",), + }, + ), + ( + "Metadata", + { + "fields": ("id", "created_by", "created_at", "modified_at"), + "classes": ("card",), + }, + ), + ( + "Recent Snapshots", + { + "fields": ("snapshots",), + "classes": ("card", "wide"), + }, + ), ) add_fieldsets = ( - ('Tag', { - 'fields': ('name',), - 'classes': ('card', 'wide'), - }), - ('Metadata', { - 'fields': ('created_by',), - 'classes': ('card',), - }), + ( + "Tag", + { + "fields": ("name",), + "classes": ("card", "wide"), + }, + ), + ( + "Metadata", + { + "fields": ("created_by",), + "classes": ("card",), + }, + ), ) def get_fieldsets(self, request: HttpRequest, obj: Tag | None = None): return self.fieldsets if obj else self.add_fieldsets def changelist_view(self, request: HttpRequest, extra_context=None): - query = (request.GET.get('q') or '').strip() - sort = normalize_tag_sort((request.GET.get('sort') or 'created_desc').strip()) - created_by = normalize_created_by_filter((request.GET.get('created_by') or '').strip()) - year = normalize_created_year_filter((request.GET.get('year') or '').strip()) - has_snapshots = normalize_has_snapshots_filter((request.GET.get('has_snapshots') or 'all').strip()) + query = (request.GET.get("q") or "").strip() + sort = normalize_tag_sort((request.GET.get("sort") or "created_desc").strip()) + created_by = normalize_created_by_filter((request.GET.get("created_by") or "").strip()) + year = normalize_created_year_filter((request.GET.get("year") or "").strip()) + has_snapshots = normalize_has_snapshots_filter((request.GET.get("has_snapshots") or "all").strip()) extra_context = { **(extra_context or {}), - 'initial_query': query, - 'initial_sort': sort, - 'initial_created_by': created_by, - 'initial_year': year, - 'initial_has_snapshots': has_snapshots, - 'tag_sort_choices': TAG_SORT_CHOICES, - 'tag_has_snapshots_choices': TAG_HAS_SNAPSHOTS_CHOICES, - 'tag_created_by_choices': get_tag_creator_choices(), - 'tag_year_choices': get_tag_year_choices(), - 'initial_tag_cards': build_tag_cards( + "initial_query": query, + "initial_sort": sort, + "initial_created_by": created_by, + "initial_year": year, + "initial_has_snapshots": has_snapshots, + "tag_sort_choices": TAG_SORT_CHOICES, + "tag_has_snapshots_choices": TAG_HAS_SNAPSHOTS_CHOICES, + "tag_created_by_choices": get_tag_creator_choices(), + "tag_year_choices": get_tag_year_choices(), + "initial_tag_cards": build_tag_cards( query=query, request=request, sort=sort, @@ -121,62 +136,67 @@ class TagAdmin(BaseModelAdmin): year=year, has_snapshots=has_snapshots, ), - 'tag_search_api_url': reverse('api-1:search_tags'), - 'tag_create_api_url': reverse('api-1:tags_create'), + "tag_search_api_url": reverse("api-1:search_tags"), + "tag_create_api_url": reverse("api-1:tags_create"), } return super().changelist_view(request, extra_context=extra_context) - def render_change_form(self, request, context, add=False, change=False, form_url='', obj=None): - current_name = (request.POST.get('name') or '').strip() + def render_change_form(self, request, context, add=False, change=False, form_url="", obj=None): + current_name = (request.POST.get("name") or "").strip() if not current_name and obj: current_name = obj.name - similar_tag_cards = build_tag_cards(query=current_name, request=request, limit=12) if current_name else build_tag_cards(request=request, limit=12) + similar_tag_cards = ( + build_tag_cards(query=current_name, request=request, limit=12) if current_name else build_tag_cards(request=request, limit=12) + ) if obj: - similar_tag_cards = [card for card in similar_tag_cards if card['id'] != obj.pk] + similar_tag_cards = [card for card in similar_tag_cards if card["id"] != obj.pk] - context.update({ - 'tag_search_api_url': reverse('api-1:search_tags'), - 'tag_similar_cards': similar_tag_cards, - 'tag_similar_query': current_name, - }) + context.update( + { + "tag_search_api_url": reverse("api-1:search_tags"), + "tag_similar_cards": similar_tag_cards, + "tag_similar_query": current_name, + }, + ) return super().render_change_form(request, context, add=add, change=change, form_url=form_url, obj=obj) def response_add(self, request: HttpRequest, obj: Tag, post_url_continue=None): - if IS_POPUP_VAR in request.POST or '_continue' in request.POST or '_addanother' in request.POST: + if IS_POPUP_VAR in request.POST or "_continue" in request.POST or "_addanother" in request.POST: return super().response_add(request, obj, post_url_continue=post_url_continue) self.message_user(request, f'Tag "{obj.name}" saved.', level=messages.SUCCESS) return self._redirect_to_changelist(obj.name) def response_change(self, request: HttpRequest, obj: Tag): - if IS_POPUP_VAR in request.POST or '_continue' in request.POST or '_addanother' in request.POST or '_saveasnew' in request.POST: + if IS_POPUP_VAR in request.POST or "_continue" in request.POST or "_addanother" in request.POST or "_saveasnew" in request.POST: return super().response_change(request, obj) self.message_user(request, f'Tag "{obj.name}" updated.', level=messages.SUCCESS) return self._redirect_to_changelist(obj.name) - def _redirect_to_changelist(self, query: str = '') -> HttpResponseRedirect: - changelist_url = reverse('admin:core_tag_changelist') + def _redirect_to_changelist(self, query: str = "") -> HttpResponseRedirect: + changelist_url = reverse("admin:core_tag_changelist") if query: - changelist_url = f'{changelist_url}?q={quote(query)}' + changelist_url = f"{changelist_url}?q={quote(query)}" return HttpResponseRedirect(changelist_url) - @admin.display(description='Snapshots') + @admin.display(description="Snapshots") def snapshots(self, tag: Tag): - snapshots = tag.snapshot_set.select_related('crawl__created_by').order_by('-downloaded_at', '-created_at', '-pk')[:10] + snapshots = tag.snapshot_set.select_related("crawl__created_by").order_by("-downloaded_at", "-created_at", "-pk")[:10] total_count = tag.snapshot_set.count() if not snapshots: return mark_safe( f'

No snapshots use this tag yet. ' - f'Open filtered snapshot list.

' + f'Open filtered snapshot list.

', ) cards = [] for snapshot in snapshots: - title = (snapshot.title or '').strip() or snapshot.url - cards.append(format_html( - ''' + title = (snapshot.title or "").strip() or snapshot.url + cards.append( + format_html( + """ @@ -184,23 +204,26 @@ class TagAdmin(BaseModelAdmin): {} - ''', - reverse('admin:core_snapshot_change', args=[snapshot.pk]), - build_snapshot_url(str(snapshot.pk), 'favicon.ico'), - title[:120], - snapshot.url[:120], - )) + """, + reverse("admin:core_snapshot_change", args=[snapshot.pk]), + build_snapshot_url(str(snapshot.pk), "favicon.ico"), + title[:120], + snapshot.url[:120], + ), + ) - cards.append(format_html( - 'View all {} tagged snapshots', - tag.id, - total_count, - )) - return mark_safe('
' + ''.join(cards) + '
') + cards.append( + format_html( + 'View all {} tagged snapshots', + tag.id, + total_count, + ), + ) + return mark_safe('
' + "".join(cards) + "
") - @admin.display(description='Snapshots', ordering='num_snapshots') + @admin.display(description="Snapshots", ordering="num_snapshots") def num_snapshots(self, tag: Tag): - count = getattr(tag, 'num_snapshots', tag.snapshot_set.count()) + count = getattr(tag, "num_snapshots", tag.snapshot_set.count()) return format_html( '{} total', tag.id, diff --git a/archivebox/core/admin_users.py b/archivebox/core/admin_users.py index 371317f3..7a38271b 100644 --- a/archivebox/core/admin_users.py +++ b/archivebox/core/admin_users.py @@ -1,4 +1,4 @@ -__package__ = 'archivebox.core' +__package__ = "archivebox.core" from django.contrib import admin from django.contrib.auth.admin import UserAdmin @@ -8,87 +8,100 @@ from django.utils.safestring import mark_safe class CustomUserAdmin(UserAdmin): - sort_fields = ['id', 'email', 'username', 'is_superuser', 'last_login', 'date_joined'] - list_display = ['username', 'id', 'email', 'is_superuser', 'last_login', 'date_joined'] - readonly_fields = ('snapshot_set', 'archiveresult_set', 'tag_set', 'apitoken_set', 'outboundwebhook_set') + sort_fields = ["id", "email", "username", "is_superuser", "last_login", "date_joined"] + list_display = ["username", "id", "email", "is_superuser", "last_login", "date_joined"] + readonly_fields = ("snapshot_set", "archiveresult_set", "tag_set", "apitoken_set", "outboundwebhook_set") # Preserve Django's default user creation form and fieldsets # This ensures passwords are properly hashed and permissions are set correctly add_fieldsets = UserAdmin.add_fieldsets # Extend fieldsets for change form only (not user creation) - fieldsets = [*(UserAdmin.fieldsets or ()), ('Data', {'fields': readonly_fields})] + fieldsets = [*(UserAdmin.fieldsets or ()), ("Data", {"fields": readonly_fields})] - @admin.display(description='Snapshots') + @admin.display(description="Snapshots") def snapshot_set(self, obj): total_count = obj.snapshot_set.count() - return mark_safe('
'.join( - format_html( - '[{}] 📅 {} {}', - snap.pk, - str(snap.id)[:8], - snap.downloaded_at.strftime('%Y-%m-%d %H:%M') if snap.downloaded_at else 'pending...', - snap.url[:64], + return mark_safe( + "
".join( + format_html( + '[{}] 📅 {} {}', + snap.pk, + str(snap.id)[:8], + snap.downloaded_at.strftime("%Y-%m-%d %H:%M") if snap.downloaded_at else "pending...", + snap.url[:64], + ) + for snap in obj.snapshot_set.order_by("-modified_at")[:10] ) - for snap in obj.snapshot_set.order_by('-modified_at')[:10] - ) + f'
{total_count} total records...') + + f'
{total_count} total records...', + ) - @admin.display(description='Archive Result Logs') + @admin.display(description="Archive Result Logs") def archiveresult_set(self, obj): total_count = obj.archiveresult_set.count() - return mark_safe('
'.join( - format_html( - '
[{}] 📅 {} 📄 {} {}', - result.pk, - str(result.id)[:8], - result.snapshot.downloaded_at.strftime('%Y-%m-%d %H:%M') if result.snapshot.downloaded_at else 'pending...', - result.extractor, - result.snapshot.url[:64], + return mark_safe( + "
".join( + format_html( + '[{}] 📅 {} 📄 {} {}', + result.pk, + str(result.id)[:8], + result.snapshot.downloaded_at.strftime("%Y-%m-%d %H:%M") if result.snapshot.downloaded_at else "pending...", + result.extractor, + result.snapshot.url[:64], + ) + for result in obj.archiveresult_set.order_by("-modified_at")[:10] ) - for result in obj.archiveresult_set.order_by('-modified_at')[:10] - ) + f'
{total_count} total records...') + + f'
{total_count} total records...', + ) - @admin.display(description='Tags') + @admin.display(description="Tags") def tag_set(self, obj): total_count = obj.tag_set.count() - return mark_safe(', '.join( - format_html( - '{}', - tag.pk, - tag.name, + return mark_safe( + ", ".join( + format_html( + '{}', + tag.pk, + tag.name, + ) + for tag in obj.tag_set.order_by("-modified_at")[:10] ) - for tag in obj.tag_set.order_by('-modified_at')[:10] - ) + f'
{total_count} total records...') + + f'
{total_count} total records...', + ) - @admin.display(description='API Tokens') + @admin.display(description="API Tokens") def apitoken_set(self, obj): total_count = obj.apitoken_set.count() - return mark_safe('
'.join( - format_html( - '
[{}] {} (expires {})', - apitoken.pk, - str(apitoken.id)[:8], - apitoken.token_redacted[:64], - apitoken.expires, + return mark_safe( + "
".join( + format_html( + '[{}] {} (expires {})', + apitoken.pk, + str(apitoken.id)[:8], + apitoken.token_redacted[:64], + apitoken.expires, + ) + for apitoken in obj.apitoken_set.order_by("-modified_at")[:10] ) - for apitoken in obj.apitoken_set.order_by('-modified_at')[:10] - ) + f'
{total_count} total records...') + + f'
{total_count} total records...', + ) - @admin.display(description='API Outbound Webhooks') + @admin.display(description="API Outbound Webhooks") def outboundwebhook_set(self, obj): total_count = obj.outboundwebhook_set.count() - return mark_safe('
'.join( - format_html( - '
[{}] {} -> {}', - outboundwebhook.pk, - str(outboundwebhook.id)[:8], - outboundwebhook.referenced_model, - outboundwebhook.endpoint, + return mark_safe( + "
".join( + format_html( + '[{}] {} -> {}', + outboundwebhook.pk, + str(outboundwebhook.id)[:8], + outboundwebhook.referenced_model, + outboundwebhook.endpoint, + ) + for outboundwebhook in obj.outboundwebhook_set.order_by("-modified_at")[:10] ) - for outboundwebhook in obj.outboundwebhook_set.order_by('-modified_at')[:10] - ) + f'
{total_count} total records...') - - + + f'
{total_count} total records...', + ) def register_admin(admin_site): diff --git a/archivebox/core/apps.py b/archivebox/core/apps.py index c0dd29fe..b173ae90 100644 --- a/archivebox/core/apps.py +++ b/archivebox/core/apps.py @@ -1,12 +1,12 @@ -__package__ = 'archivebox.core' +__package__ = "archivebox.core" from django.apps import AppConfig import os class CoreConfig(AppConfig): - name = 'archivebox.core' - label = 'core' + name = "archivebox.core" + label = "core" def ready(self): """Register the archivebox.core.admin_site as the main django admin site""" @@ -14,29 +14,30 @@ class CoreConfig(AppConfig): from django.utils.autoreload import DJANGO_AUTORELOAD_ENV from archivebox.core.admin_site import register_admin_site + register_admin_site() # Import models to register state machines with the registry # Skip during makemigrations to avoid premature state machine access - if 'makemigrations' not in sys.argv: + if "makemigrations" not in sys.argv: from archivebox.core import models # noqa: F401 - pidfile = os.environ.get('ARCHIVEBOX_RUNSERVER_PIDFILE') + pidfile = os.environ.get("ARCHIVEBOX_RUNSERVER_PIDFILE") if pidfile: should_write_pid = True - if os.environ.get('ARCHIVEBOX_AUTORELOAD') == '1': - should_write_pid = os.environ.get(DJANGO_AUTORELOAD_ENV) == 'true' + if os.environ.get("ARCHIVEBOX_AUTORELOAD") == "1": + should_write_pid = os.environ.get(DJANGO_AUTORELOAD_ENV) == "true" if should_write_pid: try: - with open(pidfile, 'w') as handle: + with open(pidfile, "w") as handle: handle.write(str(os.getpid())) except Exception: pass def _should_prepare_runtime() -> bool: - if os.environ.get('ARCHIVEBOX_RUNSERVER') == '1': - if os.environ.get('ARCHIVEBOX_AUTORELOAD') == '1': - return os.environ.get(DJANGO_AUTORELOAD_ENV) == 'true' + if os.environ.get("ARCHIVEBOX_RUNSERVER") == "1": + if os.environ.get("ARCHIVEBOX_AUTORELOAD") == "1": + return os.environ.get(DJANGO_AUTORELOAD_ENV) == "true" return True return False @@ -44,4 +45,5 @@ class CoreConfig(AppConfig): from archivebox.machine.models import Process, Machine Process.cleanup_stale_running() + Process.cleanup_orphaned_workers() Machine.current() diff --git a/archivebox/core/forms.py b/archivebox/core/forms.py index 6050a6a7..edc0403d 100644 --- a/archivebox/core/forms.py +++ b/archivebox/core/forms.py @@ -1,9 +1,9 @@ -__package__ = 'archivebox.core' +__package__ = "archivebox.core" from django import forms from django.utils.html import format_html -from archivebox.misc.util import URL_REGEX, find_all_urls +from archivebox.misc.util import URL_REGEX, find_all_urls, parse_filesize_to_bytes from taggit.utils import edit_string_for_tags, parse_tags from archivebox.base_models.admin import KeyValueWidget from archivebox.crawls.schedule_utils import validate_schedule @@ -13,11 +13,11 @@ from archivebox.hooks import get_plugins, discover_plugin_configs, get_plugin_ic from archivebox.personas.models import Persona DEPTH_CHOICES = ( - ('0', 'depth = 0 (archive just these URLs)'), - ('1', 'depth = 1 (+ URLs one hop away)'), - ('2', 'depth = 2 (+ URLs two hops away)'), - ('3', 'depth = 3 (+ URLs three hops away)'), - ('4', 'depth = 4 (+ URLs four hops away)'), + ("0", "depth = 0 (archive just these URLs)"), + ("1", "depth = 1 (+ URLs one hop away)"), + ("2", "depth = 2 (+ URLs two hops away)"), + ("3", "depth = 3 (+ URLs three hops away)"), + ("4", "depth = 4 (+ URLs four hops away)"), ) @@ -28,7 +28,7 @@ def get_plugin_choices(): def get_plugin_choice_label(plugin_name: str, plugin_configs: dict[str, dict]) -> str: schema = plugin_configs.get(plugin_name, {}) - description = str(schema.get('description') or '').strip() + description = str(schema.get("description") or "").strip() if not description: return plugin_name icon_html = get_plugin_icon(plugin_name) @@ -45,7 +45,7 @@ def get_plugin_choice_label(plugin_name: str, plugin_configs: dict[str, dict]) - def get_choice_field(form: forms.Form, name: str) -> forms.ChoiceField: field = form.fields[name] if not isinstance(field, forms.ChoiceField): - raise TypeError(f'{name} must be a ChoiceField') + raise TypeError(f"{name} must be a ChoiceField") return field @@ -54,10 +54,12 @@ class AddLinkForm(forms.Form): url = forms.CharField( label="URLs", strip=True, - widget=forms.Textarea(attrs={ - 'data-url-regex': URL_REGEX.pattern, - }), - required=True + widget=forms.Textarea( + attrs={ + "data-url-regex": URL_REGEX.pattern, + }, + ), + required=True, ) tag = forms.CharField( label="Tags", @@ -68,16 +70,41 @@ class AddLinkForm(forms.Form): depth = forms.ChoiceField( label="Archive depth", choices=DEPTH_CHOICES, - initial='0', - widget=forms.RadioSelect(attrs={"class": "depth-selection"}) + initial="0", + widget=forms.RadioSelect(attrs={"class": "depth-selection"}), + ) + max_urls = forms.IntegerField( + label="Max URLs", + required=False, + min_value=0, + initial=0, + widget=forms.NumberInput( + attrs={ + "min": 0, + "step": 1, + "placeholder": "0 = unlimited", + }, + ), + ) + max_size = forms.CharField( + label="Max size", + required=False, + initial="0", + widget=forms.TextInput( + attrs={ + "placeholder": "0 = unlimited, or e.g. 45mb / 1gb", + }, + ), ) notes = forms.CharField( label="Notes", strip=True, required=False, - widget=forms.TextInput(attrs={ - 'placeholder': 'Optional notes about this crawl', - }) + widget=forms.TextInput( + attrs={ + "placeholder": "Optional notes about this crawl", + }, + ), ) url_filters = forms.Field( label="URL allowlist / denylist", @@ -128,16 +155,18 @@ class AddLinkForm(forms.Form): label="Repeat schedule", max_length=64, required=False, - widget=forms.TextInput(attrs={ - 'placeholder': 'e.g., daily, weekly, 0 */6 * * * (every 6 hours)', - }) + widget=forms.TextInput( + attrs={ + "placeholder": "e.g., daily, weekly, 0 */6 * * * (every 6 hours)", + }, + ), ) persona = forms.ModelChoiceField( label="Persona (authentication profile)", required=False, queryset=Persona.objects.none(), empty_label=None, - to_field_name='name', + to_field_name="name", ) index_only = forms.BooleanField( label="Index only dry run (add crawl but don't archive yet)", @@ -155,8 +184,8 @@ class AddLinkForm(forms.Form): super().__init__(*args, **kwargs) default_persona = Persona.get_or_create_default() - self.fields['persona'].queryset = Persona.objects.order_by('name') - self.fields['persona'].initial = default_persona.name + self.fields["persona"].queryset = Persona.objects.order_by("name") + self.fields["persona"].initial = default_persona.name # Get all plugins all_plugins = get_plugins() @@ -164,86 +193,136 @@ class AddLinkForm(forms.Form): # Define plugin groups chrome_dependent = { - 'accessibility', 'chrome', 'consolelog', 'dom', 'headers', - 'parse_dom_outlinks', 'pdf', 'redirects', 'responses', - 'screenshot', 'seo', 'singlefile', 'ssl', 'staticfile', 'title' + "accessibility", + "chrome", + "consolelog", + "dom", + "headers", + "parse_dom_outlinks", + "pdf", + "redirects", + "responses", + "screenshot", + "seo", + "singlefile", + "ssl", + "staticfile", + "title", } archiving = { - 'archivedotorg', 'defuddle', 'favicon', 'forumdl', 'gallerydl', 'git', - 'htmltotext', 'mercury', 'papersdl', 'readability', 'trafilatura', 'wget', 'ytdlp' + "archivedotorg", + "defuddle", + "favicon", + "forumdl", + "gallerydl", + "git", + "htmltotext", + "mercury", + "papersdl", + "readability", + "trafilatura", + "wget", + "ytdlp", } parsing = { - 'parse_html_urls', 'parse_jsonl_urls', - 'parse_netscape_urls', 'parse_rss_urls', 'parse_txt_urls' + "parse_html_urls", + "parse_jsonl_urls", + "parse_netscape_urls", + "parse_rss_urls", + "parse_txt_urls", } search = { - 'search_backend_ripgrep', 'search_backend_sonic', 'search_backend_sqlite' + "search_backend_ripgrep", + "search_backend_sonic", + "search_backend_sqlite", } - binary = {'apt', 'brew', 'custom', 'env', 'npm', 'pip'} - extensions = {'twocaptcha', 'istilldontcareaboutcookies', 'ublock'} + binary = {"apt", "brew", "custom", "env", "npm", "pip"} + extensions = {"twocaptcha", "istilldontcareaboutcookies", "ublock"} # Populate plugin field choices - get_choice_field(self, 'chrome_plugins').choices = [ + get_choice_field(self, "chrome_plugins").choices = [ (p, get_plugin_choice_label(p, plugin_configs)) for p in sorted(all_plugins) if p in chrome_dependent ] - get_choice_field(self, 'archiving_plugins').choices = [ + get_choice_field(self, "archiving_plugins").choices = [ (p, get_plugin_choice_label(p, plugin_configs)) for p in sorted(all_plugins) if p in archiving ] - get_choice_field(self, 'parsing_plugins').choices = [ + get_choice_field(self, "parsing_plugins").choices = [ (p, get_plugin_choice_label(p, plugin_configs)) for p in sorted(all_plugins) if p in parsing ] - get_choice_field(self, 'search_plugins').choices = [ + get_choice_field(self, "search_plugins").choices = [ (p, get_plugin_choice_label(p, plugin_configs)) for p in sorted(all_plugins) if p in search ] - get_choice_field(self, 'binary_plugins').choices = [ + get_choice_field(self, "binary_plugins").choices = [ (p, get_plugin_choice_label(p, plugin_configs)) for p in sorted(all_plugins) if p in binary ] - get_choice_field(self, 'extension_plugins').choices = [ + get_choice_field(self, "extension_plugins").choices = [ (p, get_plugin_choice_label(p, plugin_configs)) for p in sorted(all_plugins) if p in extensions ] - required_search_plugin = f'search_backend_{SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE}'.strip() - search_choices = [choice[0] for choice in get_choice_field(self, 'search_plugins').choices] + required_search_plugin = f"search_backend_{SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE}".strip() + search_choices = [choice[0] for choice in get_choice_field(self, "search_plugins").choices] if required_search_plugin in search_choices: - get_choice_field(self, 'search_plugins').initial = [required_search_plugin] + get_choice_field(self, "search_plugins").initial = [required_search_plugin] def clean(self): cleaned_data = super().clean() or {} # Combine all plugin groups into single list all_selected_plugins = [] - for field in ['chrome_plugins', 'archiving_plugins', 'parsing_plugins', - 'search_plugins', 'binary_plugins', 'extension_plugins']: + for field in [ + "chrome_plugins", + "archiving_plugins", + "parsing_plugins", + "search_plugins", + "binary_plugins", + "extension_plugins", + ]: selected = cleaned_data.get(field) if isinstance(selected, list): all_selected_plugins.extend(selected) # Store combined list for easy access - cleaned_data['plugins'] = all_selected_plugins + cleaned_data["plugins"] = all_selected_plugins return cleaned_data def clean_url(self): - value = self.cleaned_data.get('url') or '' - urls = '\n'.join(find_all_urls(value)) + value = self.cleaned_data.get("url") or "" + urls = "\n".join(find_all_urls(value)) if not urls: - raise forms.ValidationError('Enter at least one valid URL.') + raise forms.ValidationError("Enter at least one valid URL.") return urls def clean_url_filters(self): from archivebox.crawls.models import Crawl - value = self.cleaned_data.get('url_filters') or {} + value = self.cleaned_data.get("url_filters") or {} return { - 'allowlist': '\n'.join(Crawl.split_filter_patterns(value.get('allowlist', ''))), - 'denylist': '\n'.join(Crawl.split_filter_patterns(value.get('denylist', ''))), - 'same_domain_only': bool(value.get('same_domain_only')), + "allowlist": "\n".join(Crawl.split_filter_patterns(value.get("allowlist", ""))), + "denylist": "\n".join(Crawl.split_filter_patterns(value.get("denylist", ""))), + "same_domain_only": bool(value.get("same_domain_only")), } + def clean_max_urls(self): + value = self.cleaned_data.get("max_urls") + return int(value or 0) + + def clean_max_size(self): + raw_value = str(self.cleaned_data.get("max_size") or "").strip() + if not raw_value: + return 0 + try: + value = parse_filesize_to_bytes(raw_value) + except ValueError as err: + raise forms.ValidationError(str(err)) + if value < 0: + raise forms.ValidationError("Max size must be 0 or a positive number of bytes.") + return value + def clean_schedule(self): - schedule = (self.cleaned_data.get('schedule') or '').strip() + schedule = (self.cleaned_data.get("schedule") or "").strip() if not schedule: - return '' + return "" try: validate_schedule(schedule) @@ -269,7 +348,7 @@ class TagField(forms.CharField): return parse_tags(value) except ValueError: raise forms.ValidationError( - "Please provide a comma-separated list of tags." + "Please provide a comma-separated list of tags.", ) def has_changed(self, initial, data): diff --git a/archivebox/core/host_utils.py b/archivebox/core/host_utils.py index c3581d4f..21479788 100644 --- a/archivebox/core/host_utils.py +++ b/archivebox/core/host_utils.py @@ -1,7 +1,5 @@ from __future__ import annotations -from __future__ import annotations - import re from urllib.parse import urlparse @@ -9,6 +7,7 @@ from archivebox.config.common import SERVER_CONFIG _SNAPSHOT_ID_RE = re.compile(r"^[0-9a-fA-F-]{8,36}$") +_SNAPSHOT_SUBDOMAIN_RE = re.compile(r"^snap-(?P[0-9a-fA-F]{12})$") def split_host_port(host: str) -> tuple[str, str | None]: @@ -71,21 +70,29 @@ def get_web_host() -> str: return urlparse(override).netloc.lower() return _build_listen_host("web") + def get_api_host() -> str: if not SERVER_CONFIG.USES_SUBDOMAIN_ROUTING: return get_listen_host().lower() return _build_listen_host("api") + def get_public_host() -> str: if not SERVER_CONFIG.USES_SUBDOMAIN_ROUTING: return get_listen_host().lower() return _build_listen_host("public") +def get_snapshot_subdomain(snapshot_id: str) -> str: + normalized = re.sub(r"[^0-9a-fA-F]", "", snapshot_id or "") + suffix = (normalized[-12:] if len(normalized) >= 12 else normalized).lower() + return f"snap-{suffix}" + + def get_snapshot_host(snapshot_id: str) -> str: if not SERVER_CONFIG.USES_SUBDOMAIN_ROUTING: return get_listen_host().lower() - return _build_listen_host(snapshot_id) + return _build_listen_host(get_snapshot_subdomain(snapshot_id)) def get_original_host(domain: str) -> str: @@ -95,7 +102,16 @@ def get_original_host(domain: str) -> str: def is_snapshot_subdomain(subdomain: str) -> bool: - return bool(_SNAPSHOT_ID_RE.match(subdomain or "")) + value = (subdomain or "").strip() + return bool(_SNAPSHOT_SUBDOMAIN_RE.match(value) or _SNAPSHOT_ID_RE.match(value)) + + +def get_snapshot_lookup_key(snapshot_ref: str) -> str: + value = (snapshot_ref or "").strip().lower() + match = _SNAPSHOT_SUBDOMAIN_RE.match(value) + if match: + return match.group("suffix") + return value def get_listen_subdomain(request_host: str) -> str: @@ -141,22 +157,23 @@ def _build_base_url_for_host(host: str, request=None) -> str: def get_admin_base_url(request=None) -> str: - if not SERVER_CONFIG.USES_SUBDOMAIN_ROUTING: - return _build_base_url_for_host(get_listen_host(), request=request) override = _normalize_base_url(SERVER_CONFIG.ADMIN_BASE_URL) if override: return override + if not SERVER_CONFIG.USES_SUBDOMAIN_ROUTING: + return _build_base_url_for_host(get_listen_host(), request=request) return _build_base_url_for_host(get_admin_host(), request=request) def get_web_base_url(request=None) -> str: - if not SERVER_CONFIG.USES_SUBDOMAIN_ROUTING: - return _build_base_url_for_host(get_listen_host(), request=request) override = _normalize_base_url(SERVER_CONFIG.ARCHIVE_BASE_URL) if override: return override + if not SERVER_CONFIG.USES_SUBDOMAIN_ROUTING: + return _build_base_url_for_host(get_listen_host(), request=request) return _build_base_url_for_host(get_web_host(), request=request) + def get_api_base_url(request=None) -> str: if not SERVER_CONFIG.USES_SUBDOMAIN_ROUTING: return _build_base_url_for_host(get_listen_host(), request=request) @@ -191,6 +208,7 @@ def build_admin_url(path: str = "", request=None) -> str: def build_web_url(path: str = "", request=None) -> str: return _build_url(get_web_base_url(request), path) + def build_api_url(path: str = "", request=None) -> str: return _build_url(get_api_base_url(request), path) diff --git a/archivebox/core/management/commands/archivebox.py b/archivebox/core/management/commands/archivebox.py index 582ef344..4e663fe8 100644 --- a/archivebox/core/management/commands/archivebox.py +++ b/archivebox/core/management/commands/archivebox.py @@ -1,4 +1,4 @@ -__package__ = 'archivebox' +__package__ = "archivebox" from django.core.management.base import BaseCommand @@ -6,13 +6,12 @@ from archivebox.cli import main as run_cli class Command(BaseCommand): - help = 'Run an ArchiveBox CLI subcommand (e.g. add, remove, list, etc)' + help = "Run an ArchiveBox CLI subcommand (e.g. add, remove, list, etc)" def add_arguments(self, parser): - parser.add_argument('subcommand', type=str, help='The subcommand you want to run') - parser.add_argument('command_args', nargs='*', help='Arguments to pass to the subcommand') - + parser.add_argument("subcommand", type=str, help="The subcommand you want to run") + parser.add_argument("command_args", nargs="*", help="Arguments to pass to the subcommand") def handle(self, *args, **kwargs): - command_args = [kwargs['subcommand'], *kwargs['command_args']] + command_args = [kwargs["subcommand"], *kwargs["command_args"]] run_cli(args=command_args) diff --git a/archivebox/core/middleware.py b/archivebox/core/middleware.py index b7778966..3dddcbc6 100644 --- a/archivebox/core/middleware.py +++ b/archivebox/core/middleware.py @@ -1,4 +1,4 @@ -__package__ = 'archivebox.core' +__package__ = "archivebox.core" import ipaddress import re @@ -16,6 +16,7 @@ from archivebox.config.common import SERVER_CONFIG from archivebox.config import VERSION from archivebox.config.version import get_COMMIT_HASH from archivebox.core.host_utils import ( + build_snapshot_url, build_admin_url, build_web_url, get_api_host, @@ -31,10 +32,10 @@ from archivebox.core.host_utils import ( from archivebox.core.views import SnapshotHostView, OriginalDomainHostView -def detect_timezone(request, activate: bool=True): - gmt_offset = (request.COOKIES.get('GMT_OFFSET') or '').strip() +def detect_timezone(request, activate: bool = True): + gmt_offset = (request.COOKIES.get("GMT_OFFSET") or "").strip() tz = None - if gmt_offset.replace('-', '').isdigit(): + if gmt_offset.replace("-", "").isdigit(): tz = timezone.get_fixed_timezone(int(gmt_offset)) if activate: timezone.activate(tz) @@ -53,11 +54,12 @@ def TimezoneMiddleware(get_response): def CacheControlMiddleware(get_response): snapshot_path_re = re.compile(r"^/[^/]+/\\d{8}/[^/]+/[0-9a-fA-F-]{8,36}/") static_cache_key = (get_COMMIT_HASH() or VERSION or "dev").strip() + def middleware(request): response = get_response(request) - if request.path.startswith('/static/'): - rel_path = request.path[len('/static/'):] + if request.path.startswith("/static/"): + rel_path = request.path[len("/static/") :] static_path = finders.find(rel_path) if static_path: try: @@ -81,10 +83,10 @@ def CacheControlMiddleware(get_response): response.headers["Last-Modified"] = http_date(mtime) return response - if '/archive/' in request.path or '/static/' in request.path or snapshot_path_re.match(request.path): - if not response.get('Cache-Control'): - policy = 'public' if SERVER_CONFIG.PUBLIC_SNAPSHOTS else 'private' - response['Cache-Control'] = f'{policy}, max-age=60, stale-while-revalidate=300' + if "/archive/" in request.path or "/static/" in request.path or snapshot_path_re.match(request.path): + if not response.get("Cache-Control"): + policy = "public" if SERVER_CONFIG.PUBLIC_SNAPSHOTS else "private" + response["Cache-Control"] = f"{policy}, max-age=60, stale-while-revalidate=300" # print('Set Cache-Control header to', response['Cache-Control']) return response @@ -115,6 +117,10 @@ def ServerSecurityModeMiddleware(get_response): def HostRoutingMiddleware(get_response): + snapshot_path_re = re.compile( + r"^/(?P[^/]+)/(?P\d{4}(?:\d{2})?(?:\d{2})?)/(?P[^/]+)/(?P[0-9a-fA-F-]{8,36})(?:/(?P.*))?$", + ) + def middleware(request): request_host = (request.get_host() or "").lower() admin_host = get_admin_host() @@ -124,6 +130,23 @@ def HostRoutingMiddleware(get_response): listen_host = get_listen_host() subdomain = get_listen_subdomain(request_host) + # Framework-owned assets must bypass snapshot/original-domain replay routing. + # Otherwise pages on snapshot subdomains can receive HTML for JS/CSS requests. + if request.path.startswith("/static/") or request.path in {"/favicon.ico", "/robots.txt"}: + return get_response(request) + + if SERVER_CONFIG.USES_SUBDOMAIN_ROUTING and not host_matches(request_host, admin_host): + if ( + request.path == "/admin" + or request.path.startswith("/admin/") + or request.path == "/accounts" + or request.path.startswith("/accounts/") + ): + target = build_admin_url(request.path, request=request) + if request.META.get("QUERY_STRING"): + target = f"{target}?{request.META['QUERY_STRING']}" + return redirect(target) + if not SERVER_CONFIG.USES_SUBDOMAIN_ROUTING: if host_matches(request_host, listen_host): return get_response(request) @@ -140,6 +163,16 @@ def HostRoutingMiddleware(get_response): return get_response(request) if host_matches(request_host, admin_host): + snapshot_match = snapshot_path_re.match(request.path) + if SERVER_CONFIG.USES_SUBDOMAIN_ROUTING and snapshot_match: + snapshot_id = snapshot_match.group("snapshot_id") + replay_path = (snapshot_match.group("path") or "").strip("/") + if replay_path == "index.html": + replay_path = "" + target = build_snapshot_url(snapshot_id, replay_path, request=request) + if request.META.get("QUERY_STRING"): + target = f"{target}?{request.META['QUERY_STRING']}" + return redirect(target) return get_response(request) if host_matches(request_host, api_host): @@ -160,16 +193,9 @@ def HostRoutingMiddleware(get_response): if host_matches(request_host, web_host): request.user = AnonymousUser() request._cached_user = request.user - if request.path.startswith("/admin"): - target = build_admin_url(request.path, request=request) - if request.META.get("QUERY_STRING"): - target = f"{target}?{request.META['QUERY_STRING']}" - return redirect(target) return get_response(request) if host_matches(request_host, public_host): - request.user = AnonymousUser() - request._cached_user = request.user return get_response(request) if subdomain: @@ -196,24 +222,26 @@ def HostRoutingMiddleware(get_response): return middleware + class ReverseProxyAuthMiddleware(RemoteUserMiddleware): - header = 'HTTP_{normalized}'.format(normalized=SERVER_CONFIG.REVERSE_PROXY_USER_HEADER.replace('-', '_').upper()) + header = "HTTP_{normalized}".format(normalized=SERVER_CONFIG.REVERSE_PROXY_USER_HEADER.replace("-", "_").upper()) def process_request(self, request): - if SERVER_CONFIG.REVERSE_PROXY_WHITELIST == '': + if SERVER_CONFIG.REVERSE_PROXY_WHITELIST == "": return - ip = request.META.get('REMOTE_ADDR') + ip = request.META.get("REMOTE_ADDR") if not isinstance(ip, str): return - for cidr in SERVER_CONFIG.REVERSE_PROXY_WHITELIST.split(','): + for cidr in SERVER_CONFIG.REVERSE_PROXY_WHITELIST.split(","): try: network = ipaddress.ip_network(cidr) except ValueError: raise ImproperlyConfigured( - "The REVERSE_PROXY_WHITELIST config paramater is in invalid format, or " - "contains invalid CIDR. Correct format is a coma-separated list of IPv4/IPv6 CIDRs.") + "The REVERSE_PROXY_WHITELIST config parameter is in invalid format, or " + "contains invalid CIDR. Correct format is a coma-separated list of IPv4/IPv6 CIDRs.", + ) if ipaddress.ip_address(ip) in network: return super().process_request(request) diff --git a/archivebox/core/migrations/0001_initial.py b/archivebox/core/migrations/0001_initial.py index 73ac78e7..f64cdcca 100644 --- a/archivebox/core/migrations/0001_initial.py +++ b/archivebox/core/migrations/0001_initial.py @@ -5,23 +5,21 @@ import uuid class Migration(migrations.Migration): - initial = True - dependencies = [ - ] + dependencies = [] operations = [ migrations.CreateModel( - name='Snapshot', + name="Snapshot", fields=[ - ('id', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)), - ('url', models.URLField(unique=True)), - ('timestamp', models.CharField(default=None, max_length=32, null=True, unique=True)), - ('title', models.CharField(default=None, max_length=128, null=True)), - ('tags', models.CharField(default=None, max_length=256, null=True)), - ('added', models.DateTimeField(auto_now_add=True)), - ('updated', models.DateTimeField(default=None, null=True)), + ("id", models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)), + ("url", models.URLField(unique=True)), + ("timestamp", models.CharField(default=None, max_length=32, null=True, unique=True)), + ("title", models.CharField(default=None, max_length=128, null=True)), + ("tags", models.CharField(default=None, max_length=256, null=True)), + ("added", models.DateTimeField(auto_now_add=True)), + ("updated", models.DateTimeField(default=None, null=True)), ], ), ] diff --git a/archivebox/core/migrations/0002_auto_20200625_1521.py b/archivebox/core/migrations/0002_auto_20200625_1521.py index 48112829..ff825ba6 100644 --- a/archivebox/core/migrations/0002_auto_20200625_1521.py +++ b/archivebox/core/migrations/0002_auto_20200625_1521.py @@ -4,15 +4,14 @@ from django.db import migrations, models class Migration(migrations.Migration): - dependencies = [ - ('core', '0001_initial'), + ("core", "0001_initial"), ] operations = [ migrations.AlterField( - model_name='snapshot', - name='timestamp', + model_name="snapshot", + name="timestamp", field=models.CharField(default=None, max_length=32, null=True), ), ] diff --git a/archivebox/core/migrations/0003_auto_20200630_1034.py b/archivebox/core/migrations/0003_auto_20200630_1034.py index 61fd4727..0d378f07 100644 --- a/archivebox/core/migrations/0003_auto_20200630_1034.py +++ b/archivebox/core/migrations/0003_auto_20200630_1034.py @@ -4,35 +4,34 @@ from django.db import migrations, models class Migration(migrations.Migration): - dependencies = [ - ('core', '0002_auto_20200625_1521'), + ("core", "0002_auto_20200625_1521"), ] operations = [ migrations.AlterField( - model_name='snapshot', - name='added', + model_name="snapshot", + name="added", field=models.DateTimeField(auto_now_add=True, db_index=True), ), migrations.AlterField( - model_name='snapshot', - name='tags', + model_name="snapshot", + name="tags", field=models.CharField(db_index=True, default=None, max_length=256, null=True), ), migrations.AlterField( - model_name='snapshot', - name='timestamp', + model_name="snapshot", + name="timestamp", field=models.CharField(db_index=True, default=None, max_length=32, null=True), ), migrations.AlterField( - model_name='snapshot', - name='title', + model_name="snapshot", + name="title", field=models.CharField(db_index=True, default=None, max_length=128, null=True), ), migrations.AlterField( - model_name='snapshot', - name='updated', + model_name="snapshot", + name="updated", field=models.DateTimeField(db_index=True, default=None, null=True), ), ] diff --git a/archivebox/core/migrations/0004_auto_20200713_1552.py b/archivebox/core/migrations/0004_auto_20200713_1552.py index 69836623..02f2738c 100644 --- a/archivebox/core/migrations/0004_auto_20200713_1552.py +++ b/archivebox/core/migrations/0004_auto_20200713_1552.py @@ -4,15 +4,14 @@ from django.db import migrations, models class Migration(migrations.Migration): - dependencies = [ - ('core', '0003_auto_20200630_1034'), + ("core", "0003_auto_20200630_1034"), ] operations = [ migrations.AlterField( - model_name='snapshot', - name='timestamp', + model_name="snapshot", + name="timestamp", field=models.CharField(db_index=True, default=None, max_length=32, unique=True), preserve_default=False, ), diff --git a/archivebox/core/migrations/0005_auto_20200728_0326.py b/archivebox/core/migrations/0005_auto_20200728_0326.py index f367aeb1..8b1c32e5 100644 --- a/archivebox/core/migrations/0005_auto_20200728_0326.py +++ b/archivebox/core/migrations/0005_auto_20200728_0326.py @@ -4,25 +4,24 @@ from django.db import migrations, models class Migration(migrations.Migration): - dependencies = [ - ('core', '0004_auto_20200713_1552'), + ("core", "0004_auto_20200713_1552"), ] operations = [ migrations.AlterField( - model_name='snapshot', - name='tags', + model_name="snapshot", + name="tags", field=models.CharField(blank=True, db_index=True, max_length=256, null=True), ), migrations.AlterField( - model_name='snapshot', - name='title', + model_name="snapshot", + name="title", field=models.CharField(blank=True, db_index=True, max_length=128, null=True), ), migrations.AlterField( - model_name='snapshot', - name='updated', + model_name="snapshot", + name="updated", field=models.DateTimeField(blank=True, db_index=True, null=True), ), ] diff --git a/archivebox/core/migrations/0006_auto_20201012_1520.py b/archivebox/core/migrations/0006_auto_20201012_1520.py index 0f5df6a9..103a2877 100644 --- a/archivebox/core/migrations/0006_auto_20201012_1520.py +++ b/archivebox/core/migrations/0006_auto_20201012_1520.py @@ -3,19 +3,18 @@ from django.db import migrations, models from django.utils.text import slugify + def forwards_func(apps, schema_editor): SnapshotModel = apps.get_model("core", "Snapshot") TagModel = apps.get_model("core", "Tag") snapshots = SnapshotModel.objects.all() for snapshot in snapshots: - tag_set = ( - set(tag.strip() for tag in (snapshot.tags_old or '').split(',')) - ) + tag_set = {tag.strip() for tag in (snapshot.tags_old or "").split(",")} tag_set.discard("") for tag in tag_set: - to_add, _ = TagModel.objects.get_or_create(name=tag, defaults={'slug': slugify(tag)}) + to_add, _ = TagModel.objects.get_or_create(name=tag, defaults={"slug": slugify(tag)}) snapshot.tags.add(to_add) @@ -30,37 +29,36 @@ def reverse_func(apps, schema_editor): class Migration(migrations.Migration): - dependencies = [ - ('core', '0005_auto_20200728_0326'), + ("core", "0005_auto_20200728_0326"), ] operations = [ migrations.RenameField( - model_name='snapshot', - old_name='tags', - new_name='tags_old', + model_name="snapshot", + old_name="tags", + new_name="tags_old", ), migrations.CreateModel( - name='Tag', + name="Tag", fields=[ - ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), - ('name', models.CharField(max_length=100, unique=True, verbose_name='name')), - ('slug', models.SlugField(max_length=100, unique=True, verbose_name='slug')), + ("id", models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")), + ("name", models.CharField(max_length=100, unique=True, verbose_name="name")), + ("slug", models.SlugField(max_length=100, unique=True, verbose_name="slug")), ], options={ - 'verbose_name': 'Tag', - 'verbose_name_plural': 'Tags', + "verbose_name": "Tag", + "verbose_name_plural": "Tags", }, ), migrations.AddField( - model_name='snapshot', - name='tags', - field=models.ManyToManyField(to='core.Tag'), + model_name="snapshot", + name="tags", + field=models.ManyToManyField(to="core.Tag"), ), migrations.RunPython(forwards_func, reverse_func), migrations.RemoveField( - model_name='snapshot', - name='tags_old', + model_name="snapshot", + name="tags_old", ), ] diff --git a/archivebox/core/migrations/0007_archiveresult.py b/archivebox/core/migrations/0007_archiveresult.py index 9cf5e75d..3b31b15c 100644 --- a/archivebox/core/migrations/0007_archiveresult.py +++ b/archivebox/core/migrations/0007_archiveresult.py @@ -9,13 +9,15 @@ import django.db.models.deletion # Handle old vs new import paths try: from archivebox.config import CONSTANTS + ARCHIVE_DIR = CONSTANTS.ARCHIVE_DIR except ImportError: try: from archivebox.config import CONFIG - ARCHIVE_DIR = Path(CONFIG.get('ARCHIVE_DIR', './archive')) + + ARCHIVE_DIR = Path(CONFIG.get("ARCHIVE_DIR", "./archive")) except ImportError: - ARCHIVE_DIR = Path('./archive') + ARCHIVE_DIR = Path("./archive") try: from archivebox.misc.util import to_json @@ -29,6 +31,7 @@ try: JSONField = models.JSONField except AttributeError: import jsonfield + JSONField = jsonfield.JSONField @@ -41,7 +44,7 @@ def forwards_func(apps, schema_editor): out_dir = Path(ARCHIVE_DIR) / snapshot.timestamp try: - with open(out_dir / "index.json", "r") as f: + with open(out_dir / "index.json") as f: fs_index = json.load(f) except Exception: continue @@ -56,37 +59,46 @@ def forwards_func(apps, schema_editor): snapshot=snapshot, pwd=result["pwd"], cmd=result.get("cmd") or [], - cmd_version=result.get("cmd_version") or 'unknown', + cmd_version=result.get("cmd_version") or "unknown", start_ts=result["start_ts"], end_ts=result["end_ts"], status=result["status"], - output=result.get("output") or 'null', + output=result.get("output") or "null", ) except Exception as e: print( - ' ! Skipping import due to missing/invalid index.json:', + " ! Skipping import due to missing/invalid index.json:", out_dir, e, - '(open an issue with this index.json for help)', + "(open an issue with this index.json for help)", ) def verify_json_index_integrity(snapshot): results = snapshot.archiveresult_set.all() out_dir = Path(ARCHIVE_DIR) / snapshot.timestamp - with open(out_dir / "index.json", "r") as f: + with open(out_dir / "index.json") as f: index = json.load(f) history = index["history"] index_results = [result for extractor in history for result in history[extractor]] flattened_results = [result["start_ts"] for result in index_results] - + missing_results = [result for result in results if result.start_ts.isoformat() not in flattened_results] for missing in missing_results: - index["history"][missing.extractor].append({"cmd": missing.cmd, "cmd_version": missing.cmd_version, "end_ts": missing.end_ts.isoformat(), - "start_ts": missing.start_ts.isoformat(), "pwd": missing.pwd, "output": missing.output, - "schema": "ArchiveResult", "status": missing.status}) + index["history"][missing.extractor].append( + { + "cmd": missing.cmd, + "cmd_version": missing.cmd_version, + "end_ts": missing.end_ts.isoformat(), + "start_ts": missing.start_ts.isoformat(), + "pwd": missing.pwd, + "output": missing.output, + "schema": "ArchiveResult", + "status": missing.status, + }, + ) json_index = to_json(index) with open(out_dir / "index.json", "w") as f: @@ -103,25 +115,47 @@ def reverse_func(apps, schema_editor): class Migration(migrations.Migration): - dependencies = [ - ('core', '0006_auto_20201012_1520'), + ("core", "0006_auto_20201012_1520"), ] operations = [ migrations.CreateModel( - name='ArchiveResult', + name="ArchiveResult", fields=[ - ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), - ('cmd', JSONField()), - ('pwd', models.CharField(max_length=256)), - ('cmd_version', models.CharField(max_length=32)), - ('status', models.CharField(choices=[('succeeded', 'succeeded'), ('failed', 'failed'), ('skipped', 'skipped')], max_length=16)), - ('output', models.CharField(max_length=512)), - ('start_ts', models.DateTimeField()), - ('end_ts', models.DateTimeField()), - ('extractor', models.CharField(choices=[('title', 'title'), ('favicon', 'favicon'), ('wget', 'wget'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('headers', 'headers'), ('archivedotorg', 'archivedotorg')], max_length=32)), - ('snapshot', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='core.Snapshot')), + ("id", models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")), + ("cmd", JSONField()), + ("pwd", models.CharField(max_length=256)), + ("cmd_version", models.CharField(max_length=32)), + ( + "status", + models.CharField(choices=[("succeeded", "succeeded"), ("failed", "failed"), ("skipped", "skipped")], max_length=16), + ), + ("output", models.CharField(max_length=512)), + ("start_ts", models.DateTimeField()), + ("end_ts", models.DateTimeField()), + ( + "extractor", + models.CharField( + choices=[ + ("title", "title"), + ("favicon", "favicon"), + ("wget", "wget"), + ("singlefile", "singlefile"), + ("pdf", "pdf"), + ("screenshot", "screenshot"), + ("dom", "dom"), + ("readability", "readability"), + ("mercury", "mercury"), + ("git", "git"), + ("media", "media"), + ("headers", "headers"), + ("archivedotorg", "archivedotorg"), + ], + max_length=32, + ), + ), + ("snapshot", models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to="core.Snapshot")), ], ), migrations.RunPython(forwards_func, reverse_func), diff --git a/archivebox/core/migrations/0008_auto_20210105_1421.py b/archivebox/core/migrations/0008_auto_20210105_1421.py index e5b3387d..68c408e7 100644 --- a/archivebox/core/migrations/0008_auto_20210105_1421.py +++ b/archivebox/core/migrations/0008_auto_20210105_1421.py @@ -4,15 +4,14 @@ from django.db import migrations, models class Migration(migrations.Migration): - dependencies = [ - ('core', '0007_archiveresult'), + ("core", "0007_archiveresult"), ] operations = [ migrations.AlterField( - model_name='archiveresult', - name='cmd_version', + model_name="archiveresult", + name="cmd_version", field=models.CharField(blank=True, default=None, max_length=32, null=True), ), ] diff --git a/archivebox/core/migrations/0009_auto_20210216_1038.py b/archivebox/core/migrations/0009_auto_20210216_1038.py index 2817fe54..41747426 100644 --- a/archivebox/core/migrations/0009_auto_20210216_1038.py +++ b/archivebox/core/migrations/0009_auto_20210216_1038.py @@ -4,15 +4,14 @@ from django.db import migrations, models class Migration(migrations.Migration): - dependencies = [ - ('core', '0008_auto_20210105_1421'), + ("core", "0008_auto_20210105_1421"), ] operations = [ migrations.AlterField( - model_name='snapshot', - name='updated', + model_name="snapshot", + name="updated", field=models.DateTimeField(auto_now=True, db_index=True, null=True), ), ] diff --git a/archivebox/core/migrations/0010_auto_20210216_1055.py b/archivebox/core/migrations/0010_auto_20210216_1055.py index 0af61a39..14bc18fd 100644 --- a/archivebox/core/migrations/0010_auto_20210216_1055.py +++ b/archivebox/core/migrations/0010_auto_20210216_1055.py @@ -4,15 +4,14 @@ from django.db import migrations, models class Migration(migrations.Migration): - dependencies = [ - ('core', '0009_auto_20210216_1038'), + ("core", "0009_auto_20210216_1038"), ] operations = [ migrations.AlterField( - model_name='archiveresult', - name='start_ts', + model_name="archiveresult", + name="start_ts", field=models.DateTimeField(db_index=True), ), ] diff --git a/archivebox/core/migrations/0011_auto_20210216_1331.py b/archivebox/core/migrations/0011_auto_20210216_1331.py index c00d90ca..a1f6e753 100644 --- a/archivebox/core/migrations/0011_auto_20210216_1331.py +++ b/archivebox/core/migrations/0011_auto_20210216_1331.py @@ -5,20 +5,36 @@ import uuid class Migration(migrations.Migration): - dependencies = [ - ('core', '0010_auto_20210216_1055'), + ("core", "0010_auto_20210216_1055"), ] operations = [ migrations.AddField( - model_name='archiveresult', - name='uuid', + model_name="archiveresult", + name="uuid", field=models.UUIDField(default=uuid.uuid4, editable=False), ), migrations.AlterField( - model_name='archiveresult', - name='extractor', - field=models.CharField(choices=[('title', 'title'), ('favicon', 'favicon'), ('headers', 'headers'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('wget', 'wget'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('archivedotorg', 'archivedotorg')], max_length=32), + model_name="archiveresult", + name="extractor", + field=models.CharField( + choices=[ + ("title", "title"), + ("favicon", "favicon"), + ("headers", "headers"), + ("singlefile", "singlefile"), + ("pdf", "pdf"), + ("screenshot", "screenshot"), + ("dom", "dom"), + ("wget", "wget"), + ("readability", "readability"), + ("mercury", "mercury"), + ("git", "git"), + ("media", "media"), + ("archivedotorg", "archivedotorg"), + ], + max_length=32, + ), ), ] diff --git a/archivebox/core/migrations/0012_auto_20210216_1425.py b/archivebox/core/migrations/0012_auto_20210216_1425.py index 310058ac..27beb897 100644 --- a/archivebox/core/migrations/0012_auto_20210216_1425.py +++ b/archivebox/core/migrations/0012_auto_20210216_1425.py @@ -4,20 +4,19 @@ from django.db import migrations, models class Migration(migrations.Migration): - dependencies = [ - ('core', '0011_auto_20210216_1331'), + ("core", "0011_auto_20210216_1331"), ] operations = [ migrations.AlterField( - model_name='archiveresult', - name='cmd_version', + model_name="archiveresult", + name="cmd_version", field=models.CharField(blank=True, default=None, max_length=128, null=True), ), migrations.AlterField( - model_name='archiveresult', - name='output', + model_name="archiveresult", + name="output", field=models.CharField(max_length=1024), ), ] diff --git a/archivebox/core/migrations/0013_auto_20210218_0729.py b/archivebox/core/migrations/0013_auto_20210218_0729.py index d3fe3b4f..a774f156 100644 --- a/archivebox/core/migrations/0013_auto_20210218_0729.py +++ b/archivebox/core/migrations/0013_auto_20210218_0729.py @@ -4,15 +4,14 @@ from django.db import migrations, models class Migration(migrations.Migration): - dependencies = [ - ('core', '0012_auto_20210216_1425'), + ("core", "0012_auto_20210216_1425"), ] operations = [ migrations.AlterField( - model_name='snapshot', - name='title', + model_name="snapshot", + name="title", field=models.CharField(blank=True, db_index=True, max_length=256, null=True), ), ] diff --git a/archivebox/core/migrations/0014_auto_20210218_0729.py b/archivebox/core/migrations/0014_auto_20210218_0729.py index db81934f..d14211a6 100644 --- a/archivebox/core/migrations/0014_auto_20210218_0729.py +++ b/archivebox/core/migrations/0014_auto_20210218_0729.py @@ -4,15 +4,14 @@ from django.db import migrations, models class Migration(migrations.Migration): - dependencies = [ - ('core', '0013_auto_20210218_0729'), + ("core", "0013_auto_20210218_0729"), ] operations = [ migrations.AlterField( - model_name='snapshot', - name='title', + model_name="snapshot", + name="title", field=models.CharField(blank=True, db_index=True, max_length=1024, null=True), ), ] diff --git a/archivebox/core/migrations/0015_auto_20210218_0730.py b/archivebox/core/migrations/0015_auto_20210218_0730.py index b782a217..e2d99cdb 100644 --- a/archivebox/core/migrations/0015_auto_20210218_0730.py +++ b/archivebox/core/migrations/0015_auto_20210218_0730.py @@ -4,15 +4,14 @@ from django.db import migrations, models class Migration(migrations.Migration): - dependencies = [ - ('core', '0014_auto_20210218_0729'), + ("core", "0014_auto_20210218_0729"), ] operations = [ migrations.AlterField( - model_name='snapshot', - name='title', + model_name="snapshot", + name="title", field=models.CharField(blank=True, db_index=True, max_length=512, null=True), ), ] diff --git a/archivebox/core/migrations/0016_auto_20210218_1204.py b/archivebox/core/migrations/0016_auto_20210218_1204.py index 4637feab..1b996172 100644 --- a/archivebox/core/migrations/0016_auto_20210218_1204.py +++ b/archivebox/core/migrations/0016_auto_20210218_1204.py @@ -4,15 +4,14 @@ from django.db import migrations, models class Migration(migrations.Migration): - dependencies = [ - ('core', '0015_auto_20210218_0730'), + ("core", "0015_auto_20210218_0730"), ] operations = [ migrations.AlterField( - model_name='snapshot', - name='tags', - field=models.ManyToManyField(blank=True, to='core.Tag'), + model_name="snapshot", + name="tags", + field=models.ManyToManyField(blank=True, to="core.Tag"), ), ] diff --git a/archivebox/core/migrations/0017_auto_20210219_0211.py b/archivebox/core/migrations/0017_auto_20210219_0211.py index 221a250b..4a9a4c82 100644 --- a/archivebox/core/migrations/0017_auto_20210219_0211.py +++ b/archivebox/core/migrations/0017_auto_20210219_0211.py @@ -4,15 +4,14 @@ from django.db import migrations, models class Migration(migrations.Migration): - dependencies = [ - ('core', '0016_auto_20210218_1204'), + ("core", "0016_auto_20210218_1204"), ] operations = [ migrations.AlterField( - model_name='tag', - name='slug', - field=models.SlugField(blank=True, max_length=100, unique=True, verbose_name='slug'), + model_name="tag", + name="slug", + field=models.SlugField(blank=True, max_length=100, unique=True, verbose_name="slug"), ), ] diff --git a/archivebox/core/migrations/0018_auto_20210327_0952.py b/archivebox/core/migrations/0018_auto_20210327_0952.py index d0f3dde1..dc5b2d1f 100644 --- a/archivebox/core/migrations/0018_auto_20210327_0952.py +++ b/archivebox/core/migrations/0018_auto_20210327_0952.py @@ -4,20 +4,19 @@ from django.db import migrations, models class Migration(migrations.Migration): - dependencies = [ - ('core', '0017_auto_20210219_0211'), + ("core", "0017_auto_20210219_0211"), ] operations = [ migrations.AlterField( - model_name='tag', - name='name', + model_name="tag", + name="name", field=models.CharField(max_length=100, unique=True), ), migrations.AlterField( - model_name='tag', - name='slug', + model_name="tag", + name="slug", field=models.SlugField(blank=True, max_length=100, unique=True), ), ] diff --git a/archivebox/core/migrations/0019_auto_20210401_0654.py b/archivebox/core/migrations/0019_auto_20210401_0654.py index 735a6549..846bb619 100644 --- a/archivebox/core/migrations/0019_auto_20210401_0654.py +++ b/archivebox/core/migrations/0019_auto_20210401_0654.py @@ -4,15 +4,14 @@ from django.db import migrations, models class Migration(migrations.Migration): - dependencies = [ - ('core', '0018_auto_20210327_0952'), + ("core", "0018_auto_20210327_0952"), ] operations = [ migrations.AlterField( - model_name='snapshot', - name='url', + model_name="snapshot", + name="url", field=models.URLField(db_index=True, unique=True), ), ] diff --git a/archivebox/core/migrations/0020_auto_20210410_1031.py b/archivebox/core/migrations/0020_auto_20210410_1031.py index e75243c6..610eaa43 100644 --- a/archivebox/core/migrations/0020_auto_20210410_1031.py +++ b/archivebox/core/migrations/0020_auto_20210410_1031.py @@ -4,20 +4,19 @@ from django.db import migrations, models class Migration(migrations.Migration): - dependencies = [ - ('core', '0019_auto_20210401_0654'), + ("core", "0019_auto_20210401_0654"), ] operations = [ migrations.AlterField( - model_name='archiveresult', - name='id', - field=models.AutoField(primary_key=True, serialize=False, verbose_name='ID'), + model_name="archiveresult", + name="id", + field=models.AutoField(primary_key=True, serialize=False, verbose_name="ID"), ), migrations.AlterField( - model_name='tag', - name='id', - field=models.AutoField(primary_key=True, serialize=False, verbose_name='ID'), + model_name="tag", + name="id", + field=models.AutoField(primary_key=True, serialize=False, verbose_name="ID"), ), ] diff --git a/archivebox/core/migrations/0021_auto_20220914_0934.py b/archivebox/core/migrations/0021_auto_20220914_0934.py index d33f785e..3f757723 100644 --- a/archivebox/core/migrations/0021_auto_20220914_0934.py +++ b/archivebox/core/migrations/0021_auto_20220914_0934.py @@ -4,15 +4,31 @@ from django.db import migrations, models class Migration(migrations.Migration): - dependencies = [ - ('core', '0020_auto_20210410_1031'), + ("core", "0020_auto_20210410_1031"), ] operations = [ migrations.AlterField( - model_name='archiveresult', - name='extractor', - field=models.CharField(choices=[('favicon', 'favicon'), ('headers', 'headers'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('wget', 'wget'), ('title', 'title'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('archivedotorg', 'archivedotorg')], max_length=32), + model_name="archiveresult", + name="extractor", + field=models.CharField( + choices=[ + ("favicon", "favicon"), + ("headers", "headers"), + ("singlefile", "singlefile"), + ("pdf", "pdf"), + ("screenshot", "screenshot"), + ("dom", "dom"), + ("wget", "wget"), + ("title", "title"), + ("readability", "readability"), + ("mercury", "mercury"), + ("git", "git"), + ("media", "media"), + ("archivedotorg", "archivedotorg"), + ], + max_length=32, + ), ), ] diff --git a/archivebox/core/migrations/0022_auto_20231023_2008.py b/archivebox/core/migrations/0022_auto_20231023_2008.py index ffb41fbd..43dd1a69 100644 --- a/archivebox/core/migrations/0022_auto_20231023_2008.py +++ b/archivebox/core/migrations/0022_auto_20231023_2008.py @@ -4,15 +4,32 @@ from django.db import migrations, models class Migration(migrations.Migration): - dependencies = [ - ('core', '0021_auto_20220914_0934'), + ("core", "0021_auto_20220914_0934"), ] operations = [ migrations.AlterField( - model_name='archiveresult', - name='extractor', - field=models.CharField(choices=[('favicon', 'favicon'), ('headers', 'headers'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('wget', 'wget'), ('title', 'title'), ('readability', 'readability'), ('mercury', 'mercury'), ('htmltotext', 'htmltotext'), ('git', 'git'), ('media', 'media'), ('archivedotorg', 'archivedotorg')], max_length=32), + model_name="archiveresult", + name="extractor", + field=models.CharField( + choices=[ + ("favicon", "favicon"), + ("headers", "headers"), + ("singlefile", "singlefile"), + ("pdf", "pdf"), + ("screenshot", "screenshot"), + ("dom", "dom"), + ("wget", "wget"), + ("title", "title"), + ("readability", "readability"), + ("mercury", "mercury"), + ("htmltotext", "htmltotext"), + ("git", "git"), + ("media", "media"), + ("archivedotorg", "archivedotorg"), + ], + max_length=32, + ), ), ] diff --git a/archivebox/core/migrations/0023_upgrade_to_0_9_0.py b/archivebox/core/migrations/0023_upgrade_to_0_9_0.py index a95cc007..ea7bcb44 100644 --- a/archivebox/core/migrations/0023_upgrade_to_0_9_0.py +++ b/archivebox/core/migrations/0023_upgrade_to_0_9_0.py @@ -16,6 +16,7 @@ def get_table_columns(table_name): def upgrade_core_tables(apps, schema_editor): """Upgrade core tables from v0.7.2 or v0.8.6rc0 to v0.9.0.""" from archivebox.uuid_compat import uuid7 + cursor = connection.cursor() # Check if core_archiveresult table exists @@ -30,11 +31,11 @@ def upgrade_core_tables(apps, schema_editor): has_data = row_count > 0 # Detect which version we're migrating from - archiveresult_cols = get_table_columns('core_archiveresult') - has_uuid = 'uuid' in archiveresult_cols - has_abid = 'abid' in archiveresult_cols + archiveresult_cols = get_table_columns("core_archiveresult") + has_uuid = "uuid" in archiveresult_cols + has_abid = "abid" in archiveresult_cols - print(f'DEBUG: ArchiveResult row_count={row_count}, has_data={has_data}, has_uuid={has_uuid}, has_abid={has_abid}') + print(f"DEBUG: ArchiveResult row_count={row_count}, has_data={has_data}, has_uuid={has_uuid}, has_abid={has_abid}") # ============================================================================ # PART 1: Upgrade core_archiveresult table @@ -62,7 +63,7 @@ def upgrade_core_tables(apps, schema_editor): if has_data: if has_uuid and not has_abid: # Migrating from v0.7.2+ (has uuid column) - print('Migrating ArchiveResult from v0.7.2+ schema (with uuid)...') + print("Migrating ArchiveResult from v0.7.2+ schema (with uuid)...") cursor.execute(""" INSERT OR IGNORE INTO core_archiveresult_new ( id, uuid, snapshot_id, cmd, pwd, cmd_version, @@ -75,7 +76,7 @@ def upgrade_core_tables(apps, schema_editor): """) elif has_abid and not has_uuid: # Migrating from v0.8.6rc0 (has abid instead of uuid) - print('Migrating ArchiveResult from v0.8.6rc0 schema...') + print("Migrating ArchiveResult from v0.8.6rc0 schema...") cursor.execute(""" INSERT OR IGNORE INTO core_archiveresult_new ( id, uuid, snapshot_id, cmd, pwd, cmd_version, @@ -88,17 +89,34 @@ def upgrade_core_tables(apps, schema_editor): """) else: # Migrating from v0.7.2 (no uuid or abid column - generate fresh UUIDs) - print('Migrating ArchiveResult from v0.7.2 schema (no uuid - generating UUIDs)...') - cursor.execute("SELECT id, snapshot_id, cmd, pwd, cmd_version, start_ts, end_ts, status, extractor, output FROM core_archiveresult") + print("Migrating ArchiveResult from v0.7.2 schema (no uuid - generating UUIDs)...") + cursor.execute( + "SELECT id, snapshot_id, cmd, pwd, cmd_version, start_ts, end_ts, status, extractor, output FROM core_archiveresult", + ) old_records = cursor.fetchall() for record in old_records: new_uuid = uuid7().hex - cursor.execute(""" + cursor.execute( + """ INSERT OR IGNORE INTO core_archiveresult_new ( id, uuid, snapshot_id, cmd, pwd, cmd_version, start_ts, end_ts, status, extractor, output ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) - """, (record[0], new_uuid, record[1], record[2], record[3], record[4], record[5], record[6], record[7], record[8], record[9])) + """, + ( + record[0], + new_uuid, + record[1], + record[2], + record[3], + record[4], + record[5], + record[6], + record[7], + record[8], + record[9], + ), + ) cursor.execute("DROP TABLE IF EXISTS core_archiveresult;") cursor.execute("ALTER TABLE core_archiveresult_new RENAME TO core_archiveresult;") @@ -149,13 +167,13 @@ def upgrade_core_tables(apps, schema_editor): if snapshot_has_data: # Detect which version we're migrating from - snapshot_cols = get_table_columns('core_snapshot') - has_added = 'added' in snapshot_cols - has_bookmarked_at = 'bookmarked_at' in snapshot_cols + snapshot_cols = get_table_columns("core_snapshot") + has_added = "added" in snapshot_cols + has_bookmarked_at = "bookmarked_at" in snapshot_cols if has_added and not has_bookmarked_at: # Migrating from v0.7.2 (has added/updated fields) - print('Migrating Snapshot from v0.7.2 schema...') + print("Migrating Snapshot from v0.7.2 schema...") # Transform added→bookmarked_at/created_at and updated→modified_at cursor.execute(""" INSERT OR IGNORE INTO core_snapshot_new ( @@ -173,28 +191,28 @@ def upgrade_core_tables(apps, schema_editor): """) elif has_bookmarked_at and not has_added: # Migrating from v0.8.6rc0 (already has bookmarked_at/created_at/modified_at) - print('Migrating Snapshot from v0.8.6rc0 schema...') + print("Migrating Snapshot from v0.8.6rc0 schema...") # Check what fields exist - has_status = 'status' in snapshot_cols - has_retry_at = 'retry_at' in snapshot_cols - has_crawl_id = 'crawl_id' in snapshot_cols + has_status = "status" in snapshot_cols + has_retry_at = "retry_at" in snapshot_cols + has_crawl_id = "crawl_id" in snapshot_cols # Build column list based on what exists - cols = ['id', 'url', 'timestamp', 'title', 'bookmarked_at', 'created_at', 'modified_at', 'downloaded_at'] + cols = ["id", "url", "timestamp", "title", "bookmarked_at", "created_at", "modified_at", "downloaded_at"] if has_crawl_id: - cols.append('crawl_id') + cols.append("crawl_id") if has_status: - cols.append('status') + cols.append("status") if has_retry_at: - cols.append('retry_at') + cols.append("retry_at") cursor.execute(f""" - INSERT OR IGNORE INTO core_snapshot_new ({', '.join(cols)}) - SELECT {', '.join(cols)} + INSERT OR IGNORE INTO core_snapshot_new ({", ".join(cols)}) + SELECT {", ".join(cols)} FROM core_snapshot; """) else: - print(f'Warning: Unexpected Snapshot schema - has_added={has_added}, has_bookmarked_at={has_bookmarked_at}') + print(f"Warning: Unexpected Snapshot schema - has_added={has_added}, has_bookmarked_at={has_bookmarked_at}") cursor.execute("DROP TABLE IF EXISTS core_snapshot;") cursor.execute("ALTER TABLE core_snapshot_new RENAME TO core_snapshot;") @@ -237,13 +255,13 @@ def upgrade_core_tables(apps, schema_editor): cursor.execute("PRAGMA table_info(core_tag)") tag_id_type = None for row in cursor.fetchall(): - if row[1] == 'id': # row[1] is column name + if row[1] == "id": # row[1] is column name tag_id_type = row[2] # row[2] is type break - if tag_id_type and 'char' in tag_id_type.lower(): + if tag_id_type and "char" in tag_id_type.lower(): # v0.8.6rc0: Tag IDs are UUIDs, need to convert to INTEGER - print('Converting Tag IDs from UUID to INTEGER...') + print("Converting Tag IDs from UUID to INTEGER...") # Get all tags with their UUIDs cursor.execute("SELECT id, name, slug, created_at, modified_at, created_by_id FROM core_tag ORDER BY name") @@ -255,10 +273,13 @@ def upgrade_core_tables(apps, schema_editor): old_id, name, slug, created_at, modified_at, created_by_id = tag uuid_to_int_map[old_id] = i # Insert with new INTEGER ID - cursor.execute(""" + cursor.execute( + """ INSERT OR IGNORE INTO core_tag_new (id, name, slug, created_at, modified_at, created_by_id) VALUES (?, ?, ?, ?, ?, ?) - """, (i, name, slug, created_at, modified_at, created_by_id)) + """, + (i, name, slug, created_at, modified_at, created_by_id), + ) # Update snapshot_tags to use new INTEGER IDs cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='core_snapshot_tags'") @@ -273,13 +294,16 @@ def upgrade_core_tables(apps, schema_editor): for st_id, snapshot_id, old_tag_id in snapshot_tags: new_tag_id = uuid_to_int_map.get(old_tag_id) if new_tag_id: - cursor.execute(""" + cursor.execute( + """ INSERT OR IGNORE INTO core_snapshot_tags (id, snapshot_id, tag_id) VALUES (?, ?, ?) - """, (st_id, snapshot_id, new_tag_id)) + """, + (st_id, snapshot_id, new_tag_id), + ) else: # v0.7.2: Tag IDs are already INTEGER - print('Migrating Tag from v0.7.2 schema...') + print("Migrating Tag from v0.7.2 schema...") cursor.execute(""" INSERT OR IGNORE INTO core_tag_new (id, name, slug) SELECT id, name, slug @@ -294,15 +318,14 @@ def upgrade_core_tables(apps, schema_editor): cursor.execute("CREATE INDEX IF NOT EXISTS core_tag_created_by_id_idx ON core_tag(created_by_id);") if has_data: - print('✓ Core tables upgraded to v0.9.0') + print("✓ Core tables upgraded to v0.9.0") class Migration(migrations.Migration): - dependencies = [ - ('core', '0022_auto_20231023_2008'), - ('crawls', '0001_initial'), - ('auth', '0012_alter_user_first_name_max_length'), + ("core", "0022_auto_20231023_2008"), + ("crawls", "0001_initial"), + ("auth", "0012_alter_user_first_name_max_length"), ] operations = [ @@ -317,60 +340,58 @@ class Migration(migrations.Migration): # NOTE: We do NOT remove extractor/output for ArchiveResult! # They are still in the database and will be removed by migration 0025 # after copying their data to plugin/output_str. - # However, for Snapshot, we DO remove added/updated and ADD the new timestamp fields # because the SQL above already transformed them. - migrations.RemoveField(model_name='snapshot', name='added'), - migrations.RemoveField(model_name='snapshot', name='updated'), + migrations.RemoveField(model_name="snapshot", name="added"), + migrations.RemoveField(model_name="snapshot", name="updated"), migrations.AddField( - model_name='snapshot', - name='bookmarked_at', + model_name="snapshot", + name="bookmarked_at", field=models.DateTimeField(db_index=True, default=django.utils.timezone.now), ), migrations.AddField( - model_name='snapshot', - name='created_at', + model_name="snapshot", + name="created_at", field=models.DateTimeField(db_index=True, default=django.utils.timezone.now), ), migrations.AddField( - model_name='snapshot', - name='modified_at', + model_name="snapshot", + name="modified_at", field=models.DateTimeField(auto_now=True), ), # Declare fs_version (already created in database with DEFAULT '0.8.0') migrations.AddField( - model_name='snapshot', - name='fs_version', + model_name="snapshot", + name="fs_version", field=models.CharField( max_length=10, - default='0.8.0', - help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().' + default="0.8.0", + help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().', ), ), - # SnapshotTag table already exists from v0.7.2, just declare it in state migrations.CreateModel( - name='SnapshotTag', + name="SnapshotTag", fields=[ - ('id', models.AutoField(primary_key=True, serialize=False)), - ('snapshot', models.ForeignKey(to='core.Snapshot', db_column='snapshot_id', on_delete=models.CASCADE)), - ('tag', models.ForeignKey(to='core.Tag', db_column='tag_id', on_delete=models.CASCADE)), + ("id", models.AutoField(primary_key=True, serialize=False)), + ("snapshot", models.ForeignKey(to="core.Snapshot", db_column="snapshot_id", on_delete=models.CASCADE)), + ("tag", models.ForeignKey(to="core.Tag", db_column="tag_id", on_delete=models.CASCADE)), ], options={ - 'db_table': 'core_snapshot_tags', - 'unique_together': {('snapshot', 'tag')}, + "db_table": "core_snapshot_tags", + "unique_together": {("snapshot", "tag")}, }, ), # Declare that Snapshot.tags M2M already uses through=SnapshotTag (from v0.7.2) migrations.AlterField( - model_name='snapshot', - name='tags', + model_name="snapshot", + name="tags", field=models.ManyToManyField( - 'Tag', + "Tag", blank=True, - related_name='snapshot_set', - through='SnapshotTag', - through_fields=('snapshot', 'tag'), + related_name="snapshot_set", + through="SnapshotTag", + through_fields=("snapshot", "tag"), ), ), ], diff --git a/archivebox/core/migrations/0024_assign_default_crawl.py b/archivebox/core/migrations/0024_assign_default_crawl.py index fc435608..e5dd70d8 100644 --- a/archivebox/core/migrations/0024_assign_default_crawl.py +++ b/archivebox/core/migrations/0024_assign_default_crawl.py @@ -20,23 +20,27 @@ def create_default_crawl_and_assign_snapshots(apps, schema_editor): snapshots_without_crawl = cursor.fetchone()[0] if snapshots_without_crawl == 0: - print('✓ Fresh install or all snapshots already have crawls') + print("✓ Fresh install or all snapshots already have crawls") return # Get or create system user (pk=1) cursor.execute("SELECT id FROM auth_user WHERE id = 1") if not cursor.fetchone(): - cursor.execute(""" + cursor.execute( + """ INSERT INTO auth_user (id, password, is_superuser, username, first_name, last_name, email, is_staff, is_active, date_joined) VALUES (1, '!', 1, 'system', '', '', '', 1, 1, ?) - """, [datetime.now().isoformat()]) + """, + [datetime.now().isoformat()], + ) # Create a default crawl for migrated snapshots # At this point crawls_crawl is guaranteed to have v0.9.0 schema (crawls/0002 ran first) crawl_id = str(uuid_lib.uuid4()) now = datetime.now().isoformat() - cursor.execute(""" + cursor.execute( + """ INSERT INTO crawls_crawl ( id, created_at, modified_at, num_uses_succeeded, num_uses_failed, urls, max_depth, tags_str, label, notes, output_dir, @@ -44,20 +48,21 @@ def create_default_crawl_and_assign_snapshots(apps, schema_editor): ) VALUES (?, ?, ?, 0, 0, '', 0, '', 'Migrated from v0.7.2/v0.8.6', 'Auto-created crawl for migrated snapshots', '', 'sealed', ?, 1, NULL, '{}', NULL) - """, [crawl_id, now, now, now]) + """, + [crawl_id, now, now, now], + ) # Assign all snapshots without a crawl to the default crawl cursor.execute("UPDATE core_snapshot SET crawl_id = ? WHERE crawl_id IS NULL", [crawl_id]) - print(f'✓ Assigned {snapshots_without_crawl} snapshots to default crawl {crawl_id}') + print(f"✓ Assigned {snapshots_without_crawl} snapshots to default crawl {crawl_id}") class Migration(migrations.Migration): - dependencies = [ - ('core', '0023_upgrade_to_0_9_0'), - ('crawls', '0002_upgrade_from_0_8_6'), - ('auth', '0012_alter_user_first_name_max_length'), + ("core", "0023_upgrade_to_0_9_0"), + ("crawls", "0002_upgrade_from_0_8_6"), + ("auth", "0012_alter_user_first_name_max_length"), ] operations = [ @@ -137,12 +142,12 @@ class Migration(migrations.Migration): ], state_operations=[ migrations.AddField( - model_name='snapshot', - name='crawl', + model_name="snapshot", + name="crawl", field=models.ForeignKey( on_delete=models.deletion.CASCADE, - to='crawls.crawl', - help_text='Crawl that created this snapshot' + to="crawls.crawl", + help_text="Crawl that created this snapshot", ), ), ], diff --git a/archivebox/core/migrations/0025_alter_archiveresult_options_alter_snapshot_options_and_more.py b/archivebox/core/migrations/0025_alter_archiveresult_options_alter_snapshot_options_and_more.py index 93cca140..9d29d8c6 100644 --- a/archivebox/core/migrations/0025_alter_archiveresult_options_alter_snapshot_options_and_more.py +++ b/archivebox/core/migrations/0025_alter_archiveresult_options_alter_snapshot_options_and_more.py @@ -17,20 +17,24 @@ def copy_old_fields_to_new(apps, schema_editor): cursor.execute("PRAGMA table_info(core_archiveresult)") cols = {row[1] for row in cursor.fetchall()} - if 'extractor' in cols and 'plugin' in cols: + if "extractor" in cols and "plugin" in cols: # Copy extractor -> plugin cursor.execute("UPDATE core_archiveresult SET plugin = COALESCE(extractor, '') WHERE plugin = '' OR plugin IS NULL") - if 'output' in cols and 'output_str' in cols: + if "output" in cols and "output_str" in cols: # Copy output -> output_str cursor.execute("UPDATE core_archiveresult SET output_str = COALESCE(output, '') WHERE output_str = '' OR output_str IS NULL") # Copy timestamps to new timestamp fields if they don't have values yet - if 'start_ts' in cols and 'created_at' in cols: - cursor.execute("UPDATE core_archiveresult SET created_at = COALESCE(start_ts, CURRENT_TIMESTAMP) WHERE created_at IS NULL OR created_at = ''") + if "start_ts" in cols and "created_at" in cols: + cursor.execute( + "UPDATE core_archiveresult SET created_at = COALESCE(start_ts, CURRENT_TIMESTAMP) WHERE created_at IS NULL OR created_at = ''", + ) - if 'end_ts' in cols and 'modified_at' in cols: - cursor.execute("UPDATE core_archiveresult SET modified_at = COALESCE(end_ts, start_ts, CURRENT_TIMESTAMP) WHERE modified_at IS NULL OR modified_at = ''") + if "end_ts" in cols and "modified_at" in cols: + cursor.execute( + "UPDATE core_archiveresult SET modified_at = COALESCE(end_ts, start_ts, CURRENT_TIMESTAMP) WHERE modified_at IS NULL OR modified_at = ''", + ) # NOTE: Snapshot timestamps (added→bookmarked_at, updated→modified_at) were already # transformed by migration 0023, so we don't need to copy them here. @@ -39,164 +43,191 @@ def copy_old_fields_to_new(apps, schema_editor): # Debug: Check Snapshot timestamps at end of RunPython cursor.execute("SELECT id, bookmarked_at, modified_at FROM core_snapshot LIMIT 2") snap_after = cursor.fetchall() - print(f'DEBUG 0025: Snapshot timestamps at END of RunPython: {snap_after}') + print(f"DEBUG 0025: Snapshot timestamps at END of RunPython: {snap_after}") class Migration(migrations.Migration): - dependencies = [ - ('core', '0024_assign_default_crawl'), - ('crawls', '0001_initial'), + ("core", "0024_assign_default_crawl"), + ("crawls", "0001_initial"), migrations.swappable_dependency(settings.AUTH_USER_MODEL), ] operations = [ migrations.AlterModelOptions( - name='archiveresult', - options={'verbose_name': 'Archive Result', 'verbose_name_plural': 'Archive Results Log'}, + name="archiveresult", + options={"verbose_name": "Archive Result", "verbose_name_plural": "Archive Results Log"}, ), migrations.AlterModelOptions( - name='snapshot', - options={'verbose_name': 'Snapshot', 'verbose_name_plural': 'Snapshots'}, + name="snapshot", + options={"verbose_name": "Snapshot", "verbose_name_plural": "Snapshots"}, ), # NOTE: RemoveField for cmd, cmd_version, pwd moved to migration 0027 # to allow data migration to Process records first migrations.AddField( - model_name='archiveresult', - name='config', + model_name="archiveresult", + name="config", field=models.JSONField(blank=True, default=dict, null=True), ), migrations.AddField( - model_name='archiveresult', - name='created_at', + model_name="archiveresult", + name="created_at", field=models.DateTimeField(db_index=True, default=django.utils.timezone.now), ), migrations.AddField( - model_name='archiveresult', - name='hook_name', - field=models.CharField(blank=True, db_index=True, default='', help_text='Full filename of the hook that executed (e.g., on_Snapshot__50_wget.py)', max_length=255), + model_name="archiveresult", + name="hook_name", + field=models.CharField( + blank=True, + db_index=True, + default="", + help_text="Full filename of the hook that executed (e.g., on_Snapshot__50_wget.py)", + max_length=255, + ), ), migrations.AddField( - model_name='archiveresult', - name='modified_at', + model_name="archiveresult", + name="modified_at", field=models.DateTimeField(auto_now=True), ), migrations.AddField( - model_name='archiveresult', - name='notes', - field=models.TextField(blank=True, default=''), + model_name="archiveresult", + name="notes", + field=models.TextField(blank=True, default=""), ), migrations.AddField( - model_name='archiveresult', - name='num_uses_failed', + model_name="archiveresult", + name="num_uses_failed", field=models.PositiveIntegerField(default=0), ), migrations.AddField( - model_name='archiveresult', - name='num_uses_succeeded', + model_name="archiveresult", + name="num_uses_succeeded", field=models.PositiveIntegerField(default=0), ), migrations.AddField( - model_name='archiveresult', - name='output_files', - field=models.JSONField(default=dict, help_text='Dict of {relative_path: {metadata}}'), + model_name="archiveresult", + name="output_files", + field=models.JSONField(default=dict, help_text="Dict of {relative_path: {metadata}}"), ), migrations.AddField( - model_name='archiveresult', - name='output_json', - field=models.JSONField(blank=True, default=None, help_text='Structured metadata (headers, redirects, etc.)', null=True), + model_name="archiveresult", + name="output_json", + field=models.JSONField(blank=True, default=None, help_text="Structured metadata (headers, redirects, etc.)", null=True), ), migrations.AddField( - model_name='archiveresult', - name='output_mimetypes', - field=models.CharField(blank=True, default='', help_text='CSV of mimetypes sorted by size', max_length=512), + model_name="archiveresult", + name="output_mimetypes", + field=models.CharField(blank=True, default="", help_text="CSV of mimetypes sorted by size", max_length=512), ), migrations.AddField( - model_name='archiveresult', - name='output_size', - field=models.BigIntegerField(default=0, help_text='Total bytes of all output files'), + model_name="archiveresult", + name="output_size", + field=models.BigIntegerField(default=0, help_text="Total bytes of all output files"), ), migrations.AddField( - model_name='archiveresult', - name='output_str', - field=models.TextField(blank=True, default='', help_text='Human-readable output summary'), + model_name="archiveresult", + name="output_str", + field=models.TextField(blank=True, default="", help_text="Human-readable output summary"), ), migrations.AddField( - model_name='archiveresult', - name='plugin', - field=models.CharField(db_index=True, default='', max_length=32), + model_name="archiveresult", + name="plugin", + field=models.CharField(db_index=True, default="", max_length=32), ), migrations.AddField( - model_name='archiveresult', - name='retry_at', + model_name="archiveresult", + name="retry_at", field=models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, null=True), ), # NOTE: bookmarked_at and created_at already added by migration 0023 migrations.AddField( - model_name='snapshot', - name='config', + model_name="snapshot", + name="config", field=models.JSONField(default=dict), ), migrations.AddField( - model_name='snapshot', - name='current_step', - field=models.PositiveSmallIntegerField(db_index=True, default=0, help_text='Current hook step being executed (0-9). Used for sequential hook execution.'), + model_name="snapshot", + name="current_step", + field=models.PositiveSmallIntegerField( + db_index=True, + default=0, + help_text="Current hook step being executed (0-9). Used for sequential hook execution.", + ), ), migrations.AddField( - model_name='snapshot', - name='depth', + model_name="snapshot", + name="depth", field=models.PositiveSmallIntegerField(db_index=True, default=0), ), migrations.AddField( - model_name='snapshot', - name='downloaded_at', + model_name="snapshot", + name="downloaded_at", field=models.DateTimeField(blank=True, db_index=True, default=None, editable=False, null=True), ), # NOTE: fs_version already added by migration 0023 with default='0.8.0' # NOTE: modified_at already added by migration 0023 migrations.AddField( - model_name='snapshot', - name='notes', - field=models.TextField(blank=True, default=''), + model_name="snapshot", + name="notes", + field=models.TextField(blank=True, default=""), ), migrations.AddField( - model_name='snapshot', - name='num_uses_failed', + model_name="snapshot", + name="num_uses_failed", field=models.PositiveIntegerField(default=0), ), migrations.AddField( - model_name='snapshot', - name='num_uses_succeeded', + model_name="snapshot", + name="num_uses_succeeded", field=models.PositiveIntegerField(default=0), ), migrations.AddField( - model_name='snapshot', - name='parent_snapshot', - field=models.ForeignKey(blank=True, help_text='Parent snapshot that discovered this URL (for recursive crawling)', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='child_snapshots', to='core.snapshot'), + model_name="snapshot", + name="parent_snapshot", + field=models.ForeignKey( + blank=True, + help_text="Parent snapshot that discovered this URL (for recursive crawling)", + null=True, + on_delete=django.db.models.deletion.SET_NULL, + related_name="child_snapshots", + to="core.snapshot", + ), ), migrations.AddField( - model_name='snapshot', - name='retry_at', + model_name="snapshot", + name="retry_at", field=models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, null=True), ), migrations.AddField( - model_name='snapshot', - name='status', - field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('sealed', 'Sealed')], db_index=True, default='queued', max_length=15), + model_name="snapshot", + name="status", + field=models.CharField( + choices=[("queued", "Queued"), ("started", "Started"), ("sealed", "Sealed")], + db_index=True, + default="queued", + max_length=15, + ), ), migrations.AddField( - model_name='tag', - name='created_at', + model_name="tag", + name="created_at", field=models.DateTimeField(db_index=True, default=django.utils.timezone.now, null=True), ), migrations.AddField( - model_name='tag', - name='created_by', - field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, null=True, on_delete=django.db.models.deletion.CASCADE, related_name='tag_set', to=settings.AUTH_USER_MODEL), + model_name="tag", + name="created_by", + field=models.ForeignKey( + default=archivebox.base_models.models.get_or_create_system_user_pk, + null=True, + on_delete=django.db.models.deletion.CASCADE, + related_name="tag_set", + to=settings.AUTH_USER_MODEL, + ), ), migrations.AddField( - model_name='tag', - name='modified_at', + model_name="tag", + name="modified_at", field=models.DateTimeField(auto_now=True), ), # Copy data from old field names to new field names after AddField operations @@ -206,75 +237,93 @@ class Migration(migrations.Migration): ), # Now remove the old ArchiveResult fields after data has been copied migrations.RemoveField( - model_name='archiveresult', - name='extractor', + model_name="archiveresult", + name="extractor", ), migrations.RemoveField( - model_name='archiveresult', - name='output', + model_name="archiveresult", + name="output", ), # NOTE: Snapshot's added/updated were already removed by migration 0023 migrations.AlterField( - model_name='archiveresult', - name='end_ts', + model_name="archiveresult", + name="end_ts", field=models.DateTimeField(blank=True, default=None, null=True), ), migrations.AlterField( - model_name='archiveresult', - name='id', + model_name="archiveresult", + name="id", field=models.AutoField(editable=False, primary_key=True, serialize=False), ), migrations.AlterField( - model_name='archiveresult', - name='start_ts', + model_name="archiveresult", + name="start_ts", field=models.DateTimeField(blank=True, default=None, null=True), ), migrations.AlterField( - model_name='archiveresult', - name='status', - field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('backoff', 'Waiting to retry'), ('succeeded', 'Succeeded'), ('failed', 'Failed'), ('skipped', 'Skipped')], db_index=True, default='queued', max_length=15), + model_name="archiveresult", + name="status", + field=models.CharField( + choices=[ + ("queued", "Queued"), + ("started", "Started"), + ("backoff", "Waiting to retry"), + ("succeeded", "Succeeded"), + ("failed", "Failed"), + ("skipped", "Skipped"), + ], + db_index=True, + default="queued", + max_length=15, + ), ), migrations.AlterField( - model_name='archiveresult', - name='uuid', + model_name="archiveresult", + name="uuid", field=models.UUIDField(blank=True, db_index=True, default=uuid7, null=True), ), migrations.AlterField( - model_name='snapshot', - name='crawl', - field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to='crawls.crawl'), + model_name="snapshot", + name="crawl", + field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name="snapshot_set", to="crawls.crawl"), ), migrations.AlterField( - model_name='snapshot', - name='id', + model_name="snapshot", + name="id", field=models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True), ), migrations.AlterField( - model_name='snapshot', - name='tags', - field=models.ManyToManyField(blank=True, related_name='snapshot_set', through='core.SnapshotTag', through_fields=('snapshot', 'tag'), to='core.tag'), + model_name="snapshot", + name="tags", + field=models.ManyToManyField( + blank=True, + related_name="snapshot_set", + through="core.SnapshotTag", + through_fields=("snapshot", "tag"), + to="core.tag", + ), ), migrations.AlterField( - model_name='snapshot', - name='timestamp', + model_name="snapshot", + name="timestamp", field=models.CharField(db_index=True, editable=False, max_length=32, unique=True), ), migrations.AlterField( - model_name='snapshot', - name='url', + model_name="snapshot", + name="url", field=models.URLField(db_index=True), ), migrations.AlterField( - model_name='tag', - name='slug', + model_name="tag", + name="slug", field=models.SlugField(editable=False, max_length=100, unique=True), ), migrations.AddConstraint( - model_name='snapshot', - constraint=models.UniqueConstraint(fields=('url', 'crawl'), name='unique_url_per_crawl'), + model_name="snapshot", + constraint=models.UniqueConstraint(fields=("url", "crawl"), name="unique_url_per_crawl"), ), migrations.AddConstraint( - model_name='snapshot', - constraint=models.UniqueConstraint(fields=('timestamp',), name='unique_timestamp'), + model_name="snapshot", + constraint=models.UniqueConstraint(fields=("timestamp",), name="unique_timestamp"), ), ] diff --git a/archivebox/core/migrations/0026_add_process_to_archiveresult.py b/archivebox/core/migrations/0026_add_process_to_archiveresult.py index e76b8597..7381b98e 100644 --- a/archivebox/core/migrations/0026_add_process_to_archiveresult.py +++ b/archivebox/core/migrations/0026_add_process_to_archiveresult.py @@ -5,24 +5,30 @@ from django.db import migrations, models class Migration(migrations.Migration): - dependencies = [ - ('core', '0025_alter_archiveresult_options_alter_snapshot_options_and_more'), - ('machine', '0007_add_process_type_and_parent'), + ("core", "0025_alter_archiveresult_options_alter_snapshot_options_and_more"), + ("machine", "0007_add_process_type_and_parent"), ] operations = [ migrations.RemoveField( - model_name='archiveresult', - name='num_uses_failed', + model_name="archiveresult", + name="num_uses_failed", ), migrations.RemoveField( - model_name='archiveresult', - name='num_uses_succeeded', + model_name="archiveresult", + name="num_uses_succeeded", ), migrations.AddField( - model_name='archiveresult', - name='process', - field=models.OneToOneField(blank=True, help_text='Process execution details for this archive result', null=True, on_delete=django.db.models.deletion.PROTECT, related_name='archiveresult', to='machine.process'), + model_name="archiveresult", + name="process", + field=models.OneToOneField( + blank=True, + help_text="Process execution details for this archive result", + null=True, + on_delete=django.db.models.deletion.PROTECT, + related_name="archiveresult", + to="machine.process", + ), ), ] diff --git a/archivebox/core/migrations/0027_copy_archiveresult_to_process.py b/archivebox/core/migrations/0027_copy_archiveresult_to_process.py index a26caa10..37c4f8df 100644 --- a/archivebox/core/migrations/0027_copy_archiveresult_to_process.py +++ b/archivebox/core/migrations/0027_copy_archiveresult_to_process.py @@ -25,7 +25,7 @@ def parse_cmd_field(cmd_raw): return [] # Try to parse as JSON first - if cmd_raw.startswith('['): + if cmd_raw.startswith("["): try: parsed = json.loads(cmd_raw) if isinstance(parsed, list): @@ -45,7 +45,7 @@ def get_or_create_current_machine(cursor): # Simple machine detection - get hostname as guid hostname = socket.gethostname() - guid = f'host_{hostname}' # Simple but stable identifier + guid = f"host_{hostname}" # Simple but stable identifier # Check if machine exists cursor.execute("SELECT id FROM machine_machine WHERE guid = ?", [guid]) @@ -64,9 +64,10 @@ def get_or_create_current_machine(cursor): machine_cols = {row[1] for row in cursor.fetchall()} # Build INSERT statement based on available columns - if 'config' in machine_cols: + if "config" in machine_cols: # 0.9.x schema with config column - cursor.execute(""" + cursor.execute( + """ INSERT INTO machine_machine ( id, created_at, modified_at, guid, hostname, hw_in_docker, hw_in_vm, hw_manufacturer, hw_product, hw_uuid, @@ -74,10 +75,13 @@ def get_or_create_current_machine(cursor): stats, config, num_uses_failed, num_uses_succeeded ) VALUES (?, ?, ?, ?, ?, 0, 0, '', '', '', '', '', '', '', '', '{}', '{}', 0, 0) - """, [machine_id, now, now, guid, hostname]) + """, + [machine_id, now, now, guid, hostname], + ) else: # 0.8.x schema without config column - cursor.execute(""" + cursor.execute( + """ INSERT INTO machine_machine ( id, created_at, modified_at, guid, hostname, hw_in_docker, hw_in_vm, hw_manufacturer, hw_product, hw_uuid, @@ -85,7 +89,9 @@ def get_or_create_current_machine(cursor): stats, num_uses_failed, num_uses_succeeded ) VALUES (?, ?, ?, ?, ?, 0, 0, '', '', '', '', '', '', '', '', '{}', 0, 0) - """, [machine_id, now, now, guid, hostname]) + """, + [machine_id, now, now, guid, hostname], + ) return machine_id @@ -108,15 +114,18 @@ def get_or_create_binary(cursor, machine_id, name, abspath, version): # If abspath is just a name without slashes, it's not a full path # Store it in both fields for simplicity - if '/' not in abspath: + if "/" not in abspath: # Not a full path - store as-is pass # Check if binary exists with same machine, name, abspath, version - cursor.execute(""" + cursor.execute( + """ SELECT id FROM machine_binary WHERE machine_id = ? AND name = ? AND abspath = ? AND version = ? - """, [machine_id, name, abspath, version]) + """, + [machine_id, name, abspath, version], + ) row = cursor.fetchone() if row: @@ -134,9 +143,10 @@ def get_or_create_binary(cursor, machine_id, name, abspath, version): # Use only columns that exist in current schema # 0.8.x schema: id, created_at, modified_at, machine_id, name, binprovider, abspath, version, sha256, num_uses_failed, num_uses_succeeded # 0.9.x schema adds: binproviders, overrides, status, retry_at, output_dir - if 'binproviders' in binary_cols: + if "binproviders" in binary_cols: # 0.9.x schema - cursor.execute(""" + cursor.execute( + """ INSERT INTO machine_binary ( id, created_at, modified_at, machine_id, name, binproviders, overrides, binprovider, abspath, version, sha256, @@ -144,16 +154,21 @@ def get_or_create_binary(cursor, machine_id, name, abspath, version): num_uses_failed, num_uses_succeeded ) VALUES (?, ?, ?, ?, ?, 'env', '{}', 'env', ?, ?, '', 'succeeded', NULL, '', 0, 0) - """, [binary_id, now, now, machine_id, name, abspath, version]) + """, + [binary_id, now, now, machine_id, name, abspath, version], + ) else: # 0.8.x schema (simpler) - cursor.execute(""" + cursor.execute( + """ INSERT INTO machine_binary ( id, created_at, modified_at, machine_id, name, binprovider, abspath, version, sha256, num_uses_failed, num_uses_succeeded ) VALUES (?, ?, ?, ?, ?, 'env', ?, ?, '', 0, 0) - """, [binary_id, now, now, machine_id, name, abspath, version]) + """, + [binary_id, now, now, machine_id, name, abspath, version], + ) return binary_id @@ -169,15 +184,15 @@ def map_status(old_status): (process_status, exit_code) tuple """ status_map = { - 'queued': ('queued', None), - 'started': ('running', None), - 'backoff': ('queued', None), - 'succeeded': ('exited', 0), - 'failed': ('exited', 1), - 'skipped': ('exited', None), # Skipped = exited without error + "queued": ("queued", None), + "started": ("running", None), + "backoff": ("queued", None), + "succeeded": ("exited", 0), + "failed": ("exited", 1), + "skipped": ("exited", None), # Skipped = exited without error } - return status_map.get(old_status, ('queued', None)) + return status_map.get(old_status, ("queued", None)) def create_process(cursor, machine_id, pwd, cmd, status, exit_code, started_at, ended_at, binary_id): @@ -197,9 +212,10 @@ def create_process(cursor, machine_id, pwd, cmd, status, exit_code, started_at, cmd_json = json.dumps(cmd) # Set retry_at to now for queued processes, NULL otherwise - retry_at = now if status == 'queued' else None + retry_at = now if status == "queued" else None - cursor.execute(""" + cursor.execute( + """ INSERT INTO machine_process ( id, created_at, modified_at, machine_id, parent_id, process_type, pwd, cmd, env, timeout, @@ -213,14 +229,22 @@ def create_process(cursor, machine_id, pwd, cmd, status, exit_code, started_at, ?, ?, ?, NULL, NULL, ?, ?) - """, [ - process_id, now, now, machine_id, - pwd, cmd_json, - exit_code, - started_at, ended_at, - binary_id, - status, retry_at - ]) + """, + [ + process_id, + now, + now, + machine_id, + pwd, + cmd_json, + exit_code, + started_at, + ended_at, + binary_id, + status, + retry_at, + ], + ) return process_id @@ -250,16 +274,18 @@ def copy_archiveresult_data_to_process(apps, schema_editor): cursor.execute("PRAGMA table_info(core_archiveresult)") cols = {row[1] for row in cursor.fetchall()} - print(f'DEBUG 0027: Columns found: {sorted(cols)}') - print(f'DEBUG 0027: Has cmd={("cmd" in cols)}, pwd={("pwd" in cols)}, cmd_version={("cmd_version" in cols)}, process_id={("process_id" in cols)}') + print(f"DEBUG 0027: Columns found: {sorted(cols)}") + print( + f"DEBUG 0027: Has cmd={('cmd' in cols)}, pwd={('pwd' in cols)}, cmd_version={('cmd_version' in cols)}, process_id={('process_id' in cols)}", + ) - if 'cmd' not in cols or 'pwd' not in cols or 'cmd_version' not in cols: - print('✓ Fresh install or fields already removed - skipping data copy') + if "cmd" not in cols or "pwd" not in cols or "cmd_version" not in cols: + print("✓ Fresh install or fields already removed - skipping data copy") return # Check if process_id field exists (should exist from 0026) - if 'process_id' not in cols: - print('✗ ERROR: process_id field not found. Migration 0026 must run first.') + if "process_id" not in cols: + print("✗ ERROR: process_id field not found. Migration 0026 must run first.") return # Get or create Machine.current() @@ -278,10 +304,10 @@ def copy_archiveresult_data_to_process(apps, schema_editor): results = cursor.fetchall() if not results: - print('✓ No ArchiveResults need Process migration') + print("✓ No ArchiveResults need Process migration") return - print(f'Migrating {len(results)} ArchiveResults to Process records...') + print(f"Migrating {len(results)} ArchiveResults to Process records...") migrated_count = 0 skipped_count = 0 @@ -291,42 +317,46 @@ def copy_archiveresult_data_to_process(apps, schema_editor): ar_id, snapshot_id, plugin, cmd_raw, pwd, cmd_version, status, start_ts, end_ts, created_at = row if i == 0: - print(f'DEBUG 0027: First row: ar_id={ar_id}, plugin={plugin}, cmd={cmd_raw[:50] if cmd_raw else None}, status={status}') + print(f"DEBUG 0027: First row: ar_id={ar_id}, plugin={plugin}, cmd={cmd_raw[:50] if cmd_raw else None}, status={status}") try: # Parse cmd field cmd_array = parse_cmd_field(cmd_raw) if i == 0: - print(f'DEBUG 0027: Parsed cmd: {cmd_array}') + print(f"DEBUG 0027: Parsed cmd: {cmd_array}") # Extract binary info from cmd[0] if available binary_id = None if cmd_array and cmd_array[0]: binary_name = Path(cmd_array[0]).name or plugin # Fallback to plugin name binary_abspath = cmd_array[0] - binary_version = cmd_version or '' + binary_version = cmd_version or "" # Get or create Binary record binary_id = get_or_create_binary( - cursor, machine_id, binary_name, binary_abspath, binary_version + cursor, + machine_id, + binary_name, + binary_abspath, + binary_version, ) if i == 0: - print(f'DEBUG 0027: Created Binary: id={binary_id}, name={binary_name}') + print(f"DEBUG 0027: Created Binary: id={binary_id}, name={binary_name}") # Map status process_status, exit_code = map_status(status) # Set timestamps started_at = start_ts or created_at - ended_at = end_ts if process_status == 'exited' else None + ended_at = end_ts if process_status == "exited" else None # Create Process record process_id = create_process( cursor=cursor, machine_id=machine_id, - pwd=pwd or '', + pwd=pwd or "", cmd=cmd_array, status=process_status, exit_code=exit_code, @@ -336,34 +366,34 @@ def copy_archiveresult_data_to_process(apps, schema_editor): ) if i == 0: - print(f'DEBUG 0027: Created Process: id={process_id}') + print(f"DEBUG 0027: Created Process: id={process_id}") # Link ArchiveResult to Process cursor.execute( "UPDATE core_archiveresult SET process_id = ? WHERE id = ?", - [process_id, ar_id] + [process_id, ar_id], ) migrated_count += 1 if i == 0: - print('DEBUG 0027: Linked ArchiveResult to Process') + print("DEBUG 0027: Linked ArchiveResult to Process") except Exception as e: - print(f'✗ Error migrating ArchiveResult {ar_id}: {e}') + print(f"✗ Error migrating ArchiveResult {ar_id}: {e}") import traceback + traceback.print_exc() error_count += 1 continue - print(f'✓ Migration complete: {migrated_count} migrated, {skipped_count} skipped, {error_count} errors') + print(f"✓ Migration complete: {migrated_count} migrated, {skipped_count} skipped, {error_count} errors") class Migration(migrations.Migration): - dependencies = [ - ('core', '0026_add_process_to_archiveresult'), - ('machine', '0007_add_process_type_and_parent'), + ("core", "0026_add_process_to_archiveresult"), + ("machine", "0007_add_process_type_and_parent"), ] operations = [ @@ -372,18 +402,17 @@ class Migration(migrations.Migration): copy_archiveresult_data_to_process, reverse_code=migrations.RunPython.noop, ), - # Now safe to remove old fields (moved from 0025) migrations.RemoveField( - model_name='archiveresult', - name='cmd', + model_name="archiveresult", + name="cmd", ), migrations.RemoveField( - model_name='archiveresult', - name='cmd_version', + model_name="archiveresult", + name="cmd_version", ), migrations.RemoveField( - model_name='archiveresult', - name='pwd', + model_name="archiveresult", + name="pwd", ), ] diff --git a/archivebox/core/migrations/0028_alter_snapshot_fs_version.py b/archivebox/core/migrations/0028_alter_snapshot_fs_version.py index eb86883d..1459f4ef 100644 --- a/archivebox/core/migrations/0028_alter_snapshot_fs_version.py +++ b/archivebox/core/migrations/0028_alter_snapshot_fs_version.py @@ -4,15 +4,18 @@ from django.db import migrations, models class Migration(migrations.Migration): - dependencies = [ - ('core', '0027_copy_archiveresult_to_process'), + ("core", "0027_copy_archiveresult_to_process"), ] operations = [ migrations.AlterField( - model_name='snapshot', - name='fs_version', - field=models.CharField(default='0.9.0', help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().', max_length=10), + model_name="snapshot", + name="fs_version", + field=models.CharField( + default="0.9.0", + help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().', + max_length=10, + ), ), ] diff --git a/archivebox/core/migrations/0029_migrate_archiveresult_to_uuid_pk.py b/archivebox/core/migrations/0029_migrate_archiveresult_to_uuid_pk.py index 93139900..7ed7d36e 100644 --- a/archivebox/core/migrations/0029_migrate_archiveresult_to_uuid_pk.py +++ b/archivebox/core/migrations/0029_migrate_archiveresult_to_uuid_pk.py @@ -28,7 +28,7 @@ def migrate_archiveresult_id_to_uuid(apps, schema_editor): # Check if table exists and has data cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='core_archiveresult'") if not cursor.fetchone(): - print('ArchiveResult table does not exist, skipping migration') + print("ArchiveResult table does not exist, skipping migration") return cursor.execute("SELECT COUNT(*) FROM core_archiveresult") @@ -38,16 +38,16 @@ def migrate_archiveresult_id_to_uuid(apps, schema_editor): # (fresh installs create table with uuid from 0025, but model expects no uuid after 0029) if row_count == 0: - print('[0029] Recreating ArchiveResult table schema (integer→UUID PK, removing uuid column)...') + print("[0029] Recreating ArchiveResult table schema (integer→UUID PK, removing uuid column)...") else: - print(f'[0029] Migrating {row_count} ArchiveResult records from integer PK to UUID PK...') + print(f"[0029] Migrating {row_count} ArchiveResult records from integer PK to UUID PK...") # Step 0: Check if machine_process table exists, if not NULL out process_id values cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='machine_process'") machine_process_exists = cursor.fetchone() is not None if not machine_process_exists: - print('machine_process table does not exist yet, setting process_id to NULL') + print("machine_process table does not exist yet, setting process_id to NULL") cursor.execute("UPDATE core_archiveresult SET process_id = NULL WHERE process_id IS NOT NULL") # Step 1: Create new table with UUID as primary key (clean - no old_id or uuid columns) @@ -90,7 +90,7 @@ def migrate_archiveresult_id_to_uuid(apps, schema_editor): cursor.execute("PRAGMA table_info(core_archiveresult)") columns = cursor.fetchall() col_names = [col[1] for col in columns] - has_uuid_column = 'uuid' in col_names + has_uuid_column = "uuid" in col_names if has_uuid_column: cursor.execute("SELECT id, uuid FROM core_archiveresult") @@ -117,7 +117,7 @@ def migrate_archiveresult_id_to_uuid(apps, schema_editor): # col_names already fetched in Step 2 inserted_count = 0 for i, record in enumerate(old_records): - old_id = record[col_names.index('id')] + old_id = record[col_names.index("id")] new_uuid = id_to_uuid[old_id] # Build insert with new structure @@ -125,37 +125,52 @@ def migrate_archiveresult_id_to_uuid(apps, schema_editor): # List of fields to copy (all fields from new schema except id, old_id, uuid) fields_to_copy = [ - 'created_at', 'modified_at', 'snapshot_id', 'plugin', 'hook_name', - 'status', 'retry_at', 'start_ts', 'end_ts', - 'output_str', 'output_json', 'output_files', 'output_size', 'output_mimetypes', - 'config', 'notes', 'num_uses_succeeded', 'num_uses_failed', 'process_id' + "created_at", + "modified_at", + "snapshot_id", + "plugin", + "hook_name", + "status", + "retry_at", + "start_ts", + "end_ts", + "output_str", + "output_json", + "output_files", + "output_size", + "output_mimetypes", + "config", + "notes", + "num_uses_succeeded", + "num_uses_failed", + "process_id", ] # Build INSERT statement (only copy fields that exist in source) existing_fields = [f for f in fields_to_copy if f in values] if i == 0: - print(f'[0029] Source columns: {col_names}') - print(f'[0029] Copying fields: {existing_fields}') + print(f"[0029] Source columns: {col_names}") + print(f"[0029] Copying fields: {existing_fields}") - placeholders = ', '.join(['?'] * (len(existing_fields) + 1)) # +1 for id - field_list = 'id, ' + ', '.join(existing_fields) + placeholders = ", ".join(["?"] * (len(existing_fields) + 1)) # +1 for id + field_list = "id, " + ", ".join(existing_fields) insert_values = [new_uuid] + [values.get(f) for f in existing_fields] try: cursor.execute( f"INSERT INTO core_archiveresult_new ({field_list}) VALUES ({placeholders})", - insert_values + insert_values, ) inserted_count += 1 except Exception as e: - print(f'[0029] ERROR inserting record {old_id}: {e}') + print(f"[0029] ERROR inserting record {old_id}: {e}") if i == 0: - print(f'[0029] First record values: {insert_values[:5]}...') + print(f"[0029] First record values: {insert_values[:5]}...") raise - print(f'[0029] Inserted {inserted_count}/{len(old_records)} records') + print(f"[0029] Inserted {inserted_count}/{len(old_records)} records") # Step 4: Replace old table with new table cursor.execute("DROP TABLE core_archiveresult") @@ -170,13 +185,12 @@ def migrate_archiveresult_id_to_uuid(apps, schema_editor): cursor.execute("CREATE INDEX core_archiveresult_hook_name_idx ON core_archiveresult(hook_name)") cursor.execute("CREATE INDEX core_archiveresult_process_id_idx ON core_archiveresult(process_id)") - print(f'✓ Migrated {row_count} ArchiveResult records to UUID primary key') + print(f"✓ Migrated {row_count} ArchiveResult records to UUID primary key") class Migration(migrations.Migration): - dependencies = [ - ('core', '0028_alter_snapshot_fs_version'), + ("core", "0028_alter_snapshot_fs_version"), ] operations = [ @@ -190,13 +204,13 @@ class Migration(migrations.Migration): state_operations=[ # Remove uuid field (was added in 0025, we're merging it into id) migrations.RemoveField( - model_name='archiveresult', - name='uuid', + model_name="archiveresult", + name="uuid", ), # Change id from AutoField to UUIDField (absorbing the uuid field) migrations.AlterField( - model_name='archiveresult', - name='id', + model_name="archiveresult", + name="id", field=models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True), ), ], diff --git a/archivebox/core/migrations/0030_alter_archiveresult_id.py b/archivebox/core/migrations/0030_alter_archiveresult_id.py index 80ce097c..398cca98 100644 --- a/archivebox/core/migrations/0030_alter_archiveresult_id.py +++ b/archivebox/core/migrations/0030_alter_archiveresult_id.py @@ -6,15 +6,14 @@ from archivebox.uuid_compat import uuid7 class Migration(migrations.Migration): - dependencies = [ - ('core', '0029_migrate_archiveresult_to_uuid_pk'), + ("core", "0029_migrate_archiveresult_to_uuid_pk"), ] operations = [ migrations.AlterField( - model_name='archiveresult', - name='id', + model_name="archiveresult", + name="id", field=models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True), ), ] diff --git a/archivebox/core/migrations/0031_add_archiveresult_snapshot_status_index.py b/archivebox/core/migrations/0031_add_archiveresult_snapshot_status_index.py index cea2b04d..4d31b518 100644 --- a/archivebox/core/migrations/0031_add_archiveresult_snapshot_status_index.py +++ b/archivebox/core/migrations/0031_add_archiveresult_snapshot_status_index.py @@ -4,14 +4,13 @@ from django.db import migrations, models class Migration(migrations.Migration): - dependencies = [ - ('core', '0030_alter_archiveresult_id'), + ("core", "0030_alter_archiveresult_id"), ] operations = [ migrations.AddIndex( - model_name='archiveresult', - index=models.Index(fields=['snapshot', 'status'], name='archiveresult_snap_status_idx'), + model_name="archiveresult", + index=models.Index(fields=["snapshot", "status"], name="archiveresult_snap_status_idx"), ), ] diff --git a/archivebox/core/migrations/0032_remove_archiveresult_retry_at.py b/archivebox/core/migrations/0032_remove_archiveresult_retry_at.py index 4a8f74d1..78831950 100644 --- a/archivebox/core/migrations/0032_remove_archiveresult_retry_at.py +++ b/archivebox/core/migrations/0032_remove_archiveresult_retry_at.py @@ -2,7 +2,6 @@ from django.db import migrations class Migration(migrations.Migration): - dependencies = [ ("core", "0031_add_archiveresult_snapshot_status_index"), ] diff --git a/archivebox/core/migrations/0033_alter_archiveresult_status.py b/archivebox/core/migrations/0033_alter_archiveresult_status.py new file mode 100644 index 00000000..8f2315cd --- /dev/null +++ b/archivebox/core/migrations/0033_alter_archiveresult_status.py @@ -0,0 +1,28 @@ +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("core", "0032_remove_archiveresult_retry_at"), + ] + + operations = [ + migrations.AlterField( + model_name="archiveresult", + name="status", + field=models.CharField( + choices=[ + ("queued", "Queued"), + ("started", "Started"), + ("backoff", "Waiting to retry"), + ("succeeded", "Succeeded"), + ("failed", "Failed"), + ("skipped", "Skipped"), + ("noresults", "No Results"), + ], + db_index=True, + default="queued", + max_length=16, + ), + ), + ] diff --git a/archivebox/core/models.py b/archivebox/core/models.py index 7f33bf0a..7994540a 100755 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -1,6 +1,7 @@ -__package__ = 'archivebox.core' +__package__ = "archivebox.core" -from typing import Optional, Dict, Iterable, Any, List, Sequence, cast +from typing import Optional, Any, cast +from collections.abc import Iterable, Sequence import uuid from archivebox.uuid_compat import uuid7 from datetime import datetime, timedelta @@ -26,11 +27,16 @@ from archivebox.config import CONSTANTS from archivebox.misc.system import get_dir_size, atomic_write from archivebox.misc.util import parse_date, domain as url_domain, to_json, ts_to_date_str, urlencode, htmlencode, urldecode from archivebox.hooks import ( - get_plugins, get_plugin_name, get_plugin_icon, + get_plugins, + get_plugin_name, + get_plugin_icon, ) from archivebox.base_models.models import ( - ModelWithUUID, ModelWithOutputDir, - ModelWithConfig, ModelWithNotes, ModelWithHealthStats, + ModelWithUUID, + ModelWithOutputDir, + ModelWithConfig, + ModelWithNotes, + ModelWithHealthStats, get_or_create_system_user_pk, ) from archivebox.workers.models import ModelWithStateMachine, BaseStateMachine @@ -39,21 +45,26 @@ from archivebox.crawls.models import Crawl from archivebox.machine.models import Binary - class Tag(ModelWithUUID): # Keep AutoField for compatibility with main branch migrations # Don't use UUIDField here - requires complex FK transformation - id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID') - created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=True, related_name='tag_set') + id = models.AutoField(primary_key=True, serialize=False, verbose_name="ID") + created_by = models.ForeignKey( + settings.AUTH_USER_MODEL, + on_delete=models.CASCADE, + default=get_or_create_system_user_pk, + null=True, + related_name="tag_set", + ) created_at = models.DateTimeField(default=timezone.now, db_index=True, null=True) modified_at = models.DateTimeField(auto_now=True) name = models.CharField(unique=True, blank=False, max_length=100) slug = models.SlugField(unique=True, blank=False, max_length=100, editable=False) - snapshot_set: models.Manager['Snapshot'] + snapshot_set: models.Manager["Snapshot"] class Meta(ModelWithUUID.Meta): - app_label = 'core' + app_label = "core" verbose_name = "Tag" verbose_name_plural = "Tags" @@ -61,7 +72,7 @@ class Tag(ModelWithUUID): return self.name def _generate_unique_slug(self) -> str: - base_slug = slugify(self.name) or 'tag' + base_slug = slugify(self.name) or "tag" existing = Tag.objects.filter(slug__startswith=base_slug) if self.pk: existing = existing.exclude(pk=self.pk) @@ -77,7 +88,7 @@ class Tag(ModelWithUUID): def save(self, *args, **kwargs): existing_name = None if self.pk: - existing_name = Tag.objects.filter(pk=self.pk).values_list('name', flat=True).first() + existing_name = Tag.objects.filter(pk=self.pk).values_list("name", flat=True).first() if not self.slug or existing_name != self.name: self.slug = self._generate_unique_slug() @@ -98,23 +109,24 @@ class Tag(ModelWithUUID): @property def api_url(self) -> str: - return str(reverse_lazy('api-1:get_tag', args=[self.id])) + return str(reverse_lazy("api-1:get_tag", args=[self.id])) def to_json(self) -> dict: """ Convert Tag model instance to a JSON-serializable dict. """ from archivebox.config import VERSION + return { - 'type': 'Tag', - 'schema_version': VERSION, - 'id': str(self.id), - 'name': self.name, - 'slug': self.slug, + "type": "Tag", + "schema_version": VERSION, + "id": str(self.id), + "name": self.name, + "slug": self.slug, } @staticmethod - def from_json(record: Dict[str, Any], overrides: Dict[str, Any] | None = None): + def from_json(record: dict[str, Any], overrides: dict[str, Any] | None = None): """ Create/update Tag from JSON dict. @@ -125,28 +137,28 @@ class Tag(ModelWithUUID): Returns: Tag instance or None """ - name = record.get('name') + name = record.get("name") if not name: return None tag, _ = Tag.objects.get_or_create(name=name) # Auto-attach to snapshot if in overrides - if overrides and 'snapshot' in overrides and tag: - overrides['snapshot'].tags.add(tag) + if overrides and "snapshot" in overrides and tag: + overrides["snapshot"].tags.add(tag) return tag class SnapshotTag(models.Model): id = models.AutoField(primary_key=True) - snapshot = models.ForeignKey('Snapshot', db_column='snapshot_id', on_delete=models.CASCADE, to_field='id') - tag = models.ForeignKey(Tag, db_column='tag_id', on_delete=models.CASCADE, to_field='id') + snapshot = models.ForeignKey("Snapshot", db_column="snapshot_id", on_delete=models.CASCADE, to_field="id") + tag = models.ForeignKey(Tag, db_column="tag_id", on_delete=models.CASCADE, to_field="id") class Meta: - app_label = 'core' - db_table = 'core_snapshot_tags' - unique_together = [('snapshot', 'tag')] + app_label = "core" + db_table = "core_snapshot_tags" + unique_together = [("snapshot", "tag")] class SnapshotQuerySet(models.QuerySet): @@ -157,15 +169,19 @@ class SnapshotQuerySet(models.QuerySet): # ========================================================================= FILTER_TYPES = { - 'exact': lambda pattern: models.Q(url=pattern), - 'substring': lambda pattern: models.Q(url__icontains=pattern), - 'regex': lambda pattern: models.Q(url__iregex=pattern), - 'domain': lambda pattern: models.Q(url__istartswith=f"http://{pattern}") | models.Q(url__istartswith=f"https://{pattern}") | models.Q(url__istartswith=f"ftp://{pattern}"), - 'tag': lambda pattern: models.Q(tags__name=pattern), - 'timestamp': lambda pattern: models.Q(timestamp=pattern), + "exact": lambda pattern: models.Q(url=pattern), + "substring": lambda pattern: models.Q(url__icontains=pattern), + "regex": lambda pattern: models.Q(url__iregex=pattern), + "domain": lambda pattern: ( + models.Q(url__istartswith=f"http://{pattern}") + | models.Q(url__istartswith=f"https://{pattern}") + | models.Q(url__istartswith=f"ftp://{pattern}") + ), + "tag": lambda pattern: models.Q(tags__name=pattern), + "timestamp": lambda pattern: models.Q(timestamp=pattern), } - def filter_by_patterns(self, patterns: List[str], filter_type: str = 'exact') -> 'SnapshotQuerySet': + def filter_by_patterns(self, patterns: list[str], filter_type: str = "exact") -> "SnapshotQuerySet": """Filter snapshots by URL patterns using specified filter type""" from archivebox.misc.logging import stderr @@ -175,12 +191,12 @@ class SnapshotQuerySet(models.QuerySet): q_filter = q_filter | self.FILTER_TYPES[filter_type](pattern) except KeyError: stderr() - stderr(f'[X] Got invalid pattern for --filter-type={filter_type}:', color='red') - stderr(f' {pattern}') + stderr(f"[X] Got invalid pattern for --filter-type={filter_type}:", color="red") + stderr(f" {pattern}") raise SystemExit(2) return self.filter(q_filter) - def search(self, patterns: List[str]) -> 'SnapshotQuerySet': + def search(self, patterns: list[str]) -> "SnapshotQuerySet": """Search snapshots using the configured search backend""" from archivebox.config.common import SEARCH_BACKEND_CONFIG from archivebox.search import query_search_index @@ -188,7 +204,7 @@ class SnapshotQuerySet(models.QuerySet): if not SEARCH_BACKEND_CONFIG.USE_SEARCHING_BACKEND: stderr() - stderr('[X] The search backend is not enabled, set config.USE_SEARCHING_BACKEND = True', color='red') + stderr("[X] The search backend is not enabled, set config.USE_SEARCHING_BACKEND = True", color="red") raise SystemExit(2) qsearch = self.none() @@ -210,42 +226,46 @@ class SnapshotQuerySet(models.QuerySet): from archivebox.config import VERSION from archivebox.config.common import SERVER_CONFIG - MAIN_INDEX_HEADER = { - 'info': 'This is an index of site data archived by ArchiveBox: The self-hosted web archive.', - 'schema': 'archivebox.index.json', - 'copyright_info': SERVER_CONFIG.FOOTER_INFO, - 'meta': { - 'project': 'ArchiveBox', - 'version': VERSION, - 'git_sha': VERSION, - 'website': 'https://ArchiveBox.io', - 'docs': 'https://github.com/ArchiveBox/ArchiveBox/wiki', - 'source': 'https://github.com/ArchiveBox/ArchiveBox', - 'issues': 'https://github.com/ArchiveBox/ArchiveBox/issues', - 'dependencies': {}, - }, - } if with_headers else {} + MAIN_INDEX_HEADER = ( + { + "info": "This is an index of site data archived by ArchiveBox: The self-hosted web archive.", + "schema": "archivebox.index.json", + "copyright_info": SERVER_CONFIG.FOOTER_INFO, + "meta": { + "project": "ArchiveBox", + "version": VERSION, + "git_sha": VERSION, + "website": "https://ArchiveBox.io", + "docs": "https://github.com/ArchiveBox/ArchiveBox/wiki", + "source": "https://github.com/ArchiveBox/ArchiveBox", + "issues": "https://github.com/ArchiveBox/ArchiveBox/issues", + "dependencies": {}, + }, + } + if with_headers + else {} + ) snapshot_dicts = [s.to_dict(extended=True) for s in self.iterator(chunk_size=500)] if with_headers: output = { **MAIN_INDEX_HEADER, - 'num_links': len(snapshot_dicts), - 'updated': datetime.now(tz.utc), - 'last_run_cmd': sys.argv, - 'links': snapshot_dicts, + "num_links": len(snapshot_dicts), + "updated": datetime.now(tz.utc), + "last_run_cmd": sys.argv, + "links": snapshot_dicts, } else: output = snapshot_dicts return to_json(output, indent=4, sort_keys=True) - def to_csv(self, cols: Optional[List[str]] = None, header: bool = True, separator: str = ',', ljust: int = 0) -> str: + def to_csv(self, cols: list[str] | None = None, header: bool = True, separator: str = ",", ljust: int = 0) -> str: """Generate CSV output from snapshots""" - cols = cols or ['timestamp', 'is_archived', 'url'] - header_str = separator.join(col.ljust(ljust) for col in cols) if header else '' + cols = cols or ["timestamp", "is_archived", "url"] + header_str = separator.join(col.ljust(ljust) for col in cols) if header else "" row_strs = (s.to_csv(cols=cols, ljust=ljust, separator=separator) for s in self.iterator(chunk_size=500)) - return '\n'.join((header_str, *row_strs)) + return "\n".join((header_str, *row_strs)) def to_html(self, with_headers: bool = True) -> str: """Generate main index HTML from snapshots""" @@ -255,28 +275,31 @@ class SnapshotQuerySet(models.QuerySet): from archivebox.config.common import SERVER_CONFIG from archivebox.config.version import get_COMMIT_HASH - template = 'static_index.html' if with_headers else 'minimal_index.html' + template = "static_index.html" if with_headers else "minimal_index.html" snapshot_list = list(self.iterator(chunk_size=500)) - return render_to_string(template, { - 'version': VERSION, - 'git_sha': get_COMMIT_HASH() or VERSION, - 'num_links': str(len(snapshot_list)), - 'date_updated': datetime.now(tz.utc).strftime('%Y-%m-%d'), - 'time_updated': datetime.now(tz.utc).strftime('%Y-%m-%d %H:%M'), - 'links': snapshot_list, - 'FOOTER_INFO': SERVER_CONFIG.FOOTER_INFO, - }) + return render_to_string( + template, + { + "version": VERSION, + "git_sha": get_COMMIT_HASH() or VERSION, + "num_links": str(len(snapshot_list)), + "date_updated": datetime.now(tz.utc).strftime("%Y-%m-%d"), + "time_updated": datetime.now(tz.utc).strftime("%Y-%m-%d %H:%M"), + "links": snapshot_list, + "FOOTER_INFO": SERVER_CONFIG.FOOTER_INFO, + }, + ) class SnapshotManager(models.Manager.from_queryset(SnapshotQuerySet)): # ty: ignore[unsupported-base] """Manager for Snapshot model - uses SnapshotQuerySet for chainable methods""" def filter(self, *args, **kwargs): - domain = kwargs.pop('domain', None) + domain = kwargs.pop("domain", None) qs = super().filter(*args, **kwargs) if domain: - qs = qs.filter(url__icontains=f'://{domain}') + qs = qs.filter(url__icontains=f"://{domain}") return qs def get_queryset(self): @@ -291,6 +314,7 @@ class SnapshotManager(models.Manager.from_queryset(SnapshotQuerySet)): # ty: ig def remove(self, atomic: bool = False) -> tuple: """Remove snapshots from the database""" from django.db import transaction + if atomic: with transaction.atomic(): return self.get_queryset().delete() @@ -305,26 +329,45 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea url = models.URLField(unique=False, db_index=True) # URLs can appear in multiple crawls timestamp = models.CharField(max_length=32, unique=True, db_index=True, editable=False) bookmarked_at = models.DateTimeField(default=timezone.now, db_index=True) - crawl: Crawl = models.ForeignKey(Crawl, on_delete=models.CASCADE, null=False, related_name='snapshot_set', db_index=True) # type: ignore[assignment] - parent_snapshot = models.ForeignKey('self', on_delete=models.SET_NULL, null=True, blank=True, related_name='child_snapshots', db_index=True, help_text='Parent snapshot that discovered this URL (for recursive crawling)') + crawl: Crawl = models.ForeignKey(Crawl, on_delete=models.CASCADE, null=False, related_name="snapshot_set", db_index=True) # type: ignore[assignment] + parent_snapshot = models.ForeignKey( + "self", + on_delete=models.SET_NULL, + null=True, + blank=True, + related_name="child_snapshots", + db_index=True, + help_text="Parent snapshot that discovered this URL (for recursive crawling)", + ) title = models.CharField(max_length=512, null=True, blank=True, db_index=True) downloaded_at = models.DateTimeField(default=None, null=True, editable=False, db_index=True, blank=True) depth = models.PositiveSmallIntegerField(default=0, db_index=True) # 0 for root snapshot, 1+ for discovered URLs - fs_version = models.CharField(max_length=10, default='0.9.0', help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().') - current_step = models.PositiveSmallIntegerField(default=0, db_index=True, help_text='Current hook step being executed (0-9). Used for sequential hook execution.') + fs_version = models.CharField( + max_length=10, + default="0.9.0", + help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().', + ) + current_step = models.PositiveSmallIntegerField( + default=0, + db_index=True, + help_text="Current hook step being executed (0-9). Used for sequential hook execution.", + ) retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now) - status = ModelWithStateMachine.StatusField(choices=ModelWithStateMachine.StatusChoices, default=ModelWithStateMachine.StatusChoices.QUEUED) + status = ModelWithStateMachine.StatusField( + choices=ModelWithStateMachine.StatusChoices, + default=ModelWithStateMachine.StatusChoices.QUEUED, + ) config = models.JSONField(default=dict, null=False, blank=False, editable=True) - notes = models.TextField(blank=True, null=False, default='') + notes = models.TextField(blank=True, null=False, default="") # output_dir is computed via @cached_property from fs_version and get_storage_path_for_version() - tags = models.ManyToManyField(Tag, blank=True, through=SnapshotTag, related_name='snapshot_set', through_fields=('snapshot', 'tag')) + tags = models.ManyToManyField(Tag, blank=True, through=SnapshotTag, related_name="snapshot_set", through_fields=("snapshot", "tag")) - state_machine_name = 'archivebox.core.models.SnapshotMachine' - state_field_name = 'status' - retry_at_field_name = 'retry_at' + state_machine_name = "archivebox.core.models.SnapshotMachine" + state_field_name = "status" + retry_at_field_name = "retry_at" StatusChoices = ModelWithStateMachine.StatusChoices active_state = StatusChoices.STARTED @@ -333,7 +376,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea _prefetched_objects_cache: dict[str, Any] objects = SnapshotManager() - archiveresult_set: models.Manager['ArchiveResult'] + archiveresult_set: models.Manager["ArchiveResult"] class Meta( ModelWithOutputDir.Meta, @@ -342,18 +385,18 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea ModelWithHealthStats.Meta, ModelWithStateMachine.Meta, ): - app_label = 'core' + app_label = "core" verbose_name = "Snapshot" verbose_name_plural = "Snapshots" constraints = [ # Allow same URL in different crawls, but not duplicates within same crawl - models.UniqueConstraint(fields=['url', 'crawl'], name='unique_url_per_crawl'), + models.UniqueConstraint(fields=["url", "crawl"], name="unique_url_per_crawl"), # Global timestamp uniqueness for 1:1 symlink mapping - models.UniqueConstraint(fields=['timestamp'], name='unique_timestamp'), + models.UniqueConstraint(fields=["timestamp"], name="unique_timestamp"), ] def __str__(self): - return f'[{self.id}] {self.url[:64]}' + return f"[{self.id}] {self.url[:64]}" @property def created_by(self): @@ -364,12 +407,14 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea def process_set(self): """Get all Process objects related to this snapshot's ArchiveResults.""" from archivebox.machine.models import Process + return Process.objects.filter(archiveresult__snapshot_id=self.id) @property def binary_set(self): """Get all Binary objects used by processes related to this snapshot.""" from archivebox.machine.models import Binary + return Binary.objects.filter(process_set__archiveresult__snapshot_id=self.id).distinct() def save(self, *args, **kwargs): @@ -380,14 +425,16 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea # Migrate filesystem if needed (happens automatically on save) if self.pk and self.fs_migration_needed: - print(f"[DEBUG save()] Triggering filesystem migration for {str(self.id)[:8]}: {self.fs_version} → {self._fs_current_version()}") + print( + f"[DEBUG save()] Triggering filesystem migration for {str(self.id)[:8]}: {self.fs_version} → {self._fs_current_version()}", + ) # Walk through migration chain automatically current = self.fs_version target = self._fs_current_version() while current != target: next_ver = self._fs_next_version(current) - method = f'_fs_migrate_from_{current.replace(".", "_")}_to_{next_ver.replace(".", "_")}' + method = f"_fs_migrate_from_{current.replace('.', '_')}_to_{next_ver.replace('.', '_')}" # Only run if method exists (most are no-ops) if hasattr(self, method): @@ -403,7 +450,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea self.ensure_legacy_archive_symlink() existing_urls = {url for _raw_line, url in self.crawl._iter_url_lines() if url} if self.crawl.url_passes_filters(self.url, snapshot=self) and self.url not in existing_urls: - self.crawl.urls += f'\n{self.url}' + self.crawl.urls += f"\n{self.url}" self.crawl.save() # if is_new: @@ -429,14 +476,15 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea def _fs_current_version() -> str: """Get current ArchiveBox filesystem version (normalized to x.x.0 format)""" from archivebox.config import VERSION + # Normalize version to x.x.0 format (e.g., "0.9.0rc1" -> "0.9.0") - parts = VERSION.split('.') + parts = VERSION.split(".") if len(parts) >= 2: major, minor = parts[0], parts[1] # Strip any non-numeric suffix from minor version - minor = ''.join(c for c in minor if c.isdigit()) - return f'{major}.{minor}.0' - return '0.9.0' # Fallback if version parsing fails + minor = "".join(c for c in minor if c.isdigit()) + return f"{major}.{minor}.0" + return "0.9.0" # Fallback if version parsing fails @property def fs_migration_needed(self) -> bool: @@ -446,8 +494,8 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea def _fs_next_version(self, version: str) -> str: """Get next version in migration chain (0.7/0.8 had same layout, only 0.8→0.9 migration needed)""" # Treat 0.7.0 and 0.8.0 as equivalent (both used archive/{timestamp}) - if version in ('0.7.0', '0.8.0'): - return '0.9.0' + if version in ("0.7.0", "0.8.0"): + return "0.9.0" return self._fs_current_version() def _fs_migrate_from_0_8_0_to_0_9_0(self): @@ -468,10 +516,12 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea import shutil from django.db import transaction - old_dir = self.get_storage_path_for_version('0.8.0') - new_dir = self.get_storage_path_for_version('0.9.0') + old_dir = self.get_storage_path_for_version("0.8.0") + new_dir = self.get_storage_path_for_version("0.9.0") - print(f"[DEBUG _fs_migrate] {self.timestamp}: old_exists={old_dir.exists()}, same={old_dir == new_dir}, new_exists={new_dir.exists()}") + print( + f"[DEBUG _fs_migrate] {self.timestamp}: old_exists={old_dir.exists()}, same={old_dir == new_dir}, new_exists={new_dir.exists()}", + ) if not old_dir.exists() or old_dir == new_dir: # No migration needed @@ -487,7 +537,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea new_dir.mkdir(parents=True, exist_ok=True) # Copy all files (idempotent), skipping index.json (will be converted to jsonl) - for old_file in old_dir.rglob('*'): + for old_file in old_dir.rglob("*"): if not old_file.is_file(): continue @@ -502,10 +552,8 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea shutil.copy2(old_file, new_file) # Verify all copied - old_files = {f.relative_to(old_dir): f.stat().st_size - for f in old_dir.rglob('*') if f.is_file()} - new_files = {f.relative_to(new_dir): f.stat().st_size - for f in new_dir.rglob('*') if f.is_file()} + old_files = {f.relative_to(old_dir): f.stat().st_size for f in old_dir.rglob("*") if f.is_file()} + new_files = {f.relative_to(new_dir): f.stat().st_size for f in new_dir.rglob("*") if f.is_file()} if old_files.keys() != new_files.keys(): missing = old_files.keys() - new_files.keys() @@ -533,8 +581,8 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea try: shutil.rmtree(old_dir) except Exception as e: - logging.getLogger('archivebox.migration').warning( - f"Could not remove old migration directory {old_dir}: {e}" + logging.getLogger("archivebox.migration").warning( + f"Could not remove old migration directory {old_dir}: {e}", ) return # Don't create symlink if cleanup failed @@ -547,8 +595,8 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea try: symlink_path.symlink_to(new_dir, target_is_directory=True) except Exception as e: - logging.getLogger('archivebox.migration').warning( - f"Could not create symlink from {symlink_path} to {new_dir}: {e}" + logging.getLogger("archivebox.migration").warning( + f"Could not create symlink from {symlink_path} to {new_dir}: {e}", ) # ========================================================================= @@ -572,18 +620,18 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea try: parsed = urlparse(url) - if parsed.scheme in ('http', 'https'): + if parsed.scheme in ("http", "https"): if parsed.port: - return f"{parsed.hostname}_{parsed.port}".replace(':', '_') - return parsed.hostname or 'unknown' - elif parsed.scheme == 'file': - return 'localhost' + return f"{parsed.hostname}_{parsed.port}".replace(":", "_") + return parsed.hostname or "unknown" + elif parsed.scheme == "file": + return "localhost" elif parsed.scheme: return parsed.scheme else: - return 'unknown' + return "unknown" except Exception: - return 'unknown' + return "unknown" def get_storage_path_for_version(self, version: str) -> Path: """ @@ -595,24 +643,21 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea """ from datetime import datetime - if version in ('0.7.0', '0.8.0'): + if version in ("0.7.0", "0.8.0"): return CONSTANTS.ARCHIVE_DIR / self.timestamp - elif version in ('0.9.0', '1.0.0'): + elif version in ("0.9.0", "1.0.0"): username = self.created_by.username # Use created_at for date grouping (fallback to timestamp) if self.created_at: - date_str = self.created_at.strftime('%Y%m%d') + date_str = self.created_at.strftime("%Y%m%d") else: - date_str = datetime.fromtimestamp(float(self.timestamp)).strftime('%Y%m%d') + date_str = datetime.fromtimestamp(float(self.timestamp)).strftime("%Y%m%d") domain = self.extract_domain_from_url(self.url) - return ( - CONSTANTS.DATA_DIR / 'users' / username / 'snapshots' - / date_str / domain / str(self.id) - ) + return CONSTANTS.DATA_DIR / "users" / username / "snapshots" / date_str / domain / str(self.id) else: # Unknown version - use current return self.get_storage_path_for_version(self._fs_current_version()) @@ -622,7 +667,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea # ========================================================================= @classmethod - def load_from_directory(cls, snapshot_dir: Path) -> Optional['Snapshot']: + def load_from_directory(cls, snapshot_dir: Path) -> Optional["Snapshot"]: """ Load existing Snapshot from DB by reading index.jsonl or index.json. @@ -643,7 +688,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea try: records = Process.parse_records_from_text(jsonl_path.read_text()) for record in records: - if record.get('type') == 'Snapshot': + if record.get("type") == "Snapshot": data = record break except OSError: @@ -658,14 +703,14 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea if not data: return None - url = data.get('url') + url = data.get("url") if not url: return None # Get timestamp - prefer index file, fallback to folder name timestamp = cls._select_best_timestamp( - index_timestamp=data.get('timestamp'), - folder_name=snapshot_dir.name + index_timestamp=data.get("timestamp"), + folder_name=snapshot_dir.name, ) if not timestamp: @@ -698,7 +743,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea return cls.objects.filter(url=url, timestamp=timestamp).first() @classmethod - def create_from_directory(cls, snapshot_dir: Path) -> Optional['Snapshot']: + def create_from_directory(cls, snapshot_dir: Path) -> Optional["Snapshot"]: """ Create new Snapshot from orphaned directory. @@ -718,7 +763,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea try: records = Process.parse_records_from_text(jsonl_path.read_text()) for record in records: - if record.get('type') == 'Snapshot': + if record.get("type") == "Snapshot": data = record break except OSError: @@ -733,14 +778,14 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea if not data: return None - url = data.get('url') + url = data.get("url") if not url: return None # Get and validate timestamp timestamp = cls._select_best_timestamp( - index_timestamp=data.get('timestamp'), - folder_name=snapshot_dir.name + index_timestamp=data.get("timestamp"), + folder_name=snapshot_dir.name, ) if not timestamp: @@ -754,32 +799,34 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea # Get or create catchall crawl for orphaned snapshots from archivebox.crawls.models import Crawl + system_user_id = get_or_create_system_user_pk() catchall_crawl, _ = Crawl.objects.get_or_create( - label='[migration] orphaned snapshots', + label="[migration] orphaned snapshots", defaults={ - 'urls': f'# Orphaned snapshot: {url}', - 'max_depth': 0, - 'created_by_id': system_user_id, - } + "urls": f"# Orphaned snapshot: {url}", + "max_depth": 0, + "created_by_id": system_user_id, + }, ) return cls( url=url, timestamp=timestamp, - title=data.get('title', ''), + title=data.get("title", ""), fs_version=fs_version, crawl=catchall_crawl, ) @staticmethod - def _select_best_timestamp(index_timestamp: object | None, folder_name: str) -> Optional[str]: + def _select_best_timestamp(index_timestamp: object | None, folder_name: str) -> str | None: """ Select best timestamp from index.json vs folder name. Validates range (1995-2035). Prefers index.json if valid. """ + def is_valid_timestamp(ts: object | None) -> bool: if not isinstance(ts, (str, int, float)): return False @@ -822,13 +869,13 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea - Has archive_results list: 0.8.0 - Default: 0.7.0 """ - if 'fs_version' in data: - return data['fs_version'] - if 'history' in data and 'archive_results' not in data: - return '0.7.0' - if 'archive_results' in data: - return '0.8.0' - return '0.7.0' + if "fs_version" in data: + return data["fs_version"] + if "history" in data and "archive_results" not in data: + return "0.7.0" + if "archive_results" in data: + return "0.8.0" + return "0.7.0" # ========================================================================= # Index.json Reconciliation @@ -860,10 +907,10 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea if jsonl_path.exists(): # Read from JSONL format jsonl_data = self.read_index_jsonl() - if jsonl_data['snapshot']: - index_data = jsonl_data['snapshot'] + if jsonl_data["snapshot"]: + index_data = jsonl_data["snapshot"] # Convert archive_results list to expected format - index_data['archive_results'] = jsonl_data['archive_results'] + index_data["archive_results"] = jsonl_data["archive_results"] elif json_path.exists(): # Fallback to legacy JSON format try: @@ -890,8 +937,8 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea def _merge_title_from_index(self, index_data: dict): """Merge title - prefer longest non-URL title.""" - index_title = index_data.get('title', '').strip() - db_title = self.title or '' + index_title = (index_data.get("title") or "").strip() + db_title = self.title or "" candidates = [t for t in [index_title, db_title] if t and t != self.url] if candidates: @@ -903,10 +950,10 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea """Merge tags - union of both sources.""" from django.db import transaction - index_tags = set(index_data.get('tags', '').split(',')) if index_data.get('tags') else set() + index_tags = set(index_data.get("tags", "").split(",")) if index_data.get("tags") else set() index_tags = {t.strip() for t in index_tags if t.strip()} - db_tags = set(self.tags.values_list('name', flat=True)) + db_tags = set(self.tags.values_list("name", flat=True)) new_tags = index_tags - db_tags if new_tags: @@ -917,22 +964,19 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea def _merge_archive_results_from_index(self, index_data: dict): """Merge ArchiveResults - keep both (by plugin+start_ts).""" - existing = { - (ar.plugin, ar.start_ts): ar - for ar in ArchiveResult.objects.filter(snapshot=self) - } + existing = {(ar.plugin, ar.start_ts): ar for ar in ArchiveResult.objects.filter(snapshot=self)} # Handle 0.8.x format (archive_results list) - for result_data in index_data.get('archive_results', []): + for result_data in index_data.get("archive_results", []): self._create_archive_result_if_missing(result_data, existing) # Handle 0.7.x format (history dict) - if 'history' in index_data and isinstance(index_data['history'], dict): - for plugin, result_list in index_data['history'].items(): + if "history" in index_data and isinstance(index_data["history"], dict): + for plugin, result_list in index_data["history"].items(): if isinstance(result_list, list): for result_data in result_list: # Support both old 'extractor' and new 'plugin' keys for backwards compat - result_data['plugin'] = result_data.get('plugin') or result_data.get('extractor') or plugin + result_data["plugin"] = result_data.get("plugin") or result_data.get("extractor") or plugin self._create_archive_result_if_missing(result_data, existing) def _create_archive_result_if_missing(self, result_data: dict, existing: dict): @@ -940,14 +984,14 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea from dateutil import parser # Support both old 'extractor' and new 'plugin' keys for backwards compat - plugin = result_data.get('plugin') or result_data.get('extractor', '') + plugin = result_data.get("plugin") or result_data.get("extractor", "") if not plugin: return start_ts = None - if result_data.get('start_ts'): + if result_data.get("start_ts"): try: - start_ts = parser.parse(result_data['start_ts']) + start_ts = parser.parse(result_data["start_ts"]) except (TypeError, ValueError, OverflowError): pass @@ -956,23 +1000,23 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea try: end_ts = None - if result_data.get('end_ts'): + if result_data.get("end_ts"): try: - end_ts = parser.parse(result_data['end_ts']) + end_ts = parser.parse(result_data["end_ts"]) except (TypeError, ValueError, OverflowError): pass # Support both 'output' (legacy) and 'output_str' (new JSONL) field names - output_str = result_data.get('output_str') or result_data.get('output', '') + output_str = result_data.get("output_str") or result_data.get("output", "") ArchiveResult.objects.create( snapshot=self, plugin=plugin, - hook_name=result_data.get('hook_name', ''), - status=result_data.get('status', 'failed'), + hook_name=result_data.get("hook_name", ""), + status=result_data.get("status", "failed"), output_str=output_str, - cmd=result_data.get('cmd', []), - pwd=result_data.get('pwd', str(self.output_dir)), + cmd=result_data.get("cmd", []), + pwd=result_data.get("pwd", str(self.output_dir)), start_ts=start_ts, end_ts=end_ts, ) @@ -983,32 +1027,32 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea """Write index.json in 0.9.x format (deprecated, use write_index_jsonl).""" import json - index_path = Path(self.output_dir) / 'index.json' + index_path = Path(self.output_dir) / "index.json" data = { - 'url': self.url, - 'timestamp': self.timestamp, - 'title': self.title or '', - 'tags': ','.join(sorted(self.tags.values_list('name', flat=True))), - 'fs_version': self.fs_version, - 'bookmarked_at': self.bookmarked_at.isoformat() if self.bookmarked_at else None, - 'created_at': self.created_at.isoformat() if self.created_at else None, - 'archive_results': [ + "url": self.url, + "timestamp": self.timestamp, + "title": self.title or "", + "tags": ",".join(sorted(self.tags.values_list("name", flat=True))), + "fs_version": self.fs_version, + "bookmarked_at": self.bookmarked_at.isoformat() if self.bookmarked_at else None, + "created_at": self.created_at.isoformat() if self.created_at else None, + "archive_results": [ { - 'plugin': ar.plugin, - 'status': ar.status, - 'start_ts': ar.start_ts.isoformat() if ar.start_ts else None, - 'end_ts': ar.end_ts.isoformat() if ar.end_ts else None, - 'output': ar.output_str or '', - 'cmd': ar.cmd if isinstance(ar.cmd, list) else [], - 'pwd': ar.pwd, + "plugin": ar.plugin, + "status": ar.status, + "start_ts": ar.start_ts.isoformat() if ar.start_ts else None, + "end_ts": ar.end_ts.isoformat() if ar.end_ts else None, + "output": ar.output_str or "", + "cmd": ar.cmd if isinstance(ar.cmd, list) else [], + "pwd": ar.pwd, } - for ar in ArchiveResult.objects.filter(snapshot=self).order_by('start_ts') + for ar in ArchiveResult.objects.filter(snapshot=self).order_by("start_ts") ], } index_path.parent.mkdir(parents=True, exist_ok=True) - with open(index_path, 'w') as f: + with open(index_path, "w") as f: json.dump(data, f, indent=2, sort_keys=True) def write_index_jsonl(self): @@ -1030,25 +1074,25 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea binaries_seen = set() processes_seen = set() - with open(index_path, 'w') as f: + with open(index_path, "w") as f: # Write Snapshot record first (to_json includes crawl_id, fs_version) - f.write(json.dumps(self.to_json()) + '\n') + f.write(json.dumps(self.to_json()) + "\n") # Write ArchiveResult records with their associated Binary and Process # Use select_related to optimize queries - for ar in self.archiveresult_set.select_related('process__binary').order_by('start_ts'): + for ar in self.archiveresult_set.select_related("process__binary").order_by("start_ts"): # Write Binary record if not already written if ar.process and ar.process.binary and ar.process.binary_id not in binaries_seen: binaries_seen.add(ar.process.binary_id) - f.write(json.dumps(ar.process.binary.to_json()) + '\n') + f.write(json.dumps(ar.process.binary.to_json()) + "\n") # Write Process record if not already written if ar.process and ar.process_id not in processes_seen: processes_seen.add(ar.process_id) - f.write(json.dumps(ar.process.to_json()) + '\n') + f.write(json.dumps(ar.process.to_json()) + "\n") # Write ArchiveResult record - f.write(json.dumps(ar.to_json()) + '\n') + f.write(json.dumps(ar.to_json()) + "\n") def read_index_jsonl(self) -> dict: """ @@ -1058,15 +1102,18 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea """ from archivebox.machine.models import Process from archivebox.misc.jsonl import ( - TYPE_SNAPSHOT, TYPE_ARCHIVERESULT, TYPE_BINARY, TYPE_PROCESS, + TYPE_SNAPSHOT, + TYPE_ARCHIVERESULT, + TYPE_BINARY, + TYPE_PROCESS, ) index_path = Path(self.output_dir) / CONSTANTS.JSONL_INDEX_FILENAME result: dict[str, Any] = { - 'snapshot': None, - 'archive_results': [], - 'binaries': [], - 'processes': [], + "snapshot": None, + "archive_results": [], + "binaries": [], + "processes": [], } if not index_path.exists(): @@ -1074,15 +1121,15 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea records = Process.parse_records_from_text(index_path.read_text()) for record in records: - record_type = record.get('type') + record_type = record.get("type") if record_type == TYPE_SNAPSHOT: - result['snapshot'] = record + result["snapshot"] = record elif record_type == TYPE_ARCHIVERESULT: - result['archive_results'].append(record) + result["archive_results"].append(record) elif record_type == TYPE_BINARY: - result['binaries'].append(record) + result["binaries"].append(record) elif record_type == TYPE_PROCESS: - result['processes'].append(record) + result["processes"].append(record) return result @@ -1103,64 +1150,64 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea return False try: - with open(json_path, 'r') as f: + with open(json_path) as f: data = json.load(f) except (json.JSONDecodeError, OSError): return False # Detect format version and extract records - fs_version = data.get('fs_version', '0.7.0') + fs_version = data.get("fs_version", "0.7.0") jsonl_path.parent.mkdir(parents=True, exist_ok=True) - with open(jsonl_path, 'w') as f: + with open(jsonl_path, "w") as f: # Write Snapshot record snapshot_record = { - 'type': 'Snapshot', - 'id': str(self.id), - 'crawl_id': str(self.crawl_id) if self.crawl_id else None, - 'url': data.get('url', self.url), - 'timestamp': data.get('timestamp', self.timestamp), - 'title': data.get('title', self.title or ''), - 'tags': data.get('tags', ''), - 'fs_version': fs_version, - 'bookmarked_at': data.get('bookmarked_at'), - 'created_at': data.get('created_at'), + "type": "Snapshot", + "id": str(self.id), + "crawl_id": str(self.crawl_id) if self.crawl_id else None, + "url": data.get("url", self.url), + "timestamp": data.get("timestamp", self.timestamp), + "title": data.get("title", self.title or ""), + "tags": data.get("tags", ""), + "fs_version": fs_version, + "bookmarked_at": data.get("bookmarked_at"), + "created_at": data.get("created_at"), } - f.write(json.dumps(snapshot_record) + '\n') + f.write(json.dumps(snapshot_record) + "\n") # Handle 0.8.x/0.9.x format (archive_results list) - for result_data in data.get('archive_results', []): + for result_data in data.get("archive_results", []): ar_record = { - 'type': 'ArchiveResult', - 'snapshot_id': str(self.id), - 'plugin': result_data.get('plugin', ''), - 'status': result_data.get('status', ''), - 'output_str': result_data.get('output', ''), - 'start_ts': result_data.get('start_ts'), - 'end_ts': result_data.get('end_ts'), + "type": "ArchiveResult", + "snapshot_id": str(self.id), + "plugin": result_data.get("plugin", ""), + "status": result_data.get("status", ""), + "output_str": result_data.get("output", ""), + "start_ts": result_data.get("start_ts"), + "end_ts": result_data.get("end_ts"), } - if result_data.get('cmd'): - ar_record['cmd'] = result_data['cmd'] - f.write(json.dumps(ar_record) + '\n') + if result_data.get("cmd"): + ar_record["cmd"] = result_data["cmd"] + f.write(json.dumps(ar_record) + "\n") # Handle 0.7.x format (history dict) - if 'history' in data and isinstance(data['history'], dict): - for plugin, result_list in data['history'].items(): + if "history" in data and isinstance(data["history"], dict): + for plugin, result_list in data["history"].items(): if not isinstance(result_list, list): continue for result_data in result_list: ar_record = { - 'type': 'ArchiveResult', - 'snapshot_id': str(self.id), - 'plugin': result_data.get('plugin') or result_data.get('extractor') or plugin, - 'status': result_data.get('status', ''), - 'output_str': result_data.get('output', ''), - 'start_ts': result_data.get('start_ts'), - 'end_ts': result_data.get('end_ts'), + "type": "ArchiveResult", + "snapshot_id": str(self.id), + "plugin": result_data.get("plugin") or result_data.get("extractor") or plugin, + "status": result_data.get("status", ""), + "output_str": result_data.get("output", ""), + "start_ts": result_data.get("start_ts"), + "end_ts": result_data.get("end_ts"), } - if result_data.get('cmd'): - ar_record['cmd'] = result_data['cmd'] - f.write(json.dumps(ar_record) + '\n') + if result_data.get("cmd"): + ar_record["cmd"] = result_data["cmd"] + f.write(json.dumps(ar_record) + "\n") # Remove old index.json after successful conversion try: @@ -1184,7 +1231,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea from datetime import datetime import shutil - invalid_dir = CONSTANTS.DATA_DIR / 'invalid' / datetime.now().strftime('%Y%m%d') + invalid_dir = CONSTANTS.DATA_DIR / "invalid" / datetime.now().strftime("%Y%m%d") invalid_dir.mkdir(parents=True, exist_ok=True) dest = invalid_dir / snapshot_dir.name @@ -1208,19 +1255,12 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea """ from django.db.models import Count - duplicates = ( - cls.objects - .values('url', 'timestamp') - .annotate(count=Count('id')) - .filter(count__gt=1) - ) + duplicates = cls.objects.values("url", "timestamp").annotate(count=Count("id")).filter(count__gt=1) merged = 0 for dup in duplicates.iterator(chunk_size=500): snapshots = list( - cls.objects - .filter(url=dup['url'], timestamp=dup['timestamp']) - .order_by('created_at') # Keep oldest + cls.objects.filter(url=dup["url"], timestamp=dup["timestamp"]).order_by("created_at"), # Keep oldest ) if len(snapshots) > 1: @@ -1233,7 +1273,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea return merged @classmethod - def _merge_snapshots(cls, snapshots: Sequence['Snapshot']): + def _merge_snapshots(cls, snapshots: Sequence["Snapshot"]): """ Merge exact duplicates. Keep oldest, union files + ArchiveResults. @@ -1250,7 +1290,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea # Merge files if dup_dir.exists() and dup_dir != keeper_dir: - for dup_file in dup_dir.rglob('*'): + for dup_file in dup_dir.rglob("*"): if not dup_file.is_file(): continue @@ -1282,7 +1322,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea @property def output_dir_parent(self) -> str: - return 'archive' + return "archive" @property def output_dir_name(self) -> str: @@ -1291,31 +1331,39 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea def archive(self, overwrite=False, methods=None): return bg_archive_snapshot(self, overwrite=overwrite, methods=methods) - @admin.display(description='Tags') + @admin.display(description="Tags") def tags_str(self, nocache=True) -> str | None: - calc_tags_str = lambda: ','.join(sorted(tag.name for tag in self.tags.all())) - prefetched_cache = getattr(self, '_prefetched_objects_cache', {}) - if 'tags' in prefetched_cache: + calc_tags_str = lambda: ",".join(sorted(tag.name for tag in self.tags.all())) + prefetched_cache = getattr(self, "_prefetched_objects_cache", {}) + if "tags" in prefetched_cache: return calc_tags_str() - cache_key = f'{self.pk}-tags' + cache_key = f"{self.pk}-tags" return cache.get_or_set(cache_key, calc_tags_str) if not nocache else calc_tags_str() - def icons(self, path: Optional[str] = None) -> str: + def icons(self, path: str | None = None) -> str: """Generate HTML icons showing which extractor plugins have succeeded for this snapshot""" from django.utils.html import format_html - cache_key = f'result_icons:{self.pk}:{(self.downloaded_at or self.modified_at or self.created_at or self.bookmarked_at).timestamp()}' + cache_key = ( + f"result_icons:{self.pk}:{(self.downloaded_at or self.modified_at or self.created_at or self.bookmarked_at).timestamp()}" + ) def calc_icons(): - prefetched_cache = getattr(self, '_prefetched_objects_cache', {}) - if 'archiveresult_set' in prefetched_cache: - archive_results = {r.plugin: r for r in self.archiveresult_set.all() if r.status == "succeeded" and (r.output_files or r.output_str)} + prefetched_cache = getattr(self, "_prefetched_objects_cache", {}) + if "archiveresult_set" in prefetched_cache: + archive_results = { + r.plugin: r for r in self.archiveresult_set.all() if r.status == "succeeded" and (r.output_files or r.output_str) + } else: # Filter for results that have either output_files or output_str from django.db.models import Q - archive_results = {r.plugin: r for r in self.archiveresult_set.filter( - Q(status="succeeded") & (Q(output_files__isnull=False) | ~Q(output_str='')) - )} + + archive_results = { + r.plugin: r + for r in self.archiveresult_set.filter( + Q(status="succeeded") & (Q(output_files__isnull=False) | ~Q(output_str="")), + ) + } archive_path = path or self.archive_path output = "" @@ -1326,7 +1374,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea for plugin in all_plugins: result = archive_results.get(plugin) - existing = result and result.status == 'succeeded' and (result.output_files or result.output_str) + existing = result and result.status == "succeeded" and (result.output_files or result.output_str) icon = mark_safe(get_plugin_icon(plugin)) # Skip plugins with empty icons that have no output @@ -1334,17 +1382,20 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea if not icon.strip() and not existing: continue - embed_path = result.embed_path() if result else f'{plugin}/' + embed_path = result.embed_path() if result else f"{plugin}/" output += format_html( output_template, archive_path, embed_path, str(bool(existing)), plugin, - icon + icon, ) - return format_html('{}', mark_safe(output)) + return format_html( + '{}', + mark_safe(output), + ) cache_result = cache.get(cache_key) if cache_result: @@ -1356,10 +1407,10 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea @property def api_url(self) -> str: - return str(reverse_lazy('api-1:get_snapshot', args=[self.id])) + return str(reverse_lazy("api-1:get_snapshot", args=[self.id])) def get_absolute_url(self): - return f'/{self.archive_path}' + return f"/{self.archive_path}" @cached_property def domain(self) -> str: @@ -1367,7 +1418,85 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea @property def title_stripped(self) -> str: - return (self.title or '').strip() + return (self.title or "").strip() + + @staticmethod + def _normalize_title_candidate(candidate: str | None, *, snapshot_url: str) -> str: + title = " ".join(line.strip() for line in str(candidate or "").splitlines() if line.strip()).strip() + if not title: + return "" + if title.lower() in {"pending...", "no title found"}: + return "" + if title == snapshot_url: + return "" + if title.startswith(("http://", "https://")): + return "" + if "/" in title and title.lower().endswith(".txt"): + return "" + return title + + @property + def resolved_title(self) -> str: + stored_title = self._normalize_title_candidate(self.title, snapshot_url=self.url) + if stored_title: + return stored_title + + title_result = ( + self.archiveresult_set.filter(plugin="title").exclude(output_str="").order_by("-start_ts", "-end_ts", "-created_at").first() + ) + if title_result: + result_title = self._normalize_title_candidate(title_result.output_str, snapshot_url=self.url) + if result_title: + return result_title + + title_file = self.output_dir / "title" / "title.txt" + if title_file.exists(): + try: + file_title = self._normalize_title_candidate(title_file.read_text(encoding="utf-8"), snapshot_url=self.url) + except OSError: + file_title = "" + if file_title: + return file_title + + return "" + + @cached_property + def hashes_index(self) -> dict[str, dict[str, Any]]: + hashes_path = self.output_dir / "hashes" / "hashes.json" + if not hashes_path.exists(): + return {} + + try: + data = json.loads(hashes_path.read_text(encoding="utf-8")) + except Exception: + return {} + + index: dict[str, dict[str, Any]] = {} + if isinstance(data, dict) and isinstance(data.get("files"), list): + for entry in data["files"]: + if not isinstance(entry, dict): + continue + path = str(entry.get("path") or "").strip().rstrip("/") + if not path: + continue + index[path] = { + "size": entry.get("size") or entry.get("num_bytes") or entry.get("bytes") or 0, + "is_dir": bool(entry.get("is_dir")) or str(entry.get("path") or "").endswith("/"), + "hash": entry.get("hash") or entry.get("hash_sha256"), + } + elif isinstance(data, dict): + for path, entry in data.items(): + if not isinstance(entry, dict) or path == ".": + continue + clean_path = str(path).rstrip("/") + if not clean_path: + continue + index[clean_path] = { + "size": entry.get("size") or entry.get("num_bytes") or 0, + "is_dir": bool(entry.get("mime_type") == "inode/directory" or str(path).endswith("/")), + "hash": entry.get("hash") or entry.get("hash_sha256"), + } + return index @property def output_dir(self) -> Path: @@ -1428,17 +1557,17 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea if not self.crawl_id: return - crawl = Crawl.objects.filter(id=self.crawl_id).select_related('created_by').first() + crawl = Crawl.objects.filter(id=self.crawl_id).select_related("created_by").first() if not crawl: return date_base = crawl.created_at or self.created_at or timezone.now() - date_str = date_base.strftime('%Y%m%d') + date_str = date_base.strftime("%Y%m%d") domain = self.extract_domain_from_url(self.url) - username = crawl.created_by.username if getattr(crawl, 'created_by_id', None) else 'system' + username = crawl.created_by.username if getattr(crawl, "created_by_id", None) else "system" - crawl_dir = DATA_DIR / 'users' / username / 'crawls' / date_str / domain / str(crawl.id) - link_path = crawl_dir / 'snapshots' / domain / str(self.id) + crawl_dir = DATA_DIR / "users" / username / "crawls" / date_str / domain / str(crawl.id) + link_path = crawl_dir / "snapshots" / domain / str(self.id) link_parent = link_path.parent link_parent.mkdir(parents=True, exist_ok=True) @@ -1459,33 +1588,33 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea @cached_property def legacy_archive_path(self) -> str: - return f'{CONSTANTS.ARCHIVE_DIR_NAME}/{self.timestamp}' + return f"{CONSTANTS.ARCHIVE_DIR_NAME}/{self.timestamp}" @cached_property def archive_path_from_db(self) -> str: """Best-effort public URL path derived from DB fields only.""" - if self.fs_version in ('0.7.0', '0.8.0'): + if self.fs_version in ("0.7.0", "0.8.0"): return self.legacy_archive_path - if self.fs_version in ('0.9.0', '1.0.0'): - username = 'web' - crawl = getattr(self, 'crawl', None) - if crawl and getattr(crawl, 'created_by_id', None): + if self.fs_version in ("0.9.0", "1.0.0"): + username = "web" + crawl = getattr(self, "crawl", None) + if crawl and getattr(crawl, "created_by_id", None): username = crawl.created_by.username - if username == 'system': - username = 'web' + if username == "system": + username = "web" date_base = self.created_at or self.bookmarked_at if date_base: - date_str = date_base.strftime('%Y%m%d') + date_str = date_base.strftime("%Y%m%d") else: try: - date_str = datetime.fromtimestamp(float(self.timestamp)).strftime('%Y%m%d') + date_str = datetime.fromtimestamp(float(self.timestamp)).strftime("%Y%m%d") except (TypeError, ValueError, OSError): return self.legacy_archive_path domain = self.extract_domain_from_url(self.url) - return f'{username}/{date_str}/{domain}/{self.id}' + return f"{username}/{date_str}/{domain}/{self.id}" return self.legacy_archive_path @@ -1499,20 +1628,20 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea parts = rel_path.parts # New layout: users//snapshots//// - if len(parts) >= 6 and parts[0] == 'users' and parts[2] == 'snapshots': + if len(parts) >= 6 and parts[0] == "users" and parts[2] == "snapshots": username = parts[1] - if username == 'system': - username = 'web' + if username == "system": + username = "web" date_str = parts[3] domain = parts[4] snapshot_id = parts[5] - return f'{username}/{date_str}/{domain}/{snapshot_id}' + return f"{username}/{date_str}/{domain}/{snapshot_id}" # Legacy layout: archive// if len(parts) >= 2 and parts[0] == CONSTANTS.ARCHIVE_DIR_NAME: - return f'{parts[0]}/{parts[1]}' + return f"{parts[0]}/{parts[1]}" - return '/'.join(parts) + return "/".join(parts) @cached_property def archive_path(self): @@ -1520,6 +1649,18 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea @cached_property def archive_size(self): + if hasattr(self, "output_size_sum"): + return int(self.output_size_sum or 0) + + prefetched_results = None + if hasattr(self, "_prefetched_objects_cache"): + prefetched_results = self._prefetched_objects_cache.get("archiveresult_set") + if prefetched_results is not None: + return sum(result.output_size or result.output_size_from_files() for result in prefetched_results) + + stats = self.archiveresult_set.aggregate(result_count=models.Count("id"), total_size=models.Sum("output_size")) + if stats["result_count"]: + return int(stats["total_size"] or 0) try: return get_dir_size(self.output_dir)[0] except Exception: @@ -1530,10 +1671,10 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea self.tags.clear() self.tags.add(*tags_id) - def pending_archiveresults(self) -> QuerySet['ArchiveResult']: + def pending_archiveresults(self) -> QuerySet["ArchiveResult"]: return self.archiveresult_set.exclude(status__in=ArchiveResult.FINAL_OR_ACTIVE_STATES) - def run(self) -> list['ArchiveResult']: + def run(self) -> list["ArchiveResult"]: """ Execute snapshot by creating pending ArchiveResults for all enabled hooks. @@ -1563,29 +1704,29 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea # Use Process.kill_tree() to gracefully kill parent + children killed_count = process.kill_tree(graceful_timeout=2.0) if killed_count > 0: - print(f'[yellow]🔪 Killed {killed_count} process(es) for hook {process.pid}[/yellow]') + print(f"[yellow]🔪 Killed {killed_count} process(es) for hook {process.pid}[/yellow]") # Clean up .pid files from output directory if Path(self.output_dir).exists(): - for pid_file in Path(self.output_dir).glob('**/*.pid'): + for pid_file in Path(self.output_dir).glob("**/*.pid"): pid_file.unlink(missing_ok=True) # Update all background ArchiveResults from filesystem (in case output arrived late) - results = self.archiveresult_set.filter(hook_name__contains='.bg.') + results = self.archiveresult_set.filter(hook_name__contains=".bg.") for ar in results: ar.update_from_output() # Delete ArchiveResults that produced no output files empty_ars = self.archiveresult_set.filter( - output_files={} # No output files + output_files={}, # No output files ).filter( - status__in=ArchiveResult.FINAL_STATES # Only delete finished ones + status__in=ArchiveResult.FINAL_STATES, # Only delete finished ones ) deleted_count = empty_ars.count() if deleted_count > 0: empty_ars.delete() - print(f'[yellow]🗑️ Deleted {deleted_count} empty ArchiveResults for {self.url}[/yellow]') + print(f"[yellow]🗑️ Deleted {deleted_count} empty ArchiveResults for {self.url}[/yellow]") def to_json(self) -> dict: """ @@ -1593,24 +1734,29 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea Includes all fields needed to fully reconstruct/identify this snapshot. """ from archivebox.config import VERSION + + archive_size = self.archive_size + return { - 'type': 'Snapshot', - 'schema_version': VERSION, - 'id': str(self.id), - 'crawl_id': str(self.crawl_id), - 'url': self.url, - 'title': self.title, - 'tags': self.tags_str(), - 'bookmarked_at': self.bookmarked_at.isoformat() if self.bookmarked_at else None, - 'created_at': self.created_at.isoformat() if self.created_at else None, - 'timestamp': self.timestamp, - 'depth': self.depth, - 'status': self.status, - 'fs_version': self.fs_version, + "type": "Snapshot", + "schema_version": VERSION, + "id": str(self.id), + "crawl_id": str(self.crawl_id), + "url": self.url, + "title": self.title, + "tags": self.tags_str(), + "bookmarked_at": self.bookmarked_at.isoformat() if self.bookmarked_at else None, + "created_at": self.created_at.isoformat() if self.created_at else None, + "timestamp": self.timestamp, + "depth": self.depth, + "status": self.status, + "fs_version": self.fs_version, + "archive_size": archive_size, + "output_size": archive_size, } @staticmethod - def from_json(record: Dict[str, Any], overrides: Dict[str, Any] | None = None, queue_for_extraction: bool = True): + def from_json(record: dict[str, Any], overrides: dict[str, Any] | None = None, queue_for_extraction: bool = True): """ Create/update Snapshot from JSON dict. @@ -1636,7 +1782,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea overrides = overrides or {} # If 'id' is provided, lookup and patch that specific snapshot - snapshot_id = record.get('id') + snapshot_id = record.get("id") if snapshot_id: try: snapshot = Snapshot.objects.get(id=snapshot_id) @@ -1645,7 +1791,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea update_fields = [] for field_name, value in record.items(): # Skip internal fields - if field_name in ('id', 'type'): + if field_name in ("id", "type"): continue # Skip if field doesn't exist on model @@ -1653,7 +1799,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea continue # Special parsing for date fields - if field_name in ('bookmarked_at', 'retry_at', 'created_at', 'modified_at'): + if field_name in ("bookmarked_at", "retry_at", "created_at", "modified_at"): if value and isinstance(value, str): value = parse_date(value) @@ -1663,29 +1809,35 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea update_fields.append(field_name) if update_fields: - snapshot.save(update_fields=update_fields + ['modified_at']) + snapshot.save(update_fields=update_fields + ["modified_at"]) return snapshot except Snapshot.DoesNotExist: # ID not found, fall through to create-by-URL logic pass - from archivebox.misc.util import fix_url_from_markdown + from archivebox.misc.util import fix_url_from_markdown, sanitize_extracted_url - url = fix_url_from_markdown(str(record.get('url') or '').strip()) + url = sanitize_extracted_url(fix_url_from_markdown(str(record.get("url") or "").strip())) if not url: return None # Determine or create crawl (every snapshot must have a crawl) - crawl = overrides.get('crawl') - parent_snapshot = overrides.get('snapshot') # Parent snapshot - created_by_id = overrides.get('created_by_id') or (parent_snapshot.created_by.pk if parent_snapshot else get_or_create_system_user_pk()) + crawl = overrides.get("crawl") + parent_snapshot = overrides.get("snapshot") # Parent snapshot + created_by_id = overrides.get("created_by_id") or ( + parent_snapshot.created_by.pk if parent_snapshot else get_or_create_system_user_pk() + ) # DEBUG: Check if crawl_id in record matches overrides crawl import sys - record_crawl_id = record.get('crawl_id') + + record_crawl_id = record.get("crawl_id") if record_crawl_id and crawl and str(crawl.id) != str(record_crawl_id): - print(f"[yellow]⚠️ Snapshot.from_json crawl mismatch: record has crawl_id={record_crawl_id}, overrides has crawl={crawl.id}[/yellow]", file=sys.stderr) + print( + f"[yellow]⚠️ Snapshot.from_json crawl mismatch: record has crawl_id={record_crawl_id}, overrides has crawl={crawl.id}[/yellow]", + file=sys.stderr, + ) # If no crawl provided, inherit from parent or auto-create one if not crawl: @@ -1698,41 +1850,40 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea from archivebox.config import CONSTANTS timestamp_str = timezone.now().strftime("%Y-%m-%d__%H-%M-%S") - sources_file = CONSTANTS.SOURCES_DIR / f'{timestamp_str}__auto_crawl.txt' + sources_file = CONSTANTS.SOURCES_DIR / f"{timestamp_str}__auto_crawl.txt" sources_file.parent.mkdir(parents=True, exist_ok=True) sources_file.write_text(url) crawl = Crawl.objects.create( urls=url, max_depth=0, - label=f'auto-created for {url[:50]}', + label=f"auto-created for {url[:50]}", created_by_id=created_by_id, ) print(f"[red]⚠️ Snapshot.from_json auto-created new crawl {crawl.id} for url={url}[/red]", file=sys.stderr) # Parse tags (accept either a list ["tag1", "tag2"] or a comma-separated string "tag1,tag2") - tags_raw = record.get('tags', '') + tags_raw = record.get("tags", "") tag_list = [] if isinstance(tags_raw, list): tag_list = list(dict.fromkeys(tag.strip() for tag in tags_raw if tag.strip())) elif tags_raw: - tag_list = list(dict.fromkeys( - tag.strip() for tag in re.split(GENERAL_CONFIG.TAG_SEPARATOR_PATTERN, tags_raw) - if tag.strip() - )) + tag_list = list( + dict.fromkeys(tag.strip() for tag in re.split(GENERAL_CONFIG.TAG_SEPARATOR_PATTERN, tags_raw) if tag.strip()), + ) # Check for existing snapshot with same URL in same crawl # (URLs can exist in multiple crawls, but should be unique within a crawl) - snapshot = Snapshot.objects.filter(url=url, crawl=crawl).order_by('-created_at').first() + snapshot = Snapshot.objects.filter(url=url, crawl=crawl).order_by("-created_at").first() - title = record.get('title') - timestamp = record.get('timestamp') + title = record.get("title") + timestamp = record.get("timestamp") if snapshot: # Update existing snapshot - if title and (not snapshot.title or len(title) > len(snapshot.title or '')): + if title and (not snapshot.title or len(title) > len(snapshot.title or "")): snapshot.title = title - snapshot.save(update_fields=['title', 'modified_at']) + snapshot.save(update_fields=["title", "modified_at"]) else: # Create new snapshot if timestamp: @@ -1748,7 +1899,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea # Update tags if tag_list: - existing_tags = set(snapshot.tags.values_list('name', flat=True)) + existing_tags = set(snapshot.tags.values_list("name", flat=True)) new_tags = set(tag_list) | existing_tags snapshot.save_tags(new_tags) @@ -1758,23 +1909,23 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea if queue_for_extraction: snapshot.status = Snapshot.StatusChoices.QUEUED snapshot.retry_at = timezone.now() - update_fields.extend(['status', 'retry_at']) + update_fields.extend(["status", "retry_at"]) # Update additional fields if provided - for field_name in ('depth', 'parent_snapshot_id', 'crawl_id', 'bookmarked_at'): + for field_name in ("depth", "parent_snapshot_id", "crawl_id", "bookmarked_at"): value = record.get(field_name) if value is not None and getattr(snapshot, field_name) != value: setattr(snapshot, field_name, value) update_fields.append(field_name) if update_fields: - snapshot.save(update_fields=update_fields + ['modified_at']) + snapshot.save(update_fields=update_fields + ["modified_at"]) snapshot.ensure_crawl_symlink() return snapshot - def create_pending_archiveresults(self) -> list['ArchiveResult']: + def create_pending_archiveresults(self) -> list["ArchiveResult"]: """ Create ArchiveResult records for all enabled hooks. @@ -1790,7 +1941,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea # Get merged config with crawl-specific PLUGINS filter config = get_config(crawl=self.crawl, snapshot=self) - hooks = discover_hooks('Snapshot', config=config) + hooks = discover_hooks("Snapshot", config=config) archiveresults = [] for hook_path in hooks: @@ -1805,8 +1956,8 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea snapshot=self, hook_name=hook_name, defaults={ - 'plugin': plugin, - 'status': ArchiveResult.INITIAL_STATE, + "plugin": plugin, + "status": ArchiveResult.INITIAL_STATE, }, ) if archiveresult.status == ArchiveResult.INITIAL_STATE: @@ -1814,7 +1965,6 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea return archiveresults - def is_finished_processing(self) -> bool: """ Check if all ArchiveResults are finished. @@ -1824,7 +1974,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea """ # Check if any ARs are still pending/started pending = self.archiveresult_set.exclude( - status__in=ArchiveResult.FINAL_STATES + status__in=ArchiveResult.FINAL_STATES, ).exists() return not pending @@ -1848,11 +1998,11 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea results = self.archiveresult_set.all() # Count by status - succeeded = results.filter(status='succeeded').count() - failed = results.filter(status='failed').count() - running = results.filter(status='started').count() - skipped = results.filter(status='skipped').count() - noresults = results.filter(status='noresults').count() + succeeded = results.filter(status="succeeded").count() + failed = results.filter(status="failed").count() + running = results.filter(status="started").count() + skipped = results.filter(status="skipped").count() + noresults = results.filter(status="noresults").count() total = results.count() pending = total - succeeded - failed - running - skipped - noresults @@ -1861,24 +2011,22 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea percent = int((completed / total * 100) if total > 0 else 0) # Sum output sizes - output_size = results.filter(status='succeeded').aggregate( - total_size=Sum('output_size') - )['total_size'] or 0 + output_size = results.aggregate(total_size=Sum("output_size"))["total_size"] or 0 # Check if sealed is_sealed = self.status not in (self.StatusChoices.QUEUED, self.StatusChoices.STARTED) return { - 'total': total, - 'succeeded': succeeded, - 'failed': failed, - 'running': running, - 'pending': pending, - 'skipped': skipped, - 'noresults': noresults, - 'percent': percent, - 'output_size': output_size, - 'is_sealed': is_sealed, + "total": total, + "succeeded": succeeded, + "failed": failed, + "running": running, + "pending": pending, + "skipped": skipped, + "noresults": noresults, + "percent": percent, + "output_size": output_size, + "is_sealed": is_sealed, } def retry_failed_archiveresults(self) -> int: @@ -1892,23 +2040,23 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED, ArchiveResult.StatusChoices.NORESULTS, - ] + ], ).update( status=ArchiveResult.StatusChoices.QUEUED, - output_str='', + output_str="", output_json=None, output_files={}, output_size=0, - output_mimetypes='', + output_mimetypes="", start_ts=None, end_ts=None, ) if count > 0: - self.status = self.StatusChoices.STARTED + self.status = self.StatusChoices.QUEUED self.retry_at = timezone.now() self.current_step = 0 # Reset to step 0 for retry - self.save(update_fields=['status', 'retry_at', 'current_step', 'modified_at']) + self.save(update_fields=["status", "retry_at", "current_step", "modified_at"]) return count @@ -1919,48 +2067,52 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea @cached_property def url_hash(self) -> str: from hashlib import sha256 + return sha256(self.url.encode()).hexdigest()[:8] @cached_property def scheme(self) -> str: - return self.url.split('://')[0] + return self.url.split("://")[0] @cached_property def path(self) -> str: - parts = self.url.split('://', 1) - return '/' + parts[1].split('/', 1)[1] if len(parts) > 1 and '/' in parts[1] else '/' + parts = self.url.split("://", 1) + return "/" + parts[1].split("/", 1)[1] if len(parts) > 1 and "/" in parts[1] else "/" @cached_property def basename(self) -> str: - return self.path.split('/')[-1] + return self.path.split("/")[-1] @cached_property def extension(self) -> str: basename = self.basename - return basename.split('.')[-1] if '.' in basename else '' + return basename.split(".")[-1] if "." in basename else "" @cached_property def base_url(self) -> str: - return f'{self.scheme}://{self.domain}' + return f"{self.scheme}://{self.domain}" @cached_property def is_static(self) -> bool: - static_extensions = {'.pdf', '.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg', '.mp4', '.mp3', '.wav', '.webm'} + static_extensions = {".pdf", ".jpg", ".jpeg", ".png", ".gif", ".webp", ".svg", ".mp4", ".mp3", ".wav", ".webm"} return any(self.url.lower().endswith(ext) for ext in static_extensions) @cached_property def is_archived(self) -> bool: + if self.downloaded_at or self.status == self.StatusChoices.SEALED: + return True + output_paths = ( self.domain, - 'output.html', - 'output.pdf', - 'screenshot.png', - 'singlefile.html', - 'readability/content.html', - 'mercury/content.html', - 'htmltotext.txt', - 'media', - 'git', + "output.html", + "output.pdf", + "screenshot.png", + "singlefile.html", + "readability/content.html", + "mercury/content.html", + "htmltotext.txt", + "media", + "git", ) return any((Path(self.output_dir) / path).exists() for path in output_paths) @@ -1969,66 +2121,62 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea # ========================================================================= @cached_property - def bookmarked_date(self) -> Optional[str]: + def bookmarked_date(self) -> str | None: max_ts = (timezone.now() + timedelta(days=30)).timestamp() - if self.timestamp and self.timestamp.replace('.', '').isdigit(): + if self.timestamp and self.timestamp.replace(".", "").isdigit(): if 0 < float(self.timestamp) < max_ts: return self._ts_to_date_str(datetime.fromtimestamp(float(self.timestamp))) return str(self.timestamp) return None @cached_property - def downloaded_datestr(self) -> Optional[str]: + def downloaded_datestr(self) -> str | None: return self._ts_to_date_str(self.downloaded_at) if self.downloaded_at else None @cached_property - def archive_dates(self) -> List[datetime]: - return [ - result.start_ts - for result in self.archiveresult_set.all() - if result.start_ts - ] + def archive_dates(self) -> list[datetime]: + return [result.start_ts for result in self.archiveresult_set.all() if result.start_ts] @cached_property - def oldest_archive_date(self) -> Optional[datetime]: + def oldest_archive_date(self) -> datetime | None: dates = self.archive_dates return min(dates) if dates else None @cached_property - def newest_archive_date(self) -> Optional[datetime]: + def newest_archive_date(self) -> datetime | None: dates = self.archive_dates return max(dates) if dates else None @cached_property def num_outputs(self) -> int: - return self.archiveresult_set.filter(status='succeeded').count() + return self.archiveresult_set.filter(status="succeeded").count() @cached_property def num_failures(self) -> int: - return self.archiveresult_set.filter(status='failed').count() + return self.archiveresult_set.filter(status="failed").count() # ========================================================================= # Output Path Methods (migrated from Link schema) # ========================================================================= - def latest_outputs(self, status: Optional[str] = None) -> Dict[str, Any]: + def latest_outputs(self, status: str | None = None) -> dict[str, Any]: """Get the latest output that each plugin produced""" from archivebox.hooks import get_plugins from django.db.models import Q - latest: Dict[str, Any] = {} + latest: dict[str, Any] = {} for plugin in get_plugins(): results = self.archiveresult_set.filter(plugin=plugin) if status is not None: results = results.filter(status=status) # Filter for results with output_files or output_str - results = results.filter(Q(output_files__isnull=False) | ~Q(output_str='')).order_by('-start_ts') + results = results.filter(Q(output_files__isnull=False) | ~Q(output_str="")).order_by("-start_ts") result = results.first() # Return embed_path() for backwards compatibility latest[plugin] = result.embed_path() if result else None return latest - def discover_outputs(self) -> list[dict]: + def discover_outputs(self, include_filesystem_fallback: bool = True) -> list[dict]: """Discover output files from ArchiveResults and filesystem.""" from archivebox.misc.util import ts_to_date_str @@ -2037,56 +2185,117 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea outputs: list[dict] = [] seen: set[str] = set() - text_exts = ('.json', '.jsonl', '.txt', '.csv', '.tsv', '.xml', '.yml', '.yaml', '.md', '.log') + text_exts = (".json", ".jsonl", ".txt", ".csv", ".tsv", ".xml", ".yml", ".yaml", ".md", ".log") def is_metadata_path(path: str | None) -> bool: - lower = (path or '').lower() + lower = (path or "").lower() return lower.endswith(text_exts) def is_compact_path(path: str | None) -> bool: - lower = (path or '').lower() + lower = (path or "").lower() return lower.endswith(text_exts) - for result in self.archiveresult_set.all().order_by('start_ts'): - embed_path = result.embed_path() - if not embed_path or embed_path.strip() in ('.', '/', './'): + for result in self.archiveresult_set.all().order_by("start_ts"): + embed_path = result.embed_path_db() + if not embed_path and include_filesystem_fallback: + embed_path = result.embed_path() + if not embed_path or embed_path.strip() in (".", "/", "./"): continue - abs_path = snap_dir / embed_path - if not abs_path.exists(): - continue - if abs_path.is_dir(): - if not any(p.is_file() for p in abs_path.rglob('*')): + size = result.output_size or result.output_size_from_files() or self.hashes_index.get(embed_path, {}).get("size") or 0 + if not size and include_filesystem_fallback: + abs_path = snap_dir / embed_path + if not abs_path.exists(): continue - size = sum(p.stat().st_size for p in abs_path.rglob('*') if p.is_file()) - else: - size = abs_path.stat().st_size - plugin_lower = (result.plugin or '').lower() - if plugin_lower in ('ytdlp', 'yt-dlp', 'youtube-dl'): - plugin_dir = snap_dir / result.plugin - if plugin_dir.exists(): - try: - size = sum(p.stat().st_size for p in plugin_dir.rglob('*') if p.is_file()) - except OSError: - pass - outputs.append({ - 'name': result.plugin, - 'path': embed_path, - 'ts': ts_to_date_str(result.end_ts), - 'size': size or 0, - 'is_metadata': is_metadata_path(embed_path), - 'is_compact': is_compact_path(embed_path), - 'result': result, - }) + if abs_path.is_dir(): + if not any(p.is_file() for p in abs_path.rglob("*")): + continue + size = sum(p.stat().st_size for p in abs_path.rglob("*") if p.is_file()) + else: + size = abs_path.stat().st_size + plugin_lower = (result.plugin or "").lower() + if plugin_lower in ("ytdlp", "yt-dlp", "youtube-dl"): + plugin_dir = snap_dir / result.plugin + if plugin_dir.exists(): + try: + size = sum(p.stat().st_size for p in plugin_dir.rglob("*") if p.is_file()) + except OSError: + pass + outputs.append( + { + "name": result.plugin, + "path": embed_path, + "ts": ts_to_date_str(result.end_ts), + "size": size or 0, + "is_metadata": is_metadata_path(embed_path), + "is_compact": is_compact_path(embed_path), + "result": result, + }, + ) seen.add(result.plugin) + hashes_index = self.hashes_index + if hashes_index: + grouped_hash_outputs: dict[str, dict[str, dict[str, Any]]] = {} + ignored_roots = {"index.html", "index.json", "index.jsonl", "favicon.ico", "warc", "hashes"} + for rel_path, meta in hashes_index.items(): + parts = Path(rel_path).parts + if len(parts) < 2: + continue + root = parts[0] + if root.startswith(".") or root in seen or root in ignored_roots: + continue + child_path = str(Path(*parts[1:])) + grouped_hash_outputs.setdefault(root, {})[child_path] = meta + + fallback_ts = ts_to_date_str(self.downloaded_at or self.created_at) + for root, root_entries in grouped_hash_outputs.items(): + fallback_path = ArchiveResult._fallback_output_file_path(list(root_entries.keys()), root, root_entries) + if not fallback_path: + continue + fallback_meta = root_entries.get(fallback_path, {}) + outputs.append( + { + "name": root, + "path": f"{root}/{fallback_path}", + "ts": fallback_ts, + "size": int(fallback_meta.get("size") or 0), + "is_metadata": is_metadata_path(fallback_path), + "is_compact": is_compact_path(fallback_path), + "result": None, + }, + ) + seen.add(root) + + if not include_filesystem_fallback: + return outputs + embeddable_exts = { - 'html', 'htm', 'pdf', 'txt', 'md', 'json', 'jsonl', 'csv', 'tsv', - 'png', 'jpg', 'jpeg', 'gif', 'webp', 'svg', 'ico', - 'mp4', 'webm', 'mp3', 'opus', 'ogg', 'wav', + "html", + "htm", + "pdf", + "txt", + "md", + "json", + "jsonl", + "csv", + "tsv", + "png", + "jpg", + "jpeg", + "gif", + "webp", + "svg", + "ico", + "mp4", + "webm", + "mp3", + "opus", + "ogg", + "wav", } for entry in snap_dir.iterdir(): - if entry.name in ('index.html', 'index.json', 'favicon.ico', 'warc'): + if entry.name in ("index.html", "index.json", "favicon.ico", "warc"): continue if entry.is_dir(): plugin = entry.name @@ -2095,33 +2304,39 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea best_file = ArchiveResult._find_best_output_file(entry, plugin) if not best_file: continue + best_file_stat = best_file.stat() rel_path = str(best_file.relative_to(snap_dir)) - outputs.append({ - 'name': plugin, - 'path': rel_path, - 'ts': ts_to_date_str(best_file.stat().st_mtime or 0), - 'size': best_file.stat().st_size or 0, - 'is_metadata': is_metadata_path(rel_path), - 'is_compact': is_compact_path(rel_path), - 'result': None, - }) + outputs.append( + { + "name": plugin, + "path": rel_path, + "ts": ts_to_date_str(best_file_stat.st_mtime or 0), + "size": best_file_stat.st_size or 0, + "is_metadata": is_metadata_path(rel_path), + "is_compact": is_compact_path(rel_path), + "result": None, + }, + ) seen.add(plugin) elif entry.is_file(): - ext = entry.suffix.lstrip('.').lower() + ext = entry.suffix.lstrip(".").lower() if ext not in embeddable_exts: continue plugin = entry.stem if plugin in seen: continue - outputs.append({ - 'name': plugin, - 'path': entry.name, - 'ts': ts_to_date_str(entry.stat().st_mtime or 0), - 'size': entry.stat().st_size or 0, - 'is_metadata': is_metadata_path(entry.name), - 'is_compact': is_compact_path(entry.name), - 'result': None, - }) + entry_stat = entry.stat() + outputs.append( + { + "name": plugin, + "path": entry.name, + "ts": ts_to_date_str(entry_stat.st_mtime or 0), + "size": entry_stat.st_size or 0, + "is_metadata": is_metadata_path(entry.name), + "is_compact": is_compact_path(entry.name), + "result": None, + }, + ) seen.add(plugin) return outputs @@ -2130,38 +2345,47 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea # Serialization Methods # ========================================================================= - def to_dict(self, extended: bool = False) -> Dict[str, Any]: + def to_dict(self, extended: bool = False) -> dict[str, Any]: """Convert Snapshot to a dictionary (replacement for Link._asdict())""" from archivebox.core.host_utils import build_snapshot_url + archive_size = self.archive_size + result = { - 'TYPE': 'core.models.Snapshot', - 'id': str(self.id), - 'url': self.url, - 'timestamp': self.timestamp, - 'title': self.title, - 'tags': sorted(tag.name for tag in self.tags.all()), - 'downloaded_at': self.downloaded_at.isoformat() if self.downloaded_at else None, - 'bookmarked_at': self.bookmarked_at.isoformat() if self.bookmarked_at else None, - 'created_at': self.created_at.isoformat() if self.created_at else None, + "TYPE": "core.models.Snapshot", + "id": str(self.id), + "crawl_id": str(self.crawl_id), + "url": self.url, + "timestamp": self.timestamp, + "title": self.title, + "tags": sorted(tag.name for tag in self.tags.all()), + "downloaded_at": self.downloaded_at.isoformat() if self.downloaded_at else None, + "bookmarked_at": self.bookmarked_at.isoformat() if self.bookmarked_at else None, + "created_at": self.created_at.isoformat() if self.created_at else None, + "modified_at": self.modified_at.isoformat() if self.modified_at else None, + "retry_at": self.retry_at.isoformat() if self.retry_at else None, + "depth": self.depth, + "status": self.status, + "fs_version": self.fs_version, # Computed properties - 'domain': self.domain, - 'scheme': self.scheme, - 'base_url': self.base_url, - 'path': self.path, - 'basename': self.basename, - 'extension': self.extension, - 'is_static': self.is_static, - 'is_archived': self.is_archived, - 'archive_path': self.archive_path, - 'archive_url': build_snapshot_url(str(self.id), 'index.html'), - 'output_dir': self.output_dir, - 'link_dir': self.output_dir, # backwards compatibility alias - 'archive_size': self.archive_size, - 'bookmarked_date': self.bookmarked_date, - 'downloaded_datestr': self.downloaded_datestr, - 'num_outputs': self.num_outputs, - 'num_failures': self.num_failures, + "domain": self.domain, + "scheme": self.scheme, + "base_url": self.base_url, + "path": self.path, + "basename": self.basename, + "extension": self.extension, + "is_static": self.is_static, + "is_archived": self.is_archived, + "archive_path": self.archive_path, + "archive_url": build_snapshot_url(str(self.id), "index.html"), + "output_dir": self.output_dir, + "link_dir": self.output_dir, # backwards compatibility alias + "archive_size": archive_size, + "output_size": archive_size, + "bookmarked_date": self.bookmarked_date, + "downloaded_datestr": self.downloaded_datestr, + "num_outputs": self.num_outputs, + "num_failures": self.num_failures, } return result @@ -2169,11 +2393,11 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea """Convert to JSON string (legacy method, use to_json() for dict)""" return to_json(self.to_dict(extended=True), indent=indent) - def to_csv(self, cols: Optional[List[str]] = None, separator: str = ',', ljust: int = 0) -> str: + def to_csv(self, cols: list[str] | None = None, separator: str = ",", ljust: int = 0) -> str: """Convert to CSV string""" data = self.to_dict() - cols = cols or ['timestamp', 'is_archived', 'url'] - return separator.join(to_json(data.get(col, ''), indent=None).ljust(ljust) for col in cols) + cols = cols or ["timestamp", "is_archived", "url"] + return separator.join(to_json(data.get(col, ""), indent=None).ljust(ljust) for col in cols) def write_json_details(self, out_dir: Path | str | None = None) -> None: """Write JSON index file for this snapshot to its output directory""" @@ -2186,71 +2410,146 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea from django.template.loader import render_to_string from archivebox.config.common import SERVER_CONFIG from archivebox.config.configset import get_config + from archivebox.core.widgets import TagEditorWidget from archivebox.misc.logging_util import printable_filesize output_dir = Path(out_dir) if out_dir is not None else self.output_dir config = get_config() - SAVE_ARCHIVE_DOT_ORG = config.get('SAVE_ARCHIVE_DOT_ORG', True) - TITLE_LOADING_MSG = 'Not yet archived...' + SAVE_ARCHIVE_DOT_ORG = config.get("SAVE_ARCHIVE_DOT_ORG", True) + TITLE_LOADING_MSG = "Not yet archived..." preview_priority = [ - 'singlefile', - 'screenshot', - 'wget', - 'dom', - 'pdf', - 'readability', + "singlefile", + "screenshot", + "wget", + "dom", + "pdf", + "readability", ] - outputs = self.discover_outputs() - outputs_by_plugin = {out['name']: out for out in outputs} + outputs = self.discover_outputs(include_filesystem_fallback=True) + loose_items, failed_items = self.get_detail_page_auxiliary_items(outputs) + outputs_by_plugin = {out["name"]: out for out in outputs} + output_size = sum(int(out.get("size") or 0) for out in outputs) + is_archived = bool(outputs or self.downloaded_at or self.status == self.StatusChoices.SEALED) - best_preview_path = 'about:blank' - best_result = {'path': 'about:blank', 'result': None} + best_preview_path = "about:blank" + best_result = {"path": "about:blank", "result": None} for plugin in preview_priority: out = outputs_by_plugin.get(plugin) - if out and out.get('path'): - best_preview_path = str(out['path']) + if out and out.get("path"): + best_preview_path = str(out["path"]) best_result = out break - if best_preview_path == 'about:blank' and outputs: - best_preview_path = str(outputs[0].get('path') or 'about:blank') + if best_preview_path == "about:blank" and outputs: + best_preview_path = str(outputs[0].get("path") or "about:blank") best_result = outputs[0] + tag_widget = TagEditorWidget() context = { **self.to_dict(extended=True), - 'snapshot': self, - 'title': htmlencode(self.title or (self.base_url if self.is_archived else TITLE_LOADING_MSG)), - 'url_str': htmlencode(urldecode(self.base_url)), - 'archive_url': urlencode(f'warc/{self.timestamp}' or (self.domain if self.is_archived else '')) or 'about:blank', - 'extension': self.extension or 'html', - 'tags': self.tags_str() or 'untagged', - 'size': printable_filesize(self.archive_size) if self.archive_size else 'pending', - 'status': 'archived' if self.is_archived else 'not yet archived', - 'status_color': 'success' if self.is_archived else 'danger', - 'oldest_archive_date': ts_to_date_str(self.oldest_archive_date), - 'SAVE_ARCHIVE_DOT_ORG': SAVE_ARCHIVE_DOT_ORG, - 'PREVIEW_ORIGINALS': SERVER_CONFIG.PREVIEW_ORIGINALS, - 'best_preview_path': best_preview_path, - 'best_result': best_result, - 'archiveresults': outputs, + "snapshot": self, + "title": htmlencode(self.resolved_title or (self.base_url if is_archived else TITLE_LOADING_MSG)), + "url_str": htmlencode(urldecode(self.base_url)), + "archive_url": urlencode(f"warc/{self.timestamp}" or (self.domain if is_archived else "")) or "about:blank", + "extension": self.extension or "html", + "tags": self.tags_str() or "untagged", + "size": printable_filesize(output_size) if output_size else "pending", + "status": "archived" if is_archived else "not yet archived", + "status_color": "success" if is_archived else "danger", + "oldest_archive_date": ts_to_date_str(self.oldest_archive_date), + "SAVE_ARCHIVE_DOT_ORG": SAVE_ARCHIVE_DOT_ORG, + "PREVIEW_ORIGINALS": SERVER_CONFIG.PREVIEW_ORIGINALS, + "best_preview_path": best_preview_path, + "best_result": best_result, + "archiveresults": outputs, + "loose_items": loose_items, + "failed_items": failed_items, + "related_snapshots": [], + "related_years": [], + "title_tags": [{"name": tag.name, "style": tag_widget._tag_style(tag.name)} for tag in self.tags.all().order_by("name")], } - rendered_html = render_to_string('snapshot.html', context) + rendered_html = render_to_string("core/snapshot.html", context) atomic_write(str(output_dir / CONSTANTS.HTML_INDEX_FILENAME), rendered_html) # ========================================================================= # Helper Methods # ========================================================================= + def get_detail_page_auxiliary_items( + self, + outputs: list[dict] | None = None, + hidden_card_plugins: set[str] | None = None, + ) -> tuple[list[dict[str, object]], list[dict[str, object]]]: + outputs = outputs or self.discover_outputs(include_filesystem_fallback=True) + hidden_card_plugins = hidden_card_plugins or set() + accounted_entries: set[str] = set() + for output in outputs: + output_name = str(output.get("name") or "") + if output_name: + accounted_entries.add(output_name) + output_path = str(output.get("path") or "") + if not output_path: + continue + parts = Path(output_path).parts + if parts: + accounted_entries.add(parts[0]) + + ignore_names = {".DS_Store", "index.html", "index.json", "index.jsonl", "favicon.ico"} + loose_items: list[dict[str, object]] = [] + if self.hashes_index: + grouped: dict[str, dict[str, object]] = {} + for rel_path, meta in self.hashes_index.items(): + parts = Path(rel_path).parts + if not parts: + continue + root = parts[0] + if root.startswith(".") or root in ignore_names or root in accounted_entries: + continue + entry = grouped.setdefault( + root, + { + "name": root, + "path": root, + "is_dir": len(parts) > 1 or bool(meta.get("is_dir")), + "size": 0, + }, + ) + entry["is_dir"] = bool(entry.get("is_dir")) or len(parts) > 1 or bool(meta.get("is_dir")) + entry["size"] = int(entry.get("size") or 0) + int(meta.get("size") or 0) + loose_items = sorted(grouped.values(), key=lambda item: str(item["name"]).lower()) + + ArchiveResult = self.archiveresult_set.model + failed_items: list[dict[str, object]] = [] + seen_failed: set[str] = set() + for result in self.archiveresult_set.all().order_by("start_ts"): + if result.status != ArchiveResult.StatusChoices.FAILED: + continue + root = str(result.plugin or "").strip() + if not root or root in seen_failed: + continue + seen_failed.add(root) + failed_items.append( + { + "name": f"{get_plugin_name(root)} ({result.status})", + "path": root, + "is_dir": True, + "size": int(result.output_size or 0), + }, + ) + + return loose_items, failed_items + @staticmethod - def _ts_to_date_str(dt: Optional[datetime]) -> Optional[str]: - return dt.strftime('%Y-%m-%d %H:%M:%S') if dt else None + def _ts_to_date_str(dt: datetime | None) -> str | None: + return dt.strftime("%Y-%m-%d %H:%M:%S") if dt else None # ============================================================================= # Snapshot State Machine # ============================================================================= + class SnapshotMachine(BaseStateMachine): """ State machine for managing Snapshot lifecycle. @@ -2281,7 +2580,7 @@ class SnapshotMachine(BaseStateMachine): https://github.com/ArchiveBox/ArchiveBox/wiki/ArchiveBox-Architecture-Diagrams """ - model_attr_name = 'snapshot' + model_attr_name = "snapshot" # States queued = State(value=Snapshot.StatusChoices.QUEUED, initial=True) @@ -2289,11 +2588,7 @@ class SnapshotMachine(BaseStateMachine): sealed = State(value=Snapshot.StatusChoices.SEALED, final=True) # Tick Event (polled by workers) - tick = ( - queued.to.itself(unless='can_start') - | queued.to(started, cond='can_start') - | started.to(sealed, cond='is_finished') - ) + tick = queued.to.itself(unless="can_start") | queued.to(started, cond="can_start") | started.to(sealed, cond="is_finished") # Manual event (can also be triggered by last ArchiveResult finishing) seal = started.to(sealed) @@ -2320,7 +2615,7 @@ class SnapshotMachine(BaseStateMachine): """Just mark as started. The shared runner creates ArchiveResults and runs hooks.""" self.snapshot.status = Snapshot.StatusChoices.STARTED self.snapshot.retry_at = None # No more polling - self.snapshot.save(update_fields=['status', 'retry_at', 'modified_at']) + self.snapshot.save(update_fields=["status", "retry_at", "modified_at"]) @sealed.enter def enter_sealed(self): @@ -2334,31 +2629,31 @@ class SnapshotMachine(BaseStateMachine): status=Snapshot.StatusChoices.SEALED, ) - print(f'[cyan] ✅ SnapshotMachine.enter_sealed() - sealed {self.snapshot.url}[/cyan]', file=sys.stderr) + print(f"[cyan] ✅ SnapshotMachine.enter_sealed() - sealed {self.snapshot.url}[/cyan]", file=sys.stderr) # Check if this is the last snapshot for the parent crawl - if so, seal the crawl if self.snapshot.crawl: crawl = self.snapshot.crawl remaining_active = Snapshot.objects.filter( crawl=crawl, - status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED] + status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED], ).count() if remaining_active == 0 and crawl.status == crawl.StatusChoices.STARTED: - print(f'[cyan]🔒 All snapshots sealed for crawl {crawl.id}, sealing crawl[/cyan]', file=sys.stderr) + print(f"[cyan]🔒 All snapshots sealed for crawl {crawl.id}, sealing crawl[/cyan]", file=sys.stderr) # Seal the parent crawl cast(Any, crawl).sm.seal() class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes): class StatusChoices(models.TextChoices): - QUEUED = 'queued', 'Queued' - STARTED = 'started', 'Started' - BACKOFF = 'backoff', 'Waiting to retry' - SUCCEEDED = 'succeeded', 'Succeeded' - FAILED = 'failed', 'Failed' - SKIPPED = 'skipped', 'Skipped' - NORESULTS = 'noresults', 'No Results' + QUEUED = "queued", "Queued" + STARTED = "started", "Started" + BACKOFF = "backoff", "Waiting to retry" + SUCCEEDED = "succeeded", "Succeeded" + FAILED = "failed", "Failed" + SKIPPED = "skipped", "Skipped" + NORESULTS = "noresults", "No Results" INITIAL_STATE = StatusChoices.QUEUED ACTIVE_STATE = StatusChoices.STARTED @@ -2383,32 +2678,38 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes): snapshot: Snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE) # type: ignore # No choices= constraint - plugin names come from plugin system and can be any string - plugin = models.CharField(max_length=32, blank=False, null=False, db_index=True, default='') - hook_name = models.CharField(max_length=255, blank=True, default='', db_index=True, help_text='Full filename of the hook that executed (e.g., on_Snapshot__50_wget.py)') + plugin = models.CharField(max_length=32, blank=False, null=False, db_index=True, default="") + hook_name = models.CharField( + max_length=255, + blank=True, + default="", + db_index=True, + help_text="Full filename of the hook that executed (e.g., on_Snapshot__50_wget.py)", + ) # Process FK - tracks execution details (cmd, pwd, stdout, stderr, etc.) # Added POST-v0.9.0, will be added in a separate migration process = models.OneToOneField( - 'machine.Process', + "machine.Process", on_delete=models.PROTECT, null=True, blank=True, - related_name='archiveresult', - help_text='Process execution details for this archive result' + related_name="archiveresult", + help_text="Process execution details for this archive result", ) # New output fields (replacing old 'output' field) - output_str = models.TextField(blank=True, default='', help_text='Human-readable output summary') - output_json = models.JSONField(null=True, blank=True, default=None, help_text='Structured metadata (headers, redirects, etc.)') - output_files = models.JSONField(default=dict, help_text='Dict of {relative_path: {metadata}}') - output_size = models.BigIntegerField(default=0, help_text='Total bytes of all output files') - output_mimetypes = models.CharField(max_length=512, blank=True, default='', help_text='CSV of mimetypes sorted by size') + output_str = models.TextField(blank=True, default="", help_text="Human-readable output summary") + output_json = models.JSONField(null=True, blank=True, default=None, help_text="Structured metadata (headers, redirects, etc.)") + output_files = models.JSONField(default=dict, help_text="Dict of {relative_path: {metadata}}") + output_size = models.BigIntegerField(default=0, help_text="Total bytes of all output files") + output_mimetypes = models.CharField(max_length=512, blank=True, default="", help_text="CSV of mimetypes sorted by size") start_ts = models.DateTimeField(default=None, null=True, blank=True) end_ts = models.DateTimeField(default=None, null=True, blank=True) status = models.CharField(max_length=16, choices=StatusChoices.choices, default=StatusChoices.QUEUED, db_index=True) - notes = models.TextField(blank=True, null=False, default='') + notes = models.TextField(blank=True, null=False, default="") # output_dir is computed via @property from snapshot.output_dir / plugin snapshot_id: uuid.UUID @@ -2419,15 +2720,15 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes): ModelWithConfig.Meta, ModelWithNotes.Meta, ): - app_label = 'core' - verbose_name = 'Archive Result' - verbose_name_plural = 'Archive Results Log' + app_label = "core" + verbose_name = "Archive Result" + verbose_name_plural = "Archive Results Log" indexes = [ - models.Index(fields=['snapshot', 'status'], name='archiveresult_snap_status_idx'), + models.Index(fields=["snapshot", "status"], name="archiveresult_snap_status_idx"), ] def __str__(self): - return f'[{self.id}] {self.snapshot.url[:64]} -> {self.plugin}' + return f"[{self.id}] {self.snapshot.url[:64]} -> {self.plugin}" @property def created_by(self): @@ -2439,37 +2740,38 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes): Convert ArchiveResult model instance to a JSON-serializable dict. """ from archivebox.config import VERSION + record = { - 'type': 'ArchiveResult', - 'schema_version': VERSION, - 'id': str(self.id), - 'snapshot_id': str(self.snapshot_id), - 'plugin': self.plugin, - 'hook_name': self.hook_name, - 'status': self.status, - 'output_str': self.output_str, - 'start_ts': self.start_ts.isoformat() if self.start_ts else None, - 'end_ts': self.end_ts.isoformat() if self.end_ts else None, + "type": "ArchiveResult", + "schema_version": VERSION, + "id": str(self.id), + "snapshot_id": str(self.snapshot_id), + "plugin": self.plugin, + "hook_name": self.hook_name, + "status": self.status, + "output_str": self.output_str, + "start_ts": self.start_ts.isoformat() if self.start_ts else None, + "end_ts": self.end_ts.isoformat() if self.end_ts else None, } # Include optional fields if set if self.output_json: - record['output_json'] = self.output_json + record["output_json"] = self.output_json if self.output_files: - record['output_files'] = self.output_files + record["output_files"] = self.output_files if self.output_size: - record['output_size'] = self.output_size + record["output_size"] = self.output_size if self.output_mimetypes: - record['output_mimetypes'] = self.output_mimetypes + record["output_mimetypes"] = self.output_mimetypes if self.cmd: - record['cmd'] = self.cmd + record["cmd"] = self.cmd if self.cmd_version: - record['cmd_version'] = self.cmd_version + record["cmd_version"] = self.cmd_version if self.process_id: - record['process_id'] = str(self.process_id) + record["process_id"] = str(self.process_id) return record @staticmethod - def from_json(record: Dict[str, Any], overrides: Dict[str, Any] | None = None): + def from_json(record: dict[str, Any], overrides: dict[str, Any] | None = None): """ Create/update ArchiveResult from JSON dict. @@ -2480,14 +2782,14 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes): Returns: ArchiveResult instance or None """ - snapshot_id = record.get('snapshot_id') - plugin = record.get('plugin') + snapshot_id = record.get("snapshot_id") + plugin = record.get("plugin") if not snapshot_id or not plugin: return None # Try to get existing by ID first - result_id = record.get('id') + result_id = record.get("id") if result_id: try: return ArchiveResult.objects.get(id=result_id) @@ -2502,10 +2804,10 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes): snapshot=snapshot, plugin=plugin, defaults={ - 'hook_name': record.get('hook_name', ''), - 'status': record.get('status', 'queued'), - 'output_str': record.get('output_str', ''), - } + "hook_name": record.get("hook_name", ""), + "status": record.get("status", "queued"), + "output_str": record.get("output_str", ""), + }, ) return result except Snapshot.DoesNotExist: @@ -2541,32 +2843,34 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes): @property def api_url(self) -> str: - return str(reverse_lazy('api-1:get_archiveresult', args=[self.id])) + return str(reverse_lazy("api-1:get_archiveresult", args=[self.id])) def get_absolute_url(self): - return f'/{self.snapshot.archive_path}/{self.plugin}' + return f"/{self.snapshot.archive_path}/{self.plugin}" def reset_for_retry(self, *, save: bool = True) -> None: self.status = self.StatusChoices.QUEUED - self.output_str = '' + self.output_str = "" self.output_json = None self.output_files = {} self.output_size = 0 - self.output_mimetypes = '' + self.output_mimetypes = "" self.start_ts = None self.end_ts = None if save: - self.save(update_fields=[ - 'status', - 'output_str', - 'output_json', - 'output_files', - 'output_size', - 'output_mimetypes', - 'start_ts', - 'end_ts', - 'modified_at', - ]) + self.save( + update_fields=[ + "status", + "output_str", + "output_json", + "output_files", + "output_size", + "output_mimetypes", + "start_ts", + "end_ts", + "modified_at", + ], + ) @property def plugin_module(self) -> Any | None: @@ -2574,156 +2878,239 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes): # The plugin name maps to hooks in abx_plugins/plugins/{plugin}/ return None + @staticmethod + def _normalize_output_files(raw_output_files: Any) -> dict[str, dict[str, Any]]: + from abx_dl.output_files import guess_mimetype + + def _enrich_metadata(path: str, metadata: dict[str, Any]) -> dict[str, Any]: + normalized = dict(metadata) + if "extension" not in normalized: + normalized["extension"] = Path(path).suffix.lower().lstrip(".") + if "mimetype" not in normalized: + guessed = guess_mimetype(path) + if guessed: + normalized["mimetype"] = guessed + return normalized + + if raw_output_files is None: + return {} + if isinstance(raw_output_files, str): + try: + raw_output_files = json.loads(raw_output_files) + except json.JSONDecodeError: + return {} + if isinstance(raw_output_files, dict): + normalized: dict[str, dict[str, Any]] = {} + for path, metadata in raw_output_files.items(): + if not path: + continue + metadata_dict = dict(metadata) if isinstance(metadata, dict) else {} + metadata_dict.pop("path", None) + normalized[str(path)] = _enrich_metadata(str(path), metadata_dict) + return normalized + if isinstance(raw_output_files, (list, tuple, set)): + normalized: dict[str, dict[str, Any]] = {} + for item in raw_output_files: + if isinstance(item, str): + normalized[item] = _enrich_metadata(item, {}) + continue + if not isinstance(item, dict): + continue + path = str(item.get("path") or "").strip() + if not path: + continue + normalized[path] = _enrich_metadata( + path, + {key: value for key, value in item.items() if key != "path" and value not in (None, "")}, + ) + return normalized + return {} + + @staticmethod + def _coerce_output_file_size(value: Any) -> int: + try: + return max(int(value or 0), 0) + except (TypeError, ValueError): + return 0 + + def output_file_map(self) -> dict[str, dict[str, Any]]: + return self._normalize_output_files(self.output_files) + + def output_file_paths(self) -> list[str]: + return list(self.output_file_map().keys()) + + def output_file_count(self) -> int: + return len(self.output_file_paths()) + + def output_size_from_files(self) -> int: + return sum(self._coerce_output_file_size(metadata.get("size")) for metadata in self.output_file_map().values()) + def output_exists(self) -> bool: return os.path.exists(Path(self.snapshot_dir) / self.plugin) @staticmethod - def _find_best_output_file(dir_path: Path, plugin_name: str | None = None) -> Optional[Path]: - if not dir_path.exists() or not dir_path.is_dir(): + def _looks_like_output_path(raw_output: str | None, plugin_name: str | None = None) -> bool: + value = str(raw_output or "").strip() + if value in ("", ".", "./", "/"): + return False + if plugin_name and value.startswith(f"{plugin_name}/"): + return True + if Path(value).is_absolute(): + return True + if Path(value).suffix: + return True + if "/" in value and "\\" not in value and " " not in value: + left, _, right = value.partition("/") + if left and right and all(ch.isalnum() or ch in "+-." for ch in left + right): + return False + return False + + def _existing_output_path(self, raw_output: str | None) -> str | None: + value = str(raw_output or "").strip() + if not value: return None - embeddable_exts = { - 'html', 'htm', 'pdf', 'txt', 'md', 'json', 'jsonl', 'csv', 'tsv', - 'png', 'jpg', 'jpeg', 'gif', 'webp', 'svg', 'ico', - 'mp4', 'webm', 'mp3', 'opus', 'ogg', 'wav', - } + output_path = Path(value) + snapshot_dir = Path(self.snapshot_dir).resolve(strict=False) + candidates: list[str] = [] - plugin_lower = (plugin_name or '').lower() - prefer_media = plugin_lower in ('ytdlp', 'yt-dlp', 'youtube-dl') - - preferred_text = [] - if plugin_lower: - preferred_text.extend([ - f'{plugin_lower}.jsonl', - f'{plugin_lower}.json', - f'{plugin_lower}.txt', - f'{plugin_lower}.log', - ]) - preferred_text.extend(['index.jsonl', 'index.json']) - for name in preferred_text: - candidate = dir_path / name - if candidate.exists() and candidate.is_file(): - return candidate - - if not prefer_media: - for name in ('index.html', 'index.htm'): - candidate = dir_path / name - if candidate.exists() and candidate.is_file(): - return candidate - - candidates = [] - file_count = 0 - max_scan = 200 - media_exts = {'mp4', 'webm', 'mp3', 'opus', 'ogg', 'wav'} - for file_path in dir_path.rglob('*'): - file_count += 1 - if file_count > max_scan: - break - if file_path.is_dir() or file_path.name.startswith('.'): - continue - ext = file_path.suffix.lstrip('.').lower() - if ext in ('pid', 'log', 'sh'): - continue - if ext not in embeddable_exts: - continue + if output_path.is_absolute(): try: - size = file_path.stat().st_size + candidates.append(str(output_path.resolve(strict=False).relative_to(snapshot_dir))) + except (OSError, ValueError): + return None + elif value.startswith(f"{self.plugin}/"): + candidates.append(value) + elif len(output_path.parts) == 1: + candidates.append(f"{self.plugin}/{value}") + else: + candidates.append(value) + + output_file_map = self.output_file_map() + hashes_index = self.snapshot.hashes_index + for relative_path in candidates: + if relative_path in hashes_index: + return relative_path + + if relative_path in output_file_map: + return relative_path + + plugin_relative = relative_path.removeprefix(f"{self.plugin}/") + if plugin_relative in output_file_map: + return relative_path + + candidate = snapshot_dir / relative_path + try: + if candidate.is_file(): + return relative_path except OSError: continue - name_lower = file_path.name.lower() - priority = 0 - if name_lower.startswith('index'): - priority = 100 - elif plugin_lower and name_lower.startswith(('output', 'content', plugin_lower)): - priority = 60 - elif ext in ('html', 'htm', 'pdf'): - priority = 40 - elif ext in media_exts: - priority = 50 if prefer_media else 10 - elif ext in ('png', 'jpg', 'jpeg', 'webp', 'svg', 'gif', 'ico'): - priority = 30 - elif ext in ('json', 'jsonl', 'txt', 'md', 'csv', 'tsv'): - priority = 20 - else: - priority = 10 - candidates.append((priority, size, file_path)) + return None + + @staticmethod + def _fallback_output_file_path( + output_file_paths: Sequence[str], + plugin_name: str | None = None, + output_file_map: dict[str, dict[str, Any]] | None = None, + ) -> str | None: + ignored = {"stdout.log", "stderr.log", "hook.pid", "listener.pid", "cmd.sh"} + candidates = [ + path + for path in output_file_paths + if Path(path).name not in ignored and Path(path).suffix.lower() not in (".pid", ".log", ".sh") + ] if not candidates: return None - candidates.sort(key=lambda x: (x[0], x[1]), reverse=True) - return candidates[0][2] + output_file_map = output_file_map or {} + preferred_names = [ + "index.html", + "index.htm", + "output.html", + "content.html", + "article.html", + "output.pdf", + "index.pdf", + "content.txt", + "output.txt", + "index.txt", + "index.md", + "index.json", + "article.json", + ] + for preferred_name in preferred_names: + for candidate in candidates: + if Path(candidate).name.lower() == preferred_name: + return candidate - def embed_path(self) -> Optional[str]: + ext_groups = ( + (".html", ".htm", ".pdf"), + (".png", ".jpg", ".jpeg", ".gif", ".webp", ".svg", ".ico"), + (".json", ".jsonl", ".txt", ".md", ".csv", ".tsv"), + (".mp4", ".webm", ".mp3", ".opus", ".ogg", ".wav"), + ) + for ext_group in ext_groups: + group_candidates = [candidate for candidate in candidates if Path(candidate).suffix.lower() in ext_group] + if group_candidates: + return max( + group_candidates, + key=lambda path: ArchiveResult._coerce_output_file_size(output_file_map.get(path, {}).get("size")), + ) + + return None + + @staticmethod + def _find_best_output_file(dir_path: Path, plugin_name: str | None = None) -> Path | None: + if not dir_path.exists() or not dir_path.is_dir(): + return None + file_map: dict[str, dict[str, Any]] = {} + file_count = 0 + max_scan = 500 + for file_path in dir_path.rglob("*"): + file_count += 1 + if file_count > max_scan: + break + if file_path.is_dir() or file_path.name.startswith("."): + continue + rel_path = str(file_path.relative_to(dir_path)) + try: + size = file_path.stat().st_size + except OSError: + size = 0 + file_map[rel_path] = {"size": size} + + fallback_path = ArchiveResult._fallback_output_file_path(list(file_map.keys()), plugin_name, file_map) + if not fallback_path: + return None + return dir_path / fallback_path + + def embed_path_db(self) -> str | None: + output_file_map = self.output_file_map() + + if self.output_str: + raw_output = str(self.output_str).strip() + if self._looks_like_output_path(raw_output, self.plugin): + existing_output = self._existing_output_path(raw_output) + if existing_output: + return existing_output + + output_file_paths = list(output_file_map.keys()) + if output_file_paths: + fallback_path = self._fallback_output_file_path(output_file_paths, self.plugin, output_file_map) + if fallback_path: + return f"{self.plugin}/{fallback_path}" + + return None + + def embed_path(self) -> str | None: """ Get the relative path to the embeddable output file for this result. - Returns the first file from output_files if set, otherwise tries to - find a reasonable default based on the plugin type. + This is intentionally DB-backed only so snapshot/admin rendering stays + fast and predictable without filesystem probes. """ - snapshot_dir = Path(self.snapshot_dir) - plugin_dir = snapshot_dir / self.plugin - - # Fallback: treat output_str as a file path only if it exists on disk - if self.output_str: - try: - raw_output = str(self.output_str).strip() - if raw_output in ('.', './', ''): - best_file = self._find_best_output_file(plugin_dir, self.plugin) - if best_file: - return str(best_file.relative_to(snapshot_dir)) - output_path = None - else: - output_path = Path(raw_output) - - if output_path and output_path.is_absolute(): - # If absolute and within snapshot dir, normalize to relative - if snapshot_dir in output_path.parents and output_path.exists(): - if output_path.is_file(): - return str(output_path.relative_to(snapshot_dir)) - if output_path.is_dir(): - best_file = self._find_best_output_file(output_path, self.plugin) - if best_file: - return str(best_file.relative_to(snapshot_dir)) - elif output_path: - # If relative, prefer plugin-prefixed path, then direct path - plugin_candidate = plugin_dir / output_path - if plugin_candidate.exists(): - if plugin_candidate.is_file(): - return f'{self.plugin}/{output_path}' - if plugin_candidate.is_dir(): - best_file = self._find_best_output_file(plugin_candidate, self.plugin) - if best_file: - return str(best_file.relative_to(snapshot_dir)) - if output_path.name in ('index.html', 'index.json') and output_path.parent == Path('.'): - return None - snapshot_candidate = snapshot_dir / output_path - if snapshot_candidate.exists(): - if snapshot_candidate.is_file(): - return str(output_path) - if snapshot_candidate.is_dir(): - best_file = self._find_best_output_file(snapshot_candidate, self.plugin) - if best_file: - return str(best_file.relative_to(snapshot_dir)) - except Exception: - pass - - # Check output_files dict for primary output (ignore non-output files) - if self.output_files: - ignored = {'stdout.log', 'stderr.log', 'hook.pid', 'listener.pid', 'cmd.sh'} - output_candidates = [ - f for f in self.output_files.keys() - if Path(f).name not in ignored and Path(f).suffix not in ('.pid', '.log', '.sh') - ] - first_file = output_candidates[0] if output_candidates else None - if first_file and (plugin_dir / first_file).exists(): - return f'{self.plugin}/{first_file}' - - best_file = self._find_best_output_file(plugin_dir, self.plugin) - if best_file: - return str(best_file.relative_to(snapshot_dir)) - - return None + return self.embed_path_db() @property def output_dir_name(self) -> str: @@ -2744,7 +3131,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes): @property def pwd(self) -> str: """Working directory (from Process).""" - return self.process.pwd if self.process_id else '' + return self.process.pwd if self.process_id else "" @property def cmd(self) -> list: @@ -2754,7 +3141,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes): @property def cmd_version(self) -> str: """Command version (from Process.binary).""" - return self.process.cmd_version if self.process_id else '' + return self.process.cmd_version if self.process_id else "" @property def binary(self): @@ -2792,102 +3179,106 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes): - end_ts, cmd, cmd_version, binary FK - Processes side-effect records (Snapshot, Tag, etc.) via process_hook_records() """ - import mimetypes from collections import defaultdict from pathlib import Path from django.utils import timezone + from abx_dl.output_files import guess_mimetype from archivebox.hooks import process_hook_records, extract_records_from_process from archivebox.machine.models import Process plugin_dir = Path(self.pwd) if self.pwd else None if not plugin_dir or not plugin_dir.exists(): self.status = self.StatusChoices.FAILED - self.output_str = 'Output directory not found' + self.output_str = "Output directory not found" self.end_ts = timezone.now() self.save() return # Read and parse JSONL output from stdout.log - stdout_file = plugin_dir / 'stdout.log' + stdout_file = plugin_dir / "stdout.log" records = [] if self.process_id and self.process: records = extract_records_from_process(self.process) if not records: - stdout = stdout_file.read_text() if stdout_file.exists() else '' + stdout = stdout_file.read_text() if stdout_file.exists() else "" records = Process.parse_records_from_text(stdout) # Find ArchiveResult record and update status/output from it - ar_records = [r for r in records if r.get('type') == 'ArchiveResult'] + ar_records = [r for r in records if r.get("type") == "ArchiveResult"] if ar_records: hook_data = ar_records[0] # Update status status_map = { - 'succeeded': self.StatusChoices.SUCCEEDED, - 'failed': self.StatusChoices.FAILED, - 'skipped': self.StatusChoices.SKIPPED, - 'noresults': self.StatusChoices.NORESULTS, + "succeeded": self.StatusChoices.SUCCEEDED, + "failed": self.StatusChoices.FAILED, + "skipped": self.StatusChoices.SKIPPED, + "noresults": self.StatusChoices.NORESULTS, } - self.status = status_map.get(hook_data.get('status', 'failed'), self.StatusChoices.FAILED) + self.status = status_map.get(hook_data.get("status", "failed"), self.StatusChoices.FAILED) # Update output fields - self.output_str = hook_data.get('output_str') or hook_data.get('output') or '' - self.output_json = hook_data.get('output_json') + self.output_str = hook_data.get("output_str") or hook_data.get("output") or "" + self.output_json = hook_data.get("output_json") # Update cmd fields - if hook_data.get('cmd'): + if hook_data.get("cmd"): if self.process_id: - self.process.cmd = hook_data['cmd'] + self.process.cmd = hook_data["cmd"] self.process.save() - self._set_binary_from_cmd(hook_data['cmd']) + self._set_binary_from_cmd(hook_data["cmd"]) # Note: cmd_version is derived from binary.version, not stored on Process else: # No ArchiveResult record: treat background hooks or clean exits as skipped is_background = False try: from archivebox.hooks import is_background_hook + is_background = bool(self.hook_name and is_background_hook(self.hook_name)) except Exception: pass if is_background or (self.process_id and self.process and self.process.exit_code == 0): self.status = self.StatusChoices.SKIPPED - self.output_str = 'Hook did not output ArchiveResult record' + self.output_str = "Hook did not output ArchiveResult record" else: self.status = self.StatusChoices.FAILED - self.output_str = 'Hook did not output ArchiveResult record' + self.output_str = "Hook did not output ArchiveResult record" # Walk filesystem and populate output_files, output_size, output_mimetypes - exclude_names = {'stdout.log', 'stderr.log', 'process.pid', 'hook.pid', 'listener.pid', 'cmd.sh'} + exclude_names = {"stdout.log", "stderr.log", "process.pid", "hook.pid", "listener.pid", "cmd.sh"} mime_sizes = defaultdict(int) total_size = 0 output_files = {} - for file_path in plugin_dir.rglob('*'): + for file_path in plugin_dir.rglob("*"): if not file_path.is_file(): continue - if '.hooks' in file_path.parts: + if ".hooks" in file_path.parts: continue if file_path.name in exclude_names: continue try: stat = file_path.stat() - mime_type, _ = mimetypes.guess_type(str(file_path)) - mime_type = mime_type or 'application/octet-stream' + mime_type = guess_mimetype(file_path) or "application/octet-stream" relative_path = str(file_path.relative_to(plugin_dir)) - output_files[relative_path] = {} + output_files[relative_path] = { + "extension": file_path.suffix.lower().lstrip("."), + "mimetype": mime_type, + "size": stat.st_size, + } mime_sizes[mime_type] += stat.st_size total_size += stat.st_size - except (OSError, IOError): + except OSError: continue self.output_files = output_files self.output_size = total_size sorted_mimes = sorted(mime_sizes.items(), key=lambda x: x[1], reverse=True) - self.output_mimetypes = ','.join(mime for mime, _ in sorted_mimes) + self.output_mimetypes = ",".join(mime for mime, _ in sorted_mimes) # Update timestamps self.end_ts = timezone.now() @@ -2897,19 +3288,19 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes): # Process side-effect records (filter Snapshots for depth/URL) filtered_records = [] for record in records: - record_type = record.get('type') + record_type = record.get("type") # Skip ArchiveResult records (already processed above) - if record_type == 'ArchiveResult': + if record_type == "ArchiveResult": continue # Filter Snapshot records for depth/URL constraints - if record_type == 'Snapshot': - url = record.get('url') + if record_type == "Snapshot": + url = record.get("url") if not url: continue - depth = record.get('depth', self.snapshot.depth + 1) + depth = record.get("depth", self.snapshot.depth + 1) if depth > self.snapshot.crawl.max_depth: continue @@ -2920,14 +3311,14 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes): # Process filtered records with unified dispatcher overrides = { - 'snapshot': self.snapshot, - 'crawl': self.snapshot.crawl, - 'created_by_id': self.created_by.pk, + "snapshot": self.snapshot, + "crawl": self.snapshot.crawl, + "created_by_id": self.created_by.pk, } process_hook_records(filtered_records, overrides=overrides) # Cleanup PID files (keep logs even if empty so they can be tailed) - pid_file = plugin_dir / 'hook.pid' + pid_file = plugin_dir / "hook.pid" pid_file.unlink(missing_ok=True) def _set_binary_from_cmd(self, cmd: list) -> None: @@ -2948,7 +3339,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes): # Try matching by absolute path first binary = Binary.objects.filter( abspath=bin_path_or_name, - machine=machine + machine=machine, ).first() if binary: @@ -2961,7 +3352,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes): bin_name = Path(bin_path_or_name).name binary = Binary.objects.filter( name=bin_name, - machine=machine + machine=machine, ).first() if binary: @@ -2981,6 +3372,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes): """Get the output directory for this plugin's results.""" return Path(self.snapshot.output_dir) / self.plugin + # ============================================================================= # State Machine Registration # ============================================================================= diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py index 3a296516..966909c1 100644 --- a/archivebox/core/settings.py +++ b/archivebox/core/settings.py @@ -117,6 +117,7 @@ try: is_valid, error_msg = LDAP_CONFIG.validate_ldap_config() if not is_valid: from rich import print + print(f"[red][X] Error: {error_msg}[/red]") raise ValueError(error_msg) @@ -154,6 +155,7 @@ try: except ImportError as e: from rich import print + print("[red][X] Error: LDAP_ENABLED=True but required LDAP libraries are not installed![/red]") print(f"[red] {e}[/red]") print("[yellow] To install LDAP support, run:[/yellow]") @@ -271,7 +273,6 @@ MIGRATION_MODULES = {"signal_webhooks": None} DEFAULT_AUTO_FIELD = "django.db.models.BigAutoField" - # class FilestoreDBRouter: # """ # A router to store all the File models in the filestore.sqlite3 database. @@ -560,7 +561,7 @@ if DEBUG: AUTOTYPING = { "STUBS_GENERATION": { "LOCAL_STUBS_DIR": PACKAGE_DIR / "typings", - } + }, } # https://github.com/bensi94/Django-Requests-Tracker (improved version of django-debug-toolbar) diff --git a/archivebox/core/settings_logging.py b/archivebox/core/settings_logging.py index 0d3a2dd5..0816fde4 100644 --- a/archivebox/core/settings_logging.py +++ b/archivebox/core/settings_logging.py @@ -1,4 +1,4 @@ -__package__ = 'archivebox.core' +__package__ = "archivebox.core" import re import os @@ -19,6 +19,7 @@ IGNORABLE_URL_PATTERNS = [ re.compile(r"/admin/jsi18n/"), ] + class NoisyRequestsFilter(logging.Filter): def filter(self, record) -> bool: logline = record.getMessage() @@ -34,7 +35,7 @@ class NoisyRequestsFilter(logging.Filter): if ignorable_GET_request.match(logline): return False - ignorable_404_pattern = re.compile(f'Not Found: {pattern.pattern}', re.I | re.M) + ignorable_404_pattern = re.compile(f"Not Found: {pattern.pattern}", re.I | re.M) if ignorable_404_pattern.match(logline): return False @@ -44,17 +45,18 @@ class NoisyRequestsFilter(logging.Filter): class CustomOutboundWebhookLogFormatter(logging.Formatter): def format(self, record): result = super().format(record) - return result.replace('HTTP Request: ', 'OutboundWebhook: ') + return result.replace("HTTP Request: ", "OutboundWebhook: ") + class StripANSIColorCodesFilter(logging.Filter): - _ansi_re = re.compile(r'\x1b\[[0-9;]*m') - _bare_re = re.compile(r'\[[0-9;]*m') + _ansi_re = re.compile(r"\x1b\[[0-9;]*m") + _bare_re = re.compile(r"\[[0-9;]*m") def filter(self, record) -> bool: msg = record.getMessage() - if isinstance(msg, str) and ('\x1b[' in msg or '[m' in msg): - msg = self._ansi_re.sub('', msg) - msg = self._bare_re.sub('', msg) + if isinstance(msg, str) and ("\x1b[" in msg or "[m" in msg): + msg = self._ansi_re.sub("", msg) + msg = self._bare_re.sub("", msg) record.msg = msg record.args = () return True @@ -65,18 +67,18 @@ ERROR_LOG = tempfile.NamedTemporaryFile().name LOGS_DIR = CONSTANTS.LOGS_DIR if os.access(LOGS_DIR, os.W_OK) and LOGS_DIR.is_dir(): - ERROR_LOG = (LOGS_DIR / 'errors.log') + ERROR_LOG = LOGS_DIR / "errors.log" else: # historically too many edge cases here around creating log dir w/ correct permissions early on # if there's an issue on startup, we trash the log and let user figure it out via stdout/stderr # print(f'[!] WARNING: data/logs dir does not exist. Logging to temp file: {ERROR_LOG}') pass -LOG_LEVEL_DATABASE = 'WARNING' # change to DEBUG to log all SQL queries -LOG_LEVEL_REQUEST = 'WARNING' # if DEBUG else 'WARNING' +LOG_LEVEL_DATABASE = "WARNING" # change to DEBUG to log all SQL queries +LOG_LEVEL_REQUEST = "WARNING" # if DEBUG else 'WARNING' -if LOG_LEVEL_DATABASE == 'DEBUG': - db_logger = logging.getLogger('django.db.backends') +if LOG_LEVEL_DATABASE == "DEBUG": + db_logger = logging.getLogger("django.db.backends") db_logger.setLevel(logging.DEBUG) db_logger.addHandler(logging.StreamHandler()) diff --git a/archivebox/core/tag_utils.py b/archivebox/core/tag_utils.py index de562b34..d0efd427 100644 --- a/archivebox/core/tag_utils.py +++ b/archivebox/core/tag_utils.py @@ -16,52 +16,52 @@ from archivebox.core.models import Snapshot, SnapshotTag, Tag TAG_SNAPSHOT_PREVIEW_LIMIT = 10 TAG_SORT_CHOICES = ( - ('name_asc', 'Name A-Z'), - ('name_desc', 'Name Z-A'), - ('created_desc', 'Created newest'), - ('created_asc', 'Created oldest'), - ('snapshots_desc', 'Most snapshots'), - ('snapshots_asc', 'Fewest snapshots'), + ("name_asc", "Name A-Z"), + ("name_desc", "Name Z-A"), + ("created_desc", "Created newest"), + ("created_asc", "Created oldest"), + ("snapshots_desc", "Most snapshots"), + ("snapshots_asc", "Fewest snapshots"), ) TAG_HAS_SNAPSHOTS_CHOICES = ( - ('all', 'All'), - ('yes', 'Has snapshots'), - ('no', 'No snapshots'), + ("all", "All"), + ("yes", "Has snapshots"), + ("no", "No snapshots"), ) def normalize_tag_name(name: str) -> str: - return (name or '').strip() + return (name or "").strip() -def normalize_tag_sort(sort: str = 'created_desc') -> str: +def normalize_tag_sort(sort: str = "created_desc") -> str: valid_sorts = {key for key, _label in TAG_SORT_CHOICES} - return sort if sort in valid_sorts else 'created_desc' + return sort if sort in valid_sorts else "created_desc" -def normalize_has_snapshots_filter(value: str = 'all') -> str: +def normalize_has_snapshots_filter(value: str = "all") -> str: valid_filters = {key for key, _label in TAG_HAS_SNAPSHOTS_CHOICES} - return value if value in valid_filters else 'all' + return value if value in valid_filters else "all" -def normalize_created_by_filter(created_by: str = '') -> str: - return created_by if str(created_by).isdigit() else '' +def normalize_created_by_filter(created_by: str = "") -> str: + return created_by if str(created_by).isdigit() else "" -def normalize_created_year_filter(year: str = '') -> str: - year = (year or '').strip() - return year if len(year) == 4 and year.isdigit() else '' +def normalize_created_year_filter(year: str = "") -> str: + year = (year or "").strip() + return year if len(year) == 4 and year.isdigit() else "" def get_matching_tags( - query: str = '', - sort: str = 'created_desc', - created_by: str = '', - year: str = '', - has_snapshots: str = 'all', + query: str = "", + sort: str = "created_desc", + created_by: str = "", + year: str = "", + has_snapshots: str = "all", ) -> QuerySet[Tag]: - queryset = Tag.objects.select_related('created_by').annotate( - num_snapshots=Count('snapshot_set', distinct=True), + queryset = Tag.objects.select_related("created_by").annotate( + num_snapshots=Count("snapshot_set", distinct=True), ) query = normalize_tag_name(query) @@ -79,41 +79,40 @@ def get_matching_tags( queryset = queryset.filter(created_at__year=int(year)) has_snapshots = normalize_has_snapshots_filter(has_snapshots) - if has_snapshots == 'yes': + if has_snapshots == "yes": queryset = queryset.filter(num_snapshots__gt=0) - elif has_snapshots == 'no': + elif has_snapshots == "no": queryset = queryset.filter(num_snapshots=0) sort = normalize_tag_sort(sort) - if sort == 'name_asc': - queryset = queryset.order_by(Lower('name'), 'id') - elif sort == 'name_desc': - queryset = queryset.order_by(Lower('name').desc(), '-id') - elif sort == 'created_asc': - queryset = queryset.order_by(F('created_at').asc(nulls_first=True), 'id', Lower('name')) - elif sort == 'snapshots_desc': - queryset = queryset.order_by(F('num_snapshots').desc(nulls_last=True), F('created_at').desc(nulls_last=True), '-id', Lower('name')) - elif sort == 'snapshots_asc': - queryset = queryset.order_by(F('num_snapshots').asc(nulls_first=True), Lower('name'), 'id') + if sort == "name_asc": + queryset = queryset.order_by(Lower("name"), "id") + elif sort == "name_desc": + queryset = queryset.order_by(Lower("name").desc(), "-id") + elif sort == "created_asc": + queryset = queryset.order_by(F("created_at").asc(nulls_first=True), "id", Lower("name")) + elif sort == "snapshots_desc": + queryset = queryset.order_by(F("num_snapshots").desc(nulls_last=True), F("created_at").desc(nulls_last=True), "-id", Lower("name")) + elif sort == "snapshots_asc": + queryset = queryset.order_by(F("num_snapshots").asc(nulls_first=True), Lower("name"), "id") else: - queryset = queryset.order_by(F('created_at').desc(nulls_last=True), '-id', Lower('name')) + queryset = queryset.order_by(F("created_at").desc(nulls_last=True), "-id", Lower("name")) return queryset def get_tag_creator_choices() -> list[tuple[str, str]]: rows = ( - Tag.objects - .filter(created_by__isnull=False) - .values_list('created_by_id', 'created_by__username') - .order_by(Lower('created_by__username'), 'created_by_id') + Tag.objects.filter(created_by__isnull=False) + .values_list("created_by_id", "created_by__username") + .order_by(Lower("created_by__username"), "created_by_id") .distinct() ) - return [(str(user_id), username or f'User {user_id}') for user_id, username in rows] + return [(str(user_id), username or f"User {user_id}") for user_id, username in rows] def get_tag_year_choices() -> list[str]: - years = Tag.objects.exclude(created_at__isnull=True).dates('created_at', 'year', order='DESC') + years = Tag.objects.exclude(created_at__isnull=True).dates("created_at", "year", order="DESC") return [str(year.year) for year in years] @@ -134,7 +133,7 @@ def get_tag_by_ref(tag_ref: str | int) -> Tag: def get_or_create_tag(name: str, created_by: User | None = None) -> tuple[Tag, bool]: normalized_name = normalize_tag_name(name) if not normalized_name: - raise ValueError('Tag name is required') + raise ValueError("Tag name is required") existing = Tag.objects.filter(name__iexact=normalized_name).first() if existing: @@ -150,7 +149,7 @@ def get_or_create_tag(name: str, created_by: User | None = None) -> tuple[Tag, b def rename_tag(tag: Tag, name: str) -> Tag: normalized_name = normalize_tag_name(name) if not normalized_name: - raise ValueError('Tag name is required') + raise ValueError("Tag name is required") existing = Tag.objects.filter(name__iexact=normalized_name).exclude(pk=tag.pk).first() if existing: @@ -167,53 +166,56 @@ def delete_tag(tag: Tag) -> tuple[int, dict[str, int]]: def export_tag_urls(tag: Tag) -> str: - urls = tag.snapshot_set.order_by('-downloaded_at', '-created_at', '-pk').values_list('url', flat=True) - return '\n'.join(urls) + urls = tag.snapshot_set.order_by("-downloaded_at", "-created_at", "-pk").values_list("url", flat=True) + return "\n".join(urls) def export_tag_snapshots_jsonl(tag: Tag) -> str: - snapshots = tag.snapshot_set.order_by('-downloaded_at', '-created_at', '-pk').prefetch_related('tags') - return '\n'.join(json.dumps(snapshot.to_json()) for snapshot in snapshots) + snapshots = tag.snapshot_set.order_by("-downloaded_at", "-created_at", "-pk").prefetch_related("tags") + return "\n".join(json.dumps(snapshot.to_json()) for snapshot in snapshots) def _display_snapshot_title(snapshot: Snapshot) -> str: - title = (snapshot.title or '').strip() - url = (snapshot.url or '').strip() + title = (snapshot.title or "").strip() + url = (snapshot.url or "").strip() if not title: return url normalized_title = title.lower() - if normalized_title == 'pending...' or normalized_title == url.lower(): + if normalized_title == "pending..." or normalized_title == url.lower(): return url return title def _build_snapshot_preview(snapshot: Snapshot, request: HttpRequest | None = None) -> dict[str, Any]: return { - 'id': str(snapshot.pk), - 'title': _display_snapshot_title(snapshot), - 'url': snapshot.url, - 'favicon_url': build_snapshot_url(str(snapshot.pk), 'favicon.ico', request=request), - 'admin_url': reverse('admin:core_snapshot_change', args=[snapshot.pk]), - 'archive_url': build_web_url(f'/{snapshot.archive_path_from_db}/index.html', request=request), - 'downloaded_at': snapshot.downloaded_at.isoformat() if snapshot.downloaded_at else None, + "id": str(snapshot.pk), + "title": _display_snapshot_title(snapshot), + "url": snapshot.url, + "favicon_url": build_snapshot_url(str(snapshot.pk), "favicon.ico", request=request), + "admin_url": reverse("admin:core_snapshot_change", args=[snapshot.pk]), + "archive_url": build_web_url(f"/{snapshot.archive_path_from_db}/index.html", request=request), + "downloaded_at": snapshot.downloaded_at.isoformat() if snapshot.downloaded_at else None, } -def _build_snapshot_preview_map(tags: list[Tag], request: HttpRequest | None = None, preview_limit: int = TAG_SNAPSHOT_PREVIEW_LIMIT) -> dict[int, list[dict[str, Any]]]: +def _build_snapshot_preview_map( + tags: list[Tag], + request: HttpRequest | None = None, + preview_limit: int = TAG_SNAPSHOT_PREVIEW_LIMIT, +) -> dict[int, list[dict[str, Any]]]: tag_ids = [tag.pk for tag in tags] if not tag_ids: return {} snapshot_tags = ( - SnapshotTag.objects - .filter(tag_id__in=tag_ids) - .select_related('snapshot__crawl__created_by') + SnapshotTag.objects.filter(tag_id__in=tag_ids) + .select_related("snapshot__crawl__created_by") .order_by( - 'tag_id', - F('snapshot__downloaded_at').desc(nulls_last=True), - F('snapshot__created_at').desc(nulls_last=True), - F('snapshot_id').desc(), + "tag_id", + F("snapshot__downloaded_at").desc(nulls_last=True), + F("snapshot__created_at").desc(nulls_last=True), + F("snapshot_id").desc(), ) ) @@ -227,31 +229,31 @@ def _build_snapshot_preview_map(tags: list[Tag], request: HttpRequest | None = N def build_tag_card(tag: Tag, snapshot_previews: list[dict[str, Any]] | None = None) -> dict[str, Any]: - count = getattr(tag, 'num_snapshots', tag.snapshot_set.count()) + count = getattr(tag, "num_snapshots", tag.snapshot_set.count()) return { - 'id': tag.pk, - 'name': tag.name, - 'slug': tag.slug, - 'num_snapshots': count, - 'filter_url': f"{reverse('admin:core_snapshot_changelist')}?tags__id__exact={tag.pk}", - 'edit_url': reverse('admin:core_tag_change', args=[tag.pk]), - 'export_urls_url': reverse('api-1:tag_urls_export', args=[tag.pk]), - 'export_jsonl_url': reverse('api-1:tag_snapshots_export', args=[tag.pk]), - 'rename_url': reverse('api-1:rename_tag', args=[tag.pk]), - 'delete_url': reverse('api-1:delete_tag', args=[tag.pk]), - 'snapshots': snapshot_previews or [], + "id": tag.pk, + "name": tag.name, + "slug": tag.slug, + "num_snapshots": count, + "filter_url": f"{reverse('admin:core_snapshot_changelist')}?tags__id__exact={tag.pk}", + "edit_url": reverse("admin:core_tag_change", args=[tag.pk]), + "export_urls_url": reverse("api-1:tag_urls_export", args=[tag.pk]), + "export_jsonl_url": reverse("api-1:tag_snapshots_export", args=[tag.pk]), + "rename_url": reverse("api-1:rename_tag", args=[tag.pk]), + "delete_url": reverse("api-1:delete_tag", args=[tag.pk]), + "snapshots": snapshot_previews or [], } def build_tag_cards( - query: str = '', + query: str = "", request: HttpRequest | None = None, limit: int | None = None, preview_limit: int = TAG_SNAPSHOT_PREVIEW_LIMIT, - sort: str = 'created_desc', - created_by: str = '', - year: str = '', - has_snapshots: str = 'all', + sort: str = "created_desc", + created_by: str = "", + year: str = "", + has_snapshots: str = "all", ) -> list[dict[str, Any]]: queryset = get_matching_tags( query=query, @@ -265,7 +267,4 @@ def build_tag_cards( tags = list(queryset) preview_map = _build_snapshot_preview_map(tags, request=request, preview_limit=preview_limit) - return [ - build_tag_card(tag, snapshot_previews=preview_map.get(tag.pk, [])) - for tag in tags - ] + return [build_tag_card(tag, snapshot_previews=preview_map.get(tag.pk, [])) for tag in tags] diff --git a/archivebox/core/templatetags/core_tags.py b/archivebox/core/templatetags/core_tags.py index a0323ca3..fb1730a2 100644 --- a/archivebox/core/templatetags/core_tags.py +++ b/archivebox/core/templatetags/core_tags.py @@ -1,13 +1,16 @@ +from typing import Any + from django import template from django.contrib.admin.templatetags.base import InclusionAdminNode from django.utils.safestring import mark_safe from django.utils.html import escape -from typing import Union from pathlib import Path from archivebox.hooks import ( - get_plugin_icon, get_plugin_template, get_plugin_name, + get_plugin_icon, + get_plugin_template, + get_plugin_name, ) from archivebox.core.host_utils import ( get_admin_base_url, @@ -20,28 +23,70 @@ from archivebox.core.host_utils import ( register = template.Library() +_TEXT_PREVIEW_EXTS = (".json", ".jsonl", ".txt", ".csv", ".tsv", ".xml", ".yml", ".yaml", ".md", ".log") +_IMAGE_PREVIEW_EXTS = (".png", ".jpg", ".jpeg", ".gif", ".webp", ".bmp", ".ico", ".avif") + _MEDIA_FILE_EXTS = { - '.mp4', '.webm', '.mkv', '.avi', '.mov', '.flv', '.wmv', '.m4v', '.mpg', '.mpeg', '.ts', '.m2ts', '.mts', - '.3gp', '.3g2', '.ogv', - '.mp3', '.m4a', '.aac', '.ogg', '.oga', '.opus', '.wav', '.flac', '.alac', '.aiff', '.wma', '.mka', '.ac3', '.eac3', '.dts', + ".mp4", + ".webm", + ".mkv", + ".avi", + ".mov", + ".flv", + ".wmv", + ".m4v", + ".mpg", + ".mpeg", + ".ts", + ".m2ts", + ".mts", + ".3gp", + ".3g2", + ".ogv", + ".mp3", + ".m4a", + ".aac", + ".ogg", + ".oga", + ".opus", + ".wav", + ".flac", + ".alac", + ".aiff", + ".wma", + ".mka", + ".ac3", + ".eac3", + ".dts", } +def _normalize_output_files(output_files: Any) -> dict[str, dict[str, Any]]: + if isinstance(output_files, dict): + normalized: dict[str, dict[str, Any]] = {} + for path, metadata in output_files.items(): + if not path: + continue + normalized[str(path)] = dict(metadata) if isinstance(metadata, dict) else {} + return normalized + return {} + + +def _coerce_output_file_size(value: Any) -> int | None: + try: + return max(int(value or 0), 0) + except (TypeError, ValueError): + return None + + def _count_media_files(result) -> int: try: - output_files = getattr(result, 'output_files', None) or {} + output_files = _normalize_output_files(getattr(result, "output_files", None) or {}) except Exception: output_files = {} - count_from_output = 0 if output_files: - count_from_output = sum( - 1 - for path in output_files.keys() - if Path(path).suffix.lower() in _MEDIA_FILE_EXTS - ) - if count_from_output >= 2: - return count_from_output + return sum(1 for path in output_files.keys() if Path(path).suffix.lower() in _MEDIA_FILE_EXTS) try: plugin_dir = Path(result.snapshot_dir) / result.plugin @@ -54,7 +99,7 @@ def _count_media_files(result) -> int: count = 0 scanned = 0 max_scan = 500 - for file_path in plugin_dir.rglob('*'): + for file_path in plugin_dir.rglob("*"): if scanned >= max_scan: break scanned += 1 @@ -62,29 +107,28 @@ def _count_media_files(result) -> int: continue if file_path.suffix.lower() in _MEDIA_FILE_EXTS: count += 1 - return max(count_from_output, count) + return count def _list_media_files(result) -> list[dict]: media_files: list[dict] = [] try: plugin_dir = Path(result.snapshot_dir) / result.plugin - snapshot_dir = Path(result.snapshot_dir) except Exception: return media_files - output_files = getattr(result, 'output_files', None) or {} - candidates: list[Path] = [] + output_files = _normalize_output_files(getattr(result, "output_files", None) or {}) + candidates: list[tuple[Path, int | None]] = [] if output_files: - for path in output_files.keys(): + for path, metadata in output_files.items(): rel_path = Path(path) if rel_path.suffix.lower() in _MEDIA_FILE_EXTS: - candidates.append(rel_path) + candidates.append((rel_path, _coerce_output_file_size(metadata.get("size")))) if not candidates and plugin_dir.exists(): scanned = 0 max_scan = 2000 - for file_path in plugin_dir.rglob('*'): + for file_path in plugin_dir.rglob("*"): if scanned >= max_scan: break scanned += 1 @@ -95,40 +139,143 @@ def _list_media_files(result) -> list[dict]: rel_path = file_path.relative_to(plugin_dir) except ValueError: continue - candidates.append(rel_path) + try: + size = file_path.stat().st_size + except OSError: + size = None + candidates.append((rel_path, size)) - for rel_path in candidates: - file_path = plugin_dir / rel_path - if not file_path.exists() or not file_path.is_file(): - continue - try: - size = file_path.stat().st_size - except OSError: - size = None - try: - href = str(file_path.relative_to(snapshot_dir)) - except ValueError: - href = str(Path(result.plugin) / rel_path) - media_files.append({ - 'name': file_path.name, - 'path': href, - 'size': size, - }) + for rel_path, size in candidates: + href = str(Path(result.plugin) / rel_path) + media_files.append( + { + "name": rel_path.name, + "path": href, + "size": size, + }, + ) - media_files.sort(key=lambda item: item['name'].lower()) + media_files.sort(key=lambda item: item["name"].lower()) return media_files -@register.filter(name='split') -def split(value, separator: str=','): - return (value or '').split(separator) + +def _resolve_snapshot_output_file(snapshot_dir: str | Path | None, raw_output_path: str | None) -> Path | None: + if not snapshot_dir or not raw_output_path or str(raw_output_path).strip() in (".", "/", "./"): + return None + + output_file = Path(raw_output_path) + if not output_file.is_absolute(): + output_file = Path(snapshot_dir) / raw_output_path + + try: + output_file = output_file.resolve() + snap_dir = Path(snapshot_dir).resolve() + if snap_dir not in output_file.parents and output_file != snap_dir: + return None + except Exception: + return None + + if output_file.exists() and output_file.is_file(): + return output_file + return None + + +def _is_text_preview_path(raw_output_path: str | None) -> bool: + return (raw_output_path or "").lower().endswith(_TEXT_PREVIEW_EXTS) + + +def _is_image_preview_path(raw_output_path: str | None) -> bool: + return (raw_output_path or "").lower().endswith(_IMAGE_PREVIEW_EXTS) + + +def _is_root_snapshot_output_path(raw_output_path: str | None) -> bool: + normalized = str(raw_output_path or "").strip().lower() + return normalized in ("", ".", "./", "/", "index.html", "index.json") + + +def _build_snapshot_files_url(snapshot_id: str, request=None) -> str: + return build_snapshot_url(str(snapshot_id), "/?files=1", request=request) + + +def _build_snapshot_preview_url(snapshot_id: str, path: str = "", request=None) -> str: + if path == "about:blank": + return path + if _is_root_snapshot_output_path(path): + return _build_snapshot_files_url(snapshot_id, request=request) + url = build_snapshot_url(str(snapshot_id), path, request=request) + if not (_is_text_preview_path(path) or _is_image_preview_path(path)): + return url + separator = "&" if "?" in url else "?" + return f"{url}{separator}preview=1" + + +def _render_text_preview(plugin: str, icon_html: str, snippet: str) -> str: + plugin_attr = escape(plugin or "") + plugin_label = escape(plugin or "") + escaped = escape(snippet) + return ( + f'
' + f'
' + f'{icon_html}' + f'{plugin_label}' + f"
" + f'
{escaped}
' + f"
" + ) + + +def _render_fallback_card(plugin: str, icon_html: str, fallback_label: str) -> str: + plugin_attr = escape(plugin or "") + plugin_label = escape(plugin or "") + fallback_attr = escape(fallback_label) + return ( + f'
' + f'{icon_html}' + f'{plugin_label}' + f'{fallback_attr}' + f"
" + ) + + +def _render_text_file_preview(snapshot_dir: str | Path | None, raw_output_path: str | None, plugin: str, icon_html: str) -> str | None: + output_file = _resolve_snapshot_output_file(snapshot_dir, raw_output_path) + if not output_file: + return None + + try: + with output_file.open("rb") as f: + raw = f.read(4096) + text = raw.decode("utf-8", errors="replace").strip() + if not text: + return None + lines = text.splitlines()[:6] + snippet = "\n".join(lines) + return _render_text_preview(plugin, icon_html, snippet) + except Exception: + return None + + +@register.filter(name="split") +def split(value, separator: str = ","): + return (value or "").split(separator) + + +@register.filter(name="index") +def index(value, position): + try: + return value[int(position)] + except Exception: + return None + @register.filter -def file_size(num_bytes: Union[int, float]) -> str: - for count in ['Bytes','KB','MB','GB']: +def file_size(num_bytes: int | float) -> str: + for count in ["Bytes", "KB", "MB", "GB"]: if num_bytes > -1024.0 and num_bytes < 1024.0: - return '%3.1f %s' % (num_bytes, count) + return f"{num_bytes:3.1f} {count}" num_bytes /= 1024.0 - return '%3.1f %s' % (num_bytes, 'TB') + return "{:3.1f} {}".format(num_bytes, "TB") + def result_list(cl): """ @@ -136,52 +283,61 @@ def result_list(cl): """ num_sorted_fields = 0 return { - 'cl': cl, - 'num_sorted_fields': num_sorted_fields, - 'results': cl.result_list, + "cl": cl, + "num_sorted_fields": num_sorted_fields, + "results": cl.result_list, } -@register.tag(name='snapshots_grid') + +@register.tag(name="snapshots_grid") def result_list_tag(parser, token): return InclusionAdminNode( - parser, token, + parser, + token, func=result_list, - template_name='snapshots_grid.html', + template_name="snapshots_grid.html", takes_context=False, ) + @register.simple_tag(takes_context=True) def url_replace(context, **kwargs): - dict_ = context['request'].GET.copy() + dict_ = context["request"].GET.copy() dict_.update(**kwargs) return dict_.urlencode() @register.simple_tag(takes_context=True) def admin_base_url(context) -> str: - return get_admin_base_url(request=context.get('request')) + return get_admin_base_url(request=context.get("request")) @register.simple_tag(takes_context=True) def web_base_url(context) -> str: - return get_web_base_url(request=context.get('request')) + return get_web_base_url(request=context.get("request")) @register.simple_tag(takes_context=True) def public_base_url(context) -> str: - return get_public_base_url(request=context.get('request')) + return get_public_base_url(request=context.get("request")) @register.simple_tag(takes_context=True) def snapshot_base_url(context, snapshot) -> str: - snapshot_id = getattr(snapshot, 'id', snapshot) - return get_snapshot_base_url(str(snapshot_id), request=context.get('request')) + snapshot_id = getattr(snapshot, "id", snapshot) + return get_snapshot_base_url(str(snapshot_id), request=context.get("request")) @register.simple_tag(takes_context=True) def snapshot_url(context, snapshot, path: str = "") -> str: - snapshot_id = getattr(snapshot, 'id', snapshot) - return build_snapshot_url(str(snapshot_id), path, request=context.get('request')) + snapshot_id = getattr(snapshot, "id", snapshot) + return build_snapshot_url(str(snapshot_id), path, request=context.get("request")) + + +@register.simple_tag(takes_context=True) +def snapshot_preview_url(context, snapshot, path: str = "") -> str: + snapshot_id = getattr(snapshot, "id", snapshot) + return _build_snapshot_preview_url(str(snapshot_id), path, request=context.get("request")) @register.simple_tag @@ -193,7 +349,7 @@ def plugin_icon(plugin: str) -> str: """ icon_html = get_plugin_icon(plugin) return mark_safe( - f'{icon_html}' + f'{icon_html}', ) @@ -210,46 +366,50 @@ def plugin_card(context, result) -> str: - output_path: Path to output relative to snapshot dir (from embed_path()) - plugin: Plugin base name """ + if result is None or not hasattr(result, "plugin"): + return "" + plugin = get_plugin_name(result.plugin) - template_str = get_plugin_template(plugin, 'card') + template_str = get_plugin_template(plugin, "card") # Use embed_path() for the display path - raw_output_path = result.embed_path() if hasattr(result, 'embed_path') else '' + raw_output_path = result.embed_path() if hasattr(result, "embed_path") else "" output_url = build_snapshot_url( - str(getattr(result, 'snapshot_id', '')), - raw_output_path or '', - request=context.get('request'), + str(getattr(result, "snapshot_id", "")), + raw_output_path or "", + request=context.get("request"), ) icon_html = get_plugin_icon(plugin) - plugin_lower = (plugin or '').lower() - media_file_count = _count_media_files(result) if plugin_lower in ('ytdlp', 'yt-dlp', 'youtube-dl') else 0 - media_files = _list_media_files(result) if plugin_lower in ('ytdlp', 'yt-dlp', 'youtube-dl') else [] + plugin_lower = (plugin or "").lower() + media_file_count = _count_media_files(result) if plugin_lower in ("ytdlp", "yt-dlp", "youtube-dl") else 0 + media_files = _list_media_files(result) if plugin_lower in ("ytdlp", "yt-dlp", "youtube-dl") else [] if media_files: - snapshot_id = str(getattr(result, 'snapshot_id', '')) - request = context.get('request') + snapshot_id = str(getattr(result, "snapshot_id", "")) + request = context.get("request") for item in media_files: - path = item.get('path') or '' - item['url'] = build_snapshot_url(snapshot_id, path, request=request) if path else '' + path = item.get("path") or "" + item["url"] = build_snapshot_url(snapshot_id, path, request=request) if path else "" - output_lower = (raw_output_path or '').lower() - text_preview_exts = ('.json', '.jsonl', '.txt', '.csv', '.tsv', '.xml', '.yml', '.yaml', '.md', '.log') - force_text_preview = output_lower.endswith(text_preview_exts) + output_lower = (raw_output_path or "").lower() + force_text_preview = output_lower.endswith(_TEXT_PREVIEW_EXTS) # Create a mini template and render it with context try: - if template_str and raw_output_path and str(raw_output_path).strip() not in ('.', '/', './') and not force_text_preview: + if template_str and raw_output_path and str(raw_output_path).strip() not in (".", "/", "./") and not force_text_preview: tpl = template.Template(template_str) - ctx = template.Context({ - 'result': result, - 'snapshot': result.snapshot, - 'output_path': output_url, - 'output_path_raw': raw_output_path, - 'plugin': plugin, - 'plugin_icon': icon_html, - 'media_file_count': media_file_count, - 'media_files': media_files, - }) + ctx = template.Context( + { + "result": result, + "snapshot": result.snapshot, + "output_path": output_url, + "output_path_raw": raw_output_path, + "plugin": plugin, + "plugin_icon": icon_html, + "media_file_count": media_file_count, + "media_files": media_files, + }, + ) rendered = tpl.render(ctx) # Only return non-empty content (strip whitespace to check) if rendered.strip(): @@ -257,52 +417,30 @@ def plugin_card(context, result) -> str: except Exception: pass - if force_text_preview and raw_output_path and str(raw_output_path).strip() not in ('.', '/', './'): - output_file = Path(raw_output_path) - if not output_file.is_absolute(): - output_file = Path(result.snapshot_dir) / raw_output_path - try: - output_file = output_file.resolve() - snap_dir = Path(result.snapshot_dir).resolve() - if snap_dir not in output_file.parents and output_file != snap_dir: - output_file = None - except Exception: - output_file = None - if output_file and output_file.exists() and output_file.is_file(): - try: - with output_file.open('rb') as f: - raw = f.read(4096) - text = raw.decode('utf-8', errors='replace').strip() - if text: - lines = text.splitlines()[:6] - snippet = '\n'.join(lines) - escaped = escape(snippet) - preview = ( - f'
' - f'
' - f'{icon_html}' - f'{plugin}' - f'
' - f'
{escaped}
' - f'
' - ) - return mark_safe(preview) - except Exception: - pass + if force_text_preview: + preview = _render_text_file_preview(getattr(result, "snapshot_dir", None), raw_output_path, plugin, icon_html) + if preview: + return mark_safe(preview) - if output_lower.endswith(text_preview_exts): - fallback_label = 'text' + if output_lower.endswith(_TEXT_PREVIEW_EXTS): + fallback_label = "text" else: - fallback_label = 'output' + fallback_label = "output" - fallback = ( - f'
' - f'{icon_html}' - f'{plugin}' - f'{fallback_label}' - f'
' - ) - return mark_safe(fallback) + return mark_safe(_render_fallback_card(plugin, icon_html, fallback_label)) + + +@register.simple_tag +def output_card(snapshot, output_path: str, plugin: str) -> str: + plugin_name = get_plugin_name(plugin) + icon_html = get_plugin_icon(plugin_name) + preview = _render_text_file_preview(getattr(snapshot, "output_dir", None), output_path, plugin_name, icon_html) + if preview: + return mark_safe(preview) + + output_lower = (output_path or "").lower() + fallback_label = "text" if output_lower.endswith(_TEXT_PREVIEW_EXTS) else "output" + return mark_safe(_render_fallback_card(plugin_name, icon_html, fallback_label)) @register.simple_tag(takes_context=True) @@ -312,37 +450,46 @@ def plugin_full(context, result) -> str: Usage: {% plugin_full result %} """ + if result is None or not hasattr(result, "plugin"): + return "" + plugin = get_plugin_name(result.plugin) - template_str = get_plugin_template(plugin, 'full') + template_str = get_plugin_template(plugin, "full") if not template_str: - return '' + return "" - raw_output_path = result.embed_path() if hasattr(result, 'embed_path') else '' + raw_output_path = "" + if hasattr(result, "embed_path_db"): + raw_output_path = result.embed_path_db() or "" + if not raw_output_path and hasattr(result, "embed_path"): + raw_output_path = result.embed_path() or "" + if _is_root_snapshot_output_path(raw_output_path): + return "" output_url = build_snapshot_url( - str(getattr(result, 'snapshot_id', '')), - raw_output_path or '', - request=context.get('request'), + str(getattr(result, "snapshot_id", "")), + raw_output_path, + request=context.get("request"), ) try: tpl = template.Template(template_str) - ctx = template.Context({ - 'result': result, - 'snapshot': result.snapshot, - 'output_path': output_url, - 'output_path_raw': raw_output_path, - 'plugin': plugin, - }) + ctx = template.Context( + { + "result": result, + "snapshot": result.snapshot, + "output_path": output_url, + "output_path_raw": raw_output_path, + "plugin": plugin, + }, + ) rendered = tpl.render(ctx) # Only return non-empty content (strip whitespace to check) if rendered.strip(): return mark_safe(rendered) - return '' + return "" except Exception: - return '' - - + return "" @register.filter @@ -355,8 +502,6 @@ def plugin_name(value: str) -> str: return get_plugin_name(value) - - @register.simple_tag(takes_context=True) def api_token(context) -> str: """ @@ -364,10 +509,10 @@ def api_token(context) -> str: """ from archivebox.api.auth import get_or_create_api_token - request = context.get('request') - user = getattr(request, 'user', None) + request = context.get("request") + user = getattr(request, "user", None) if not user or not user.is_authenticated: - return '' + return "" token = get_or_create_api_token(user) - return token.token if token else '' + return token.token if token else "" diff --git a/archivebox/core/urls.py b/archivebox/core/urls.py index e8072d14..85a5bb85 100644 --- a/archivebox/core/urls.py +++ b/archivebox/core/urls.py @@ -1,4 +1,4 @@ -__package__ = 'archivebox.core' +__package__ = "archivebox.core" from django.urls import path, re_path, include from django.views import static @@ -9,7 +9,18 @@ from django.http import HttpRequest from archivebox.misc.serve_static import serve_static from archivebox.core.admin_site import archivebox_admin -from archivebox.core.views import HomepageView, SnapshotView, SnapshotPathView, SnapshotReplayView, OriginalDomainReplayView, PublicIndexView, AddView, WebAddView, HealthCheckView, live_progress_view +from archivebox.core.views import ( + HomepageView, + SnapshotView, + SnapshotPathView, + SnapshotReplayView, + OriginalDomainReplayView, + PublicIndexView, + AddView, + WebAddView, + HealthCheckView, + live_progress_view, +) # GLOBAL_CONTEXT doesn't work as-is, disabled for now: https://github.com/ArchiveBox/ArchiveBox/discussions/1306 @@ -22,54 +33,54 @@ from archivebox.core.views import HomepageView, SnapshotView, SnapshotPathView, urlpatterns = [ re_path(r"^static/(?P.*)$", serve_static), # re_path(r"^media/(?P.*)$", static.serve, {"document_root": settings.MEDIA_ROOT}), - - path('robots.txt', static.serve, {'document_root': settings.STATICFILES_DIRS[0], 'path': 'robots.txt'}), - path('favicon.ico', static.serve, {'document_root': settings.STATICFILES_DIRS[0], 'path': 'favicon.ico'}), - - path('docs/', RedirectView.as_view(url='https://github.com/ArchiveBox/ArchiveBox/wiki'), name='Docs'), - - path('public/', PublicIndexView.as_view(), name='public-index'), - path('public.html', RedirectView.as_view(url='/public/'), name='public-index-html'), - - path('archive/', RedirectView.as_view(url='/')), - path('archive/', SnapshotView.as_view(), name='Snapshot'), - re_path(r'^snapshot\/(?P[0-9a-fA-F-]{8,36})(?:\/(?P.*))?$', SnapshotReplayView.as_view(), name='snapshot-replay'), - re_path(r'^original\/(?P[^/]+)(?:\/(?P.*))?$', OriginalDomainReplayView.as_view(), name='original-replay'), - re_path(r'^web/(?P(?!\d{4}(?:\d{2})?(?:\d{2})?(?:/|$)).+)$', WebAddView.as_view(), name='web-add'), - re_path(r'^(?P[^/]+)/(?P\d{4}(?:\d{2})?(?:\d{2})?)/(?Phttps?://.*)$', SnapshotPathView.as_view(), name='snapshot-path-url'), - re_path(r'^(?P[^/]+)/(?P\d{4}(?:\d{2})?(?:\d{2})?)/(?P[^/]+)(?:/(?P[0-9a-fA-F-]{8,36})(?:/(?P.*))?)?$', SnapshotPathView.as_view(), name='snapshot-path'), - re_path(r'^(?P[^/]+)/(?Phttps?://.*)$', SnapshotPathView.as_view(), name='snapshot-path-url-nodate'), - re_path(r'^(?P[^/]+)/(?P[^/]+)(?:/(?P[0-9a-fA-F-]{8,36})(?:/(?P.*))?)?$', SnapshotPathView.as_view(), name='snapshot-path-nodate'), - - path('admin/core/snapshot/add/', RedirectView.as_view(url='/add/')), - path('add/', AddView.as_view(), name='add'), - - path('accounts/login/', RedirectView.as_view(url='/admin/login/')), - path('accounts/logout/', RedirectView.as_view(url='/admin/logout/')), - - - path('accounts/', include('django.contrib.auth.urls')), - - path('admin/live-progress/', live_progress_view, name='live_progress'), - path('admin/', archivebox_admin.urls), - - path("api/", include('archivebox.api.urls'), name='api'), - - path('health/', HealthCheckView.as_view(), name='healthcheck'), - path('error/', lambda request: _raise_test_error(request)), - + path("robots.txt", static.serve, {"document_root": settings.STATICFILES_DIRS[0], "path": "robots.txt"}), + path("favicon.ico", static.serve, {"document_root": settings.STATICFILES_DIRS[0], "path": "favicon.ico"}), + path("docs/", RedirectView.as_view(url="https://github.com/ArchiveBox/ArchiveBox/wiki"), name="Docs"), + path("public/", PublicIndexView.as_view(), name="public-index"), + path("public.html", RedirectView.as_view(url="/public/"), name="public-index-html"), + path("archive/", RedirectView.as_view(url="/")), + path("archive/", SnapshotView.as_view(), name="Snapshot"), + re_path(r"^snapshot\/(?P[0-9a-fA-F-]{8,36})(?:\/(?P.*))?$", SnapshotReplayView.as_view(), name="snapshot-replay"), + re_path(r"^original\/(?P[^/]+)(?:\/(?P.*))?$", OriginalDomainReplayView.as_view(), name="original-replay"), + re_path(r"^web/(?P(?!\d{4}(?:\d{2})?(?:\d{2})?(?:/|$)).+)$", WebAddView.as_view(), name="web-add"), + re_path( + r"^(?P[^/]+)/(?P\d{4}(?:\d{2})?(?:\d{2})?)/(?Phttps?://.*)$", + SnapshotPathView.as_view(), + name="snapshot-path-url", + ), + re_path( + r"^(?P[^/]+)/(?P\d{4}(?:\d{2})?(?:\d{2})?)/(?P[^/]+)(?:/(?P[0-9a-fA-F-]{8,36})(?:/(?P.*))?)?$", + SnapshotPathView.as_view(), + name="snapshot-path", + ), + re_path(r"^(?P[^/]+)/(?Phttps?://.*)$", SnapshotPathView.as_view(), name="snapshot-path-url-nodate"), + re_path( + r"^(?P[^/]+)/(?P[^/]+)(?:/(?P[0-9a-fA-F-]{8,36})(?:/(?P.*))?)?$", + SnapshotPathView.as_view(), + name="snapshot-path-nodate", + ), + path("admin/core/snapshot/add/", RedirectView.as_view(url="/add/")), + path("add/", AddView.as_view(), name="add"), + path("accounts/login/", RedirectView.as_view(url="/admin/login/")), + path("accounts/logout/", RedirectView.as_view(url="/admin/logout/")), + path("accounts/", include("django.contrib.auth.urls")), + path("admin/live-progress/", live_progress_view, name="live_progress"), + path("admin/", archivebox_admin.urls), + path("api/", include("archivebox.api.urls"), name="api"), + path("health/", HealthCheckView.as_view(), name="healthcheck"), + path("error/", lambda request: _raise_test_error(request)), # path('jet_api/', include('jet_django.urls')), Enable to use https://www.jetadmin.io/integrations/django - - path('index.html', RedirectView.as_view(url='/')), - path('', HomepageView.as_view(), name='Home'), + path("index.html", RedirectView.as_view(url="/")), + path("", HomepageView.as_view(), name="Home"), ] def _raise_test_error(_request: HttpRequest): - raise ZeroDivisionError('Intentional test error route') + raise ZeroDivisionError("Intentional test error route") + if settings.DEBUG_TOOLBAR: - urlpatterns += [path('__debug__/', include("debug_toolbar.urls"))] + urlpatterns += [path("__debug__/", include("debug_toolbar.urls"))] if settings.DEBUG_REQUESTS_TRACKER: urlpatterns += [path("__requests_tracker__/", include("requests_tracker.urls"))] @@ -84,7 +95,7 @@ if settings.DEBUG_REQUESTS_TRACKER: # path('/admin', admin.site.urls) # path('/accounts', django.contrib.auth.urls) -# # Prposed REST API spec +# # Proposed REST API spec # # :slugs can be uuid, short_uuid, or any of the unique index_fields # path('api/v1/'), # path('api/v1/core/' [GET]) diff --git a/archivebox/core/views.py b/archivebox/core/views.py index d63af6dc..9f27b278 100644 --- a/archivebox/core/views.py +++ b/archivebox/core/views.py @@ -1,4 +1,4 @@ -__package__ = 'archivebox.core' +__package__ = "archivebox.core" import json import os @@ -6,7 +6,8 @@ import posixpath from glob import glob, escape from django.utils import timezone import inspect -from typing import Callable, cast, get_type_hints +from typing import cast, get_type_hints +from collections.abc import Callable from pathlib import Path from urllib.parse import quote, urlparse @@ -29,13 +30,22 @@ from admin_data_views.utils import render_with_table_view, render_with_item_view from archivebox.config import CONSTANTS, CONSTANTS_CONFIG, DATA_DIR, VERSION from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG, SEARCH_BACKEND_CONFIG from archivebox.config.configset import get_flat_config, get_config, get_all_configs -from archivebox.misc.util import base_url, htmlencode, ts_to_date_str, urldecode +from archivebox.misc.util import base_url, htmlencode, ts_to_date_str, urldecode, without_fragment from archivebox.misc.serve_static import serve_static_with_byterange_support from archivebox.misc.logging_util import printable_filesize -from archivebox.search import query_search_index +from archivebox.search import get_search_mode, prioritize_metadata_matches, query_search_index from archivebox.core.models import Snapshot -from archivebox.core.host_utils import build_snapshot_url +from archivebox.core.host_utils import ( + build_admin_url, + build_snapshot_url, + build_web_url, + get_admin_host, + get_snapshot_host, + get_snapshot_lookup_key, + get_web_host, + host_matches, +) from archivebox.core.forms import AddLinkForm from archivebox.crawls.models import Crawl from archivebox.hooks import ( @@ -48,35 +58,54 @@ from archivebox.hooks import ( ) -ABX_PLUGINS_GITHUB_BASE_URL = 'https://github.com/ArchiveBox/abx-plugins/tree/main/abx_plugins/plugins/' -LIVE_PLUGIN_BASE_URL = '/admin/environment/plugins/' +ABX_PLUGINS_GITHUB_BASE_URL = "https://github.com/ArchiveBox/abx-plugins/tree/main/abx_plugins/plugins/" +LIVE_PLUGIN_BASE_URL = "/admin/environment/plugins/" def _files_index_target(snapshot: Snapshot, archivefile: str | None) -> str: - target = archivefile or '' - if target == 'index.html': - target = '' + target = archivefile or "" + if target == "index.html": + target = "" fullpath = Path(snapshot.output_dir) / target if fullpath.is_file(): target = str(Path(target).parent) - if target == '.': - target = '' + if target == ".": + target = "" return target +def _find_snapshot_by_ref(snapshot_ref: str) -> Snapshot | None: + lookup = get_snapshot_lookup_key(snapshot_ref) + if not lookup: + return None + + if len(lookup) == 12 and "-" not in lookup: + return Snapshot.objects.filter(id__endswith=lookup).order_by("-created_at", "-downloaded_at").first() + + try: + return Snapshot.objects.get(pk=lookup) + except Snapshot.DoesNotExist: + try: + return Snapshot.objects.get(id__startswith=lookup) + except Snapshot.DoesNotExist: + return None + except Snapshot.MultipleObjectsReturned: + return Snapshot.objects.filter(id__startswith=lookup).first() + + def _admin_login_redirect_or_forbidden(request: HttpRequest): if SERVER_CONFIG.CONTROL_PLANE_ENABLED: - return redirect(f'/admin/login/?next={request.path}') + return redirect(f"/admin/login/?next={request.path}") return HttpResponseForbidden("ArchiveBox is running with the control plane disabled in this security mode.") class HomepageView(View): def get(self, request): if request.user.is_authenticated and SERVER_CONFIG.CONTROL_PLANE_ENABLED: - return redirect('/admin/core/snapshot/') + return redirect("/admin/core/snapshot/") if SERVER_CONFIG.PUBLIC_INDEX: - return redirect('/public') + return redirect("/public") return _admin_login_redirect_or_forbidden(request) @@ -87,17 +116,24 @@ class SnapshotView(View): @staticmethod def find_snapshots_for_url(path: str): """Return a queryset of snapshots matching a URL-ish path.""" - normalized = path - if path.startswith(('http://', 'https://')): + + def _fragmentless_url_query(url: str) -> Q: + canonical = without_fragment(url) + return Q(url=canonical) | Q(url__startswith=f"{canonical}#") + + normalized = without_fragment(path) + if path.startswith(("http://", "https://")): # try exact match on full url / ID first - qs = Snapshot.objects.filter(Q(url=path) | Q(id__icontains=path)) + qs = Snapshot.objects.filter(_fragmentless_url_query(path) | Q(id__icontains=path) | Q(id__icontains=normalized)) if qs.exists(): return qs - normalized = path.split('://', 1)[1] + normalized = normalized.split("://", 1)[1] # try exact match on full url / ID (without scheme) qs = Snapshot.objects.filter( - Q(url='http://' + normalized) | Q(url='https://' + normalized) | Q(id__icontains=normalized) + _fragmentless_url_query("http://" + normalized) + | _fragmentless_url_query("https://" + normalized) + | Q(id__icontains=normalized), ) if qs.exists(): return qs @@ -105,131 +141,62 @@ class SnapshotView(View): # fall back to match on exact base_url base = base_url(normalized) qs = Snapshot.objects.filter( - Q(url='http://' + base) | Q(url='https://' + base) + _fragmentless_url_query("http://" + base) | _fragmentless_url_query("https://" + base), ) if qs.exists(): return qs # fall back to matching base_url as prefix - return Snapshot.objects.filter( - Q(url__startswith='http://' + base) | Q(url__startswith='https://' + base) - ) + return Snapshot.objects.filter(Q(url__startswith="http://" + base) | Q(url__startswith="https://" + base)) @staticmethod def render_live_index(request, snapshot): - TITLE_LOADING_MSG = 'Not yet archived...' + TITLE_LOADING_MSG = "Not yet archived..." + from archivebox.core.widgets import TagEditorWidget - hidden_card_plugins = {'archivedotorg', 'favicon', 'title'} + hidden_card_plugins = {"archivedotorg", "favicon", "title"} outputs = [ - out for out in snapshot.discover_outputs() - if (out.get('size') or 0) > 0 and out.get('name') not in hidden_card_plugins + out + for out in snapshot.discover_outputs(include_filesystem_fallback=True) + if (out.get("size") or 0) > 0 and out.get("name") not in hidden_card_plugins ] - archiveresults = {out['name']: out for out in outputs} - snap_dir = Path(snapshot.output_dir) + archiveresults = {out["name"]: out for out in outputs} + hash_index = snapshot.hashes_index # Get available extractor plugins from hooks (sorted by numeric prefix for ordering) # Convert to base names for display ordering all_plugins = [get_plugin_name(e) for e in get_enabled_plugins()] accounted_entries: set[str] = set() for output in outputs: - output_name = output.get('name') or '' + output_name = output.get("name") or "" if output_name: accounted_entries.add(output_name) - output_path = output.get('path') or '' + output_path = output.get("path") or "" if not output_path: continue parts = Path(output_path).parts if parts: accounted_entries.add(parts[0]) - ignore_names = { - '.DS_Store', - 'index.html', - 'index.json', - 'index.jsonl', - 'favicon.ico', - } - ignored_suffixes = {'.log', '.pid', '.sh'} - max_loose_scan = 300 - - def has_meaningful_files(dir_path: Path) -> bool: - scanned = 0 - for file_path in dir_path.rglob('*'): - scanned += 1 - if scanned > max_loose_scan: - return True - if file_path.is_dir() or file_path.name.startswith('.'): - continue - if file_path.suffix.lower() in ignored_suffixes: - continue - try: - if file_path.stat().st_size == 0: - continue - except OSError: - continue - return True - return False - - unaccounted_entries = [] - if snap_dir.exists(): - for entry in snap_dir.iterdir(): - name = entry.name - if name.startswith('.') or name in ignore_names or name in accounted_entries: - continue - is_dir = entry.is_dir() - is_meaningful = False - size = None - if is_dir: - is_meaningful = has_meaningful_files(entry) - elif entry.is_file(): - if entry.suffix.lower() not in ignored_suffixes: - try: - size = entry.stat().st_size - is_meaningful = size > 0 - except OSError: - size = None - is_meaningful = False - - unaccounted_entries.append({ - 'name': name, - 'path': name, - 'is_dir': is_dir, - 'size': size, - 'is_meaningful': is_meaningful, - }) - - unaccounted_entries.sort(key=lambda item: item['name'].lower()) - loose_items = [item for item in unaccounted_entries if item['is_meaningful']] - failed_exclude_suffixes = {'.json', '.jsonl', '.sh', '.log'} - failed_items = [ - item for item in unaccounted_entries - if not item['is_meaningful'] - and not ( - not item['is_dir'] - and Path(item['name']).suffix.lower() in failed_exclude_suffixes - ) - ] + loose_items, failed_items = snapshot.get_detail_page_auxiliary_items(outputs, hidden_card_plugins=hidden_card_plugins) preview_priority = [ - 'singlefile', - 'screenshot', - 'wget', - 'dom', - 'pdf', - 'readability', + "singlefile", + "screenshot", + "wget", + "dom", + "pdf", + "readability", ] preferred_types = tuple(preview_priority + [p for p in all_plugins if p not in preview_priority]) all_types = preferred_types + tuple(result_type for result_type in archiveresults.keys() if result_type not in preferred_types) - best_result = {'path': 'about:blank', 'result': None} + best_result = {"path": "about:blank", "result": None} for result_type in preferred_types: if result_type in archiveresults: best_result = archiveresults[result_type] break - snapshot_info = snapshot.to_dict(extended=True) related_snapshots_qs = SnapshotView.find_snapshots_for_url(snapshot.url) - related_snapshots = list( - related_snapshots_qs.exclude(id=snapshot.id).order_by('-bookmarked_at', '-created_at', '-timestamp')[:25] - ) + related_snapshots = list(related_snapshots_qs.exclude(id=snapshot.id).order_by("-bookmarked_at", "-created_at", "-timestamp")[:25]) related_years_map: dict[int, list[Snapshot]] = {} for snap in [snapshot, *related_snapshots]: snap_dt = snap.bookmarked_at or snap.created_at or snap.downloaded_at @@ -240,58 +207,61 @@ class SnapshotView(View): for year, snaps in related_years_map.items(): snaps_sorted = sorted( snaps, - key=lambda s: (s.bookmarked_at or s.created_at or s.downloaded_at or timezone.now()), + key=lambda s: s.bookmarked_at or s.created_at or s.downloaded_at or timezone.now(), reverse=True, ) - related_years.append({ - 'year': year, - 'latest': snaps_sorted[0], - 'snapshots': snaps_sorted, - }) - related_years.sort(key=lambda item: item['year'], reverse=True) + related_years.append( + { + "year": year, + "latest": snaps_sorted[0], + "snapshots": snaps_sorted, + }, + ) + related_years.sort(key=lambda item: item["year"], reverse=True) - try: - warc_path = 'warc/' + list(Path(snap_dir).glob('warc/*.warc.*'))[0].name - except IndexError: - warc_path = 'warc/' + warc_path = next( + (rel_path for rel_path in hash_index if rel_path.startswith("warc/") and ".warc" in Path(rel_path).name), + "warc/", + ) ordered_outputs = sorted( archiveresults.values(), - key=lambda r: all_types.index(r['name']) if r['name'] in all_types else -r['size'], + key=lambda r: all_types.index(r["name"]) if r["name"] in all_types else -r["size"], ) - non_compact_outputs = [ - out for out in ordered_outputs - if not out.get('is_compact') and not out.get('is_metadata') - ] - compact_outputs = [ - out for out in ordered_outputs - if out.get('is_compact') or out.get('is_metadata') - ] + non_compact_outputs = [out for out in ordered_outputs if not out.get("is_compact") and not out.get("is_metadata")] + compact_outputs = [out for out in ordered_outputs if out.get("is_compact") or out.get("is_metadata")] + tag_widget = TagEditorWidget() + output_size = sum(int(out.get("size") or 0) for out in ordered_outputs) + is_archived = bool(ordered_outputs or snapshot.downloaded_at or snapshot.status == Snapshot.StatusChoices.SEALED) context = { - **snapshot_info, - 'title': htmlencode( - snapshot.title - or (snapshot.base_url if snapshot.is_archived else TITLE_LOADING_MSG) - ), - 'extension': snapshot.extension or 'html', - 'tags': snapshot.tags_str() or 'untagged', - 'size': printable_filesize(snapshot.archive_size) if snapshot.archive_size else 'pending', - 'status': 'archived' if snapshot.is_archived else 'not yet archived', - 'status_color': 'success' if snapshot.is_archived else 'danger', - 'oldest_archive_date': ts_to_date_str(snapshot.oldest_archive_date), - 'warc_path': warc_path, - 'PREVIEW_ORIGINALS': SERVER_CONFIG.PREVIEW_ORIGINALS, - 'archiveresults': [*non_compact_outputs, *compact_outputs], - 'best_result': best_result, - 'snapshot': snapshot, # Pass the snapshot object for template tags - 'related_snapshots': related_snapshots, - 'related_years': related_years, - 'loose_items': loose_items, - 'failed_items': failed_items, + "id": str(snapshot.id), + "snapshot_id": str(snapshot.id), + "url": snapshot.url, + "archive_path": snapshot.archive_path_from_db, + "title": htmlencode(snapshot.resolved_title or (snapshot.base_url if is_archived else TITLE_LOADING_MSG)), + "extension": snapshot.extension or "html", + "tags": snapshot.tags_str() or "untagged", + "size": printable_filesize(output_size) if output_size else "pending", + "status": "archived" if is_archived else "not yet archived", + "status_color": "success" if is_archived else "danger", + "bookmarked_date": snapshot.bookmarked_date, + "downloaded_datestr": snapshot.downloaded_datestr, + "num_outputs": snapshot.num_outputs, + "num_failures": snapshot.num_failures, + "oldest_archive_date": ts_to_date_str(snapshot.oldest_archive_date), + "warc_path": warc_path, + "PREVIEW_ORIGINALS": SERVER_CONFIG.PREVIEW_ORIGINALS, + "archiveresults": [*non_compact_outputs, *compact_outputs], + "best_result": best_result, + "snapshot": snapshot, # Pass the snapshot object for template tags + "related_snapshots": related_snapshots, + "related_years": related_years, + "loose_items": loose_items, + "failed_items": failed_items, + "title_tags": [{"name": tag.name, "style": tag_widget._tag_style(tag.name)} for tag in snapshot.tags.all().order_by("name")], } - return render(template_name='core/snapshot_live.html', request=request, context=context) - + return render(template_name="core/snapshot.html", request=request, context=context) def get(self, request, path): if not request.user.is_authenticated and not SERVER_CONFIG.PUBLIC_SNAPSHOTS: @@ -300,42 +270,44 @@ class SnapshotView(View): snapshot = None try: - slug, archivefile = path.split('/', 1) + slug, archivefile = path.split("/", 1) except (IndexError, ValueError): - slug, archivefile = path.split('/', 1)[0], 'index.html' - + slug, archivefile = path.split("/", 1)[0], "index.html" # slug is a timestamp - if slug.replace('.','').isdigit(): - + if slug.replace(".", "").isdigit(): # missing trailing slash -> redirect to index - if '/' not in path: - return redirect(f'{path}/index.html') + if "/" not in path: + return redirect(f"{path}/index.html") try: try: snapshot = Snapshot.objects.get(Q(timestamp=slug) | Q(id__startswith=slug)) canonical_base = snapshot.url_path if canonical_base != snapshot.legacy_archive_path: - target_path = f'/{canonical_base}/{archivefile or "index.html"}' - query = request.META.get('QUERY_STRING') + target_path = f"/{canonical_base}/{archivefile or 'index.html'}" + query = request.META.get("QUERY_STRING") if query: - target_path = f'{target_path}?{query}' + target_path = f"{target_path}?{query}" return redirect(target_path) - if request.GET.get('files'): + if request.GET.get("files"): target_path = _files_index_target(snapshot, archivefile) response = serve_static_with_byterange_support( - request, target_path, document_root=snapshot.output_dir, show_indexes=True, is_archive_replay=True, + request, + target_path, + document_root=snapshot.output_dir, + show_indexes=True, + is_archive_replay=True, ) - elif archivefile == 'index.html': + elif archivefile == "index.html": # if they requested snapshot index, serve live rendered template instead of static html response = self.render_live_index(request, snapshot) else: target = build_snapshot_url(str(snapshot.id), archivefile, request=request) - query = request.META.get('QUERY_STRING') + query = request.META.get("QUERY_STRING") if query: - target = f'{target}?{query}' + target = f"{target}?{query}" return redirect(target) response["Link"] = f'<{snapshot.url}>; rel="canonical"' return response @@ -349,10 +321,10 @@ class SnapshotView(View): return HttpResponse( format_html( ( - '



' - 'No Snapshot directories match the given timestamp/ID: {}

' + "



" + "No Snapshot directories match the given timestamp/ID: {}

" 'You can
add a new Snapshot, or return to the Main Index' - '
' + "
" ), slug, path, @@ -361,72 +333,71 @@ class SnapshotView(View): status=404, ) except Snapshot.MultipleObjectsReturned: - snapshot_hrefs = mark_safe('
').join( + snapshot_hrefs = mark_safe("
").join( format_html( '{} {} {} {}', - snap.bookmarked_at.strftime('%Y-%m-%d %H:%M:%S'), + snap.bookmarked_at.strftime("%Y-%m-%d %H:%M:%S"), snap.archive_path, snap.timestamp, snap.url, - snap.title_stripped[:64] or '', + snap.title_stripped[:64] or "", ) - for snap in Snapshot.objects.filter(timestamp__startswith=slug).only('url', 'timestamp', 'title', 'bookmarked_at').order_by('-bookmarked_at') + for snap in Snapshot.objects.filter(timestamp__startswith=slug) + .only("url", "timestamp", "title", "bookmarked_at") + .order_by("-bookmarked_at") ) return HttpResponse( format_html( - ( - 'Multiple Snapshots match the given timestamp/ID {}
'
-                        ),
+                        ("Multiple Snapshots match the given timestamp/ID {}
"),
                         slug,
-                    ) + snapshot_hrefs + format_html(
-                        (
-                            '

' - 'Choose a Snapshot to proceed or go back to the Main Index' - ) - ), + ) + + snapshot_hrefs + + format_html('

Choose a Snapshot to proceed or go back to the Main Index'), content_type="text/html", status=404, ) except Http404: - assert snapshot # (Snapshot.DoesNotExist is already handled above) + assert snapshot # (Snapshot.DoesNotExist is already handled above) # Snapshot dir exists but file within does not e.g. 124235.324234/screenshot.png return HttpResponse( format_html( ( - '' - 'Snapshot Not Found' + "" + "Snapshot Not Found" #'' - '' - '



' + "" + "



" f'Snapshot [{snapshot.timestamp}]: {snapshot.url}
' - f'was queued on {str(snapshot.bookmarked_at).split(".")[0]}, ' + f"was queued on {str(snapshot.bookmarked_at).split('.')[0]}, " f'but no files have been saved yet in:
{snapshot.timestamp}/' - '{}' - f'

' - 'It\'s possible {} ' - f'during the last capture on {str(snapshot.bookmarked_at).split(".")[0]},
or that the archiving process has not completed yet.
' - f'
# run this cmd to finish/retry archiving this Snapshot
' + "{}" + f"

" + "It's possible {} " + f"during the last capture on {str(snapshot.bookmarked_at).split('.')[0]},
or that the archiving process has not completed yet.
" + f"
# run this cmd to finish/retry archiving this Snapshot
" f'archivebox update -t timestamp {snapshot.timestamp}


' '
' - 'Next steps:
' + "Next steps:
" f'- list all the Snapshot files .*
' f'- view the Snapshot ./index.html
' f'- go to the Snapshot admin to edit
' f'- go to the Snapshot actions to re-archive
' '- or return to the main index...
' - '
' - '' + "
" + "" ), - archivefile if str(archivefile) != 'None' else '', - f'the {archivefile} resource could not be fetched' if str(archivefile) != 'None' else 'the original site was not available', + archivefile if str(archivefile) != "None" else "", + f"the {archivefile} resource could not be fetched" + if str(archivefile) != "None" + else "the original site was not available", ), content_type="text/html", status=404, ) - + # slug is a URL try: try: @@ -437,14 +408,14 @@ class SnapshotView(View): return HttpResponse( format_html( ( - '



' - 'No Snapshots match the given url: {}


' + "



" + "No Snapshots match the given url: {}


" 'Return to the Main Index, or:

' '+ Add a new Snapshot for {}

' - '
' + "
" ), base_url(path), - path if '://' in path else f'https://{path}', + path if "://" in path else f"https://{path}", path, ), content_type="text/html", @@ -452,56 +423,60 @@ class SnapshotView(View): ) except Snapshot.MultipleObjectsReturned: snapshots = SnapshotView.find_snapshots_for_url(path) - snapshot_hrefs = mark_safe('
').join( + snapshot_hrefs = mark_safe("
").join( format_html( '{} {} {} {} {}', - snap.bookmarked_at.strftime('%Y-%m-%d %H:%M:%S'), + snap.bookmarked_at.strftime("%Y-%m-%d %H:%M:%S"), str(snap.id)[:8], snap.archive_path, snap.timestamp, snap.url, - snap.title_stripped[:64] or '', + snap.title_stripped[:64] or "", ) - for snap in snapshots.only('url', 'timestamp', 'title', 'bookmarked_at').order_by('-bookmarked_at') + for snap in snapshots.only("url", "timestamp", "title", "bookmarked_at").order_by("-bookmarked_at") ) return HttpResponse( format_html( - ( - 'Multiple Snapshots match the given URL {}
'
-                    ),
+                    ("Multiple Snapshots match the given URL {}
"),
                     base_url(path),
-                ) + snapshot_hrefs + format_html(
-                    (
-                        '

' - 'Choose a Snapshot to proceed or go back to the Main Index' - ) - ), + ) + + snapshot_hrefs + + format_html('

Choose a Snapshot to proceed or go back to the Main Index'), content_type="text/html", status=404, ) - target_path = f'/{snapshot.archive_path}/index.html' - query = request.META.get('QUERY_STRING') + target_path = f"/{snapshot.archive_path}/index.html" + query = request.META.get("QUERY_STRING") if query: - target_path = f'{target_path}?{query}' + target_path = f"{target_path}?{query}" return redirect(target_path) class SnapshotPathView(View): """Serve snapshots by the new URL scheme: /////...""" - def get(self, request, username: str, date: str | None = None, domain: str | None = None, snapshot_id: str | None = None, path: str = "", url: str | None = None): + def get( + self, + request, + username: str, + date: str | None = None, + domain: str | None = None, + snapshot_id: str | None = None, + path: str = "", + url: str | None = None, + ): if not request.user.is_authenticated and not SERVER_CONFIG.PUBLIC_SNAPSHOTS: return _admin_login_redirect_or_forbidden(request) - if username == 'system': - return redirect(request.path.replace('/system/', '/web/', 1)) + if username == "system": + return redirect(request.path.replace("/system/", "/web/", 1)) if date and domain and domain == date: raise Http404 requested_url = url - if not requested_url and domain and domain.startswith(('http://', 'https://')): + if not requested_url and domain and domain.startswith(("http://", "https://")): requested_url = domain snapshot = None @@ -517,7 +492,7 @@ class SnapshotPathView(View): snapshot = Snapshot.objects.filter(id__startswith=snapshot_id).first() else: # fuzzy lookup by date + domain/url (most recent) - username_lookup = 'system' if username == 'web' else username + username_lookup = "system" if username == "web" else username if requested_url: qs = SnapshotView.find_snapshots_for_url(requested_url).filter(crawl__created_by__username=username_lookup) else: @@ -539,26 +514,28 @@ class SnapshotPathView(View): pass if requested_url: - snapshot = qs.order_by('-created_at', '-bookmarked_at', '-timestamp').first() + snapshot = qs.order_by("-created_at", "-bookmarked_at", "-timestamp").first() else: - requested_domain = domain or '' - if requested_domain.startswith(('http://', 'https://')): + requested_domain = domain or "" + if requested_domain.startswith(("http://", "https://")): requested_domain = Snapshot.extract_domain_from_url(requested_domain) else: - requested_domain = Snapshot.extract_domain_from_url(f'https://{requested_domain}') + requested_domain = Snapshot.extract_domain_from_url(f"https://{requested_domain}") # Prefer exact domain matches - matches = [s for s in qs.order_by('-created_at', '-bookmarked_at') if Snapshot.extract_domain_from_url(s.url) == requested_domain] - snapshot = matches[0] if matches else qs.order_by('-created_at', '-bookmarked_at', '-timestamp').first() + matches = [ + s for s in qs.order_by("-created_at", "-bookmarked_at") if Snapshot.extract_domain_from_url(s.url) == requested_domain + ] + snapshot = matches[0] if matches else qs.order_by("-created_at", "-bookmarked_at", "-timestamp").first() if not snapshot: return HttpResponse( format_html( ( - '



' - 'No Snapshots match the given id or url: {}


' + "



" + "No Snapshots match the given id or url: {}


" 'Return to the Main Index' - '
' + "
" ), snapshot_id or requested_url or domain, ), @@ -568,37 +545,45 @@ class SnapshotPathView(View): canonical_base = snapshot.url_path if date: - requested_base = f'{username}/{date}/{domain or url or ""}' + requested_base = f"{username}/{date}/{domain or url or ''}" else: - requested_base = f'{username}/{domain or url or ""}' + requested_base = f"{username}/{domain or url or ''}" if snapshot_id: - requested_base = f'{requested_base}/{snapshot_id}' + requested_base = f"{requested_base}/{snapshot_id}" if canonical_base != requested_base: - target = f'/{canonical_base}/{path or "index.html"}' - query = request.META.get('QUERY_STRING') + target = f"/{canonical_base}/{path or 'index.html'}" + query = request.META.get("QUERY_STRING") if query: - target = f'{target}?{query}' + target = f"{target}?{query}" return redirect(target) archivefile = path or "index.html" - if archivefile != "index.html" and not request.GET.get('files'): + if archivefile != "index.html" and not request.GET.get("files"): target = build_snapshot_url(str(snapshot.id), archivefile, request=request) - query = request.META.get('QUERY_STRING') + query = request.META.get("QUERY_STRING") if query: - target = f'{target}?{query}' + target = f"{target}?{query}" return redirect(target) - if request.GET.get('files'): + if request.GET.get("files"): target_path = _files_index_target(snapshot, archivefile) return serve_static_with_byterange_support( - request, target_path, document_root=snapshot.output_dir, show_indexes=True, is_archive_replay=True, + request, + target_path, + document_root=snapshot.output_dir, + show_indexes=True, + is_archive_replay=True, ) if archivefile == "index.html": return SnapshotView.render_live_index(request, snapshot) return serve_static_with_byterange_support( - request, archivefile, document_root=snapshot.output_dir, show_indexes=True, is_archive_replay=True, + request, + archivefile, + document_root=snapshot.output_dir, + show_indexes=True, + is_archive_replay=True, ) @@ -612,6 +597,37 @@ def _safe_archive_relpath(path: str) -> str | None: return cleaned +def _coerce_sort_timestamp(value: str | float | None) -> float: + if value is None: + return 0.0 + try: + return float(value) + except (TypeError, ValueError): + return 0.0 + + +def _snapshot_sort_key(match_path: str, cache: dict[str, float]) -> tuple[float, str]: + parts = Path(match_path).parts + date_str = "" + snapshot_id = "" + try: + idx = parts.index("snapshots") + date_str = parts[idx + 1] + snapshot_id = parts[idx + 3] + except Exception: + return (_coerce_sort_timestamp(date_str), match_path) + + if snapshot_id not in cache: + snapshot = Snapshot.objects.filter(id=snapshot_id).only("bookmarked_at", "created_at", "downloaded_at", "timestamp").first() + if snapshot: + snap_dt = snapshot.bookmarked_at or snapshot.created_at or snapshot.downloaded_at + cache[snapshot_id] = snap_dt.timestamp() if snap_dt else _coerce_sort_timestamp(snapshot.timestamp) + else: + cache[snapshot_id] = _coerce_sort_timestamp(date_str) + + return (cache[snapshot_id], match_path) + + def _latest_response_match(domain: str, rel_path: str) -> tuple[Path, Path] | None: if not domain or not rel_path: return None @@ -625,17 +641,8 @@ def _latest_response_match(domain: str, rel_path: str) -> tuple[Path, Path] | No if not matches: return None - def sort_key(match_path: str) -> tuple[str, str]: - parts = Path(match_path).parts - date_str = "" - try: - idx = parts.index("snapshots") - date_str = parts[idx + 1] - except Exception: - date_str = "" - return (date_str, match_path) - - best = max(matches, key=sort_key) + sort_cache: dict[str, float] = {} + best = max(matches, key=lambda match_path: _snapshot_sort_key(match_path, sort_cache)) best_path = Path(best) parts = best_path.parts try: @@ -658,20 +665,36 @@ def _latest_responses_root(domain: str) -> Path | None: if not matches: return None - def sort_key(match_path: str) -> tuple[str, str]: - parts = Path(match_path).parts - date_str = "" - try: - idx = parts.index("snapshots") - date_str = parts[idx + 1] - except Exception: - date_str = "" - return (date_str, match_path) - - best = max(matches, key=sort_key) + sort_cache: dict[str, float] = {} + best = max(matches, key=lambda match_path: _snapshot_sort_key(match_path, sort_cache)) return Path(best) +def _latest_snapshot_for_domain(domain: str) -> Snapshot | None: + if not domain: + return None + + requested_domain = domain.split(":", 1)[0].lower() + snapshots = SnapshotView.find_snapshots_for_url(f"https://{requested_domain}").order_by("-created_at", "-bookmarked_at", "-timestamp") + for snapshot in snapshots: + if Snapshot.extract_domain_from_url(snapshot.url).lower() == requested_domain: + return snapshot + return None + + +def _original_request_url(domain: str, path: str = "", query_string: str = "") -> str: + normalized_domain = (domain or "").split(":", 1)[0].lower() + normalized_path = (path or "").lstrip("/") + if normalized_path in ("", "index.html"): + normalized_path = "" + target = f"https://{normalized_domain}" + if normalized_path: + target = f"{target}/{normalized_path}" + if query_string: + target = f"{target}?{query_string}" + return target + + def _serve_responses_path(request, responses_root: Path, rel_path: str, show_indexes: bool): candidates: list[str] = [] rel_path = rel_path or "" @@ -710,7 +733,8 @@ def _serve_responses_path(request, responses_root: Path, rel_path: str, show_ind def _serve_snapshot_replay(request: HttpRequest, snapshot: Snapshot, path: str = ""): rel_path = path or "" - show_indexes = bool(request.GET.get("files")) + is_directory_request = bool(path) and path.endswith("/") + show_indexes = bool(request.GET.get("files")) or (SERVER_CONFIG.USES_SUBDOMAIN_ROUTING and is_directory_request) if not show_indexes and (not rel_path or rel_path == "index.html"): return SnapshotView.render_live_index(request, snapshot) @@ -745,6 +769,7 @@ def _serve_snapshot_replay(request: HttpRequest, snapshot: Snapshot, path: str = def _serve_original_domain_replay(request: HttpRequest, domain: str, path: str = ""): + requested_root_index = path in ("", "index.html") or path.endswith("/") rel_path = path or "" if not rel_path or rel_path.endswith("/"): rel_path = f"{rel_path}index.html" @@ -774,31 +799,39 @@ def _serve_original_domain_replay(request: HttpRequest, domain: str, path: str = if response is not None: return response + if requested_root_index and not show_indexes: + snapshot = _latest_snapshot_for_domain(domain) + if snapshot: + return SnapshotView.render_live_index(request, snapshot) + + if SERVER_CONFIG.PUBLIC_ADD_VIEW or request.user.is_authenticated: + target_url = _original_request_url(domain, path, request.META.get("QUERY_STRING", "")) + return redirect(build_web_url(f"/web/{quote(target_url, safe=':/')}")) + raise Http404 class SnapshotHostView(View): - """Serve snapshot directory contents on ./.""" + """Serve snapshot directory contents on ./.""" def get(self, request, snapshot_id: str, path: str = ""): if not request.user.is_authenticated and not SERVER_CONFIG.PUBLIC_SNAPSHOTS: return _admin_login_redirect_or_forbidden(request) - snapshot = None - if snapshot_id: - try: - snapshot = Snapshot.objects.get(pk=snapshot_id) - except Snapshot.DoesNotExist: - try: - snapshot = Snapshot.objects.get(id__startswith=snapshot_id) - except Snapshot.DoesNotExist: - snapshot = None - except Snapshot.MultipleObjectsReturned: - snapshot = Snapshot.objects.filter(id__startswith=snapshot_id).first() + snapshot = _find_snapshot_by_ref(snapshot_id) if not snapshot: raise Http404 + + canonical_host = get_snapshot_host(str(snapshot.id)) + if not host_matches(request.get_host(), canonical_host): + target = build_snapshot_url(str(snapshot.id), path, request=request) + if request.META.get("QUERY_STRING"): + target = f"{target}?{request.META['QUERY_STRING']}" + return redirect(target) + return _serve_snapshot_replay(request, snapshot, path) + class SnapshotReplayView(View): """Serve snapshot directory contents on a one-domain replay path.""" @@ -806,17 +839,8 @@ class SnapshotReplayView(View): if not request.user.is_authenticated and not SERVER_CONFIG.PUBLIC_SNAPSHOTS: return _admin_login_redirect_or_forbidden(request) - try: - snapshot = Snapshot.objects.get(pk=snapshot_id) - except Snapshot.DoesNotExist: - try: - snapshot = Snapshot.objects.get(id__startswith=snapshot_id) - except Snapshot.DoesNotExist: - raise Http404 - except Snapshot.MultipleObjectsReturned: - snapshot = Snapshot.objects.filter(id__startswith=snapshot_id).first() - - if snapshot is None: + snapshot = _find_snapshot_by_ref(snapshot_id) + if not snapshot: raise Http404 return _serve_snapshot_replay(request, snapshot, path) @@ -841,48 +865,67 @@ class OriginalDomainReplayView(View): class PublicIndexView(ListView): - template_name = 'public_index.html' + template_name = "public_index.html" model = Snapshot paginate_by = SERVER_CONFIG.SNAPSHOTS_PER_PAGE - ordering = ['-bookmarked_at', '-created_at'] + ordering = ["-bookmarked_at", "-created_at"] def get_context_data(self, **kwargs): return { **super().get_context_data(**kwargs), - 'VERSION': VERSION, - 'COMMIT_HASH': SHELL_CONFIG.COMMIT_HASH, - 'FOOTER_INFO': SERVER_CONFIG.FOOTER_INFO, + "VERSION": VERSION, + "COMMIT_HASH": SHELL_CONFIG.COMMIT_HASH, + "FOOTER_INFO": SERVER_CONFIG.FOOTER_INFO, + "search_mode": get_search_mode(self.request.GET.get("search_mode")), } def get_queryset(self, **kwargs): qs = super().get_queryset(**kwargs) - query = self.request.GET.get('q', default = '').strip() + query = self.request.GET.get("q", default="").strip() if not query: return qs.distinct() - query_type = self.request.GET.get('query_type') + query_type = self.request.GET.get("query_type") + search_mode = get_search_mode(self.request.GET.get("search_mode")) - if not query_type or query_type == 'all': - qs = qs.filter(Q(title__icontains=query) | Q(url__icontains=query) | Q(timestamp__icontains=query) | Q(tags__name__icontains=query)) - try: - qs = qs | query_search_index(query) - except Exception as err: - print(f'[!] Error while using search backend: {err.__class__.__name__} {err}') - elif query_type == 'fulltext': - try: - qs = qs | query_search_index(query) - except Exception as err: - print(f'[!] Error while using search backend: {err.__class__.__name__} {err}') - elif query_type == 'meta': - qs = qs.filter(Q(title__icontains=query) | Q(url__icontains=query) | Q(timestamp__icontains=query) | Q(tags__name__icontains=query)) - elif query_type == 'url': + if not query_type or query_type == "all": + metadata_qs = qs.filter( + Q(title__icontains=query) | Q(url__icontains=query) | Q(timestamp__icontains=query) | Q(tags__name__icontains=query), + ) + if search_mode == "meta": + qs = metadata_qs + else: + try: + qs = prioritize_metadata_matches( + qs, + metadata_qs, + query_search_index(query, search_mode=search_mode), + ordering=self.ordering, + ) + except Exception as err: + print(f"[!] Error while using search backend: {err.__class__.__name__} {err}") + qs = metadata_qs + elif query_type == "fulltext": + if search_mode == "meta": + qs = qs.none() + else: + try: + qs = query_search_index(query, search_mode=search_mode).filter(pk__in=qs.values("pk")) + except Exception as err: + print(f"[!] Error while using search backend: {err.__class__.__name__} {err}") + qs = qs.none() + elif query_type == "meta": + qs = qs.filter( + Q(title__icontains=query) | Q(url__icontains=query) | Q(timestamp__icontains=query) | Q(tags__name__icontains=query), + ) + elif query_type == "url": qs = qs.filter(Q(url__icontains=query)) - elif query_type == 'title': + elif query_type == "title": qs = qs.filter(Q(title__icontains=query)) - elif query_type == 'timestamp': + elif query_type == "timestamp": qs = qs.filter(Q(timestamp__icontains=query)) - elif query_type == 'tags': + elif query_type == "tags": qs = qs.filter(Q(tags__name__icontains=query)) else: print(f'[!] Unknown value for query_type: "{query_type}"') @@ -890,23 +933,26 @@ class PublicIndexView(ListView): return qs.distinct() def get(self, *args, **kwargs): - if SERVER_CONFIG.PUBLIC_INDEX or self.request.user.is_authenticated: + if self.request.user.is_authenticated: + return redirect("/admin/core/snapshot/") + if SERVER_CONFIG.PUBLIC_INDEX: response = super().get(*args, **kwargs) return response else: return _admin_login_redirect_or_forbidden(self.request) -@method_decorator(csrf_exempt, name='dispatch') + +@method_decorator(csrf_exempt, name="dispatch") class AddView(UserPassesTestMixin, FormView): template_name = "add.html" form_class = AddLinkForm def get_initial(self): """Prefill the AddLinkForm with the 'url' GET parameter""" - if self.request.method == 'GET': - url = self.request.GET.get('url', None) + if self.request.method == "GET": + url = self.request.GET.get("url", None) if url: - return {'url': url if '://' in url else f'https://{url}'} + return {"url": url if "://" in url else f"https://{url}"} return super().get_initial() @@ -915,7 +961,7 @@ class AddView(UserPassesTestMixin, FormView): def _can_override_crawl_config(self) -> bool: user = self.request.user - return bool(user.is_authenticated and (getattr(user, 'is_superuser', False) or getattr(user, 'is_staff', False))) + return bool(user.is_authenticated and (getattr(user, "is_superuser", False) or getattr(user, "is_staff", False))) def _get_custom_config_overrides(self, form: AddLinkForm) -> dict: custom_config = form.cleaned_data.get("config") or {} @@ -929,37 +975,37 @@ class AddView(UserPassesTestMixin, FormView): return custom_config def get_context_data(self, **kwargs): - required_search_plugin = f'search_backend_{SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE}'.strip() + required_search_plugin = f"search_backend_{SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE}".strip() plugin_configs = discover_plugin_configs() plugin_dependency_map = { plugin_name: [ - str(required_plugin).strip() - for required_plugin in (schema.get('required_plugins') or []) - if str(required_plugin).strip() + str(required_plugin).strip() for required_plugin in (schema.get("required_plugins") or []) if str(required_plugin).strip() ] for plugin_name, schema in plugin_configs.items() - if isinstance(schema.get('required_plugins'), list) and schema.get('required_plugins') + if isinstance(schema.get("required_plugins"), list) and schema.get("required_plugins") } return { **super().get_context_data(**kwargs), - 'title': "Create Crawl", + "title": "Create Crawl", # We can't just call request.build_absolute_uri in the template, because it would include query parameters - 'absolute_add_path': self.request.build_absolute_uri(self.request.path), - 'VERSION': VERSION, - 'FOOTER_INFO': SERVER_CONFIG.FOOTER_INFO, - 'required_search_plugin': required_search_plugin, - 'plugin_dependency_map_json': json.dumps(plugin_dependency_map, sort_keys=True), - 'stdout': '', + "absolute_add_path": self.request.build_absolute_uri(self.request.path), + "VERSION": VERSION, + "FOOTER_INFO": SERVER_CONFIG.FOOTER_INFO, + "required_search_plugin": required_search_plugin, + "plugin_dependency_map_json": json.dumps(plugin_dependency_map, sort_keys=True), + "stdout": "", } def _create_crawl_from_form(self, form, *, created_by_id=None) -> Crawl: urls = form.cleaned_data["url"] - print(f'[+] Adding URL: {urls}') + print(f"[+] Adding URL: {urls}") # Extract all form fields tag = form.cleaned_data["tag"] depth = int(form.cleaned_data["depth"]) - plugins = ','.join(form.cleaned_data.get("plugins", [])) + max_urls = int(form.cleaned_data.get("max_urls") or 0) + max_size = int(form.cleaned_data.get("max_size") or 0) + plugins = ",".join(form.cleaned_data.get("plugins", [])) schedule = form.cleaned_data.get("schedule", "").strip() persona = form.cleaned_data.get("persona") index_only = form.cleaned_data.get("index_only", False) @@ -974,46 +1020,50 @@ class AddView(UserPassesTestMixin, FormView): created_by_id = self.request.user.pk else: from archivebox.base_models.models import get_or_create_system_user_pk + created_by_id = get_or_create_system_user_pk() - created_by_name = getattr(self.request.user, 'username', 'web') if self.request.user.is_authenticated else 'web' + created_by_name = getattr(self.request.user, "username", "web") if self.request.user.is_authenticated else "web" # 1. save the provided urls to sources/2024-11-05__23-59-59__web_ui_add_by_user_.txt - sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__web_ui_add_by_user_{created_by_id}.txt' + sources_file = CONSTANTS.SOURCES_DIR / f"{timezone.now().strftime('%Y-%m-%d__%H-%M-%S')}__web_ui_add_by_user_{created_by_id}.txt" sources_file.parent.mkdir(parents=True, exist_ok=True) - sources_file.write_text(urls if isinstance(urls, str) else '\n'.join(urls)) + sources_file.write_text(urls if isinstance(urls, str) else "\n".join(urls)) # 2. create a new Crawl with the URLs from the file timestamp = timezone.now().strftime("%Y-%m-%d__%H-%M-%S") urls_content = sources_file.read_text() # Build complete config config = { - 'INDEX_ONLY': index_only, - 'DEPTH': depth, - 'PLUGINS': plugins or '', - 'DEFAULT_PERSONA': (persona.name if persona else 'Default'), + "INDEX_ONLY": index_only, + "DEPTH": depth, + "PLUGINS": plugins or "", + "DEFAULT_PERSONA": (persona.name if persona else "Default"), } # Merge custom config overrides config.update(custom_config) - if url_filters.get('allowlist'): - config['URL_ALLOWLIST'] = url_filters['allowlist'] - if url_filters.get('denylist'): - config['URL_DENYLIST'] = url_filters['denylist'] + if url_filters.get("allowlist"): + config["URL_ALLOWLIST"] = url_filters["allowlist"] + if url_filters.get("denylist"): + config["URL_DENYLIST"] = url_filters["denylist"] crawl = Crawl.objects.create( urls=urls_content, max_depth=depth, + max_urls=max_urls, + max_size=max_size, tags_str=tag, notes=notes, - label=f'{created_by_name}@{HOSTNAME}{self.request.path} {timestamp}', + label=f"{created_by_name}@{HOSTNAME}{self.request.path} {timestamp}", created_by_id=created_by_id, - config=config + config=config, ) # 3. create a CrawlSchedule if schedule is provided if schedule: from archivebox.crawls.models import CrawlSchedule + crawl_schedule = CrawlSchedule.objects.create( template=crawl, schedule=schedule, @@ -1023,9 +1073,12 @@ class AddView(UserPassesTestMixin, FormView): created_by_id=created_by_id, ) crawl.schedule = crawl_schedule - crawl.save(update_fields=['schedule']) + crawl.save(update_fields=["schedule"]) crawl.create_snapshots_from_urls() + from archivebox.services.runner import ensure_background_runner + + ensure_background_runner() # 4. start the Orchestrator & wait until it completes # ... orchestrator will create the root Snapshot, which creates pending ArchiveResults, which gets run by the ArchiveResultActors ... @@ -1048,7 +1101,9 @@ class AddView(UserPassesTestMixin, FormView): messages.success( self.request, - mark_safe(f"Created crawl with {rough_url_count} starting URL(s){schedule_msg}. Snapshots will be created and archived in the background. View Crawl →"), + mark_safe( + f"Created crawl with {rough_url_count} starting URL(s){schedule_msg}. Snapshots will be created and archived in the background. View Crawl →", + ), ) # Orchestrator (managed by supervisord) will pick up the queued crawl @@ -1057,32 +1112,36 @@ class AddView(UserPassesTestMixin, FormView): class WebAddView(AddView): def _latest_snapshot_for_url(self, requested_url: str): - return SnapshotView.find_snapshots_for_url(requested_url).order_by( - '-created_at', '-bookmarked_at', '-timestamp' - ).first() + return SnapshotView.find_snapshots_for_url(requested_url).order_by("-created_at", "-bookmarked_at", "-timestamp").first() def _normalize_add_url(self, requested_url: str) -> str: - if requested_url.startswith(('http://', 'https://')): + if requested_url.startswith(("http://", "https://")): return requested_url - return f'https://{requested_url}' + return f"https://{requested_url}" def dispatch(self, request, *args, **kwargs): - requested_url = urldecode(kwargs.get('url', '') or '') + requested_url = urldecode(kwargs.get("url", "") or "") if requested_url: snapshot = self._latest_snapshot_for_url(requested_url) if snapshot: - return redirect(f'/{snapshot.url_path}') + return redirect(f"/{snapshot.url_path}") if not self.test_func(): + request_host = (request.get_host() or "").lower() + if host_matches(request_host, get_web_host()): + return redirect(build_admin_url(request.get_full_path(), request=request)) + if host_matches(request_host, get_admin_host()): + next_url = quote(request.get_full_path(), safe="/:?=&") + return redirect(f"{build_admin_url('/admin/login/', request=request)}?next={next_url}") return HttpResponse( format_html( ( - '



' - 'No Snapshots match the given url: {}


' + "



" + "No Snapshots match the given url: {}


" 'Return to the Main Index' - '
' + "
" ), - requested_url or '', + requested_url or "", ), content_type="text/html", status=404, @@ -1091,49 +1150,49 @@ class WebAddView(AddView): return super().dispatch(request, *args, **kwargs) def get(self, request: HttpRequest, *args: object, **kwargs: object): - requested_url = urldecode(str(kwargs.get('url') or (args[0] if args else ''))) + requested_url = urldecode(str(kwargs.get("url") or (args[0] if args else ""))) if not requested_url: raise Http404 snapshot = self._latest_snapshot_for_url(requested_url) if snapshot: - return redirect(f'/{snapshot.url_path}') + return redirect(f"/{snapshot.url_path}") add_url = self._normalize_add_url(requested_url) assert self.form_class is not None defaults_form = self.form_class() form_data = { - 'url': add_url, - 'depth': defaults_form.fields['depth'].initial or '0', - 'persona': defaults_form.fields['persona'].initial or 'Default', - 'config': {}, + "url": add_url, + "depth": defaults_form.fields["depth"].initial or "0", + "max_urls": defaults_form.fields["max_urls"].initial or 0, + "max_size": defaults_form.fields["max_size"].initial or "0", + "persona": defaults_form.fields["persona"].initial or "Default", + "config": {}, } - if defaults_form.fields['index_only'].initial: - form_data['index_only'] = 'on' + if defaults_form.fields["index_only"].initial: + form_data["index_only"] = "on" form = self.form_class(data=form_data) if not form.is_valid(): return self.form_invalid(form) crawl = self._create_crawl_from_form(form) - snapshot = Snapshot.from_json({'url': add_url, 'tags': form.cleaned_data.get('tag', '')}, overrides={'crawl': crawl}) + snapshot = Snapshot.from_json({"url": add_url, "tags": form.cleaned_data.get("tag", "")}, overrides={"crawl": crawl}) assert snapshot is not None - return redirect(f'/{snapshot.url_path}') + return redirect(f"/{snapshot.url_path}") class HealthCheckView(View): """ A Django view that renders plain text "OK" for service discovery tools """ + def get(self, request): """ Handle a GET request """ - return HttpResponse( - 'OK', - content_type='text/plain', - status=200 - ) + return HttpResponse("OK", content_type="text/plain", status=200) + def live_progress_view(request): """Simple JSON endpoint for live progress status - used by admin progress monitor.""" @@ -1142,6 +1201,25 @@ def live_progress_view(request): from archivebox.core.models import Snapshot, ArchiveResult from archivebox.machine.models import Process, Machine + def is_current_run_timestamp(event_ts, run_started_at) -> bool: + if run_started_at is None: + return True + if event_ts is None: + return False + return event_ts >= run_started_at + + def archiveresult_matches_current_run(ar, run_started_at) -> bool: + if run_started_at is None: + return True + if ar.status in ( + ArchiveResult.StatusChoices.QUEUED, + ArchiveResult.StatusChoices.STARTED, + ArchiveResult.StatusChoices.BACKOFF, + ): + return True + event_ts = ar.end_ts or ar.start_ts or ar.modified_at or ar.created_at + return is_current_run_timestamp(event_ts, run_started_at) + def hook_details(hook_name: str, plugin: str = "setup") -> tuple[str, str, str, str]: normalized_hook_name = Path(hook_name).name if hook_name else "" if not normalized_hook_name: @@ -1178,29 +1256,26 @@ def live_progress_view(request): return hook_details(Path(hook_path).name, plugin=Path(hook_path).parent.name or "setup") machine = Machine.current() - orchestrator_proc = Process.objects.filter( - machine=machine, - process_type=Process.TypeChoices.ORCHESTRATOR, - status=Process.StatusChoices.RUNNING, - ).order_by('-started_at').first() + Process.cleanup_stale_running(machine=machine) + Process.cleanup_orphaned_workers() + orchestrator_proc = ( + Process.objects.filter( + machine=machine, + process_type=Process.TypeChoices.ORCHESTRATOR, + status=Process.StatusChoices.RUNNING, + ) + .order_by("-started_at") + .first() + ) orchestrator_running = orchestrator_proc is not None orchestrator_pid = orchestrator_proc.pid if orchestrator_proc else None - total_workers = Process.objects.filter( - machine=machine, - status=Process.StatusChoices.RUNNING, - process_type__in=[ - Process.TypeChoices.WORKER, - Process.TypeChoices.HOOK, - Process.TypeChoices.BINARY, - ], - ).count() - # Get model counts by status crawls_pending = Crawl.objects.filter(status=Crawl.StatusChoices.QUEUED).count() crawls_started = Crawl.objects.filter(status=Crawl.StatusChoices.STARTED).count() # Get recent crawls (last 24 hours) from datetime import timedelta + one_day_ago = timezone.now() - timedelta(days=1) crawls_recent = Crawl.objects.filter(created_at__gte=one_day_ago).count() @@ -1214,28 +1289,34 @@ def live_progress_view(request): # Get recently completed ArchiveResults with thumbnails (last 20 succeeded results) recent_thumbnails = [] - recent_results = ArchiveResult.objects.filter( - status=ArchiveResult.StatusChoices.SUCCEEDED, - ).select_related('snapshot').order_by('-end_ts')[:20] + recent_results = ( + ArchiveResult.objects.filter( + status=ArchiveResult.StatusChoices.SUCCEEDED, + ) + .select_related("snapshot") + .order_by("-end_ts")[:20] + ) for ar in recent_results: embed = ar.embed_path() if embed: # Only include results with embeddable image/media files - ext = embed.lower().split('.')[-1] if '.' in embed else '' - is_embeddable = ext in ('png', 'jpg', 'jpeg', 'gif', 'webp', 'svg', 'ico', 'pdf', 'html') - if is_embeddable or ar.plugin in ('screenshot', 'favicon', 'dom'): - archive_path = embed or '' - recent_thumbnails.append({ - 'id': str(ar.id), - 'plugin': ar.plugin, - 'snapshot_id': str(ar.snapshot_id), - 'snapshot_url': ar.snapshot.url[:60] if ar.snapshot else '', - 'embed_path': embed, - 'archive_path': archive_path, - 'archive_url': build_snapshot_url(str(ar.snapshot_id), archive_path, request=request) if archive_path else '', - 'end_ts': ar.end_ts.isoformat() if ar.end_ts else None, - }) + ext = embed.lower().split(".")[-1] if "." in embed else "" + is_embeddable = ext in ("png", "jpg", "jpeg", "gif", "webp", "svg", "ico", "pdf", "html") + if is_embeddable or ar.plugin in ("screenshot", "favicon", "dom"): + archive_path = embed or "" + recent_thumbnails.append( + { + "id": str(ar.id), + "plugin": ar.plugin, + "snapshot_id": str(ar.snapshot_id), + "snapshot_url": ar.snapshot.url[:60] if ar.snapshot else "", + "embed_path": embed, + "archive_path": archive_path, + "archive_url": build_snapshot_url(str(ar.snapshot_id), archive_path, request=request) if archive_path else "", + "end_ts": ar.end_ts.isoformat() if ar.end_ts else None, + }, + ) # Build hierarchical active crawls with nested snapshots and archive results @@ -1257,16 +1338,16 @@ def live_progress_view(request): ).order_by("-modified_at") crawl_process_pids: dict[str, int] = {} snapshot_process_pids: dict[str, int] = {} - process_records_by_crawl: dict[str, list[dict[str, object]]] = {} - process_records_by_snapshot: dict[str, list[dict[str, object]]] = {} + process_records_by_crawl: dict[str, list[tuple[dict[str, object], object | None]]] = {} + process_records_by_snapshot: dict[str, list[tuple[dict[str, object], object | None]]] = {} seen_process_records: set[str] = set() for proc in running_processes: env = proc.env or {} if not isinstance(env, dict): env = {} - crawl_id = env.get('CRAWL_ID') - snapshot_id = env.get('SNAPSHOT_ID') + crawl_id = env.get("CRAWL_ID") + snapshot_id = env.get("SNAPSHOT_ID") _plugin, _label, phase, _hook_name = process_label(proc.cmd) if crawl_id and proc.pid: crawl_process_pids.setdefault(str(crawl_id), proc.pid) @@ -1291,7 +1372,11 @@ def live_progress_view(request): continue seen_process_records.add(proc_key) - status = "started" if proc.status == Process.StatusChoices.RUNNING else ("failed" if proc.exit_code not in (None, 0) else "succeeded") + status = ( + "started" + if proc.status == Process.StatusChoices.RUNNING + else ("failed" if proc.exit_code not in (None, 0) else "succeeded") + ) payload: dict[str, object] = { "id": str(proc.id), "plugin": plugin, @@ -1304,20 +1389,25 @@ def live_progress_view(request): } if status == "started" and proc.pid: payload["pid"] = proc.pid + proc_started_at = proc.started_at or proc.modified_at if phase == "snapshot" and snapshot_id: - process_records_by_snapshot.setdefault(str(snapshot_id), []).append(payload) + process_records_by_snapshot.setdefault(str(snapshot_id), []).append((payload, proc_started_at)) elif crawl_id: - process_records_by_crawl.setdefault(str(crawl_id), []).append(payload) + process_records_by_crawl.setdefault(str(crawl_id), []).append((payload, proc_started_at)) - active_crawls_qs = Crawl.objects.filter( - status__in=[Crawl.StatusChoices.QUEUED, Crawl.StatusChoices.STARTED] - ).prefetch_related( - 'snapshot_set', - 'snapshot_set__archiveresult_set', - 'snapshot_set__archiveresult_set__process', - ).distinct().order_by('-modified_at')[:10] + active_crawls_qs = ( + Crawl.objects.filter(status__in=[Crawl.StatusChoices.QUEUED, Crawl.StatusChoices.STARTED]) + .prefetch_related( + "snapshot_set", + "snapshot_set__archiveresult_set", + "snapshot_set__archiveresult_set__process", + ) + .distinct() + .order_by("-modified_at")[:10] + ) active_crawls = [] + total_workers = 0 for crawl in active_crawls_qs: # Get ALL snapshots for this crawl to count status (already prefetched) all_crawl_snapshots = list(crawl.snapshot_set.all()) @@ -1330,18 +1420,23 @@ def live_progress_view(request): # Get only ACTIVE snapshots to display (limit to 5 most recent) active_crawl_snapshots = [ - s for s in all_crawl_snapshots - if s.status in [Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED] + s for s in all_crawl_snapshots if s.status in [Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED] ][:5] # Count URLs in the crawl (for when snapshots haven't been created yet) urls_count = 0 if crawl.urls: - urls_count = len([u for u in crawl.urls.split('\n') if u.strip() and not u.startswith('#')]) + urls_count = len([u for u in crawl.urls.split("\n") if u.strip() and not u.startswith("#")]) # Calculate crawl progress crawl_progress = int((completed_snapshots / total_snapshots) * 100) if total_snapshots > 0 else 0 - crawl_setup_plugins = list(process_records_by_crawl.get(str(crawl.id), [])) + crawl_run_started_at = crawl.created_at + crawl_setup_plugins = [ + payload + for payload, proc_started_at in process_records_by_crawl.get(str(crawl.id), []) + if is_current_run_timestamp(proc_started_at, crawl_run_started_at) + ] + total_workers += sum(1 for item in crawl_setup_plugins if item.get("source") == "process" and item.get("status") == "started") crawl_setup_total = len(crawl_setup_plugins) crawl_setup_completed = sum(1 for item in crawl_setup_plugins if item.get("status") == "succeeded") crawl_setup_failed = sum(1 for item in crawl_setup_plugins if item.get("status") == "failed") @@ -1350,8 +1445,11 @@ def live_progress_view(request): # Get active snapshots for this crawl (already prefetched) active_snapshots_for_crawl = [] for snapshot in active_crawl_snapshots: + snapshot_run_started_at = snapshot.downloaded_at or snapshot.created_at # Get archive results for this snapshot (already prefetched) - snapshot_results = snapshot.archiveresult_set.all() + snapshot_results = [ + ar for ar in snapshot.archiveresult_set.all() if archiveresult_matches_current_run(ar, snapshot_run_started_at) + ] now = timezone.now() plugin_progress_values: list[int] = [] @@ -1393,26 +1491,26 @@ def live_progress_view(request): plugin, label, phase, hook_name = hook_details(ar.hook_name or ar.plugin, plugin=ar.plugin) plugin_payload = { - 'id': str(ar.id), - 'plugin': ar.plugin, - 'label': label, - 'hook_name': hook_name, - 'phase': phase, - 'status': status, - 'process_id': str(ar.process_id) if ar.process_id else None, + "id": str(ar.id), + "plugin": ar.plugin, + "label": label, + "hook_name": hook_name, + "phase": phase, + "status": status, + "process_id": str(ar.process_id) if ar.process_id else None, } if status == ArchiveResult.StatusChoices.STARTED and ar.process_id and ar.process: - plugin_payload['pid'] = ar.process.pid + plugin_payload["pid"] = ar.process.pid if status == ArchiveResult.StatusChoices.STARTED: - plugin_payload['progress'] = progress_value - plugin_payload['timeout'] = ar.timeout or 120 - plugin_payload['source'] = 'archiveresult' + plugin_payload["progress"] = progress_value + plugin_payload["timeout"] = ar.timeout or 120 + plugin_payload["source"] = "archiveresult" all_plugins.append(plugin_payload) - seen_plugin_keys.add( - str(ar.process_id) if ar.process_id else f"{ar.plugin}:{hook_name}" - ) + seen_plugin_keys.add(str(ar.process_id) if ar.process_id else f"{ar.plugin}:{hook_name}") - for proc_payload in process_records_by_snapshot.get(str(snapshot.id), []): + for proc_payload, proc_started_at in process_records_by_snapshot.get(str(snapshot.id), []): + if not is_current_run_timestamp(proc_started_at, snapshot_run_started_at): + continue proc_key = str(proc_payload.get("process_id") or f"{proc_payload.get('plugin')}:{proc_payload.get('hook_name')}") if proc_key in seen_plugin_keys: continue @@ -1424,6 +1522,7 @@ def live_progress_view(request): plugin_progress_values.append(100) elif proc_status == "started": plugin_progress_values.append(1) + total_workers += 1 else: plugin_progress_values.append(0) @@ -1434,19 +1533,23 @@ def live_progress_view(request): snapshot_progress = int(sum(plugin_progress_values) / len(plugin_progress_values)) if plugin_progress_values else 0 - active_snapshots_for_crawl.append({ - 'id': str(snapshot.id), - 'url': snapshot.url[:80], - 'status': snapshot.status, - 'started': snapshot.modified_at.isoformat() if snapshot.modified_at else None, - 'progress': snapshot_progress, - 'total_plugins': total_plugins, - 'completed_plugins': completed_plugins, - 'failed_plugins': failed_plugins, - 'pending_plugins': pending_plugins, - 'all_plugins': all_plugins, - 'worker_pid': snapshot_process_pids.get(str(snapshot.id)), - }) + active_snapshots_for_crawl.append( + { + "id": str(snapshot.id), + "url": snapshot.url[:80], + "status": snapshot.status, + "started": (snapshot.downloaded_at or snapshot.created_at).isoformat() + if (snapshot.downloaded_at or snapshot.created_at) + else None, + "progress": snapshot_progress, + "total_plugins": total_plugins, + "completed_plugins": completed_plugins, + "failed_plugins": failed_plugins, + "pending_plugins": pending_plugins, + "all_plugins": all_plugins, + "worker_pid": snapshot_process_pids.get(str(snapshot.id)), + }, + ) # Check if crawl can start (for debugging stuck crawls) can_start = bool(crawl.urls) @@ -1456,115 +1559,124 @@ def live_progress_view(request): retry_at_future = crawl.retry_at > timezone.now() if crawl.retry_at else False seconds_until_retry = int((crawl.retry_at - timezone.now()).total_seconds()) if crawl.retry_at and retry_at_future else 0 - active_crawls.append({ - 'id': str(crawl.id), - 'label': str(crawl)[:60], - 'status': crawl.status, - 'started': crawl.modified_at.isoformat() if crawl.modified_at else None, - 'progress': crawl_progress, - 'max_depth': crawl.max_depth, - 'urls_count': urls_count, - 'total_snapshots': total_snapshots, - 'completed_snapshots': completed_snapshots, - 'started_snapshots': started_snapshots, - 'failed_snapshots': 0, - 'pending_snapshots': pending_snapshots, - 'setup_plugins': crawl_setup_plugins, - 'setup_total_plugins': crawl_setup_total, - 'setup_completed_plugins': crawl_setup_completed, - 'setup_failed_plugins': crawl_setup_failed, - 'setup_pending_plugins': crawl_setup_pending, - 'active_snapshots': active_snapshots_for_crawl, - 'can_start': can_start, - 'urls_preview': urls_preview, - 'retry_at_future': retry_at_future, - 'seconds_until_retry': seconds_until_retry, - 'worker_pid': crawl_process_pids.get(str(crawl.id)), - }) + active_crawls.append( + { + "id": str(crawl.id), + "label": str(crawl)[:60], + "status": crawl.status, + "started": crawl.created_at.isoformat() if crawl.created_at else None, + "progress": crawl_progress, + "max_depth": crawl.max_depth, + "urls_count": urls_count, + "total_snapshots": total_snapshots, + "completed_snapshots": completed_snapshots, + "started_snapshots": started_snapshots, + "failed_snapshots": 0, + "pending_snapshots": pending_snapshots, + "setup_plugins": crawl_setup_plugins, + "setup_total_plugins": crawl_setup_total, + "setup_completed_plugins": crawl_setup_completed, + "setup_failed_plugins": crawl_setup_failed, + "setup_pending_plugins": crawl_setup_pending, + "active_snapshots": active_snapshots_for_crawl, + "can_start": can_start, + "urls_preview": urls_preview, + "retry_at_future": retry_at_future, + "seconds_until_retry": seconds_until_retry, + "worker_pid": crawl_process_pids.get(str(crawl.id)), + }, + ) - return JsonResponse({ - 'orchestrator_running': orchestrator_running, - 'orchestrator_pid': orchestrator_pid, - 'total_workers': total_workers, - 'crawls_pending': crawls_pending, - 'crawls_started': crawls_started, - 'crawls_recent': crawls_recent, - 'snapshots_pending': snapshots_pending, - 'snapshots_started': snapshots_started, - 'archiveresults_pending': archiveresults_pending, - 'archiveresults_started': archiveresults_started, - 'archiveresults_succeeded': archiveresults_succeeded, - 'archiveresults_failed': archiveresults_failed, - 'active_crawls': active_crawls, - 'recent_thumbnails': recent_thumbnails, - 'server_time': timezone.now().isoformat(), - }) + return JsonResponse( + { + "orchestrator_running": orchestrator_running, + "orchestrator_pid": orchestrator_pid, + "total_workers": total_workers, + "crawls_pending": crawls_pending, + "crawls_started": crawls_started, + "crawls_recent": crawls_recent, + "snapshots_pending": snapshots_pending, + "snapshots_started": snapshots_started, + "archiveresults_pending": archiveresults_pending, + "archiveresults_started": archiveresults_started, + "archiveresults_succeeded": archiveresults_succeeded, + "archiveresults_failed": archiveresults_failed, + "active_crawls": active_crawls, + "recent_thumbnails": recent_thumbnails, + "server_time": timezone.now().isoformat(), + }, + ) except Exception as e: import traceback - return JsonResponse({ - 'error': str(e), - 'traceback': traceback.format_exc(), - 'orchestrator_running': False, - 'total_workers': 0, - 'crawls_pending': 0, - 'crawls_started': 0, - 'crawls_recent': 0, - 'snapshots_pending': 0, - 'snapshots_started': 0, - 'archiveresults_pending': 0, - 'archiveresults_started': 0, - 'archiveresults_succeeded': 0, - 'archiveresults_failed': 0, - 'active_crawls': [], - 'recent_thumbnails': [], - 'server_time': timezone.now().isoformat(), - }, status=500) + + return JsonResponse( + { + "error": str(e), + "traceback": traceback.format_exc(), + "orchestrator_running": False, + "total_workers": 0, + "crawls_pending": 0, + "crawls_started": 0, + "crawls_recent": 0, + "snapshots_pending": 0, + "snapshots_started": 0, + "archiveresults_pending": 0, + "archiveresults_started": 0, + "archiveresults_succeeded": 0, + "archiveresults_failed": 0, + "active_crawls": [], + "recent_thumbnails": [], + "server_time": timezone.now().isoformat(), + }, + status=500, + ) def find_config_section(key: str) -> str: CONFIGS = get_all_configs() - + if key in CONSTANTS_CONFIG: - return 'CONSTANT' - matching_sections = [ - section_id for section_id, section in CONFIGS.items() if key in dict(section) - ] - section = matching_sections[0] if matching_sections else 'DYNAMIC' + return "CONSTANT" + matching_sections = [section_id for section_id, section in CONFIGS.items() if key in dict(section)] + section = matching_sections[0] if matching_sections else "DYNAMIC" return section + def find_config_default(key: str) -> str: CONFIGS = get_all_configs() - + if key in CONSTANTS_CONFIG: return str(CONSTANTS_CONFIG[key]) - + default_val = None for config in CONFIGS.values(): if key in dict(config): - default_field = getattr(config, 'model_fields', dict(config))[key] - default_val = default_field.default if hasattr(default_field, 'default') else default_field + default_field = getattr(config, "model_fields", dict(config))[key] + default_val = default_field.default if hasattr(default_field, "default") else default_field break - + if isinstance(default_val, Callable): - default_val = inspect.getsource(default_val).split('lambda', 1)[-1].split(':', 1)[-1].replace('\n', ' ').strip() - if default_val.count(')') > default_val.count('('): + default_val = inspect.getsource(default_val).split("lambda", 1)[-1].split(":", 1)[-1].replace("\n", " ").strip() + if default_val.count(")") > default_val.count("("): default_val = default_val[:-1] else: default_val = str(default_val) - + return default_val + def find_config_type(key: str) -> str: from typing import ClassVar + CONFIGS = get_all_configs() for config in CONFIGS.values(): if hasattr(config, key): # Try to get from pydantic model_fields first (more reliable) - if hasattr(config, 'model_fields') and key in config.model_fields: + if hasattr(config, "model_fields") and key in config.model_fields: field = config.model_fields[key] - if hasattr(field, 'annotation') and field.annotation is not None: + if hasattr(field, "annotation") and field.annotation is not None: try: return str(field.annotation.__name__) except AttributeError: @@ -1573,13 +1685,14 @@ def find_config_type(key: str) -> str: # Fallback to get_type_hints with proper namespace try: import typing + namespace = { - 'ClassVar': ClassVar, - 'Optional': typing.Optional, - 'Union': typing.Union, - 'List': typing.List, - 'Dict': typing.Dict, - 'Path': Path, + "ClassVar": ClassVar, + "Optional": typing.Optional, + "Union": typing.Union, + "List": list, + "Dict": dict, + "Path": Path, } type_hints = get_type_hints(config, globalns=namespace, localns=namespace) try: @@ -1589,43 +1702,46 @@ def find_config_type(key: str) -> str: except Exception: # If all else fails, return str pass - return 'str' + return "str" + def key_is_safe(key: str) -> bool: - for term in ('key', 'password', 'secret', 'token'): + for term in ("key", "password", "secret", "token"): if term in key.lower(): return False return True + def find_config_source(key: str, merged_config: dict) -> str: """Determine where a config value comes from.""" from archivebox.machine.models import Machine # Environment variables override all persistent config sources. if key in os.environ: - return 'Environment' + return "Environment" # Machine.config overrides ArchiveBox.conf. try: machine = Machine.current() if machine.config and key in machine.config: - return 'Machine' + return "Machine" except Exception: pass # Check if it's from archivebox.config.file from archivebox.config.configset import BaseConfigSet + file_config = BaseConfigSet.load_from_file(CONSTANTS.CONFIG_FILE) if key in file_config: - return 'Config File' + return "Config File" # Otherwise it's using the default - return 'Default' + return "Default" def find_plugin_for_config_key(key: str) -> str | None: for plugin_name, schema in discover_plugin_configs().items(): - if key in (schema.get('properties') or {}): + if key in (schema.get("properties") or {}): return plugin_name return None @@ -1634,8 +1750,8 @@ def get_config_definition_link(key: str) -> tuple[str, str]: plugin_name = find_plugin_for_config_key(key) if not plugin_name: return ( - f'https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig+{quote(key)}&type=code', - 'archivebox/config', + f"https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig+{quote(key)}&type=code", + "archivebox/config", ) plugin_dir = next((path.resolve() for path in iter_plugin_dirs() if path.name == plugin_name), None) @@ -1643,20 +1759,20 @@ def get_config_definition_link(key: str) -> tuple[str, str]: builtin_root = BUILTIN_PLUGINS_DIR.resolve() if plugin_dir.is_relative_to(builtin_root): return ( - f'{ABX_PLUGINS_GITHUB_BASE_URL}{quote(plugin_name)}/config.json', - f'abx_plugins/plugins/{plugin_name}/config.json', + f"{ABX_PLUGINS_GITHUB_BASE_URL}{quote(plugin_name)}/config.json", + f"abx_plugins/plugins/{plugin_name}/config.json", ) user_root = USER_PLUGINS_DIR.resolve() if plugin_dir.is_relative_to(user_root): return ( - f'{LIVE_PLUGIN_BASE_URL}user.{quote(plugin_name)}/', - f'data/custom_plugins/{plugin_name}/config.json', + f"{LIVE_PLUGIN_BASE_URL}user.{quote(plugin_name)}/", + f"data/custom_plugins/{plugin_name}/config.json", ) return ( - f'{LIVE_PLUGIN_BASE_URL}builtin.{quote(plugin_name)}/', - f'abx_plugins/plugins/{plugin_name}/config.json', + f"{LIVE_PLUGIN_BASE_URL}builtin.{quote(plugin_name)}/", + f"abx_plugins/plugins/{plugin_name}/config.json", ) @@ -1664,11 +1780,12 @@ def get_config_definition_link(key: str) -> tuple[str, str]: def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext: CONFIGS = get_all_configs() - assert getattr(request.user, 'is_superuser', False), 'Must be a superuser to view configuration settings.' + assert getattr(request.user, "is_superuser", False), "Must be a superuser to view configuration settings." # Get merged config that includes Machine.config overrides try: from archivebox.machine.models import Machine + Machine.current() merged_config = get_config() except Exception: @@ -1688,45 +1805,48 @@ def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext: for section_id, section in reversed(list(CONFIGS.items())): for key in dict(section).keys(): - rows['Section'].append(section_id) # section.replace('_', ' ').title().replace(' Config', '') - rows['Key'].append(ItemLink(key, key=key)) - rows['Type'].append(format_html('{}', find_config_type(key))) + rows["Section"].append(section_id) # section.replace('_', ' ').title().replace(' Config', '') + rows["Key"].append(ItemLink(key, key=key)) + rows["Type"].append(format_html("{}", find_config_type(key))) # Use merged config value (includes machine overrides) actual_value = merged_config.get(key, getattr(section, key, None)) - rows['Value'].append(mark_safe(f'{actual_value}') if key_is_safe(key) else '******** (redacted)') + rows["Value"].append(mark_safe(f"{actual_value}") if key_is_safe(key) else "******** (redacted)") # Show where the value comes from source = find_config_source(key, merged_config) - source_colors = { - 'Machine': 'purple', - 'Environment': 'blue', - 'Config File': 'green', - 'Default': 'gray' - } - rows['Source'].append(format_html('{}', source_colors.get(source, 'gray'), source)) + source_colors = {"Machine": "purple", "Environment": "blue", "Config File": "green", "Default": "gray"} + rows["Source"].append(format_html('{}', source_colors.get(source, "gray"), source)) - rows['Default'].append(mark_safe(f'{find_config_default(key) or "See here..."}')) + rows["Default"].append( + mark_safe( + f'{find_config_default(key) or "See here..."}', + ), + ) # rows['Documentation'].append(mark_safe(f'Wiki: {key}')) # rows['Aliases'].append(', '.join(find_config_aliases(key))) - section = 'CONSTANT' + section = "CONSTANT" for key in CONSTANTS_CONFIG.keys(): - rows['Section'].append(section) # section.replace('_', ' ').title().replace(' Config', '') - rows['Key'].append(ItemLink(key, key=key)) - rows['Type'].append(format_html('{}', getattr(type(CONSTANTS_CONFIG[key]), '__name__', str(CONSTANTS_CONFIG[key])))) - rows['Value'].append(format_html('{}', CONSTANTS_CONFIG[key]) if key_is_safe(key) else '******** (redacted)') - rows['Source'].append(mark_safe('Constant')) - rows['Default'].append(mark_safe(f'{find_config_default(key) or "See here..."}')) + rows["Section"].append(section) # section.replace('_', ' ').title().replace(' Config', '') + rows["Key"].append(ItemLink(key, key=key)) + rows["Type"].append(format_html("{}", getattr(type(CONSTANTS_CONFIG[key]), "__name__", str(CONSTANTS_CONFIG[key])))) + rows["Value"].append(format_html("{}", CONSTANTS_CONFIG[key]) if key_is_safe(key) else "******** (redacted)") + rows["Source"].append(mark_safe('Constant')) + rows["Default"].append( + mark_safe( + f'{find_config_default(key) or "See here..."}', + ), + ) # rows['Documentation'].append(mark_safe(f'Wiki: {key}')) # rows['Aliases'].append('') - return TableContext( title="Computed Configuration Values", table=rows, ) + @render_with_item_view def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemContext: from archivebox.machine.models import Machine @@ -1735,7 +1855,7 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont CONFIGS = get_all_configs() FLAT_CONFIG = get_flat_config() - assert getattr(request.user, 'is_superuser', False), 'Must be a superuser to view configuration settings.' + assert getattr(request.user, "is_superuser", False), "Must be a superuser to view configuration settings." # Get merged config merged_config = get_config() @@ -1745,16 +1865,16 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont # Environment variable if key in os.environ: - sources_info.append(('Environment', os.environ[key] if key_is_safe(key) else '********', 'blue')) + sources_info.append(("Environment", os.environ[key] if key_is_safe(key) else "********", "blue")) # Machine config machine = None machine_admin_url = None try: machine = Machine.current() - machine_admin_url = f'/admin/machine/machine/{machine.id}/change/' + machine_admin_url = f"/admin/machine/machine/{machine.id}/change/" if machine.config and key in machine.config: - sources_info.append(('Machine', machine.config[key] if key_is_safe(key) else '********', 'purple')) + sources_info.append(("Machine", machine.config[key] if key_is_safe(key) else "********", "purple")) except Exception: pass @@ -1762,60 +1882,68 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont if CONSTANTS.CONFIG_FILE.exists(): file_config = BaseConfigSet.load_from_file(CONSTANTS.CONFIG_FILE) if key in file_config: - sources_info.append(('Config File', file_config[key], 'green')) + sources_info.append(("Config File", file_config[key], "green")) # Default value default_val = find_config_default(key) if default_val: - sources_info.append(('Default', default_val, 'gray')) + sources_info.append(("Default", default_val, "gray")) # Final computed value final_value = merged_config.get(key, FLAT_CONFIG.get(key, CONFIGS.get(key, None))) if not key_is_safe(key): - final_value = '********' + final_value = "********" # Build sources display - sources_html = '
'.join([ - f'{source}: {value}' - for source, value, color in sources_info - ]) + sources_html = "
".join([f'{source}: {value}' for source, value, color in sources_info]) # aliases = USER_CONFIG.get(key, {}).get("aliases", []) aliases = [] if key in CONSTANTS_CONFIG: - section_header = mark_safe(f'[CONSTANTS]   {key}   (read-only, hardcoded by ArchiveBox)') + section_header = mark_safe( + f'[CONSTANTS]   {key}   (read-only, hardcoded by ArchiveBox)', + ) elif key in FLAT_CONFIG: - section_header = mark_safe(f'data / ArchiveBox.conf   [{find_config_section(key)}]   {key}') + section_header = mark_safe( + f'data / ArchiveBox.conf   [{find_config_section(key)}]   {key}', + ) else: - section_header = mark_safe(f'[DYNAMIC CONFIG]   {key}   (read-only, calculated at runtime)') - + section_header = mark_safe( + f'[DYNAMIC CONFIG]   {key}   (read-only, calculated at runtime)', + ) definition_url, definition_label = get_config_definition_link(key) - section_data = cast(SectionData, { - "name": section_header, - "description": None, - "fields": { - 'Key': key, - 'Type': find_config_type(key), - 'Value': final_value, - 'Currently read from': find_config_source(key, merged_config), - }, - "help_texts": { - 'Key': mark_safe(f''' + section_data = cast( + SectionData, + { + "name": section_header, + "description": None, + "fields": { + "Key": key, + "Type": find_config_type(key), + "Value": final_value, + "Currently read from": find_config_source(key, merged_config), + }, + "help_texts": { + "Key": mark_safe(f""" Documentation   Aliases: {", ".join(aliases)} - '''), - 'Type': mark_safe(f''' + """), + "Type": mark_safe(f''' See full definition in {definition_label}... '''), - 'Value': mark_safe(f''' - {'Value is redacted for your security. (Passwords, secrets, API tokens, etc. cannot be viewed in the Web UI)

' if not key_is_safe(key) else ''} + "Value": mark_safe(f''' + { + 'Value is redacted for your security. (Passwords, secrets, API tokens, etc. cannot be viewed in the Web UI)

' + if not key_is_safe(key) + else "" + }


Configuration Sources (highest priority first):

{sources_html} @@ -1824,28 +1952,29 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont To change this value, edit data/ArchiveBox.conf or run:

archivebox config --set {key}="{ - val.strip("'") - if (val := find_config_default(key)) else - (str(FLAT_CONFIG[key] if key_is_safe(key) else '********')).strip("'") - }" + val.strip("'") + if (val := find_config_default(key)) + else (str(FLAT_CONFIG[key] if key_is_safe(key) else "********")).strip("'") + }"

'''), - 'Currently read from': mark_safe(f''' + "Currently read from": mark_safe(f""" The value shown in the "Value" field comes from the {find_config_source(key, merged_config)} source.

Priority order (highest to lowest):
  1. Environment - Environment variables
  2. Machine - Machine-specific overrides (e.g., resolved binary paths) - {f'
    → Edit {key} in Machine.config for this server' if machine_admin_url else ''} + {f'
    → Edit {key} in Machine.config for this server' if machine_admin_url else ""}
  3. Config File - data/ArchiveBox.conf
  4. Default - Default value from code
- {f'
Tip: To override {key} on this machine, edit the Machine.config field and add:
{{"\\"{key}\\": "your_value_here"}}' if machine_admin_url and key not in CONSTANTS_CONFIG else ''} - '''), + {f'
Tip: To override {key} on this machine, edit the Machine.config field and add:
{{"\\"{key}\\": "your_value_here"}}' if machine_admin_url and key not in CONSTANTS_CONFIG else ""} + """), + }, }, - }) + ) return ItemContext( slug=key, diff --git a/archivebox/core/widgets.py b/archivebox/core/widgets.py index af30544c..77e62b1f 100644 --- a/archivebox/core/widgets.py +++ b/archivebox/core/widgets.py @@ -1,4 +1,4 @@ -__package__ = 'archivebox.core' +__package__ = "archivebox.core" import json import re @@ -16,10 +16,11 @@ class TagEditorWidget(forms.Widget): - Press Enter or Space to create new tags (auto-creates if doesn't exist) - Uses AJAX for autocomplete and tag creation """ + template_name = "" # We render manually class Media: - css = {'all': []} + css = {"all": []} js = [] def __init__(self, attrs=None, snapshot_id=None): @@ -28,24 +29,24 @@ class TagEditorWidget(forms.Widget): def _escape(self, value): """Escape HTML entities in value.""" - return escape(str(value)) if value else '' + return escape(str(value)) if value else "" def _normalize_id(self, value): """Normalize IDs for HTML + JS usage (letters, digits, underscore; JS-safe start).""" - normalized = re.sub(r'[^A-Za-z0-9_]', '_', str(value)) - if not normalized or not re.match(r'[A-Za-z_]', normalized): - normalized = f't_{normalized}' + normalized = re.sub(r"[^A-Za-z0-9_]", "_", str(value)) + if not normalized or not re.match(r"[A-Za-z_]", normalized): + normalized = f"t_{normalized}" return normalized def _tag_style(self, value): """Compute a stable pastel color style for a tag value.""" - tag = (value or '').strip().lower() - digest = hashlib.md5(tag.encode('utf-8')).hexdigest() + tag = (value or "").strip().lower() + digest = hashlib.md5(tag.encode("utf-8")).hexdigest() hue = int(digest[:4], 16) % 360 - bg = f'hsl({hue}, 70%, 92%)' - border = f'hsl({hue}, 60%, 82%)' - fg = f'hsl({hue}, 35%, 28%)' - return f'--tag-bg: {bg}; --tag-border: {border}; --tag-fg: {fg};' + bg = f"hsl({hue}, 70%, 92%)" + border = f"hsl({hue}, 60%, 82%)" + fg = f"hsl({hue}, 35%, 28%)" + return f"--tag-bg: {bg}; --tag-border: {border}; --tag-fg: {fg};" def render(self, name, value, attrs=None, renderer=None): """ @@ -64,14 +65,15 @@ class TagEditorWidget(forms.Widget): # Parse value to get list of tag names tags = [] if value: - if hasattr(value, 'all'): # QuerySet + if hasattr(value, "all"): # QuerySet tags = sorted([tag.name for tag in value.all()]) elif isinstance(value, (list, tuple)): - if value and hasattr(value[0], 'name'): # List of Tag objects + if value and hasattr(value[0], "name"): # List of Tag objects tags = sorted([tag.name for tag in value]) else: # List of strings or IDs # Could be tag IDs from form submission from archivebox.core.models import Tag + tag_names = [] for v in value: if isinstance(v, str) and not v.isdigit(): @@ -85,13 +87,13 @@ class TagEditorWidget(forms.Widget): tag_names.append(v) tags = sorted(tag_names) elif isinstance(value, str): - tags = sorted([t.strip() for t in value.split(',') if t.strip()]) + tags = sorted([t.strip() for t in value.split(",") if t.strip()]) - widget_id_raw = attrs.get('id', name) if attrs else name + widget_id_raw = attrs.get("id", name) if attrs else name widget_id = self._normalize_id(widget_id_raw) # Build pills HTML - pills_html = '' + pills_html = "" for tag in tags: pills_html += f''' @@ -113,11 +115,11 @@ class TagEditorWidget(forms.Widget): placeholder="Add tag..." autocomplete="off" onkeydown="handleTagKeydown_{widget_id}(event)" - onkeypress="if(event.key==='Enter' || event.keyCode===13){{event.preventDefault(); event.stopPropagation();}}" + onkeypress="if(event.key==='Enter' || event.keyCode===13 || event.key===' ' || event.code==='Space' || event.key==='Spacebar'){{event.preventDefault(); event.stopPropagation();}}" oninput="fetchTagAutocomplete_{widget_id}(this.value)" > - +
- ''' if crawl is not None else ''} - ''') + ''' + if crawl is not None + else "" + } + """) class URLFiltersWidget(forms.Widget): def render(self, name, value, attrs=None, renderer=None): value = value if isinstance(value, dict) else {} - widget_id = (attrs or {}).get('id', name) - allowlist = escape(value.get('allowlist', '') or '') - denylist = escape(value.get('denylist', '') or '') + widget_id = (attrs or {}).get("id", name) + allowlist = escape(value.get("allowlist", "") or "") + denylist = escape(value.get("denylist", "") or "") return mark_safe(f'''
@@ -340,9 +343,9 @@ class URLFiltersWidget(forms.Widget): def value_from_datadict(self, data, files, name): return { - 'allowlist': data.get(f'{name}_allowlist', ''), - 'denylist': data.get(f'{name}_denylist', ''), - 'same_domain_only': data.get(f'{name}_same_domain_only') in ('1', 'on', 'true'), + "allowlist": data.get(f"{name}_allowlist", ""), + "denylist": data.get(f"{name}_denylist", ""), + "same_domain_only": data.get(f"{name}_same_domain_only") in ("1", "on", "true"), } @@ -352,54 +355,59 @@ class URLFiltersField(forms.Field): def to_python(self, value): if isinstance(value, dict): return value - return {'allowlist': '', 'denylist': '', 'same_domain_only': False} + return {"allowlist": "", "denylist": "", "same_domain_only": False} class CrawlAdminForm(forms.ModelForm): """Custom form for Crawl admin to render urls field as textarea.""" + tags_editor = forms.CharField( - label='Tags', + label="Tags", required=False, widget=TagEditorWidget(), - help_text='Type tag names and press Enter or Space to add. Click × to remove.', + help_text="Type tag names and press Enter or Space to add. Click × to remove.", ) url_filters = URLFiltersField( - label='URL Filters', + label="URL Filters", required=False, - help_text='Set URL_ALLOWLIST / URL_DENYLIST for this crawl.', + help_text="Set URL_ALLOWLIST / URL_DENYLIST for this crawl.", ) class Meta: model = Crawl - fields = '__all__' + fields = "__all__" widgets = { - 'urls': forms.Textarea(attrs={ - 'rows': 8, - 'style': 'width: 100%; font-family: monospace; font-size: 13px;', - 'placeholder': 'https://example.com\nhttps://example2.com\n# Comments start with #', - }), - 'notes': forms.Textarea(attrs={ - 'rows': 1, - 'style': 'width: 100%; min-height: 0; resize: vertical;', - }), + "urls": forms.Textarea( + attrs={ + "rows": 8, + "style": "width: 100%; font-family: monospace; font-size: 13px;", + "placeholder": "https://example.com\nhttps://example2.com\n# Comments start with #", + }, + ), + "notes": forms.Textarea( + attrs={ + "rows": 1, + "style": "width: 100%; min-height: 0; resize: vertical;", + }, + ), } def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) config = dict(self.instance.config or {}) if self.instance and self.instance.pk else {} if self.instance and self.instance.pk: - self.initial['tags_editor'] = self.instance.tags_str - self.initial['url_filters'] = { - 'allowlist': config.get('URL_ALLOWLIST', ''), - 'denylist': config.get('URL_DENYLIST', ''), - 'same_domain_only': False, + self.initial["tags_editor"] = self.instance.tags_str + self.initial["url_filters"] = { + "allowlist": config.get("URL_ALLOWLIST", ""), + "denylist": config.get("URL_DENYLIST", ""), + "same_domain_only": False, } def clean_tags_editor(self): - tags_str = self.cleaned_data.get('tags_editor', '') + tags_str = self.cleaned_data.get("tags_editor", "") tag_names = [] seen = set() - for raw_name in tags_str.split(','): + for raw_name in tags_str.split(","): name = raw_name.strip() if not name: continue @@ -408,28 +416,28 @@ class CrawlAdminForm(forms.ModelForm): continue seen.add(lowered) tag_names.append(name) - return ','.join(tag_names) + return ",".join(tag_names) def clean_url_filters(self): - value = self.cleaned_data.get('url_filters') or {} + value = self.cleaned_data.get("url_filters") or {} return { - 'allowlist': '\n'.join(Crawl.split_filter_patterns(value.get('allowlist', ''))), - 'denylist': '\n'.join(Crawl.split_filter_patterns(value.get('denylist', ''))), - 'same_domain_only': bool(value.get('same_domain_only')), + "allowlist": "\n".join(Crawl.split_filter_patterns(value.get("allowlist", ""))), + "denylist": "\n".join(Crawl.split_filter_patterns(value.get("denylist", ""))), + "same_domain_only": bool(value.get("same_domain_only")), } def save(self, commit=True): instance = super().save(commit=False) - instance.tags_str = self.cleaned_data.get('tags_editor', '') - url_filters = self.cleaned_data.get('url_filters') or {} + instance.tags_str = self.cleaned_data.get("tags_editor", "") + url_filters = self.cleaned_data.get("url_filters") or {} instance.set_url_filters( - url_filters.get('allowlist', ''), - url_filters.get('denylist', ''), + url_filters.get("allowlist", ""), + url_filters.get("denylist", ""), ) if commit: instance.save() instance.apply_crawl_config_filters() - save_m2m = getattr(self, '_save_m2m', None) + save_m2m = getattr(self, "_save_m2m", None) if callable(save_m2m): save_m2m() return instance @@ -437,77 +445,138 @@ class CrawlAdminForm(forms.ModelForm): class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin): form = CrawlAdminForm - list_display = ('id', 'created_at', 'created_by', 'max_depth', 'label', 'notes', 'urls_preview', 'schedule_str', 'status', 'retry_at', 'health_display', 'num_snapshots') - sort_fields = ('id', 'created_at', 'created_by', 'max_depth', 'label', 'notes', 'schedule_str', 'status', 'retry_at') - search_fields = ('id', 'created_by__username', 'max_depth', 'label', 'notes', 'schedule_id', 'status', 'urls') + list_display = ( + "id", + "created_at", + "created_by", + "max_depth", + "max_urls", + "max_size", + "label", + "notes", + "urls_preview", + "schedule_str", + "status", + "retry_at", + "health_display", + "num_snapshots", + ) + sort_fields = ( + "id", + "created_at", + "created_by", + "max_depth", + "max_urls", + "max_size", + "label", + "notes", + "schedule_str", + "status", + "retry_at", + ) + search_fields = ("id", "created_by__username", "max_depth", "max_urls", "max_size", "label", "notes", "schedule_id", "status", "urls") - readonly_fields = ('created_at', 'modified_at', 'snapshots') + readonly_fields = ("created_at", "modified_at", "snapshots") fieldsets = ( - ('URLs', { - 'fields': ('urls',), - 'classes': ('card', 'wide'), - }), - ('Info', { - 'fields': ('label', 'notes', 'tags_editor'), - 'classes': ('card',), - }), - ('Settings', { - 'fields': (('max_depth', 'url_filters'), 'config'), - 'classes': ('card',), - }), - ('Status', { - 'fields': ('status', 'retry_at'), - 'classes': ('card',), - }), - ('Relations', { - 'fields': ('schedule', 'created_by'), - 'classes': ('card',), - }), - ('Timestamps', { - 'fields': ('created_at', 'modified_at'), - 'classes': ('card',), - }), - ('Snapshots', { - 'fields': ('snapshots',), - 'classes': ('card', 'wide'), - }), + ( + "URLs", + { + "fields": ("urls",), + "classes": ("card", "wide"), + }, + ), + ( + "Info", + { + "fields": ("label", "notes", "tags_editor"), + "classes": ("card",), + }, + ), + ( + "Settings", + { + "fields": (("max_depth", "max_urls", "max_size"), "url_filters", "config"), + "classes": ("card",), + }, + ), + ( + "Status", + { + "fields": ("status", "retry_at"), + "classes": ("card",), + }, + ), + ( + "Relations", + { + "fields": ("schedule", "created_by"), + "classes": ("card",), + }, + ), + ( + "Timestamps", + { + "fields": ("created_at", "modified_at"), + "classes": ("card",), + }, + ), + ( + "Snapshots", + { + "fields": ("snapshots",), + "classes": ("card", "wide"), + }, + ), ) add_fieldsets = ( - ('URLs', { - 'fields': ('urls',), - 'classes': ('card', 'wide'), - }), - ('Info', { - 'fields': ('label', 'notes', 'tags_editor'), - 'classes': ('card',), - }), - ('Settings', { - 'fields': (('max_depth', 'url_filters'), 'config'), - 'classes': ('card',), - }), - ('Status', { - 'fields': ('status', 'retry_at'), - 'classes': ('card',), - }), - ('Relations', { - 'fields': ('schedule', 'created_by'), - 'classes': ('card',), - }), + ( + "URLs", + { + "fields": ("urls",), + "classes": ("card", "wide"), + }, + ), + ( + "Info", + { + "fields": ("label", "notes", "tags_editor"), + "classes": ("card",), + }, + ), + ( + "Settings", + { + "fields": (("max_depth", "max_urls", "max_size"), "url_filters", "config"), + "classes": ("card",), + }, + ), + ( + "Status", + { + "fields": ("status", "retry_at"), + "classes": ("card",), + }, + ), + ( + "Relations", + { + "fields": ("schedule", "created_by"), + "classes": ("card",), + }, + ), ) - list_filter = ('max_depth', 'schedule', 'created_by', 'status', 'retry_at') - ordering = ['-created_at', '-retry_at'] + list_filter = ("max_depth", "max_urls", "schedule", "created_by", "status", "retry_at") + ordering = ["-created_at", "-retry_at"] list_per_page = 100 actions = ["delete_selected_batched"] - change_actions = ['recrawl'] + change_actions = ["recrawl"] def get_queryset(self, request): """Optimize queries with select_related and annotations.""" qs = super().get_queryset(request) - return qs.select_related('schedule', 'created_by').annotate( - num_snapshots_cached=Count('snapshot_set') - ) + return qs.select_related("schedule", "created_by").annotate(num_snapshots_cached=Count("snapshot_set")) def get_fieldsets(self, request, obj=None): return self.fieldsets if obj else self.add_fieldsets @@ -516,19 +585,19 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin): urls = super().get_urls() custom_urls = [ path( - '/snapshot//delete/', + "/snapshot//delete/", self.admin_site.admin_view(self.delete_snapshot_view), - name='crawls_crawl_snapshot_delete', + name="crawls_crawl_snapshot_delete", ), path( - '/snapshot//exclude-domain/', + "/snapshot//exclude-domain/", self.admin_site.admin_view(self.exclude_domain_view), - name='crawls_crawl_snapshot_exclude_domain', + name="crawls_crawl_snapshot_exclude_domain", ), ] return custom_urls + urls - @admin.action(description='Delete selected crawls') + @admin.action(description="Delete selected crawls") def delete_selected_batched(self, request, queryset): """Delete crawls in a single transaction to avoid SQLite concurrency issues.""" from django.db import transaction @@ -536,26 +605,28 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin): total = queryset.count() # Get list of IDs to delete first (outside transaction) - ids_to_delete = list(queryset.values_list('pk', flat=True)) + ids_to_delete = list(queryset.values_list("pk", flat=True)) # Delete everything in a single atomic transaction with transaction.atomic(): deleted_count, _ = Crawl.objects.filter(pk__in=ids_to_delete).delete() - messages.success(request, f'Successfully deleted {total} crawls ({deleted_count} total objects including related records).') + messages.success(request, f"Successfully deleted {total} crawls ({deleted_count} total objects including related records).") - @action(label='Recrawl', description='Create a new crawl with the same settings') + @action(label="Recrawl", description="Create a new crawl with the same settings") def recrawl(self, request, obj): """Duplicate this crawl as a new crawl with the same URLs and settings.""" # Validate URLs (required for crawl to start) if not obj.urls: - messages.error(request, 'Cannot recrawl: original crawl has no URLs.') - return redirect('admin:crawls_crawl_change', obj.id) + messages.error(request, "Cannot recrawl: original crawl has no URLs.") + return redirect("admin:crawls_crawl_change", obj.id) new_crawl = Crawl.objects.create( urls=obj.urls, max_depth=obj.max_depth, + max_urls=obj.max_urls, + max_size=obj.max_size, tags_str=obj.tags_str, config=obj.config, schedule=obj.schedule, @@ -566,24 +637,20 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin): retry_at=timezone.now(), ) - messages.success( - request, - f'Created new crawl {new_crawl.id} with the same settings. ' - f'It will start processing shortly.' - ) + messages.success(request, f"Created new crawl {new_crawl.id} with the same settings. It will start processing shortly.") - return redirect('admin:crawls_crawl_change', new_crawl.id) + return redirect("admin:crawls_crawl_change", new_crawl.id) def num_snapshots(self, obj): # Use cached annotation from get_queryset to avoid N+1 - return getattr(obj, 'num_snapshots_cached', obj.snapshot_set.count()) + return getattr(obj, "num_snapshots_cached", obj.snapshot_set.count()) def snapshots(self, obj): return render_snapshots_list(obj.snapshot_set.all(), crawl=obj) def delete_snapshot_view(self, request: HttpRequest, object_id: str, snapshot_id: str): - if request.method != 'POST': - return HttpResponseNotAllowed(['POST']) + if request.method != "POST": + return HttpResponseNotAllowed(["POST"]) crawl = get_object_or_404(Crawl, pk=object_id) snapshot = get_object_or_404(Snapshot, pk=snapshot_id, crawl=crawl) @@ -593,51 +660,55 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin): removed_urls = crawl.prune_url(snapshot.url) snapshot.delete() - return JsonResponse({ - 'ok': True, - 'snapshot_id': str(snapshot.id), - 'removed_urls': removed_urls, - }) + return JsonResponse( + { + "ok": True, + "snapshot_id": str(snapshot.id), + "removed_urls": removed_urls, + }, + ) def exclude_domain_view(self, request: HttpRequest, object_id: str, snapshot_id: str): - if request.method != 'POST': - return HttpResponseNotAllowed(['POST']) + if request.method != "POST": + return HttpResponseNotAllowed(["POST"]) crawl = get_object_or_404(Crawl, pk=object_id) snapshot = get_object_or_404(Snapshot, pk=snapshot_id, crawl=crawl) result = crawl.exclude_domain(snapshot.url) - return JsonResponse({ - 'ok': True, - **result, - }) + return JsonResponse( + { + "ok": True, + **result, + }, + ) - @admin.display(description='Schedule', ordering='schedule') + @admin.display(description="Schedule", ordering="schedule") def schedule_str(self, obj): if not obj.schedule: - return mark_safe('None') + return mark_safe("None") return format_html('{}', obj.schedule.admin_change_url, obj.schedule) - @admin.display(description='URLs', ordering='urls') + @admin.display(description="URLs", ordering="urls") def urls_preview(self, obj): - first_url = obj.get_urls_list()[0] if obj.get_urls_list() else '' - return first_url[:80] + '...' if len(first_url) > 80 else first_url + first_url = obj.get_urls_list()[0] if obj.get_urls_list() else "" + return first_url[:80] + "..." if len(first_url) > 80 else first_url - @admin.display(description='Health', ordering='health') + @admin.display(description="Health", ordering="health") def health_display(self, obj): h = obj.health - color = 'green' if h >= 80 else 'orange' if h >= 50 else 'red' + color = "green" if h >= 80 else "orange" if h >= 50 else "red" return format_html('{}', color, h) - @admin.display(description='URLs') + @admin.display(description="URLs") def urls_editor(self, obj): """Editor for crawl URLs.""" - widget_id = f'crawl_urls_{obj.pk}' + widget_id = f"crawl_urls_{obj.pk}" # Escape for safe HTML embedding - escaped_urls = (obj.urls or '').replace('&', '&').replace('<', '<').replace('>', '>').replace('"', '"') + escaped_urls = (obj.urls or "").replace("&", "&").replace("<", "<").replace(">", ">").replace('"', """) # Count lines for auto-expand logic - line_count = len((obj.urls or '').split('\n')) + line_count = len((obj.urls or "").split("\n")) uri_rows = min(max(3, line_count), 10) html = f''' @@ -653,7 +724,7 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin): placeholder="https://example.com https://example2.com # Comments start with #" readonly>{escaped_urls}

- {line_count} URL{'s' if line_count != 1 else ''} · Note: URLs displayed here for reference only + {line_count} URL{"s" if line_count != 1 else ""} · Note: URLs displayed here for reference only

@@ -661,60 +732,98 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin): return mark_safe(html) - class CrawlScheduleAdmin(BaseModelAdmin): - list_display = ('id', 'created_at', 'created_by', 'label', 'notes', 'template_str', 'crawls', 'num_crawls', 'num_snapshots') - sort_fields = ('id', 'created_at', 'created_by', 'label', 'notes', 'template_str') - search_fields = ('id', 'created_by__username', 'label', 'notes', 'schedule_id', 'template_id', 'template__urls') + list_display = ("id", "created_at", "created_by", "label", "notes", "template_str", "crawls", "num_crawls", "num_snapshots") + sort_fields = ("id", "created_at", "created_by", "label", "notes", "template_str") + search_fields = ("id", "created_by__username", "label", "notes", "schedule_id", "template_id", "template__urls") - readonly_fields = ('created_at', 'modified_at', 'crawls', 'snapshots') + readonly_fields = ("created_at", "modified_at", "crawls", "snapshots") fieldsets = ( - ('Schedule Info', { - 'fields': ('label', 'notes'), - 'classes': ('card',), - }), - ('Configuration', { - 'fields': ('schedule', 'template'), - 'classes': ('card',), - }), - ('Metadata', { - 'fields': ('created_by', 'created_at', 'modified_at'), - 'classes': ('card',), - }), - ('Crawls', { - 'fields': ('crawls',), - 'classes': ('card', 'wide'), - }), - ('Snapshots', { - 'fields': ('snapshots',), - 'classes': ('card', 'wide'), - }), + ( + "Schedule Info", + { + "fields": ("label", "notes"), + "classes": ("card",), + }, + ), + ( + "Configuration", + { + "fields": ("schedule", "template"), + "classes": ("card",), + }, + ), + ( + "Metadata", + { + "fields": ("created_by", "created_at", "modified_at"), + "classes": ("card",), + }, + ), + ( + "Crawls", + { + "fields": ("crawls",), + "classes": ("card", "wide"), + }, + ), + ( + "Snapshots", + { + "fields": ("snapshots",), + "classes": ("card", "wide"), + }, + ), ) - list_filter = ('created_by',) - ordering = ['-created_at'] + list_filter = ("created_by",) + ordering = ["-created_at"] list_per_page = 100 actions = ["delete_selected"] - @admin.display(description='Template', ordering='template') + def get_queryset(self, request): + return ( + super() + .get_queryset(request) + .select_related("created_by", "template") + .annotate( + crawl_count=Count("crawl", distinct=True), + snapshot_count=Count("crawl__snapshot_set", distinct=True), + ) + ) + + def get_fieldsets(self, request, obj=None): + if obj is None: + return tuple(fieldset for fieldset in self.fieldsets if fieldset[0] not in {"Crawls", "Snapshots"}) + return self.fieldsets + + def save_model(self, request, obj, form, change): + if not obj.created_by_id and getattr(request, "user", None) and request.user.is_authenticated: + obj.created_by = request.user + super().save_model(request, obj, form, change) + + @admin.display(description="Template", ordering="template") def template_str(self, obj): return format_html('{}', obj.template.admin_change_url, obj.template) + @admin.display(description="# Crawls", ordering="crawl_count") def num_crawls(self, obj): - return obj.crawl_set.count() + return getattr(obj, "crawl_count", obj.crawl_set.count()) + @admin.display(description="# Snapshots", ordering="snapshot_count") def num_snapshots(self, obj): - return obj.snapshot_set.count() + return getattr(obj, "snapshot_count", Snapshot.objects.filter(crawl__schedule=obj).count()) def crawls(self, obj): - return format_html_join('
', ' - {}', ( - (crawl.admin_change_url, crawl) - for crawl in obj.crawl_set.all().order_by('-created_at')[:20] - )) or mark_safe('No Crawls yet...') - + return format_html_join( + "
", + ' - {}', + ((crawl.admin_change_url, crawl) for crawl in obj.crawl_set.all().order_by("-created_at")[:20]), + ) or mark_safe("No Crawls yet...") + def snapshots(self, obj): - crawl_ids = obj.crawl_set.values_list('pk', flat=True) + crawl_ids = obj.crawl_set.values_list("pk", flat=True) return render_snapshots_list(Snapshot.objects.filter(crawl_id__in=crawl_ids)) diff --git a/archivebox/crawls/apps.py b/archivebox/crawls/apps.py index 4d604a45..b9e5ed66 100644 --- a/archivebox/crawls/apps.py +++ b/archivebox/crawls/apps.py @@ -11,5 +11,5 @@ class CrawlsConfig(AppConfig): import sys # Skip during makemigrations to avoid premature state machine access - if 'makemigrations' not in sys.argv: + if "makemigrations" not in sys.argv: from archivebox.crawls.models import CrawlMachine # noqa: F401 diff --git a/archivebox/crawls/migrations/0001_initial.py b/archivebox/crawls/migrations/0001_initial.py index 90a21437..c90b52ad 100644 --- a/archivebox/crawls/migrations/0001_initial.py +++ b/archivebox/crawls/migrations/0001_initial.py @@ -11,11 +11,10 @@ from archivebox.base_models.models import get_or_create_system_user_pk class Migration(migrations.Migration): - initial = True dependencies = [ - ('auth', '0012_alter_user_first_name_max_length'), + ("auth", "0012_alter_user_first_name_max_length"), migrations.swappable_dependency(settings.AUTH_USER_MODEL), ] @@ -80,61 +79,98 @@ class Migration(migrations.Migration): reverse_sql=""" DROP TABLE IF EXISTS crawls_crawl; DROP TABLE IF EXISTS crawls_crawlschedule; - """ + """, ), ], state_operations=[ migrations.CreateModel( - name='CrawlSchedule', + name="CrawlSchedule", fields=[ - ('id', models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)), - ('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)), - ('modified_at', models.DateTimeField(auto_now=True)), - ('num_uses_succeeded', models.PositiveIntegerField(default=0)), - ('num_uses_failed', models.PositiveIntegerField(default=0)), - ('schedule', models.CharField(max_length=64)), - ('is_enabled', models.BooleanField(default=True)), - ('label', models.CharField(blank=True, default='', max_length=64)), - ('notes', models.TextField(blank=True, default='')), - ('created_by', models.ForeignKey(default=get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)), + ("id", models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)), + ("created_at", models.DateTimeField(db_index=True, default=django.utils.timezone.now)), + ("modified_at", models.DateTimeField(auto_now=True)), + ("num_uses_succeeded", models.PositiveIntegerField(default=0)), + ("num_uses_failed", models.PositiveIntegerField(default=0)), + ("schedule", models.CharField(max_length=64)), + ("is_enabled", models.BooleanField(default=True)), + ("label", models.CharField(blank=True, default="", max_length=64)), + ("notes", models.TextField(blank=True, default="")), + ( + "created_by", + models.ForeignKey( + default=get_or_create_system_user_pk, + on_delete=django.db.models.deletion.CASCADE, + to=settings.AUTH_USER_MODEL, + ), + ), ], options={ - 'verbose_name': 'Scheduled Crawl', - 'verbose_name_plural': 'Scheduled Crawls', - 'app_label': 'crawls', + "verbose_name": "Scheduled Crawl", + "verbose_name_plural": "Scheduled Crawls", + "app_label": "crawls", }, ), migrations.CreateModel( - name='Crawl', + name="Crawl", fields=[ - ('id', models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)), - ('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)), - ('modified_at', models.DateTimeField(auto_now=True)), - ('num_uses_succeeded', models.PositiveIntegerField(default=0)), - ('num_uses_failed', models.PositiveIntegerField(default=0)), - ('urls', models.TextField(help_text='Newline-separated list of URLs to crawl')), - ('config', models.JSONField(blank=True, default=dict, null=True)), - ('max_depth', models.PositiveSmallIntegerField(default=0, validators=[django.core.validators.MinValueValidator(0), django.core.validators.MaxValueValidator(4)])), - ('tags_str', models.CharField(blank=True, default='', max_length=1024)), - ('persona_id', models.UUIDField(blank=True, null=True)), - ('label', models.CharField(blank=True, default='', max_length=64)), - ('notes', models.TextField(blank=True, default='')), - ('output_dir', models.CharField(blank=True, default='', max_length=512)), - ('status', models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('sealed', 'Sealed')], db_index=True, default='queued', max_length=15)), - ('retry_at', models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, null=True)), - ('created_by', models.ForeignKey(default=get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)), - ('schedule', models.ForeignKey(blank=True, editable=True, null=True, on_delete=django.db.models.deletion.SET_NULL, to='crawls.crawlschedule')), + ("id", models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)), + ("created_at", models.DateTimeField(db_index=True, default=django.utils.timezone.now)), + ("modified_at", models.DateTimeField(auto_now=True)), + ("num_uses_succeeded", models.PositiveIntegerField(default=0)), + ("num_uses_failed", models.PositiveIntegerField(default=0)), + ("urls", models.TextField(help_text="Newline-separated list of URLs to crawl")), + ("config", models.JSONField(blank=True, default=dict, null=True)), + ( + "max_depth", + models.PositiveSmallIntegerField( + default=0, + validators=[django.core.validators.MinValueValidator(0), django.core.validators.MaxValueValidator(4)], + ), + ), + ("tags_str", models.CharField(blank=True, default="", max_length=1024)), + ("persona_id", models.UUIDField(blank=True, null=True)), + ("label", models.CharField(blank=True, default="", max_length=64)), + ("notes", models.TextField(blank=True, default="")), + ("output_dir", models.CharField(blank=True, default="", max_length=512)), + ( + "status", + models.CharField( + choices=[("queued", "Queued"), ("started", "Started"), ("sealed", "Sealed")], + db_index=True, + default="queued", + max_length=15, + ), + ), + ("retry_at", models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, null=True)), + ( + "created_by", + models.ForeignKey( + default=get_or_create_system_user_pk, + on_delete=django.db.models.deletion.CASCADE, + to=settings.AUTH_USER_MODEL, + ), + ), + ( + "schedule", + models.ForeignKey( + blank=True, + editable=True, + null=True, + on_delete=django.db.models.deletion.SET_NULL, + to="crawls.crawlschedule", + ), + ), ], options={ - 'verbose_name': 'Crawl', - 'verbose_name_plural': 'Crawls', - 'app_label': 'crawls', + "verbose_name": "Crawl", + "verbose_name_plural": "Crawls", + "app_label": "crawls", }, ), migrations.AddField( - model_name='crawlschedule', - name='template', - field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='crawls.crawl'), + model_name="crawlschedule", + name="template", + field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to="crawls.crawl"), ), ], ), diff --git a/archivebox/crawls/migrations/0002_upgrade_from_0_8_6.py b/archivebox/crawls/migrations/0002_upgrade_from_0_8_6.py index cb49fb57..1665d62f 100644 --- a/archivebox/crawls/migrations/0002_upgrade_from_0_8_6.py +++ b/archivebox/crawls/migrations/0002_upgrade_from_0_8_6.py @@ -16,8 +16,8 @@ def upgrade_crawl_table_from_v086(apps, schema_editor): # Detect schema version cursor.execute("PRAGMA table_info(crawls_crawl)") crawl_cols = {row[1] for row in cursor.fetchall()} - has_seed_id = 'seed_id' in crawl_cols - has_urls = 'urls' in crawl_cols + has_seed_id = "seed_id" in crawl_cols + has_urls = "urls" in crawl_cols # Only upgrade if we have v0.8.6rc0 schema if not (has_seed_id and not has_urls): @@ -29,7 +29,7 @@ def upgrade_crawl_table_from_v086(apps, schema_editor): # v0.8.6rc0 schema - upgrade to v0.9.0 if has_data: - print('Upgrading crawls_crawl from v0.8.6rc0 to v0.9.0...') + print("Upgrading crawls_crawl from v0.8.6rc0 to v0.9.0...") cursor.execute(""" CREATE TABLE IF NOT EXISTS crawls_crawl_new ( @@ -82,13 +82,12 @@ def upgrade_crawl_table_from_v086(apps, schema_editor): cursor.execute("CREATE INDEX IF NOT EXISTS crawls_crawl_schedule_id_idx ON crawls_crawl(schedule_id);") if has_data: - print('✓ crawls_crawl upgraded to v0.9.0') + print("✓ crawls_crawl upgraded to v0.9.0") class Migration(migrations.Migration): - dependencies = [ - ('crawls', '0001_initial'), + ("crawls", "0001_initial"), ] operations = [ diff --git a/archivebox/crawls/migrations/0003_remove_crawlschedule_num_uses_failed_and_more.py b/archivebox/crawls/migrations/0003_remove_crawlschedule_num_uses_failed_and_more.py index e3740a3b..d8d38f37 100644 --- a/archivebox/crawls/migrations/0003_remove_crawlschedule_num_uses_failed_and_more.py +++ b/archivebox/crawls/migrations/0003_remove_crawlschedule_num_uses_failed_and_more.py @@ -4,18 +4,17 @@ from django.db import migrations class Migration(migrations.Migration): - dependencies = [ - ('crawls', '0002_upgrade_from_0_8_6'), + ("crawls", "0002_upgrade_from_0_8_6"), ] operations = [ migrations.RemoveField( - model_name='crawlschedule', - name='num_uses_failed', + model_name="crawlschedule", + name="num_uses_failed", ), migrations.RemoveField( - model_name='crawlschedule', - name='num_uses_succeeded', + model_name="crawlschedule", + name="num_uses_succeeded", ), ] diff --git a/archivebox/crawls/migrations/0004_remove_crawl_output_dir.py b/archivebox/crawls/migrations/0004_remove_crawl_output_dir.py index 3de115bc..3d682530 100644 --- a/archivebox/crawls/migrations/0004_remove_crawl_output_dir.py +++ b/archivebox/crawls/migrations/0004_remove_crawl_output_dir.py @@ -4,14 +4,13 @@ from django.db import migrations class Migration(migrations.Migration): - dependencies = [ - ('crawls', '0003_remove_crawlschedule_num_uses_failed_and_more'), + ("crawls", "0003_remove_crawlschedule_num_uses_failed_and_more"), ] operations = [ migrations.RemoveField( - model_name='crawl', - name='output_dir', + model_name="crawl", + name="output_dir", ), ] diff --git a/archivebox/crawls/migrations/0005_add_crawl_limits.py b/archivebox/crawls/migrations/0005_add_crawl_limits.py new file mode 100644 index 00000000..c9318162 --- /dev/null +++ b/archivebox/crawls/migrations/0005_add_crawl_limits.py @@ -0,0 +1,31 @@ +# Generated by Django 6.0 on 2026-03-23 00:00 + +import django.core.validators +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("crawls", "0004_remove_crawl_output_dir"), + ] + + operations = [ + migrations.AddField( + model_name="crawl", + name="max_size", + field=models.BigIntegerField( + default=0, + help_text="Maximum total archived output size in bytes for this crawl (0 = unlimited).", + validators=[django.core.validators.MinValueValidator(0)], + ), + ), + migrations.AddField( + model_name="crawl", + name="max_urls", + field=models.IntegerField( + default=0, + help_text="Maximum number of URLs to snapshot for this crawl (0 = unlimited).", + validators=[django.core.validators.MinValueValidator(0)], + ), + ), + ] diff --git a/archivebox/crawls/models.py b/archivebox/crawls/models.py index 77023c55..49de79be 100755 --- a/archivebox/crawls/models.py +++ b/archivebox/crawls/models.py @@ -1,4 +1,4 @@ -__package__ = 'archivebox.crawls' +__package__ = "archivebox.crawls" from typing import TYPE_CHECKING import uuid @@ -17,7 +17,14 @@ from django.utils import timezone from statemachine import State, registry from rich import print -from archivebox.base_models.models import ModelWithUUID, ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, get_or_create_system_user_pk +from archivebox.base_models.models import ( + ModelWithUUID, + ModelWithOutputDir, + ModelWithConfig, + ModelWithNotes, + ModelWithHealthStats, + get_or_create_system_user_pk, +) from archivebox.workers.models import ModelWithStateMachine, BaseStateMachine from archivebox.crawls.schedule_utils import next_run_for_schedule, validate_schedule @@ -31,31 +38,31 @@ class CrawlSchedule(ModelWithUUID, ModelWithNotes): created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False) modified_at = models.DateTimeField(auto_now=True) - template: 'Crawl' = models.ForeignKey('Crawl', on_delete=models.CASCADE, null=False, blank=False) # type: ignore + template: "Crawl" = models.ForeignKey("Crawl", on_delete=models.CASCADE, null=False, blank=False) # type: ignore schedule = models.CharField(max_length=64, blank=False, null=False) is_enabled = models.BooleanField(default=True) - label = models.CharField(max_length=64, blank=True, null=False, default='') - notes = models.TextField(blank=True, null=False, default='') + label = models.CharField(max_length=64, blank=True, null=False, default="") + notes = models.TextField(blank=True, null=False, default="") - crawl_set: models.Manager['Crawl'] + crawl_set: models.Manager["Crawl"] class Meta(ModelWithUUID.Meta, ModelWithNotes.Meta): - app_label = 'crawls' - verbose_name = 'Scheduled Crawl' - verbose_name_plural = 'Scheduled Crawls' + app_label = "crawls" + verbose_name = "Scheduled Crawl" + verbose_name_plural = "Scheduled Crawls" def __str__(self) -> str: urls_preview = self.template.urls[:64] if self.template and self.template.urls else "" - return f'[{self.id}] {urls_preview} @ {self.schedule}' + return f"[{self.id}] {urls_preview} @ {self.schedule}" @property def api_url(self) -> str: - return str(reverse_lazy('api-1:get_any', args=[self.id])) + return str(reverse_lazy("api-1:get_any", args=[self.id])) def save(self, *args, **kwargs): - self.schedule = (self.schedule or '').strip() + self.schedule = (self.schedule or "").strip() validate_schedule(self.schedule) - self.label = self.label or (self.template.label if self.template else '') + self.label = self.label or (self.template.label if self.template else "") super().save(*args, **kwargs) if self.template: self.template.schedule = self @@ -63,7 +70,7 @@ class CrawlSchedule(ModelWithUUID, ModelWithNotes): @property def last_run_at(self): - latest_crawl = self.crawl_set.order_by('-created_at').first() + latest_crawl = self.crawl_set.order_by("-created_at").first() if latest_crawl: return latest_crawl.created_at if self.template: @@ -78,7 +85,7 @@ class CrawlSchedule(ModelWithUUID, ModelWithNotes): now = now or timezone.now() return self.is_enabled and self.next_run_at <= now - def enqueue(self, queued_at=None) -> 'Crawl': + def enqueue(self, queued_at=None) -> "Crawl": queued_at = queued_at or timezone.now() template = self.template label = template.label or self.label @@ -87,6 +94,8 @@ class CrawlSchedule(ModelWithUUID, ModelWithNotes): urls=template.urls, config=template.config or {}, max_depth=template.max_depth, + max_urls=template.max_urls, + max_size=template.max_size, tags_str=template.tags_str, persona_id=template.persona_id, label=label, @@ -104,28 +113,41 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False) modified_at = models.DateTimeField(auto_now=True) - urls = models.TextField(blank=False, null=False, help_text='Newline-separated list of URLs to crawl') + urls = models.TextField(blank=False, null=False, help_text="Newline-separated list of URLs to crawl") config = models.JSONField(default=dict, null=True, blank=True) max_depth = models.PositiveSmallIntegerField(default=0, validators=[MinValueValidator(0), MaxValueValidator(4)]) - tags_str = models.CharField(max_length=1024, blank=True, null=False, default='') + max_urls = models.IntegerField( + default=0, + validators=[MinValueValidator(0)], + help_text="Maximum number of URLs to snapshot for this crawl (0 = unlimited).", + ) + max_size = models.BigIntegerField( + default=0, + validators=[MinValueValidator(0)], + help_text="Maximum total archived output size in bytes for this crawl (0 = unlimited).", + ) + tags_str = models.CharField(max_length=1024, blank=True, null=False, default="") persona_id = models.UUIDField(null=True, blank=True) - label = models.CharField(max_length=64, blank=True, null=False, default='') - notes = models.TextField(blank=True, null=False, default='') + label = models.CharField(max_length=64, blank=True, null=False, default="") + notes = models.TextField(blank=True, null=False, default="") schedule = models.ForeignKey(CrawlSchedule, on_delete=models.SET_NULL, null=True, blank=True, editable=True) - status = ModelWithStateMachine.StatusField(choices=ModelWithStateMachine.StatusChoices, default=ModelWithStateMachine.StatusChoices.QUEUED) + status = ModelWithStateMachine.StatusField( + choices=ModelWithStateMachine.StatusChoices, + default=ModelWithStateMachine.StatusChoices.QUEUED, + ) retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now) - state_machine_name = 'archivebox.crawls.models.CrawlMachine' - retry_at_field_name = 'retry_at' - state_field_name = 'status' + state_machine_name = "archivebox.crawls.models.CrawlMachine" + retry_at_field_name = "retry_at" + state_field_name = "status" StatusChoices = ModelWithStateMachine.StatusChoices active_state = StatusChoices.STARTED schedule_id: uuid.UUID | None - sm: 'CrawlMachine' + sm: "CrawlMachine" - snapshot_set: models.Manager['Snapshot'] + snapshot_set: models.Manager["Snapshot"] class Meta( ModelWithOutputDir.Meta, @@ -133,17 +155,34 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith ModelWithHealthStats.Meta, ModelWithStateMachine.Meta, ): - app_label = 'crawls' - verbose_name = 'Crawl' - verbose_name_plural = 'Crawls' + app_label = "crawls" + verbose_name = "Crawl" + verbose_name_plural = "Crawls" def __str__(self): - first_url = self.get_urls_list()[0] if self.get_urls_list() else '' + first_url = self.get_urls_list()[0] if self.get_urls_list() else "" # Show last 8 digits of UUID and more of the URL short_id = str(self.id)[-8:] - return f'[...{short_id}] {first_url[:120]}' + return f"[...{short_id}] {first_url[:120]}" def save(self, *args, **kwargs): + config = dict(self.config or {}) + if self.max_urls > 0: + config["MAX_URLS"] = self.max_urls + else: + config.pop("MAX_URLS", None) + + if self.max_size > 0: + config["MAX_SIZE"] = self.max_size + else: + config.pop("MAX_SIZE", None) + + if config != (self.config or {}): + self.config = config + update_fields = kwargs.get("update_fields") + if update_fields is not None: + kwargs["update_fields"] = tuple(dict.fromkeys([*update_fields, "config"])) + super().save(*args, **kwargs) # if is_new: # from archivebox.misc.logging_util import log_worker_event @@ -162,23 +201,26 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith @property def api_url(self) -> str: - return str(reverse_lazy('api-1:get_crawl', args=[self.id])) + return str(reverse_lazy("api-1:get_crawl", args=[self.id])) def to_json(self) -> dict: """ Convert Crawl model instance to a JSON-serializable dict. """ from archivebox.config import VERSION + return { - 'type': 'Crawl', - 'schema_version': VERSION, - 'id': str(self.id), - 'urls': self.urls, - 'status': self.status, - 'max_depth': self.max_depth, - 'tags_str': self.tags_str, - 'label': self.label, - 'created_at': self.created_at.isoformat() if self.created_at else None, + "type": "Crawl", + "schema_version": VERSION, + "id": str(self.id), + "urls": self.urls, + "status": self.status, + "max_depth": self.max_depth, + "max_urls": self.max_urls, + "max_size": self.max_size, + "tags_str": self.tags_str, + "label": self.label, + "created_at": self.created_at.isoformat() if self.created_at else None, } @staticmethod @@ -198,7 +240,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith overrides = overrides or {} # Check if crawl already exists by ID - crawl_id = record.get('id') + crawl_id = record.get("id") if crawl_id: try: return Crawl.objects.get(id=crawl_id) @@ -206,9 +248,9 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith pass # Get URLs - can be string (newline-separated) or from 'url' field - urls = record.get('urls', '') - if not urls and record.get('url'): - urls = record['url'] + urls = record.get("urls", "") + if not urls and record.get("url"): + urls = record["url"] if not urls: return None @@ -216,9 +258,11 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith # Create new crawl (status stays QUEUED, not started) crawl = Crawl.objects.create( urls=urls, - max_depth=record.get('max_depth', record.get('depth', 0)), - tags_str=record.get('tags_str', record.get('tags', '')), - label=record.get('label', ''), + max_depth=record.get("max_depth", record.get("depth", 0)), + max_urls=record.get("max_urls", 0), + max_size=record.get("max_size", 0), + tags_str=record.get("tags_str", record.get("tags", "")), + label=record.get("label", ""), status=Crawl.StatusChoices.QUEUED, retry_at=timezone.now(), **overrides, @@ -234,39 +278,35 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith from archivebox import DATA_DIR from archivebox.core.models import Snapshot - date_str = self.created_at.strftime('%Y%m%d') + date_str = self.created_at.strftime("%Y%m%d") urls = self.get_urls_list() - domain = Snapshot.extract_domain_from_url(urls[0]) if urls else 'unknown' + domain = Snapshot.extract_domain_from_url(urls[0]) if urls else "unknown" - return DATA_DIR / 'users' / self.created_by.username / 'crawls' / date_str / domain / str(self.id) + return DATA_DIR / "users" / self.created_by.username / "crawls" / date_str / domain / str(self.id) def get_urls_list(self) -> list[str]: """Get list of URLs from urls field, filtering out comments and empty lines.""" if not self.urls: return [] - return [ - url.strip() - for url in self.urls.split('\n') - if url.strip() and not url.strip().startswith('#') - ] + return [url.strip() for url in self.urls.split("\n") if url.strip() and not url.strip().startswith("#")] @staticmethod def normalize_domain(value: str) -> str: - candidate = (value or '').strip().lower() + candidate = (value or "").strip().lower() if not candidate: - return '' - if '://' not in candidate and '/' not in candidate: - candidate = f'https://{candidate.lstrip(".")}' + return "" + if "://" not in candidate and "/" not in candidate: + candidate = f"https://{candidate.lstrip('.')}" try: parsed = urlparse(candidate) - hostname = parsed.hostname or '' + hostname = parsed.hostname or "" if not hostname: - return '' + return "" if parsed.port: - return f'{hostname}_{parsed.port}' + return f"{hostname}_{parsed.port}" return hostname except Exception: - return '' + return "" @staticmethod def split_filter_patterns(value) -> list[str]: @@ -280,7 +320,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith raw_values = [] for raw_value in raw_values: - pattern = str(raw_value or '').strip() + pattern = str(raw_value or "").strip() if not pattern or pattern in seen: continue seen.add(pattern) @@ -289,28 +329,28 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith @classmethod def _pattern_matches_url(cls, url: str, pattern: str) -> bool: - normalized_pattern = str(pattern or '').strip() + normalized_pattern = str(pattern or "").strip() if not normalized_pattern: return False - if re.fullmatch(r'[\w.*:-]+', normalized_pattern): - wildcard_only_subdomains = normalized_pattern.startswith('*.') + if re.fullmatch(r"[\w.*:-]+", normalized_pattern): + wildcard_only_subdomains = normalized_pattern.startswith("*.") normalized_domain = cls.normalize_domain( - normalized_pattern[2:] if wildcard_only_subdomains else normalized_pattern + normalized_pattern[2:] if wildcard_only_subdomains else normalized_pattern, ) normalized_url_domain = cls.normalize_domain(url) if not normalized_domain or not normalized_url_domain: return False - pattern_host = normalized_domain.split('_', 1)[0] - url_host = normalized_url_domain.split('_', 1)[0] + pattern_host = normalized_domain.split("_", 1)[0] + url_host = normalized_url_domain.split("_", 1)[0] if wildcard_only_subdomains: - return url_host.endswith(f'.{pattern_host}') + return url_host.endswith(f".{pattern_host}") if normalized_url_domain == normalized_domain: return True - return url_host == pattern_host or url_host.endswith(f'.{pattern_host}') + return url_host == pattern_host or url_host.endswith(f".{pattern_host}") try: return bool(re.search(normalized_pattern, url)) @@ -324,7 +364,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith config = get_config(crawl=self, snapshot=snapshot) else: config = self.config or {} - return self.split_filter_patterns(config.get('URL_ALLOWLIST', '')) + return self.split_filter_patterns(config.get("URL_ALLOWLIST", "")) def get_url_denylist(self, *, use_effective_config: bool = False, snapshot=None) -> list[str]: if use_effective_config: @@ -333,7 +373,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith config = get_config(crawl=self, snapshot=snapshot) else: config = self.config or {} - return self.split_filter_patterns(config.get('URL_DENYLIST', '')) + return self.split_filter_patterns(config.get("URL_DENYLIST", "")) def url_passes_filters(self, url: str, *, snapshot=None, use_effective_config: bool = True) -> bool: denylist = self.get_url_denylist(use_effective_config=use_effective_config, snapshot=snapshot) @@ -354,14 +394,14 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith deny_patterns = self.split_filter_patterns(denylist) if allow_patterns: - config['URL_ALLOWLIST'] = '\n'.join(allow_patterns) + config["URL_ALLOWLIST"] = "\n".join(allow_patterns) else: - config.pop('URL_ALLOWLIST', None) + config.pop("URL_ALLOWLIST", None) if deny_patterns: - config['URL_DENYLIST'] = '\n'.join(deny_patterns) + config["URL_DENYLIST"] = "\n".join(deny_patterns) else: - config.pop('URL_DENYLIST', None) + config.pop("URL_DENYLIST", None) self.config = config @@ -369,23 +409,20 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith from archivebox.core.models import Snapshot removed_urls = self.prune_urls( - lambda url: not self.url_passes_filters(url, use_effective_config=False) + lambda url: not self.url_passes_filters(url, use_effective_config=False), ) filtered_snapshots = [ snapshot for snapshot in self.snapshot_set.filter( status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED], - ).only('pk', 'url', 'status') + ).only("pk", "url", "status") if not self.url_passes_filters(snapshot.url, snapshot=snapshot, use_effective_config=False) ] deleted_snapshots = 0 if filtered_snapshots: - started_snapshots = [ - snapshot for snapshot in filtered_snapshots - if snapshot.status == Snapshot.StatusChoices.STARTED - ] + started_snapshots = [snapshot for snapshot in filtered_snapshots if snapshot.status == Snapshot.StatusChoices.STARTED] for snapshot in started_snapshots: snapshot.cancel_running_hooks() @@ -393,22 +430,22 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith deleted_snapshots, _ = self.snapshot_set.filter(pk__in=filtered_snapshot_ids).delete() return { - 'removed_urls': len(removed_urls), - 'deleted_snapshots': deleted_snapshots, + "removed_urls": len(removed_urls), + "deleted_snapshots": deleted_snapshots, } def _iter_url_lines(self) -> list[tuple[str, str]]: entries: list[tuple[str, str]] = [] - for raw_line in (self.urls or '').splitlines(): + for raw_line in (self.urls or "").splitlines(): stripped = raw_line.strip() if not stripped: continue - if stripped.startswith('#'): - entries.append((raw_line.rstrip(), '')) + if stripped.startswith("#"): + entries.append((raw_line.rstrip(), "")) continue try: entry = json.loads(stripped) - entries.append((raw_line.rstrip(), str(entry.get('url', '') or '').strip())) + entries.append((raw_line.rstrip(), str(entry.get("url", "") or "").strip())) except json.JSONDecodeError: entries.append((raw_line.rstrip(), stripped)) return entries @@ -426,14 +463,14 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith continue kept_lines.append(raw_line) - next_urls = '\n'.join(kept_lines) - if next_urls != (self.urls or ''): + next_urls = "\n".join(kept_lines) + if next_urls != (self.urls or ""): self.urls = next_urls - self.save(update_fields=['urls', 'modified_at']) + self.save(update_fields=["urls", "modified_at"]) return removed_urls def prune_url(self, url: str) -> int: - target = (url or '').strip() + target = (url or "").strip() removed = self.prune_urls(lambda candidate: candidate == target) return len(removed) @@ -441,10 +478,10 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith normalized_domain = self.normalize_domain(domain) if not normalized_domain: return { - 'domain': '', - 'created': False, - 'removed_urls': 0, - 'deleted_snapshots': 0, + "domain": "", + "created": False, + "removed_urls": 0, + "deleted_snapshots": 0, } domains = self.get_url_denylist(use_effective_config=False) @@ -455,15 +492,15 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith self.get_url_allowlist(use_effective_config=False), domains, ) - self.save(update_fields=['config', 'modified_at']) + self.save(update_fields=["config", "modified_at"]) filter_result = self.apply_crawl_config_filters() return { - 'domain': normalized_domain, - 'created': created, - 'removed_urls': filter_result['removed_urls'], - 'deleted_snapshots': filter_result['deleted_snapshots'], + "domain": normalized_domain, + "created": created, + "removed_urls": filter_result["removed_urls"], + "deleted_snapshots": filter_result["deleted_snapshots"], } def get_system_task(self) -> str | None: @@ -471,7 +508,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith if len(urls) != 1: return None system_url = urls[0].strip().lower() - if system_url.startswith('archivebox://'): + if system_url.startswith("archivebox://"): return system_url return None @@ -481,17 +518,16 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith if self.persona_id: persona = Persona.objects.filter(id=self.persona_id).first() if persona is None: - raise Persona.DoesNotExist(f'Crawl {self.id} references missing Persona {self.persona_id}') + raise Persona.DoesNotExist(f"Crawl {self.id} references missing Persona {self.persona_id}") return persona - default_persona_name = str((self.config or {}).get('DEFAULT_PERSONA') or '').strip() + default_persona_name = str((self.config or {}).get("DEFAULT_PERSONA") or "").strip() if default_persona_name: - persona, _ = Persona.objects.get_or_create(name=default_persona_name or 'Default') + persona, _ = Persona.objects.get_or_create(name=default_persona_name or "Default") return persona return None - def add_url(self, entry: dict) -> bool: """ Add a URL to the crawl queue if not already present. @@ -502,15 +538,15 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith Returns: True if URL was added, False if skipped (duplicate or depth exceeded) """ - from archivebox.misc.util import fix_url_from_markdown + from archivebox.misc.util import fix_url_from_markdown, sanitize_extracted_url - url = fix_url_from_markdown(str(entry.get('url', '') or '').strip()) + url = sanitize_extracted_url(fix_url_from_markdown(str(entry.get("url", "") or "").strip())) if not url: return False if not self.url_passes_filters(url): return False - depth = entry.get('depth', 1) + depth = entry.get("depth", 1) # Skip if depth exceeds max_depth if depth > self.max_depth: @@ -527,13 +563,13 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith return False # Append as JSONL - entry = {**entry, 'url': url} + entry = {**entry, "url": url} jsonl_entry = json.dumps(entry) - self.urls = (self.urls.rstrip() + '\n' + jsonl_entry).lstrip('\n') - self.save(update_fields=['urls', 'modified_at']) + self.urls = (self.urls.rstrip() + "\n" + jsonl_entry).lstrip("\n") + self.save(update_fields=["urls", "modified_at"]) return True - def create_snapshots_from_urls(self) -> list['Snapshot']: + def create_snapshots_from_urls(self) -> list["Snapshot"]: """ Create Snapshot objects for each URL in self.urls that doesn't already exist. @@ -541,7 +577,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith List of newly created Snapshot objects """ from archivebox.core.models import Snapshot - from archivebox.misc.util import fix_url_from_markdown + from archivebox.misc.util import fix_url_from_markdown, sanitize_extracted_url created_snapshots = [] @@ -552,13 +588,13 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith # Parse JSONL or plain URL try: entry = json.loads(line) - url = fix_url_from_markdown(str(entry.get('url', '') or '').strip()) - depth = entry.get('depth', 0) - title = entry.get('title') - timestamp = entry.get('timestamp') - tags = entry.get('tags', '') + url = sanitize_extracted_url(fix_url_from_markdown(str(entry.get("url", "") or "").strip())) + depth = entry.get("depth", 0) + title = entry.get("title") + timestamp = entry.get("timestamp") + tags = entry.get("tags", "") except json.JSONDecodeError: - url = fix_url_from_markdown(line.strip()) + url = sanitize_extracted_url(fix_url_from_markdown(line.strip())) depth = 0 title = None timestamp = None @@ -578,20 +614,20 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith url=url, crawl=self, defaults={ - 'depth': depth, - 'title': title, - 'timestamp': timestamp or str(timezone.now().timestamp()), - 'status': Snapshot.INITIAL_STATE, - 'retry_at': timezone.now(), + "depth": depth, + "title": title, + "timestamp": timestamp or str(timezone.now().timestamp()), + "status": Snapshot.INITIAL_STATE, + "retry_at": timezone.now(), # Note: created_by removed in 0.9.0 - Snapshot inherits from Crawl - } + }, ) if created: created_snapshots.append(snapshot) # Save tags if present if tags: - snapshot.save_tags(tags.split(',')) + snapshot.save_tags(tags.split(",")) # Ensure crawl -> snapshot symlink exists for both new and existing snapshots try: @@ -632,9 +668,11 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith Binary.objects.filter( machine=machine, name__in=binary_names, - ).exclude( + ) + .exclude( status=Binary.StatusChoices.INSTALLED, - ).order_by('name') + ) + .order_by("name"), ) if not unresolved_binaries: return @@ -670,20 +708,21 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith Binary.objects.filter( machine=machine, name__in=binary_names, - ).exclude( + ) + .exclude( status=Binary.StatusChoices.INSTALLED, - ).order_by('name') + ) + .order_by("name"), ) if unresolved_binaries: - binary_details = ', '.join( - f'{binary.name} (status={binary.status}, retry_at={binary.retry_at})' - for binary in unresolved_binaries + binary_details = ", ".join( + f"{binary.name} (status={binary.status}, retry_at={binary.retry_at})" for binary in unresolved_binaries ) raise RuntimeError( - f'Crawl dependencies failed to install before continuing: {binary_details}' + f"Crawl dependencies failed to install before continuing: {binary_details}", ) - def run(self) -> 'Snapshot | None': + def run(self) -> "Snapshot | None": """ Execute this Crawl: run hooks, process JSONL, create snapshots. @@ -699,9 +738,9 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith from archivebox.machine.models import Binary, Machine # Debug logging to file (since stdout/stderr redirected to /dev/null in progress mode) - debug_log = Path('/tmp/archivebox_crawl_debug.log') - with open(debug_log, 'a') as f: - f.write(f'\n=== Crawl.run() starting for {self.id} at {time.time()} ===\n') + debug_log = Path("/tmp/archivebox_crawl_debug.log") + with open(debug_log, "a") as f: + f.write(f"\n=== Crawl.run() starting for {self.id} at {time.time()} ===\n") f.flush() def get_runtime_config(): @@ -711,7 +750,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith return config system_task = self.get_system_task() - if system_task == 'archivebox://update': + if system_task == "archivebox://update": from archivebox.cli.archivebox_update import process_all_db_snapshots process_all_db_snapshots() @@ -723,7 +762,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith persona = self.resolve_persona() if persona: base_runtime_config = get_config(crawl=self, persona=persona) - chrome_binary = str(base_runtime_config.get('CHROME_BINARY') or '') + chrome_binary = str(base_runtime_config.get("CHROME_BINARY") or "") persona_runtime_overrides = persona.prepare_runtime_for_crawl( crawl=self, chrome_binary=chrome_binary, @@ -738,8 +777,8 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith self.urls.strip(), ) - with open(debug_log, 'a') as f: - f.write(f'Running hook: {hook.name}\n') + with open(debug_log, "a") as f: + f.write(f"Running hook: {hook.name}\n") f.flush() hook_start = time.time() plugin_name = hook.parent.name @@ -755,13 +794,13 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith url=primary_url, snapshot_id=str(self.id), ) - with open(debug_log, 'a') as f: - f.write(f'Hook {hook.name} completed with status={process.status}\n') + with open(debug_log, "a") as f: + f.write(f"Hook {hook.name} completed with status={process.status}\n") f.flush() hook_elapsed = time.time() - hook_start if hook_elapsed > 0.5: - print(f'[yellow]⏱️ Hook {hook.name} took {hook_elapsed:.2f}s[/yellow]') + print(f"[yellow]⏱️ Hook {hook.name} took {hook_elapsed:.2f}s[/yellow]") if process.status == process.StatusChoices.RUNNING: if not is_finite_background_hook(hook.name): @@ -772,6 +811,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith return set() from archivebox.hooks import extract_records_from_process + records = [] # Finite background hooks can exit before their stdout log is fully # visible to our polling loop. Give successful hooks a brief chance @@ -783,26 +823,20 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith if records: break if records: - print(f'[cyan]📝 Processing {len(records)} records from {hook.name}[/cyan]') + print(f"[cyan]📝 Processing {len(records)} records from {hook.name}[/cyan]") for record in records[:3]: - print(f' Record: type={record.get("type")}, keys={list(record.keys())[:5]}') + print(f" Record: type={record.get('type')}, keys={list(record.keys())[:5]}") if system_task: - records = [ - record - for record in records - if record.get('type') in ('Binary', 'Machine') - ] - overrides = {'crawl': self} + records = [record for record in records if record.get("type") in ("Binary", "Machine")] + overrides = {"crawl": self} stats = process_hook_records(records, overrides=overrides) if stats: - print(f'[green]✓ Created: {stats}[/green]') + print(f"[green]✓ Created: {stats}[/green]") hook_binary_names = { - str(record.get('name')).strip() - for record in records - if record.get('type') == 'Binary' and record.get('name') + str(record.get("name")).strip() for record in records if record.get("type") == "Binary" and record.get("name") } - hook_binary_names.discard('') + hook_binary_names.discard("") if hook_binary_names: declared_binary_names.update(hook_binary_names) return hook_binary_names @@ -818,9 +852,11 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith Binary.objects.filter( machine=machine, name__in=resolved_binary_names, - ).exclude( + ) + .exclude( status=Binary.StatusChoices.INSTALLED, - ).order_by('name') + ) + .order_by("name"), ) if not unresolved_binaries: return resolved_binary_names @@ -837,7 +873,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith provider_hooks = [ hook - for hook in discover_hooks('Crawl', filter_disabled=False, config=get_runtime_config()) + for hook in discover_hooks("Crawl", filter_disabled=False, config=get_runtime_config()) if hook.parent.name in needed_provider_names and str(hook) not in executed_crawl_hooks ] if not provider_hooks: @@ -847,12 +883,12 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith resolved_binary_names.update(run_crawl_hook(hook)) # Discover and run on_Crawl hooks - with open(debug_log, 'a') as f: - f.write('Discovering Crawl hooks...\n') + with open(debug_log, "a") as f: + f.write("Discovering Crawl hooks...\n") f.flush() - hooks = discover_hooks('Crawl', config=get_runtime_config()) - with open(debug_log, 'a') as f: - f.write(f'Found {len(hooks)} hooks\n') + hooks = discover_hooks("Crawl", config=get_runtime_config()) + with open(debug_log, "a") as f: + f.write(f"Found {len(hooks)} hooks\n") f.flush() for hook in hooks: @@ -870,20 +906,20 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith if leaked_snapshots.exists(): leaked_count = leaked_snapshots.count() leaked_snapshots.delete() - print(f'[yellow]⚠️ Removed {leaked_count} leaked snapshot(s) created during system crawl {system_task}[/yellow]') - with open(debug_log, 'a') as f: - f.write(f'Skipping snapshot creation for system crawl: {system_task}\n') - f.write('=== Crawl.run() complete ===\n\n') + print(f"[yellow]⚠️ Removed {leaked_count} leaked snapshot(s) created during system crawl {system_task}[/yellow]") + with open(debug_log, "a") as f: + f.write(f"Skipping snapshot creation for system crawl: {system_task}\n") + f.write("=== Crawl.run() complete ===\n\n") f.flush() return None - with open(debug_log, 'a') as f: - f.write('Creating snapshots from URLs...\n') + with open(debug_log, "a") as f: + f.write("Creating snapshots from URLs...\n") f.flush() created_snapshots = self.create_snapshots_from_urls() - with open(debug_log, 'a') as f: - f.write(f'Created {len(created_snapshots)} snapshots\n') - f.write('=== Crawl.run() complete ===\n\n') + with open(debug_log, "a") as f: + f.write(f"Created {len(created_snapshots)} snapshots\n") + f.write("=== Crawl.run() complete ===\n\n") f.flush() # Return first snapshot for this crawl (newly created or existing) @@ -922,11 +958,11 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith # Use Process.kill_tree() to gracefully kill parent + children killed_count = process.kill_tree(graceful_timeout=2.0) if killed_count > 0: - print(f'[yellow]🔪 Killed {killed_count} orphaned crawl hook process(es)[/yellow]') + print(f"[yellow]🔪 Killed {killed_count} orphaned crawl hook process(es)[/yellow]") # Clean up .pid files from output directory if self.output_dir.exists(): - for pid_file in self.output_dir.glob('**/*.pid'): + for pid_file in self.output_dir.glob("**/*.pid"): pid_file.unlink(missing_ok=True) persona = self.resolve_persona() @@ -935,9 +971,10 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith # Run on_CrawlEnd hooks from archivebox.config.configset import get_config + config = get_config(crawl=self) - hooks = discover_hooks('CrawlEnd', config=config) + hooks = discover_hooks("CrawlEnd", config=config) for hook in hooks: plugin_name = hook.parent.name @@ -954,13 +991,14 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith # Log failures but don't block if process.exit_code != 0: - print(f'[yellow]⚠️ CrawlEnd hook failed: {hook.name}[/yellow]') + print(f"[yellow]⚠️ CrawlEnd hook failed: {hook.name}[/yellow]") # ============================================================================= # State Machines # ============================================================================= + class CrawlMachine(BaseStateMachine): crawl: Crawl @@ -994,7 +1032,7 @@ class CrawlMachine(BaseStateMachine): └─────────────────────────────────────────────────────────────┘ """ - model_attr_name = 'crawl' + model_attr_name = "crawl" # States queued = State(value=Crawl.StatusChoices.QUEUED, initial=True) @@ -1002,22 +1040,18 @@ class CrawlMachine(BaseStateMachine): sealed = State(value=Crawl.StatusChoices.SEALED, final=True) # Tick Event (polled by workers) - tick = ( - queued.to.itself(unless='can_start') - | queued.to(started, cond='can_start') - | started.to(sealed, cond='is_finished') - ) + tick = queued.to.itself(unless="can_start") | queued.to(started, cond="can_start") | started.to(sealed, cond="is_finished") # Manual event (triggered by last Snapshot sealing) seal = started.to(sealed) def can_start(self) -> bool: if not self.crawl.urls: - print(f'[red]⚠️ Crawl {self.crawl.id} cannot start: no URLs[/red]') + print(f"[red]⚠️ Crawl {self.crawl.id} cannot start: no URLs[/red]") return False urls_list = self.crawl.get_urls_list() if not urls_list: - print(f'[red]⚠️ Crawl {self.crawl.id} cannot start: no valid URLs in urls field[/red]') + print(f"[red]⚠️ Crawl {self.crawl.id} cannot start: no valid URLs in urls field[/red]") return False return True @@ -1029,14 +1063,17 @@ class CrawlMachine(BaseStateMachine): def enter_started(self): import sys - print(f'[cyan]🔄 CrawlMachine.enter_started() - creating snapshots for {self.crawl.id}[/cyan]', file=sys.stderr) + print(f"[cyan]🔄 CrawlMachine.enter_started() - creating snapshots for {self.crawl.id}[/cyan]", file=sys.stderr) try: # Run the crawl - runs hooks, processes JSONL, creates snapshots first_snapshot = self.crawl.run() if first_snapshot: - print(f'[cyan]🔄 Created {self.crawl.snapshot_set.count()} snapshot(s), first: {first_snapshot.url}[/cyan]', file=sys.stderr) + print( + f"[cyan]🔄 Created {self.crawl.snapshot_set.count()} snapshot(s), first: {first_snapshot.url}[/cyan]", + file=sys.stderr, + ) # Update status to STARTED # Set retry_at to near future so tick() can poll and check is_finished() self.crawl.update_and_requeue( @@ -1045,13 +1082,14 @@ class CrawlMachine(BaseStateMachine): ) else: # No snapshots (system crawl like archivebox://install) - print('[cyan]🔄 No snapshots created, sealing crawl immediately[/cyan]', file=sys.stderr) + print("[cyan]🔄 No snapshots created, sealing crawl immediately[/cyan]", file=sys.stderr) # Seal immediately since there's no work to do self.seal() except Exception as e: - print(f'[red]⚠️ Crawl {self.crawl.id} failed to start: {e}[/red]') + print(f"[red]⚠️ Crawl {self.crawl.id} failed to start: {e}[/red]") import traceback + traceback.print_exc() raise diff --git a/archivebox/crawls/schedule_utils.py b/archivebox/crawls/schedule_utils.py index 1df66ac9..a5307f99 100644 --- a/archivebox/crawls/schedule_utils.py +++ b/archivebox/crawls/schedule_utils.py @@ -33,7 +33,7 @@ def validate_schedule(schedule: str) -> str: normalized = normalize_schedule(schedule) if not croniter.is_valid(normalized): raise ValueError( - "Invalid schedule. Use an alias like daily/weekly/monthly or a cron expression such as '0 */6 * * *'." + "Invalid schedule. Use an alias like daily/weekly/monthly or a cron expression such as '0 */6 * * *'.", ) return normalized diff --git a/archivebox/hooks.py b/archivebox/hooks.py index 3dda2bd6..314bbf72 100644 --- a/archivebox/hooks.py +++ b/archivebox/hooks.py @@ -52,19 +52,19 @@ API (all hook logic lives here): is_background_hook(name) -> bool Check if hook is background (.bg suffix) """ -__package__ = 'archivebox' +__package__ = "archivebox" import os import json from functools import lru_cache from pathlib import Path -from typing import TYPE_CHECKING, List, Dict, Any, Optional, TypedDict +from typing import TYPE_CHECKING, Any, Optional, TypedDict from abx_plugins import get_plugins_dir from django.conf import settings from django.utils.safestring import mark_safe from archivebox.config.constants import CONSTANTS -from archivebox.misc.util import fix_url_from_markdown +from archivebox.misc.util import fix_url_from_markdown, sanitize_extracted_url if TYPE_CHECKING: from archivebox.machine.models import Process @@ -73,9 +73,7 @@ if TYPE_CHECKING: # Plugin directories BUILTIN_PLUGINS_DIR = Path(get_plugins_dir()).resolve() USER_PLUGINS_DIR = Path( - os.environ.get('ARCHIVEBOX_USER_PLUGINS_DIR') - or getattr(settings, 'USER_PLUGINS_DIR', '') - or str(CONSTANTS.USER_PLUGINS_DIR) + os.environ.get("ARCHIVEBOX_USER_PLUGINS_DIR") or getattr(settings, "USER_PLUGINS_DIR", "") or str(CONSTANTS.USER_PLUGINS_DIR), ).expanduser() @@ -101,24 +99,24 @@ def is_background_hook(hook_name: str) -> bool: is_background_hook('on_Snapshot__50_wget.py') -> False is_background_hook('on_Snapshot__63_media.finite.bg.py') -> True """ - return '.bg.' in hook_name or '__background' in hook_name + return ".bg." in hook_name or "__background" in hook_name def is_finite_background_hook(hook_name: str) -> bool: """Check if a background hook is finite-lived and should be awaited.""" - return '.finite.bg.' in hook_name + return ".finite.bg." in hook_name -def iter_plugin_dirs() -> List[Path]: +def iter_plugin_dirs() -> list[Path]: """Iterate over all built-in and user plugin directories.""" - plugin_dirs: List[Path] = [] + plugin_dirs: list[Path] = [] for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR): if not base_dir.exists(): continue for plugin_dir in base_dir.iterdir(): - if plugin_dir.is_dir() and not plugin_dir.name.startswith('_'): + if plugin_dir.is_dir() and not plugin_dir.name.startswith("_"): plugin_dirs.append(plugin_dir) return plugin_dirs @@ -126,24 +124,25 @@ def iter_plugin_dirs() -> List[Path]: class HookResult(TypedDict, total=False): """Raw result from run_hook().""" + returncode: int stdout: str stderr: str - output_json: Optional[Dict[str, Any]] - output_files: List[str] + output_json: dict[str, Any] | None + output_files: list[dict[str, Any]] duration_ms: int hook: str plugin: str # Plugin name (directory name, e.g., 'wget', 'screenshot') hook_name: str # Full hook filename (e.g., 'on_Snapshot__50_wget.py') # New fields for JSONL parsing - records: List[Dict[str, Any]] # Parsed JSONL records with 'type' field + records: list[dict[str, Any]] # Parsed JSONL records with 'type' field def discover_hooks( event_name: str, filter_disabled: bool = True, - config: Optional[Dict[str, Any]] = None -) -> List[Path]: + config: dict[str, Any] | None = None, +) -> list[Path]: """ Find all hook scripts matching on_{event_name}__*.{sh,py,js} pattern. @@ -187,22 +186,23 @@ def discover_hooks( continue # Search for hook scripts in all subdirectories - for ext in ('sh', 'py', 'js'): - pattern = f'*/on_{event_name}__*.{ext}' + for ext in ("sh", "py", "js"): + pattern = f"*/on_{event_name}__*.{ext}" hooks.extend(base_dir.glob(pattern)) # Also check for hooks directly in the plugins directory - pattern_direct = f'on_{event_name}__*.{ext}' + pattern_direct = f"on_{event_name}__*.{ext}" hooks.extend(base_dir.glob(pattern_direct)) # Binary install hooks are provider hooks, not end-user extractors. They # self-filter via `binproviders`, so applying the PLUGINS whitelist here # can hide the very installer needed by a selected plugin (e.g. # `--plugins=singlefile` still needs the `npm` Binary hook). - if filter_disabled and event_name != 'Binary': + if filter_disabled and event_name != "Binary": # Get merged config if not provided (lazy import to avoid circular dependency) if config is None: from archivebox.config.configset import get_config + config = get_config() enabled_hooks = [] @@ -221,7 +221,7 @@ def discover_hooks( # Check if plugin is enabled plugin_config = get_plugin_special_config(plugin_name, config) - if plugin_config['enabled']: + if plugin_config["enabled"]: enabled_hooks.append(hook) hooks = enabled_hooks @@ -234,11 +234,11 @@ def discover_hooks( def run_hook( script: Path, output_dir: Path, - config: Dict[str, Any], - timeout: Optional[int] = None, - parent: Optional['Process'] = None, - **kwargs: Any -) -> 'Process': + config: dict[str, Any], + timeout: int | None = None, + parent: Optional["Process"] = None, + **kwargs: Any, +) -> "Process": """ Execute a hook script with the given arguments using Process model. @@ -275,7 +275,7 @@ def run_hook( if timeout is None: plugin_name = script.parent.name plugin_config = get_plugin_special_config(plugin_name, config) - timeout = plugin_config['timeout'] + timeout = plugin_config["timeout"] if timeout: timeout = min(int(timeout), int(CONSTANTS.MAX_HOOK_RUNTIME_SECONDS)) @@ -301,22 +301,22 @@ def run_hook( parent=parent, process_type=Process.TypeChoices.HOOK, pwd=str(output_dir), - cmd=['echo', f'Hook script not found: {script}'], + cmd=["echo", f"Hook script not found: {script}"], timeout=timeout, status=Process.StatusChoices.EXITED, exit_code=1, - stderr=f'Hook script not found: {script}', + stderr=f"Hook script not found: {script}", ) return process # Determine the interpreter based on file extension ext = script.suffix.lower() - if ext == '.sh': - cmd = ['bash', str(script)] - elif ext == '.py': + if ext == ".sh": + cmd = ["bash", str(script)] + elif ext == ".py": cmd = [sys.executable, str(script)] - elif ext == '.js': - cmd = ['node', str(script)] + elif ext == ".js": + cmd = ["node", str(script)] else: # Try to execute directly (assumes shebang) cmd = [str(script)] @@ -324,56 +324,57 @@ def run_hook( # Build CLI arguments from kwargs for key, value in kwargs.items(): # Skip keys that start with underscore (internal parameters) - if key.startswith('_'): + if key.startswith("_"): continue - arg_key = f'--{key.replace("_", "-")}' + arg_key = f"--{key.replace('_', '-')}" if isinstance(value, bool): if value: cmd.append(arg_key) - elif value is not None and value != '': + elif value is not None and value != "": # JSON-encode complex values, use str for simple ones # Skip empty strings to avoid --key= which breaks argument parsers if isinstance(value, (dict, list)): - cmd.append(f'{arg_key}={json.dumps(value)}') + cmd.append(f"{arg_key}={json.dumps(value)}") else: # Ensure value is converted to string and strip whitespace str_value = str(value).strip() if str_value: # Only add if non-empty after stripping - cmd.append(f'{arg_key}={str_value}') + cmd.append(f"{arg_key}={str_value}") # Set up environment with base paths env = os.environ.copy() - env['DATA_DIR'] = str(getattr(settings, 'DATA_DIR', Path.cwd())) - env['ARCHIVE_DIR'] = str(getattr(settings, 'ARCHIVE_DIR', Path.cwd() / 'archive')) - env.setdefault('MACHINE_ID', getattr(settings, 'MACHINE_ID', '') or os.environ.get('MACHINE_ID', '')) + env["DATA_DIR"] = str(getattr(settings, "DATA_DIR", Path.cwd())) + env["ARCHIVE_DIR"] = str(getattr(settings, "ARCHIVE_DIR", Path.cwd() / "archive")) + env["ABX_RUNTIME"] = "archivebox" + env.setdefault("MACHINE_ID", getattr(settings, "MACHINE_ID", "") or os.environ.get("MACHINE_ID", "")) resolved_output_dir = output_dir.resolve() output_parts = set(resolved_output_dir.parts) - if 'snapshots' in output_parts: - env['SNAP_DIR'] = str(resolved_output_dir.parent) - if 'crawls' in output_parts: - env['CRAWL_DIR'] = str(resolved_output_dir.parent) + if "snapshots" in output_parts: + env["SNAP_DIR"] = str(resolved_output_dir.parent) + if "crawls" in output_parts: + env["CRAWL_DIR"] = str(resolved_output_dir.parent) - crawl_id = kwargs.get('_crawl_id') or kwargs.get('crawl_id') + crawl_id = kwargs.get("_crawl_id") or kwargs.get("crawl_id") if crawl_id: try: from archivebox.crawls.models import Crawl crawl = Crawl.objects.filter(id=crawl_id).first() if crawl: - env['CRAWL_DIR'] = str(crawl.output_dir) + env["CRAWL_DIR"] = str(crawl.output_dir) except Exception: pass # Get LIB_DIR and LIB_BIN_DIR from config - lib_dir = config.get('LIB_DIR', getattr(settings, 'LIB_DIR', None)) - lib_bin_dir = config.get('LIB_BIN_DIR', getattr(settings, 'LIB_BIN_DIR', None)) + lib_dir = config.get("LIB_DIR", getattr(settings, "LIB_DIR", None)) + lib_bin_dir = config.get("LIB_BIN_DIR", getattr(settings, "LIB_BIN_DIR", None)) if lib_dir: - env['LIB_DIR'] = str(lib_dir) + env["LIB_DIR"] = str(lib_dir) if not lib_bin_dir and lib_dir: # Derive LIB_BIN_DIR from LIB_DIR if not set - lib_bin_dir = Path(lib_dir) / 'bin' + lib_bin_dir = Path(lib_dir) / "bin" # Build PATH with proper precedence: # 1. LIB_BIN_DIR (highest priority - local symlinked binaries) @@ -382,60 +383,72 @@ def run_hook( if lib_bin_dir: lib_bin_dir = str(lib_bin_dir) - env['LIB_BIN_DIR'] = lib_bin_dir + env["LIB_BIN_DIR"] = lib_bin_dir # Start with base PATH - current_path = env.get('PATH', '') + current_path = env.get("PATH", "") # Prepend Machine.config.PATH if it exists (treat as extra entries, not replacement) try: from archivebox.machine.models import Machine + machine = Machine.current() if machine and machine.config: - machine_path = machine.config.get('PATH') + machine_path = machine.config.get("PATH") if machine_path: # Prepend machine_path to current PATH - current_path = f'{machine_path}:{current_path}' if current_path else machine_path + current_path = f"{machine_path}:{current_path}" if current_path else machine_path except Exception: pass # Finally prepend LIB_BIN_DIR to the front (highest priority) if lib_bin_dir: - if not current_path.startswith(f'{lib_bin_dir}:'): - env['PATH'] = f'{lib_bin_dir}:{current_path}' if current_path else lib_bin_dir + if not current_path.startswith(f"{lib_bin_dir}:"): + env["PATH"] = f"{lib_bin_dir}:{current_path}" if current_path else lib_bin_dir else: - env['PATH'] = current_path + env["PATH"] = current_path else: - env['PATH'] = current_path + env["PATH"] = current_path # Set NODE_PATH for Node.js module resolution # Priority: config dict > Machine.config > derive from LIB_DIR - node_path = config.get('NODE_PATH') + node_path = config.get("NODE_PATH") if not node_path and lib_dir: # Derive from LIB_DIR/npm/node_modules (create if needed) - node_modules_dir = Path(lib_dir) / 'npm' / 'node_modules' + node_modules_dir = Path(lib_dir) / "npm" / "node_modules" node_modules_dir.mkdir(parents=True, exist_ok=True) node_path = str(node_modules_dir) if not node_path: try: # Fallback to Machine.config - node_path = machine.config.get('NODE_MODULES_DIR') + node_path = machine.config.get("NODE_MODULES_DIR") except Exception: pass if node_path: - env['NODE_PATH'] = node_path - env['NODE_MODULES_DIR'] = node_path # For backwards compatibility + env["NODE_PATH"] = node_path + env["NODE_MODULES_DIR"] = node_path # For backwards compatibility # Export all config values to environment (already merged by get_config()) # Skip keys we've already handled specially above (PATH, LIB_DIR, LIB_BIN_DIR, NODE_PATH, etc.) - SKIP_KEYS = {'PATH', 'LIB_DIR', 'LIB_BIN_DIR', 'NODE_PATH', 'NODE_MODULES_DIR', 'DATA_DIR', 'ARCHIVE_DIR', 'MACHINE_ID', 'SNAP_DIR', 'CRAWL_DIR'} + SKIP_KEYS = { + "PATH", + "LIB_DIR", + "LIB_BIN_DIR", + "NODE_PATH", + "NODE_MODULES_DIR", + "DATA_DIR", + "ARCHIVE_DIR", + "MACHINE_ID", + "SNAP_DIR", + "CRAWL_DIR", + } for key, value in config.items(): if key in SKIP_KEYS: continue # Already handled specially above, don't overwrite if value is None: continue elif isinstance(value, bool): - env[key] = 'true' if value else 'false' + env[key] = "true" if value else "false" elif isinstance(value, (list, dict)): env[key] = json.dumps(value) else: @@ -447,7 +460,7 @@ def run_hook( # Detect if this is a background hook (long-running daemon) # Background hooks use the .daemon.bg. or .finite.bg. filename convention. # Old convention: __background in stem (for backwards compatibility) - is_background = '.bg.' in script.name or '__background' in script.stem + is_background = ".bg." in script.name or "__background" in script.stem try: # Create Process record @@ -485,12 +498,12 @@ def run_hook( timeout=timeout, status=Process.StatusChoices.EXITED, exit_code=1, - stderr=f'Failed to run hook: {type(e).__name__}: {e}', + stderr=f"Failed to run hook: {type(e).__name__}: {e}", ) return process -def extract_records_from_process(process: 'Process') -> List[Dict[str, Any]]: +def extract_records_from_process(process: "Process") -> list[dict[str, Any]]: """ Extract JSONL records from a Process's stdout. @@ -507,20 +520,20 @@ def extract_records_from_process(process: 'Process') -> List[Dict[str, Any]]: return [] # Extract plugin metadata from process.pwd and process.cmd - plugin_name = Path(process.pwd).name if process.pwd else 'unknown' - hook_name = Path(process.cmd[1]).name if len(process.cmd) > 1 else 'unknown' - plugin_hook = process.cmd[1] if len(process.cmd) > 1 else '' + plugin_name = Path(process.pwd).name if process.pwd else "unknown" + hook_name = Path(process.cmd[1]).name if len(process.cmd) > 1 else "unknown" + plugin_hook = process.cmd[1] if len(process.cmd) > 1 else "" for record in records: # Add plugin metadata to record - record.setdefault('plugin', plugin_name) - record.setdefault('hook_name', hook_name) - record.setdefault('plugin_hook', plugin_hook) + record.setdefault("plugin", plugin_name) + record.setdefault("hook_name", hook_name) + record.setdefault("plugin_hook", plugin_hook) return records -def collect_urls_from_plugins(snapshot_dir: Path) -> List[Dict[str, Any]]: +def collect_urls_from_plugins(snapshot_dir: Path) -> list[dict[str, Any]]: """ Collect all urls.jsonl entries from parser plugin output subdirectories. @@ -542,20 +555,21 @@ def collect_urls_from_plugins(snapshot_dir: Path) -> List[Dict[str, Any]]: if not subdir.is_dir(): continue - urls_file = subdir / 'urls.jsonl' + urls_file = subdir / "urls.jsonl" if not urls_file.exists(): continue try: from archivebox.machine.models import Process + text = urls_file.read_text() for entry in Process.parse_records_from_text(text): - if entry.get('url'): - entry['url'] = fix_url_from_markdown(str(entry['url']).strip()) - if not entry['url']: + if entry.get("url"): + entry["url"] = sanitize_extracted_url(fix_url_from_markdown(str(entry["url"]).strip())) + if not entry["url"]: continue # Track which parser plugin found this URL - entry['plugin'] = subdir.name + entry["plugin"] = subdir.name urls.append(entry) except Exception: pass @@ -563,9 +577,8 @@ def collect_urls_from_plugins(snapshot_dir: Path) -> List[Dict[str, Any]]: return urls - @lru_cache(maxsize=1) -def get_plugins() -> List[str]: +def get_plugins() -> list[str]: """ Get list of available plugins by discovering plugin directories. @@ -576,16 +589,15 @@ def get_plugins() -> List[str]: plugins = [] for plugin_dir in iter_plugin_dirs(): - has_hooks = any(plugin_dir.glob('on_*__*.*')) - has_config = (plugin_dir / 'config.json').exists() - has_icon = (plugin_dir / 'templates' / 'icon.html').exists() + has_hooks = any(plugin_dir.glob("on_*__*.*")) + has_config = (plugin_dir / "config.json").exists() + has_icon = (plugin_dir / "templates" / "icon.html").exists() if has_hooks or has_config or has_icon: plugins.append(plugin_dir.name) return sorted(set(plugins)) - def get_plugin_name(plugin: str) -> str: """ Get the base plugin name without numeric prefix. @@ -596,14 +608,13 @@ def get_plugin_name(plugin: str) -> str: '50_parse_html_urls' -> 'parse_html_urls' """ # Split on first underscore after any leading digits - parts = plugin.split('_', 1) + parts = plugin.split("_", 1) if len(parts) == 2 and parts[0].isdigit(): return parts[1] return plugin - -def get_enabled_plugins(config: Optional[Dict[str, Any]] = None) -> List[str]: +def get_enabled_plugins(config: dict[str, Any] | None = None) -> list[str]: """ Get the list of enabled plugins based on config and available hooks. @@ -623,32 +634,33 @@ def get_enabled_plugins(config: Optional[Dict[str, Any]] = None) -> List[str]: # Get merged config if not provided if config is None: from archivebox.config.configset import get_config + config = get_config() - def normalize_enabled_plugins(value: Any) -> List[str]: + def normalize_enabled_plugins(value: Any) -> list[str]: if value is None: return [] if isinstance(value, str): raw = value.strip() if not raw: return [] - if raw.startswith('['): + if raw.startswith("["): try: parsed = json.loads(raw) except json.JSONDecodeError: parsed = None if isinstance(parsed, list): return [str(plugin).strip() for plugin in parsed if str(plugin).strip()] - return [plugin.strip() for plugin in raw.split(',') if plugin.strip()] + return [plugin.strip() for plugin in raw.split(",") if plugin.strip()] if isinstance(value, (list, tuple, set)): return [str(plugin).strip() for plugin in value if str(plugin).strip()] return [str(value).strip()] if str(value).strip() else [] # Support explicit ENABLED_PLUGINS override (legacy) - if 'ENABLED_PLUGINS' in config: - return normalize_enabled_plugins(config['ENABLED_PLUGINS']) - if 'ENABLED_EXTRACTORS' in config: - return normalize_enabled_plugins(config['ENABLED_EXTRACTORS']) + if "ENABLED_PLUGINS" in config: + return normalize_enabled_plugins(config["ENABLED_PLUGINS"]) + if "ENABLED_EXTRACTORS" in config: + return normalize_enabled_plugins(config["ENABLED_EXTRACTORS"]) # Filter all plugins by enabled status all_plugins = get_plugins() @@ -656,7 +668,7 @@ def get_enabled_plugins(config: Optional[Dict[str, Any]] = None) -> List[str]: for plugin in all_plugins: plugin_config = get_plugin_special_config(plugin, config) - if plugin_config['enabled']: + if plugin_config["enabled"]: enabled.append(plugin) return enabled @@ -664,9 +676,9 @@ def get_enabled_plugins(config: Optional[Dict[str, Any]] = None) -> List[str]: def discover_plugins_that_provide_interface( module_name: str, - required_attrs: List[str], - plugin_prefix: Optional[str] = None, -) -> Dict[str, Any]: + required_attrs: list[str], + plugin_prefix: str | None = None, +) -> dict[str, Any]: """ Discover plugins that provide a specific Python module with required interface. @@ -710,15 +722,15 @@ def discover_plugins_that_provide_interface( continue # Look for the module file - module_path = plugin_dir / f'{module_name}.py' + module_path = plugin_dir / f"{module_name}.py" if not module_path.exists(): continue try: # Import the module dynamically spec = importlib.util.spec_from_file_location( - f'archivebox.dynamic_plugins.{plugin_name}.{module_name}', - module_path + f"archivebox.dynamic_plugins.{plugin_name}.{module_name}", + module_path, ) if spec is None or spec.loader is None: continue @@ -732,7 +744,7 @@ def discover_plugins_that_provide_interface( # Derive backend name from plugin directory name if plugin_prefix: - backend_name = plugin_name[len(plugin_prefix):] + backend_name = plugin_name[len(plugin_prefix) :] else: backend_name = plugin_name @@ -745,7 +757,7 @@ def discover_plugins_that_provide_interface( return backends -def get_search_backends() -> Dict[str, Any]: +def get_search_backends() -> dict[str, Any]: """ Discover all available search backend plugins. @@ -758,13 +770,13 @@ def get_search_backends() -> Dict[str, Any]: e.g., {'sqlite': , 'sonic': , 'ripgrep': } """ return discover_plugins_that_provide_interface( - module_name='search', - required_attrs=['search', 'flush'], - plugin_prefix='search_backend_', + module_name="search", + required_attrs=["search", "flush"], + plugin_prefix="search_backend_", ) -def discover_plugin_configs() -> Dict[str, Dict[str, Any]]: +def discover_plugin_configs() -> dict[str, dict[str, Any]]: """ Discover all plugin config.json schemas. @@ -792,21 +804,20 @@ def discover_plugin_configs() -> Dict[str, Dict[str, Any]]: configs = {} for plugin_dir in iter_plugin_dirs(): - - config_path = plugin_dir / 'config.json' + config_path = plugin_dir / "config.json" if not config_path.exists(): continue try: - with open(config_path, 'r') as f: + with open(config_path) as f: schema = json.load(f) # Basic validation: must be an object with properties if not isinstance(schema, dict): continue - if schema.get('type') != 'object': + if schema.get("type") != "object": continue - if 'properties' not in schema: + if "properties" not in schema: continue configs[plugin_dir.name] = schema @@ -814,13 +825,14 @@ def discover_plugin_configs() -> Dict[str, Dict[str, Any]]: except (json.JSONDecodeError, OSError) as e: # Log warning but continue - malformed config shouldn't break discovery import sys + print(f"Warning: Failed to load config.json from {plugin_dir.name}: {e}", file=sys.stderr) continue return configs -def get_config_defaults_from_plugins() -> Dict[str, Any]: +def get_config_defaults_from_plugins() -> dict[str, Any]: """ Get default values for all plugin config options. @@ -832,15 +844,15 @@ def get_config_defaults_from_plugins() -> Dict[str, Any]: defaults = {} for plugin_name, schema in plugin_configs.items(): - properties = schema.get('properties', {}) + properties = schema.get("properties", {}) for key, prop_schema in properties.items(): - if 'default' in prop_schema: - defaults[key] = prop_schema['default'] + if "default" in prop_schema: + defaults[key] = prop_schema["default"] return defaults -def get_plugin_special_config(plugin_name: str, config: Dict[str, Any]) -> Dict[str, Any]: +def get_plugin_special_config(plugin_name: str, config: dict[str, Any]) -> dict[str, Any]: """ Extract special config keys for a plugin following naming conventions. @@ -878,19 +890,19 @@ def get_plugin_special_config(plugin_name: str, config: Dict[str, Any]) -> Dict[ # Old names (USE_*, SAVE_*) are aliased in config.json via x-aliases # Check if PLUGINS whitelist is specified (e.g., --plugins=wget,favicon) - plugins_whitelist = config.get('PLUGINS', '') + plugins_whitelist = config.get("PLUGINS", "") if plugins_whitelist: # PLUGINS whitelist is specified - include transitive required_plugins from # config.json so selecting a plugin also enables its declared plugin-level # dependencies (e.g. singlefile -> chrome). plugin_configs = discover_plugin_configs() - plugin_names = {p.strip().lower() for p in plugins_whitelist.split(',') if p.strip()} + plugin_names = {p.strip().lower() for p in plugins_whitelist.split(",") if p.strip()} pending = list(plugin_names) while pending: current = pending.pop() schema = plugin_configs.get(current, {}) - required_plugins = schema.get('required_plugins', []) + required_plugins = schema.get("required_plugins", []) if not isinstance(required_plugins, list): continue @@ -906,34 +918,34 @@ def get_plugin_special_config(plugin_name: str, config: Dict[str, Any]) -> Dict[ enabled = False else: # Plugin is in whitelist - check if explicitly disabled by PLUGINNAME_ENABLED - enabled_key = f'{plugin_upper}_ENABLED' + enabled_key = f"{plugin_upper}_ENABLED" enabled = config.get(enabled_key) if enabled is None: enabled = True # Default to enabled if in whitelist elif isinstance(enabled, str): - enabled = enabled.lower() not in ('false', '0', 'no', '') + enabled = enabled.lower() not in ("false", "0", "no", "") else: # No PLUGINS whitelist - use PLUGINNAME_ENABLED (default True) - enabled_key = f'{plugin_upper}_ENABLED' + enabled_key = f"{plugin_upper}_ENABLED" enabled = config.get(enabled_key) if enabled is None: enabled = True elif isinstance(enabled, str): # Handle string values from config file ("true"/"false") - enabled = enabled.lower() not in ('false', '0', 'no', '') + enabled = enabled.lower() not in ("false", "0", "no", "") # 2. Timeout: PLUGINNAME_TIMEOUT (fallback to TIMEOUT, default 300) - timeout_key = f'{plugin_upper}_TIMEOUT' - timeout = config.get(timeout_key) or config.get('TIMEOUT', 300) + timeout_key = f"{plugin_upper}_TIMEOUT" + timeout = config.get(timeout_key) or config.get("TIMEOUT", 300) # 3. Binary: PLUGINNAME_BINARY (default to plugin_name) - binary_key = f'{plugin_upper}_BINARY' + binary_key = f"{plugin_upper}_BINARY" binary = config.get(binary_key, plugin_name) return { - 'enabled': bool(enabled), - 'timeout': int(timeout), - 'binary': str(binary), + "enabled": bool(enabled), + "timeout": int(timeout), + "binary": str(binary), } @@ -959,30 +971,30 @@ def get_plugin_special_config(plugin_name: str, config: Dict[str, Any]) -> Dict[ # Default templates used when plugin doesn't provide one DEFAULT_TEMPLATES = { - 'icon': ''' + "icon": """ {{ icon }} - ''', - 'card': ''' + """, + "card": """ - ''', - 'full': ''' + """, + "full": """ - ''', + """, } -def get_plugin_template(plugin: str, template_name: str, fallback: bool = True) -> Optional[str]: +def get_plugin_template(plugin: str, template_name: str, fallback: bool = True) -> str | None: """ Get a plugin template by plugin name and template type. @@ -995,20 +1007,19 @@ def get_plugin_template(plugin: str, template_name: str, fallback: bool = True) Template content as string, or None if not found and fallback=False. """ base_name = get_plugin_name(plugin) - if base_name in ('yt-dlp', 'youtube-dl'): - base_name = 'ytdlp' + if base_name in ("yt-dlp", "youtube-dl"): + base_name = "ytdlp" for plugin_dir in iter_plugin_dirs(): - # Match by directory name (exact or partial) - if plugin_dir.name == base_name or plugin_dir.name.endswith(f'_{base_name}'): - template_path = plugin_dir / 'templates' / f'{template_name}.html' + if plugin_dir.name == base_name or plugin_dir.name.endswith(f"_{base_name}"): + template_path = plugin_dir / "templates" / f"{template_name}.html" if template_path.exists(): return template_path.read_text() # Fall back to default template if requested if fallback: - return DEFAULT_TEMPLATES.get(template_name, '') + return DEFAULT_TEMPLATES.get(template_name, "") return None @@ -1025,14 +1036,12 @@ def get_plugin_icon(plugin: str) -> str: Icon HTML/emoji string. """ # Try plugin-provided icon template - icon_template = get_plugin_template(plugin, 'icon', fallback=False) + icon_template = get_plugin_template(plugin, "icon", fallback=False) if icon_template: return mark_safe(icon_template.strip()) # Fall back to generic folder icon - return mark_safe('📁') - - + return mark_safe("📁") # ============================================================================= @@ -1040,9 +1049,7 @@ def get_plugin_icon(plugin: str) -> str: # ============================================================================= - - -def process_hook_records(records: List[Dict[str, Any]], overrides: Dict[str, Any] | None = None) -> Dict[str, int]: +def process_hook_records(records: list[dict[str, Any]], overrides: dict[str, Any] | None = None) -> dict[str, int]: """ Process JSONL records from hook output. Dispatches to Model.from_json() for each record type. @@ -1058,62 +1065,67 @@ def process_hook_records(records: List[Dict[str, Any]], overrides: Dict[str, Any overrides = overrides or {} for record in records: - record_type = record.get('type') + record_type = record.get("type") if not record_type: continue # Skip ArchiveResult records (they update the calling ArchiveResult, not create new ones) - if record_type == 'ArchiveResult': + if record_type == "ArchiveResult": continue try: # Dispatch to appropriate model's from_json() method - if record_type == 'Snapshot': + if record_type == "Snapshot": from archivebox.core.models import Snapshot - if record.get('url'): + if record.get("url"): record = { **record, - 'url': fix_url_from_markdown(str(record['url']).strip()), + "url": sanitize_extracted_url(fix_url_from_markdown(str(record["url"]).strip())), } - if not record['url']: + if not record["url"]: continue # Check if discovered snapshot exceeds crawl max_depth - snapshot_depth = record.get('depth', 0) - crawl = overrides.get('crawl') + snapshot_depth = record.get("depth", 0) + crawl = overrides.get("crawl") if crawl and snapshot_depth > crawl.max_depth: # Skip - this URL was discovered but exceeds max crawl depth continue obj = Snapshot.from_json(record.copy(), overrides) if obj: - stats['Snapshot'] = stats.get('Snapshot', 0) + 1 + stats["Snapshot"] = stats.get("Snapshot", 0) + 1 - elif record_type == 'Tag': + elif record_type == "Tag": from archivebox.core.models import Tag + obj = Tag.from_json(record.copy(), overrides) if obj: - stats['Tag'] = stats.get('Tag', 0) + 1 + stats["Tag"] = stats.get("Tag", 0) + 1 - elif record_type == 'Binary': + elif record_type == "Binary": from archivebox.machine.models import Binary + obj = Binary.from_json(record.copy(), overrides) if obj: - stats['Binary'] = stats.get('Binary', 0) + 1 + stats["Binary"] = stats.get("Binary", 0) + 1 - elif record_type == 'Machine': + elif record_type == "Machine": from archivebox.machine.models import Machine + obj = Machine.from_json(record.copy(), overrides) if obj: - stats['Machine'] = stats.get('Machine', 0) + 1 + stats["Machine"] = stats.get("Machine", 0) + 1 else: import sys + print(f"Warning: Unknown record type '{record_type}' from hook output", file=sys.stderr) except Exception as e: import sys + print(f"Warning: Failed to create {record_type}: {e}", file=sys.stderr) continue diff --git a/archivebox/ideas/process_plugin.py b/archivebox/ideas/process_plugin.py index 8696781b..aad584bb 100644 --- a/archivebox/ideas/process_plugin.py +++ b/archivebox/ideas/process_plugin.py @@ -1,4 +1,4 @@ -__package__ = 'archivebox.ideas' +__package__ = "archivebox.ideas" import asyncio import importlib @@ -9,7 +9,8 @@ import signal from dataclasses import dataclass from datetime import datetime, timezone from pathlib import Path -from typing import Any, Callable, Mapping, MutableMapping, Optional +from typing import Any, Optional +from collections.abc import Callable, Mapping, MutableMapping from pydantic import BaseModel, Field @@ -18,7 +19,7 @@ try: BaseEvent = bubus.BaseEvent EventBus = bubus.EventBus except Exception as exc: # pragma: no cover - optional dependency - raise ImportError('ProcessPlugin requires bubus to be installed') from exc + raise ImportError("ProcessPlugin requires bubus to be installed") from exc try: uuid7str = importlib.import_module("bubus.service").uuid7str @@ -118,10 +119,10 @@ class ProcessPlugin: env = {**self.env, **(event.env or {})} log_prefix = event.log_prefix or proc_id - stdout_path = output_dir / f'{log_prefix}.stdout.log' - stderr_path = output_dir / f'{log_prefix}.stderr.log' - cmd_path = output_dir / f'{log_prefix}.sh' - pid_path = output_dir / f'{log_prefix}.pid' + stdout_path = output_dir / f"{log_prefix}.stdout.log" + stderr_path = output_dir / f"{log_prefix}.stderr.log" + cmd_path = output_dir / f"{log_prefix}.sh" + pid_path = output_dir / f"{log_prefix}.pid" self._write_cmd_file(cmd_path, event.cmd) @@ -152,16 +153,19 @@ class ProcessPlugin: ) await event.event_bus.dispatch( - ProcessStarted(process=record, event_parent_id=parent_event_id) + ProcessStarted(process=record, event_parent_id=parent_event_id), ) stdout_task = asyncio.create_task( self._consume_stream( - proc.stdout, stdout_path, parent_event_id, event.parse_stdout_events - ) + proc.stdout, + stdout_path, + parent_event_id, + event.parse_stdout_events, + ), ) stderr_task = asyncio.create_task( - self._consume_stream(proc.stderr, stderr_path, parent_event_id, False) + self._consume_stream(proc.stderr, stderr_path, parent_event_id, False), ) running = _RunningProcess( @@ -176,7 +180,7 @@ class ProcessPlugin: if event.is_background: running.watcher_task = asyncio.create_task( - self._watch_process(proc_id, event.timeout) + self._watch_process(proc_id, event.timeout), ) return record @@ -186,7 +190,7 @@ class ProcessPlugin: async def on_ProcessKill(self, event: ProcessKill) -> ProcessRecord: running = self._running.get(event.process_id) if not running: - raise RuntimeError(f'Process not found: {event.process_id}') + raise RuntimeError(f"Process not found: {event.process_id}") proc = running.process self._terminate_process(proc, event.signal) @@ -194,7 +198,7 @@ class ProcessPlugin: if event.timeout is not None: try: await asyncio.wait_for(proc.wait(), timeout=event.timeout) - except asyncio.TimeoutError: + except TimeoutError: self._terminate_process(proc, signal.SIGKILL) else: await proc.wait() @@ -212,7 +216,7 @@ class ProcessPlugin: await asyncio.wait_for(proc.wait(), timeout=timeout) else: await proc.wait() - except asyncio.TimeoutError: + except TimeoutError: self._terminate_process(proc, signal.SIGTERM) await asyncio.sleep(2) if proc.returncode is None: @@ -237,7 +241,7 @@ class ProcessPlugin: record.ended_at = _utcnow() await self.bus.dispatch( - ProcessExited(process=record, event_parent_id=running.parent_event_id) + ProcessExited(process=record, event_parent_id=running.parent_event_id), ) self._running.pop(process_id, None) @@ -251,12 +255,12 @@ class ProcessPlugin: ) -> None: if stream is None: return - with path.open('w', encoding='utf-8') as fh: + with path.open("w", encoding="utf-8") as fh: while True: line = await stream.readline() if not line: break - text = line.decode('utf-8', errors='replace') + text = line.decode("utf-8", errors="replace") fh.write(text) fh.flush() if parse_events: @@ -264,7 +268,7 @@ class ProcessPlugin: async def _maybe_dispatch_json_event(self, line: str, parent_event_id: str | None) -> None: text = line.strip() - if not text.startswith('{') or not text.endswith('}'): + if not text.startswith("{") or not text.endswith("}"): return try: data = json.loads(text) @@ -274,7 +278,7 @@ class ProcessPlugin: event = None if self.json_event_adapter: event = self.json_event_adapter(data, parent_event_id) - elif isinstance(data, dict) and 'event_type' in data: + elif isinstance(data, dict) and "event_type" in data: try: event = BaseEvent.model_validate(data) except Exception: @@ -283,18 +287,18 @@ class ProcessPlugin: if event is None: return - if not getattr(event, 'event_parent_id', None) and parent_event_id: + if not getattr(event, "event_parent_id", None) and parent_event_id: event.event_parent_id = parent_event_id await self.bus.dispatch(event) @staticmethod def _write_cmd_file(path: Path, cmd: list[str]) -> None: - cmd_line = ' '.join(shlex.quote(part) for part in cmd) - path.write_text(cmd_line + '\n', encoding='utf-8') + cmd_line = shlex.join(cmd) + path.write_text(cmd_line + "\n", encoding="utf-8") @staticmethod def _write_pid_file(path: Path, pid: int) -> None: - path.write_text(str(pid), encoding='utf-8') + path.write_text(str(pid), encoding="utf-8") ts = datetime.now().timestamp() os.utime(path, (ts, ts)) @@ -312,10 +316,10 @@ class ProcessPlugin: __all__ = [ - 'ProcessRecord', - 'ProcessLaunch', - 'ProcessStarted', - 'ProcessExited', - 'ProcessKill', - 'ProcessPlugin', + "ProcessRecord", + "ProcessLaunch", + "ProcessStarted", + "ProcessExited", + "ProcessKill", + "ProcessPlugin", ] diff --git a/archivebox/ldap/apps.py b/archivebox/ldap/apps.py index 1d7fc44e..54390c62 100644 --- a/archivebox/ldap/apps.py +++ b/archivebox/ldap/apps.py @@ -8,6 +8,6 @@ from django.apps import AppConfig class LDAPConfig(AppConfig): """Django app config for LDAP authentication.""" - default_auto_field = 'django.db.models.BigAutoField' - name = 'archivebox.ldap' - verbose_name = 'LDAP Authentication' + default_auto_field = "django.db.models.BigAutoField" + name = "archivebox.ldap" + verbose_name = "LDAP Authentication" diff --git a/archivebox/ldap/auth.py b/archivebox/ldap/auth.py index 5d7a56a8..dd1ac462 100644 --- a/archivebox/ldap/auth.py +++ b/archivebox/ldap/auth.py @@ -11,6 +11,7 @@ import importlib try: BaseLDAPBackend = importlib.import_module("django_auth_ldap.backend").LDAPBackend except ImportError: + class BaseLDAPBackend: """Dummy LDAP backend when django-auth-ldap is not installed.""" diff --git a/archivebox/machine/__init__.py b/archivebox/machine/__init__.py index 1e67edea..36a1de6e 100644 --- a/archivebox/machine/__init__.py +++ b/archivebox/machine/__init__.py @@ -1 +1 @@ -__package__ = 'archivebox.machine' +__package__ = "archivebox.machine" diff --git a/archivebox/machine/admin.py b/archivebox/machine/admin.py index 7d531aed..ca0ad0b2 100644 --- a/archivebox/machine/admin.py +++ b/archivebox/machine/admin.py @@ -1,229 +1,543 @@ -__package__ = 'archivebox.machine' +__package__ = "archivebox.machine" -from django.contrib import admin +import json +import shlex + +from django.contrib import admin, messages +from django.db.models import DurationField, ExpressionWrapper, F +from django.db.models.functions import Coalesce, Now +from django.shortcuts import redirect +from django.utils import timezone from django.utils.html import format_html +from django_object_actions import action from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin +from archivebox.misc.logging_util import printable_filesize +from archivebox.machine.env_utils import env_to_dotenv_text from archivebox.machine.models import Machine, NetworkInterface, Binary, Process -class MachineAdmin(ConfigEditorMixin, BaseModelAdmin): - list_display = ('id', 'created_at', 'hostname', 'ips', 'os_platform', 'hw_in_docker', 'hw_in_vm', 'hw_manufacturer', 'hw_product', 'os_arch', 'os_family', 'os_release', 'hw_uuid', 'health_display') - sort_fields = ('id', 'created_at', 'hostname', 'ips', 'os_platform', 'hw_in_docker', 'hw_in_vm', 'hw_manufacturer', 'hw_product', 'os_arch', 'os_family', 'os_release', 'hw_uuid') - - readonly_fields = ('guid', 'created_at', 'modified_at', 'ips') - - fieldsets = ( - ('Identity', { - 'fields': ('hostname', 'guid', 'ips'), - 'classes': ('card',), - }), - ('Hardware', { - 'fields': ('hw_manufacturer', 'hw_product', 'hw_uuid', 'hw_in_docker', 'hw_in_vm'), - 'classes': ('card',), - }), - ('Operating System', { - 'fields': ('os_platform', 'os_family', 'os_arch', 'os_kernel', 'os_release'), - 'classes': ('card',), - }), - ('Statistics', { - 'fields': ('stats', 'num_uses_succeeded', 'num_uses_failed'), - 'classes': ('card',), - }), - ('Configuration', { - 'fields': ('config',), - 'classes': ('card', 'wide'), - }), - ('Timestamps', { - 'fields': ('created_at', 'modified_at'), - 'classes': ('card',), - }), +def _render_copy_block(text: str, *, multiline: bool = False): + if multiline: + return format_html( + """ +
+ +
{}
+
+ """, + text, + text, + text, + ) + return format_html( + """ +
+ + + {} + +
+ """, + text, + text, + text, ) - list_filter = ('hw_in_docker', 'hw_in_vm', 'os_arch', 'os_family', 'os_platform') - ordering = ['-created_at'] + +def _format_process_duration_seconds(started_at, ended_at) -> str: + if not started_at: + return "-" + + end_time = ended_at or timezone.now() + seconds = max((end_time - started_at).total_seconds(), 0.0) + if seconds < 1: + return f"{seconds:.2f}s" + if seconds < 10 and seconds != int(seconds): + return f"{seconds:.1f}s" + return f"{int(seconds)}s" + + +class MachineAdmin(ConfigEditorMixin, BaseModelAdmin): + list_display = ( + "id", + "created_at", + "hostname", + "ips", + "os_platform", + "hw_in_docker", + "hw_in_vm", + "hw_manufacturer", + "hw_product", + "os_arch", + "os_family", + "os_release", + "hw_uuid", + "health_display", + ) + sort_fields = ( + "id", + "created_at", + "hostname", + "ips", + "os_platform", + "hw_in_docker", + "hw_in_vm", + "hw_manufacturer", + "hw_product", + "os_arch", + "os_family", + "os_release", + "hw_uuid", + ) + + readonly_fields = ("guid", "created_at", "modified_at", "ips") + + fieldsets = ( + ( + "Identity", + { + "fields": ("hostname", "guid", "ips"), + "classes": ("card",), + }, + ), + ( + "Hardware", + { + "fields": ("hw_manufacturer", "hw_product", "hw_uuid", "hw_in_docker", "hw_in_vm"), + "classes": ("card",), + }, + ), + ( + "Operating System", + { + "fields": ("os_platform", "os_family", "os_arch", "os_kernel", "os_release"), + "classes": ("card",), + }, + ), + ( + "Statistics", + { + "fields": ("stats", "num_uses_succeeded", "num_uses_failed"), + "classes": ("card",), + }, + ), + ( + "Configuration", + { + "fields": ("config",), + "classes": ("card", "wide"), + }, + ), + ( + "Timestamps", + { + "fields": ("created_at", "modified_at"), + "classes": ("card",), + }, + ), + ) + + list_filter = ("hw_in_docker", "hw_in_vm", "os_arch", "os_family", "os_platform") + ordering = ["-created_at"] list_per_page = 100 actions = ["delete_selected"] - @admin.display(description='Public IP', ordering='networkinterface__ip_public') + @admin.display(description="Public IP", ordering="networkinterface__ip_public") def ips(self, machine): return format_html( '{}', - machine.id, ', '.join(machine.networkinterface_set.values_list('ip_public', flat=True)), + machine.id, + ", ".join(machine.networkinterface_set.values_list("ip_public", flat=True)), ) - @admin.display(description='Health', ordering='health') + @admin.display(description="Health", ordering="health") def health_display(self, obj): h = obj.health - color = 'green' if h >= 80 else 'orange' if h >= 50 else 'red' + color = "green" if h >= 80 else "orange" if h >= 50 else "red" return format_html('{}', color, h) class NetworkInterfaceAdmin(BaseModelAdmin): - list_display = ('id', 'created_at', 'machine_info', 'ip_public', 'dns_server', 'isp', 'country', 'region', 'city', 'iface', 'ip_local', 'mac_address', 'health_display') - sort_fields = ('id', 'created_at', 'machine_info', 'ip_public', 'dns_server', 'isp', 'country', 'region', 'city', 'iface', 'ip_local', 'mac_address') - search_fields = ('id', 'machine__id', 'iface', 'ip_public', 'ip_local', 'mac_address', 'dns_server', 'hostname', 'isp', 'city', 'region', 'country') - - readonly_fields = ('machine', 'created_at', 'modified_at', 'mac_address', 'ip_public', 'ip_local', 'dns_server') - - fieldsets = ( - ('Machine', { - 'fields': ('machine',), - 'classes': ('card',), - }), - ('Network', { - 'fields': ('iface', 'ip_public', 'ip_local', 'mac_address', 'dns_server'), - 'classes': ('card',), - }), - ('Location', { - 'fields': ('hostname', 'isp', 'city', 'region', 'country'), - 'classes': ('card',), - }), - ('Usage', { - 'fields': ('num_uses_succeeded', 'num_uses_failed'), - 'classes': ('card',), - }), - ('Timestamps', { - 'fields': ('created_at', 'modified_at'), - 'classes': ('card',), - }), + list_display = ( + "id", + "created_at", + "machine_info", + "ip_public", + "dns_server", + "isp", + "country", + "region", + "city", + "iface", + "ip_local", + "mac_address", + "health_display", + ) + sort_fields = ( + "id", + "created_at", + "machine_info", + "ip_public", + "dns_server", + "isp", + "country", + "region", + "city", + "iface", + "ip_local", + "mac_address", + ) + search_fields = ( + "id", + "machine__id", + "iface", + "ip_public", + "ip_local", + "mac_address", + "dns_server", + "hostname", + "isp", + "city", + "region", + "country", ) - list_filter = ('isp', 'country', 'region') - ordering = ['-created_at'] + readonly_fields = ("machine", "created_at", "modified_at", "mac_address", "ip_public", "ip_local", "dns_server") + + fieldsets = ( + ( + "Machine", + { + "fields": ("machine",), + "classes": ("card",), + }, + ), + ( + "Network", + { + "fields": ("iface", "ip_public", "ip_local", "mac_address", "dns_server"), + "classes": ("card",), + }, + ), + ( + "Location", + { + "fields": ("hostname", "isp", "city", "region", "country"), + "classes": ("card",), + }, + ), + ( + "Usage", + { + "fields": ("num_uses_succeeded", "num_uses_failed"), + "classes": ("card",), + }, + ), + ( + "Timestamps", + { + "fields": ("created_at", "modified_at"), + "classes": ("card",), + }, + ), + ) + + list_filter = ("isp", "country", "region") + ordering = ["-created_at"] list_per_page = 100 actions = ["delete_selected"] - @admin.display(description='Machine', ordering='machine__id') + @admin.display(description="Machine", ordering="machine__id") def machine_info(self, iface): return format_html( '[{}]   {}', - iface.machine.id, str(iface.machine.id)[:8], iface.machine.hostname, + iface.machine.id, + str(iface.machine.id)[:8], + iface.machine.hostname, ) - @admin.display(description='Health', ordering='health') + @admin.display(description="Health", ordering="health") def health_display(self, obj): h = obj.health - color = 'green' if h >= 80 else 'orange' if h >= 50 else 'red' + color = "green" if h >= 80 else "orange" if h >= 50 else "red" return format_html('{}', color, h) class BinaryAdmin(BaseModelAdmin): - list_display = ('id', 'created_at', 'machine_info', 'name', 'binprovider', 'version', 'abspath', 'sha256', 'status', 'health_display') - sort_fields = ('id', 'created_at', 'machine_info', 'name', 'binprovider', 'version', 'abspath', 'sha256', 'status') - search_fields = ('id', 'machine__id', 'name', 'binprovider', 'version', 'abspath', 'sha256') + list_display = ("id", "created_at", "machine_info", "name", "binprovider", "version", "abspath", "sha256", "status", "health_display") + sort_fields = ("id", "created_at", "machine_info", "name", "binprovider", "version", "abspath", "sha256", "status") + search_fields = ("id", "machine__id", "name", "binprovider", "version", "abspath", "sha256") - readonly_fields = ('created_at', 'modified_at', 'output_dir') + readonly_fields = ("created_at", "modified_at", "output_dir") fieldsets = ( - ('Binary Info', { - 'fields': ('name', 'binproviders', 'binprovider', 'overrides'), - 'classes': ('card',), - }), - ('Location', { - 'fields': ('machine', 'abspath'), - 'classes': ('card',), - }), - ('Version', { - 'fields': ('version', 'sha256'), - 'classes': ('card',), - }), - ('State', { - 'fields': ('status', 'retry_at', 'output_dir'), - 'classes': ('card',), - }), - ('Usage', { - 'fields': ('num_uses_succeeded', 'num_uses_failed'), - 'classes': ('card',), - }), - ('Timestamps', { - 'fields': ('created_at', 'modified_at'), - 'classes': ('card',), - }), + ( + "Binary Info", + { + "fields": ("name", "binproviders", "binprovider", "overrides"), + "classes": ("card",), + }, + ), + ( + "Location", + { + "fields": ("machine", "abspath"), + "classes": ("card",), + }, + ), + ( + "Version", + { + "fields": ("version", "sha256"), + "classes": ("card",), + }, + ), + ( + "State", + { + "fields": ("status", "retry_at", "output_dir"), + "classes": ("card",), + }, + ), + ( + "Usage", + { + "fields": ("num_uses_succeeded", "num_uses_failed"), + "classes": ("card",), + }, + ), + ( + "Timestamps", + { + "fields": ("created_at", "modified_at"), + "classes": ("card",), + }, + ), ) - list_filter = ('name', 'binprovider', 'status', 'machine_id') - ordering = ['-created_at'] + list_filter = ("name", "binprovider", "status", "machine_id") + ordering = ["-created_at"] list_per_page = 100 actions = ["delete_selected"] - @admin.display(description='Machine', ordering='machine__id') + @admin.display(description="Machine", ordering="machine__id") def machine_info(self, binary): return format_html( '[{}]   {}', - binary.machine.id, str(binary.machine.id)[:8], binary.machine.hostname, + binary.machine.id, + str(binary.machine.id)[:8], + binary.machine.hostname, ) - @admin.display(description='Health', ordering='health') + @admin.display(description="Health", ordering="health") def health_display(self, obj): h = obj.health - color = 'green' if h >= 80 else 'orange' if h >= 50 else 'red' + color = "green" if h >= 80 else "orange" if h >= 50 else "red" return format_html('{}', color, h) class ProcessAdmin(BaseModelAdmin): - list_display = ('id', 'created_at', 'machine_info', 'archiveresult_link', 'cmd_str', 'status', 'exit_code', 'pid', 'binary_info') - sort_fields = ('id', 'created_at', 'status', 'exit_code', 'pid') - search_fields = ('id', 'machine__id', 'binary__name', 'cmd', 'pwd', 'stdout', 'stderr') + list_display = ( + "id", + "created_at", + "machine_info", + "archiveresult_link", + "snapshot_link", + "crawl_link", + "cmd_str", + "status", + "duration_display", + "exit_code", + "pid", + "output_summary", + "binary_info", + ) + sort_fields = ( + "id", + "created_at", + "machine_info", + "archiveresult_link", + "snapshot_link", + "crawl_link", + "cmd_str", + "status", + "duration_display", + "exit_code", + "pid", + "output_summary", + "binary_info", + ) + search_fields = ("id", "machine__id", "binary__name", "cmd", "pwd", "stdout", "stderr") - readonly_fields = ('created_at', 'modified_at', 'machine', 'binary_link', 'iface_link', 'archiveresult_link') - - fieldsets = ( - ('Process Info', { - 'fields': ('machine', 'archiveresult_link', 'status', 'retry_at'), - 'classes': ('card',), - }), - ('Command', { - 'fields': ('cmd', 'pwd', 'env', 'timeout'), - 'classes': ('card', 'wide'), - }), - ('Execution', { - 'fields': ('binary_link', 'iface_link', 'pid', 'exit_code', 'url'), - 'classes': ('card',), - }), - ('Timing', { - 'fields': ('started_at', 'ended_at'), - 'classes': ('card',), - }), - ('Output', { - 'fields': ('stdout', 'stderr'), - 'classes': ('card', 'wide', 'collapse'), - }), - ('Timestamps', { - 'fields': ('created_at', 'modified_at'), - 'classes': ('card',), - }), + readonly_fields = ( + "created_at", + "modified_at", + "machine", + "binary_link", + "iface_link", + "archiveresult_link", + "snapshot_link", + "crawl_link", + "cmd_display", + "env_display", + "timeout", + "pid", + "exit_code", + "url", + "started_at", + "ended_at", + "duration_display", ) - list_filter = ('status', 'exit_code', 'machine_id') - ordering = ['-created_at'] - list_per_page = 100 - actions = ["delete_selected"] + fieldsets = ( + ( + "Process Info", + { + "fields": ("machine", "archiveresult_link", "snapshot_link", "crawl_link", "status", "retry_at"), + "classes": ("card",), + }, + ), + ( + "Command", + { + "fields": ("cmd_display", "pwd", "env_display", "timeout"), + "classes": ("card", "wide"), + }, + ), + ( + "Execution", + { + "fields": ("binary_link", "iface_link", "pid", "exit_code", "url"), + "classes": ("card",), + }, + ), + ( + "Timing", + { + "fields": ("started_at", "ended_at", "duration_display"), + "classes": ("card",), + }, + ), + ( + "Output", + { + "fields": ("stdout", "stderr"), + "classes": ("card", "wide", "collapse"), + }, + ), + ( + "Timestamps", + { + "fields": ("created_at", "modified_at"), + "classes": ("card",), + }, + ), + ) - @admin.display(description='Machine', ordering='machine__id') + list_filter = ("status", "exit_code", "machine_id") + ordering = ["-created_at"] + list_per_page = 100 + actions = ["kill_processes", "delete_selected"] + change_actions = ["kill_process"] + + def get_queryset(self, request): + return ( + super() + .get_queryset(request) + .select_related( + "machine", + "binary", + "iface", + "archiveresult__snapshot__crawl", + ) + .annotate( + runtime_sort=ExpressionWrapper( + Coalesce(F("ended_at"), Now()) - F("started_at"), + output_field=DurationField(), + ), + ) + ) + + def _terminate_processes(self, request, processes): + terminated = 0 + skipped = 0 + + for process in processes: + if process.status == Process.StatusChoices.EXITED or not process.is_running: + skipped += 1 + continue + if process.terminate(): + terminated += 1 + else: + skipped += 1 + + if terminated: + self.message_user( + request, + f"Killed {terminated} running process{'es' if terminated != 1 else ''}.", + level=messages.SUCCESS, + ) + if skipped: + self.message_user( + request, + f"Skipped {skipped} process{'es' if skipped != 1 else ''} that were already exited.", + level=messages.INFO, + ) + + return terminated, skipped + + @admin.action(description="Kill selected processes") + def kill_processes(self, request, queryset): + self._terminate_processes(request, queryset) + + @action( + label="Kill", + description="Kill this process if it is still running", + attrs={"class": "deletelink"}, + ) + def kill_process(self, request, obj): + self._terminate_processes(request, [obj]) + return redirect("admin:machine_process_change", obj.pk) + + @admin.display(description="Machine", ordering="machine__id") def machine_info(self, process): return format_html( '[{}]   {}', - process.machine.id, str(process.machine.id)[:8], process.machine.hostname, + process.machine.id, + str(process.machine.id)[:8], + process.machine.hostname, ) - @admin.display(description='Binary', ordering='binary__name') + @admin.display(description="Binary", ordering="binary__name") def binary_info(self, process): if not process.binary: - return '-' + return "-" return format_html( '{} v{}', - process.binary.id, process.binary.name, process.binary.version, + process.binary.id, + process.binary.name, + process.binary.version, ) - @admin.display(description='Binary', ordering='binary__name') + @admin.display(description="Binary", ordering="binary__name") def binary_link(self, process): return self.binary_info(process) - @admin.display(description='Network Interface', ordering='iface__id') + @admin.display(description="Network Interface", ordering="iface__id") def iface_link(self, process): if not process.iface: - return '-' + return "-" return format_html( '{} {}', process.iface.id, @@ -231,25 +545,112 @@ class ProcessAdmin(BaseModelAdmin): process.iface.iface or process.iface.ip_public or process.iface.ip_local, ) - @admin.display(description='ArchiveResult') + @admin.display(description="ArchiveResult", ordering="archiveresult__plugin") def archiveresult_link(self, process): - if not hasattr(process, 'archiveresult'): - return '-' + if not hasattr(process, "archiveresult"): + return "-" ar = process.archiveresult return format_html( - '{} → {}', - ar.id, ar.plugin, ar.snapshot.url[:50], + '{} ← {}', + ar.id, + ar.snapshot.url[:50], + ar.plugin, ) - @admin.display(description='Command') + @admin.display(description="Snapshot", ordering="archiveresult__snapshot__id") + def snapshot_link(self, process): + ar = getattr(process, "archiveresult", None) + snapshot = getattr(ar, "snapshot", None) + if not snapshot: + return "-" + return format_html( + '{}', + snapshot.id, + str(snapshot.id)[:8], + ) + + @admin.display(description="Crawl", ordering="archiveresult__snapshot__crawl__id") + def crawl_link(self, process): + ar = getattr(process, "archiveresult", None) + snapshot = getattr(ar, "snapshot", None) + crawl = getattr(snapshot, "crawl", None) + if not crawl: + return "-" + return format_html( + '{}', + crawl.id, + str(crawl.id)[:8], + ) + + @admin.display(description="Command", ordering="cmd") def cmd_str(self, process): if not process.cmd: - return '-' - cmd = ' '.join(process.cmd[:3]) if isinstance(process.cmd, list) else str(process.cmd) + return "-" + cmd = " ".join(process.cmd[:3]) if isinstance(process.cmd, list) else str(process.cmd) if len(process.cmd) > 3: - cmd += ' ...' + cmd += " ..." return format_html('{}', cmd[:80]) + @admin.display(description="Duration", ordering="runtime_sort") + def duration_display(self, process): + return _format_process_duration_seconds(process.started_at, process.ended_at) + + @admin.display(description="Output", ordering="archiveresult__output_size") + def output_summary(self, process): + output_files = getattr(getattr(process, "archiveresult", None), "output_files", {}) or {} + + if isinstance(output_files, str): + try: + output_files = json.loads(output_files) + except Exception: + output_files = {} + + file_count = 0 + total_bytes = 0 + + if isinstance(output_files, dict): + file_count = len(output_files) + items = output_files.values() + elif isinstance(output_files, (list, tuple, set)): + file_count = len(output_files) + items = output_files + else: + items = () + + for metadata in items: + if not isinstance(metadata, dict): + continue + size = metadata.get("size", 0) + try: + total_bytes += int(size or 0) + except (TypeError, ValueError): + continue + + file_label = "file" if file_count == 1 else "files" + return format_html( + '{} {} • {}', + file_count, + file_label, + printable_filesize(total_bytes), + ) + + @admin.display(description="Command") + def cmd_display(self, process): + if not process.cmd: + return "-" + if isinstance(process.cmd, list): + cmd = shlex.join(str(arg) for arg in process.cmd) + else: + cmd = str(process.cmd) + return _render_copy_block(cmd) + + @admin.display(description="Environment") + def env_display(self, process): + env_text = env_to_dotenv_text(process.env) + if not env_text: + return "-" + return _render_copy_block(env_text, multiline=True) + def register_admin(admin_site): admin_site.register(Machine, MachineAdmin) diff --git a/archivebox/machine/apps.py b/archivebox/machine/apps.py index b3287409..f4834e4c 100644 --- a/archivebox/machine/apps.py +++ b/archivebox/machine/apps.py @@ -1,24 +1,25 @@ -__package__ = 'archivebox.machine' +__package__ = "archivebox.machine" from django.apps import AppConfig class MachineConfig(AppConfig): - default_auto_field = 'django.db.models.BigAutoField' + default_auto_field = "django.db.models.BigAutoField" - name = 'archivebox.machine' - label = 'machine' # Explicit label for migrations - verbose_name = 'Machine Info' + name = "archivebox.machine" + label = "machine" # Explicit label for migrations + verbose_name = "Machine Info" def ready(self): """Import models to register state machines with the registry""" import sys # Skip during makemigrations to avoid premature state machine access - if 'makemigrations' not in sys.argv: + if "makemigrations" not in sys.argv: from archivebox.machine import models # noqa: F401 def register_admin(admin_site): from archivebox.machine.admin import register_admin + register_admin(admin_site) diff --git a/archivebox/machine/detect.py b/archivebox/machine/detect.py index 9d44df0d..c3960f14 100644 --- a/archivebox/machine/detect.py +++ b/archivebox/machine/detect.py @@ -2,7 +2,7 @@ import os import json import socket import urllib.request -from typing import Dict, Any +from typing import Any from pathlib import Path import subprocess import platform @@ -10,34 +10,35 @@ import tempfile from datetime import datetime import psutil -import machineid # https://github.com/keygen-sh/py-machineid +import machineid # https://github.com/keygen-sh/py-machineid from rich import print PACKAGE_DIR = Path(__file__).parent DATA_DIR = Path(os.getcwd()).resolve() + def get_vm_info(): - hw_in_docker = bool(os.getenv('IN_DOCKER', False) in ('1', 'true', 'True', 'TRUE')) + hw_in_docker = bool(os.getenv("IN_DOCKER", False) in ("1", "true", "True", "TRUE")) hw_in_vm = False try: # check for traces of docker/containerd/podman in cgroup - with open('/proc/self/cgroup', 'r') as procfile: + with open("/proc/self/cgroup") as procfile: for line in procfile: cgroup = line.strip() # .split('/', 1)[-1].lower() - if 'docker' in cgroup or 'containerd' in cgroup or 'podman' in cgroup: + if "docker" in cgroup or "containerd" in cgroup or "podman" in cgroup: hw_in_docker = True except Exception: pass - - hw_manufacturer = 'Docker' if hw_in_docker else 'Unknown' - hw_product = 'Container' if hw_in_docker else 'Unknown' + + hw_manufacturer = "Docker" if hw_in_docker else "Unknown" + hw_product = "Container" if hw_in_docker else "Unknown" hw_uuid = machineid.id() - - if platform.system().lower() == 'darwin': + + if platform.system().lower() == "darwin": # Get macOS machine info - hw_manufacturer = 'Apple' - hw_product = 'Mac' + hw_manufacturer = "Apple" + hw_product = "Mac" try: # Hardware: # Hardware Overview: @@ -48,14 +49,14 @@ def get_vm_info(): # Serial Number (system): M230YYTD77 # Hardware UUID: 39A12B50-1972-5910-8BEE-235AD20C8EE3 # ... - result = subprocess.run(['system_profiler', 'SPHardwareDataType'], capture_output=True, text=True, check=True) - for line in result.stdout.split('\n'): - if 'Model Name:' in line: - hw_product = line.split(':', 1)[-1].strip() - elif 'Model Identifier:' in line: - hw_product += ' ' + line.split(':', 1)[-1].strip() - elif 'Hardware UUID:' in line: - hw_uuid = line.split(':', 1)[-1].strip() + result = subprocess.run(["system_profiler", "SPHardwareDataType"], capture_output=True, text=True, check=True) + for line in result.stdout.split("\n"): + if "Model Name:" in line: + hw_product = line.split(":", 1)[-1].strip() + elif "Model Identifier:" in line: + hw_product += " " + line.split(":", 1)[-1].strip() + elif "Hardware UUID:" in line: + hw_uuid = line.split(":", 1)[-1].strip() except Exception: pass else: @@ -72,25 +73,25 @@ def get_vm_info(): # UUID: fb65f41c-ec24-4539-beaf-f941903bdb2c # ... # Family: DigitalOcean_Droplet - dmidecode = subprocess.run(['dmidecode', '-t', 'system'], capture_output=True, text=True, check=True) - for line in dmidecode.stdout.split('\n'): - if 'Manufacturer:' in line: - hw_manufacturer = line.split(':', 1)[-1].strip() - elif 'Product Name:' in line: - hw_product = line.split(':', 1)[-1].strip() - elif 'UUID:' in line: - hw_uuid = line.split(':', 1)[-1].strip() + dmidecode = subprocess.run(["dmidecode", "-t", "system"], capture_output=True, text=True, check=True) + for line in dmidecode.stdout.split("\n"): + if "Manufacturer:" in line: + hw_manufacturer = line.split(":", 1)[-1].strip() + elif "Product Name:" in line: + hw_product = line.split(":", 1)[-1].strip() + elif "UUID:" in line: + hw_uuid = line.split(":", 1)[-1].strip() except Exception: pass # Check for VM fingerprint in manufacturer/product name - if 'qemu' in hw_product.lower() or 'vbox' in hw_product.lower() or 'lxc' in hw_product.lower() or 'vm' in hw_product.lower(): + if "qemu" in hw_product.lower() or "vbox" in hw_product.lower() or "lxc" in hw_product.lower() or "vm" in hw_product.lower(): hw_in_vm = True - + # Check for QEMU explicitly in pmap output try: - result = subprocess.run(['pmap', '1'], capture_output=True, text=True, check=True) - if 'qemu' in result.stdout.lower(): + result = subprocess.run(["pmap", "1"], capture_output=True, text=True, check=True) + if "qemu" in result.stdout.lower(): hw_in_vm = True except Exception: pass @@ -103,17 +104,18 @@ def get_vm_info(): "hw_uuid": hw_uuid, } + def get_public_ip() -> str: def fetch_url(url: str) -> str: with urllib.request.urlopen(url, timeout=5) as response: - return response.read().decode('utf-8').strip() + return response.read().decode("utf-8").strip() def fetch_dns(pubip_lookup_host: str) -> str: return socket.gethostbyname(pubip_lookup_host).strip() methods = [ (lambda: fetch_url("https://ipinfo.io/ip"), lambda r: r), - (lambda: fetch_url("https://api.ipify.org?format=json"), lambda r: json.loads(r)['ip']), + (lambda: fetch_url("https://api.ipify.org?format=json"), lambda r: json.loads(r)["ip"]), (lambda: fetch_dns("myip.opendns.com"), lambda r: r), (lambda: fetch_url("http://whatismyip.akamai.com/"), lambda r: r), # try HTTP as final fallback in case of TLS/system time errors ] @@ -128,68 +130,72 @@ def get_public_ip() -> str: raise Exception("Could not determine public IP address") -def get_local_ip(remote_ip: str='1.1.1.1', remote_port: int=80) -> str: + +def get_local_ip(remote_ip: str = "1.1.1.1", remote_port: int = 80) -> str: try: with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s: s.connect((remote_ip, remote_port)) return s.getsockname()[0] except Exception: pass - return '127.0.0.1' + return "127.0.0.1" + ip_addrs = lambda addrs: (a for a in addrs if a.family == socket.AF_INET) mac_addrs = lambda addrs: (a for a in addrs if a.family == psutil.AF_LINK) + def get_isp_info(ip=None): # Get public IP try: - ip = ip or urllib.request.urlopen('https://api.ipify.org').read().decode('utf8') + ip = ip or urllib.request.urlopen("https://api.ipify.org").read().decode("utf8") except Exception: pass - + # Get ISP name, city, and country data = {} try: - url = f'https://ipapi.co/{ip}/json/' + url = f"https://ipapi.co/{ip}/json/" response = urllib.request.urlopen(url) data = json.loads(response.read().decode()) except Exception: pass - - isp = data.get('org', 'Unknown') - city = data.get('city', 'Unknown') - region = data.get('region', 'Unknown') - country = data.get('country_name', 'Unknown') - + + isp = data.get("org", "Unknown") + city = data.get("city", "Unknown") + region = data.get("region", "Unknown") + country = data.get("country_name", "Unknown") + # Get system DNS resolver servers dns_server = None try: - result = subprocess.run(['dig', 'example.com', 'A'], capture_output=True, text=True, check=True).stdout - dns_server = result.split(';; SERVER: ', 1)[-1].split('\n')[0].split('#')[0].strip() + result = subprocess.run(["dig", "example.com", "A"], capture_output=True, text=True, check=True).stdout + dns_server = result.split(";; SERVER: ", 1)[-1].split("\n")[0].split("#")[0].strip() except Exception: try: - dns_server = Path('/etc/resolv.conf').read_text().split('nameserver ', 1)[-1].split('\n')[0].strip() + dns_server = Path("/etc/resolv.conf").read_text().split("nameserver ", 1)[-1].split("\n")[0].strip() except Exception: - dns_server = '127.0.0.1' - print(f'[red]:warning: WARNING: Could not determine DNS server, using {dns_server}[/red]') - + dns_server = "127.0.0.1" + print(f"[red]:warning: WARNING: Could not determine DNS server, using {dns_server}[/red]") + # Get DNS resolver's ISP name # url = f'https://ipapi.co/{dns_server}/json/' # dns_isp = json.loads(urllib.request.urlopen(url).read().decode()).get('org', 'Unknown') - + return { - 'isp': isp, - 'city': city, - 'region': region, - 'country': country, - 'dns_server': dns_server, + "isp": isp, + "city": city, + "region": region, + "country": country, + "dns_server": dns_server, # 'net_dns_isp': dns_isp, } - -def get_host_network() -> Dict[str, Any]: + + +def get_host_network() -> dict[str, Any]: default_gateway_local_ip = get_local_ip() gateways = psutil.net_if_addrs() - + for interface, ips in gateways.items(): for local_ip in ip_addrs(ips): if default_gateway_local_ip == local_ip.address: @@ -204,20 +210,20 @@ def get_host_network() -> Dict[str, Any]: # "is_behind_nat": local_ip.address != public_ip, **get_isp_info(public_ip), } - + raise Exception("Could not determine host network info") -def get_os_info() -> Dict[str, Any]: +def get_os_info() -> dict[str, Any]: os_release = platform.release() - if platform.system().lower() == 'darwin': - os_release = 'macOS ' + platform.mac_ver()[0] + if platform.system().lower() == "darwin": + os_release = "macOS " + platform.mac_ver()[0] else: try: - os_release = subprocess.run(['lsb_release', '-ds'], capture_output=True, text=True, check=True).stdout.strip() + os_release = subprocess.run(["lsb_release", "-ds"], capture_output=True, text=True, check=True).stdout.strip() except Exception: pass - + return { "os_arch": platform.machine(), "os_family": platform.system().lower(), @@ -226,7 +232,8 @@ def get_os_info() -> Dict[str, Any]: "os_release": os_release, } -def get_host_stats() -> Dict[str, Any]: + +def get_host_stats() -> dict[str, Any]: try: with tempfile.TemporaryDirectory() as tmp_dir: tmp_usage = psutil.disk_usage(str(tmp_dir)) @@ -267,24 +274,23 @@ def get_host_stats() -> Dict[str, Any]: except Exception: return {} -def get_host_immutable_info(host_info: Dict[str, Any]) -> Dict[str, Any]: - return { - key: value - for key, value in host_info.items() - if key in ['guid', 'net_mac', 'os_family', 'cpu_arch'] - } - + +def get_host_immutable_info(host_info: dict[str, Any]) -> dict[str, Any]: + return {key: value for key, value in host_info.items() if key in ["guid", "net_mac", "os_family", "cpu_arch"]} + + def get_host_guid() -> str: - return machineid.hashed_id('archivebox') + return machineid.hashed_id("archivebox") + # Example usage if __name__ == "__main__": host_info = { - 'guid': get_host_guid(), - 'os': get_os_info(), - 'vm': get_vm_info(), - 'net': get_host_network(), - 'stats': get_host_stats(), + "guid": get_host_guid(), + "os": get_os_info(), + "vm": get_vm_info(), + "net": get_host_network(), + "stats": get_host_stats(), } print(host_info) diff --git a/archivebox/machine/env_utils.py b/archivebox/machine/env_utils.py new file mode 100644 index 00000000..06a42563 --- /dev/null +++ b/archivebox/machine/env_utils.py @@ -0,0 +1,51 @@ +__package__ = "archivebox.machine" + +import json +import shlex +from typing import Any + + +SENSITIVE_ENV_KEY_PARTS = ("KEY", "TOKEN", "SECRET") + + +def stringify_env_value(value: Any) -> str: + if value is None: + return "" + if isinstance(value, str): + return value + if isinstance(value, bool): + return "True" if value else "False" + return json.dumps(value, separators=(",", ":")) + + +def is_redacted_env_key(key: str) -> bool: + upper_key = str(key or "").upper() + return any(part in upper_key for part in SENSITIVE_ENV_KEY_PARTS) + + +def redact_env(env: dict[str, Any] | None) -> dict[str, Any]: + if not isinstance(env, dict): + return {} + return { + str(key): value + for key, value in env.items() + if key is not None and not is_redacted_env_key(str(key)) + } + + +def env_to_dotenv_text(env: dict[str, Any] | None) -> str: + redacted_env = redact_env(env) + return "\n".join( + f"{key}={shlex.quote(stringify_env_value(value))}" + for key, value in sorted(redacted_env.items()) + if value is not None + ) + + +def env_to_shell_exports(env: dict[str, Any] | None) -> str: + redacted_env = redact_env(env) + return " ".join( + f"{key}={shlex.quote(stringify_env_value(value))}" + for key, value in sorted(redacted_env.items()) + if value is not None + ) diff --git a/archivebox/machine/migrations/0001_initial.py b/archivebox/machine/migrations/0001_initial.py index 05d0b3b9..2a0f018c 100644 --- a/archivebox/machine/migrations/0001_initial.py +++ b/archivebox/machine/migrations/0001_initial.py @@ -8,11 +8,9 @@ from archivebox.uuid_compat import uuid7 class Migration(migrations.Migration): - initial = True - dependencies = [ - ] + dependencies = [] operations = [ migrations.SeparateDatabaseAndState( @@ -105,87 +103,143 @@ class Migration(migrations.Migration): DROP TABLE IF EXISTS machine_binary; DROP TABLE IF EXISTS machine_networkinterface; DROP TABLE IF EXISTS machine_machine; - """ + """, ), ], state_operations=[ migrations.CreateModel( - name='Machine', + name="Machine", fields=[ - ('id', models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)), - ('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)), - ('modified_at', models.DateTimeField(auto_now=True)), - ('guid', models.CharField(default=None, editable=False, max_length=64, unique=True)), - ('hostname', models.CharField(default=None, max_length=63)), - ('hw_in_docker', models.BooleanField(default=False)), - ('hw_in_vm', models.BooleanField(default=False)), - ('hw_manufacturer', models.CharField(default=None, max_length=63)), - ('hw_product', models.CharField(default=None, max_length=63)), - ('hw_uuid', models.CharField(default=None, max_length=255)), - ('os_arch', models.CharField(default=None, max_length=15)), - ('os_family', models.CharField(default=None, max_length=15)), - ('os_platform', models.CharField(default=None, max_length=63)), - ('os_release', models.CharField(default=None, max_length=63)), - ('os_kernel', models.CharField(default=None, max_length=255)), - ('stats', models.JSONField(blank=True, default=dict, null=True)), - ('config', models.JSONField(blank=True, default=dict, help_text='Machine-specific config overrides (e.g., resolved binary paths like WGET_BINARY)', null=True)), - ('num_uses_succeeded', models.PositiveIntegerField(default=0)), - ('num_uses_failed', models.PositiveIntegerField(default=0)), + ("id", models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)), + ("created_at", models.DateTimeField(db_index=True, default=django.utils.timezone.now)), + ("modified_at", models.DateTimeField(auto_now=True)), + ("guid", models.CharField(default=None, editable=False, max_length=64, unique=True)), + ("hostname", models.CharField(default=None, max_length=63)), + ("hw_in_docker", models.BooleanField(default=False)), + ("hw_in_vm", models.BooleanField(default=False)), + ("hw_manufacturer", models.CharField(default=None, max_length=63)), + ("hw_product", models.CharField(default=None, max_length=63)), + ("hw_uuid", models.CharField(default=None, max_length=255)), + ("os_arch", models.CharField(default=None, max_length=15)), + ("os_family", models.CharField(default=None, max_length=15)), + ("os_platform", models.CharField(default=None, max_length=63)), + ("os_release", models.CharField(default=None, max_length=63)), + ("os_kernel", models.CharField(default=None, max_length=255)), + ("stats", models.JSONField(blank=True, default=dict, null=True)), + ( + "config", + models.JSONField( + blank=True, + default=dict, + help_text="Machine-specific config overrides (e.g., resolved binary paths like WGET_BINARY)", + null=True, + ), + ), + ("num_uses_succeeded", models.PositiveIntegerField(default=0)), + ("num_uses_failed", models.PositiveIntegerField(default=0)), ], options={ - 'app_label': 'machine', + "app_label": "machine", }, ), migrations.CreateModel( - name='NetworkInterface', + name="NetworkInterface", fields=[ - ('id', models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)), - ('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)), - ('modified_at', models.DateTimeField(auto_now=True)), - ('mac_address', models.CharField(default=None, editable=False, max_length=17)), - ('ip_public', models.GenericIPAddressField(default=None, editable=False)), - ('ip_local', models.GenericIPAddressField(default=None, editable=False)), - ('dns_server', models.GenericIPAddressField(default=None, editable=False)), - ('hostname', models.CharField(default=None, max_length=63)), - ('iface', models.CharField(default=None, max_length=15)), - ('isp', models.CharField(default=None, max_length=63)), - ('city', models.CharField(default=None, max_length=63)), - ('region', models.CharField(default=None, max_length=63)), - ('country', models.CharField(default=None, max_length=63)), - ('machine', models.ForeignKey(default=None, on_delete=django.db.models.deletion.CASCADE, to='machine.machine')), - ('num_uses_succeeded', models.PositiveIntegerField(default=0)), - ('num_uses_failed', models.PositiveIntegerField(default=0)), + ("id", models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)), + ("created_at", models.DateTimeField(db_index=True, default=django.utils.timezone.now)), + ("modified_at", models.DateTimeField(auto_now=True)), + ("mac_address", models.CharField(default=None, editable=False, max_length=17)), + ("ip_public", models.GenericIPAddressField(default=None, editable=False)), + ("ip_local", models.GenericIPAddressField(default=None, editable=False)), + ("dns_server", models.GenericIPAddressField(default=None, editable=False)), + ("hostname", models.CharField(default=None, max_length=63)), + ("iface", models.CharField(default=None, max_length=15)), + ("isp", models.CharField(default=None, max_length=63)), + ("city", models.CharField(default=None, max_length=63)), + ("region", models.CharField(default=None, max_length=63)), + ("country", models.CharField(default=None, max_length=63)), + ("machine", models.ForeignKey(default=None, on_delete=django.db.models.deletion.CASCADE, to="machine.machine")), + ("num_uses_succeeded", models.PositiveIntegerField(default=0)), + ("num_uses_failed", models.PositiveIntegerField(default=0)), ], options={ - 'unique_together': {('machine', 'ip_public', 'ip_local', 'mac_address', 'dns_server')}, - 'app_label': 'machine', + "unique_together": {("machine", "ip_public", "ip_local", "mac_address", "dns_server")}, + "app_label": "machine", }, ), migrations.CreateModel( - name='Binary', + name="Binary", fields=[ - ('id', models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)), - ('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)), - ('modified_at', models.DateTimeField(auto_now=True)), - ('name', models.CharField(blank=True, db_index=True, default='', max_length=63)), - ('binproviders', models.CharField(blank=True, default='env', help_text='Comma-separated list of allowed providers: apt,brew,pip,npm,env', max_length=127)), - ('overrides', models.JSONField(blank=True, default=dict, help_text="Provider-specific overrides: {'apt': {'install_args': ['pkg']}, ...}")), - ('binprovider', models.CharField(blank=True, default='', help_text='Provider that successfully installed this binary', max_length=31)), - ('abspath', models.CharField(blank=True, default='', max_length=255)), - ('version', models.CharField(blank=True, default='', max_length=32)), - ('sha256', models.CharField(blank=True, default='', max_length=64)), - ('status', models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('succeeded', 'Succeeded'), ('failed', 'Failed')], db_index=True, default='queued', max_length=16)), - ('retry_at', models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, help_text='When to retry this binary installation', null=True)), - ('output_dir', models.CharField(blank=True, default='', help_text='Directory where installation hook logs are stored', max_length=255)), - ('machine', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='machine.machine')), - ('num_uses_succeeded', models.PositiveIntegerField(default=0)), - ('num_uses_failed', models.PositiveIntegerField(default=0)), + ("id", models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)), + ("created_at", models.DateTimeField(db_index=True, default=django.utils.timezone.now)), + ("modified_at", models.DateTimeField(auto_now=True)), + ("name", models.CharField(blank=True, db_index=True, default="", max_length=63)), + ( + "binproviders", + models.CharField( + blank=True, + default="env", + help_text="Comma-separated list of allowed providers: apt,brew,pip,npm,env", + max_length=127, + ), + ), + ( + "overrides", + models.JSONField( + blank=True, + default=dict, + help_text="Provider-specific overrides: {'apt': {'install_args': ['pkg']}, ...}", + ), + ), + ( + "binprovider", + models.CharField( + blank=True, + default="", + help_text="Provider that successfully installed this binary", + max_length=31, + ), + ), + ("abspath", models.CharField(blank=True, default="", max_length=255)), + ("version", models.CharField(blank=True, default="", max_length=32)), + ("sha256", models.CharField(blank=True, default="", max_length=64)), + ( + "status", + models.CharField( + choices=[("queued", "Queued"), ("started", "Started"), ("succeeded", "Succeeded"), ("failed", "Failed")], + db_index=True, + default="queued", + max_length=16, + ), + ), + ( + "retry_at", + models.DateTimeField( + blank=True, + db_index=True, + default=django.utils.timezone.now, + help_text="When to retry this binary installation", + null=True, + ), + ), + ( + "output_dir", + models.CharField( + blank=True, + default="", + help_text="Directory where installation hook logs are stored", + max_length=255, + ), + ), + ("machine", models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to="machine.machine")), + ("num_uses_succeeded", models.PositiveIntegerField(default=0)), + ("num_uses_failed", models.PositiveIntegerField(default=0)), ], options={ - 'verbose_name': 'Binary', - 'verbose_name_plural': 'Binaries', - 'unique_together': {('machine', 'name', 'abspath', 'version', 'sha256')}, - 'app_label': 'machine', + "verbose_name": "Binary", + "verbose_name_plural": "Binaries", + "unique_together": {("machine", "name", "abspath", "version", "sha256")}, + "app_label": "machine", }, ), ], diff --git a/archivebox/machine/migrations/0005_converge_binary_model.py b/archivebox/machine/migrations/0005_converge_binary_model.py index e7e3a733..a83c98ad 100644 --- a/archivebox/machine/migrations/0005_converge_binary_model.py +++ b/archivebox/machine/migrations/0005_converge_binary_model.py @@ -16,17 +16,17 @@ def converge_binary_table(apps, schema_editor): cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name IN ('machine_installedbinary', 'machine_binary')") existing_tables = {row[0] for row in cursor.fetchall()} - print(f'DEBUG 0005: Existing tables: {existing_tables}') + print(f"DEBUG 0005: Existing tables: {existing_tables}") # Drop old InstalledBinary table if it exists (0.8.6rc0 path) - if 'machine_installedbinary' in existing_tables: - print('✓ Dropping machine_installedbinary table (0.8.6rc0 divergence)') + if "machine_installedbinary" in existing_tables: + print("✓ Dropping machine_installedbinary table (0.8.6rc0 divergence)") cursor.execute("DROP TABLE IF EXISTS machine_installedbinary") # Create Binary table if it doesn't exist # This handles the case where 0.8.6rc0's 0001_initial didn't create it - if 'machine_binary' not in existing_tables: - print('✓ Creating machine_binary table with correct schema') + if "machine_binary" not in existing_tables: + print("✓ Creating machine_binary table with correct schema") cursor.execute(""" CREATE TABLE machine_binary ( id TEXT PRIMARY KEY NOT NULL, @@ -53,15 +53,14 @@ def converge_binary_table(apps, schema_editor): cursor.execute("CREATE INDEX machine_binary_name_idx ON machine_binary(name)") cursor.execute("CREATE INDEX machine_binary_abspath_idx ON machine_binary(abspath)") - print('✓ machine_binary table created') + print("✓ machine_binary table created") else: - print('✓ machine_binary table already exists') + print("✓ machine_binary table already exists") class Migration(migrations.Migration): - dependencies = [ - ('machine', '0001_initial'), + ("machine", "0001_initial"), ] operations = [ diff --git a/archivebox/machine/migrations/0006_process.py b/archivebox/machine/migrations/0006_process.py index b989d482..eb234756 100644 --- a/archivebox/machine/migrations/0006_process.py +++ b/archivebox/machine/migrations/0006_process.py @@ -8,39 +8,95 @@ from archivebox.uuid_compat import uuid7 class Migration(migrations.Migration): - dependencies = [ - ('machine', '0005_converge_binary_model'), + ("machine", "0005_converge_binary_model"), ] operations = [ migrations.CreateModel( - name='Process', + name="Process", fields=[ - ('id', models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)), - ('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)), - ('modified_at', models.DateTimeField(auto_now=True)), - ('pwd', models.CharField(blank=True, default='', help_text='Working directory for process execution', max_length=512)), - ('cmd', models.JSONField(blank=True, default=list, help_text='Command as array of arguments')), - ('env', models.JSONField(blank=True, default=dict, help_text='Environment variables for process')), - ('timeout', models.IntegerField(default=120, help_text='Timeout in seconds')), - ('pid', models.IntegerField(blank=True, default=None, help_text='OS process ID', null=True)), - ('exit_code', models.IntegerField(blank=True, default=None, help_text='Process exit code (0 = success)', null=True)), - ('stdout', models.TextField(blank=True, default='', help_text='Standard output from process')), - ('stderr', models.TextField(blank=True, default='', help_text='Standard error from process')), - ('started_at', models.DateTimeField(blank=True, default=None, help_text='When process was launched', null=True)), - ('ended_at', models.DateTimeField(blank=True, default=None, help_text='When process completed/terminated', null=True)), - ('url', models.URLField(blank=True, default=None, help_text='Connection URL (CDP endpoint, sonic server, etc.)', max_length=2048, null=True)), - ('status', models.CharField(choices=[('queued', 'Queued'), ('running', 'Running'), ('exited', 'Exited')], db_index=True, default='queued', max_length=16)), - ('retry_at', models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, help_text='When to retry this process', null=True)), - ('binary', models.ForeignKey(blank=True, help_text='Binary used by this process', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='process_set', to='machine.binary')), - ('iface', models.ForeignKey(blank=True, help_text='Network interface used by this process', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='process_set', to='machine.networkinterface')), - ('machine', models.ForeignKey(help_text='Machine where this process executed', on_delete=django.db.models.deletion.CASCADE, related_name='process_set', to='machine.machine')), + ("id", models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)), + ("created_at", models.DateTimeField(db_index=True, default=django.utils.timezone.now)), + ("modified_at", models.DateTimeField(auto_now=True)), + ("pwd", models.CharField(blank=True, default="", help_text="Working directory for process execution", max_length=512)), + ("cmd", models.JSONField(blank=True, default=list, help_text="Command as array of arguments")), + ("env", models.JSONField(blank=True, default=dict, help_text="Environment variables for process")), + ("timeout", models.IntegerField(default=120, help_text="Timeout in seconds")), + ("pid", models.IntegerField(blank=True, default=None, help_text="OS process ID", null=True)), + ("exit_code", models.IntegerField(blank=True, default=None, help_text="Process exit code (0 = success)", null=True)), + ("stdout", models.TextField(blank=True, default="", help_text="Standard output from process")), + ("stderr", models.TextField(blank=True, default="", help_text="Standard error from process")), + ("started_at", models.DateTimeField(blank=True, default=None, help_text="When process was launched", null=True)), + ("ended_at", models.DateTimeField(blank=True, default=None, help_text="When process completed/terminated", null=True)), + ( + "url", + models.URLField( + blank=True, + default=None, + help_text="Connection URL (CDP endpoint, sonic server, etc.)", + max_length=2048, + null=True, + ), + ), + ( + "status", + models.CharField( + choices=[("queued", "Queued"), ("running", "Running"), ("exited", "Exited")], + db_index=True, + default="queued", + max_length=16, + ), + ), + ( + "retry_at", + models.DateTimeField( + blank=True, + db_index=True, + default=django.utils.timezone.now, + help_text="When to retry this process", + null=True, + ), + ), + ( + "binary", + models.ForeignKey( + blank=True, + help_text="Binary used by this process", + null=True, + on_delete=django.db.models.deletion.SET_NULL, + related_name="process_set", + to="machine.binary", + ), + ), + ( + "iface", + models.ForeignKey( + blank=True, + help_text="Network interface used by this process", + null=True, + on_delete=django.db.models.deletion.SET_NULL, + related_name="process_set", + to="machine.networkinterface", + ), + ), + ( + "machine", + models.ForeignKey( + help_text="Machine where this process executed", + on_delete=django.db.models.deletion.CASCADE, + related_name="process_set", + to="machine.machine", + ), + ), ], options={ - 'verbose_name': 'Process', - 'verbose_name_plural': 'Processes', - 'indexes': [models.Index(fields=['machine', 'status', 'retry_at'], name='machine_pro_machine_5e3a87_idx'), models.Index(fields=['binary', 'exit_code'], name='machine_pro_binary__7bd19c_idx')], + "verbose_name": "Process", + "verbose_name_plural": "Processes", + "indexes": [ + models.Index(fields=["machine", "status", "retry_at"], name="machine_pro_machine_5e3a87_idx"), + models.Index(fields=["binary", "exit_code"], name="machine_pro_binary__7bd19c_idx"), + ], }, ), ] diff --git a/archivebox/machine/migrations/0007_add_process_type_and_parent.py b/archivebox/machine/migrations/0007_add_process_type_and_parent.py index b63fa400..9d81a773 100644 --- a/archivebox/machine/migrations/0007_add_process_type_and_parent.py +++ b/archivebox/machine/migrations/0007_add_process_type_and_parent.py @@ -5,20 +5,38 @@ from django.db import migrations, models class Migration(migrations.Migration): - dependencies = [ - ('machine', '0006_process'), + ("machine", "0006_process"), ] operations = [ migrations.AddField( - model_name='process', - name='parent', - field=models.ForeignKey(blank=True, help_text='Parent process that spawned this process', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='children', to='machine.process'), + model_name="process", + name="parent", + field=models.ForeignKey( + blank=True, + help_text="Parent process that spawned this process", + null=True, + on_delete=django.db.models.deletion.SET_NULL, + related_name="children", + to="machine.process", + ), ), migrations.AddField( - model_name='process', - name='process_type', - field=models.CharField(choices=[('supervisord', 'Supervisord'), ('orchestrator', 'Orchestrator'), ('worker', 'Worker'), ('cli', 'CLI'), ('binary', 'Binary')], db_index=True, default='cli', help_text='Type of process (cli, worker, orchestrator, binary, supervisord)', max_length=16), + model_name="process", + name="process_type", + field=models.CharField( + choices=[ + ("supervisord", "Supervisord"), + ("orchestrator", "Orchestrator"), + ("worker", "Worker"), + ("cli", "CLI"), + ("binary", "Binary"), + ], + db_index=True, + default="cli", + help_text="Type of process (cli, worker, orchestrator, binary, supervisord)", + max_length=16, + ), ), ] diff --git a/archivebox/machine/migrations/0008_add_worker_type_field.py b/archivebox/machine/migrations/0008_add_worker_type_field.py index 0588e60c..905870cf 100644 --- a/archivebox/machine/migrations/0008_add_worker_type_field.py +++ b/archivebox/machine/migrations/0008_add_worker_type_field.py @@ -4,15 +4,20 @@ from django.db import migrations, models class Migration(migrations.Migration): - dependencies = [ - ('machine', '0007_add_process_type_and_parent'), + ("machine", "0007_add_process_type_and_parent"), ] operations = [ migrations.AddField( - model_name='process', - name='worker_type', - field=models.CharField(blank=True, db_index=True, default='', help_text='Worker type name for WORKER processes (crawl, snapshot, archiveresult)', max_length=32), + model_name="process", + name="worker_type", + field=models.CharField( + blank=True, + db_index=True, + default="", + help_text="Worker type name for WORKER processes (crawl, snapshot, archiveresult)", + max_length=32, + ), ), ] diff --git a/archivebox/machine/migrations/0009_alter_binary_status.py b/archivebox/machine/migrations/0009_alter_binary_status.py index 88ed39ad..bbc27598 100644 --- a/archivebox/machine/migrations/0009_alter_binary_status.py +++ b/archivebox/machine/migrations/0009_alter_binary_status.py @@ -4,15 +4,19 @@ from django.db import migrations, models class Migration(migrations.Migration): - dependencies = [ - ('machine', '0008_add_worker_type_field'), + ("machine", "0008_add_worker_type_field"), ] operations = [ migrations.AlterField( - model_name='binary', - name='status', - field=models.CharField(choices=[('queued', 'Queued'), ('installed', 'Installed')], db_index=True, default='queued', max_length=16), + model_name="binary", + name="status", + field=models.CharField( + choices=[("queued", "Queued"), ("installed", "Installed")], + db_index=True, + default="queued", + max_length=16, + ), ), ] diff --git a/archivebox/machine/migrations/0010_alter_process_process_type.py b/archivebox/machine/migrations/0010_alter_process_process_type.py index ebf81294..477ea353 100644 --- a/archivebox/machine/migrations/0010_alter_process_process_type.py +++ b/archivebox/machine/migrations/0010_alter_process_process_type.py @@ -4,15 +4,27 @@ from django.db import migrations, models class Migration(migrations.Migration): - dependencies = [ - ('machine', '0009_alter_binary_status'), + ("machine", "0009_alter_binary_status"), ] operations = [ migrations.AlterField( - model_name='process', - name='process_type', - field=models.CharField(choices=[('supervisord', 'Supervisord'), ('orchestrator', 'Orchestrator'), ('worker', 'Worker'), ('cli', 'CLI'), ('hook', 'Hook'), ('binary', 'Binary')], db_index=True, default='cli', help_text='Type of process (cli, worker, orchestrator, binary, supervisord)', max_length=16), + model_name="process", + name="process_type", + field=models.CharField( + choices=[ + ("supervisord", "Supervisord"), + ("orchestrator", "Orchestrator"), + ("worker", "Worker"), + ("cli", "CLI"), + ("hook", "Hook"), + ("binary", "Binary"), + ], + db_index=True, + default="cli", + help_text="Type of process (cli, worker, orchestrator, binary, supervisord)", + max_length=16, + ), ), ] diff --git a/archivebox/machine/migrations/0011_remove_binary_output_dir.py b/archivebox/machine/migrations/0011_remove_binary_output_dir.py index acffbfcc..0a24dff1 100644 --- a/archivebox/machine/migrations/0011_remove_binary_output_dir.py +++ b/archivebox/machine/migrations/0011_remove_binary_output_dir.py @@ -6,17 +6,16 @@ def remove_output_dir_if_exists(apps, schema_editor): cursor.execute("PRAGMA table_info(machine_binary)") columns = {row[1] for row in cursor.fetchall()} - if 'output_dir' not in columns: + if "output_dir" not in columns: return - Binary = apps.get_model('machine', 'Binary') - schema_editor.remove_field(Binary, Binary._meta.get_field('output_dir')) + Binary = apps.get_model("machine", "Binary") + schema_editor.remove_field(Binary, Binary._meta.get_field("output_dir")) class Migration(migrations.Migration): - dependencies = [ - ('machine', '0010_alter_process_process_type'), + ("machine", "0010_alter_process_process_type"), ] operations = [ @@ -26,8 +25,8 @@ class Migration(migrations.Migration): ], state_operations=[ migrations.RemoveField( - model_name='binary', - name='output_dir', + model_name="binary", + name="output_dir", ), ], ), diff --git a/archivebox/machine/models.py b/archivebox/machine/models.py index 441b8cf1..1d90572f 100755 --- a/archivebox/machine/models.py +++ b/archivebox/machine/models.py @@ -1,6 +1,6 @@ from __future__ import annotations -__package__ = 'archivebox.machine' +__package__ = "archivebox.machine" import os import sys @@ -26,6 +26,7 @@ from .detect import get_host_guid, get_os_info, get_vm_info, get_host_network, g _psutil: Any | None = None try: import psutil as _psutil_import + PSUTIL_AVAILABLE = True except ImportError: PSUTIL_AVAILABLE = False @@ -48,35 +49,36 @@ NETWORK_INTERFACE_RECHECK_INTERVAL = 1 * 60 * 60 BINARY_RECHECK_INTERVAL = 1 * 30 * 60 PROCESS_RECHECK_INTERVAL = 60 # Re-validate every 60 seconds PID_REUSE_WINDOW = timedelta(hours=24) # Max age for considering a PID match valid +PROCESS_TIMEOUT_GRACE = timedelta(seconds=30) # Extra margin before force-cleaning timed-out RUNNING rows START_TIME_TOLERANCE = 5.0 # Seconds tolerance for start time matching LEGACY_MACHINE_CONFIG_KEYS = frozenset({"CHROMIUM_VERSION"}) -def _find_existing_binary_for_reference(machine: 'Machine', reference: str) -> 'Binary | None': - reference = str(reference or '').strip() +def _find_existing_binary_for_reference(machine: Machine, reference: str) -> Binary | None: + reference = str(reference or "").strip() if not reference: return None qs = Binary.objects.filter(machine=machine) - direct_match = qs.filter(abspath=reference).order_by('-modified_at').first() + direct_match = qs.filter(abspath=reference).order_by("-modified_at").first() if direct_match: return direct_match ref_name = Path(reference).name if ref_name: - named_match = qs.filter(name=ref_name).order_by('-modified_at').first() + named_match = qs.filter(name=ref_name).order_by("-modified_at").first() if named_match: return named_match - return qs.filter(name=reference).order_by('-modified_at').first() + return qs.filter(name=reference).order_by("-modified_at").first() def _get_process_binary_env_keys(plugin_name: str, hook_path: str, env: dict[str, Any] | None) -> list[str]: env = env or {} - plugin_name = str(plugin_name or '').strip() - hook_path = str(hook_path or '').strip() - plugin_key = plugin_name.upper().replace('-', '_') + plugin_name = str(plugin_name or "").strip() + hook_path = str(hook_path or "").strip() + plugin_key = plugin_name.upper().replace("-", "_") keys: list[str] = [] seen: set[str] = set() @@ -86,40 +88,38 @@ def _get_process_binary_env_keys(plugin_name: str, hook_path: str, env: dict[str keys.append(key) if plugin_key: - add(f'{plugin_key}_BINARY') + add(f"{plugin_key}_BINARY") try: from archivebox.hooks import discover_plugin_configs plugin_schema = discover_plugin_configs().get(plugin_name, {}) - schema_keys = [ - key - for key in (plugin_schema.get('properties') or {}) - if key.endswith('_BINARY') - ] + schema_keys = [key for key in (plugin_schema.get("properties") or {}) if key.endswith("_BINARY")] except Exception: schema_keys = [] - schema_keys.sort(key=lambda key: ( - key != f'{plugin_key}_BINARY', - key.endswith('_NODE_BINARY'), - key.endswith('_CHROME_BINARY'), - key, - )) + schema_keys.sort( + key=lambda key: ( + key != f"{plugin_key}_BINARY", + key.endswith("_NODE_BINARY"), + key.endswith("_CHROME_BINARY"), + key, + ), + ) for key in schema_keys: add(key) - if plugin_name.startswith('search_backend_'): - backend_name = plugin_name.removeprefix('search_backend_').upper().replace('-', '_') - configured_engine = str(env.get('SEARCH_BACKEND_ENGINE') or '').strip().upper().replace('-', '_') + if plugin_name.startswith("search_backend_"): + backend_name = plugin_name.removeprefix("search_backend_").upper().replace("-", "_") + configured_engine = str(env.get("SEARCH_BACKEND_ENGINE") or "").strip().upper().replace("-", "_") if backend_name and backend_name == configured_engine: - add(f'{backend_name}_BINARY') + add(f"{backend_name}_BINARY") hook_suffix = Path(hook_path).suffix.lower() - if hook_suffix == '.js': + if hook_suffix == ".js": if plugin_key: - add(f'{plugin_key}_NODE_BINARY') - add('NODE_BINARY') + add(f"{plugin_key}_NODE_BINARY") + add("NODE_BINARY") return keys @@ -135,7 +135,7 @@ def _sanitize_machine_config(config: dict[str, Any] | None) -> dict[str, Any]: class MachineManager(models.Manager): - def current(self) -> 'Machine': + def current(self) -> Machine: return Machine.current() @@ -156,19 +156,23 @@ class Machine(ModelWithHealthStats): os_release = models.CharField(max_length=63, default=None, null=False) os_kernel = models.CharField(max_length=255, default=None, null=False) stats = models.JSONField(default=dict, null=True, blank=True) - config = models.JSONField(default=dict, null=True, blank=True, - help_text="Machine-specific config overrides (e.g., resolved binary paths like WGET_BINARY)") + config = models.JSONField( + default=dict, + null=True, + blank=True, + help_text="Machine-specific config overrides (e.g., resolved binary paths like WGET_BINARY)", + ) num_uses_failed = models.PositiveIntegerField(default=0) num_uses_succeeded = models.PositiveIntegerField(default=0) objects = MachineManager() # pyright: ignore[reportIncompatibleVariableOverride] - networkinterface_set: models.Manager['NetworkInterface'] + networkinterface_set: models.Manager[NetworkInterface] class Meta(ModelWithHealthStats.Meta): - app_label = 'machine' + app_label = "machine" @classmethod - def current(cls) -> 'Machine': + def current(cls) -> Machine: global _CURRENT_MACHINE if _CURRENT_MACHINE: if timezone.now() < _CURRENT_MACHINE.modified_at + timedelta(seconds=MACHINE_RECHECK_INTERVAL): @@ -176,35 +180,28 @@ class Machine(ModelWithHealthStats): _CURRENT_MACHINE = None _CURRENT_MACHINE, _ = cls.objects.update_or_create( guid=get_host_guid(), - defaults={'hostname': socket.gethostname(), **get_os_info(), **get_vm_info(), 'stats': get_host_stats()}, + defaults={"hostname": socket.gethostname(), **get_os_info(), **get_vm_info(), "stats": get_host_stats()}, ) return cls._sanitize_config(cls._hydrate_config_from_sibling(_CURRENT_MACHINE)) @classmethod - def _hydrate_config_from_sibling(cls, machine: 'Machine') -> 'Machine': + def _hydrate_config_from_sibling(cls, machine: Machine) -> Machine: if machine.config: return machine - sibling = ( - cls.objects - .exclude(pk=machine.pk) - .filter(hostname=machine.hostname) - .exclude(config={}) - .order_by('-modified_at') - .first() - ) + sibling = cls.objects.exclude(pk=machine.pk).filter(hostname=machine.hostname).exclude(config={}).order_by("-modified_at").first() if sibling and sibling.config: machine.config = dict(sibling.config) - machine.save(update_fields=['config', 'modified_at']) + machine.save(update_fields=["config", "modified_at"]) return machine @classmethod - def _sanitize_config(cls, machine: 'Machine') -> 'Machine': + def _sanitize_config(cls, machine: Machine) -> Machine: sanitized = _sanitize_machine_config(machine.config) current = machine.config or {} if sanitized != current: machine.config = sanitized - machine.save(update_fields=['config', 'modified_at']) + machine.save(update_fields=["config", "modified_at"]) return machine def to_json(self) -> dict: @@ -212,24 +209,25 @@ class Machine(ModelWithHealthStats): Convert Machine model instance to a JSON-serializable dict. """ from archivebox.config import VERSION + return { - 'type': 'Machine', - 'schema_version': VERSION, - 'id': str(self.id), - 'guid': self.guid, - 'hostname': self.hostname, - 'hw_in_docker': self.hw_in_docker, - 'hw_in_vm': self.hw_in_vm, - 'hw_manufacturer': self.hw_manufacturer, - 'hw_product': self.hw_product, - 'hw_uuid': self.hw_uuid, - 'os_arch': self.os_arch, - 'os_family': self.os_family, - 'os_platform': self.os_platform, - 'os_kernel': self.os_kernel, - 'os_release': self.os_release, - 'stats': self.stats, - 'config': self.config or {}, + "type": "Machine", + "schema_version": VERSION, + "id": str(self.id), + "guid": self.guid, + "hostname": self.hostname, + "hw_in_docker": self.hw_in_docker, + "hw_in_vm": self.hw_in_vm, + "hw_manufacturer": self.hw_manufacturer, + "hw_product": self.hw_product, + "hw_uuid": self.hw_uuid, + "os_arch": self.os_arch, + "os_family": self.os_family, + "os_platform": self.os_platform, + "os_kernel": self.os_kernel, + "os_release": self.os_release, + "stats": self.stats, + "config": self.config or {}, } @staticmethod @@ -244,18 +242,18 @@ class Machine(ModelWithHealthStats): Returns: Machine instance or None """ - config_patch = _sanitize_machine_config(record.get('config')) + config_patch = _sanitize_machine_config(record.get("config")) if config_patch: machine = Machine.current() machine.config = _sanitize_machine_config(machine.config) machine.config.update(config_patch) - machine.save(update_fields=['config']) + machine.save(update_fields=["config"]) return machine return None class NetworkInterfaceManager(models.Manager): - def current(self) -> 'NetworkInterface': + def current(self) -> NetworkInterface: return NetworkInterface.current() @@ -281,11 +279,11 @@ class NetworkInterface(ModelWithHealthStats): machine_id: uuid.UUID class Meta(ModelWithHealthStats.Meta): - app_label = 'machine' - unique_together = (('machine', 'ip_public', 'ip_local', 'mac_address', 'dns_server'),) + app_label = "machine" + unique_together = (("machine", "ip_public", "ip_local", "mac_address", "dns_server"),) @classmethod - def current(cls, refresh: bool = False) -> 'NetworkInterface': + def current(cls, refresh: bool = False) -> NetworkInterface: global _CURRENT_INTERFACE machine = Machine.current() if _CURRENT_INTERFACE: @@ -298,32 +296,45 @@ class NetworkInterface(ModelWithHealthStats): _CURRENT_INTERFACE = None net_info = get_host_network() _CURRENT_INTERFACE, _ = cls.objects.update_or_create( - machine=machine, ip_public=net_info.pop('ip_public'), ip_local=net_info.pop('ip_local'), - mac_address=net_info.pop('mac_address'), dns_server=net_info.pop('dns_server'), defaults=net_info, + machine=machine, + ip_public=net_info.pop("ip_public"), + ip_local=net_info.pop("ip_local"), + mac_address=net_info.pop("mac_address"), + dns_server=net_info.pop("dns_server"), + defaults=net_info, ) return _CURRENT_INTERFACE - class BinaryManager(models.Manager): - def get_from_db_or_cache(self, name: str, abspath: str = '', version: str = '', sha256: str = '', binprovider: str = 'env') -> 'Binary': + def get_from_db_or_cache(self, name: str, abspath: str = "", version: str = "", sha256: str = "", binprovider: str = "env") -> Binary: """Get or create an Binary record from the database or cache.""" cached = _CURRENT_BINARIES.get(name) if cached and timezone.now() < cached.modified_at + timedelta(seconds=BINARY_RECHECK_INTERVAL): return cached _CURRENT_BINARIES[name], _ = self.update_or_create( - machine=Machine.current(), name=name, binprovider=binprovider, - version=version, abspath=abspath, sha256=sha256, + machine=Machine.current(), + name=name, + binprovider=binprovider, + version=version, + abspath=abspath, + sha256=sha256, ) return _CURRENT_BINARIES[name] - def get_valid_binary(self, name: str, machine: 'Machine | None' = None) -> 'Binary | None': + def get_valid_binary(self, name: str, machine: Machine | None = None) -> Binary | None: """Get a valid Binary for the given name on the current machine, or None if not found.""" machine = machine or Machine.current() - return self.filter( - machine=machine, - name__iexact=name, - ).exclude(abspath='').exclude(abspath__isnull=True).order_by('-modified_at').first() + return ( + self.filter( + machine=machine, + name__iexact=name, + ) + .exclude(abspath="") + .exclude(abspath__isnull=True) + .order_by("-modified_at") + .first() + ) class Binary(ModelWithHealthStats, ModelWithStateMachine): @@ -342,8 +353,8 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine): """ class StatusChoices(models.TextChoices): - QUEUED = 'queued', 'Queued' - INSTALLED = 'installed', 'Installed' + QUEUED = "queued", "Queued" + INSTALLED = "installed", "Installed" id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True) created_at = models.DateTimeField(default=timezone.now, db_index=True) @@ -351,23 +362,38 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine): machine = models.ForeignKey(Machine, on_delete=models.CASCADE, null=False) # Binary metadata - name = models.CharField(max_length=63, default='', null=False, blank=True, db_index=True) - binproviders = models.CharField(max_length=127, default='env', null=False, blank=True, - help_text="Comma-separated list of allowed providers: apt,brew,pip,npm,env") - overrides = models.JSONField(default=dict, blank=True, - help_text="Provider-specific overrides: {'apt': {'install_args': ['pkg']}, ...}") + name = models.CharField(max_length=63, default="", null=False, blank=True, db_index=True) + binproviders = models.CharField( + max_length=127, + default="env", + null=False, + blank=True, + help_text="Comma-separated list of allowed providers: apt,brew,pip,npm,env", + ) + overrides = models.JSONField( + default=dict, + blank=True, + help_text="Provider-specific overrides: {'apt': {'install_args': ['pkg']}, ...}", + ) # Installation results (populated after installation) - binprovider = models.CharField(max_length=31, default='', null=False, blank=True, - help_text="Provider that successfully installed this binary") - abspath = models.CharField(max_length=255, default='', null=False, blank=True) - version = models.CharField(max_length=32, default='', null=False, blank=True) - sha256 = models.CharField(max_length=64, default='', null=False, blank=True) + binprovider = models.CharField( + max_length=31, + default="", + null=False, + blank=True, + help_text="Provider that successfully installed this binary", + ) + abspath = models.CharField(max_length=255, default="", null=False, blank=True) + version = models.CharField(max_length=32, default="", null=False, blank=True) + sha256 = models.CharField(max_length=64, default="", null=False, blank=True) # State machine fields status = ModelWithStateMachine.StatusField(choices=StatusChoices.choices, default=StatusChoices.QUEUED, max_length=16) - retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now, - help_text="When to retry this binary installation") + retry_at = ModelWithStateMachine.RetryAtField( + default=timezone.now, + help_text="When to retry this binary installation", + ) # Health stats num_uses_failed = models.PositiveIntegerField(default=0) @@ -375,19 +401,19 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine): machine_id: uuid.UUID - state_machine_name: str | None = 'archivebox.machine.models.BinaryMachine' + state_machine_name: str | None = "archivebox.machine.models.BinaryMachine" active_state: str = StatusChoices.QUEUED objects = BinaryManager() # pyright: ignore[reportIncompatibleVariableOverride] class Meta(ModelWithHealthStats.Meta, ModelWithStateMachine.Meta): - app_label = 'machine' - verbose_name = 'Binary' - verbose_name_plural = 'Binaries' - unique_together = (('machine', 'name', 'abspath', 'version', 'sha256'),) + app_label = "machine" + verbose_name = "Binary" + verbose_name_plural = "Binaries" + unique_together = (("machine", "name", "abspath", "version", "sha256"),) def __str__(self) -> str: - return f'{self.name}@{self.binprovider}+{self.abspath}@{self.version}' + return f"{self.name}@{self.binprovider}+{self.abspath}@{self.version}" @property def is_valid(self) -> bool: @@ -398,11 +424,11 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine): def binary_info(self) -> dict: """Return info about the binary.""" return { - 'name': self.name, - 'abspath': self.abspath, - 'version': self.version, - 'binprovider': self.binprovider, - 'is_valid': self.is_valid, + "name": self.name, + "abspath": self.abspath, + "version": self.version, + "binprovider": self.binprovider, + "is_valid": self.is_valid, } @property @@ -412,24 +438,26 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine): Path: data/machines/{machine_uuid}/binaries/{binary_name}/{binary_uuid} """ from django.conf import settings - return Path(settings.DATA_DIR) / 'machines' / str(self.machine_id) / 'binaries' / self.name / str(self.id) + + return Path(settings.DATA_DIR) / "machines" / str(self.machine_id) / "binaries" / self.name / str(self.id) def to_json(self) -> dict: """ Convert Binary model instance to a JSON-serializable dict. """ from archivebox.config import VERSION + return { - 'type': 'Binary', - 'schema_version': VERSION, - 'id': str(self.id), - 'machine_id': str(self.machine_id), - 'name': self.name, - 'binprovider': self.binprovider, - 'abspath': self.abspath, - 'version': self.version, - 'sha256': self.sha256, - 'status': self.status, + "type": "Binary", + "schema_version": VERSION, + "id": str(self.id), + "machine_id": str(self.machine_id), + "name": self.name, + "binprovider": self.binprovider, + "abspath": self.abspath, + "version": self.version, + "sha256": self.sha256, + "status": self.status, } @staticmethod @@ -450,36 +478,36 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine): Returns: Binary instance or None """ - name = record.get('name') + name = record.get("name") if not name: return None machine = Machine.current() overrides = overrides or {} - binary_overrides = record.get('overrides', {}) + binary_overrides = record.get("overrides", {}) normalized_overrides = binary_overrides if isinstance(binary_overrides, dict) else {} # abx-plugins currently emits a GitHub install URL for readability-extractor, # but the package is published on npm. Prefer the registry package to avoid # long git-based installs in CI while still using canonical install_args. if ( - name == 'readability-extractor' - and isinstance(normalized_overrides.get('npm'), dict) - and normalized_overrides['npm'].get('install_args') == ['https://github.com/ArchiveBox/readability-extractor'] + name == "readability-extractor" + and isinstance(normalized_overrides.get("npm"), dict) + and normalized_overrides["npm"].get("install_args") == ["https://github.com/ArchiveBox/readability-extractor"] ): normalized_overrides = { **normalized_overrides, - 'npm': { - **normalized_overrides['npm'], - 'install_args': ['readability-extractor'], + "npm": { + **normalized_overrides["npm"], + "install_args": ["readability-extractor"], }, } # Case 1: Already installed (from on_Crawl hooks) - has abspath AND binproviders # This happens when on_Crawl hooks detect already-installed binaries - abspath = record.get('abspath') - version = record.get('version') - binproviders = record.get('binproviders') + abspath = record.get("abspath") + version = record.get("version") + binproviders = record.get("binproviders") if abspath and version and binproviders: # Binary is already installed, create INSTALLED record with binproviders filter @@ -487,28 +515,28 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine): machine=machine, name=name, defaults={ - 'abspath': abspath, - 'version': version, - 'sha256': record.get('sha256', ''), - 'binprovider': record.get('binprovider', 'env'), - 'binproviders': binproviders, # Preserve the filter - 'status': Binary.StatusChoices.INSTALLED, - 'retry_at': None, - } + "abspath": abspath, + "version": version, + "sha256": record.get("sha256", ""), + "binprovider": record.get("binprovider", "env"), + "binproviders": binproviders, # Preserve the filter + "status": Binary.StatusChoices.INSTALLED, + "retry_at": None, + }, ) return binary # Case 2: From binaries.json - create queued binary (needs installation) - if 'binproviders' in record or ('overrides' in record and not abspath): + if "binproviders" in record or ("overrides" in record and not abspath): binary, _ = Binary.objects.update_or_create( machine=machine, name=name, defaults={ - 'binproviders': record.get('binproviders', 'env'), - 'overrides': normalized_overrides, - 'status': Binary.StatusChoices.QUEUED, - 'retry_at': timezone.now(), - } + "binproviders": record.get("binproviders", "env"), + "overrides": normalized_overrides, + "status": Binary.StatusChoices.QUEUED, + "retry_at": timezone.now(), + }, ) return binary @@ -518,13 +546,13 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine): machine=machine, name=name, defaults={ - 'abspath': abspath, - 'version': version, - 'sha256': record.get('sha256', ''), - 'binprovider': record.get('binprovider', 'env'), - 'status': Binary.StatusChoices.INSTALLED, - 'retry_at': None, - } + "abspath": abspath, + "version": version, + "sha256": record.get("sha256", ""), + "binprovider": record.get("binprovider", "env"), + "status": Binary.StatusChoices.INSTALLED, + "retry_at": None, + }, ) return binary @@ -545,10 +573,10 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine): def _allowed_binproviders(self) -> set[str] | None: """Return the allowed binproviders for this binary, or None for wildcard.""" - providers = str(self.binproviders or '').strip() - if not providers or providers == '*': + providers = str(self.binproviders or "").strip() + if not providers or providers == "*": return None - return {provider.strip() for provider in providers.split(',') if provider.strip()} + return {provider.strip() for provider in providers.split(",") if provider.strip()} def _get_custom_install_command(self) -> str | None: """Extract a custom install command from overrides when the custom provider is used.""" @@ -557,23 +585,23 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine): if not isinstance(self.overrides, dict): return None - for key in ('custom_cmd', 'cmd', 'command'): + for key in ("custom_cmd", "cmd", "command"): value = self.overrides.get(key) if isinstance(value, str) and value.strip(): return value.strip() - custom_overrides = self.overrides.get('custom') + custom_overrides = self.overrides.get("custom") if isinstance(custom_overrides, dict): - for key in ('custom_cmd', 'cmd', 'command'): + for key in ("custom_cmd", "cmd", "command"): value = custom_overrides.get(key) if isinstance(value, str) and value.strip(): return value.strip() - install_args = custom_overrides.get('install_args') + install_args = custom_overrides.get("install_args") if isinstance(install_args, str) and install_args.strip(): return install_args.strip() if isinstance(install_args, list) and install_args: - return ' '.join(shlex.quote(str(arg)) for arg in install_args if str(arg).strip()) + return " ".join(shlex.quote(str(arg)) for arg in install_args if str(arg).strip()) return None @@ -601,16 +629,16 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine): # ArchiveBox installs the puppeteer package and Chromium in separate # hook phases. Suppress puppeteer's bundled browser download during the # package install step so the dedicated chromium hook owns that work. - if self.name == 'puppeteer': - config.setdefault('PUPPETEER_SKIP_DOWNLOAD', 'true') - config.setdefault('PUPPETEER_SKIP_CHROMIUM_DOWNLOAD', 'true') + if self.name == "puppeteer": + config.setdefault("PUPPETEER_SKIP_DOWNLOAD", "true") + config.setdefault("PUPPETEER_SKIP_CHROMIUM_DOWNLOAD", "true") # Create output directory output_dir = self.output_dir output_dir.mkdir(parents=True, exist_ok=True) # Discover ALL on_Binary__install_* hooks - hooks = discover_hooks('Binary', config=config) + hooks = discover_hooks("Binary", config=config) if not hooks: # No hooks available - stay queued, will retry later return @@ -628,7 +656,7 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine): custom_cmd = None overrides_json = None - if plugin_name == 'custom': + if plugin_name == "custom": custom_cmd = self._get_custom_install_command() if not custom_cmd: continue @@ -659,26 +687,25 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine): # Parse JSONL output to check for successful installation from archivebox.hooks import extract_records_from_process, process_hook_records + records = extract_records_from_process(process) if records: process_hook_records(records, overrides={}) - binary_records = [ - record for record in records - if record.get('type') == 'Binary' and record.get('abspath') - ] + binary_records = [record for record in records if record.get("type") == "Binary" and record.get("abspath")] if binary_records: record = binary_records[0] # Update self from successful installation - self.abspath = record['abspath'] - self.version = record.get('version', '') - self.sha256 = record.get('sha256', '') - self.binprovider = record.get('binprovider', 'env') + self.abspath = record["abspath"] + self.version = record.get("version", "") + self.sha256 = record.get("sha256", "") + self.binprovider = record.get("binprovider", "env") self.status = self.StatusChoices.INSTALLED self.save() # Symlink binary into LIB_BIN_DIR if configured from django.conf import settings - lib_bin_dir = getattr(settings, 'LIB_BIN_DIR', None) + + lib_bin_dir = getattr(settings, "LIB_BIN_DIR", None) if lib_bin_dir: self.symlink_to_lib_bin(lib_bin_dir) @@ -706,12 +733,12 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine): for process in running_hooks: killed_count = process.kill_tree(graceful_timeout=2.0) if killed_count > 0: - print(f'[yellow]🔪 Killed {killed_count} binary installation hook process(es)[/yellow]') + print(f"[yellow]🔪 Killed {killed_count} binary installation hook process(es)[/yellow]") # Clean up .pid files from output directory output_dir = self.output_dir if output_dir.exists(): - for pid_file in output_dir.glob('**/*.pid'): + for pid_file in output_dir.glob("**/*.pid"): pid_file.unlink(missing_ok=True) def symlink_to_lib_bin(self, lib_bin_dir: str | Path) -> Path | None: @@ -783,14 +810,15 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine): # Process Model # ============================================================================= + class ProcessManager(models.Manager): """Manager for Process model.""" - def current(self) -> 'Process': + def current(self) -> Process: """Get the Process record for the current OS process.""" return Process.current() - def get_by_pid(self, pid: int, machine: 'Machine | None' = None) -> 'Process | None': + def get_by_pid(self, pid: int, machine: Machine | None = None) -> Process | None: """ Find a Process by PID with proper validation against PID reuse. @@ -825,7 +853,7 @@ class ProcessManager(models.Manager): pid=pid, status=Process.StatusChoices.RUNNING, started_at__gte=timezone.now() - PID_REUSE_WINDOW, - ).order_by('-started_at') + ).order_by("-started_at") for candidate in candidates: # Validate start time matches (within tolerance) @@ -842,17 +870,17 @@ class ProcessManager(models.Manager): Called during migration and when creating new ArchiveResults. """ - iface = kwargs.get('iface') or NetworkInterface.current() + iface = kwargs.get("iface") or NetworkInterface.current() # Defaults from ArchiveResult if not provided defaults = { - 'machine': iface.machine, - 'pwd': kwargs.get('pwd') or str(archiveresult.snapshot.output_dir / archiveresult.plugin), - 'cmd': kwargs.get('cmd') or [], - 'status': 'queued', - 'timeout': kwargs.get('timeout', 120), - 'env': kwargs.get('env', {}), - 'iface': iface, + "machine": iface.machine, + "pwd": kwargs.get("pwd") or str(archiveresult.snapshot.output_dir / archiveresult.plugin), + "cmd": kwargs.get("cmd") or [], + "status": "queued", + "timeout": kwargs.get("timeout", 120), + "env": kwargs.get("env", {}), + "iface": iface, } defaults.update(kwargs) @@ -877,17 +905,17 @@ class Process(models.Model): """ class StatusChoices(models.TextChoices): - QUEUED = 'queued', 'Queued' - RUNNING = 'running', 'Running' - EXITED = 'exited', 'Exited' + QUEUED = "queued", "Queued" + RUNNING = "running", "Running" + EXITED = "exited", "Exited" class TypeChoices(models.TextChoices): - SUPERVISORD = 'supervisord', 'Supervisord' - ORCHESTRATOR = 'orchestrator', 'Orchestrator' - WORKER = 'worker', 'Worker' - CLI = 'cli', 'CLI' - HOOK = 'hook', 'Hook' - BINARY = 'binary', 'Binary' + SUPERVISORD = "supervisord", "Supervisord" + ORCHESTRATOR = "orchestrator", "Orchestrator" + WORKER = "worker", "Worker" + CLI = "cli", "CLI" + HOOK = "hook", "Hook" + BINARY = "binary", "Binary" # Primary fields id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True) @@ -899,17 +927,18 @@ class Process(models.Model): Machine, on_delete=models.CASCADE, null=False, - related_name='process_set', - help_text='Machine where this process executed' + related_name="process_set", + help_text="Machine where this process executed", ) # Parent process (optional) parent = models.ForeignKey( - 'self', + "self", on_delete=models.SET_NULL, - null=True, blank=True, - related_name='children', - help_text='Parent process that spawned this process' + null=True, + blank=True, + related_name="children", + help_text="Parent process that spawned this process", ) # Process type (cli, worker, orchestrator, binary, supervisord) @@ -918,64 +947,111 @@ class Process(models.Model): choices=TypeChoices.choices, default=TypeChoices.CLI, db_index=True, - help_text='Type of process (cli, worker, orchestrator, binary, supervisord)' + help_text="Type of process (cli, worker, orchestrator, binary, supervisord)", ) # Worker type (only for WORKER processes: crawl, snapshot, archiveresult) worker_type = models.CharField( max_length=32, - default='', + default="", null=False, blank=True, db_index=True, - help_text='Worker type name for WORKER processes (crawl, snapshot, archiveresult)' + help_text="Worker type name for WORKER processes (crawl, snapshot, archiveresult)", ) # Execution metadata - pwd = models.CharField(max_length=512, default='', null=False, blank=True, - help_text='Working directory for process execution') - cmd = models.JSONField(default=list, null=False, blank=True, - help_text='Command as array of arguments') - env = models.JSONField(default=dict, null=False, blank=True, - help_text='Environment variables for process') - timeout = models.IntegerField(default=120, null=False, - help_text='Timeout in seconds') + pwd = models.CharField( + max_length=512, + default="", + null=False, + blank=True, + help_text="Working directory for process execution", + ) + cmd = models.JSONField( + default=list, + null=False, + blank=True, + help_text="Command as array of arguments", + ) + env = models.JSONField( + default=dict, + null=False, + blank=True, + help_text="Environment variables for process", + ) + timeout = models.IntegerField( + default=120, + null=False, + help_text="Timeout in seconds", + ) # Process results - pid = models.IntegerField(default=None, null=True, blank=True, - help_text='OS process ID') - exit_code = models.IntegerField(default=None, null=True, blank=True, - help_text='Process exit code (0 = success)') - stdout = models.TextField(default='', null=False, blank=True, - help_text='Standard output from process') - stderr = models.TextField(default='', null=False, blank=True, - help_text='Standard error from process') + pid = models.IntegerField( + default=None, + null=True, + blank=True, + help_text="OS process ID", + ) + exit_code = models.IntegerField( + default=None, + null=True, + blank=True, + help_text="Process exit code (0 = success)", + ) + stdout = models.TextField( + default="", + null=False, + blank=True, + help_text="Standard output from process", + ) + stderr = models.TextField( + default="", + null=False, + blank=True, + help_text="Standard error from process", + ) # Timing - started_at = models.DateTimeField(default=None, null=True, blank=True, - help_text='When process was launched') - ended_at = models.DateTimeField(default=None, null=True, blank=True, - help_text='When process completed/terminated') + started_at = models.DateTimeField( + default=None, + null=True, + blank=True, + help_text="When process was launched", + ) + ended_at = models.DateTimeField( + default=None, + null=True, + blank=True, + help_text="When process completed/terminated", + ) # Optional FKs binary = models.ForeignKey( Binary, on_delete=models.SET_NULL, - null=True, blank=True, - related_name='process_set', - help_text='Binary used by this process' + null=True, + blank=True, + related_name="process_set", + help_text="Binary used by this process", ) iface = models.ForeignKey( NetworkInterface, on_delete=models.SET_NULL, - null=True, blank=True, - related_name='process_set', - help_text='Network interface used by this process' + null=True, + blank=True, + related_name="process_set", + help_text="Network interface used by this process", ) # Optional connection URL (for CDP, sonic, etc.) - url = models.URLField(max_length=2048, default=None, null=True, blank=True, - help_text='Connection URL (CDP endpoint, sonic server, etc.)') + url = models.URLField( + max_length=2048, + default=None, + null=True, + blank=True, + help_text="Connection URL (CDP endpoint, sonic server, etc.)", + ) # Reverse relation to ArchiveResult (OneToOne from AR side) # archiveresult: OneToOneField defined on ArchiveResult model @@ -985,96 +1061,98 @@ class Process(models.Model): max_length=16, choices=StatusChoices.choices, default=StatusChoices.QUEUED, - db_index=True + db_index=True, ) retry_at = models.DateTimeField( default=timezone.now, - null=True, blank=True, + null=True, + blank=True, db_index=True, - help_text='When to retry this process' + help_text="When to retry this process", ) machine_id: uuid.UUID parent_id: uuid.UUID | None binary_id: uuid.UUID | None - children: models.Manager['Process'] - archiveresult: 'ArchiveResult' + children: models.Manager[Process] + archiveresult: ArchiveResult - state_machine_name: str = 'archivebox.machine.models.ProcessMachine' + state_machine_name: str = "archivebox.machine.models.ProcessMachine" objects = ProcessManager() # pyright: ignore[reportIncompatibleVariableOverride] class Meta(TypedModelMeta): - app_label = 'machine' - verbose_name = 'Process' - verbose_name_plural = 'Processes' + app_label = "machine" + verbose_name = "Process" + verbose_name_plural = "Processes" indexes = [ - models.Index(fields=['machine', 'status', 'retry_at']), - models.Index(fields=['binary', 'exit_code']), + models.Index(fields=["machine", "status", "retry_at"]), + models.Index(fields=["binary", "exit_code"]), ] def __str__(self) -> str: - cmd_str = ' '.join(self.cmd[:3]) if self.cmd else '(no cmd)' - return f'Process[{self.id}] {cmd_str} ({self.status})' + cmd_str = " ".join(self.cmd[:3]) if self.cmd else "(no cmd)" + return f"Process[{self.id}] {cmd_str} ({self.status})" # Properties that delegate to related objects @property def cmd_version(self) -> str: """Get version from associated binary.""" - return self.binary.version if self.binary else '' + return self.binary.version if self.binary else "" @property def bin_abspath(self) -> str: """Get absolute path from associated binary.""" - return self.binary.abspath if self.binary else '' + return self.binary.abspath if self.binary else "" @property def plugin(self) -> str: """Get plugin name from associated ArchiveResult (if any).""" - if hasattr(self, 'archiveresult'): + if hasattr(self, "archiveresult"): # Inline import to avoid circular dependency return self.archiveresult.plugin - return '' + return "" @property def hook_name(self) -> str: """Get hook name from associated ArchiveResult (if any).""" - if hasattr(self, 'archiveresult'): + if hasattr(self, "archiveresult"): return self.archiveresult.hook_name - return '' + return "" def to_json(self) -> dict: """ Convert Process model instance to a JSON-serializable dict. """ from archivebox.config import VERSION + record = { - 'type': 'Process', - 'schema_version': VERSION, - 'id': str(self.id), - 'machine_id': str(self.machine_id), - 'cmd': self.cmd, - 'pwd': self.pwd, - 'status': self.status, - 'exit_code': self.exit_code, - 'started_at': self.started_at.isoformat() if self.started_at else None, - 'ended_at': self.ended_at.isoformat() if self.ended_at else None, + "type": "Process", + "schema_version": VERSION, + "id": str(self.id), + "machine_id": str(self.machine_id), + "cmd": self.cmd, + "pwd": self.pwd, + "status": self.status, + "exit_code": self.exit_code, + "started_at": self.started_at.isoformat() if self.started_at else None, + "ended_at": self.ended_at.isoformat() if self.ended_at else None, } # Include optional fields if set if self.binary_id: - record['binary_id'] = str(self.binary_id) + record["binary_id"] = str(self.binary_id) if self.pid: - record['pid'] = self.pid + record["pid"] = self.pid if self.timeout: - record['timeout'] = self.timeout + record["timeout"] = self.timeout return record - def hydrate_binary_from_context(self, *, plugin_name: str = '', hook_path: str = '') -> 'Binary | None': + def hydrate_binary_from_context(self, *, plugin_name: str = "", hook_path: str = "") -> Binary | None: machine = self.machine if self.machine_id else Machine.current() references: list[str] = [] for key in _get_process_binary_env_keys(plugin_name, hook_path, self.env): - value = str(self.env.get(key) or '').strip() + value = str(self.env.get(key) or "").strip() if value and value not in references: references.append(value) @@ -1101,7 +1179,7 @@ class Process(models.Model): return records for line in text.splitlines(): record = parse_line(line) - if record and record.get('type'): + if record and record.get("type"): records.append(record) return records @@ -1110,7 +1188,7 @@ class Process(models.Model): stdout = self.stdout if not stdout and self.stdout_file and self.stdout_file.exists(): stdout = self.stdout_file.read_text() - return self.parse_records_from_text(stdout or '') + return self.parse_records_from_text(stdout or "") @staticmethod def from_json(record: dict[str, Any], overrides: dict[str, Any] | None = None): @@ -1124,7 +1202,7 @@ class Process(models.Model): Returns: Process instance or None """ - process_id = record.get('id') + process_id = record.get("id") if process_id: try: return Process.objects.get(id=process_id) @@ -1148,7 +1226,7 @@ class Process(models.Model): # ========================================================================= @classmethod - def current(cls) -> 'Process': + def current(cls) -> Process: """ Get or create the Process record for the current OS process. @@ -1176,7 +1254,7 @@ class Process(models.Model): ): if _CURRENT_PROCESS.iface_id != iface.id: _CURRENT_PROCESS.iface = iface - _CURRENT_PROCESS.save(update_fields=['iface', 'modified_at']) + _CURRENT_PROCESS.save(update_fields=["iface", "modified_at"]) _CURRENT_PROCESS.ensure_log_files() return _CURRENT_PROCESS _CURRENT_PROCESS = None @@ -1193,12 +1271,16 @@ class Process(models.Model): # Try to find existing Process for this PID on this machine # Filter by: machine + PID + RUNNING + recent + start time matches if os_start_time: - existing = cls.objects.filter( - machine=machine, - pid=current_pid, - status=cls.StatusChoices.RUNNING, - started_at__gte=timezone.now() - PID_REUSE_WINDOW, - ).order_by('-started_at').first() + existing = ( + cls.objects.filter( + machine=machine, + pid=current_pid, + status=cls.StatusChoices.RUNNING, + started_at__gte=timezone.now() - PID_REUSE_WINDOW, + ) + .order_by("-started_at") + .first() + ) if existing and existing.started_at: db_start_time = existing.started_at.timestamp() @@ -1206,7 +1288,7 @@ class Process(models.Model): _CURRENT_PROCESS = existing if existing.iface_id != iface.id: existing.iface = iface - existing.save(update_fields=['iface', 'modified_at']) + existing.save(update_fields=["iface", "modified_at"]) _CURRENT_PROCESS.ensure_log_files() return existing @@ -1245,7 +1327,7 @@ class Process(models.Model): return _CURRENT_PROCESS @classmethod - def _find_parent_process(cls, machine: 'Machine | None' = None) -> 'Process | None': + def _find_parent_process(cls, machine: Machine | None = None) -> Process | None: """ Find the parent Process record by looking up PPID. @@ -1279,7 +1361,7 @@ class Process(models.Model): pid=ppid, status=cls.StatusChoices.RUNNING, started_at__gte=timezone.now() - PID_REUSE_WINDOW, - ).order_by('-started_at') + ).order_by("-started_at") # print(f"DEBUG _find_parent_process: Found {candidates.count()} candidates for ppid={ppid}", file=sys.stderr) @@ -1300,26 +1382,27 @@ class Process(models.Model): """ Detect the type of the current process from sys.argv. """ - argv_str = ' '.join(sys.argv).lower() + argv_str = " ".join(sys.argv).lower() - if 'supervisord' in argv_str: + if "supervisord" in argv_str: return cls.TypeChoices.SUPERVISORD - elif 'runner_watch' in argv_str: + elif "runner_watch" in argv_str: return cls.TypeChoices.WORKER - elif 'archivebox run' in argv_str: + elif "archivebox run" in argv_str: return cls.TypeChoices.ORCHESTRATOR - elif 'archivebox' in argv_str: + elif "archivebox" in argv_str: return cls.TypeChoices.CLI else: return cls.TypeChoices.BINARY @classmethod - def cleanup_stale_running(cls, machine: 'Machine | None' = None) -> int: + def cleanup_stale_running(cls, machine: Machine | None = None) -> int: """ - Mark stale RUNNING processes as EXITED. + Mark stale RUNNING processes as EXITED in the DB. Processes are stale if: - Status is RUNNING but OS process no longer exists + - Status is RUNNING but exceeded its timeout plus a small grace margin - Status is RUNNING but started_at is older than PID_REUSE_WINDOW Returns count of processes cleaned up. @@ -1333,12 +1416,22 @@ class Process(models.Model): ) for proc in stale: + if proc.poll() is not None: + cleaned += 1 + continue + is_stale = False + if proc.started_at: + timeout_seconds = max(int(proc.timeout or 0), 0) + timeout_deadline = proc.started_at + timedelta(seconds=timeout_seconds) + PROCESS_TIMEOUT_GRACE + if timezone.now() >= timeout_deadline: + is_stale = True + # Check if too old (PID definitely reused) - if proc.started_at and proc.started_at < timezone.now() - PID_REUSE_WINDOW: + if not is_stale and proc.started_at and proc.started_at < timezone.now() - PID_REUSE_WINDOW: is_stale = True - elif PSUTIL_AVAILABLE and proc.pid is not None: + elif not is_stale and PSUTIL_AVAILABLE and proc.pid is not None: # Check if OS process still exists with matching start time try: os_proc = psutil.Process(proc.pid) @@ -1354,7 +1447,7 @@ class Process(models.Model): proc.status = cls.StatusChoices.EXITED proc.ended_at = proc.ended_at or timezone.now() proc.exit_code = proc.exit_code if proc.exit_code is not None else 0 - proc.save(update_fields=['status', 'ended_at', 'exit_code']) + proc.save(update_fields=["status", "ended_at", "exit_code"]) cleaned += 1 return cleaned @@ -1364,7 +1457,7 @@ class Process(models.Model): # ========================================================================= @property - def root(self) -> 'Process': + def root(self) -> Process: """Get the root process (CLI command) of this hierarchy.""" proc = self while proc.parent_id: @@ -1372,7 +1465,7 @@ class Process(models.Model): return proc @property - def ancestors(self) -> list['Process']: + def ancestors(self) -> list[Process]: """Get all ancestor processes from parent to root.""" ancestors = [] proc = self.parent @@ -1393,10 +1486,10 @@ class Process(models.Model): else: pks = [] - children = list(self.children.values_list('pk', flat=True)) + children = list(self.children.values_list("pk", flat=True)) while children: pks.extend(children) - children = list(Process.objects.filter(parent_id__in=children).values_list('pk', flat=True)) + children = list(Process.objects.filter(parent_id__in=children).values_list("pk", flat=True)) return Process.objects.filter(pk__in=pks) @@ -1405,7 +1498,7 @@ class Process(models.Model): # ========================================================================= @property - def proc(self) -> 'psutil.Process | None': + def proc(self) -> psutil.Process | None: """ Get validated psutil.Process for this record. @@ -1452,14 +1545,10 @@ class Process(models.Model): try: os_cmdline = os_proc.cmdline() if os_cmdline and self.cmd: - db_binary = self.cmd[0] if self.cmd else '' + db_binary = self.cmd[0] if self.cmd else "" if db_binary: db_binary_name = Path(db_binary).name - cmd_matches = any( - arg == db_binary or Path(arg).name == db_binary_name - for arg in os_cmdline - if arg - ) + cmd_matches = any(arg == db_binary or Path(arg).name == db_binary_name for arg in os_cmdline if arg) if not cmd_matches: return None # Different command, PID reused except (psutil.AccessDenied, psutil.ZombieProcess): @@ -1498,7 +1587,7 @@ class Process(models.Model): if proc: try: mem = proc.memory_info() - return {'rss': mem.rss, 'vms': mem.vms} + return {"rss": mem.rss, "vms": mem.vms} except (psutil.NoSuchProcess, psutil.AccessDenied): pass return None @@ -1531,25 +1620,25 @@ class Process(models.Model): def pid_file(self) -> Path | None: """Path to PID file for this process.""" runtime_dir = self.runtime_dir - return runtime_dir / 'process.pid' if runtime_dir else None + return runtime_dir / "process.pid" if runtime_dir else None @property def cmd_file(self) -> Path | None: """Path to cmd.sh script for this process.""" runtime_dir = self.runtime_dir - return runtime_dir / 'cmd.sh' if runtime_dir else None + return runtime_dir / "cmd.sh" if runtime_dir else None @property def stdout_file(self) -> Path | None: """Path to stdout log.""" runtime_dir = self.runtime_dir - return runtime_dir / 'stdout.log' if runtime_dir else None + return runtime_dir / "stdout.log" if runtime_dir else None @property def stderr_file(self) -> Path | None: """Path to stderr log.""" runtime_dir = self.runtime_dir - return runtime_dir / 'stderr.log' if runtime_dir else None + return runtime_dir / "stderr.log" if runtime_dir else None @property def hook_script_name(self) -> str | None: @@ -1559,10 +1648,10 @@ class Process(models.Model): for arg in self.cmd: arg = str(arg) - if arg.startswith('-'): + if arg.startswith("-"): continue candidate = Path(arg).name - if candidate.startswith('on_') and Path(candidate).suffix in {'.py', '.js', '.sh'}: + if candidate.startswith("on_") and Path(candidate).suffix in {".py", ".js", ".sh"}: return candidate return None @@ -1576,7 +1665,7 @@ class Process(models.Model): base_dir = Path(self.pwd) hook_name = self.hook_script_name if hook_name: - return base_dir / '.hooks' / hook_name + return base_dir / ".hooks" / hook_name return base_dir def tail_stdout(self, lines: int = 50, follow: bool = False): @@ -1596,7 +1685,8 @@ class Process(models.Model): if follow: # Follow mode - yield new lines as they appear (tail -f) import time - with open(self.stdout_file, 'r') as f: + + with open(self.stdout_file) as f: # Seek to end minus roughly 'lines' worth of bytes f.seek(0, 2) # Seek to end file_size = f.tell() @@ -1610,13 +1700,13 @@ class Process(models.Model): # Yield existing lines for line in f: - yield line.rstrip('\n') + yield line.rstrip("\n") # Now follow for new lines while True: line = f.readline() if line: - yield line.rstrip('\n') + yield line.rstrip("\n") else: time.sleep(0.1) # Wait before checking again else: @@ -1645,7 +1735,8 @@ class Process(models.Model): if follow: # Follow mode - yield new lines as they appear (tail -f) import time - with open(self.stderr_file, 'r') as f: + + with open(self.stderr_file) as f: # Seek to end minus roughly 'lines' worth of bytes f.seek(0, 2) # Seek to end file_size = f.tell() @@ -1659,13 +1750,13 @@ class Process(models.Model): # Yield existing lines for line in f: - yield line.rstrip('\n') + yield line.rstrip("\n") # Now follow for new lines while True: line = f.readline() if line: - yield line.rstrip('\n') + yield line.rstrip("\n") else: time.sleep(0.1) # Wait before checking again else: @@ -1686,6 +1777,7 @@ class Process(models.Model): follow: If True, follow the file and print new lines as they appear """ import sys + for line in self.tail_stdout(lines=lines, follow=follow): print(line, file=sys.stdout, flush=True) @@ -1698,6 +1790,7 @@ class Process(models.Model): follow: If True, follow the file and print new lines as they appear """ import sys + for line in self.tail_stderr(lines=lines, follow=follow): print(line, file=sys.stderr, flush=True) @@ -1718,12 +1811,13 @@ class Process(models.Model): """Write cmd.sh script for debugging/validation.""" if self.cmd and self.cmd_file: self.cmd_file.parent.mkdir(parents=True, exist_ok=True) + # Escape shell arguments (quote if contains space, ", or $) def escape(arg: str) -> str: - return f'"{arg.replace(chr(34), chr(92)+chr(34))}"' if any(c in arg for c in ' "$') else arg + return f'"{arg.replace(chr(34), chr(92) + chr(34))}"' if any(c in arg for c in ' "$') else arg # Write executable shell script - script = '#!/bin/bash\n' + ' '.join(escape(arg) for arg in self.cmd) + '\n' + script = "#!/bin/bash\n" + " ".join(escape(arg) for arg in self.cmd) + "\n" self.cmd_file.write_text(script) try: self.cmd_file.chmod(0o755) @@ -1763,7 +1857,7 @@ class Process(models.Model): elif isinstance(value, str): env[key] = value # Already a string, use as-is elif isinstance(value, bool): - env[key] = 'True' if value else 'False' + env[key] = "True" if value else "False" elif isinstance(value, (int, float)): env[key] = str(value) else: @@ -1772,7 +1866,7 @@ class Process(models.Model): return env - def launch(self, background: bool = False, cwd: str | None = None) -> 'Process': + def launch(self, background: bool = False, cwd: str | None = None) -> Process: """ Spawn the subprocess and update this Process record. @@ -1802,9 +1896,9 @@ class Process(models.Model): if stderr_path: stderr_path.parent.mkdir(parents=True, exist_ok=True) if stdout_path is None or stderr_path is None: - raise RuntimeError('Process log paths could not be determined') + raise RuntimeError("Process log paths could not be determined") - with open(stdout_path, 'a') as out, open(stderr_path, 'a') as err: + with open(stdout_path, "a") as out, open(stderr_path, "a") as err: proc = subprocess.Popen( self.cmd, cwd=working_dir, @@ -1819,7 +1913,7 @@ class Process(models.Model): ps_proc = psutil.Process(proc.pid) self.started_at = datetime.fromtimestamp( ps_proc.create_time(), - tz=timezone.get_current_timezone() + tz=timezone.get_current_timezone(), ) except (psutil.NoSuchProcess, psutil.AccessDenied): self.started_at = timezone.now() @@ -1913,7 +2007,7 @@ class Process(models.Model): if self.status == self.StatusChoices.EXITED: if self.exit_code == -1: self.exit_code = 137 - self.save(update_fields=['exit_code']) + self.save(update_fields=["exit_code"]) return self.exit_code if not self.is_running: @@ -2099,7 +2193,7 @@ class Process(models.Model): # Phase 2: Poll all processes in parallel all_procs = children + [proc] - still_running = set(p.pid for p in all_procs) + still_running = {p.pid for p in all_procs} while still_running and time.time() < deadline: time.sleep(0.1) @@ -2162,7 +2256,7 @@ class Process(models.Model): # ========================================================================= @classmethod - def get_running(cls, process_type: str | None = None, machine: 'Machine | None' = None) -> 'QuerySet[Process]': + def get_running(cls, process_type: str | None = None, machine: Machine | None = None) -> QuerySet[Process]: """ Get all running processes, optionally filtered by type. @@ -2187,7 +2281,7 @@ class Process(models.Model): return qs @classmethod - def get_running_count(cls, process_type: str | None = None, machine: 'Machine | None' = None) -> int: + def get_running_count(cls, process_type: str | None = None, machine: Machine | None = None) -> int: """ Get count of running processes. @@ -2197,7 +2291,7 @@ class Process(models.Model): return cls.get_running(process_type=process_type, machine=machine).count() @classmethod - def stop_all(cls, process_type: str | None = None, machine: 'Machine | None' = None, graceful: bool = True) -> int: + def stop_all(cls, process_type: str | None = None, machine: Machine | None = None, graceful: bool = True) -> int: """ Stop all running processes of a given type. @@ -2220,7 +2314,7 @@ class Process(models.Model): return stopped @classmethod - def get_next_worker_id(cls, process_type: str = 'worker', machine: 'Machine | None' = None) -> int: + def get_next_worker_id(cls, process_type: str = "worker", machine: Machine | None = None) -> int: """ Get the next available worker ID for spawning new workers. @@ -2255,13 +2349,13 @@ class Process(models.Model): from pathlib import Path from django.conf import settings - chrome_utils = Path(__file__).parent.parent / 'plugins' / 'chrome' / 'chrome_utils.js' + chrome_utils = Path(__file__).parent.parent / "plugins" / "chrome" / "chrome_utils.js" if not chrome_utils.exists(): return 0 try: result = subprocess.run( - ['node', str(chrome_utils), 'killZombieChrome', str(settings.DATA_DIR)], + ["node", str(chrome_utils), "killZombieChrome", str(settings.DATA_DIR)], capture_output=True, timeout=30, text=True, @@ -2269,17 +2363,17 @@ class Process(models.Model): if result.returncode == 0: killed = int(result.stdout.strip()) if killed > 0: - print(f'[yellow]🧹 Cleaned up {killed} orphaned Chrome processes[/yellow]') + print(f"[yellow]🧹 Cleaned up {killed} orphaned Chrome processes[/yellow]") return killed except (subprocess.TimeoutExpired, ValueError, FileNotFoundError) as e: - print(f'[red]Failed to cleanup orphaned Chrome: {e}[/red]') + print(f"[red]Failed to cleanup orphaned Chrome: {e}[/red]") return 0 @classmethod def cleanup_orphaned_workers(cls) -> int: """ - Kill orphaned worker/hook processes whose root process is no longer running. + Mark orphaned worker/hook processes as EXITED in the DB. Orphaned if: - Root (orchestrator/cli) is not running, or @@ -2287,7 +2381,7 @@ class Process(models.Model): Standalone worker runs (archivebox run --snapshot-id) are allowed. """ - killed = 0 + cleaned = 0 running_children = cls.objects.filter( process_type__in=[cls.TypeChoices.WORKER, cls.TypeChoices.HOOK], @@ -2307,24 +2401,22 @@ class Process(models.Model): if root.process_type in (cls.TypeChoices.ORCHESTRATOR, cls.TypeChoices.CLI) and root.is_running: continue - try: - if proc.process_type == cls.TypeChoices.HOOK: - proc.kill_tree(graceful_timeout=1.0) - else: - proc.terminate(graceful_timeout=1.0) - killed += 1 - except Exception: - continue + proc.status = cls.StatusChoices.EXITED + proc.ended_at = proc.ended_at or timezone.now() + proc.exit_code = proc.exit_code if proc.exit_code is not None else 0 + proc.save(update_fields=["status", "ended_at", "exit_code"]) + cleaned += 1 - if killed: - print(f'[yellow]🧹 Cleaned up {killed} orphaned worker/hook process(es)[/yellow]') - return killed + if cleaned: + print(f"[yellow]🧹 Cleaned up {cleaned} orphaned worker/hook process record(s)[/yellow]") + return cleaned # ============================================================================= # Binary State Machine # ============================================================================= + class BinaryMachine(BaseStateMachine): """ State machine for managing Binary installation lifecycle. @@ -2345,7 +2437,7 @@ class BinaryMachine(BaseStateMachine): If installation fails, Binary stays in QUEUED with retry_at bumped. """ - model_attr_name = 'binary' + model_attr_name = "binary" binary: Binary # States @@ -2353,10 +2445,7 @@ class BinaryMachine(BaseStateMachine): installed = State(value=Binary.StatusChoices.INSTALLED, final=True) # Tick Event - install happens during transition - tick = ( - queued.to.itself(unless='can_install') - | queued.to(installed, cond='can_install', on='on_install') - ) + tick = queued.to.itself(unless="can_install") | queued.to(installed, cond="can_install", on="on_install") def can_install(self) -> bool: """Check if binary installation can start.""" @@ -2374,7 +2463,7 @@ class BinaryMachine(BaseStateMachine): """Called during queued→installed transition. Runs installation synchronously.""" import sys - print(f'[cyan] 🔄 BinaryMachine.on_install() - installing {self.binary.name}[/cyan]', file=sys.stderr) + print(f"[cyan] 🔄 BinaryMachine.on_install() - installing {self.binary.name}[/cyan]", file=sys.stderr) # Run installation hooks (synchronous, updates abspath/version/sha256 and sets status) self.binary.run() @@ -2385,7 +2474,7 @@ class BinaryMachine(BaseStateMachine): if self.binary.status != Binary.StatusChoices.INSTALLED: # Installation failed - abort transition, stay in queued - print(f'[red] ❌ BinaryMachine - {self.binary.name} installation failed, retrying later[/red]', file=sys.stderr) + print(f"[red] ❌ BinaryMachine - {self.binary.name} installation failed, retrying later[/red]", file=sys.stderr) # Bump retry_at to try again later self.binary.update_and_requeue( @@ -2397,9 +2486,9 @@ class BinaryMachine(BaseStateMachine): self.binary.increment_health_stats(success=False) # Abort the transition - this will raise an exception and keep us in queued - raise Exception(f'Binary {self.binary.name} installation failed') + raise Exception(f"Binary {self.binary.name} installation failed") - print(f'[cyan] ✅ BinaryMachine - {self.binary.name} installed successfully[/cyan]', file=sys.stderr) + print(f"[cyan] ✅ BinaryMachine - {self.binary.name} installed successfully[/cyan]", file=sys.stderr) @installed.enter def enter_installed(self): @@ -2417,6 +2506,7 @@ class BinaryMachine(BaseStateMachine): # Process State Machine # ============================================================================= + class ProcessMachine(BaseStateMachine): """ State machine for managing Process (OS subprocess) lifecycle. @@ -2449,7 +2539,7 @@ class ProcessMachine(BaseStateMachine): the archival-specific logic (status, output parsing, etc.). """ - model_attr_name = 'process' + model_attr_name = "process" process: Process # States @@ -2459,10 +2549,10 @@ class ProcessMachine(BaseStateMachine): # Tick Event - transitions based on conditions tick = ( - queued.to.itself(unless='can_start') - | queued.to(running, cond='can_start') - | running.to.itself(unless='is_exited') - | running.to(exited, cond='is_exited') + queued.to.itself(unless="can_start") + | queued.to(running, cond="can_start") + | running.to.itself(unless="is_exited") + | running.to(exited, cond="is_exited") ) # Additional events (for explicit control) diff --git a/archivebox/manage.py b/archivebox/manage.py index 37d436a9..ee4e8d7b 100755 --- a/archivebox/manage.py +++ b/archivebox/manage.py @@ -2,30 +2,30 @@ import os import sys -if __name__ == '__main__': +if __name__ == "__main__": # if you're a developer working on archivebox, still prefer the archivebox # versions of ./manage.py commands whenever possible. When that's not possible # (e.g. makemigrations), you can comment out this check temporarily - allowed_commands = ['makemigrations', 'migrate', 'startapp','squashmigrations', 'generate_stubs', 'test'] + allowed_commands = ["makemigrations", "migrate", "startapp", "squashmigrations", "generate_stubs", "test"] if not any(cmd in sys.argv for cmd in allowed_commands): print("[X] Don't run ./manage.py directly (unless you are a developer running makemigrations):") print() - print(' Hint: Use these archivebox CLI commands instead of the ./manage.py equivalents:') - print(' archivebox init (migrates the databse to latest version)') - print(' archivebox server (runs the Django web server)') - print(' archivebox shell (opens an iPython Django shell with all models imported)') - print(' archivebox manage [cmd] (any other management commands)') + print(" Hint: Use these archivebox CLI commands instead of the ./manage.py equivalents:") + print(" archivebox init (migrates the database to latest version)") + print(" archivebox server (runs the Django web server)") + print(" archivebox shell (opens an iPython Django shell with all models imported)") + print(" archivebox manage [cmd] (any other management commands)") raise SystemExit(2) - os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings') + os.environ.setdefault("DJANGO_SETTINGS_MODULE", "core.settings") try: from django.core.management import execute_from_command_line except ImportError as exc: raise ImportError( "Couldn't import Django. Are you sure it's installed and " "available on your PYTHONPATH environment variable? Did you " - "forget to activate a virtual environment?" + "forget to activate a virtual environment?", ) from exc execute_from_command_line(sys.argv) diff --git a/archivebox/mcp/__init__.py b/archivebox/mcp/__init__.py index d05fc2fc..dd4a67f3 100644 --- a/archivebox/mcp/__init__.py +++ b/archivebox/mcp/__init__.py @@ -1,4 +1,4 @@ -__package__ = 'archivebox.mcp' +__package__ = "archivebox.mcp" """ Model Context Protocol (MCP) server for ArchiveBox. diff --git a/archivebox/mcp/apps.py b/archivebox/mcp/apps.py index 2eeb3b2b..3413e01b 100644 --- a/archivebox/mcp/apps.py +++ b/archivebox/mcp/apps.py @@ -1,9 +1,9 @@ -__package__ = 'archivebox.mcp' +__package__ = "archivebox.mcp" from django.apps import AppConfig class MCPConfig(AppConfig): - name = 'mcp' - verbose_name = 'Model Context Protocol Server' - default_auto_field = 'django.db.models.BigAutoField' + name = "mcp" + verbose_name = "Model Context Protocol Server" + default_auto_field = "django.db.models.BigAutoField" diff --git a/archivebox/mcp/server.py b/archivebox/mcp/server.py index 19e31b7e..26196b79 100644 --- a/archivebox/mcp/server.py +++ b/archivebox/mcp/server.py @@ -8,7 +8,7 @@ Click command metadata. Handles JSON-RPC 2.0 requests over stdio transport. import sys import json import traceback -from typing import Any, Optional +from typing import Any import click from click.testing import CliRunner @@ -21,9 +21,9 @@ class MCPJSONEncoder(json.JSONEncoder): def default(self, o): # Handle Click's sentinel values - sentinel_type = getattr(click.core, '_SentinelClass', None) + sentinel_type = getattr(click.core, "_SentinelClass", None) if isinstance(sentinel_type, type) and isinstance(o, sentinel_type): - return None + return None # Handle tuples (convert to lists) if isinstance(o, tuple): @@ -76,13 +76,13 @@ def click_command_to_mcp_tool(cmd_name: str, click_command: click.Command) -> di # Extract parameters from Click command for param in click_command.params: # Skip internal parameters - if param.name is None or param.name in ('help', 'version'): + if param.name is None or param.name in ("help", "version"): continue param_schema = click_type_to_json_schema_type(param.type) # Add description from Click help text - help_text = getattr(param, 'help', None) + help_text = getattr(param, "help", None) if help_text: param_schema["description"] = help_text @@ -95,7 +95,7 @@ def click_command_to_mcp_tool(cmd_name: str, click_command: click.Command) -> di properties[param.name] = { "type": "array", "items": param_schema, - "description": param_schema.get("description", f"Multiple {param.name} values") + "description": param_schema.get("description", f"Multiple {param.name} values"), } else: properties[param.name] = param_schema @@ -110,8 +110,8 @@ def click_command_to_mcp_tool(cmd_name: str, click_command: click.Command) -> di "inputSchema": { "type": "object", "properties": properties, - "required": required - } + "required": required, + }, } @@ -124,21 +124,25 @@ def execute_click_command(cmd_name: str, click_command: click.Command, arguments # Setup Django for archive commands (commands that need database access) from archivebox.cli import ArchiveBoxGroup + if cmd_name in ArchiveBoxGroup.archive_commands: try: from archivebox.config.django import setup_django from archivebox.misc.checks import check_data_folder + setup_django() check_data_folder() except Exception as e: # If Django setup fails, return error (unless it's manage/shell which handle this themselves) - if cmd_name not in ('manage', 'shell'): + if cmd_name not in ("manage", "shell"): return { - "content": [{ - "type": "text", - "text": f"Error setting up Django: {str(e)}\n\nMake sure you're running the MCP server from inside an ArchiveBox data directory." - }], - "isError": True + "content": [ + { + "type": "text", + "text": f"Error setting up Django: {str(e)}\n\nMake sure you're running the MCP server from inside an ArchiveBox data directory.", + }, + ], + "isError": True, } # Use Click's test runner to invoke command programmatically @@ -152,7 +156,7 @@ def execute_click_command(cmd_name: str, click_command: click.Command, arguments positional_args = [] for key, value in arguments.items(): - param_name = key.replace('_', '-') # Click uses dashes + param_name = key.replace("_", "-") # Click uses dashes param = param_map.get(key) # Check if this is a positional Argument (not an Option) @@ -168,14 +172,14 @@ def execute_click_command(cmd_name: str, click_command: click.Command, arguments # Options - add with dashes if isinstance(value, bool): if value: - args.append(f'--{param_name}') + args.append(f"--{param_name}") elif isinstance(value, list): # Multiple values for an option (rare) for item in value: - args.append(f'--{param_name}') + args.append(f"--{param_name}") args.append(str(item)) elif value is not None: - args.append(f'--{param_name}') + args.append(f"--{param_name}") args.append(str(value)) # Add positional arguments at the end @@ -189,42 +193,50 @@ def execute_click_command(cmd_name: str, click_command: click.Command, arguments content = [] if result.output: - content.append({ - "type": "text", - "text": result.output - }) + content.append( + { + "type": "text", + "text": result.output, + }, + ) if result.stderr_bytes: - stderr_text = result.stderr_bytes.decode('utf-8', errors='replace') + stderr_text = result.stderr_bytes.decode("utf-8", errors="replace") if stderr_text.strip(): - content.append({ - "type": "text", - "text": f"[stderr]\n{stderr_text}" - }) + content.append( + { + "type": "text", + "text": f"[stderr]\n{stderr_text}", + }, + ) # Check exit code is_error = result.exit_code != 0 if is_error and not content: - content.append({ - "type": "text", - "text": f"Command failed with exit code {result.exit_code}" - }) + content.append( + { + "type": "text", + "text": f"Command failed with exit code {result.exit_code}", + }, + ) return { "content": content or [{"type": "text", "text": "(no output)"}], - "isError": is_error + "isError": is_error, } except Exception as e: # Capture any exceptions during execution error_trace = traceback.format_exc() return { - "content": [{ - "type": "text", - "text": f"Error executing {cmd_name}: {str(e)}\n\n{error_trace}" - }], - "isError": True + "content": [ + { + "type": "text", + "text": f"Error executing {cmd_name}: {str(e)}\n\n{error_trace}", + }, + ], + "isError": True, } @@ -244,7 +256,7 @@ class MCPServer: self.protocol_version = "2025-11-25" self._tool_cache = {} # Cache loaded Click commands - def get_click_command(self, cmd_name: str) -> Optional[click.Command]: + def get_click_command(self, cmd_name: str) -> click.Command | None: """Get a Click command by name, with caching""" if cmd_name not in self._tool_cache: if cmd_name not in self.cli_group.all_subcommands: @@ -257,12 +269,12 @@ class MCPServer: return { "protocolVersion": self.protocol_version, "capabilities": { - "tools": {} + "tools": {}, }, "serverInfo": { "name": "archivebox-mcp", - "version": VERSION - } + "version": VERSION, + }, } def handle_tools_list(self, params: dict) -> dict: @@ -283,8 +295,8 @@ class MCPServer: def handle_tools_call(self, params: dict) -> dict: """Handle MCP tools/call request - executes a CLI command""" - tool_name = params.get('name') - arguments = params.get('arguments', {}) + tool_name = params.get("name") + arguments = params.get("arguments", {}) if not tool_name: raise ValueError("Missing required parameter: name") @@ -303,17 +315,17 @@ class MCPServer: Supports MCP methods: initialize, tools/list, tools/call """ - method = request.get('method') - params = request.get('params', {}) - request_id = request.get('id') + method = request.get("method") + params = request.get("params", {}) + request_id = request.get("id") try: # Route to appropriate handler - if method == 'initialize': + if method == "initialize": result = self.handle_initialize(params) - elif method == 'tools/list': + elif method == "tools/list": result = self.handle_tools_list(params) - elif method == 'tools/call': + elif method == "tools/call": result = self.handle_tools_call(params) else: # Method not found @@ -322,15 +334,15 @@ class MCPServer: "id": request_id, "error": { "code": -32601, - "message": f"Method not found: {method}" - } + "message": f"Method not found: {method}", + }, } # Success response return { "jsonrpc": "2.0", "id": request_id, - "result": result + "result": result, } except Exception as e: @@ -342,8 +354,8 @@ class MCPServer: "error": { "code": -32603, "message": str(e), - "data": error_trace - } + "data": error_trace, + }, } def run_stdio_server(self): @@ -378,8 +390,8 @@ class MCPServer: "error": { "code": -32700, "message": "Parse error", - "data": str(e) - } + "data": str(e), + }, } print(json.dumps(error_response, cls=MCPJSONEncoder), flush=True) diff --git a/archivebox/misc/__init__.py b/archivebox/misc/__init__.py index c305c57e..1619d056 100644 --- a/archivebox/misc/__init__.py +++ b/archivebox/misc/__init__.py @@ -1 +1 @@ -__package__ = 'archivebox.misc' +__package__ = "archivebox.misc" diff --git a/archivebox/misc/checks.py b/archivebox/misc/checks.py index 91d4c081..46444662 100644 --- a/archivebox/misc/checks.py +++ b/archivebox/misc/checks.py @@ -1,4 +1,4 @@ -__package__ = 'archivebox.misc' +__package__ = "archivebox.misc" import os import sys @@ -23,69 +23,74 @@ def check_data_folder() -> None: from archivebox import DATA_DIR, ARCHIVE_DIR from archivebox.config import CONSTANTS from archivebox.config.paths import create_and_chown_dir, get_or_create_working_tmp_dir, get_or_create_working_lib_dir - + archive_dir_exists = os.path.isdir(ARCHIVE_DIR) if not archive_dir_exists: - print('[red][X] No archivebox index found in the current directory.[/red]', file=sys.stderr) - print(f' {DATA_DIR}', file=sys.stderr) + print("[red][X] No archivebox index found in the current directory.[/red]", file=sys.stderr) + print(f" {DATA_DIR}", file=sys.stderr) print(file=sys.stderr) - print(' [violet]Hint[/violet]: Are you running archivebox in the right folder?', file=sys.stderr) - print(' cd path/to/your/archive/folder', file=sys.stderr) - print(' archivebox [command]', file=sys.stderr) + print(" [violet]Hint[/violet]: Are you running archivebox in the right folder?", file=sys.stderr) + print(" cd path/to/your/archive/folder", file=sys.stderr) + print(" archivebox [command]", file=sys.stderr) print(file=sys.stderr) - print(' [violet]Hint[/violet]: To create a new archive collection or import existing data in this folder, run:', file=sys.stderr) - print(' archivebox init', file=sys.stderr) + print(" [violet]Hint[/violet]: To create a new archive collection or import existing data in this folder, run:", file=sys.stderr) + print(" archivebox init", file=sys.stderr) raise SystemExit(2) - - + # Create data dir subdirs create_and_chown_dir(CONSTANTS.SOURCES_DIR) - create_and_chown_dir(CONSTANTS.PERSONAS_DIR / 'Default') + create_and_chown_dir(CONSTANTS.PERSONAS_DIR / "Default") create_and_chown_dir(CONSTANTS.LOGS_DIR) # create_and_chown_dir(CONSTANTS.CACHE_DIR) - + # Create /tmp and /lib dirs if they don't exist get_or_create_working_tmp_dir(autofix=True, quiet=False) get_or_create_working_lib_dir(autofix=True, quiet=False) - + # Check data dir permissions, /tmp, and /lib permissions check_data_dir_permissions() - + def check_migrations(): from archivebox import DATA_DIR from archivebox.misc.db import list_migrations pending_migrations = [name for status, name in list_migrations() if not status] - is_migrating = any(arg in sys.argv for arg in ['makemigrations', 'migrate', 'init']) + is_migrating = any(arg in sys.argv for arg in ["makemigrations", "migrate", "init"]) if pending_migrations and not is_migrating: - print('[red][X] This collection was created with an older version of ArchiveBox and must be upgraded first.[/red]') - print(f' {DATA_DIR}', file=sys.stderr) + print("[red][X] This collection was created with an older version of ArchiveBox and must be upgraded first.[/red]") + print(f" {DATA_DIR}", file=sys.stderr) print(file=sys.stderr) - print(f' [violet]Hint:[/violet] To upgrade it to the latest version and apply the {len(pending_migrations)} pending migrations, run:', file=sys.stderr) - print(' archivebox init', file=sys.stderr) + print( + f" [violet]Hint:[/violet] To upgrade it to the latest version and apply the {len(pending_migrations)} pending migrations, run:", + file=sys.stderr, + ) + print(" archivebox init", file=sys.stderr) raise SystemExit(3) def check_io_encoding(): - PYTHON_ENCODING = (sys.__stdout__ or sys.stdout or sys.__stderr__ or sys.stderr).encoding.upper().replace('UTF8', 'UTF-8') - - if PYTHON_ENCODING != 'UTF-8': - print(f'[red][X] Your system is running python3 scripts with a bad locale setting: {PYTHON_ENCODING} (it should be UTF-8).[/red]', file=sys.stderr) + PYTHON_ENCODING = (sys.__stdout__ or sys.stdout or sys.__stderr__ or sys.stderr).encoding.upper().replace("UTF8", "UTF-8") + + if PYTHON_ENCODING != "UTF-8": + print( + f"[red][X] Your system is running python3 scripts with a bad locale setting: {PYTHON_ENCODING} (it should be UTF-8).[/red]", + file=sys.stderr, + ) print(' To fix it, add the line "export PYTHONIOENCODING=UTF-8" to your ~/.bashrc file (without quotes)', file=sys.stderr) print(' Or if you\'re using ubuntu/debian, run "dpkg-reconfigure locales"', file=sys.stderr) - print('') - print(' Confirm that it\'s fixed by opening a new shell and running:', file=sys.stderr) + print("") + print(" Confirm that it's fixed by opening a new shell and running:", file=sys.stderr) print(' python3 -c "import sys; print(sys.stdout.encoding)" # should output UTF-8', file=sys.stderr) raise SystemExit(2) - + # # hard errors: check python version # if sys.version_info[:3] < (3, 10, 0): # print('[red][X] Python version is not new enough: {sys.version} (>3.10 is required)[/red]', file=sys.stderr) # print(' See https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting#python for help upgrading your Python installation.', file=sys.stderr) # raise SystemExit(2) - + # # hard errors: check django version # if int(django.VERSION[0]) < 5: # print('[red][X] Django version is not new enough: {django.VERSION[:3]} (>=5.0 is required)[/red]', file=sys.stderr) @@ -96,35 +101,44 @@ def check_io_encoding(): def check_not_root(): from archivebox.config.permissions import IS_ROOT, IN_DOCKER - attempted_command = ' '.join(sys.argv[1:]) if len(sys.argv) > 1 else '' - is_getting_help = '-h' in sys.argv or '--help' in sys.argv or 'help' in sys.argv - is_getting_version = '--version' in sys.argv or 'version' in sys.argv - is_installing = 'setup' in sys.argv or 'install' in sys.argv + attempted_command = " ".join(sys.argv[1:]) if len(sys.argv) > 1 else "" + is_getting_help = "-h" in sys.argv or "--help" in sys.argv or "help" in sys.argv + is_getting_version = "--version" in sys.argv or "version" in sys.argv + is_installing = "setup" in sys.argv or "install" in sys.argv if IS_ROOT and not (is_getting_help or is_getting_version or is_installing): - print('[red][!] ArchiveBox should never be run as root![/red]', file=sys.stderr) - print(' For more information, see the security overview documentation:', file=sys.stderr) - print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#do-not-run-as-root', file=sys.stderr) + print("[red][!] ArchiveBox should never be run as root![/red]", file=sys.stderr) + print(" For more information, see the security overview documentation:", file=sys.stderr) + print(" https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#do-not-run-as-root", file=sys.stderr) if IN_DOCKER: - print('[red][!] When using Docker, you must run commands with [green]docker run[/green] instead of [yellow3]docker exec[/yellow3], e.g.:', file=sys.stderr) - print(' docker compose run archivebox {attempted_command}', file=sys.stderr) - print(f' docker run -it -v $PWD/data:/data archivebox/archivebox {attempted_command}', file=sys.stderr) - print(' or:', file=sys.stderr) - print(f' docker compose exec --user=archivebox archivebox /bin/bash -c "archivebox {attempted_command}"', file=sys.stderr) - print(f' docker exec -it --user=archivebox /bin/bash -c "archivebox {attempted_command}"', file=sys.stderr) + print( + "[red][!] When using Docker, you must run commands with [green]docker run[/green] instead of [yellow3]docker exec[/yellow3], e.g.:", + file=sys.stderr, + ) + print(" docker compose run archivebox {attempted_command}", file=sys.stderr) + print(f" docker run -it -v $PWD/data:/data archivebox/archivebox {attempted_command}", file=sys.stderr) + print(" or:", file=sys.stderr) + print( + f' docker compose exec --user=archivebox archivebox /bin/bash -c "archivebox {attempted_command}"', + file=sys.stderr, + ) + print( + f' docker exec -it --user=archivebox /bin/bash -c "archivebox {attempted_command}"', + file=sys.stderr, + ) raise SystemExit(2) def check_not_inside_source_dir(): """Prevent running ArchiveBox from inside its source directory (would pollute repo with data files).""" cwd = Path(os.getcwd()).resolve() - is_source_dir = (cwd / 'archivebox' / '__init__.py').exists() and (cwd / 'pyproject.toml').exists() - data_dir_set_elsewhere = os.environ.get('DATA_DIR', '').strip() and Path(os.environ['DATA_DIR']).resolve() != cwd - is_testing = 'pytest' in sys.modules or 'unittest' in sys.modules + is_source_dir = (cwd / "archivebox" / "__init__.py").exists() and (cwd / "pyproject.toml").exists() + data_dir_set_elsewhere = os.environ.get("DATA_DIR", "").strip() and Path(os.environ["DATA_DIR"]).resolve() != cwd + is_testing = "pytest" in sys.modules or "unittest" in sys.modules if is_source_dir and not data_dir_set_elsewhere and not is_testing: - raise SystemExit('[!] Cannot run from source dir, set DATA_DIR or cd to a data folder first') + raise SystemExit("[!] Cannot run from source dir, set DATA_DIR or cd to a data folder first") def check_data_dir_permissions(): @@ -132,28 +146,42 @@ def check_data_dir_permissions(): from archivebox.misc.logging import STDERR from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, DEFAULT_PUID, DEFAULT_PGID, IS_ROOT, USER from archivebox.config.paths import get_or_create_working_tmp_dir, get_or_create_working_lib_dir - + data_dir_stat = Path(DATA_DIR).stat() data_dir_uid, data_dir_gid = data_dir_stat.st_uid, data_dir_stat.st_gid data_owned_by_root = data_dir_uid == 0 - + # data_owned_by_default_user = data_dir_uid == DEFAULT_PUID or data_dir_gid == DEFAULT_PGID data_owner_doesnt_match = (data_dir_uid != ARCHIVEBOX_USER and data_dir_gid != ARCHIVEBOX_GROUP) if not IS_ROOT else False data_not_writable = not (os.path.isdir(DATA_DIR) and os.access(DATA_DIR, os.W_OK)) if data_owned_by_root: - STDERR.print('\n[yellow]:warning: Warning: ArchiveBox [blue]DATA_DIR[/blue] is currently owned by [red]root[/red], it must be changed before archiving can run![/yellow]') + STDERR.print( + "\n[yellow]:warning: Warning: ArchiveBox [blue]DATA_DIR[/blue] is currently owned by [red]root[/red], it must be changed before archiving can run![/yellow]", + ) elif data_owner_doesnt_match or data_not_writable: - STDERR.print(f'\n[yellow]:warning: Warning: ArchiveBox [blue]DATA_DIR[/blue] is currently owned by [red]{data_dir_uid}:{data_dir_gid}[/red], but ArchiveBox user is [blue]{ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/blue] ({USER})! (ArchiveBox may not be able to write to the data dir)[/yellow]') - + STDERR.print( + f"\n[yellow]:warning: Warning: ArchiveBox [blue]DATA_DIR[/blue] is currently owned by [red]{data_dir_uid}:{data_dir_gid}[/red], but ArchiveBox user is [blue]{ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/blue] ({USER})! (ArchiveBox may not be able to write to the data dir)[/yellow]", + ) + if data_owned_by_root or data_owner_doesnt_match or data_not_writable: - STDERR.print(f'[violet]Hint:[/violet] Change the current ownership [red]{data_dir_uid}[/red]:{data_dir_gid} (PUID:PGID) to a non-root user & group that will run ArchiveBox, e.g.:') - STDERR.print(f' [grey53]sudo[/grey53] chown -R [blue]{DEFAULT_PUID}:{DEFAULT_PGID}[/blue] {DATA_DIR.resolve()}') + STDERR.print( + f"[violet]Hint:[/violet] Change the current ownership [red]{data_dir_uid}[/red]:{data_dir_gid} (PUID:PGID) to a non-root user & group that will run ArchiveBox, e.g.:", + ) + STDERR.print(f" [grey53]sudo[/grey53] chown -R [blue]{DEFAULT_PUID}:{DEFAULT_PGID}[/blue] {DATA_DIR.resolve()}") STDERR.print() - STDERR.print('[blue]More info:[/blue]') - STDERR.print(' [link=https://github.com/ArchiveBox/ArchiveBox#storage-requirements]https://github.com/ArchiveBox/ArchiveBox#storage-requirements[/link]') - STDERR.print(' [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#permissions]https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#permissions[/link]') - STDERR.print(' [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#puid--pgid]https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#puid--pgid[/link]') - STDERR.print(' [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting#filesystem-doesnt-support-fsync-eg-network-mounts]https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting#filesystem-doesnt-support-fsync-eg-network-mounts[/link]') + STDERR.print("[blue]More info:[/blue]") + STDERR.print( + " [link=https://github.com/ArchiveBox/ArchiveBox#storage-requirements]https://github.com/ArchiveBox/ArchiveBox#storage-requirements[/link]", + ) + STDERR.print( + " [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#permissions]https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#permissions[/link]", + ) + STDERR.print( + " [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#puid--pgid]https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#puid--pgid[/link]", + ) + STDERR.print( + " [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting#filesystem-doesnt-support-fsync-eg-network-mounts]https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting#filesystem-doesnt-support-fsync-eg-network-mounts[/link]", + ) from archivebox.config.common import STORAGE_CONFIG @@ -172,8 +200,8 @@ def check_data_dir_permissions(): # Check /lib dir permissions check_lib_dir(lib_dir, throw=False, must_exist=True) - - os.umask(0o777 - int(STORAGE_CONFIG.DIR_OUTPUT_PERMISSIONS, base=8)) # noqa: F821 + + os.umask(0o777 - int(STORAGE_CONFIG.DIR_OUTPUT_PERMISSIONS, base=8)) def check_tmp_dir(tmp_dir=None, throw=False, quiet=False, must_exist=True): @@ -182,45 +210,57 @@ def check_tmp_dir(tmp_dir=None, throw=False, quiet=False, must_exist=True): from archivebox.misc.logging_util import pretty_path from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP from archivebox.config.common import STORAGE_CONFIG - + tmp_dir = tmp_dir or STORAGE_CONFIG.TMP_DIR socket_file = tmp_dir.absolute().resolve() / "supervisord.sock" if not must_exist and not os.path.isdir(tmp_dir): # just check that its viable based on its length (because dir may not exist yet, we cant check if its writable) - return len(f'file://{socket_file}') <= 96 + return len(f"file://{socket_file}") <= 96 tmp_is_valid = False - allow_no_unix_sockets = os.environ.get('ARCHIVEBOX_ALLOW_NO_UNIX_SOCKETS', '').lower() in ('1', 'true', 'yes') + allow_no_unix_sockets = os.environ.get("ARCHIVEBOX_ALLOW_NO_UNIX_SOCKETS", "").lower() in ("1", "true", "yes") try: tmp_is_valid = dir_is_writable(tmp_dir) if not allow_no_unix_sockets: tmp_is_valid = tmp_is_valid and assert_dir_can_contain_unix_sockets(tmp_dir) - assert tmp_is_valid, f'ArchiveBox user PUID={ARCHIVEBOX_USER} PGID={ARCHIVEBOX_GROUP} is unable to write to TMP_DIR={tmp_dir}' - assert len(f'file://{socket_file}') <= 96, f'ArchiveBox TMP_DIR={tmp_dir} is too long, dir containing unix socket files must be <90 chars.' + assert tmp_is_valid, f"ArchiveBox user PUID={ARCHIVEBOX_USER} PGID={ARCHIVEBOX_GROUP} is unable to write to TMP_DIR={tmp_dir}" + assert len(f"file://{socket_file}") <= 96, ( + f"ArchiveBox TMP_DIR={tmp_dir} is too long, dir containing unix socket files must be <90 chars." + ) return True except Exception as e: if not quiet: STDERR.print() - ERROR_TEXT = '\n'.join(( - '', - f'[red]:cross_mark: ArchiveBox is unable to use TMP_DIR={pretty_path(tmp_dir)}[/red]', - f' [yellow]{e}[/yellow]', - '', - '[blue]Info:[/blue] [grey53]The TMP_DIR is used for the supervisord unix socket file and other temporary files.', - ' - It [red]must[/red] be on a local drive (not inside a docker volume, remote network drive, or FUSE mount).', - f' - It [red]must[/red] be readable and writable by the ArchiveBox user (PUID={ARCHIVEBOX_USER}, PGID={ARCHIVEBOX_GROUP}).', - ' - It [red]must[/red] be a *short* path (less than 90 characters) due to UNIX path length restrictions for sockets.', - ' - It [yellow]should[/yellow] be able to hold at least 200MB of data (in-progress downloads can be large).[/grey53]', - '', - '[violet]Hint:[/violet] Fix it by setting TMP_DIR to a path that meets these requirements, e.g.:', - f' [green]archivebox config --set TMP_DIR={get_or_create_working_tmp_dir(autofix=False, quiet=True) or "/tmp/archivebox"}[/green]', - '', - )) - STDERR.print(Panel(ERROR_TEXT, expand=False, border_style='red', title='[red]:cross_mark: Error with configured TMP_DIR[/red]', subtitle='Background workers may fail to start until fixed.')) + ERROR_TEXT = "\n".join( + ( + "", + f"[red]:cross_mark: ArchiveBox is unable to use TMP_DIR={pretty_path(tmp_dir)}[/red]", + f" [yellow]{e}[/yellow]", + "", + "[blue]Info:[/blue] [grey53]The TMP_DIR is used for the supervisord unix socket file and other temporary files.", + " - It [red]must[/red] be on a local drive (not inside a docker volume, remote network drive, or FUSE mount).", + f" - It [red]must[/red] be readable and writable by the ArchiveBox user (PUID={ARCHIVEBOX_USER}, PGID={ARCHIVEBOX_GROUP}).", + " - It [red]must[/red] be a *short* path (less than 90 characters) due to UNIX path length restrictions for sockets.", + " - It [yellow]should[/yellow] be able to hold at least 200MB of data (in-progress downloads can be large).[/grey53]", + "", + "[violet]Hint:[/violet] Fix it by setting TMP_DIR to a path that meets these requirements, e.g.:", + f" [green]archivebox config --set TMP_DIR={get_or_create_working_tmp_dir(autofix=False, quiet=True) or '/tmp/archivebox'}[/green]", + "", + ), + ) + STDERR.print( + Panel( + ERROR_TEXT, + expand=False, + border_style="red", + title="[red]:cross_mark: Error with configured TMP_DIR[/red]", + subtitle="Background workers may fail to start until fixed.", + ), + ) STDERR.print() if throw: - raise OSError(f'TMP_DIR={tmp_dir} is invalid, ArchiveBox is unable to use it and the server will fail to start!') from e + raise OSError(f"TMP_DIR={tmp_dir} is invalid, ArchiveBox is unable to use it and the server will fail to start!") from e return False @@ -230,38 +270,48 @@ def check_lib_dir(lib_dir: Path | None = None, throw=False, quiet=False, must_ex from archivebox.misc.logging_util import pretty_path from archivebox.config.paths import dir_is_writable, get_or_create_working_lib_dir from archivebox.config.common import STORAGE_CONFIG - + lib_dir = lib_dir or STORAGE_CONFIG.LIB_DIR - + # assert lib_dir == STORAGE_CONFIG.LIB_DIR, "lib_dir is not the same as the one in the flat config" - + if not must_exist and not os.path.isdir(lib_dir): return True - + lib_is_valid = False try: lib_is_valid = dir_is_writable(lib_dir) - assert lib_is_valid, f'ArchiveBox user PUID={ARCHIVEBOX_USER} PGID={ARCHIVEBOX_GROUP} is unable to write to LIB_DIR={lib_dir}' + assert lib_is_valid, f"ArchiveBox user PUID={ARCHIVEBOX_USER} PGID={ARCHIVEBOX_GROUP} is unable to write to LIB_DIR={lib_dir}" return True except Exception as e: if not quiet: STDERR.print() - ERROR_TEXT = '\n'.join(( - '', - f'[red]:cross_mark: ArchiveBox is unable to use LIB_DIR={pretty_path(lib_dir)}[/red]', - f' [yellow]{e}[/yellow]', - '', - '[blue]Info:[/blue] [grey53]The LIB_DIR is used to store ArchiveBox auto-installed plugin library and binary dependencies.', - f' - It [red]must[/red] be readable and writable by the ArchiveBox user (PUID={ARCHIVEBOX_USER}, PGID={ARCHIVEBOX_GROUP}).', - ' - It [yellow]should[/yellow] be on a local (ideally fast) drive like an SSD or HDD (not on a network drive or external HDD).', - ' - It [yellow]should[/yellow] be able to hold at least 1GB of data (some dependencies like Chrome can be large).[/grey53]', - '', - '[violet]Hint:[/violet] Fix it by setting LIB_DIR to a path that meets these requirements, e.g.:', - f' [green]archivebox config --set LIB_DIR={get_or_create_working_lib_dir(autofix=False, quiet=True) or "/usr/local/share/archivebox"}[/green]', - '', - )) - STDERR.print(Panel(ERROR_TEXT, expand=False, border_style='red', title='[red]:cross_mark: Error with configured LIB_DIR[/red]', subtitle='[yellow]Dependencies may not auto-install properly until fixed.[/yellow]')) + ERROR_TEXT = "\n".join( + ( + "", + f"[red]:cross_mark: ArchiveBox is unable to use LIB_DIR={pretty_path(lib_dir)}[/red]", + f" [yellow]{e}[/yellow]", + "", + "[blue]Info:[/blue] [grey53]The LIB_DIR is used to store ArchiveBox auto-installed plugin library and binary dependencies.", + f" - It [red]must[/red] be readable and writable by the ArchiveBox user (PUID={ARCHIVEBOX_USER}, PGID={ARCHIVEBOX_GROUP}).", + " - It [yellow]should[/yellow] be on a local (ideally fast) drive like an SSD or HDD (not on a network drive or external HDD).", + " - It [yellow]should[/yellow] be able to hold at least 1GB of data (some dependencies like Chrome can be large).[/grey53]", + "", + "[violet]Hint:[/violet] Fix it by setting LIB_DIR to a path that meets these requirements, e.g.:", + f" [green]archivebox config --set LIB_DIR={get_or_create_working_lib_dir(autofix=False, quiet=True) or '/usr/local/share/archivebox'}[/green]", + "", + ), + ) + STDERR.print( + Panel( + ERROR_TEXT, + expand=False, + border_style="red", + title="[red]:cross_mark: Error with configured LIB_DIR[/red]", + subtitle="[yellow]Dependencies may not auto-install properly until fixed.[/yellow]", + ), + ) STDERR.print() if throw: - raise OSError(f'LIB_DIR={lib_dir} is invalid, ArchiveBox is unable to use it and dependencies will fail to install.') from e + raise OSError(f"LIB_DIR={lib_dir} is invalid, ArchiveBox is unable to use it and dependencies will fail to install.") from e return False diff --git a/archivebox/misc/db.py b/archivebox/misc/db.py index c438df53..d9e66f3f 100644 --- a/archivebox/misc/db.py +++ b/archivebox/misc/db.py @@ -2,18 +2,18 @@ Database utility functions for ArchiveBox. """ -__package__ = 'archivebox.misc' +__package__ = "archivebox.misc" from io import StringIO from pathlib import Path -from typing import Any, List, Tuple +from typing import Any from archivebox.config import DATA_DIR from archivebox.misc.util import enforce_types @enforce_types -def list_migrations(out_dir: Path = DATA_DIR) -> List[Tuple[bool, str]]: +def list_migrations(out_dir: Path = DATA_DIR) -> list[tuple[bool, str]]: """List all Django migrations and their status""" from django.core.management import call_command @@ -23,9 +23,9 @@ def list_migrations(out_dir: Path = DATA_DIR) -> List[Tuple[bool, str]]: migrations = [] for line in out.readlines(): - if line.strip() and ']' in line: - status_str, name_str = line.strip().split(']', 1) - is_applied = 'X' in status_str + if line.strip() and "]" in line: + status_str, name_str = line.strip().split("]", 1) + is_applied = "X" in status_str migration_name = name_str.strip() migrations.append((is_applied, migration_name)) @@ -33,23 +33,21 @@ def list_migrations(out_dir: Path = DATA_DIR) -> List[Tuple[bool, str]]: @enforce_types -def apply_migrations(out_dir: Path = DATA_DIR) -> List[str]: +def apply_migrations(out_dir: Path = DATA_DIR) -> list[str]: """Apply pending Django migrations""" from django.core.management import call_command out1 = StringIO() - call_command("migrate", interactive=False, database='default', stdout=out1) + call_command("migrate", interactive=False, database="default", stdout=out1) out1.seek(0) - return [ - line.strip() for line in out1.readlines() if line.strip() - ] + return [line.strip() for line in out1.readlines() if line.strip()] @enforce_types -def get_admins(out_dir: Path = DATA_DIR) -> List[Any]: +def get_admins(out_dir: Path = DATA_DIR) -> list[Any]: """Get list of superuser accounts""" from django.contrib.auth.models import User - return list(User.objects.filter(is_superuser=True).exclude(username='system')) + return list(User.objects.filter(is_superuser=True).exclude(username="system")) diff --git a/archivebox/misc/debugging.py b/archivebox/misc/debugging.py index d92109bf..4ada510c 100644 --- a/archivebox/misc/debugging.py +++ b/archivebox/misc/debugging.py @@ -1,6 +1,7 @@ from functools import wraps from time import time + def timed_function(func): """ Very simple profiling decorator for debugging. @@ -8,23 +9,25 @@ def timed_function(func): @timed_function def my_func(): ... - + More advanced alternatives: - viztracer ../.venv/bin/archivebox manage check # https://viztracer.readthedocs.io/en/latest/filter.html - python -m cProfile -o archivebox.prof ../.venv/bin/archivebox manage check; snakeviz archivebox.prof - Django Debug Toolbar + django-debug-toolbar-flamegraph + Django Requests Tracker (requests-tracker) """ + @wraps(func) def wrap(*args, **kwargs): - if args and hasattr(args[0], '__module__'): + if args and hasattr(args[0], "__module__"): module = args[0].__module__ else: module = func.__module__ ts_start = time() result = func(*args, **kwargs) ts_end = time() - ms_elapsed = int((ts_end-ts_start) * 1000) - print(f'[DEBUG][{ms_elapsed}ms] {module}.{func.__name__}(...)') + ms_elapsed = int((ts_end - ts_start) * 1000) + print(f"[DEBUG][{ms_elapsed}ms] {module}.{func.__name__}(...)") return result + return wrap diff --git a/archivebox/misc/folders.py b/archivebox/misc/folders.py index dd8bbc1f..2d9a55a5 100644 --- a/archivebox/misc/folders.py +++ b/archivebox/misc/folders.py @@ -5,20 +5,19 @@ Note: This file only contains legacy cleanup utilities. The DB is the single source of truth - use Snapshot.objects queries for all status checks. """ -__package__ = 'archivebox.misc' +__package__ = "archivebox.misc" import os import json import shutil from pathlib import Path -from typing import Tuple, List from archivebox.config import DATA_DIR, CONSTANTS from archivebox.misc.util import enforce_types @enforce_types -def fix_invalid_folder_locations(out_dir: Path = DATA_DIR) -> Tuple[List[str], List[str]]: +def fix_invalid_folder_locations(out_dir: Path = DATA_DIR) -> tuple[list[str], list[str]]: """ Legacy cleanup: Move folders to their correct timestamp-named locations based on index.json. @@ -29,19 +28,19 @@ def fix_invalid_folder_locations(out_dir: Path = DATA_DIR) -> Tuple[List[str], L cant_fix = [] for entry in os.scandir(out_dir / CONSTANTS.ARCHIVE_DIR_NAME): if entry.is_dir(follow_symlinks=True): - index_path = Path(entry.path) / 'index.json' + index_path = Path(entry.path) / "index.json" if index_path.exists(): try: - with open(index_path, 'r') as f: + with open(index_path) as f: data = json.load(f) - timestamp = data.get('timestamp') + timestamp = data.get("timestamp") except Exception: continue if not timestamp: continue - if not entry.path.endswith(f'/{timestamp}'): + if not entry.path.endswith(f"/{timestamp}"): dest = out_dir / CONSTANTS.ARCHIVE_DIR_NAME / timestamp if dest.exists(): cant_fix.append(entry.path) diff --git a/archivebox/misc/hashing.py b/archivebox/misc/hashing.py index 3b9208a9..f671195e 100644 --- a/archivebox/misc/hashing.py +++ b/archivebox/misc/hashing.py @@ -2,20 +2,22 @@ import hashlib import mimetypes from functools import lru_cache from pathlib import Path -from typing import Callable +from collections.abc import Callable from datetime import datetime + @lru_cache(maxsize=1024) def _cached_file_hash(filepath: str, size: int, mtime: float) -> str: """Internal function to calculate file hash with cache key based on path, size and mtime.""" sha256_hash = hashlib.sha256() - with open(filepath, 'rb') as f: - for chunk in iter(lambda: f.read(4096), b''): + with open(filepath, "rb") as f: + for chunk in iter(lambda: f.read(4096), b""): sha256_hash.update(chunk) return sha256_hash.hexdigest() + @lru_cache(maxsize=10) def hash_file(file_path: Path, pwd: Path | None = None) -> str: """Calculate SHA256 hash of a file with caching based on path, size and mtime.""" @@ -30,9 +32,10 @@ def hash_file(file_path: Path, pwd: Path | None = None) -> str: return _cached_file_hash( str(abs_path), stat_info.st_size, - stat_info.st_mtime + stat_info.st_mtime, ) + @lru_cache(maxsize=10) def get_dir_hashes(dir_path: Path, pwd: Path | None = None, filter_func: Callable | None = None, max_depth: int = -1) -> dict[str, str]: """Calculate SHA256 hashes for all files and directories recursively.""" @@ -48,9 +51,12 @@ def get_dir_hashes(dir_path: Path, pwd: Path | None = None, filter_func: Callabl # Get all files recursively all_files = get_dir_entries( - dir_path, pwd=pwd, recursive=True, - include_files=True, include_dirs=False, - filter_func=filter_func + dir_path, + pwd=pwd, + recursive=True, + include_files=True, + include_dirs=False, + filter_func=filter_func, ) hashes: dict[str, str] = {} @@ -65,39 +71,48 @@ def get_dir_hashes(dir_path: Path, pwd: Path | None = None, filter_func: Callabl # Calculate hashes for all directories subdirs = get_dir_entries( - dir_path, pwd=pwd, recursive=True, - include_files=False, include_dirs=True, - include_hidden=False, filter_func=filter_func, - max_depth=max_depth + dir_path, + pwd=pwd, + recursive=True, + include_files=False, + include_dirs=True, + include_hidden=False, + filter_func=filter_func, + max_depth=max_depth, ) for subdir in subdirs: subdir_path = dir_path / subdir subdir_hashes = get_dir_hashes( - subdir_path, filter_func=filter_func, - max_depth=0 + subdir_path, + filter_func=filter_func, + max_depth=0, ) - hashes[subdir] = subdir_hashes['.'] + hashes[subdir] = subdir_hashes["."] # Filter results by max_depth if max_depth >= 0: - hashes = { - path: value for path, value in hashes.items() - if len(Path(path).parts) <= max_depth + 1 - } + hashes = {path: value for path, value in hashes.items() if len(Path(path).parts) <= max_depth + 1} # Calculate root directory hash hashable_summary.sort() - root_sha256 = hashlib.sha256('\n'.join(hashable_summary).encode()).hexdigest() - hashes['.'] = root_sha256 + root_sha256 = hashlib.sha256("\n".join(hashable_summary).encode()).hexdigest() + hashes["."] = root_sha256 return hashes @lru_cache(maxsize=128) -def get_dir_entries(dir_path: Path, pwd: Path | None = None, recursive: bool = True, - include_files: bool = True, include_dirs: bool = True, include_hidden: bool = False, - filter_func: Callable | None = None, max_depth: int = -1) -> tuple[str, ...]: +def get_dir_entries( + dir_path: Path, + pwd: Path | None = None, + recursive: bool = True, + include_files: bool = True, + include_dirs: bool = True, + include_hidden: bool = False, + filter_func: Callable | None = None, + max_depth: int = -1, +) -> tuple[str, ...]: """Get filtered list of directory entries.""" pwd = Path(pwd) if pwd else None dir_path = Path(dir_path) @@ -107,20 +122,20 @@ def get_dir_entries(dir_path: Path, pwd: Path | None = None, recursive: bool = T results = [] def process_path(path: Path, depth: int): - if not include_hidden and path.name.startswith('.'): + if not include_hidden and path.name.startswith("."): return False if max_depth >= 0 and depth > max_depth: return False if filter_func: info = { "abspath": str(path.absolute()), - "relpath": str(path.relative_to(dir_path)) + "relpath": str(path.relative_to(dir_path)), } if not filter_func(info): return False return True - for path in dir_path.rglob('*') if recursive else dir_path.glob('*'): + for path in dir_path.rglob("*") if recursive else dir_path.glob("*"): current_depth = len(path.relative_to(dir_path).parts) if path.is_file() and include_files and process_path(path, current_depth): @@ -133,6 +148,7 @@ def get_dir_entries(dir_path: Path, pwd: Path | None = None, recursive: bool = T return tuple(sorted(results)) # Make immutable for caching + @lru_cache(maxsize=1024) def get_dir_sizes(dir_path: Path, pwd: Path | None = None, **kwargs) -> dict[str, int]: """Calculate sizes for all files and directories recursively.""" @@ -146,10 +162,10 @@ def get_dir_sizes(dir_path: Path, pwd: Path | None = None, **kwargs) -> dict[str sizes[path_key] = full_path.stat().st_size else: total = 0 - for file_path in full_path.rglob('*'): - if file_path.is_file() and not file_path.name.startswith('.'): + for file_path in full_path.rglob("*"): + if file_path.is_file() and not file_path.name.startswith("."): total += file_path.stat().st_size - sizes[path_key + '/'] = total + sizes[path_key + "/"] = total return sizes @@ -165,23 +181,23 @@ def get_dir_info(dir_path: Path, pwd: Path | None = None, filter_func: Callable hashes = get_dir_hashes(dir_path, pwd=pwd, filter_func=filter_func, max_depth=max_depth) sizes = get_dir_sizes(str(dir_path), pwd=pwd, filter_func=filter_func, max_depth=max_depth) - num_total_subpaths = sum(1 for name in hashes if name != '.') + num_total_subpaths = sum(1 for name in hashes if name != ".") details = {} for filename, sha256_hash in sorted(hashes.items()): abs_path = (dir_path / filename).resolve() stat_info = abs_path.stat() - num_subpaths = sum(1 for p in hashes if p.startswith(filename + '/')) + num_subpaths = sum(1 for p in hashes if p.startswith(filename + "/")) is_dir = abs_path.is_dir() if is_dir: - mime_type = 'inode/directory' + mime_type = "inode/directory" basename = abs_path.name - extension = '' - num_bytes = sizes[filename + '/'] - if filename == '.': + extension = "" + num_bytes = sizes[filename + "/"] + if filename == ".": num_subpaths = num_total_subpaths else: - filename += '/' + filename += "/" num_subpaths = num_subpaths else: # is_file num_subpaths = None @@ -191,14 +207,14 @@ def get_dir_info(dir_path: Path, pwd: Path | None = None, filter_func: Callable num_bytes = sizes[filename] details[filename] = { - 'basename': basename, - 'mime_type': mime_type, - 'extension': extension, - 'num_subpaths': num_subpaths, - 'num_bytes': num_bytes, - 'hash_sha256': sha256_hash, - 'created_at': datetime.fromtimestamp(stat_info.st_ctime).isoformat(), - 'modified_at': datetime.fromtimestamp(stat_info.st_mtime).isoformat(), + "basename": basename, + "mime_type": mime_type, + "extension": extension, + "num_subpaths": num_subpaths, + "num_bytes": num_bytes, + "hash_sha256": sha256_hash, + "created_at": datetime.fromtimestamp(stat_info.st_ctime).isoformat(), + "modified_at": datetime.fromtimestamp(stat_info.st_mtime).isoformat(), } if filter_func and not filter_func(details[filename]): @@ -207,12 +223,13 @@ def get_dir_info(dir_path: Path, pwd: Path | None = None, filter_func: Callable return details -if __name__ == '__main__': +if __name__ == "__main__": import json - dir_info = get_dir_info(Path('.'), max_depth=6) - with open('.hashes.json', 'w') as f: + + dir_info = get_dir_info(Path("."), max_depth=6) + with open(".hashes.json", "w") as f: json.dump(dir_info, f, indent=4) - print('Wrote .hashes.json') + print("Wrote .hashes.json") # Example output: # { diff --git a/archivebox/misc/jsonl.py b/archivebox/misc/jsonl.py index 07428002..62705281 100644 --- a/archivebox/misc/jsonl.py +++ b/archivebox/misc/jsonl.py @@ -20,72 +20,73 @@ Plain URLs (also supported): https://foo.com """ -__package__ = 'archivebox.misc' +__package__ = "archivebox.misc" import sys import json import select -from typing import Iterable, Iterator, Dict, Any, Optional, TextIO +from typing import Any, TextIO +from collections.abc import Iterable, Iterator from pathlib import Path # Type constants for JSONL records -TYPE_SNAPSHOT = 'Snapshot' -TYPE_ARCHIVERESULT = 'ArchiveResult' -TYPE_TAG = 'Tag' -TYPE_CRAWL = 'Crawl' -TYPE_BINARY = 'Binary' -TYPE_PROCESS = 'Process' -TYPE_MACHINE = 'Machine' +TYPE_SNAPSHOT = "Snapshot" +TYPE_ARCHIVERESULT = "ArchiveResult" +TYPE_TAG = "Tag" +TYPE_CRAWL = "Crawl" +TYPE_BINARY = "Binary" +TYPE_PROCESS = "Process" +TYPE_MACHINE = "Machine" VALID_TYPES = {TYPE_SNAPSHOT, TYPE_ARCHIVERESULT, TYPE_TAG, TYPE_CRAWL, TYPE_BINARY, TYPE_PROCESS, TYPE_MACHINE} -def parse_line(line: str) -> Optional[Dict[str, Any]]: +def parse_line(line: str) -> dict[str, Any] | None: """ Parse a single line of input as either JSONL or plain URL. Returns a dict with at minimum {'type': '...', 'url': '...'} or None if invalid. """ line = line.strip() - if not line or line.startswith('#'): + if not line or line.startswith("#"): return None # Try to parse as JSON first - if line.startswith('{'): + if line.startswith("{"): try: record = json.loads(line) # If it has a type, validate it - if 'type' in record and record['type'] not in VALID_TYPES: + if "type" in record and record["type"] not in VALID_TYPES: # Unknown type, treat as raw data pass # If it has url but no type, assume Snapshot - if 'url' in record and 'type' not in record: - record['type'] = TYPE_SNAPSHOT + if "url" in record and "type" not in record: + record["type"] = TYPE_SNAPSHOT return record except json.JSONDecodeError: pass # Treat as plain URL if it looks like one - if line.startswith('http://') or line.startswith('https://') or line.startswith('file://'): - return {'type': TYPE_SNAPSHOT, 'url': line} + if line.startswith("http://") or line.startswith("https://") or line.startswith("file://"): + return {"type": TYPE_SNAPSHOT, "url": line} # Could be a snapshot ID (UUID with dashes or compact 32-char hex) - if len(line) == 36 and line.count('-') == 4: - return {'type': TYPE_SNAPSHOT, 'id': line} + if len(line) == 36 and line.count("-") == 4: + return {"type": TYPE_SNAPSHOT, "id": line} if len(line) == 32: try: int(line, 16) except ValueError: pass else: - return {'type': TYPE_SNAPSHOT, 'id': line} + return {"type": TYPE_SNAPSHOT, "id": line} # Unknown format, skip return None -def read_stdin(stream: Optional[TextIO] = None) -> Iterator[Dict[str, Any]]: +def read_stdin(stream: TextIO | None = None) -> Iterator[dict[str, Any]]: """ Read JSONL or plain URLs from stdin. @@ -112,20 +113,20 @@ def read_stdin(stream: Optional[TextIO] = None) -> Iterator[Dict[str, Any]]: yield record -def read_file(path: Path) -> Iterator[Dict[str, Any]]: +def read_file(path: Path) -> Iterator[dict[str, Any]]: """ Read JSONL or plain URLs from a file. Yields parsed records as dicts. """ - with open(path, 'r') as f: + with open(path) as f: for line in f: record = parse_line(line) if record: yield record -def read_args_or_stdin(args: Iterable[str], stream: Optional[TextIO] = None) -> Iterator[Dict[str, Any]]: +def read_args_or_stdin(args: Iterable[str], stream: TextIO | None = None) -> Iterator[dict[str, Any]]: """ Read from CLI arguments if provided, otherwise from stdin. @@ -145,16 +146,16 @@ def read_args_or_stdin(args: Iterable[str], stream: Optional[TextIO] = None) -> yield from read_stdin(stream) -def write_record(record: Dict[str, Any], stream: Optional[TextIO] = None) -> None: +def write_record(record: dict[str, Any], stream: TextIO | None = None) -> None: """ Write a single JSONL record to stdout (or provided stream). """ active_stream: TextIO = sys.stdout if stream is None else stream - active_stream.write(json.dumps(record) + '\n') + active_stream.write(json.dumps(record) + "\n") active_stream.flush() -def write_records(records: Iterator[Dict[str, Any]], stream: Optional[TextIO] = None) -> int: +def write_records(records: Iterator[dict[str, Any]], stream: TextIO | None = None) -> int: """ Write multiple JSONL records to stdout (or provided stream). diff --git a/archivebox/misc/legacy.py b/archivebox/misc/legacy.py index e936151d..d4a62b05 100644 --- a/archivebox/misc/legacy.py +++ b/archivebox/misc/legacy.py @@ -8,24 +8,26 @@ This is separate from the hooks-based parser system which handles importing new URLs from bookmark files, RSS feeds, etc. """ -__package__ = 'archivebox.misc' +__package__ = "archivebox.misc" import os import json from pathlib import Path from datetime import datetime, timezone -from typing import Iterator, TypedDict, List +from typing import TypedDict +from collections.abc import Iterator class SnapshotDict(TypedDict, total=False): """ Dictionary type representing a snapshot/link, compatible with Snapshot model fields. """ - url: str # Required: the URL to archive - timestamp: str # Optional: unix timestamp string - title: str # Optional: page title - tags: str # Optional: comma-separated tags string - sources: List[str] # Optional: list of source file paths + + url: str # Required: the URL to archive + timestamp: str # Optional: unix timestamp string + title: str # Optional: page title + tags: str # Optional: comma-separated tags string + sources: list[str] # Optional: list of source file paths def parse_json_main_index(out_dir: Path) -> Iterator[SnapshotDict]: @@ -41,16 +43,16 @@ def parse_json_main_index(out_dir: Path) -> Iterator[SnapshotDict]: return try: - with open(index_path, 'r', encoding='utf-8') as f: + with open(index_path, encoding="utf-8") as f: data = json.load(f) - links = data.get('links', []) + links = data.get("links", []) for link in links: yield { - 'url': link.get('url', ''), - 'timestamp': link.get('timestamp', str(datetime.now(timezone.utc).timestamp())), - 'title': link.get('title'), - 'tags': link.get('tags', ''), + "url": link.get("url", ""), + "timestamp": link.get("timestamp", str(datetime.now(timezone.utc).timestamp())), + "title": link.get("title"), + "tags": link.get("tags", ""), } except (json.JSONDecodeError, KeyError, TypeError): return @@ -81,12 +83,12 @@ def parse_json_links_details(out_dir: Path) -> Iterator[SnapshotDict]: if jsonl_file.exists(): try: - with open(jsonl_file, 'r', encoding='utf-8') as f: + with open(jsonl_file, encoding="utf-8") as f: for line in f: line = line.strip() - if line.startswith('{'): + if line.startswith("{"): record = json.loads(line) - if record.get('type') == 'Snapshot': + if record.get("type") == "Snapshot": link = record break except (json.JSONDecodeError, KeyError, TypeError): @@ -94,15 +96,15 @@ def parse_json_links_details(out_dir: Path) -> Iterator[SnapshotDict]: if link is None and json_file.exists(): try: - with open(json_file, 'r', encoding='utf-8') as f: + with open(json_file, encoding="utf-8") as f: link = json.load(f) except (json.JSONDecodeError, KeyError, TypeError): pass if link: yield { - 'url': link.get('url', ''), - 'timestamp': link.get('timestamp', entry.name), - 'title': link.get('title'), - 'tags': link.get('tags', ''), + "url": link.get("url", ""), + "timestamp": link.get("timestamp", entry.name), + "title": link.get("title"), + "tags": link.get("tags", ""), } diff --git a/archivebox/misc/logging.py b/archivebox/misc/logging.py index a113143b..61affd0e 100644 --- a/archivebox/misc/logging.py +++ b/archivebox/misc/logging.py @@ -1,10 +1,9 @@ -__package__ = 'archivebox.misc' +__package__ = "archivebox.misc" # Low-level logging primitives (Rich console, ANSI colors, stdout/stderr helpers) # Higher-level logging functions are in logging_util.py import sys -from typing import Optional, Union, Tuple, List from collections import defaultdict from random import randint @@ -19,11 +18,13 @@ CONSOLE = Console(width=32768, soft_wrap=True, force_terminal=True) STDERR = Console(stderr=True, width=32768, soft_wrap=True, force_terminal=True) IS_TTY = sys.stdout.isatty() + class RainbowHighlighter(Highlighter): def highlight(self, text): for index in range(len(text)): text.stylize(f"color({randint(90, 98)})", index, index + 1) + rainbow = RainbowHighlighter() @@ -38,49 +39,55 @@ DEFAULT_CLI_COLORS = benedict( "blue": "\033[01;34m", "white": "\033[01;37m", "black": "\033[01;30m", - } + }, +) +ANSI = benedict({k: "" for k in DEFAULT_CLI_COLORS.keys()}) + +COLOR_DICT = defaultdict( + lambda: [(0, 0, 0), (0, 0, 0)], + { + "00": [(0, 0, 0), (0, 0, 0)], + "30": [(0, 0, 0), (0, 0, 0)], + "31": [(255, 0, 0), (128, 0, 0)], + "32": [(0, 200, 0), (0, 128, 0)], + "33": [(255, 255, 0), (128, 128, 0)], + "34": [(0, 0, 255), (0, 0, 128)], + "35": [(255, 0, 255), (128, 0, 128)], + "36": [(0, 255, 255), (0, 128, 128)], + "37": [(255, 255, 255), (255, 255, 255)], + }, ) -ANSI = benedict({k: '' for k in DEFAULT_CLI_COLORS.keys()}) -COLOR_DICT = defaultdict(lambda: [(0, 0, 0), (0, 0, 0)], { - '00': [(0, 0, 0), (0, 0, 0)], - '30': [(0, 0, 0), (0, 0, 0)], - '31': [(255, 0, 0), (128, 0, 0)], - '32': [(0, 200, 0), (0, 128, 0)], - '33': [(255, 255, 0), (128, 128, 0)], - '34': [(0, 0, 255), (0, 0, 128)], - '35': [(255, 0, 255), (128, 0, 128)], - '36': [(0, 255, 255), (0, 128, 128)], - '37': [(255, 255, 255), (255, 255, 255)], -}) # Logging Helpers (DEPRECATED, use rich.print instead going forward) -def stdout(*args, color: Optional[str]=None, prefix: str='', config: Optional[benedict]=None) -> None: - ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI +def stdout(*args, color: str | None = None, prefix: str = "", config: benedict | None = None) -> None: + ansi = DEFAULT_CLI_COLORS if (config or {}).get("USE_COLOR") else ANSI if color: - strs = [ansi[color], ' '.join(str(a) for a in args), ansi['reset'], '\n'] + strs = [ansi[color], " ".join(str(a) for a in args), ansi["reset"], "\n"] else: - strs = [' '.join(str(a) for a in args), '\n'] + strs = [" ".join(str(a) for a in args), "\n"] - sys.stdout.write(prefix + ''.join(strs)) + sys.stdout.write(prefix + "".join(strs)) -def stderr(*args, color: Optional[str]=None, prefix: str='', config: Optional[benedict]=None) -> None: - ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI + +def stderr(*args, color: str | None = None, prefix: str = "", config: benedict | None = None) -> None: + ansi = DEFAULT_CLI_COLORS if (config or {}).get("USE_COLOR") else ANSI if color: - strs = [ansi[color], ' '.join(str(a) for a in args), ansi['reset'], '\n'] + strs = [ansi[color], " ".join(str(a) for a in args), ansi["reset"], "\n"] else: - strs = [' '.join(str(a) for a in args), '\n'] + strs = [" ".join(str(a) for a in args), "\n"] - sys.stderr.write(prefix + ''.join(strs)) + sys.stderr.write(prefix + "".join(strs)) -def hint(text: Union[Tuple[str, ...], List[str], str], prefix=' ', config: Optional[benedict]=None) -> None: - ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI + +def hint(text: tuple[str, ...] | list[str] | str, prefix=" ", config: benedict | None = None) -> None: + ansi = DEFAULT_CLI_COLORS if (config or {}).get("USE_COLOR") else ANSI if isinstance(text, str): stderr(f"{prefix}{ansi['lightred']}Hint:{ansi['reset']} {text}") else: stderr(f"{prefix}{ansi['lightred']}Hint:{ansi['reset']} {text[0]}") for line in text[1:]: - stderr(f'{prefix} {line}') + stderr(f"{prefix} {line}") diff --git a/archivebox/misc/logging_util.py b/archivebox/misc/logging_util.py index de1f3566..c5458eeb 100644 --- a/archivebox/misc/logging_util.py +++ b/archivebox/misc/logging_util.py @@ -1,4 +1,4 @@ -__package__ = 'archivebox' +__package__ = "archivebox" # High-level logging functions for CLI output and progress tracking # Low-level primitives (Rich console, ANSI colors) are in logging.py @@ -14,7 +14,8 @@ from pathlib import Path from datetime import datetime, timezone from dataclasses import dataclass -from typing import Any, Optional, List, Dict, Union, Iterable, IO, TYPE_CHECKING, cast +from typing import Any, Optional, IO, TYPE_CHECKING, cast +from collections.abc import Iterable if TYPE_CHECKING: from archivebox.core.models import Snapshot @@ -28,6 +29,7 @@ from archivebox.misc.system import get_dir_size from archivebox.misc.util import enforce_types from archivebox.misc.logging import ANSI + @dataclass class RuntimeStats: """mutable stats counter for logging archiving timing info to CLI output""" @@ -36,14 +38,15 @@ class RuntimeStats: succeeded: int = 0 failed: int = 0 - parse_start_ts: Optional[datetime] = None - parse_end_ts: Optional[datetime] = None + parse_start_ts: datetime | None = None + parse_end_ts: datetime | None = None - index_start_ts: Optional[datetime] = None - index_end_ts: Optional[datetime] = None + index_start_ts: datetime | None = None + index_end_ts: datetime | None = None + + archiving_start_ts: datetime | None = None + archiving_end_ts: datetime | None = None - archiving_start_ts: Optional[datetime] = None - archiving_end_ts: Optional[datetime] = None # globals are bad, mmkay _LAST_RUN_STATS = RuntimeStats() @@ -52,49 +55,47 @@ _LAST_RUN_STATS = RuntimeStats() class TimedProgress: """Show a progress bar and measure elapsed time until .end() is called""" - def __init__(self, seconds, prefix=''): + def __init__(self, seconds, prefix=""): self.SHOW_PROGRESS = SHELL_CONFIG.SHOW_PROGRESS self.ANSI = SHELL_CONFIG.ANSI - + if self.SHOW_PROGRESS: self.p = Process(target=progress_bar, args=(seconds, prefix, self.ANSI)) self.p.start() - self.stats = {'start_ts': datetime.now(timezone.utc), 'end_ts': None} + self.stats = {"start_ts": datetime.now(timezone.utc), "end_ts": None} def end(self): """immediately end progress, clear the progressbar line, and save end_ts""" - end_ts = datetime.now(timezone.utc) - self.stats['end_ts'] = end_ts - + self.stats["end_ts"] = end_ts + if self.SHOW_PROGRESS: # terminate if we havent already terminated try: # kill the progress bar subprocess try: - self.p.close() # must be closed *before* its terminnated + self.p.close() # must be closed *before* its terminnated except (KeyboardInterrupt, SystemExit): print() raise - except BaseException: # lgtm [py/catch-base-exception] + except BaseException: # lgtm [py/catch-base-exception] pass self.p.terminate() time.sleep(0.1) # sometimes the timer doesn't terminate properly, then blocks at the join until # the full time has elapsed. sending a kill tries to avoid that. try: - self.p.kill() + self.p.kill() except Exception: pass - # clear whole terminal line try: - sys.stdout.write('\r{}{}\r'.format((' ' * SHELL_CONFIG.TERM_WIDTH), self.ANSI['reset'])) - except (IOError, BrokenPipeError): + sys.stdout.write("\r{}{}\r".format((" " * SHELL_CONFIG.TERM_WIDTH), self.ANSI["reset"])) + except (OSError, BrokenPipeError): # ignore when the parent proc has stopped listening to our stdout pass except ValueError: @@ -102,10 +103,10 @@ class TimedProgress: @enforce_types -def progress_bar(seconds: int, prefix: str='', ANSI: Dict[str, str]=ANSI) -> None: +def progress_bar(seconds: int, prefix: str = "", ANSI: dict[str, str] = ANSI) -> None: """show timer in the form of progress bar, with percentage and seconds remaining""" - output_buf = (sys.stdout or sys.__stdout__ or sys.stderr or sys.__stderr__) - chunk = '█' if output_buf and output_buf.encoding.upper() == 'UTF-8' else '#' + output_buf = sys.stdout or sys.__stdout__ or sys.stderr or sys.__stderr__ + chunk = "█" if output_buf and output_buf.encoding.upper() == "UTF-8" else "#" last_width = SHELL_CONFIG.TERM_WIDTH chunks = last_width - len(prefix) - 20 # number of progress chunks to show (aka max bar width) try: @@ -114,37 +115,41 @@ def progress_bar(seconds: int, prefix: str='', ANSI: Dict[str, str]=ANSI) -> Non if max_width < last_width: # when the terminal size is shrunk, we have to write a newline # otherwise the progress bar will keep wrapping incorrectly - sys.stdout.write('\r\n') + sys.stdout.write("\r\n") sys.stdout.flush() chunks = max_width - len(prefix) - 20 pct_complete = s / chunks / seconds * 100 log_pct = (log(pct_complete or 1, 10) / 2) * 100 # everyone likes faster progress bars ;) - bar_width = round(log_pct/(100/chunks)) + bar_width = round(log_pct / (100 / chunks)) last_width = max_width # ████████████████████ 0.9% (1/60sec) - sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)'.format( - prefix, - ANSI['green' if pct_complete < 80 else 'lightyellow'], - (chunk * bar_width).ljust(chunks), - ANSI['reset'], - round(pct_complete, 1), - round(s/chunks), - seconds, - )) + sys.stdout.write( + "\r{}{}{}{} {}% ({}/{}sec)".format( + prefix, + ANSI["green" if pct_complete < 80 else "lightyellow"], + (chunk * bar_width).ljust(chunks), + ANSI["reset"], + round(pct_complete, 1), + round(s / chunks), + seconds, + ), + ) sys.stdout.flush() time.sleep(1 / chunks) # ██████████████████████████████████ 100.0% (60/60sec) - sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)'.format( - prefix, - ANSI['red'], - chunk * chunks, - ANSI['reset'], - 100.0, - seconds, - seconds, - )) + sys.stdout.write( + "\r{}{}{}{} {}% ({}/{}sec)".format( + prefix, + ANSI["red"], + chunk * chunks, + ANSI["reset"], + 100.0, + seconds, + seconds, + ), + ) sys.stdout.flush() # uncomment to have it disappear when it hits 100% instead of staying full red: # time.sleep(0.5) @@ -154,10 +159,10 @@ def progress_bar(seconds: int, prefix: str='', ANSI: Dict[str, str]=ANSI) -> Non print() -def log_cli_command(subcommand: str, subcommand_args: Iterable[str]=(), stdin: str | IO | None=None, pwd: str='.'): - args = ' '.join(subcommand_args) - version_msg = '[dark_magenta]\\[{now}][/dark_magenta] [dark_red]ArchiveBox[/dark_red] [dark_goldenrod]v{VERSION}[/dark_goldenrod]: [green4]archivebox [green3]{subcommand}[green2] {args}[/green2]'.format( - now=datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S'), +def log_cli_command(subcommand: str, subcommand_args: Iterable[str] = (), stdin: str | IO | None = None, pwd: str = "."): + args = " ".join(subcommand_args) + version_msg = "[dark_magenta]\\[{now}][/dark_magenta] [dark_red]ArchiveBox[/dark_red] [dark_goldenrod]v{VERSION}[/dark_goldenrod]: [green4]archivebox [green3]{subcommand}[green2] {args}[/green2]".format( + now=datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S"), VERSION=VERSION, subcommand=subcommand, args=args, @@ -166,44 +171,54 @@ def log_cli_command(subcommand: str, subcommand_args: Iterable[str]=(), stdin: s # stderr('[bright_black] > {pwd}[/]'.format(pwd=pwd, **ANSI)) # stderr() print(Panel(version_msg), file=sys.stderr) - + + ### Parsing Stage -def log_importing_started(urls: Union[str, List[str]], depth: int, index_only: bool): +def log_importing_started(urls: str | list[str], depth: int, index_only: bool): _LAST_RUN_STATS.parse_start_ts = datetime.now(timezone.utc) - print('[green][+] [{}] Adding {} links to index (crawl depth={}){}...[/]'.format( - _LAST_RUN_STATS.parse_start_ts.strftime('%Y-%m-%d %H:%M:%S'), - len(urls) if isinstance(urls, list) else len(urls.split('\n')), - depth, - ' (index only)' if index_only else '', - )) + print( + "[green][+] [{}] Adding {} links to index (crawl depth={}){}...[/]".format( + _LAST_RUN_STATS.parse_start_ts.strftime("%Y-%m-%d %H:%M:%S"), + len(urls) if isinstance(urls, list) else len(urls.split("\n")), + depth, + " (index only)" if index_only else "", + ), + ) + def log_source_saved(source_file: str): - print(' > Saved verbatim input to {}/{}'.format(CONSTANTS.SOURCES_DIR_NAME, source_file.rsplit('/', 1)[-1])) + print(" > Saved verbatim input to {}/{}".format(CONSTANTS.SOURCES_DIR_NAME, source_file.rsplit("/", 1)[-1])) + def log_parsing_finished(num_parsed: int, parser_name: str): _LAST_RUN_STATS.parse_end_ts = datetime.now(timezone.utc) - print(' > Parsed {} URLs from input ({})'.format(num_parsed, parser_name)) + print(f" > Parsed {num_parsed} URLs from input ({parser_name})") + def log_deduping_finished(num_new_links: int): - print(' > Found {} new URLs not already in index'.format(num_new_links)) + print(f" > Found {num_new_links} new URLs not already in index") def log_crawl_started(new_links): print() - print(f'[green][*] Starting crawl of {len(new_links)} sites 1 hop out from starting point[/]') + print(f"[green][*] Starting crawl of {len(new_links)} sites 1 hop out from starting point[/]") + ### Indexing Stage + def log_indexing_process_started(num_links: int): start_ts = datetime.now(timezone.utc) _LAST_RUN_STATS.index_start_ts = start_ts print() - print('[bright_black][*] [{}] Writing {} links to main index...[/]'.format( - start_ts.strftime('%Y-%m-%d %H:%M:%S'), - num_links, - )) + print( + "[bright_black][*] [{}] Writing {} links to main index...[/]".format( + start_ts.strftime("%Y-%m-%d %H:%M:%S"), + num_links, + ), + ) def log_indexing_process_finished(): @@ -213,46 +228,55 @@ def log_indexing_process_finished(): def log_indexing_started(out_path: str): if SHELL_CONFIG.IS_TTY: - sys.stdout.write(f' > ./{Path(out_path).relative_to(DATA_DIR)}') + sys.stdout.write(f" > ./{Path(out_path).relative_to(DATA_DIR)}") def log_indexing_finished(out_path: str): - print(f'\r √ ./{Path(out_path).relative_to(DATA_DIR)}') + print(f"\r √ ./{Path(out_path).relative_to(DATA_DIR)}") ### Archiving Stage -def log_archiving_started(num_links: int, resume: Optional[float]=None): + +def log_archiving_started(num_links: int, resume: float | None = None): start_ts = datetime.now(timezone.utc) _LAST_RUN_STATS.archiving_start_ts = start_ts print() if resume: - print('[green][▶] [{}] Resuming archive updating for {} pages starting from {}...[/]'.format( - start_ts.strftime('%Y-%m-%d %H:%M:%S'), - num_links, - resume, - )) + print( + "[green][▶] [{}] Resuming archive updating for {} pages starting from {}...[/]".format( + start_ts.strftime("%Y-%m-%d %H:%M:%S"), + num_links, + resume, + ), + ) else: - print('[green][▶] [{}] Starting archiving of {} snapshots in index...[/]'.format( - start_ts.strftime('%Y-%m-%d %H:%M:%S'), - num_links, - )) + print( + "[green][▶] [{}] Starting archiving of {} snapshots in index...[/]".format( + start_ts.strftime("%Y-%m-%d %H:%M:%S"), + num_links, + ), + ) + def log_archiving_paused(num_links: int, idx: int, timestamp: str): end_ts = datetime.now(timezone.utc) _LAST_RUN_STATS.archiving_end_ts = end_ts print() - print('\n[yellow3][X] [{now}] Downloading paused on link {timestamp} ({idx}/{total})[/]'.format( - now=end_ts.strftime('%Y-%m-%d %H:%M:%S'), - idx=idx+1, - timestamp=timestamp, - total=num_links, - )) + print( + "\n[yellow3][X] [{now}] Downloading paused on link {timestamp} ({idx}/{total})[/]".format( + now=end_ts.strftime("%Y-%m-%d %H:%M:%S"), + idx=idx + 1, + timestamp=timestamp, + total=num_links, + ), + ) print() - print(' Continue archiving where you left off by running:') - print(' archivebox update --resume={}'.format(timestamp)) + print(" Continue archiving where you left off by running:") + print(f" archivebox update --resume={timestamp}") + def log_archiving_finished(num_links: int): @@ -263,24 +287,26 @@ def log_archiving_finished(num_links: int): assert _LAST_RUN_STATS.archiving_start_ts is not None seconds = end_ts.timestamp() - _LAST_RUN_STATS.archiving_start_ts.timestamp() if seconds > 60: - duration = '{0:.2f} min'.format(seconds / 60) + duration = f"{seconds / 60:.2f} min" else: - duration = '{0:.2f} sec'.format(seconds) + duration = f"{seconds:.2f} sec" print() - print('[green][√] [{}] Update of {} pages complete ({})[/]'.format( - end_ts.strftime('%Y-%m-%d %H:%M:%S'), - num_links, - duration, - )) - print(' - {} links skipped'.format(_LAST_RUN_STATS.skipped)) - print(' - {} links updated'.format(_LAST_RUN_STATS.succeeded + _LAST_RUN_STATS.failed)) - print(' - {} links had errors'.format(_LAST_RUN_STATS.failed)) - + print( + "[green][√] [{}] Update of {} pages complete ({})[/]".format( + end_ts.strftime("%Y-%m-%d %H:%M:%S"), + num_links, + duration, + ), + ) + print(f" - {_LAST_RUN_STATS.skipped} links skipped") + print(f" - {_LAST_RUN_STATS.succeeded + _LAST_RUN_STATS.failed} links updated") + print(f" - {_LAST_RUN_STATS.failed} links had errors") + if Snapshot.objects.count() < 50: print() - print(' [violet]Hint:[/] To manage your archive in a Web UI, run:') - print(' archivebox server 0.0.0.0:8000') + print(" [violet]Hint:[/] To manage your archive in a Web UI, run:") + print(" archivebox server 0.0.0.0:8000") def log_snapshot_archiving_started(snapshot: "Snapshot", out_dir: str, is_new: bool): @@ -289,41 +315,51 @@ def log_snapshot_archiving_started(snapshot: "Snapshot", out_dir: str, is_new: b # http://www.benstopford.com/2015/02/14/log-structured-merge-trees/ # > output/archive/1478739709 - print('\n[[{symbol_color}]{symbol}[/]] [[{symbol_color}]{now}[/]] "{title}"'.format( - symbol_color='green' if is_new else 'bright_black', - symbol='+' if is_new else '√', - now=datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S'), - title=snapshot.title or snapshot.base_url, - )) - print(f' [sky_blue1]{snapshot.url}[/]') - print(' {} {}'.format( - '>' if is_new else '√', - pretty_path(out_dir), - )) + print( + '\n[[{symbol_color}]{symbol}[/]] [[{symbol_color}]{now}[/]] "{title}"'.format( + symbol_color="green" if is_new else "bright_black", + symbol="+" if is_new else "√", + now=datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S"), + title=snapshot.title or snapshot.base_url, + ), + ) + print(f" [sky_blue1]{snapshot.url}[/]") + print( + " {} {}".format( + ">" if is_new else "√", + pretty_path(out_dir), + ), + ) + def log_snapshot_archiving_finished(snapshot: "Snapshot", out_dir: str, is_new: bool, stats: dict, start_ts: datetime): total = sum(stats.values()) - if stats['failed'] > 0 : + if stats["failed"] > 0: _LAST_RUN_STATS.failed += 1 - elif stats['skipped'] == total: + elif stats["skipped"] == total: _LAST_RUN_STATS.skipped += 1 else: _LAST_RUN_STATS.succeeded += 1 try: - size = get_dir_size(out_dir) - except FileNotFoundError: - size = (0, None, '0') + results = snapshot.archiveresult_set.only("output_files", "output_size") + total_bytes = sum(result.output_size or result.output_size_from_files() for result in results) + total_files = sum(result.output_file_count() for result in results) + size = (total_bytes, 0, total_files) + except Exception: + try: + size = get_dir_size(out_dir) + except FileNotFoundError: + size = (0, None, "0") end_ts = datetime.now(timezone.utc) - duration = str(end_ts - start_ts).split('.')[0] - print(' [bright_black]{} files ({}) in {}s [/]'.format(size[2], printable_filesize(size[0]), duration)) - + duration = str(end_ts - start_ts).split(".")[0] + print(f" [bright_black]{size[2]} files ({printable_filesize(size[0])}) in {duration}s [/]") def log_archive_method_started(method: str): - print(' > {}'.format(method)) + print(f" > {method}") def log_archive_method_finished(result: dict): @@ -332,122 +368,117 @@ def log_archive_method_finished(result: dict): copy-paste the outputted string directly to run the cmd """ # Prettify CMD string and make it safe to copy-paste by quoting arguments - quoted_cmd = ' '.join( - '"{}"'.format(arg) if (' ' in arg) or (':' in arg) else arg - for arg in result['cmd'] - ) + quoted_cmd = " ".join(f'"{arg}"' if (" " in arg) or (":" in arg) else arg for arg in result["cmd"]) - if result['status'] == 'failed': - output = result.get('output') - if output and output.__class__.__name__ == 'TimeoutExpired': - duration = (result['end_ts'] - result['start_ts']).seconds + if result["status"] == "failed": + output = result.get("output") + if output and output.__class__.__name__ == "TimeoutExpired": + duration = (result["end_ts"] - result["start_ts"]).seconds hint_header = [ - f'[yellow3]Extractor timed out after {duration}s.[/]', + f"[yellow3]Extractor timed out after {duration}s.[/]", ] else: - error_name = output.__class__.__name__.replace('ArchiveError', '') if output else 'Error' + error_name = output.__class__.__name__.replace("ArchiveError", "") if output else "Error" hint_header = [ - '[yellow3]Extractor failed:[/]', - f' {error_name} [red1]{output}[/]', + "[yellow3]Extractor failed:[/]", + f" {error_name} [red1]{output}[/]", ] # Prettify error output hints string and limit to five lines - hints = getattr(output, 'hints', None) or () if output else () + hints = getattr(output, "hints", None) or () if output else () if hints: if isinstance(hints, (list, tuple, type(_ for _ in ()))): hints = [hint.decode() if isinstance(hint, bytes) else str(hint) for hint in hints] else: if isinstance(hints, bytes): hints = hints.decode() - hints = hints.split('\n') + hints = hints.split("\n") - hints = ( - f' [yellow1]{line.strip()}[/]' - for line in list(hints)[:5] if line.strip() - ) + hints = (f" [yellow1]{line.strip()}[/]" for line in list(hints)[:5] if line.strip()) docker_hints = () - if os.environ.get('IN_DOCKER') in ('1', 'true', 'True', 'TRUE', 'yes'): - docker_hints = ( - ' docker run -it -v $PWD/data:/data archivebox/archivebox /bin/bash', - ) + if os.environ.get("IN_DOCKER") in ("1", "true", "True", "TRUE", "yes"): + docker_hints = (" docker run -it -v $PWD/data:/data archivebox/archivebox /bin/bash",) # Collect and prefix output lines with indentation output_lines = [ *hint_header, *hints, - '[violet]Run to see full output:[/]', + "[violet]Run to see full output:[/]", *docker_hints, - *([' cd {};'.format(result.get('pwd'))] if result.get('pwd') else []), - ' {}'.format(quoted_cmd), + *([" cd {};".format(result.get("pwd"))] if result.get("pwd") else []), + f" {quoted_cmd}", ] - print('\n'.join( - ' {}'.format(line) - for line in output_lines - if line - )) + print( + "\n".join(f" {line}" for line in output_lines if line), + ) print() -def log_list_started(filter_patterns: Optional[List[str]], filter_type: str): - print(f'[green][*] Finding links in the archive index matching these {filter_type} patterns:[/]') - print(' {}'.format(' '.join(filter_patterns or ()))) +def log_list_started(filter_patterns: list[str] | None, filter_type: str): + print(f"[green][*] Finding links in the archive index matching these {filter_type} patterns:[/]") + print(" {}".format(" ".join(filter_patterns or ()))) + def log_list_finished(snapshots): from archivebox.core.models import Snapshot + print() - print('---------------------------------------------------------------------------------------------------') + print("---------------------------------------------------------------------------------------------------") csv_queryset = cast(Any, Snapshot.objects.filter(pk__in=[s.pk for s in snapshots])) - print(csv_queryset.to_csv(cols=['timestamp', 'is_archived', 'num_outputs', 'url'], header=True, ljust=16, separator=' | ')) - print('---------------------------------------------------------------------------------------------------') + print(csv_queryset.to_csv(cols=["timestamp", "is_archived", "num_outputs", "url"], header=True, ljust=16, separator=" | ")) + print("---------------------------------------------------------------------------------------------------") print() def log_removal_started(snapshots, yes: bool, delete: bool): - count = snapshots.count() if hasattr(snapshots, 'count') else len(snapshots) - print(f'[yellow3][i] Found {count} matching URLs to remove.[/]') + count = snapshots.count() if hasattr(snapshots, "count") else len(snapshots) + print(f"[yellow3][i] Found {count} matching URLs to remove.[/]") if delete: file_counts = [s.num_outputs for s in snapshots if os.access(s.output_dir, os.R_OK)] print( - f' {count} Links will be de-listed from the main index, and their archived content folders will be deleted from disk.\n' - f' ({len(file_counts)} data folders with {sum(file_counts)} archived files will be deleted!)' + f" {count} Links will be de-listed from the main index, and their archived content folders will be deleted from disk.\n" + f" ({len(file_counts)} data folders with {sum(file_counts)} archived files will be deleted!)", ) else: print( - ' Matching links will be de-listed from the main index, but their archived content folders will remain in place on disk.\n' - ' (Pass --delete if you also want to permanently delete the data folders)' + " Matching links will be de-listed from the main index, but their archived content folders will remain in place on disk.\n" + " (Pass --delete if you also want to permanently delete the data folders)", ) if not yes: print() - print(f'[yellow3][?] Do you want to proceed with removing these {count} links?[/]') + print(f"[yellow3][?] Do you want to proceed with removing these {count} links?[/]") try: - assert input(' y/[n]: ').lower() == 'y' + assert input(" y/[n]: ").lower() == "y" except (KeyboardInterrupt, EOFError, AssertionError): raise SystemExit(0) + def log_removal_finished(remaining_links: int, removed_links: int): if remaining_links == 0 and removed_links == 0: print() - print('[red1][X] No matching links found.[/]') + print("[red1][X] No matching links found.[/]") else: total_before = remaining_links + removed_links print() - print(f'[red1][√] Removed {removed_links} out of {total_before} links from the archive index.[/]') - print(f' Index now contains {remaining_links} links.') + print(f"[red1][√] Removed {removed_links} out of {total_before} links from the archive index.[/]") + print(f" Index now contains {remaining_links} links.") ### Search Indexing Stage + def log_index_started(url: str): - print('[green][*] Indexing url: {} in the search index[/]'.format(url)) + print(f"[green][*] Indexing url: {url} in the search index[/]") print() ### Helpers + @enforce_types -def pretty_path(path: Union[Path, str], pwd: Union[Path, str]=DATA_DIR, color: bool=True) -> str: +def pretty_path(path: Path | str, pwd: Path | str = DATA_DIR, color: bool = True) -> str: """convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc""" pwd = str(Path(pwd)) # .resolve() path = str(path) @@ -456,46 +487,46 @@ def pretty_path(path: Union[Path, str], pwd: Union[Path, str]=DATA_DIR, color: b return path # replace long absolute paths with ./ relative ones to save on terminal output width - if path.startswith(pwd) and (pwd != '/') and path != pwd: + if path.startswith(pwd) and (pwd != "/") and path != pwd: if color: - path = path.replace(pwd, '[light_slate_blue].[/light_slate_blue]', 1) + path = path.replace(pwd, "[light_slate_blue].[/light_slate_blue]", 1) else: - path = path.replace(pwd, '.', 1) - + path = path.replace(pwd, ".", 1) + # quote paths containing spaces - if ' ' in path: + if " " in path: path = f'"{path}"' - + # replace home directory with ~ for shorter output - path = path.replace(str(Path('~').expanduser()), '~') + path = path.replace(str(Path("~").expanduser()), "~") return path @enforce_types -def printable_filesize(num_bytes: Union[int, float]) -> str: - for count in ['Bytes','KB','MB','GB']: +def printable_filesize(num_bytes: int | float) -> str: + for count in ["Bytes", "KB", "MB", "GB"]: if num_bytes > -1024.0 and num_bytes < 1024.0: - return '%3.1f %s' % (num_bytes, count) + return f"{num_bytes:3.1f} {count}" num_bytes /= 1024.0 - return '%3.1f %s' % (num_bytes, 'TB') + return "{:3.1f} {}".format(num_bytes, "TB") @enforce_types def format_duration(seconds: float) -> str: """Format duration in human-readable form.""" if seconds < 1: - return f'{seconds*1000:.0f}ms' + return f"{seconds * 1000:.0f}ms" elif seconds < 60: - return f'{seconds:.1f}s' + return f"{seconds:.1f}s" elif seconds < 3600: minutes = int(seconds // 60) secs = int(seconds % 60) - return f'{minutes}min {secs}s' if secs else f'{minutes}min' + return f"{minutes}min {secs}s" if secs else f"{minutes}min" else: hours = int(seconds // 3600) minutes = int((seconds % 3600) // 60) - return f'{hours}hr {minutes}min' if minutes else f'{hours}hr' + return f"{hours}hr {minutes}min" if minutes else f"{hours}hr" @enforce_types @@ -504,15 +535,15 @@ def truncate_url(url: str, max_length: int = 60) -> str: if len(url) <= max_length: return url # Try to keep the domain and beginning of path - if '://' in url: - protocol, rest = url.split('://', 1) - if '/' in rest: - domain, path = rest.split('/', 1) + if "://" in url: + protocol, rest = url.split("://", 1) + if "/" in rest: + domain, path = rest.split("/", 1) available = max_length - len(protocol) - len(domain) - 6 # for "://", "/", "..." if available > 10: - return f'{protocol}://{domain}/{path[:available]}...' + return f"{protocol}://{domain}/{path[:available]}..." # Fallback: just truncate - return url[:max_length-3] + '...' + return url[: max_length - 3] + "..." @enforce_types @@ -520,12 +551,12 @@ def log_worker_event( worker_type: str, event: str, indent_level: int = 0, - pid: Optional[int] = None, - worker_id: Optional[str] = None, - url: Optional[str] = None, - plugin: Optional[str] = None, - metadata: Optional[Dict[str, Any]] = None, - error: Optional[Exception] = None, + pid: int | None = None, + worker_id: str | None = None, + url: str | None = None, + plugin: str | None = None, + metadata: dict[str, Any] | None = None, + error: Exception | None = None, ) -> None: """ Log a worker event with structured metadata and indentation. @@ -541,17 +572,17 @@ def log_worker_event( metadata: Dict of metadata to show in curly braces error: Exception if event is an error """ - indent = ' ' * indent_level + indent = " " * indent_level from rich.markup import escape # Build worker identifier (without URL/plugin) worker_parts = [worker_type] # Don't add pid/worker_id for DB operations (they happen in whatever process is running) - if pid and worker_type != 'DB': - worker_parts.append(f'pid={pid}') - if worker_id and worker_type in ('CrawlWorker', 'Orchestrator') and worker_type != 'DB': - worker_parts.append(f'id={worker_id}') + if pid and worker_type != "DB": + worker_parts.append(f"pid={pid}") + if worker_id and worker_type in ("CrawlWorker", "Orchestrator") and worker_type != "DB": + worker_parts.append(f"id={worker_id}") # Build worker label parts for brackets (shown inside brackets) worker_label_base = worker_parts[0] @@ -560,53 +591,53 @@ def log_worker_event( # Build URL/plugin display (shown AFTER the label, outside brackets) url_extractor_parts = [] if url: - url_extractor_parts.append(f'url: {escape(url)}') + url_extractor_parts.append(f"url: {escape(url)}") if plugin: - url_extractor_parts.append(f'extractor: {escape(plugin)}') + url_extractor_parts.append(f"extractor: {escape(plugin)}") - url_extractor_str = ' | '.join(url_extractor_parts) if url_extractor_parts else '' + url_extractor_str = " | ".join(url_extractor_parts) if url_extractor_parts else "" # Build metadata string - metadata_str = '' + metadata_str = "" if metadata: # Format metadata nicely meta_parts = [] for k, v in metadata.items(): if isinstance(v, float): # Format floats nicely (durations, sizes) - if 'duration' in k.lower(): - meta_parts.append(f'{k}: {format_duration(v)}') - elif 'size' in k.lower(): - meta_parts.append(f'{k}: {printable_filesize(int(v))}') + if "duration" in k.lower(): + meta_parts.append(f"{k}: {format_duration(v)}") + elif "size" in k.lower(): + meta_parts.append(f"{k}: {printable_filesize(int(v))}") else: - meta_parts.append(f'{k}: {v:.2f}') + meta_parts.append(f"{k}: {v:.2f}") elif isinstance(v, int): # Format integers - check if it's a size - if 'size' in k.lower() or 'bytes' in k.lower(): - meta_parts.append(f'{k}: {printable_filesize(v)}') + if "size" in k.lower() or "bytes" in k.lower(): + meta_parts.append(f"{k}: {printable_filesize(v)}") else: - meta_parts.append(f'{k}: {v}') + meta_parts.append(f"{k}: {v}") elif isinstance(v, (list, tuple)): - meta_parts.append(f'{k}: {len(v)}') + meta_parts.append(f"{k}: {len(v)}") else: - meta_parts.append(f'{k}: {v}') - metadata_str = ' | '.join(meta_parts) + meta_parts.append(f"{k}: {v}") + metadata_str = " | ".join(meta_parts) # Determine color based on event - color = 'white' - if event in ('Starting...', 'Started', 'STARTED', 'Started in background'): - color = 'green' - elif event.startswith('Created'): - color = 'cyan' # DB creation events - elif event in ('Completed', 'COMPLETED', 'All work complete'): - color = 'blue' - elif event in ('Failed', 'ERROR', 'Failed to spawn worker'): - color = 'red' - elif event in ('Shutting down', 'SHUTDOWN'): - color = 'grey53' + color = "white" + if event in ("Starting...", "Started", "STARTED", "Started in background"): + color = "green" + elif event.startswith("Created"): + color = "cyan" # DB creation events + elif event in ("Completed", "COMPLETED", "All work complete"): + color = "blue" + elif event in ("Failed", "ERROR", "Failed to spawn worker"): + color = "red" + elif event in ("Shutting down", "SHUTDOWN"): + color = "grey53" # Build final message - error_str = f' {type(error).__name__}: {error}' if error else '' + error_str = f" {type(error).__name__}: {error}" if error else "" from archivebox.misc.logging import CONSOLE, STDERR from rich.text import Text @@ -618,19 +649,19 @@ def log_worker_event( # Add bracketed content if present (using Text.append to avoid markup issues) if worker_bracket_content: - text.append('[', style=color) + text.append("[", style=color) text.append(worker_bracket_content, style=color) - text.append(']', style=color) + text.append("]", style=color) - text.append(f' {event}{error_str}', style=color) + text.append(f" {event}{error_str}", style=color) # Add URL/plugin info first (more important) if url_extractor_str: - text.append(f' | {url_extractor_str}') + text.append(f" | {url_extractor_str}") # Then add other metadata if metadata_str: - text.append(f' | {metadata_str}') + text.append(f" | {metadata_str}") # Stdout is reserved for JSONL records whenever commands are piped together. # Route worker/DB progress to stderr in non-TTY contexts so pipelines like @@ -640,90 +671,85 @@ def log_worker_event( @enforce_types -def printable_folders(folders: Dict[str, Optional["Snapshot"]], with_headers: bool=False) -> str: - return '\n'.join( - f'{folder} {snapshot and snapshot.url} "{snapshot and snapshot.title}"' - for folder, snapshot in folders.items() - ) - - - -@enforce_types -def printable_config(config: dict, prefix: str='') -> str: - return f'\n{prefix}'.join( - f'{key}={val}' - for key, val in config.items() - if not (isinstance(val, dict) or callable(val)) - ) +def printable_folders(folders: dict[str, Optional["Snapshot"]], with_headers: bool = False) -> str: + return "\n".join(f'{folder} {snapshot and snapshot.url} "{snapshot and snapshot.title}"' for folder, snapshot in folders.items()) @enforce_types -def printable_folder_status(name: str, folder: Dict) -> str: - if folder['enabled']: - if folder['is_valid']: - color, symbol, note, num_files = 'green', '√', 'valid', '' +def printable_config(config: dict, prefix: str = "") -> str: + return f"\n{prefix}".join(f"{key}={val}" for key, val in config.items() if not (isinstance(val, dict) or callable(val))) + + +@enforce_types +def printable_folder_status(name: str, folder: dict) -> str: + if folder["enabled"]: + if folder["is_valid"]: + color, symbol, note, num_files = "green", "√", "valid", "" else: - color, symbol, note, num_files = 'red', 'X', 'invalid', '?' + color, symbol, note, num_files = "red", "X", "invalid", "?" else: - color, symbol, note, num_files = 'grey53', '-', 'unused', '-' + color, symbol, note, num_files = "grey53", "-", "unused", "-" - - if folder['path']: - if os.access(folder['path'], os.R_OK): + if folder["path"]: + if os.access(folder["path"], os.R_OK): try: num_files = ( - f'{len(os.listdir(folder["path"]))} files' - if os.path.isdir(folder['path']) else - printable_filesize(Path(folder['path']).stat().st_size) + f"{len(os.listdir(folder['path']))} files" + if os.path.isdir(folder["path"]) + else printable_filesize(Path(folder["path"]).stat().st_size) ) except PermissionError: - num_files = 'error' + num_files = "error" else: - num_files = 'missing' - - if folder.get('is_mount'): + num_files = "missing" + + if folder.get("is_mount"): # add symbol @ next to filecount if path is a remote filesystem mount - num_files = f'{num_files} @' if num_files else '@' + num_files = f"{num_files} @" if num_files else "@" - path = pretty_path(folder['path']) + path = pretty_path(folder["path"]) - return ' '.join(( - f'[{color}]', - symbol, - '[/]', - name.ljust(21).replace('DATA_DIR', '[light_slate_blue]DATA_DIR[/light_slate_blue]'), - num_files.ljust(14).replace('missing', '[grey53]missing[/grey53]'), - f'[{color}]', - note.ljust(8), - '[/]', - path.ljust(76), - )) + return " ".join( + ( + f"[{color}]", + symbol, + "[/]", + name.ljust(21).replace("DATA_DIR", "[light_slate_blue]DATA_DIR[/light_slate_blue]"), + num_files.ljust(14).replace("missing", "[grey53]missing[/grey53]"), + f"[{color}]", + note.ljust(8), + "[/]", + path.ljust(76), + ), + ) @enforce_types -def printable_dependency_version(name: str, dependency: Dict) -> str: - color, symbol, note, version = 'red', 'X', 'invalid', '?' +def printable_dependency_version(name: str, dependency: dict) -> str: + color, symbol, note, version = "red", "X", "invalid", "?" - if dependency['enabled']: - if dependency['is_valid']: - color, symbol, note = 'green', '√', 'valid' + if dependency["enabled"]: + if dependency["is_valid"]: + color, symbol, note = "green", "√", "valid" - parsed_version_num = re.search(r'[\d\.]+', dependency['version']) + parsed_version_num = re.search(r"[\d\.]+", dependency["version"]) if parsed_version_num: - version = f'v{parsed_version_num[0]}' + version = f"v{parsed_version_num[0]}" else: - color, symbol, note, version = 'lightyellow', '-', 'disabled', '-' + color, symbol, note, version = "lightyellow", "-", "disabled", "-" - path = pretty_path(dependency['path']) + path = pretty_path(dependency["path"]) - return ' '.join(( - ANSI[color], - symbol, - ANSI['reset'], - name.ljust(21), - version.ljust(14), - ANSI[color], - note.ljust(8), - ANSI['reset'], - path.ljust(76), - )) + return " ".join( + ( + ANSI[color], + symbol, + ANSI["reset"], + name.ljust(21), + version.ljust(14), + ANSI[color], + note.ljust(8), + ANSI["reset"], + path.ljust(76), + ), + ) diff --git a/archivebox/misc/monkey_patches.py b/archivebox/misc/monkey_patches.py index 2f4bb146..20430385 100644 --- a/archivebox/misc/monkey_patches.py +++ b/archivebox/misc/monkey_patches.py @@ -1,4 +1,4 @@ -__package__ = 'archivebox' +__package__ = "archivebox" import datetime @@ -13,7 +13,7 @@ django_stubs_ext.monkeypatch() # monkey patch django timezone to add back utc (it was removed in Django 5.0) -setattr(timezone, 'utc', datetime.timezone.utc) +setattr(timezone, "utc", datetime.UTC) # monkey patch django-signals-webhooks to change how it shows up in Admin UI # from signal_webhooks.apps import DjangoSignalWebhooksConfig @@ -28,28 +28,29 @@ setattr(timezone, 'utc', datetime.timezone.utc) # Hide site-packages/sonic/client.py:115: SyntaxWarning # https://github.com/xmonader/python-sonic-client/pull/18 -warnings.filterwarnings("ignore", category=SyntaxWarning, module='sonic') +warnings.filterwarnings("ignore", category=SyntaxWarning, module="sonic") -# Make daphne log requests quieter and esier to read + +# Make daphne log requests quieter and easier to read class ModifiedAccessLogGenerator(access.AccessLogGenerator): """Clutge workaround until daphne uses the Python logging framework. https://github.com/django/daphne/pull/473/files""" - + def write_entry(self, host, date, request, status=None, length=None, ident=None, user=None): - + # Ignore noisy requests to staticfiles / favicons / etc. - if 'GET /static/' in request: + if "GET /static/" in request: return if "GET /health/" in request: return - if 'GET /admin/jsi18n/' in request: + if "GET /admin/jsi18n/" in request: return if request.endswith("/favicon.ico") or request.endswith("/robots.txt") or request.endswith("/screenshot.png"): return - if request.endswith('.css') or request.endswith('.js') or request.endswith('.woff') or request.endswith('.ttf'): + if request.endswith(".css") or request.endswith(".js") or request.endswith(".woff") or request.endswith(".ttf"): return - if str(status) in ('404', '304'): + if str(status) in ("404", "304"): return - + # clean up the log format to mostly match the same format as django.conf.settings.LOGGING rich formats self.stream.write( "%s HTTP %s %s %s\n" @@ -58,13 +59,14 @@ class ModifiedAccessLogGenerator(access.AccessLogGenerator): request, status or "-", "localhost" if host.startswith("127.") else host.split(":")[0], - ) + ), ) - -access.AccessLogGenerator.write_entry = ModifiedAccessLogGenerator.write_entry # type: ignore + + +access.AccessLogGenerator.write_entry = ModifiedAccessLogGenerator.write_entry # type: ignore # fix benedict objects to pretty-print/repr more nicely with rich # https://stackoverflow.com/a/79048811/2156113 # https://rich.readthedocs.io/en/stable/pretty.html#rich-repr-protocol -benedict.benedict.__rich_repr__ = lambda self: (dict(self),) # type: ignore +benedict.benedict.__rich_repr__ = lambda self: (dict(self),) # type: ignore diff --git a/archivebox/misc/paginators.py b/archivebox/misc/paginators.py index fa8c6cdb..86ca540b 100644 --- a/archivebox/misc/paginators.py +++ b/archivebox/misc/paginators.py @@ -1,30 +1,30 @@ -__package__ = 'archivebox.misc' +__package__ = "archivebox.misc" from django.core.paginator import Paginator from django.utils.functional import cached_property -class AccelleratedPaginator(Paginator): +class AcceleratedPaginator(Paginator): """ - Accellerated Pagniator ignores DISTINCT when counting total number of rows. + Accelerated paginator ignores DISTINCT when counting total number of rows. Speeds up SELECT Count(*) on Admin views by >20x. https://hakibenita.com/optimizing-the-django-admin-paginator """ @cached_property def count(self): - has_filters = getattr(self.object_list, '_has_filters', None) + has_filters = getattr(self.object_list, "_has_filters", None) if callable(has_filters) and has_filters(): # fallback to normal count method on filtered queryset return super().count - model = getattr(self.object_list, 'model', None) + model = getattr(self.object_list, "model", None) if model is None: return super().count # otherwise count total rows in a separate fast query return model.objects.count() - + # Alternative approach for PostgreSQL: fallback count takes > 200ms # from django.db import connection, transaction, OperationalError # with transaction.atomic(), connection.cursor() as cursor: diff --git a/archivebox/misc/serve_static.py b/archivebox/misc/serve_static.py index 459eefe4..84da7764 100644 --- a/archivebox/misc/serve_static.py +++ b/archivebox/misc/serve_static.py @@ -3,26 +3,35 @@ import json import re import os import stat +import asyncio import posixpath import mimetypes import importlib +import queue +import threading +import time +import zipfile +from datetime import datetime from collections.abc import Callable from pathlib import Path +from urllib.parse import urlencode from django.contrib.staticfiles import finders +from django.template import TemplateDoesNotExist, loader from django.views import static from django.http import StreamingHttpResponse, Http404, HttpResponse, HttpResponseNotModified from django.utils._os import safe_join from django.utils.http import http_date from django.utils.translation import gettext as _ from archivebox.config.common import SERVER_CONFIG +from archivebox.misc.logging_util import printable_filesize _HASHES_CACHE: dict[Path, tuple[float, dict[str, str]]] = {} def _load_hash_map(snapshot_dir: Path) -> dict[str, str] | None: - hashes_path = snapshot_dir / 'hashes' / 'hashes.json' + hashes_path = snapshot_dir / "hashes" / "hashes.json" if not hashes_path.exists(): return None try: @@ -35,11 +44,11 @@ def _load_hash_map(snapshot_dir: Path) -> dict[str, str] | None: return cached[1] try: - data = json.loads(hashes_path.read_text(encoding='utf-8')) + data = json.loads(hashes_path.read_text(encoding="utf-8")) except Exception: return None - file_map = {str(entry.get('path')): entry.get('hash') for entry in data.get('files', []) if entry.get('path')} + file_map = {str(entry.get("path")): entry.get("hash") for entry in data.get("files", []) if entry.get("path")} _HASHES_CACHE[hashes_path] = (mtime, file_map) return file_map @@ -52,7 +61,192 @@ def _hash_for_path(document_root: Path, rel_path: str) -> str | None: def _cache_policy() -> str: - return 'public' if SERVER_CONFIG.PUBLIC_SNAPSHOTS else 'private' + return "public" if SERVER_CONFIG.PUBLIC_SNAPSHOTS else "private" + + +def _format_direntry_timestamp(stat_result: os.stat_result) -> str: + timestamp = getattr(stat_result, "st_birthtime", None) or stat_result.st_mtime + return datetime.fromtimestamp(timestamp).strftime("%Y-%m-%d %H:%M") + + +def _safe_zip_stem(name: str) -> str: + safe_name = re.sub(r"[^A-Za-z0-9._-]+", "-", name).strip("._-") + return safe_name or "archivebox" + + +class _StreamingQueueWriter: + """Expose a write-only file-like object so zipfile can stream into a queue.""" + + def __init__(self, output_queue: queue.Queue[bytes | BaseException | object]) -> None: + self.output_queue = output_queue + self.position = 0 + + def write(self, data: bytes) -> int: + if data: + self.output_queue.put(data) + self.position += len(data) + return len(data) + + def tell(self) -> int: + return self.position + + def flush(self) -> None: + return None + + def close(self) -> None: + return None + + def writable(self) -> bool: + return True + + def seekable(self) -> bool: + return False + + +def _iter_visible_files(root: Path): + """Yield non-hidden files in a stable order so ZIP output is deterministic.""" + + for current_root, dirnames, filenames in os.walk(root): + dirnames[:] = sorted(dirname for dirname in dirnames if not dirname.startswith(".")) + for filename in sorted(name for name in filenames if not name.startswith(".")): + yield Path(current_root) / filename + + +def _build_directory_zip_response( + fullpath: Path, + path: str, + *, + is_archive_replay: bool, + use_async_stream: bool, +) -> StreamingHttpResponse: + root_name = _safe_zip_stem(fullpath.name or Path(path).name or "archivebox") + sentinel = object() + output_queue: queue.Queue[bytes | BaseException | object] = queue.Queue(maxsize=8) + initial_chunk_target = 64 * 1024 + initial_chunk_wait = 0.05 + + def build_zip() -> None: + # zipfile wants a write-only file object. Feed those bytes straight into + # a queue so the response can stream them out as soon as they are ready. + writer = _StreamingQueueWriter(output_queue) + try: + with zipfile.ZipFile(writer, mode="w", compression=zipfile.ZIP_DEFLATED, compresslevel=6) as zip_file: + for entry in _iter_visible_files(fullpath): + rel_parts = entry.relative_to(fullpath).parts + arcname = Path(root_name, *rel_parts).as_posix() + zip_file.write(entry, arcname) + except BaseException as err: + output_queue.put(err) + finally: + output_queue.put(sentinel) + + threading.Thread(target=build_zip, name=f"zip-stream-{root_name}", daemon=True).start() + + def iter_zip_chunks(): + # Emit a meaningful first chunk quickly so browsers show the download + # immediately instead of waiting on dozens of tiny ZIP header writes. + first_chunk = bytearray() + initial_deadline = time.monotonic() + initial_chunk_wait + + while True: + timeout = max(initial_deadline - time.monotonic(), 0) if len(first_chunk) < initial_chunk_target else None + try: + chunk = output_queue.get(timeout=timeout) if timeout is not None else output_queue.get() + except queue.Empty: + if first_chunk: + yield bytes(first_chunk) + first_chunk.clear() + continue + chunk = output_queue.get() + + if chunk is sentinel: + if first_chunk: + yield bytes(first_chunk) + break + if isinstance(chunk, BaseException): + raise chunk + if len(first_chunk) < initial_chunk_target: + first_chunk.extend(chunk) + if len(first_chunk) >= initial_chunk_target or time.monotonic() >= initial_deadline: + yield bytes(first_chunk) + first_chunk.clear() + continue + yield chunk + + async def stream_zip_async(): + # Django ASGI buffers sync StreamingHttpResponse iterators by consuming + # them into a list. Drive the same sync iterator from a worker thread so + # Daphne can send each chunk as it arrives instead of buffering the ZIP. + iterator = iter(iter_zip_chunks()) + while True: + chunk = await asyncio.to_thread(next, iterator, None) + if chunk is None: + break + yield chunk + + response = StreamingHttpResponse( + stream_zip_async() if use_async_stream else iter_zip_chunks(), + content_type="application/zip", + ) + response.headers["Content-Disposition"] = f'attachment; filename="{root_name}.zip"' + response.headers["Cache-Control"] = f"{_cache_policy()}, max-age=60, stale-while-revalidate=300" + response.headers["Last-Modified"] = http_date(fullpath.stat().st_mtime) + response.headers["X-Accel-Buffering"] = "no" + return _apply_archive_replay_headers( + response, + fullpath=fullpath, + content_type="application/zip", + is_archive_replay=is_archive_replay, + ) + + +def _render_directory_index(request, path: str, fullpath: Path) -> HttpResponse: + try: + template = loader.select_template( + [ + "static/directory_index.html", + "static/directory_index", + ], + ) + except TemplateDoesNotExist: + return static.directory_index(path, fullpath) + + entries = [] + file_list = [] + visible_entries = sorted( + (entry for entry in fullpath.iterdir() if not entry.name.startswith(".")), + key=lambda entry: (not entry.is_dir(), entry.name.lower()), + ) + for entry in visible_entries: + url = str(entry.relative_to(fullpath)) + if entry.is_dir(): + url += "/" + file_list.append(url) + + stat_result = entry.stat() + entries.append( + { + "name": url, + "url": url, + "is_dir": entry.is_dir(), + "size": "—" if entry.is_dir() else printable_filesize(stat_result.st_size), + "timestamp": _format_direntry_timestamp(stat_result), + }, + ) + + zip_query = request.GET.copy() + zip_query["download"] = "zip" + zip_url = request.path + if zip_query: + zip_url = f"{zip_url}?{zip_query.urlencode()}" + + context = { + "directory": f"{path}/", + "file_list": file_list, + "entries": entries, + "zip_url": zip_url, + } + return HttpResponse(template.render(context)) # Ensure common web types are mapped consistently across platforms. @@ -71,16 +265,16 @@ mimetypes.add_type("application/xml", ".xml") mimetypes.add_type("image/svg+xml", ".svg") try: - _markdown = getattr(importlib.import_module('markdown'), 'markdown') + _markdown = getattr(importlib.import_module("markdown"), "markdown") except ImportError: _markdown: Callable[..., str] | None = None -MARKDOWN_INLINE_LINK_RE = re.compile(r'\[([^\]]+)\]\(([^)\s]+(?:\([^)]*\)[^)\s]*)*)\)') -MARKDOWN_INLINE_IMAGE_RE = re.compile(r'!\[([^\]]*)\]\(([^)]+)\)') -MARKDOWN_BOLD_RE = re.compile(r'\*\*([^*]+)\*\*') -MARKDOWN_ITALIC_RE = re.compile(r'(?]*>') -HTML_BODY_RE = re.compile(r']*>(.*)', flags=re.IGNORECASE | re.DOTALL) +MARKDOWN_INLINE_LINK_RE = re.compile(r"\[([^\]]+)\]\(([^)\s]+(?:\([^)]*\)[^)\s]*)*)\)") +MARKDOWN_INLINE_IMAGE_RE = re.compile(r"!\[([^\]]*)\]\(([^)]+)\)") +MARKDOWN_BOLD_RE = re.compile(r"\*\*([^*]+)\*\*") +MARKDOWN_ITALIC_RE = re.compile(r"(?]*>") +HTML_BODY_RE = re.compile(r"]*>(.*)", flags=re.IGNORECASE | re.DOTALL) RISKY_REPLAY_MIMETYPES = { "text/html", "application/xhtml+xml", @@ -99,8 +293,8 @@ def _extract_markdown_candidate(text: str) -> str: body_match = HTML_BODY_RE.search(candidate) if body_match: candidate = body_match.group(1) - candidate = re.sub(r'^\s*]*>', '', candidate, flags=re.IGNORECASE) - candidate = re.sub(r'

\s*$', '', candidate, flags=re.IGNORECASE) + candidate = re.sub(r"^\s*]*>", "", candidate, flags=re.IGNORECASE) + candidate = re.sub(r"

\s*$", "", candidate, flags=re.IGNORECASE) return candidate.strip() @@ -109,15 +303,115 @@ def _looks_like_markdown(text: str) -> bool: if "" in lower: return False md_markers = 0 - md_markers += len(re.findall(r'^\s{0,3}#{1,6}\s+\S', text, flags=re.MULTILINE)) - md_markers += len(re.findall(r'^\s*[-*+]\s+\S', text, flags=re.MULTILINE)) - md_markers += len(re.findall(r'^\s*\d+\.\s+\S', text, flags=re.MULTILINE)) - md_markers += text.count('[TOC]') + md_markers += len(re.findall(r"^\s{0,3}#{1,6}\s+\S", text, flags=re.MULTILINE)) + md_markers += len(re.findall(r"^\s*[-*+]\s+\S", text, flags=re.MULTILINE)) + md_markers += len(re.findall(r"^\s*\d+\.\s+\S", text, flags=re.MULTILINE)) + md_markers += text.count("[TOC]") md_markers += len(MARKDOWN_INLINE_LINK_RE.findall(text)) - md_markers += text.count('\n---') + text.count('\n***') + md_markers += text.count("\n---") + text.count("\n***") return md_markers >= 6 +def _render_text_preview_document(text: str, title: str) -> str: + escaped_title = html.escape(title) + escaped_text = html.escape(text) + return f""" + + + + + {escaped_title} + + + +
{escaped_title}
+
{escaped_text}
+ +""" + + +def _render_image_preview_document(image_url: str, title: str) -> str: + escaped_title = html.escape(title) + escaped_url = html.escape(image_url, quote=True) + return f""" + + + + + {escaped_title} + + + +
+ {escaped_title} +
+ +""" + + def _render_markdown_fallback(text: str) -> str: if _markdown is not None and not HTML_TAG_RE.search(text): try: @@ -133,11 +427,11 @@ def _render_markdown_fallback(text: str) -> str: headings = [] def slugify(value: str) -> str: - slug = re.sub(r'[^A-Za-z0-9]+', '-', value).strip('-') + slug = re.sub(r"[^A-Za-z0-9]+", "-", value).strip("-") return slug or "section" for raw_line in lines: - heading_match = re.match(r'^\s{0,3}(#{1,6})\s+(.*)$', raw_line) + heading_match = re.match(r"^\s{0,3}(#{1,6})\s+(.*)$", raw_line) if heading_match: level = len(heading_match.group(1)) content = heading_match.group(2).strip() @@ -152,8 +446,8 @@ def _render_markdown_fallback(text: str) -> str: def render_inline(markup: str) -> str: content = MARKDOWN_INLINE_IMAGE_RE.sub(r'\1', markup) content = MARKDOWN_INLINE_LINK_RE.sub(r'\1', content) - content = MARKDOWN_BOLD_RE.sub(r'\1', content) - content = MARKDOWN_ITALIC_RE.sub(r'\1', content) + content = MARKDOWN_BOLD_RE.sub(r"\1", content) + content = MARKDOWN_ITALIC_RE.sub(r"\1", content) return content def close_lists(): @@ -194,7 +488,7 @@ def _render_markdown_fallback(text: str) -> str: html_lines.append("
") continue - heading_match = re.match(r'^\s*((?:<[^>]+>\s*)*)(#{1,6})\s+(.*)$', line) + heading_match = re.match(r"^\s*((?:<[^>]+>\s*)*)(#{1,6})\s+(.*)$", line) if heading_match: close_lists() if in_blockquote: @@ -205,7 +499,7 @@ def _render_markdown_fallback(text: str) -> str: content = heading_match.group(3).strip() if leading_tags: html_lines.append(leading_tags) - html_lines.append(f"{render_inline(content)}") + html_lines.append(f'{render_inline(content)}') continue if stripped in ("---", "***"): @@ -226,7 +520,7 @@ def _render_markdown_fallback(text: str) -> str: html_lines.append("") in_blockquote = False - ul_match = re.match(r'^\s*[-*+]\s+(.*)$', line) + ul_match = re.match(r"^\s*[-*+]\s+(.*)$", line) if ul_match: if in_ol: html_lines.append("") @@ -237,7 +531,7 @@ def _render_markdown_fallback(text: str) -> str: html_lines.append(f"
  • {render_inline(ul_match.group(1))}
  • ") continue - ol_match = re.match(r'^\s*\d+\.\s+(.*)$', line) + ol_match = re.match(r"^\s*\d+\.\s+(.*)$", line) if ol_match: if in_ul: html_lines.append("") @@ -255,10 +549,10 @@ def _render_markdown_fallback(text: str) -> str: toc_items = [] for level, title, slug in headings: toc_items.append( - f'
  • {title}
  • ' + f'
  • {title}
  • ', ) html_lines.append( - '' + '", ) continue @@ -276,8 +570,8 @@ def _render_markdown_fallback(text: str) -> str: def _render_markdown_document(markdown_text: str) -> str: body = _render_markdown_fallback(markdown_text) wrapped = ( - "" - "" + '' + '' " +{% endblock %} + +{% block extrahead %} +{{ block.super }} +{{ media.js }} + +{% endblock %} + +{% block bodyclass %}{{ block.super }} app-{{ opts.app_label }} model-{{ opts.model_name }} change-list{% endblock %} + +{% if not is_popup %} +{% block breadcrumbs %} + +{% endblock %} +{% endif %} + +{% block coltype %}{% endblock %} + +{% block content %} +
    + {% block object-tools %} +
      + {% block object-tools-items %} + {% change_list_object_tools %} + {% endblock %} +
    + {% endblock %} + {% if cl.formset and cl.formset.errors %} +

    + {% blocktranslate count counter=cl.formset.total_error_count %}Please correct the error below.{% plural %}Please correct the errors below.{% endblocktranslate %} +

    + {{ cl.formset.non_form_errors }} + {% endif %} + {% if cl.model_admin.show_search_mode_selector %} + {% with current_search_mode=cl.params.search_mode|default:cl.model_admin.get_default_search_mode %} +
    + {% endwith %} + {% else %} +
    + {% endif %} +
    +
    + {% block search %}{% search_form cl %}{% endblock %} + {% block date_hierarchy %}{% if cl.date_hierarchy %}{% date_hierarchy cl %}{% endif %}{% endblock %} + +
    {% csrf_token %} + {% if cl.formset %} +
    {{ cl.formset.management_form }}
    + {% endif %} + + {% block result_list %} + {% if action_form and actions_on_top and cl.show_admin_actions %}{% admin_actions %}{% endif %} + {% result_list cl %} + {% if action_form and actions_on_bottom and cl.show_admin_actions %}{% admin_actions %}{% endif %} + {% endblock %} + {% block pagination %} + + {% endblock %} +
    +
    +
    + {% block filters %} + {% if cl.has_filters %} +
    +

    + {% translate 'Filter' %} + +

    + {% if cl.is_facets_optional or cl.has_active_filters %}
    + {% if cl.is_facets_optional %}

    + {% if cl.add_facets %}{% translate "Hide counts" %} + {% else %}{% translate "Show counts" %}{% endif %} +

    {% endif %} + {% if cl.has_active_filters %}

    + ✖ {% translate "Clear all filters" %} +

    {% endif %} +
    {% endif %} + {% for spec in cl.filter_specs %}{% admin_list_filter cl spec %}{% endfor %} +
    + {% endif %} + {% endblock %} +
    +
    + {% if cl.has_filters %} + + {% endif %} +{% endblock %} diff --git a/archivebox/templates/admin/change_list_results.html b/archivebox/templates/admin/change_list_results.html new file mode 100644 index 00000000..71f410e3 --- /dev/null +++ b/archivebox/templates/admin/change_list_results.html @@ -0,0 +1,38 @@ +{% load i18n core_tags %} +{% if result_hidden_fields %} +
    {# DIV for HTML validation #} +{% for item in result_hidden_fields %}{{ item }}{% endfor %} +
    +{% endif %} +{% if results %} +
    + + + +{% for header in result_headers %} +{% endfor %} + + + +{% for result in results %} +{% if result.form and result.form.non_field_errors %} + +{% endif %} +{% with row_obj=cl.result_list|index:forloop.counter0 %} +{% for item in result %}{{ item }}{% endfor %} +{% endwith %} +{% endfor %} + +
    + {% if header.sortable and header.sort_priority > 0 %} +
    + + {% if num_sorted_fields > 1 %}{{ header.sort_priority }}{% endif %} + +
    + {% endif %} +
    {% if header.sortable %}{{ header.text|capfirst }}{% else %}{{ header.text|capfirst }}{% endif %}
    +
    +
    {{ result.form.non_field_errors }}
    +
    +{% endif %} diff --git a/archivebox/templates/admin/core/archiveresult/change_list.html b/archivebox/templates/admin/core/archiveresult/change_list.html new file mode 100644 index 00000000..b44e9211 --- /dev/null +++ b/archivebox/templates/admin/core/archiveresult/change_list.html @@ -0,0 +1,142 @@ +{% extends "admin/base_site.html" %} +{% load i18n admin_urls static admin_list %} + +{% block title %}{% if cl.formset and cl.formset.errors %}{% translate "Error:" %} {% endif %}{{ block.super }}{% endblock %} +{% block extrastyle %} + {{ block.super }} + + {% if cl.formset %} + + {% endif %} + {% if cl.formset or action_form %} + + {% endif %} + {{ media.css }} + {% if not actions_on_top and not actions_on_bottom %} + + {% endif %} +{% endblock %} + +{% block extrahead %} +{{ block.super }} +{{ media.js }} +{% endblock %} + +{% block bodyclass %}{{ block.super }} app-{{ opts.app_label }} model-{{ opts.model_name }} change-list{% endblock %} + +{% if not is_popup %} +{% block breadcrumbs %} + +{% endblock %} +{% endif %} + +{% block coltype %}{% endblock %} + +{% block content %} +
    + {% block object-tools %} +
      + {% block object-tools-items %} + {% change_list_object_tools %} + {% endblock %} +
    + {% endblock %} + {% if cl.formset and cl.formset.errors %} +

    + {% if cl.formset.total_error_count == 1 %}{% translate "Please correct the error below." %}{% else %}{% translate "Please correct the errors below." %}{% endif %} +

    + {{ cl.formset.non_form_errors }} + {% endif %} +
    +
    +
    + {% block search %}{% search_form cl %}{% endblock %} + {% block date_hierarchy %}{% if cl.date_hierarchy %}{% date_hierarchy cl %}{% endif %}{% endblock %} + +
    {% csrf_token %} + {% if cl.formset %} +
    {{ cl.formset.management_form }}
    + {% endif %} + + {% block result_list %} + {% if action_form and actions_on_top and cl.show_admin_actions %}{% admin_actions %}{% endif %} + {% result_list cl %} + {% if action_form and actions_on_bottom and cl.show_admin_actions %}{% admin_actions %}{% endif %} + {% endblock %} + {% block pagination %} + + {% endblock %} +
    +
    +
    + {% if cl.has_filters %} +
    +

    + {% translate 'Filter' %} + +

    + {% if cl.has_active_filters %}

    + ✖ {% translate "Clear all filters" %} +

    {% endif %} + {% for spec in cl.filter_specs %}{% admin_list_filter cl spec %}{% endfor %} +
    + {% endif %} +
    +
    + {% if cl.has_filters %} + + {% endif %} +{% endblock %} diff --git a/archivebox/templates/admin/core/tag/change_list.html b/archivebox/templates/admin/core/tag/change_list.html index 5ce822c5..183826d0 100644 --- a/archivebox/templates/admin/core/tag/change_list.html +++ b/archivebox/templates/admin/core/tag/change_list.html @@ -178,7 +178,7 @@ .tag-grid { display: grid; gap: 12px; - grid-template-columns: repeat(auto-fit, minmax(240px, 1fr)); + grid-template-columns: repeat(auto-fit, minmax(300px, 1fr)); } .tag-card { @@ -202,17 +202,27 @@ } .tag-card__header { - display: flex; - justify-content: space-between; + display: grid; gap: 10px; - align-items: flex-start; } .tag-card__title { - flex: 1 1 auto; - min-width: 0; display: grid; gap: 4px; + min-width: 0; + } + + .tag-card__display { + min-width: 0; + } + + .tag-card__display a { + display: block; + color: inherit; + overflow: hidden; + text-decoration: none; + text-overflow: ellipsis; + white-space: nowrap; } .tag-card__title strong, @@ -221,7 +231,10 @@ font-size: 17px; line-height: 1.1; color: #111827; - word-break: break-word; + overflow: hidden; + text-overflow: ellipsis; + white-space: nowrap; + word-break: normal; } .tag-card__count { @@ -237,10 +250,9 @@ } .tag-card__actions { - flex: 0 0 auto; display: flex; flex-wrap: wrap; - justify-content: flex-end; + justify-content: flex-start; align-items: center; gap: 6px; } diff --git a/archivebox/templates/admin/private_index.html b/archivebox/templates/admin/private_index.html index 370343e6..7db75b30 100644 --- a/archivebox/templates/admin/private_index.html +++ b/archivebox/templates/admin/private_index.html @@ -106,12 +106,17 @@ var storageKey = 'admin-filters-collapsed'; var toggle = document.getElementById('changelist-filter-toggle'); if (!toggle) return; + var toolbarToggle = document.getElementById('changelist-toolbar-filter-toggle'); function applyState() { var collapsed = localStorage.getItem(storageKey) === 'true'; document.body.classList.toggle('filters-collapsed', collapsed); toggle.textContent = collapsed ? toggle.dataset.showLabel : toggle.dataset.hideLabel; toggle.setAttribute('aria-expanded', collapsed ? 'false' : 'true'); + if (toolbarToggle) { + toolbarToggle.textContent = toggle.dataset.showLabel; + toolbarToggle.style.display = collapsed ? 'inline-block' : 'none'; + } } toggle.addEventListener('click', function() { @@ -119,6 +124,12 @@ localStorage.setItem(storageKey, collapsed ? 'true' : 'false'); applyState(); }); + if (toolbarToggle) { + toolbarToggle.addEventListener('click', function() { + localStorage.setItem(storageKey, 'false'); + applyState(); + }); + } applyState(); })(); diff --git a/archivebox/templates/admin/private_index_grid.html b/archivebox/templates/admin/private_index_grid.html index 370343e6..7db75b30 100644 --- a/archivebox/templates/admin/private_index_grid.html +++ b/archivebox/templates/admin/private_index_grid.html @@ -106,12 +106,17 @@ var storageKey = 'admin-filters-collapsed'; var toggle = document.getElementById('changelist-filter-toggle'); if (!toggle) return; + var toolbarToggle = document.getElementById('changelist-toolbar-filter-toggle'); function applyState() { var collapsed = localStorage.getItem(storageKey) === 'true'; document.body.classList.toggle('filters-collapsed', collapsed); toggle.textContent = collapsed ? toggle.dataset.showLabel : toggle.dataset.hideLabel; toggle.setAttribute('aria-expanded', collapsed ? 'false' : 'true'); + if (toolbarToggle) { + toolbarToggle.textContent = toggle.dataset.showLabel; + toolbarToggle.style.display = collapsed ? 'inline-block' : 'none'; + } } toggle.addEventListener('click', function() { @@ -119,6 +124,12 @@ localStorage.setItem(storageKey, collapsed ? 'true' : 'false'); applyState(); }); + if (toolbarToggle) { + toolbarToggle.addEventListener('click', function() { + localStorage.setItem(storageKey, 'false'); + applyState(); + }); + } applyState(); })(); diff --git a/archivebox/templates/admin/progress_monitor.html b/archivebox/templates/admin/progress_monitor.html index f5e48789..9d866742 100644 --- a/archivebox/templates/admin/progress_monitor.html +++ b/archivebox/templates/admin/progress_monitor.html @@ -765,7 +765,7 @@ ${statusIcon}
    -
    ${formatUrl(snapshot.url)}
    +
    Snapshot: ${formatUrl(snapshot.url)}
    ${snapshotMeta}
    @@ -863,7 +863,7 @@
    ${statusIcon}
    -
    ${crawl.label || '(no label)'}
    +
    Crawl: ${crawl.label || '(no label)'}
    ${metaText}
    - - {{obj.title|default:'Not yet archived...'}} + + {{obj.title|default:'Not yet archived...'}}
    @@ -275,6 +295,7 @@ const detectedURLsPanel = document.querySelector('.detected-urls-panel'); const detectedURLsSummary = document.getElementById('detected-urls-summary'); const detectedURLsList = document.getElementById('detected-urls-list'); + const maxURLsInput = document.querySelector('input[name="max_urls"]'); const sharedURLPattern = urlTextarea.dataset.urlRegex || '(?=(http[s]?://(?:[a-zA-Z]|[0-9]|[-_$@.&+!*\\(\\),]|[^\\u0000-\\u007F])+[^\\]\\[<>"\\\'\\s]+))'; const previewURLPattern = ( sharedURLPattern && sharedURLPattern.startsWith('(?=(') && sharedURLPattern.endsWith('))') @@ -673,6 +694,43 @@ return getEffectiveFilterPatterns('URL_DENYLIST', 'textarea[name="url_filters_denylist"]'); } + function getMaxURLsLimit() { + if (!maxURLsInput) { + return 0; + } + const rawValue = String(maxURLsInput.value || '').trim(); + if (!rawValue) { + return 0; + } + const parsed = parseInt(rawValue, 10); + return Number.isFinite(parsed) && parsed > 0 ? parsed : 0; + } + + function applyMaxURLsLimit(matches) { + const limit = getMaxURLsLimit(); + if (!limit) { + return matches; + } + + let allowedCount = 0; + return matches.map(match => { + if (match.filter.status === 'denied' || match.filter.status === 'filtered') { + return match; + } + if (allowedCount < limit) { + allowedCount += 1; + return match; + } + return { + ...match, + filter: { + status: 'limited', + message: `Excluded by max_urls limit (${limit})`, + }, + }; + }); + } + function hasDenyPattern(pattern) { return !!pattern && getCurrentDenyPatterns().includes(pattern); } @@ -806,11 +864,13 @@ allowlisted: matches.filter(match => match.filter.status === 'allowlisted').length, denied: matches.filter(match => match.filter.status === 'denied').length, filtered: matches.filter(match => match.filter.status === 'filtered').length, + limited: matches.filter(match => match.filter.status === 'limited').length, }; - summary.total = matches.length - summary.denied - summary.filtered; + summary.total = matches.length - summary.denied - summary.filtered - summary.limited; const summaryParts = []; if (summary.allowlisted) summaryParts.push(`${summary.allowlisted} allowed`); if (summary.denied) summaryParts.push(`${summary.denied} denied`); + if (summary.limited) summaryParts.push(`${summary.limited} over limit`); summaryParts.push(`${summary.total} total`); detectedURLsSummary.textContent = summaryParts.join(' • '); @@ -945,11 +1005,12 @@ filter: getFilterState(match.url), })); const uniqueMatches = dedupeMatchesForFilterView(matches); + const limitedMatches = applyMaxURLsLimit(uniqueMatches); const count = matches.length; urlCounter.textContent = `${count} URL${count !== 1 ? 's' : ''} detected`; urlCounter.className = count > 0 ? 'url-counter url-counter-positive' : 'url-counter'; renderHighlightLayer(urlTextarea.value, matches); - renderDetectedURLs(uniqueMatches); + renderDetectedURLs(limitedMatches); syncHighlightScroll(); } @@ -968,6 +1029,7 @@ document.getElementById('add-form').addEventListener('input', function(event) { if ( event.target === urlTextarea || + event.target === maxURLsInput || event.target.matches('textarea[name="url_filters_allowlist"]') || event.target.matches('textarea[name="url_filters_denylist"]') || event.target.matches('#id_config_rows .kv-key') || @@ -981,6 +1043,7 @@ if ( event.target.matches('textarea[name="url_filters_allowlist"]') || event.target.matches('textarea[name="url_filters_denylist"]') || + event.target.matches('input[name="max_urls"]') || event.target.matches('input[name="url_filters_same_domain_only"]') || event.target.matches('#id_config_rows .kv-key') || event.target.matches('#id_config_rows .kv-value') diff --git a/archivebox/templates/core/public_index.html b/archivebox/templates/core/public_index.html index ab7fe3e1..45535913 100644 --- a/archivebox/templates/core/public_index.html +++ b/archivebox/templates/core/public_index.html @@ -4,7 +4,7 @@ {% block body %}
    -