diff --git a/Dockerfile b/Dockerfile index 36763782..36671225 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,5 @@ # This is the Dockerfile for ArchiveBox, it bundles the following main dependencies: -# python3.14, pip, pipx, uv, python3-ldap +# python3.13, uv, python3-ldap # curl, wget, git, dig, ping, tree, nano # node, npm, single-file, readability-extractor, postlight-parser # ArchiveBox, yt-dlp, playwright, chromium @@ -12,7 +12,7 @@ # docker run -v "$PWD/data":/data -p 8000:8000 archivebox server # Multi-arch build: # docker buildx create --use -# docker buildx build . --platform=linux/amd64,linux/arm64--push -t archivebox/archivebox:dev -t archivebox/archivebox:sha-abc123 +# docker buildx build . --platform=linux/amd64,linux/arm64 --push -t archivebox/archivebox:dev -t archivebox/archivebox:sha-abc123 # Read more here: https://github.com/ArchiveBox/ArchiveBox#archivebox-development @@ -20,9 +20,9 @@ ### Example: Using ArchiveBox in your own project's Dockerfile ######## -# FROM python:3.14-slim +# FROM python:3.13-slim # WORKDIR /data -# RUN pip install archivebox>=0.8.5rc51 # use latest release here +# RUN pip install archivebox>=0.9.0 # use latest release here # RUN archivebox install # RUN useradd -ms /bin/bash archivebox && chown -R archivebox /data @@ -82,8 +82,6 @@ ENV ARCHIVEBOX_USER="archivebox" \ ENV CODE_DIR=/app \ DATA_DIR=/data \ PLAYWRIGHT_BROWSERS_PATH=/browsers - # GLOBAL_VENV=/venv \ - # TODO: add TMP_DIR and LIB_DIR? # Bash SHELL config # http://redsymbol.net/articles/unofficial-bash-strict-mode/ @@ -201,7 +199,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T --mount=type=cache,target=/root/.npm,sharing=locked,id=npm-$TARGETARCH$TARGETVARIANT \ echo "[+] APT Installing NODE $NODE_VERSION for $TARGETPLATFORM..." 
\ && echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_${NODE_VERSION}.x nodistro main" >> /etc/apt/sources.list.d/nodejs.list \ - && curl -fsSL "https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key" | gpg --dearmor | gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg \ + && curl -fsSL "https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key" | gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg \ && apt-get update -qq \ && apt-get install -qq -y --no-upgrade libatomic1 \ && apt-get install -y --no-upgrade \ @@ -218,7 +216,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T # Set up uv and main app /venv -COPY --from=ghcr.io/astral-sh/uv:0.5 /uv /uvx /bin/ +COPY --from=ghcr.io/astral-sh/uv:0.6 /uv /uvx /bin/ ENV UV_COMPILE_BYTECODE=1 \ UV_PYTHON_PREFERENCE=managed \ UV_PYTHON_INSTALL_DIR=/opt/uv/python \ @@ -282,7 +280,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T # && service dbus start \ && echo "[+] PIP Installing playwright into /venv and CHROMIUM binary into $PLAYWRIGHT_BROWSERS_PATH..." \ && uv pip install "playwright>=1.49.1" \ - && uv run playwright install chromium --no-shell --with-deps \ + && uv run playwright install chromium --no-shell --with-deps \ && export CHROME_BINARY="$(uv run python -c 'from playwright.sync_api import sync_playwright; print(sync_playwright().start().chromium.executable_path)')" \ && ln -s "$CHROME_BINARY" /usr/bin/chromium-browser \ && ln -s /browsers/ffmpeg-*/ffmpeg-linux /usr/bin/ffmpeg \ @@ -381,11 +379,9 @@ RUN (echo -e "\n\n[√] Finished Docker build succesfully. 
Saving build summary && echo -e "BUILD_END_TIME=$(date +"%Y-%m-%d %H:%M:%S %s")\n\n" \ ) | tee -a /VERSION.txt -# Run $ archivebox version >> /VERSION.txt -# RUN "$CODE_DIR"/bin/docker_entrypoint.sh init 2>&1 | tee -a /VERSION.txt -# Note: archivebox version is skipped during build due to uv managed Python stdlib issue -# The version will be verified at runtime instead -RUN chmod +x "$CODE_DIR"/bin/*.sh +# Verify ArchiveBox is installed and print version info +RUN chmod +x "$CODE_DIR"/bin/*.sh \ + && archivebox version 2>&1 | tee -a /VERSION.txt || true #################################################### @@ -395,7 +391,7 @@ VOLUME "$DATA_DIR" EXPOSE 8000 HEALTHCHECK --interval=30s --timeout=20s --retries=15 \ - CMD curl --silent 'http://admin.archivebox.localhost:8000/health/' | grep -q 'OK' + CMD curl --silent 'http://localhost:8000/health/' | grep -q 'OK' ENTRYPOINT ["dumb-init", "--", "/app/bin/docker_entrypoint.sh"] CMD ["archivebox", "server", "--init", "0.0.0.0:8000"] diff --git a/README.md b/README.md index 1804ecf8..6615dce4 100644 --- a/README.md +++ b/README.md @@ -77,7 +77,7 @@ The goal is to sleep soundly knowing the part of the internet you care about wil
# Option A: Get ArchiveBox with Docker Compose (recommended):
 mkdir -p ~/archivebox/data && cd ~/archivebox
 curl -fsSL 'https://docker-compose.archivebox.io' > docker-compose.yml   # edit options in this file as-needed
-docker compose run archivebox init --setup
+docker compose run archivebox init --install
 # docker compose run archivebox add 'https://example.com'
 # docker compose run archivebox help
 # docker compose up
@@ -85,7 +85,7 @@ docker compose run archivebox init --setup
 
# Option B: Or use it as a plain Docker container: mkdir -p ~/archivebox/data && cd ~/archivebox/data -docker run -it -v $PWD:/data archivebox/archivebox init --setup +docker run -it -v $PWD:/data archivebox/archivebox init --install # docker run -it -v $PWD:/data archivebox/archivebox add 'https://example.com' # docker run -it -v $PWD:/data archivebox/archivebox help # docker run -it -v $PWD:/data -p 8000:8000 archivebox/archivebox @@ -94,7 +94,7 @@ docker run -it -v $PWD:/data archivebox/archivebox init --setup # Option C: Or install it with your preferred pkg manager (see Quickstart below for apt, brew, and more) pip install archivebox mkdir -p ~/archivebox/data && cd ~/archivebox/data -archivebox init --setup +archivebox init --install # archivebox add 'https://example.com' # archivebox help # archivebox server 0.0.0.0:8000 @@ -189,7 +189,7 @@ ArchiveBox is free for everyone to self-host, but we also provide support, secur curl -fsSL 'https://docker-compose.archivebox.io' > docker-compose.yml
  • Run the initial setup to create an admin user (or set ADMIN_USER/PASS in docker-compose.yml) -
    docker compose run archivebox init --setup
    +
    docker compose run archivebox init --install
     
  • Next steps: Start the server then log in to the Web UI http://127.0.0.1:8000 ⇢ Admin.
    docker compose up
    @@ -213,7 +213,7 @@ See below for more usage examples using the C
     
  • Install Docker on your system (if not already installed).
  • Create a new empty directory and initialize your collection (can be anywhere).
    mkdir -p ~/archivebox/data && cd ~/archivebox/data
    -docker run -v $PWD:/data -it archivebox/archivebox init --setup
    +docker run -v $PWD:/data -it archivebox/archivebox init --install
     
  • Optional: Start the server then login to the Web UI http://127.0.0.1:8000 ⇢ Admin. @@ -259,19 +259,18 @@ See "Against curl | sh as a
      -
    1. Install Python >= v3.10 and Node >= v18 on your system (if not already installed).
    2. +
    3. Install Python >= v3.13 and Node >= v22 on your system (if not already installed).
    4. Install the ArchiveBox package using pip3 (or uvx). -
      pip3 install --upgrade archivebox yt-dlp playwright
      -playwright install --with-deps chromium
      +
      pip3 install --upgrade archivebox
       archivebox version
       # install any missing extras shown using apt/brew/pkg/etc. see Wiki for instructions
      -#    python@3.10 node curl wget git ripgrep ...
      +#    python@3.13 node curl wget git ripgrep ...
       
      See the Install: Bare Metal Wiki for full install instructions for each OS...
    5. Create a new empty directory and initialize your collection (can be anywhere).
      mkdir -p ~/archivebox/data && cd ~/archivebox/data   # for example
      -archivebox init --setup   # instantialize a new collection
      +archivebox init --install   # initialize a new collection
       # (--install auto-installs and links JS dependencies: singlefile, readability, mercury, etc.)
       
    6. @@ -312,7 +311,7 @@ archivebox version # make sure all dependencies are inst
    7. Create a new empty directory and initialize your collection (can be anywhere).
      mkdir -p ~/archivebox/data && cd ~/archivebox/data
      -archivebox init --setup
      +archivebox init --install
       

    8. @@ -346,7 +345,7 @@ archivebox version # make sure all dependencies are inst
    9. Create a new empty directory and initialize your collection (can be anywhere).
      mkdir -p ~/archivebox/data && cd ~/archivebox/data
      -archivebox init --setup
      +archivebox init --install
       
    10. Optional: Start the server then login to the Web UI http://127.0.0.1:8000 ⇢ Admin. @@ -519,7 +518,7 @@ archivebox persona create --import=chrome personal # make sure you have pip-installed ArchiveBox and it's available in your $PATH first
      # archivebox [subcommand] [--help] -archivebox init --setup # safe to run init multiple times (also how you update versions) +archivebox init --install # safe to run init multiple times (also how you update versions) archivebox version # get archivebox version info + check dependencies archivebox help # get list of archivebox subcommands that can be run archivebox add --depth=1 'https://news.ycombinator.com' @@ -536,7 +535,7 @@ archivebox add --depth=1 'https://news.ycombinator.com' # make sure you have `docker-compose.yml` from the Quickstart instructions first
      # docker compose run archivebox [subcommand] [--help] -docker compose run archivebox init --setup +docker compose run archivebox init --install docker compose run archivebox version docker compose run archivebox help docker compose run archivebox add --depth=1 'https://news.ycombinator.com' @@ -554,7 +553,7 @@ docker compose run archivebox add --depth=1 'https://news.ycombinator.com' # make sure you create and cd into in a new empty directory first
      # docker run -it -v $PWD:/data archivebox/archivebox [subcommand] [--help] -docker run -v $PWD:/data -it archivebox/archivebox init --setup +docker run -v $PWD:/data -it archivebox/archivebox init --install docker run -v $PWD:/data -it archivebox/archivebox version docker run -v $PWD:/data -it archivebox/archivebox help docker run -v $PWD:/data -it archivebox/archivebox add --depth=1 'https://news.ycombinator.com' @@ -760,7 +759,7 @@ env CHROME_BINARY=chromium archivebox ... # run with a one-off config These methods also work the same way when run inside Docker, see the Docker Configuration wiki page for details.
      -The configuration is documented here: **[Configuration Wiki](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration)**, and loaded here: [`archivebox/config.py`](https://github.com/ArchiveBox/ArchiveBox/blob/dev/archivebox/config.py). +The configuration is documented here: **[Configuration Wiki](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration)**, and loaded from: [`archivebox/config/`](https://github.com/ArchiveBox/ArchiveBox/blob/dev/archivebox/config/).
      @@ -771,16 +770,12 @@ The configuration is documented here: **[Configuration Wiki](https://github.com/
      TIMEOUT=240 # default: 60 add more seconds on slower networks CHECK_SSL_VALIDITY=False # default: True False = allow saving URLs w/ bad SSL -SAVE_ARCHIVEDOTORG=False # default: True False = disable Archive.org saving -YTDLP_MAX_SIZE=1500m # default: 750m raise/lower yt-dlp output size
      PUBLIC_INDEX=True # default: True whether anon users can view index PUBLIC_SNAPSHOTS=True # default: True whether anon users can view pages PUBLIC_ADD_VIEW=False # default: False whether anon users can add new URLs
      -CHROME_USER_AGENT="Mozilla/5.0 ..." # change these to get around bot blocking -WGET_USER_AGENT="Mozilla/5.0 ..." -CURL_USER_AGENT="Mozilla/5.0 ..." +USER_AGENT="Mozilla/5.0 ..." # change this to get around bot blocking

  • @@ -802,13 +797,13 @@ ArchiveBox bundles industry-standard tools like [Google Chrome](https://github.c
      -
    • Language: Python >=3.10
    • +
    • Language: Python >=3.13
    • Backend: Django + Django-Ninja for REST API
    • -
    • Frontend: Django Admin + Vanilla HTML, CSS, JS
    • -
    • Web Server: Django + channels + daphne]
    • -
    • Database: Django ORM saving to SQLite3 ./data/index.sqlite
    • -
    • Job Queue: Huey using ./data/queue.sqlite3 under supervisord
    • -
    • Build/test/lint: pdm / mypy+pyright+pytest / ruff
    • +
    • Frontend: Django Admin + Vanilla HTML, CSS, JS
    • +
    • Web Server: Django + daphne (ASGI)
    • +
    • Database: Django ORM saving to SQLite3 ./data/index.sqlite3
    • +
    • Job Queue: Custom orchestrator using supervisord for worker management
    • +
    • Build/test/lint: uv / mypy+pyright+pytest / ruff
    • Subdependencies: abx-pkg installs apt/brew/pip/npm pkgs at runtime (e.g. yt-dlp, singlefile, readability, git)
    @@ -838,7 +833,7 @@ If not using Docker, make sure to keep the dependencies up-to-date yourself and # apt/brew/pip/etc install ... (see Quickstart instructions above)
    which -a archivebox # see where you have installed archivebox -archivebox setup # auto install all the extractors and extras +archivebox install # auto install all the extractors and extras archivebox --version # see info and check validity of installed dependencies
    @@ -963,18 +958,11 @@ If you're importing pages with private content or URLs containing secret tokens archivebox add 'https://docs.google.com/document/d/12345somePrivateDocument' archivebox add 'https://vimeo.com/somePrivateVideo' -# without first disabling saving to Archive.org: -archivebox config --set SAVE_ARCHIVEDOTORG=False # disable saving all URLs in Archive.org - # restrict the main index, Snapshot content, and Add Page to authenticated users as-needed: archivebox config --set PUBLIC_INDEX=False archivebox config --set PUBLIC_SNAPSHOTS=False -archivebox config --set PUBLIC_ADD_VIEW=False +archivebox config --set PUBLIC_ADD_VIEW=False archivebox manage createsuperuser - -# if extra paranoid or anti-Google: -archivebox config --set SAVE_FAVICON=False # disable favicon fetching (it calls a Google API passing the URL's domain part only) -archivebox config --set CHROME_BINARY=chromium # ensure it's using Chromium instead of Chrome
    @@ -1017,7 +1005,7 @@ https://127.0.0.1:8000/archive/*

    NOTE: Only the wget & dom extractor methods execute archived JS when viewing snapshots; all other archive methods produce static output that does not execute JS on viewing.
    -If you are worried about these issues ^ you should disable these extractors using:
    archivebox config --set SAVE_WGET=False SAVE_DOM=False.

    +If you are worried about these issues ^ you can disable specific extractor plugins via the admin UI or configuration.

    Learn More

    @@ -1377,15 +1365,15 @@ git pull --recurse-submodules ```bash # Install ArchiveBox + python dependencies pip install uv -./bin/lock_pkgs.sh # (aka `uv venv; uv sync;` + generate requirements.txt) +uv sync --dev --all-extras source .venv/bin/activate # activate the venv # Install ArchiveBox runtime dependencies mkdir -p data && cd data -archivebox install # on >=v0.8.5 (otherwise `archivebox setup`) +archivebox install # detect and install all extractor dependencies # Run the development server w/ autoreloading (but no bg workers) -archivebox manage runserver --debug --reload 0.0.0.0:8000 +archivebox server --debug --reload 0.0.0.0:8000 # Run the production server (with bg workers but no autoreloading) archivebox server 0.0.0.0:8000 @@ -1399,10 +1387,10 @@ archivebox server 0.0.0.0:8000 # inside the container will reload and pick up your changes ./bin/build_docker.sh dev -docker run -it -v $PWD/data:/data archivebox/archivebox:dev init --setup +docker run -it -v $PWD/data:/data archivebox/archivebox:dev init --install # Run the development server w/ autoreloading (but no bg workers) -docker run -it -v $PWD/data:/data -v $PWD/archivebox:/app/archivebox -p 8000:8000 archivebox/archivebox:dev manage runserver 0.0.0.0:8000 --debug --reload +docker run -it -v $PWD/data:/data -v $PWD/archivebox:/app/archivebox -p 8000:8000 archivebox/archivebox:dev server --debug --reload 0.0.0.0:8000 # Run the production server (with bg workers but no autoreloading) docker run -it -v $PWD/data:/data -v $PWD/archivebox:/app/archivebox -p 8000:8000 archivebox/archivebox:dev server @@ -1427,7 +1415,7 @@ You can also run all these in Docker. 
For more examples see the GitHub Actions C archivebox config --set DEBUG=True # OR you can run a dev server with DEBUG=True in a few ways: -archivebox manage runserver --debug --reload 0.0.0.0:8000 +archivebox server --debug --reload 0.0.0.0:8000 # or archivebox server --debug 0.0.0.0:8000 # or diff --git a/bin/docker_entrypoint.sh b/bin/docker_entrypoint.sh index 9a3b3d3c..b9e10297 100755 --- a/bin/docker_entrypoint.sh +++ b/bin/docker_entrypoint.sh @@ -32,8 +32,8 @@ export ARCHIVEBOX_USER="${ARCHIVEBOX_USER:-archivebox}" export DEFAULT_PUID=911 export DEFAULT_PGID=911 -# If user tires to set PUID and PGID to root values manually, catch and reject because root is not allowed -if [[ "$PUID" == "0" ]]; then +# If user tries to set PUID and PGID to root values manually, catch and reject because root is not allowed +if [[ "${PUID:-}" == "0" ]]; then echo -e "\n[X] Error: Got PUID=$PUID and PGID=$PGID but ArchiveBox is not allowed to be run as root, please change or unset PUID & PGID and try again." > /dev/stderr echo -e " Hint: some NFS/SMB/FUSE/etc. 
filesystems force-remap/ignore all permissions," > /dev/stderr echo -e " leave PUID/PGID unset, disable root_squash, or use values the drive prefers (default is $DEFAULT_PUID:$DEFAULT_PGID)" > /dev/stderr diff --git a/docker-compose.yml b/docker-compose.yml index 76b237ea..416a48fc 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,11 +1,10 @@ # Usage: # mkdir -p ~/archivebox/data && cd ~/archivebox # curl -fsSL 'https://docker-compose.archivebox.io' > docker-compose.yml -# docker compose run archivebox version -# docker compose run archivebox config --set SAVE_ARCHIVEDOTORG=False +# docker compose run archivebox init # docker compose run archivebox add --depth=1 'https://news.ycombinator.com' # docker compose run -T archivebox add < bookmarks.txt -# docker compose up -d && open 'http://web.archivebox.localhost:8000' +# docker compose up -d && open 'http://localhost:8000' # docker compose run archivebox help # Documentation: # https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#docker-compose @@ -21,9 +20,8 @@ services: environment: # - ADMIN_USERNAME=admin # creates an admin user on first run with the given user/pass combo # - ADMIN_PASSWORD=SomeSecretPassword - - LISTEN_HOST=archivebox.localhost:8000 - ALLOWED_HOSTS=* # set this to the hostname(s) you're going to serve the site from! 
- - CSRF_TRUSTED_ORIGINS=http://admin.archivebox.localhost:8000 # MUST match the admin UI URL for login/API to work + - CSRF_TRUSTED_ORIGINS=http://localhost:8000 # MUST match the admin UI URL for login/API to work - PUBLIC_INDEX=True # set to False to prevent anonymous users from viewing snapshot list - PUBLIC_SNAPSHOTS=True # set to False to prevent anonymous users from viewing snapshot content - PUBLIC_ADD_VIEW=False # set to True to allow anonymous users to submit new URLs to archive @@ -33,10 +31,8 @@ services: # - PUID=911 # set to your host user's UID & GID if you encounter permissions issues # - PGID=911 # UID/GIDs lower than 500 may clash with system uids and are not recommended # For options below, it's better to set in data/ArchiveBox.conf or use `docker compose run archivebox config --set SOME_KEY=someval` instead of setting here: - # - YTDLP_MAX_SIZE=750m # increase this filesize limit to allow archiving larger video/audio files # - TIMEOUT=60 # increase this number to 120+ seconds if you see many slow downloads timing out # - CHECK_SSL_VALIDITY=True # set to False to disable strict SSL checking (allows saving URLs w/ broken certs) - # - SAVE_ARCHIVEDOTORG=True # set to False to disable submitting all URLs to Archive.org when archiving # - USER_AGENT="..." # set a custom USER_AGENT to avoid being blocked as a bot # ... # For more info, see: https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#configuration @@ -79,7 +75,7 @@ services: ### This runs the optional Sonic full-text search backend (much faster than default rg backend). # If Sonic is ever started after not running for a while, update its full-text index by running: - # $ docker-compose run archivebox update --index-only + # $ docker compose run archivebox update --index-only # https://github.com/ArchiveBox/ArchiveBox/wiki/Setting-up-Search sonic: