diff --git a/archivebox/api/models.py b/archivebox/api/models.py index 50d5bcc8..29f99913 100755 --- a/archivebox/api/models.py +++ b/archivebox/api/models.py @@ -2,7 +2,6 @@ __package__ = 'archivebox.api' import secrets from archivebox.uuid_compat import uuid7 -from datetime import timedelta from django.conf import settings from django.db import models diff --git a/archivebox/api/tests.py b/archivebox/api/tests.py index ee566a63..0dba652c 100644 --- a/archivebox/api/tests.py +++ b/archivebox/api/tests.py @@ -1,16 +1,17 @@ -import os -import django +import importlib from io import StringIO from types import SimpleNamespace -os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings') -django.setup() +from archivebox.config.django import setup_django -from django.contrib.auth.models import User -from django.test import TestCase +setup_django() -from archivebox.api.v1_cli import ScheduleCommandSchema, cli_schedule -from archivebox.crawls.models import CrawlSchedule +User = importlib.import_module('django.contrib.auth.models').User +TestCase = importlib.import_module('django.test').TestCase +api_v1_cli = importlib.import_module('archivebox.api.v1_cli') +ScheduleCommandSchema = api_v1_cli.ScheduleCommandSchema +cli_schedule = api_v1_cli.cli_schedule +CrawlSchedule = importlib.import_module('archivebox.crawls.models').CrawlSchedule class CLIScheduleAPITests(TestCase): diff --git a/archivebox/api/v1_auth.py b/archivebox/api/v1_auth.py index cc82c371..a77124cf 100644 --- a/archivebox/api/v1_auth.py +++ b/archivebox/api/v1_auth.py @@ -3,10 +3,7 @@ __package__ = 'archivebox.api' from typing import Optional from ninja import Router, Schema -from django.utils import timezone -from datetime import timedelta -from archivebox.api.models import APIToken from archivebox.api.auth import auth_using_token, auth_using_password, get_or_create_api_token diff --git a/archivebox/api/v1_crawls.py b/archivebox/api/v1_crawls.py index fe268a3c..ca1b0d87 100644 --- 
a/archivebox/api/v1_crawls.py +++ b/archivebox/api/v1_crawls.py @@ -5,7 +5,6 @@ from typing import List, Optional from datetime import datetime from django.utils import timezone -from django.db.models import Q from django.contrib.auth import get_user_model from ninja import Router, Schema diff --git a/archivebox/base_models/admin.py b/archivebox/base_models/admin.py index 3c4fa643..0d172fca 100644 --- a/archivebox/base_models/admin.py +++ b/archivebox/base_models/admin.py @@ -6,7 +6,7 @@ import json from django import forms from django.contrib import admin -from django.utils.html import format_html, mark_safe +from django.utils.html import mark_safe from django_object_actions import DjangoObjectActions diff --git a/archivebox/base_models/models.py b/archivebox/base_models/models.py index c036edd1..02cf144b 100755 --- a/archivebox/base_models/models.py +++ b/archivebox/base_models/models.py @@ -2,12 +2,9 @@ __package__ = 'archivebox.base_models' -from uuid import UUID from archivebox.uuid_compat import uuid7 -from typing import ClassVar from pathlib import Path -from django.contrib import admin from django.db import models from django.db.models import F from django.utils import timezone @@ -17,8 +14,6 @@ from django.conf import settings from django_stubs_ext.db.models import TypedModelMeta -from archivebox import DATA_DIR -from archivebox.misc.hashing import get_dir_info def get_or_create_system_user_pk(username='system'): diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py index 46ae23a9..a1eecf79 100644 --- a/archivebox/cli/archivebox_add.py +++ b/archivebox/cli/archivebox_add.py @@ -57,6 +57,7 @@ def add(urls: str | list[str], from archivebox.core.models import Snapshot from archivebox.crawls.models import Crawl from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.personas.models import Persona from archivebox.workers.orchestrator import Orchestrator from archivebox.misc.logging_util import 
printable_filesize from archivebox.misc.system import get_dir_size @@ -79,11 +80,15 @@ def add(urls: str | list[str], # Read URLs directly into crawl urls_content = sources_file.read_text() + persona_name = (persona or 'Default').strip() or 'Default' + persona_obj, _ = Persona.objects.get_or_create(name=persona_name) + persona_obj.ensure_dirs() crawl = Crawl.objects.create( urls=urls_content, max_depth=depth, tags_str=tag, + persona_id=persona_obj.id, label=f'{USER}@{HOSTNAME} $ {cmd_str} [{timestamp}]', created_by_id=created_by_id, config={ @@ -91,7 +96,7 @@ def add(urls: str | list[str], 'INDEX_ONLY': index_only, 'OVERWRITE': overwrite, 'PLUGINS': plugins, - 'DEFAULT_PERSONA': persona or 'Default', + 'DEFAULT_PERSONA': persona_name, 'PARSER': parser, } ) @@ -135,8 +140,7 @@ def add(urls: str | list[str], print('[yellow]\\[*] URLs queued. Orchestrator will process them (run `archivebox server` if not already running).[/yellow]') else: # Foreground mode: run full orchestrator until all work is done - print(f'[green]\\[*] Starting orchestrator to process crawl...[/green]') - from archivebox.workers.orchestrator import Orchestrator + print('[green]\\[*] Starting orchestrator to process crawl...[/green]') orchestrator = Orchestrator(exit_on_idle=True, crawl_id=str(crawl.id)) orchestrator.runloop() # Block until complete diff --git a/archivebox/cli/archivebox_config.py b/archivebox/cli/archivebox_config.py index 751a85ea..c96c0bde 100644 --- a/archivebox/cli/archivebox_config.py +++ b/archivebox/cli/archivebox_config.py @@ -94,7 +94,7 @@ def config(*keys, # Display all plugin config in single [PLUGINS] section if plugin_keys: - print(f'[grey53]\\[PLUGINS][/grey53]') + print('[grey53]\\[PLUGINS][/grey53]') print(benedict(plugin_keys).to_toml(encoder=CustomTOMLEncoder()).strip().replace('\n\n', '\n')) print('[grey53]################################################################[/grey53]') diff --git a/archivebox/cli/archivebox_extract.py 
b/archivebox/cli/archivebox_extract.py index 9142fbf8..900c0bef 100644 --- a/archivebox/cli/archivebox_extract.py +++ b/archivebox/cli/archivebox_extract.py @@ -31,7 +31,6 @@ __package__ = 'archivebox.cli' __command__ = 'archivebox extract' import sys -from typing import Optional, List import rich_click as click diff --git a/archivebox/cli/archivebox_install.py b/archivebox/cli/archivebox_install.py index 3c8a4e35..8a91e8d2 100755 --- a/archivebox/cli/archivebox_install.py +++ b/archivebox/cli/archivebox_install.py @@ -3,8 +3,6 @@ __package__ = 'archivebox.cli' import os -import sys -import shutil import rich_click as click from rich import print diff --git a/archivebox/cli/archivebox_persona.py b/archivebox/cli/archivebox_persona.py index 1e1d4e60..cc0b95ae 100644 --- a/archivebox/cli/archivebox_persona.py +++ b/archivebox/cli/archivebox_persona.py @@ -410,7 +410,6 @@ def create_personas( """ from archivebox.misc.jsonl import write_record from archivebox.personas.models import Persona - from archivebox.config.constants import CONSTANTS is_tty = sys.stdout.isatty() name_list = list(names) if names else [] @@ -493,10 +492,10 @@ def create_personas( 'SingletonLock', 'SingletonSocket', 'SingletonCookie', ), ) - rprint(f'[green]Copied browser profile to persona[/green]', file=sys.stderr) + rprint('[green]Copied browser profile to persona[/green]', file=sys.stderr) # Extract cookies via CDP - rprint(f'[dim]Extracting cookies via CDP...[/dim]', file=sys.stderr) + rprint('[dim]Extracting cookies via CDP...[/dim]', file=sys.stderr) if extract_cookies_via_cdp( persona_chrome_dir, @@ -506,8 +505,8 @@ def create_personas( ): rprint(f'[green]Extracted cookies to {cookies_file}[/green]', file=sys.stderr) else: - rprint(f'[yellow]Could not extract cookies automatically.[/yellow]', file=sys.stderr) - rprint(f'[dim]You can manually export cookies using a browser extension.[/dim]', file=sys.stderr) + rprint('[yellow]Could not extract cookies automatically.[/yellow]', 
file=sys.stderr) + rprint('[dim]You can manually export cookies using a browser extension.[/dim]', file=sys.stderr) except Exception as e: rprint(f'[red]Failed to copy browser profile: {e}[/red]', file=sys.stderr) diff --git a/archivebox/cli/archivebox_pluginmap.py b/archivebox/cli/archivebox_pluginmap.py index fe280faa..a6d132ac 100644 --- a/archivebox/cli/archivebox_pluginmap.py +++ b/archivebox/cli/archivebox_pluginmap.py @@ -3,7 +3,6 @@ __package__ = 'archivebox.cli' from typing import Optional -from pathlib import Path import rich_click as click diff --git a/archivebox/cli/archivebox_search.py b/archivebox/cli/archivebox_search.py index b066b474..009afa36 100644 --- a/archivebox/cli/archivebox_search.py +++ b/archivebox/cli/archivebox_search.py @@ -4,7 +4,7 @@ __package__ = 'archivebox.cli' __command__ = 'archivebox search' from pathlib import Path -from typing import Optional, List, Any +from typing import Optional, List import rich_click as click from rich import print @@ -71,7 +71,6 @@ def search(filter_patterns: list[str] | None=None, csv: str | None=None, with_headers: bool=False): """List, filter, and export information about archive entries""" - from archivebox.core.models import Snapshot if with_headers and not (json or html or csv): stderr('[X] --with-headers requires --json, --html or --csv\n', color='red') diff --git a/archivebox/cli/archivebox_server.py b/archivebox/cli/archivebox_server.py index 6e6401cd..d3a31a3c 100644 --- a/archivebox/cli/archivebox_server.py +++ b/archivebox/cli/archivebox_server.py @@ -99,7 +99,6 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,), get_existing_supervisord_process, get_worker, start_server_workers, - tail_multiple_worker_logs, is_port_in_use, ) from archivebox.workers.orchestrator import Orchestrator @@ -108,14 +107,14 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,), if is_port_in_use(host, int(port)): print(f'[red][X] Error: Port {port} is already in use[/red]') 
print(f' Another process (possibly daphne) is already listening on {host}:{port}') - print(f' Stop the conflicting process or choose a different port') + print(' Stop the conflicting process or choose a different port') sys.exit(1) # Check if orchestrator is already running for this data directory if Orchestrator.is_running(): - print(f'[red][X] Error: ArchiveBox orchestrator is already running for this data directory[/red]') - print(f' Stop the existing orchestrator before starting a new server') - print(f' To stop: pkill -f "archivebox manage orchestrator"') + print('[red][X] Error: ArchiveBox orchestrator is already running for this data directory[/red]') + print(' Stop the existing orchestrator before starting a new server') + print(' To stop: pkill -f "archivebox manage orchestrator"') sys.exit(1) # Check if supervisord is already running @@ -129,7 +128,7 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,), print('[red][X] Error: ArchiveBox server is already running[/red]') print(f' [green]√[/green] Web server (worker_daphne) is RUNNING on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]') if orchestrator_proc and orchestrator_proc.get('statename') == 'RUNNING': - print(f' [green]√[/green] Background worker (worker_orchestrator) is RUNNING') + print(' [green]√[/green] Background worker (worker_orchestrator) is RUNNING') print() print('[yellow]To stop the existing server, run:[/yellow]') print(' pkill -f "archivebox server"') diff --git a/archivebox/cli/archivebox_status.py b/archivebox/cli/archivebox_status.py index c0622f0d..424de1ef 100644 --- a/archivebox/cli/archivebox_status.py +++ b/archivebox/cli/archivebox_status.py @@ -128,13 +128,13 @@ def status(out_dir: Path=DATA_DIR) -> None: if not snapshot.downloaded_at: continue print( - '[grey53] ' + ( + '[grey53] ' f' > {str(snapshot.downloaded_at)[:16]} ' f'[{snapshot.num_outputs} {("X", "√")[snapshot.is_archived]} 
{printable_filesize(snapshot.archive_size)}] ' f'"{snapshot.title}": {snapshot.url}' - )[:SHELL_CONFIG.TERM_WIDTH] - + '[grey53]', + '[/grey53]' + )[:SHELL_CONFIG.TERM_WIDTH], ) print('[grey53] ...') diff --git a/archivebox/cli/archivebox_update.py b/archivebox/cli/archivebox_update.py index a3601bd0..9a8fd8e0 100644 --- a/archivebox/cli/archivebox_update.py +++ b/archivebox/cli/archivebox_update.py @@ -36,8 +36,6 @@ def update(filter_patterns: Iterable[str] = (), from archivebox.config.django import setup_django setup_django() - from archivebox.core.models import Snapshot - from django.utils import timezone from django.core.management import call_command # Run migrations first to ensure DB schema is up-to-date diff --git a/archivebox/cli/archivebox_version.py b/archivebox/cli/archivebox_version.py index 4f80bfe2..c89298f9 100755 --- a/archivebox/cli/archivebox_version.py +++ b/archivebox/cli/archivebox_version.py @@ -6,7 +6,7 @@ import sys import os import platform from pathlib import Path -from typing import Iterable, Optional +from typing import Iterable import rich_click as click diff --git a/archivebox/cli/tests.py b/archivebox/cli/tests.py index 27dec785..ab3b7a8e 100644 --- a/archivebox/cli/tests.py +++ b/archivebox/cli/tests.py @@ -3,13 +3,13 @@ __package__ = 'archivebox.cli' +import importlib import os -import sys import shutil +import sys import unittest -from pathlib import Path - from contextlib import contextmanager +from pathlib import Path TEST_CONFIG = { 'USE_COLOR': 'False', @@ -30,18 +30,15 @@ TEST_CONFIG = { DATA_DIR = 'data.tests' os.environ.update(TEST_CONFIG) -from ..main import init -from archivebox.config.constants import ( - SQL_INDEX_FILENAME, - JSON_INDEX_FILENAME, - HTML_INDEX_FILENAME, -) - -from . 
import ( - archivebox_init, - archivebox_add, - archivebox_remove, -) +init = importlib.import_module('archivebox.main').init +constants = importlib.import_module('archivebox.config.constants') +SQL_INDEX_FILENAME = constants.SQL_INDEX_FILENAME +JSON_INDEX_FILENAME = constants.JSON_INDEX_FILENAME +HTML_INDEX_FILENAME = constants.HTML_INDEX_FILENAME +archivebox_init = importlib.import_module('archivebox.cli.archivebox_init') +archivebox_add = importlib.import_module('archivebox.cli.archivebox_add') +archivebox_remove = importlib.import_module('archivebox.cli.archivebox_remove') +parse_json_main_index = importlib.import_module('archivebox.misc.legacy').parse_json_main_index HIDE_CLI_OUTPUT = True @@ -68,6 +65,13 @@ stdout = sys.stdout stderr = sys.stderr +def load_main_index(*, out_dir: str): + index_path = Path(out_dir) / JSON_INDEX_FILENAME + if not index_path.exists(): + raise FileNotFoundError(index_path) + return list(parse_json_main_index(Path(out_dir))) + + @contextmanager def output_hidden(show_failing=True): if not HIDE_CLI_OUTPUT: diff --git a/archivebox/cli/tests_piping.py b/archivebox/cli/tests_piping.py index 9f8e8c02..623c2567 100644 --- a/archivebox/cli/tests_piping.py +++ b/archivebox/cli/tests_piping.py @@ -23,7 +23,6 @@ Each command should: __package__ = 'archivebox.cli' import os -import sys import json import shutil import tempfile @@ -101,7 +100,7 @@ class TestJSONLParsing(unittest.TestCase): def test_parse_jsonl_with_id(self): """JSONL with id field should be recognized.""" - from archivebox.misc.jsonl import parse_line, TYPE_SNAPSHOT + from archivebox.misc.jsonl import parse_line line = '{"type": "Snapshot", "id": "abc123", "url": "https://example.com"}' result = parse_line(line) @@ -576,8 +575,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase): """ from archivebox.core.models import Snapshot from archivebox.misc.jsonl import ( - read_args_or_stdin, write_record, - TYPE_SNAPSHOT + read_args_or_stdin, TYPE_SNAPSHOT ) from 
archivebox.base_models.models import get_or_create_system_user_pk @@ -608,7 +606,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase): Test: archivebox snapshot URL | archivebox extract Extract should accept JSONL output from snapshot command. """ - from archivebox.core.models import Snapshot, ArchiveResult + from archivebox.core.models import Snapshot from archivebox.misc.jsonl import ( read_args_or_stdin, TYPE_SNAPSHOT @@ -783,7 +781,6 @@ class TestParserPluginWorkflows(unittest.TestCase): Test: archivebox crawl --plugin=parse_html_urls URL | archivebox snapshot | archivebox extract """ from archivebox.hooks import collect_urls_from_plugins - from archivebox.misc.jsonl import TYPE_SNAPSHOT # Create mock output directory snapshot_dir = Path(self.test_dir) / 'archive' / 'html-parser-test' @@ -938,7 +935,6 @@ class TestPassThroughBehavior(unittest.TestCase): def test_crawl_passes_through_other_types(self): """crawl create should pass through records with other types.""" - from archivebox.misc.jsonl import TYPE_CRAWL # Input: a Tag record (not a Crawl or URL) tag_record = {'type': 'Tag', 'id': 'test-tag', 'name': 'example'} @@ -946,8 +942,9 @@ class TestPassThroughBehavior(unittest.TestCase): # Mock stdin with both records stdin = StringIO( - json.dumps(tag_record) + '\n' + - json.dumps(url_record) + json.dumps(tag_record) + + '\n' + + json.dumps(url_record) ) stdin.isatty = lambda: False @@ -964,7 +961,7 @@ class TestPassThroughBehavior(unittest.TestCase): def test_snapshot_passes_through_crawl(self): """snapshot create should pass through Crawl records.""" - from archivebox.misc.jsonl import TYPE_CRAWL, TYPE_SNAPSHOT + from archivebox.misc.jsonl import TYPE_CRAWL crawl_record = { 'type': TYPE_CRAWL, diff --git a/archivebox/config/__init__.py b/archivebox/config/__init__.py index 246a2e0c..0033269c 100644 --- a/archivebox/config/__init__.py +++ b/archivebox/config/__init__.py @@ -8,10 +8,6 @@ and other modules that expect to import config values directly. 
__package__ = 'archivebox.config' __order__ = 200 -import shutil -from pathlib import Path -from typing import Dict, List, Optional - from .paths import ( PACKAGE_DIR, # noqa DATA_DIR, # noqa @@ -31,6 +27,7 @@ def _get_config(): from .common import ARCHIVING_CONFIG, STORAGE_CONFIG return ARCHIVING_CONFIG, STORAGE_CONFIG + # Direct exports (evaluated at import time for backwards compat) # These are recalculated each time the module attribute is accessed diff --git a/archivebox/config/collection.py b/archivebox/config/collection.py index 46b591fe..51af4ab6 100644 --- a/archivebox/config/collection.py +++ b/archivebox/config/collection.py @@ -9,7 +9,6 @@ from configparser import ConfigParser from benedict import benedict -import archivebox from archivebox.config.constants import CONSTANTS diff --git a/archivebox/config/configset.py b/archivebox/config/configset.py index e284d44b..39b8f51a 100644 --- a/archivebox/config/configset.py +++ b/archivebox/config/configset.py @@ -11,10 +11,10 @@ __package__ = "archivebox.config" import os import json from pathlib import Path -from typing import Any, Dict, Optional, List, Type, Tuple, TYPE_CHECKING, cast +from typing import Any, Dict, Optional, Type, Tuple from configparser import ConfigParser -from pydantic import Field, ConfigDict +from pydantic import ConfigDict from pydantic_settings import BaseSettings, PydanticBaseSettingsSource @@ -166,6 +166,23 @@ def get_config( if user is None and crawl and hasattr(crawl, "created_by"): user = crawl.created_by + + if persona is None and crawl is not None: + try: + from archivebox.personas.models import Persona + + persona_id = getattr(crawl, "persona_id", None) + if persona_id: + persona = Persona.objects.filter(id=persona_id).first() + + if persona is None: + crawl_config = getattr(crawl, "config", None) or {} + default_persona_name = crawl_config.get("DEFAULT_PERSONA") + if default_persona_name: + persona, _ = Persona.objects.get_or_create(name=str(default_persona_name).strip() or 
"Default") + persona.ensure_dirs() + except Exception: + pass from archivebox.config.constants import CONSTANTS from archivebox.config.common import ( SHELL_CONFIG, diff --git a/archivebox/config/django.py b/archivebox/config/django.py index 75cc5539..09ddcfd2 100644 --- a/archivebox/config/django.py +++ b/archivebox/config/django.py @@ -100,9 +100,11 @@ def setup_django(check_db=False, in_memory_db=False) -> None: return from django.conf import settings + from archivebox.core.settings_logging import ERROR_LOG as DEFAULT_ERROR_LOG # log startup message to the error log - with open(settings.ERROR_LOG, "a", encoding='utf-8') as f: + error_log = getattr(settings, 'ERROR_LOG', DEFAULT_ERROR_LOG) + with open(error_log, "a", encoding='utf-8') as f: command = ' '.join(sys.argv) ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S') f.write(f"\n> {command}; TS={ts} VERSION={CONSTANTS.VERSION} IN_DOCKER={SHELL_CONFIG.IN_DOCKER} IS_TTY={SHELL_CONFIG.IS_TTY}\n") diff --git a/archivebox/config/permissions.py b/archivebox/config/permissions.py index 08d81ce6..b8a5f557 100644 --- a/archivebox/config/permissions.py +++ b/archivebox/config/permissions.py @@ -46,7 +46,6 @@ if RUNNING_AS_UID == 0: # if we are running as root it's really hard to figure out what the correct archivebox user should be # as a last resort instead of setting DATA_DIR ownership to 0:0 (which breaks it for non-root users) # check if 911:911 archivebox user exists on host system, and use it instead of 0 - import pwd if pwd.getpwuid(DEFAULT_PUID).pw_name == 'archivebox': FALLBACK_UID = DEFAULT_PUID FALLBACK_GID = DEFAULT_PGID diff --git a/archivebox/config/views.py b/archivebox/config/views.py index 316e1aa3..1e3e8f5e 100644 --- a/archivebox/config/views.py +++ b/archivebox/config/views.py @@ -3,7 +3,6 @@ __package__ = 'archivebox.config' import os import shutil import inspect -from pathlib import Path from typing import Any, List, Dict, cast from benedict import benedict @@ -30,11 +29,11 @@ 
KNOWN_BINARIES = [ ] -def obj_to_yaml(obj: Any, indent: int=0) -> str: +def obj_to_yaml(obj: Any, indent: int = 0) -> str: indent_str = " " * indent if indent == 0: indent_str = '\n' # put extra newline between top-level entries - + if isinstance(obj, dict): if not obj: return "{}" @@ -42,7 +41,7 @@ def obj_to_yaml(obj: Any, indent: int=0) -> str: for key, value in obj.items(): result += f"{indent_str}{key}:{obj_to_yaml(value, indent + 1)}\n" return result - + elif isinstance(obj, list): if not obj: return "[]" @@ -50,16 +49,16 @@ def obj_to_yaml(obj: Any, indent: int=0) -> str: for item in obj: result += f"{indent_str}- {obj_to_yaml(item, indent + 1).lstrip()}\n" return result.rstrip() - + elif isinstance(obj, str): if "\n" in obj: return f" |\n{indent_str} " + obj.replace("\n", f"\n{indent_str} ") else: return f" {obj}" - + elif isinstance(obj, (int, float, bool)): return f" {str(obj)}" - + elif callable(obj): source = '\n'.join( '' if 'def ' in line else line @@ -67,7 +66,7 @@ def obj_to_yaml(obj: Any, indent: int=0) -> str: if line.strip() ).split('lambda: ')[-1].rstrip(',') return f" {indent_str} " + source.replace("\n", f"\n{indent_str} ") - + else: return f" {str(obj)}" @@ -75,7 +74,7 @@ def obj_to_yaml(obj: Any, indent: int=0) -> str: def get_detected_binaries() -> Dict[str, Dict[str, Any]]: """Detect available binaries using shutil.which.""" binaries = {} - + for name in KNOWN_BINARIES: path = shutil.which(name) if path: @@ -85,7 +84,7 @@ def get_detected_binaries() -> Dict[str, Dict[str, Any]]: 'version': None, # Could add version detection later 'is_available': True, } - + return binaries @@ -144,19 +143,19 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext: # Get binaries from database (previously detected/installed) db_binaries = {b.name: b for b in Binary.objects.all()} - - # Get currently detectable binaries + + # Get currently detectable binaries detected = get_detected_binaries() - + # Merge and display all_binary_names = 
sorted(set(list(db_binaries.keys()) + list(detected.keys()))) - + for name in all_binary_names: db_binary = db_binaries.get(name) detected_binary = detected.get(name) - + rows['Binary Name'].append(ItemLink(name, key=name)) - + if db_binary: rows['Found Version'].append(f'✅ {db_binary.version}' if db_binary.version else '✅ found') rows['Provided By'].append(db_binary.binprovider or 'PATH') @@ -175,6 +174,7 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext: table=rows, ) + @render_with_item_view def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext: @@ -203,7 +203,7 @@ def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext: ) except Binary.DoesNotExist: pass - + # Try to detect from PATH path = shutil.which(key) if path: @@ -224,7 +224,7 @@ def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext: }, ], ) - + return ItemContext( slug=key, title=key, @@ -286,6 +286,7 @@ def plugins_list_view(request: HttpRequest, **kwargs) -> TableContext: table=rows, ) + @render_with_item_view def plugin_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext: import json @@ -314,7 +315,10 @@ def plugin_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext: # Add config.json data if available if plugin.get('config'): config_json = json.dumps(plugin['config'], indent=2) - fields["config.json"] = mark_safe(f'
{config_json}')
+ fields["config.json"] = mark_safe(
+ '{config_json}'
+ )
# Also extract and display individual config properties for easier viewing
if 'properties' in plugin['config']:
@@ -322,7 +326,6 @@ def plugin_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
properties_summary = []
for prop_name, prop_info in config_properties.items():
prop_type = prop_info.get('type', 'unknown')
- prop_default = prop_info.get('default', 'N/A')
prop_desc = prop_info.get('description', '')
properties_summary.append(f"• {prop_name} ({prop_type}): {prop_desc}")
@@ -365,7 +368,7 @@ def worker_list_view(request: HttpRequest, **kwargs) -> TableContext:
title="No running worker processes",
table=rows,
)
-
+
all_config_entries = cast(List[Dict[str, Any]], supervisor.getAllConfigInfo() or [])
all_config = {config["name"]: benedict(config) for config in all_config_entries}
@@ -514,7 +517,7 @@ def log_list_view(request: HttpRequest, **kwargs) -> TableContext:
@render_with_item_view
def log_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
assert request.user.is_superuser, "Must be a superuser to view configuration settings."
-
+
log_file = [logfile for logfile in CONSTANTS.LOGS_DIR.glob('*.log') if key in logfile.name][0]
log_text = log_file.read_text()
diff --git a/archivebox/core/admin_site.py b/archivebox/core/admin_site.py
index ce4ca437..ab5fc144 100644
--- a/archivebox/core/admin_site.py
+++ b/archivebox/core/admin_site.py
@@ -1,8 +1,8 @@
__package__ = 'archivebox.core'
from django.contrib import admin
+from admin_data_views.admin import get_app_list, admin_data_index_view, get_admin_data_urls, get_urls
-import archivebox
class ArchiveBoxAdmin(admin.AdminSite):
site_header = 'ArchiveBox'
@@ -20,7 +20,6 @@ archivebox_admin = ArchiveBoxAdmin()
# patch admin with methods to add data views (implemented by admin_data_views package)
# https://github.com/MrThearMan/django-admin-data-views
# https://mrthearman.github.io/django-admin-data-views/setup/
-from admin_data_views.admin import get_app_list, admin_data_index_view, get_admin_data_urls, get_urls
archivebox_admin.get_app_list = get_app_list.__get__(archivebox_admin, ArchiveBoxAdmin)
archivebox_admin.admin_data_index_view = admin_data_index_view.__get__(archivebox_admin, ArchiveBoxAdmin) # type: ignore
archivebox_admin.get_admin_data_urls = get_admin_data_urls.__get__(archivebox_admin, ArchiveBoxAdmin) # type: ignore
diff --git a/archivebox/core/admin_snapshots.py b/archivebox/core/admin_snapshots.py
index bc1093c9..85024ed5 100644
--- a/archivebox/core/admin_snapshots.py
+++ b/archivebox/core/admin_snapshots.py
@@ -26,7 +26,7 @@ from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin
from archivebox.workers.tasks import bg_archive_snapshots, bg_add
from archivebox.core.models import Tag, Snapshot, ArchiveResult
-from archivebox.core.admin_archiveresults import ArchiveResultInline, render_archiveresults_list
+from archivebox.core.admin_archiveresults import render_archiveresults_list
from archivebox.core.widgets import TagEditorWidget, InlineTagEditorWidget
@@ -712,8 +712,6 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
description="🔁 Redo Failed"
)
def update_snapshots(self, request, queryset):
- count = queryset.count()
-
queued = bg_archive_snapshots(queryset, kwargs={"overwrite": False, "out_dir": DATA_DIR})
messages.success(
@@ -741,8 +739,6 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
description="🔄 Redo"
)
def overwrite_snapshots(self, request, queryset):
- count = queryset.count()
-
queued = bg_archive_snapshots(queryset, kwargs={"overwrite": True, "out_dir": DATA_DIR})
messages.success(
diff --git a/archivebox/core/apps.py b/archivebox/core/apps.py
index 713d34d9..d6703b3f 100644
--- a/archivebox/core/apps.py
+++ b/archivebox/core/apps.py
@@ -60,7 +60,7 @@ class CoreConfig(AppConfig):
from archivebox.workers.orchestrator import Orchestrator
Process.cleanup_stale_running()
- machine = Machine.current()
+ Machine.current()
if not Orchestrator.is_running():
Orchestrator(exit_on_idle=False).start()
diff --git a/archivebox/core/asgi.py b/archivebox/core/asgi.py
index 4963169f..1253fbb0 100644
--- a/archivebox/core/asgi.py
+++ b/archivebox/core/asgi.py
@@ -8,11 +8,10 @@ https://docs.djangoproject.com/en/stable/howto/deployment/asgi/
"""
from archivebox.config.django import setup_django
+from django.core.asgi import get_asgi_application
setup_django(in_memory_db=False, check_db=True)
-from django.core.asgi import get_asgi_application
-
# Standard Django ASGI application (no websockets/channels needed)
application = get_asgi_application()
diff --git a/archivebox/core/forms.py b/archivebox/core/forms.py
index a1a83ed7..cc4f62b3 100644
--- a/archivebox/core/forms.py
+++ b/archivebox/core/forms.py
@@ -6,6 +6,7 @@ from archivebox.misc.util import URL_REGEX
from taggit.utils import edit_string_for_tags, parse_tags
from archivebox.base_models.admin import KeyValueWidget
from archivebox.crawls.schedule_utils import validate_schedule
+from archivebox.hooks import get_plugins
DEPTH_CHOICES = (
('0', 'depth = 0 (archive just these URLs)'),
@@ -15,7 +16,6 @@ DEPTH_CHOICES = (
('4', 'depth = 4 (+ URLs four hops away)'),
)
-from archivebox.hooks import get_plugins
def get_plugin_choices():
"""Get available extractor plugins from discovered hooks."""
@@ -210,15 +210,18 @@ class AddLinkForm(forms.Form):
return schedule
+
class TagWidgetMixin:
def format_value(self, value):
if value is not None and not isinstance(value, str):
value = edit_string_for_tags(value)
return super().format_value(value)
+
class TagWidget(TagWidgetMixin, forms.TextInput):
pass
+
class TagField(forms.CharField):
widget = TagWidget
diff --git a/archivebox/core/middleware.py b/archivebox/core/middleware.py
index 2003b478..7594eb8d 100644
--- a/archivebox/core/middleware.py
+++ b/archivebox/core/middleware.py
@@ -17,7 +17,6 @@ from archivebox.config import VERSION
from archivebox.config.version import get_COMMIT_HASH
from archivebox.core.host_utils import (
build_admin_url,
- build_api_url,
build_web_url,
get_api_host,
get_admin_host,
diff --git a/archivebox/core/migrations/0006_auto_20201012_1520.py b/archivebox/core/migrations/0006_auto_20201012_1520.py
index dc96c8da..0f5df6a9 100644
--- a/archivebox/core/migrations/0006_auto_20201012_1520.py
+++ b/archivebox/core/migrations/0006_auto_20201012_1520.py
@@ -7,10 +7,8 @@ def forwards_func(apps, schema_editor):
SnapshotModel = apps.get_model("core", "Snapshot")
TagModel = apps.get_model("core", "Tag")
- db_alias = schema_editor.connection.alias
snapshots = SnapshotModel.objects.all()
for snapshot in snapshots:
- tags = snapshot.tags
tag_set = (
set(tag.strip() for tag in (snapshot.tags_old or '').split(','))
)
@@ -23,9 +21,7 @@ def forwards_func(apps, schema_editor):
def reverse_func(apps, schema_editor):
SnapshotModel = apps.get_model("core", "Snapshot")
- TagModel = apps.get_model("core", "Tag")
- db_alias = schema_editor.connection.alias
snapshots = SnapshotModel.objects.all()
for snapshot in snapshots:
tags = snapshot.tags.values_list("name", flat=True)
diff --git a/archivebox/core/migrations/0007_archiveresult.py b/archivebox/core/migrations/0007_archiveresult.py
index c052f9ce..9cf5e75d 100644
--- a/archivebox/core/migrations/0007_archiveresult.py
+++ b/archivebox/core/migrations/0007_archiveresult.py
@@ -43,7 +43,7 @@ def forwards_func(apps, schema_editor):
try:
with open(out_dir / "index.json", "r") as f:
fs_index = json.load(f)
- except Exception as e:
+ except Exception:
continue
history = fs_index["history"]
diff --git a/archivebox/core/migrations/0023_upgrade_to_0_9_0.py b/archivebox/core/migrations/0023_upgrade_to_0_9_0.py
index c32c31b3..a95cc007 100644
--- a/archivebox/core/migrations/0023_upgrade_to_0_9_0.py
+++ b/archivebox/core/migrations/0023_upgrade_to_0_9_0.py
@@ -234,7 +234,6 @@ def upgrade_core_tables(apps, schema_editor):
tag_has_data = cursor.fetchone()[0] > 0
if tag_has_data:
- tag_cols = get_table_columns('core_tag')
cursor.execute("PRAGMA table_info(core_tag)")
tag_id_type = None
for row in cursor.fetchall():
diff --git a/archivebox/core/migrations/0024_assign_default_crawl.py b/archivebox/core/migrations/0024_assign_default_crawl.py
index ddd3c87b..fc435608 100644
--- a/archivebox/core/migrations/0024_assign_default_crawl.py
+++ b/archivebox/core/migrations/0024_assign_default_crawl.py
@@ -2,7 +2,6 @@
# Creates a default crawl for v0.7.2 migrated snapshots and makes crawl_id NOT NULL
from django.db import migrations, models
-import uuid
def create_default_crawl_and_assign_snapshots(apps, schema_editor):
diff --git a/archivebox/core/migrations/0027_copy_archiveresult_to_process.py b/archivebox/core/migrations/0027_copy_archiveresult_to_process.py
index 8ac9d889..a26caa10 100644
--- a/archivebox/core/migrations/0027_copy_archiveresult_to_process.py
+++ b/archivebox/core/migrations/0027_copy_archiveresult_to_process.py
@@ -347,7 +347,7 @@ def copy_archiveresult_data_to_process(apps, schema_editor):
migrated_count += 1
if i == 0:
- print(f'DEBUG 0027: Linked ArchiveResult to Process')
+ print('DEBUG 0027: Linked ArchiveResult to Process')
except Exception as e:
print(f'✗ Error migrating ArchiveResult {ar_id}: {e}')
diff --git a/archivebox/core/models.py b/archivebox/core/models.py
index f9c6cc5f..8a6dac92 100755
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -1,6 +1,6 @@
__package__ = 'archivebox.core'
-from typing import Optional, Dict, Iterable, Any, List, TYPE_CHECKING
+from typing import Optional, Dict, Iterable, Any, List
from archivebox.uuid_compat import uuid7
from datetime import datetime, timedelta
from django_stubs_ext.db.models import TypedModelMeta
@@ -12,19 +12,18 @@ from pathlib import Path
from statemachine import State, registry
from django.db import models
-from django.db.models import QuerySet, Value, Case, When, IntegerField
+from django.db.models import QuerySet
from django.utils.functional import cached_property
from django.utils.text import slugify
from django.utils import timezone
from django.core.cache import cache
-from django.urls import reverse, reverse_lazy
+from django.urls import reverse_lazy
from django.contrib import admin
from django.conf import settings
from archivebox.config import CONSTANTS
from archivebox.misc.system import get_dir_size, atomic_write
-from archivebox.misc.util import parse_date, base_url, domain as url_domain, to_json, ts_to_date_str, urlencode, htmlencode, urldecode
-from archivebox.misc.hashing import get_dir_info
+from archivebox.misc.util import parse_date, domain as url_domain, to_json, ts_to_date_str, urlencode, htmlencode, urldecode
from archivebox.hooks import (
get_plugins, get_plugin_name, get_plugin_icon,
)
@@ -186,7 +185,7 @@ class SnapshotQuerySet(models.QuerySet):
for pattern in patterns:
try:
qsearch |= query_search_index(pattern)
- except:
+ except BaseException:
raise SystemExit(2)
return self.all() & qsearch
@@ -344,8 +343,6 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
@property
def process_set(self):
"""Get all Process objects related to this snapshot's ArchiveResults."""
- import json
- import json
from archivebox.machine.models import Process
return Process.objects.filter(archiveresult__snapshot_id=self.id)
@@ -458,13 +455,13 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
if not old_dir.exists() or old_dir == new_dir:
# No migration needed
- print(f"[DEBUG _fs_migrate] Returning None (early return)")
+ print("[DEBUG _fs_migrate] Returning None (early return)")
return None
if new_dir.exists():
# New directory already exists (files already copied), but we still need cleanup
# Return cleanup info so old directory can be cleaned up
- print(f"[DEBUG _fs_migrate] Returning cleanup info (new_dir exists)")
+ print("[DEBUG _fs_migrate] Returning cleanup info (new_dir exists)")
return (old_dir, new_dir)
new_dir.mkdir(parents=True, exist_ok=True)
@@ -499,7 +496,6 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
# Schedule cleanup AFTER transaction commits successfully
# This ensures DB changes are committed before we delete old files
- from django.db import transaction
transaction.on_commit(lambda: self._cleanup_old_migration_dir(old_dir, new_dir))
# Return cleanup info for manual cleanup if needed (when called directly)
@@ -594,8 +590,8 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
domain = self.extract_domain_from_url(self.url)
return (
- CONSTANTS.DATA_DIR / 'users' / username / 'snapshots' /
- date_str / domain / str(self.id)
+ CONSTANTS.DATA_DIR / 'users' / username / 'snapshots'
+ / date_str / domain / str(self.id)
)
else:
# Unknown version - use current
@@ -670,7 +666,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
print(f"[DEBUG load_from_directory] Found via fuzzy match: {snapshot.timestamp}")
return snapshot
elif candidates.count() > 1:
- print(f"[DEBUG load_from_directory] Multiple fuzzy matches, using first")
+ print("[DEBUG load_from_directory] Multiple fuzzy matches, using first")
return candidates.first()
print(f"[DEBUG load_from_directory] NOT FOUND (fuzzy): {url} @ {timestamp}")
return None
@@ -767,7 +763,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
ts_int = int(float(ts))
# 1995-01-01 to 2035-12-31
return 788918400 <= ts_int <= 2082758400
- except:
+ except (TypeError, ValueError, OverflowError):
return False
index_valid = is_valid_timestamp(index_timestamp) if index_timestamp else False
@@ -850,7 +846,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
try:
with open(json_path) as f:
index_data = json.load(f)
- except:
+ except (OSError, TypeError, ValueError, json.JSONDecodeError):
pass
# Merge title
@@ -929,7 +925,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
if result_data.get('start_ts'):
try:
start_ts = parser.parse(result_data['start_ts'])
- except:
+ except (TypeError, ValueError, OverflowError):
pass
if (plugin, start_ts) in existing:
@@ -940,7 +936,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
if result_data.get('end_ts'):
try:
end_ts = parser.parse(result_data['end_ts'])
- except:
+ except (TypeError, ValueError, OverflowError):
pass
# Support both 'output' (legacy) and 'output_str' (new JSONL) field names
@@ -957,7 +953,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
start_ts=start_ts,
end_ts=end_ts,
)
- except:
+ except Exception:
pass
def write_index_json(self):
@@ -1176,7 +1172,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
try:
shutil.move(str(snapshot_dir), str(dest))
- except:
+ except Exception:
pass
@classmethod
@@ -1208,7 +1204,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
try:
cls._merge_snapshots(snapshots)
merged += 1
- except:
+ except Exception:
pass
return merged
@@ -1244,7 +1240,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
try:
shutil.rmtree(dup_dir)
- except:
+ except Exception:
pass
# Merge tags
@@ -1615,7 +1611,6 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
"""
import re
from django.utils import timezone
- from archivebox.misc.util import parse_date
from archivebox.base_models.models import get_or_create_system_user_pk
from archivebox.config.common import GENERAL_CONFIG
@@ -2125,7 +2120,6 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
def to_dict(self, extended: bool = False) -> Dict[str, Any]:
"""Convert Snapshot to a dictionary (replacement for Link._asdict())"""
- from archivebox.misc.util import ts_to_date_str
from archivebox.core.host_utils import build_snapshot_url
result = {
@@ -2283,9 +2277,9 @@ class SnapshotMachine(BaseStateMachine):
# Tick Event (polled by workers)
tick = (
- queued.to.itself(unless='can_start') |
- queued.to(started, cond='can_start') |
- started.to(sealed, cond='is_finished')
+ queued.to.itself(unless='can_start')
+ | queued.to(started, cond='can_start')
+ | started.to(sealed, cond='is_finished')
)
# Manual event (can also be triggered by last ArchiveResult finishing)
@@ -2783,7 +2777,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
Updates status/output fields, queues discovered URLs, and triggers indexing.
"""
from django.utils import timezone
- from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR, run_hook, is_background_hook
+ from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR, run_hook
from archivebox.config.configset import get_config
# Get merged config with proper context
@@ -3190,16 +3184,16 @@ class ArchiveResultMachine(BaseStateMachine):
# queued → skipped (if exceeded max attempts)
# started → backoff → started (retry)
tick = (
- queued.to(skipped, cond='is_exceeded_max_attempts') | # Check skip first
- queued.to.itself(unless='can_start') |
- queued.to(started, cond='can_start') |
- started.to(succeeded, cond='is_succeeded') |
- started.to(failed, cond='is_failed') |
- started.to(skipped, cond='is_skipped') |
- started.to(backoff, cond='is_backoff') |
- backoff.to(skipped, cond='is_exceeded_max_attempts') | # Check skip from backoff too
- backoff.to.itself(unless='can_start') |
- backoff.to(started, cond='can_start')
+ queued.to(skipped, cond='is_exceeded_max_attempts') # Check skip first
+ | queued.to.itself(unless='can_start')
+ | queued.to(started, cond='can_start')
+ | started.to(succeeded, cond='is_succeeded')
+ | started.to(failed, cond='is_failed')
+ | started.to(skipped, cond='is_skipped')
+ | started.to(backoff, cond='is_backoff')
+ | backoff.to(skipped, cond='is_exceeded_max_attempts') # Check skip from backoff too
+ | backoff.to.itself(unless='can_start')
+ | backoff.to(started, cond='can_start')
# Removed redundant transitions: backoff.to(succeeded/failed/skipped)
# Reason: backoff should always retry→started, then started→final states
)
@@ -3241,8 +3235,8 @@ class ArchiveResultMachine(BaseStateMachine):
"""Check if we should backoff and retry later."""
# Backoff if status is still started (plugin didn't complete) and output_str is empty
return (
- self.archiveresult.status == ArchiveResult.StatusChoices.STARTED and
- not self.archiveresult.output_str
+ self.archiveresult.status == ArchiveResult.StatusChoices.STARTED
+ and not self.archiveresult.output_str
)
def is_finished(self) -> bool:
@@ -3286,7 +3280,6 @@ class ArchiveResultMachine(BaseStateMachine):
@started.enter
def enter_started(self):
- from archivebox.machine.models import NetworkInterface
# Update Process with network interface
if self.archiveresult.process_id:
diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py
index 2dec9a03..ff1127bd 100644
--- a/archivebox/core/settings.py
+++ b/archivebox/core/settings.py
@@ -6,6 +6,7 @@ import inspect
from pathlib import Path
+from django.conf.locale.en import formats as en_formats # type: ignore
from django.utils.crypto import get_random_string
import archivebox
@@ -13,6 +14,7 @@ import archivebox
from archivebox.config import DATA_DIR, PACKAGE_DIR, ARCHIVE_DIR, CONSTANTS # noqa
from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG, STORAGE_CONFIG # noqa
from archivebox.core.host_utils import normalize_base_url, get_admin_base_url, get_api_base_url
+from .settings_logging import SETTINGS_LOGGING
IS_MIGRATING = "makemigrations" in sys.argv[:3] or "migrate" in sys.argv[:3]
@@ -54,8 +56,8 @@ INSTALLED_APPS = [
"django.contrib.staticfiles",
"django.contrib.admin",
# 3rd-party apps from PyPI
- "signal_webhooks", # handles REST API outbound webhooks https://github.com/MrThearMan/django-signal-webhooks
- "django_object_actions", # provides easy Django Admin action buttons on change views https://github.com/crccheck/django-object-actions
+ "signal_webhooks", # handles REST API outbound webhooks
+ "django_object_actions", # provides easy Django Admin action buttons on change views
# Our ArchiveBox-provided apps (use fully qualified names)
# NOTE: Order matters! Apps with migrations that depend on other apps must come AFTER their dependencies
# "archivebox.config", # ArchiveBox config settings (no models, not a real Django app)
@@ -117,7 +119,6 @@ try:
try:
# Try to import django-auth-ldap (will fail if not installed)
- import django_auth_ldap
from django_auth_ldap.config import LDAPSearch
import ldap
@@ -414,9 +415,6 @@ DATETIME_FORMAT = "Y-m-d h:i:s A"
SHORT_DATETIME_FORMAT = "Y-m-d h:i:s A"
TIME_ZONE = CONSTANTS.TIMEZONE # django convention is TIME_ZONE, archivebox config uses TIMEZONE, they are equivalent
-
-from django.conf.locale.en import formats as en_formats # type: ignore
-
en_formats.DATETIME_FORMAT = DATETIME_FORMAT # monkey patch en_format default with our preferred format
en_formats.SHORT_DATETIME_FORMAT = SHORT_DATETIME_FORMAT
@@ -425,9 +423,6 @@ en_formats.SHORT_DATETIME_FORMAT = SHORT_DATETIME_FORMAT
### Logging Settings
################################################################################
-
-from .settings_logging import SETTINGS_LOGGING, LOGS_DIR, ERROR_LOG
-
LOGGING = SETTINGS_LOGGING
diff --git a/archivebox/core/settings_logging.py b/archivebox/core/settings_logging.py
index 6c2cfd52..0d3a2dd5 100644
--- a/archivebox/core/settings_logging.py
+++ b/archivebox/core/settings_logging.py
@@ -5,8 +5,6 @@ import os
import tempfile
import logging
-import pydantic
-import django.template
from archivebox.config import CONSTANTS
diff --git a/archivebox/core/tests.py b/archivebox/core/tests.py
index 56060ae6..6690cefb 100644
--- a/archivebox/core/tests.py
+++ b/archivebox/core/tests.py
@@ -1,5 +1,6 @@
"""Tests for the core views, especially AddView."""
+import importlib
import os
import django
from unittest.mock import patch
@@ -8,13 +9,14 @@ from unittest.mock import patch
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
django.setup()
-from django.test import TestCase, Client
-from django.contrib.auth.models import User
-from django.urls import reverse
-
-from archivebox.crawls.models import Crawl, CrawlSchedule
-from archivebox.core.models import Tag
-from archivebox.config.common import SERVER_CONFIG
+TestCase = importlib.import_module('django.test').TestCase
+Client = importlib.import_module('django.test').Client
+User = importlib.import_module('django.contrib.auth.models').User
+reverse = importlib.import_module('django.urls').reverse
+Crawl = importlib.import_module('archivebox.crawls.models').Crawl
+CrawlSchedule = importlib.import_module('archivebox.crawls.models').CrawlSchedule
+Tag = importlib.import_module('archivebox.core.models').Tag
+SERVER_CONFIG = importlib.import_module('archivebox.config.common').SERVER_CONFIG
class AddViewTests(TestCase):
@@ -252,7 +254,7 @@ class AddViewTests(TestCase):
def test_add_staff_admin_custom_config_is_allowed(self):
"""Admin users can override crawl config."""
self.client.logout()
- admin_user = User.objects.create_user(
+ User.objects.create_user(
username='adminuser',
password='adminpass123',
email='admin@example.com',
diff --git a/archivebox/core/views.py b/archivebox/core/views.py
index fb7fabe7..3bc903e2 100644
--- a/archivebox/core/views.py
+++ b/archivebox/core/views.py
@@ -10,7 +10,7 @@ from pathlib import Path
from urllib.parse import urlparse
from django.shortcuts import render, redirect
-from django.http import HttpRequest, HttpResponse, Http404, HttpResponseForbidden
+from django.http import JsonResponse, HttpRequest, HttpResponse, Http404, HttpResponseForbidden
from django.utils.html import format_html, mark_safe
from django.views import View
from django.views.generic.list import ListView
@@ -24,9 +24,8 @@ from django.utils.decorators import method_decorator
from admin_data_views.typing import TableContext, ItemContext
from admin_data_views.utils import render_with_table_view, render_with_item_view, ItemLink
-import archivebox
from archivebox.config import CONSTANTS, CONSTANTS_CONFIG, DATA_DIR, VERSION
-from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG, ARCHIVING_CONFIG
+from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG
from archivebox.config.configset import get_flat_config, get_config, get_all_configs
from archivebox.misc.util import base_url, htmlencode, ts_to_date_str, urldecode
from archivebox.misc.serve_static import serve_static_with_byterange_support
@@ -35,6 +34,9 @@ from archivebox.search import query_search_index
from archivebox.core.models import Snapshot
from archivebox.core.host_utils import build_snapshot_url
+from archivebox.core.forms import AddLinkForm
+from archivebox.crawls.models import Crawl
+from archivebox.hooks import get_enabled_plugins, get_plugin_name
def _files_index_target(snapshot: Snapshot, archivefile: str | None) -> str:
@@ -49,12 +51,6 @@ def _files_index_target(snapshot: Snapshot, archivefile: str | None) -> str:
return target
-from archivebox.core.forms import AddLinkForm
-from archivebox.crawls.models import Crawl
-from archivebox.hooks import get_enabled_plugins, get_plugin_name
-
-
-
class HomepageView(View):
def get(self, request):
if request.user.is_authenticated:
@@ -1066,10 +1062,6 @@ class HealthCheckView(View):
status=200
)
-
-import json
-from django.http import JsonResponse
-
def live_progress_view(request):
"""Simple JSON endpoint for live progress status - used by admin progress monitor."""
try:
@@ -1077,7 +1069,6 @@ def live_progress_view(request):
from archivebox.crawls.models import Crawl
from archivebox.core.models import Snapshot, ArchiveResult
from archivebox.machine.models import Process, Machine
- from django.db.models import Case, When, Value, IntegerField
# Get orchestrator status
orchestrator_running = Orchestrator.is_running()
@@ -1133,7 +1124,6 @@ def live_progress_view(request):
})
# Build hierarchical active crawls with nested snapshots and archive results
- from django.db.models import Prefetch
running_workers = Process.objects.filter(
machine=machine,
@@ -1387,7 +1377,7 @@ def find_config_default(key: str) -> str:
return default_val
def find_config_type(key: str) -> str:
- from typing import get_type_hints, ClassVar
+ from typing import ClassVar
CONFIGS = get_all_configs()
for config in CONFIGS.values():
@@ -1430,7 +1420,6 @@ def key_is_safe(key: str) -> bool:
def find_config_source(key: str, merged_config: dict) -> str:
"""Determine where a config value comes from."""
- import os
from archivebox.machine.models import Machine
# Check if it's from archivebox.machine.config
@@ -1464,12 +1453,11 @@ def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
# Get merged config that includes Machine.config overrides
try:
from archivebox.machine.models import Machine
- machine = Machine.current()
+ Machine.current()
merged_config = get_config()
- except Exception as e:
+ except Exception:
# Fallback if Machine model not available
merged_config = get_config()
- machine = None
rows = {
"Section": [],
@@ -1525,7 +1513,6 @@ def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
@render_with_item_view
def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
- import os
from archivebox.machine.models import Machine
from archivebox.config.configset import BaseConfigSet
diff --git a/archivebox/core/widgets.py b/archivebox/core/widgets.py
index bbbceaa7..1fbefd0c 100644
--- a/archivebox/core/widgets.py
+++ b/archivebox/core/widgets.py
@@ -343,20 +343,17 @@ class InlineTagEditorWidget(TagEditorWidget):
snapshot_id = snapshot_id or self.snapshot_id
# Parse value to get list of tag dicts with id and name
- tags = []
tag_data = []
if value:
if hasattr(value, 'all'): # QuerySet
for tag in value.all():
tag_data.append({'id': tag.pk, 'name': tag.name})
tag_data.sort(key=lambda x: x['name'].lower())
- tags = [t['name'] for t in tag_data]
elif isinstance(value, (list, tuple)):
if value and hasattr(value[0], 'name'):
for tag in value:
tag_data.append({'id': tag.pk, 'name': tag.name})
tag_data.sort(key=lambda x: x['name'].lower())
- tags = [t['name'] for t in tag_data]
widget_id_raw = f"inline_tags_{snapshot_id}" if snapshot_id else (attrs.get('id', name) if attrs else name)
widget_id = self._normalize_id(widget_id_raw)
diff --git a/archivebox/core/wsgi.py b/archivebox/core/wsgi.py
index aa26ad94..00d224ea 100644
--- a/archivebox/core/wsgi.py
+++ b/archivebox/core/wsgi.py
@@ -9,9 +9,8 @@ https://docs.djangoproject.com/en/2.1/howto/deployment/wsgi/
import archivebox # noqa
from archivebox.config.django import setup_django
+from django.core.wsgi import get_wsgi_application
setup_django(in_memory_db=False, check_db=True)
-from django.core.wsgi import get_wsgi_application
-
application = get_wsgi_application()
diff --git a/archivebox/crawls/admin.py b/archivebox/crawls/admin.py
index 01b18375..0539c6e0 100644
--- a/archivebox/crawls/admin.py
+++ b/archivebox/crawls/admin.py
@@ -1,17 +1,11 @@
__package__ = 'archivebox.crawls'
-import json
-from pathlib import Path
from django import forms
from django.utils.html import format_html, format_html_join, mark_safe
from django.contrib import admin, messages
-from django.urls import path
-from django.http import JsonResponse
-from django.views.decorators.http import require_POST
from django.db.models import Count, Q
-from archivebox import DATA_DIR
from django_object_actions import action
diff --git a/archivebox/crawls/models.py b/archivebox/crawls/models.py
index d7d54d64..7417ee4b 100755
--- a/archivebox/crawls/models.py
+++ b/archivebox/crawls/models.py
@@ -1,12 +1,11 @@
__package__ = 'archivebox.crawls'
-from typing import TYPE_CHECKING, Iterable
+from typing import TYPE_CHECKING
from datetime import timedelta
from archivebox.uuid_compat import uuid7
from pathlib import Path
from django.db import models
-from django.db.models import QuerySet
from django.core.validators import MaxValueValidator, MinValueValidator
from django.conf import settings
from django.urls import reverse_lazy
@@ -15,13 +14,12 @@ from django_stubs_ext.db.models import TypedModelMeta
from statemachine import State, registry
from rich import print
-from archivebox.config import CONSTANTS
from archivebox.base_models.models import ModelWithUUID, ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, get_or_create_system_user_pk
from archivebox.workers.models import ModelWithStateMachine, BaseStateMachine
from archivebox.crawls.schedule_utils import next_run_for_schedule, validate_schedule
if TYPE_CHECKING:
- from archivebox.core.models import Snapshot, ArchiveResult
+ from archivebox.core.models import Snapshot
class CrawlSchedule(ModelWithUUID, ModelWithNotes):
@@ -111,7 +109,6 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
label = models.CharField(max_length=64, blank=True, null=False, default='')
notes = models.TextField(blank=True, null=False, default='')
schedule = models.ForeignKey(CrawlSchedule, on_delete=models.SET_NULL, null=True, blank=True, editable=True)
- output_dir = models.CharField(max_length=512, null=False, blank=True, default='')
status = ModelWithStateMachine.StatusField(choices=ModelWithStateMachine.StatusChoices, default=ModelWithStateMachine.StatusChoices.QUEUED)
retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
@@ -252,6 +249,22 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
return system_url
return None
+    def resolve_persona(self):
+        """Return the linked Persona, else get-or-create the one named by config DEFAULT_PERSONA, else None."""
+        from archivebox.personas.models import Persona
+
+        if self.persona_id:
+            # An explicit link must resolve: raise rather than silently fall back to the default.
+            persona = Persona.objects.filter(id=self.persona_id).first()
+            if persona is None:
+                raise Persona.DoesNotExist(f'Crawl {self.id} references missing Persona {self.persona_id}')
+            return persona
+
+        default_persona_name = str((self.config or {}).get('DEFAULT_PERSONA') or '').strip()
+        if not default_persona_name:
+            return None
+        return Persona.objects.get_or_create(name=default_persona_name)[0]
+
def add_url(self, entry: dict) -> bool:
"""
@@ -391,7 +404,10 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
f.flush()
def get_runtime_config():
- return get_config(crawl=self)
+ config = get_config(crawl=self)
+ if persona_runtime_overrides:
+ config.update(persona_runtime_overrides)
+ return config
system_task = self.get_system_task()
if system_task == 'archivebox://update':
@@ -402,6 +418,15 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
machine = Machine.current()
declared_binary_names: set[str] = set()
+ persona_runtime_overrides: dict[str, str] = {}
+ persona = self.resolve_persona()
+ if persona:
+ base_runtime_config = get_config(crawl=self, persona=persona)
+ chrome_binary = str(base_runtime_config.get('CHROME_BINARY') or '')
+ persona_runtime_overrides = persona.prepare_runtime_for_crawl(
+ crawl=self,
+ chrome_binary=chrome_binary,
+ )
def install_declared_binaries(binary_names: set[str]) -> None:
if not binary_names:
@@ -563,7 +588,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
# Discover and run on_Crawl hooks
with open(debug_log, 'a') as f:
- f.write(f'Discovering Crawl hooks...\n')
+ f.write('Discovering Crawl hooks...\n')
f.flush()
hooks = discover_hooks('Crawl', config=get_runtime_config())
with open(debug_log, 'a') as f:
@@ -588,17 +613,17 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
print(f'[yellow]⚠️ Removed {leaked_count} leaked snapshot(s) created during system crawl {system_task}[/yellow]')
with open(debug_log, 'a') as f:
f.write(f'Skipping snapshot creation for system crawl: {system_task}\n')
- f.write(f'=== Crawl.run() complete ===\n\n')
+ f.write('=== Crawl.run() complete ===\n\n')
f.flush()
return None
with open(debug_log, 'a') as f:
- f.write(f'Creating snapshots from URLs...\n')
+ f.write('Creating snapshots from URLs...\n')
f.flush()
created_snapshots = self.create_snapshots_from_urls()
with open(debug_log, 'a') as f:
f.write(f'Created {len(created_snapshots)} snapshots\n')
- f.write(f'=== Crawl.run() complete ===\n\n')
+ f.write('=== Crawl.run() complete ===\n\n')
f.flush()
# Return first snapshot for this crawl (newly created or existing)
@@ -647,6 +672,10 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
for pid_file in self.output_dir.glob('**/*.pid'):
pid_file.unlink(missing_ok=True)
+ persona = self.resolve_persona()
+ if persona:
+ persona.cleanup_runtime_for_crawl(self)
+
# Run on_CrawlEnd hooks
from archivebox.config.configset import get_config
config = get_config(crawl=self)
@@ -715,9 +744,9 @@ class CrawlMachine(BaseStateMachine):
# Tick Event (polled by workers)
tick = (
- queued.to.itself(unless='can_start') |
- queued.to(started, cond='can_start') |
- started.to(sealed, cond='is_finished')
+ queued.to.itself(unless='can_start')
+ | queued.to(started, cond='can_start')
+ | started.to(sealed, cond='is_finished')
)
# Manual event (triggered by last Snapshot sealing)
@@ -740,7 +769,6 @@ class CrawlMachine(BaseStateMachine):
@started.enter
def enter_started(self):
import sys
- from archivebox.core.models import Snapshot
print(f'[cyan]🔄 CrawlMachine.enter_started() - creating snapshots for {self.crawl.id}[/cyan]', file=sys.stderr)
@@ -758,7 +786,7 @@ class CrawlMachine(BaseStateMachine):
)
else:
# No snapshots (system crawl like archivebox://install)
- print(f'[cyan]🔄 No snapshots created, sealing crawl immediately[/cyan]', file=sys.stderr)
+ print('[cyan]🔄 No snapshots created, sealing crawl immediately[/cyan]', file=sys.stderr)
# Seal immediately since there's no work to do
self.seal()
diff --git a/archivebox/hooks.py b/archivebox/hooks.py
index 84112390..962bc200 100644
--- a/archivebox/hooks.py
+++ b/archivebox/hooks.py
@@ -56,16 +56,18 @@ __package__ = 'archivebox'
import os
import json
-import time
from functools import lru_cache
from pathlib import Path
-from typing import List, Dict, Any, Optional, TypedDict
+from typing import TYPE_CHECKING, List, Dict, Any, Optional, TypedDict
from abx_plugins import get_plugins_dir
from django.conf import settings
from django.utils.safestring import mark_safe
from archivebox.config.constants import CONSTANTS
+if TYPE_CHECKING:
+ from archivebox.machine.models import Process
+
# Plugin directories
BUILTIN_PLUGINS_DIR = Path(get_plugins_dir()).resolve()
@@ -266,9 +268,7 @@ def run_hook(
"""
from archivebox.machine.models import Process, Machine
from archivebox.config.constants import CONSTANTS
- import time
import sys
- start_time = time.time()
# Auto-detect timeout from plugin config if not explicitly provided
if timeout is None:
diff --git a/archivebox/ldap/auth.py b/archivebox/ldap/auth.py
index 3958ff09..aa7fc651 100644
--- a/archivebox/ldap/auth.py
+++ b/archivebox/ldap/auth.py
@@ -9,7 +9,6 @@ __package__ = "archivebox.ldap"
from typing import TYPE_CHECKING
if TYPE_CHECKING:
- from django.contrib.auth.models import User
from django_auth_ldap.backend import LDAPBackend as BaseLDAPBackend
else:
try:
diff --git a/archivebox/machine/models.py b/archivebox/machine/models.py
index f92ac02b..f3a0f0da 100755
--- a/archivebox/machine/models.py
+++ b/archivebox/machine/models.py
@@ -10,6 +10,7 @@ from datetime import timedelta, datetime
from statemachine import State, registry
from django.db import models
+from django.db.models import QuerySet
from django.utils import timezone
from django.utils.functional import cached_property
@@ -197,7 +198,6 @@ class NetworkInterface(ModelWithHealthStats):
class BinaryManager(models.Manager):
def get_from_db_or_cache(self, name: str, abspath: str = '', version: str = '', sha256: str = '', binprovider: str = 'env') -> 'Binary':
"""Get or create an Binary record from the database or cache."""
- global _CURRENT_BINARIES
cached = _CURRENT_BINARIES.get(name)
if cached and timezone.now() < cached.modified_at + timedelta(seconds=BINARY_RECHECK_INTERVAL):
return cached
@@ -583,7 +583,6 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine):
Called by state machine if needed (not typically used for binaries
since installations are foreground, but included for consistency).
"""
- from pathlib import Path
# Kill any background binary installation hooks using Process records
# (rarely used since binary installations are typically foreground)
@@ -1026,9 +1025,11 @@ class Process(models.Model):
# Check cache validity
if _CURRENT_PROCESS:
# Verify: same PID, same machine, cache not expired
- if (_CURRENT_PROCESS.pid == current_pid and
- _CURRENT_PROCESS.machine_id == machine.id and
- timezone.now() < _CURRENT_PROCESS.modified_at + timedelta(seconds=PROCESS_RECHECK_INTERVAL)):
+ if (
+ _CURRENT_PROCESS.pid == current_pid
+ and _CURRENT_PROCESS.machine_id == machine.id
+ and timezone.now() < _CURRENT_PROCESS.modified_at + timedelta(seconds=PROCESS_RECHECK_INTERVAL)
+ ):
_CURRENT_PROCESS.ensure_log_files()
return _CURRENT_PROCESS
_CURRENT_PROCESS = None
@@ -1111,7 +1112,6 @@ class Process(models.Model):
machine = machine or Machine.current()
# Debug logging
- import sys
# print(f"DEBUG _find_parent_process: my_pid={os.getpid()}, ppid={ppid}", file=sys.stderr)
# Get parent process start time from OS
@@ -1630,7 +1630,6 @@ class Process(models.Model):
self (updated with pid, started_at, etc.)
"""
import subprocess
- import time
# Validate pwd is set (required for output files)
if not self.pwd:
@@ -1846,7 +1845,6 @@ class Process(models.Model):
Returns:
True if process was terminated, False if already dead
"""
- import time
import signal
proc = self.proc
@@ -2199,8 +2197,8 @@ class BinaryMachine(BaseStateMachine):
# Tick Event - install happens during transition
tick = (
- queued.to.itself(unless='can_install') |
- queued.to(installed, cond='can_install', on='on_install')
+ queued.to.itself(unless='can_install')
+ | queued.to(installed, cond='can_install', on='on_install')
)
def can_install(self) -> bool:
@@ -2303,10 +2301,10 @@ class ProcessMachine(BaseStateMachine):
# Tick Event - transitions based on conditions
tick = (
- queued.to.itself(unless='can_start') |
- queued.to(running, cond='can_start') |
- running.to.itself(unless='is_exited') |
- running.to(exited, cond='is_exited')
+ queued.to.itself(unless='can_start')
+ | queued.to(running, cond='can_start')
+ | running.to.itself(unless='is_exited')
+ | running.to(exited, cond='is_exited')
)
# Additional events (for explicit control)
diff --git a/archivebox/machine/tests/test_machine_models.py b/archivebox/machine/tests/test_machine_models.py
index 6a1d4514..983770d4 100644
--- a/archivebox/machine/tests/test_machine_models.py
+++ b/archivebox/machine/tests/test_machine_models.py
@@ -12,8 +12,6 @@ Tests cover:
"""
import os
-import sys
-from pathlib import Path
from datetime import timedelta
from unittest.mock import patch
@@ -29,7 +27,6 @@ from archivebox.machine.models import (
BinaryMachine,
ProcessMachine,
MACHINE_RECHECK_INTERVAL,
- PROCESS_RECHECK_INTERVAL,
PID_REUSE_WINDOW,
)
@@ -323,7 +320,6 @@ class TestProcessModel(TestCase):
def test_process_update_and_requeue(self):
"""Process.update_and_requeue() should update fields and save."""
process = Process.objects.create(machine=self.machine, cmd=['test'])
- old_modified = process.modified_at
process.update_and_requeue(
status=Process.StatusChoices.RUNNING,
diff --git a/archivebox/mcp/server.py b/archivebox/mcp/server.py
index a8abf996..025c3eee 100644
--- a/archivebox/mcp/server.py
+++ b/archivebox/mcp/server.py
@@ -1,5 +1,3 @@
-__package__ = 'archivebox.mcp'
-
"""
Model Context Protocol (MCP) server implementation for ArchiveBox.
@@ -10,9 +8,7 @@ Click command metadata. Handles JSON-RPC 2.0 requests over stdio transport.
import sys
import json
import traceback
-from typing import Any, Dict, List, Optional
-from io import StringIO
-from contextlib import redirect_stdout, redirect_stderr
+from typing import Optional
import click
from click.testing import CliRunner
diff --git a/archivebox/misc/checks.py b/archivebox/misc/checks.py
index bf97e838..91d4c081 100644
--- a/archivebox/misc/checks.py
+++ b/archivebox/misc/checks.py
@@ -225,7 +225,6 @@ def check_tmp_dir(tmp_dir=None, throw=False, quiet=False, must_exist=True):
def check_lib_dir(lib_dir: Path | None = None, throw=False, quiet=False, must_exist=True):
- import archivebox
from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP
from archivebox.misc.logging import STDERR
from archivebox.misc.logging_util import pretty_path
diff --git a/archivebox/misc/folders.py b/archivebox/misc/folders.py
index dd134dc1..dd8bbc1f 100644
--- a/archivebox/misc/folders.py
+++ b/archivebox/misc/folders.py
@@ -35,7 +35,6 @@ def fix_invalid_folder_locations(out_dir: Path = DATA_DIR) -> Tuple[List[str], L
with open(index_path, 'r') as f:
data = json.load(f)
timestamp = data.get('timestamp')
- url = data.get('url')
except Exception:
continue
diff --git a/archivebox/misc/logging_util.py b/archivebox/misc/logging_util.py
index 7e5b707c..c00071f6 100644
--- a/archivebox/misc/logging_util.py
+++ b/archivebox/misc/logging_util.py
@@ -21,13 +21,12 @@ if TYPE_CHECKING:
from rich import print
from rich.panel import Panel
-from django.core.management.base import DjangoHelpFormatter
from archivebox.config import CONSTANTS, DATA_DIR, VERSION
from archivebox.config.common import SHELL_CONFIG
from archivebox.misc.system import get_dir_size
from archivebox.misc.util import enforce_types
-from archivebox.misc.logging import ANSI, stderr
+from archivebox.misc.logging import ANSI
@dataclass
class RuntimeStats:
diff --git a/archivebox/misc/monkey_patches.py b/archivebox/misc/monkey_patches.py
index 2bfb7924..9ee755c4 100644
--- a/archivebox/misc/monkey_patches.py
+++ b/archivebox/misc/monkey_patches.py
@@ -1,16 +1,18 @@
__package__ = 'archivebox'
-import django
-import pydantic
+import datetime
+import warnings
+
+import benedict
+from daphne import access
import django_stubs_ext
+from django.utils import timezone
django_stubs_ext.monkeypatch()
# monkey patch django timezone to add back utc (it was removed in Django 5.0)
-import datetime
-from django.utils import timezone
timezone.utc = datetime.timezone.utc
# monkey patch django-signals-webhooks to change how it shows up in Admin UI
@@ -26,12 +28,9 @@ timezone.utc = datetime.timezone.utc
# Hide site-packages/sonic/client.py:115: SyntaxWarning
# https://github.com/xmonader/python-sonic-client/pull/18
-import warnings # noqa
warnings.filterwarnings("ignore", category=SyntaxWarning, module='sonic')
# Make daphne log requests quieter and esier to read
-from daphne import access # noqa
-
class ModifiedAccessLogGenerator(access.AccessLogGenerator):
"""Clutge workaround until daphne uses the Python logging framework. https://github.com/django/daphne/pull/473/files"""
@@ -68,5 +67,4 @@ access.AccessLogGenerator.write_entry = ModifiedAccessLogGenerator.write_entry #
# fix benedict objects to pretty-print/repr more nicely with rich
# https://stackoverflow.com/a/79048811/2156113
# https://rich.readthedocs.io/en/stable/pretty.html#rich-repr-protocol
-import benedict # noqa
benedict.benedict.__rich_repr__ = lambda self: (dict(self),) # type: ignore
diff --git a/archivebox/misc/progress_layout.py b/archivebox/misc/progress_layout.py
index eb6fdb3a..1263856b 100644
--- a/archivebox/misc/progress_layout.py
+++ b/archivebox/misc/progress_layout.py
@@ -135,7 +135,6 @@ class ProcessLogPanel:
if line:
log_lines.append(Text(line, style="cyan"))
- compact = self.compact if self.compact is not None else self._is_background_hook()
max_body = max(1, self.max_lines - len(header_lines))
if not log_lines:
log_lines = []
diff --git a/archivebox/misc/system.py b/archivebox/misc/system.py
index a1a55d9b..6804c210 100644
--- a/archivebox/misc/system.py
+++ b/archivebox/misc/system.py
@@ -4,10 +4,11 @@ __package__ = 'archivebox.misc'
import os
import signal
import shutil
+import sys
from json import dump
from pathlib import Path
-from typing import Optional, Union, Set, Tuple
+from typing import Optional, Union, Tuple
from subprocess import _mswindows, PIPE, Popen, CalledProcessError, CompletedProcess, TimeoutExpired
from atomicwrites import atomic_write as lib_atomic_write
@@ -58,7 +59,7 @@ def run(cmd, *args, input=None, capture_output=True, timeout=None, check=False,
# far into the TimeoutExpired exception.
process.wait()
raise
- except: # Including KeyboardInterrupt, communicate handled that.
+ except BaseException: # Including KeyboardInterrupt, communicate handled that.
process.kill()
# We don't call process.wait() as .__exit__ does that for us.
raise
diff --git a/archivebox/personas/admin.py b/archivebox/personas/admin.py
index 8c38f3f3..b97a94f6 100644
--- a/archivebox/personas/admin.py
+++ b/archivebox/personas/admin.py
@@ -1,3 +1,2 @@
-from django.contrib import admin
# Register your models here.
diff --git a/archivebox/personas/models.py b/archivebox/personas/models.py
index 4be5cfb3..ba30d587 100644
--- a/archivebox/personas/models.py
+++ b/archivebox/personas/models.py
@@ -11,8 +11,12 @@ Each persona has its own:
__package__ = 'archivebox.personas'
+import shutil
+import subprocess
+import sys
+from contextlib import contextmanager
from pathlib import Path
-from typing import TYPE_CHECKING, Iterator
+from typing import TYPE_CHECKING
from django.db import models
from django.conf import settings
@@ -21,8 +25,32 @@ from django.utils import timezone
from archivebox.base_models.models import ModelWithConfig, get_or_create_system_user_pk
from archivebox.uuid_compat import uuid7
+try:
+ import fcntl
+except ImportError: # pragma: no cover
+ fcntl = None
+
if TYPE_CHECKING:
- from django.db.models import QuerySet
+ pass
+
+
+VOLATILE_PROFILE_DIR_NAMES = {
+ 'Cache',
+ 'Code Cache',
+ 'GPUCache',
+ 'ShaderCache',
+ 'Service Worker',
+ 'GCM Store',
+ 'Crashpad',
+ 'BrowserMetrics',
+}
+
+VOLATILE_PROFILE_FILE_NAMES = {
+ 'BrowserMetrics-spare.pma',
+ 'SingletonCookie',
+ 'SingletonLock',
+ 'SingletonSocket',
+}
class Persona(ModelWithConfig):
@@ -120,37 +148,118 @@ class Persona(ModelWithConfig):
(self.path / 'chrome_extensions').mkdir(parents=True, exist_ok=True)
(self.path / 'chrome_downloads').mkdir(parents=True, exist_ok=True)
- def cleanup_chrome(self) -> bool:
- """
- Clean up Chrome state files (SingletonLock, etc.) for this persona.
-
- Returns:
- True if cleanup was performed, False if no cleanup needed
- """
+ def cleanup_chrome_profile(self, profile_dir: Path) -> bool:
+ """Remove volatile Chrome state that should never be reused across launches."""
cleaned = False
- chrome_dir = self.path / 'chrome_user_data'
- if not chrome_dir.exists():
+ if not profile_dir.exists():
return False
- # Clean up SingletonLock files
- for lock_file in chrome_dir.glob('**/SingletonLock'):
- try:
- lock_file.unlink()
- cleaned = True
- except OSError:
- pass
+ for path in profile_dir.rglob('*'):
+ if path.name in VOLATILE_PROFILE_FILE_NAMES:
+ try:
+ path.unlink()
+ cleaned = True
+ except OSError:
+ pass
- # Clean up SingletonSocket files
- for socket_file in chrome_dir.glob('**/SingletonSocket'):
+ for dirname in VOLATILE_PROFILE_DIR_NAMES:
+ for path in profile_dir.rglob(dirname):
+ if not path.is_dir():
+ continue
+ shutil.rmtree(path, ignore_errors=True)
+ cleaned = True
+
+ for path in profile_dir.rglob('*.log'):
try:
- socket_file.unlink()
+ path.unlink()
cleaned = True
except OSError:
pass
return cleaned
+ def cleanup_chrome(self) -> bool:
+ """Clean up volatile Chrome state for this persona's base profile."""
+ return self.cleanup_chrome_profile(self.path / 'chrome_user_data')
+
+ @contextmanager
+ def lock_runtime_for_crawl(self):
+ lock_path = self.path / '.archivebox-crawl-profile.lock'
+ lock_path.parent.mkdir(parents=True, exist_ok=True)
+
+ with lock_path.open('w') as lock_file:
+ if fcntl is not None:
+ fcntl.flock(lock_file.fileno(), fcntl.LOCK_EX)
+ try:
+ yield
+ finally:
+ if fcntl is not None:
+ fcntl.flock(lock_file.fileno(), fcntl.LOCK_UN)
+
+ def runtime_root_for_crawl(self, crawl) -> Path:
+ return Path(crawl.output_dir) / '.persona' / self.name
+
+ def runtime_profile_dir_for_crawl(self, crawl) -> Path:
+ return self.runtime_root_for_crawl(crawl) / 'chrome_user_data'
+
+ def runtime_downloads_dir_for_crawl(self, crawl) -> Path:
+ return self.runtime_root_for_crawl(crawl) / 'chrome_downloads'
+
+ def copy_chrome_profile(self, source_dir: Path, destination_dir: Path) -> None:
+ destination_dir.parent.mkdir(parents=True, exist_ok=True)
+ shutil.rmtree(destination_dir, ignore_errors=True)
+ destination_dir.mkdir(parents=True, exist_ok=True)
+
+ copy_cmd: list[str] | None = None
+ source_contents = f'{source_dir}/.'
+
+ if sys.platform == 'darwin':
+ copy_cmd = ['cp', '-cR', source_contents, str(destination_dir)]
+ elif sys.platform.startswith('linux'):
+ copy_cmd = ['cp', '-a', source_contents, str(destination_dir)]
+
+ if copy_cmd:
+ result = subprocess.run(copy_cmd, capture_output=True, text=True)
+ if result.returncode == 0:
+ return
+
+ shutil.rmtree(destination_dir, ignore_errors=True)
+ destination_dir.mkdir(parents=True, exist_ok=True)
+
+ shutil.copytree(source_dir, destination_dir, symlinks=True, dirs_exist_ok=True)
+
+ def prepare_runtime_for_crawl(self, crawl, chrome_binary: str = '') -> dict[str, str]:
+ self.ensure_dirs()
+
+ template_dir = Path(self.CHROME_USER_DATA_DIR)
+ runtime_root = self.runtime_root_for_crawl(crawl)
+ runtime_profile_dir = self.runtime_profile_dir_for_crawl(crawl)
+ runtime_downloads_dir = self.runtime_downloads_dir_for_crawl(crawl)
+
+ with self.lock_runtime_for_crawl():
+ if not runtime_profile_dir.exists():
+ if template_dir.exists() and any(template_dir.iterdir()):
+ self.copy_chrome_profile(template_dir, runtime_profile_dir)
+ else:
+ runtime_profile_dir.mkdir(parents=True, exist_ok=True)
+
+ runtime_downloads_dir.mkdir(parents=True, exist_ok=True)
+ self.cleanup_chrome_profile(runtime_profile_dir)
+
+ (runtime_root / 'persona_name.txt').write_text(self.name)
+ (runtime_root / 'template_dir.txt').write_text(str(template_dir))
+ if chrome_binary:
+ (runtime_root / 'chrome_binary.txt').write_text(chrome_binary)
+
+ return {
+ 'CHROME_USER_DATA_DIR': str(runtime_profile_dir),
+ 'CHROME_DOWNLOADS_DIR': str(runtime_downloads_dir),
+ }
+
+ def cleanup_runtime_for_crawl(self, crawl) -> None:
+ shutil.rmtree(Path(crawl.output_dir) / '.persona', ignore_errors=True)
+
@classmethod
def get_or_create_default(cls) -> 'Persona':
"""Get or create the Default persona."""
diff --git a/archivebox/personas/tests.py b/archivebox/personas/tests.py
index 7ce503c2..49290204 100644
--- a/archivebox/personas/tests.py
+++ b/archivebox/personas/tests.py
@@ -1,3 +1,2 @@
-from django.test import TestCase
# Create your tests here.
diff --git a/archivebox/personas/views.py b/archivebox/personas/views.py
index 91ea44a2..b8e4ee02 100644
--- a/archivebox/personas/views.py
+++ b/archivebox/personas/views.py
@@ -1,3 +1,2 @@
-from django.shortcuts import render
# Create your views here.
diff --git a/archivebox/search/__init__.py b/archivebox/search/__init__.py
index b98f7f95..13ce44a1 100644
--- a/archivebox/search/__init__.py
+++ b/archivebox/search/__init__.py
@@ -14,7 +14,7 @@ Search backends must provide a search.py module with:
__package__ = 'archivebox.search'
-from typing import TYPE_CHECKING, Any, Optional
+from typing import Any, Optional
from django.db.models import QuerySet
@@ -22,9 +22,6 @@ from archivebox.misc.util import enforce_types
from archivebox.misc.logging import stderr
from archivebox.config.common import SEARCH_BACKEND_CONFIG
-if TYPE_CHECKING:
- from archivebox.core.models import Snapshot
-
# Cache discovered backends to avoid repeated filesystem scans
_search_backends_cache: Optional[dict] = None
diff --git a/archivebox/tests/conftest.py b/archivebox/tests/conftest.py
index 69740e16..28f58062 100644
--- a/archivebox/tests/conftest.py
+++ b/archivebox/tests/conftest.py
@@ -1,7 +1,6 @@
"""archivebox/tests/conftest.py - Pytest fixtures for CLI tests."""
import os
-import shutil
import sys
import subprocess
import textwrap
@@ -13,6 +12,8 @@ import pytest
from archivebox.uuid_compat import uuid7
+pytest_plugins = ["archivebox.tests.fixtures"]
+
# =============================================================================
# CLI Helpers (defined before fixtures that use them)
diff --git a/archivebox/tests/test_add.py b/archivebox/tests/test_add.py
index 0fb4271a..39d423e3 100644
--- a/archivebox/tests/test_add.py
+++ b/archivebox/tests/test_add.py
@@ -1,9 +1,6 @@
-import subprocess
-import json
-import sqlite3
import os
-
-from .fixtures import *
+import sqlite3
+import subprocess
def test_depth_flag_is_accepted(process, disable_extractors_dict):
arg_process = subprocess.run(["archivebox", "add", "--index-only", "https://example.com", "--depth=0"],
@@ -31,7 +28,7 @@ def test_depth_flag_fails_if_it_is_not_0_or_1(process, disable_extractors_dict):
def test_depth_flag_0_creates_source_file(tmp_path, process, disable_extractors_dict):
os.chdir(tmp_path)
- arg_process = subprocess.run(
+ subprocess.run(
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
capture_output=True,
env=disable_extractors_dict,
diff --git a/archivebox/tests/test_admin_views.py b/archivebox/tests/test_admin_views.py
index 707822cb..c1bfb3bd 100644
--- a/archivebox/tests/test_admin_views.py
+++ b/archivebox/tests/test_admin_views.py
@@ -9,7 +9,7 @@ Tests cover:
"""
import pytest
-from django.test import TestCase, Client, override_settings
+from django.test import override_settings
from django.urls import reverse
from django.contrib.auth import get_user_model
diff --git a/archivebox/tests/test_auth_ldap.py b/archivebox/tests/test_auth_ldap.py
index 7b25f0cf..cec866c9 100644
--- a/archivebox/tests/test_auth_ldap.py
+++ b/archivebox/tests/test_auth_ldap.py
@@ -9,7 +9,7 @@ import os
import sys
import tempfile
import unittest
-from pathlib import Path
+from importlib.util import find_spec
class TestLDAPConfig(unittest.TestCase):
@@ -100,13 +100,7 @@ class TestLDAPIntegration(unittest.TestCase):
def test_django_settings_with_ldap_library_check(self):
"""Test that Django settings check for LDAP libraries when enabled."""
- # Try to import django-auth-ldap to see if it's available
- try:
- import django_auth_ldap
- import ldap
- ldap_available = True
- except ImportError:
- ldap_available = False
+ ldap_available = find_spec("django_auth_ldap") is not None and find_spec("ldap") is not None
# If LDAP libraries are not available, settings should handle gracefully
if not ldap_available:
diff --git a/archivebox/tests/test_cli_add.py b/archivebox/tests/test_cli_add.py
index 7d325e61..a34a4879 100644
--- a/archivebox/tests/test_cli_add.py
+++ b/archivebox/tests/test_cli_add.py
@@ -5,11 +5,8 @@ Verify add creates snapshots in DB, crawls, source files, and archive directorie
"""
import os
-import subprocess
import sqlite3
-from pathlib import Path
-
-from .fixtures import *
+import subprocess
def test_add_single_url_creates_snapshot_in_db(tmp_path, process, disable_extractors_dict):
@@ -169,6 +166,30 @@ def test_add_with_tags(tmp_path, process, disable_extractors_dict):
assert 'test' in tags_str or 'example' in tags_str
+def test_add_records_selected_persona_on_crawl(tmp_path, process, disable_extractors_dict):
+ """Test add persists the selected persona so browser config derives from it later."""
+ os.chdir(tmp_path)
+ result = subprocess.run(
+ ['archivebox', 'add', '--index-only', '--depth=0', '--persona=Default', 'https://example.com'],
+ capture_output=True,
+ env=disable_extractors_dict,
+ )
+
+ assert result.returncode == 0
+
+ conn = sqlite3.connect("index.sqlite3")
+ c = conn.cursor()
+ persona_id, default_persona = c.execute(
+ "SELECT persona_id, json_extract(config, '$.DEFAULT_PERSONA') FROM crawls_crawl LIMIT 1"
+ ).fetchone()
+ conn.close()
+
+ assert persona_id
+ assert default_persona == 'Default'
+ assert (tmp_path / "personas" / "Default" / "chrome_user_data").is_dir()
+ assert (tmp_path / "personas" / "Default" / "chrome_extensions").is_dir()
+
+
def test_add_duplicate_url_creates_separate_crawls(tmp_path, process, disable_extractors_dict):
"""Test that adding the same URL twice creates separate crawls and snapshots.
diff --git a/archivebox/tests/test_cli_archiveresult.py b/archivebox/tests/test_cli_archiveresult.py
index de016010..ff884675 100644
--- a/archivebox/tests/test_cli_archiveresult.py
+++ b/archivebox/tests/test_cli_archiveresult.py
@@ -9,7 +9,6 @@ Tests cover:
"""
import json
-import pytest
from archivebox.tests.conftest import (
run_archivebox_cmd,
diff --git a/archivebox/tests/test_cli_config.py b/archivebox/tests/test_cli_config.py
index 87f7412c..351f14d0 100644
--- a/archivebox/tests/test_cli_config.py
+++ b/archivebox/tests/test_cli_config.py
@@ -6,9 +6,6 @@ Verify config reads/writes ArchiveBox.conf file correctly.
import os
import subprocess
-from pathlib import Path
-
-from .fixtures import *
def test_config_displays_all_config(tmp_path, process):
diff --git a/archivebox/tests/test_cli_crawl.py b/archivebox/tests/test_cli_crawl.py
index 891f4114..c641a842 100644
--- a/archivebox/tests/test_cli_crawl.py
+++ b/archivebox/tests/test_cli_crawl.py
@@ -9,14 +9,11 @@ Tests cover:
"""
import json
-import pytest
from archivebox.tests.conftest import (
run_archivebox_cmd,
parse_jsonl_output,
- assert_jsonl_contains_type,
create_test_url,
- create_test_crawl_json,
)
diff --git a/archivebox/tests/test_cli_extract.py b/archivebox/tests/test_cli_extract.py
index 19b0d834..f1980f6b 100644
--- a/archivebox/tests/test_cli_extract.py
+++ b/archivebox/tests/test_cli_extract.py
@@ -5,10 +5,8 @@ Verify extract re-runs extractors on existing snapshots.
"""
import os
-import subprocess
import sqlite3
-
-from .fixtures import *
+import subprocess
def test_extract_runs_on_existing_snapshots(tmp_path, process, disable_extractors_dict):
diff --git a/archivebox/tests/test_cli_help.py b/archivebox/tests/test_cli_help.py
index ccf580b5..be4918dc 100644
--- a/archivebox/tests/test_cli_help.py
+++ b/archivebox/tests/test_cli_help.py
@@ -7,8 +7,6 @@ Verify command runs successfully and produces output.
import os
import subprocess
-from .fixtures import *
-
def test_help_runs_successfully(tmp_path):
"""Test that help command runs and produces output."""
diff --git a/archivebox/tests/test_cli_init.py b/archivebox/tests/test_cli_init.py
index 5761ce5b..e6ce1ef6 100644
--- a/archivebox/tests/test_cli_init.py
+++ b/archivebox/tests/test_cli_init.py
@@ -5,14 +5,11 @@ Verify init creates correct database schema, filesystem structure, and config.
"""
import os
-import subprocess
import sqlite3
-from pathlib import Path
+import subprocess
from archivebox.config.common import STORAGE_CONFIG
-from .fixtures import *
-
DIR_PERMISSIONS = STORAGE_CONFIG.OUTPUT_PERMISSIONS.replace('6', '7').replace('4', '5')
diff --git a/archivebox/tests/test_cli_install.py b/archivebox/tests/test_cli_install.py
index d839772f..c7738468 100644
--- a/archivebox/tests/test_cli_install.py
+++ b/archivebox/tests/test_cli_install.py
@@ -5,12 +5,10 @@ Verify install detects and records binary dependencies in DB.
"""
import os
-import subprocess
import sqlite3
+import subprocess
from pathlib import Path
-from .fixtures import *
-
def test_install_runs_successfully(tmp_path, process):
"""Test that install command runs without error."""
diff --git a/archivebox/tests/test_cli_manage.py b/archivebox/tests/test_cli_manage.py
index ada5e657..70555c44 100644
--- a/archivebox/tests/test_cli_manage.py
+++ b/archivebox/tests/test_cli_manage.py
@@ -6,9 +6,6 @@ Verify manage command runs Django management commands.
import os
import subprocess
-import sqlite3
-
-from .fixtures import *
def test_manage_help_works(tmp_path, process):
diff --git a/archivebox/tests/test_cli_remove.py b/archivebox/tests/test_cli_remove.py
index 7fa66209..5558e576 100644
--- a/archivebox/tests/test_cli_remove.py
+++ b/archivebox/tests/test_cli_remove.py
@@ -5,11 +5,8 @@ Verify remove deletes snapshots from DB and filesystem.
"""
import os
-import subprocess
import sqlite3
-from pathlib import Path
-
-from .fixtures import *
+import subprocess
def test_remove_deletes_snapshot_from_db(tmp_path, process, disable_extractors_dict):
diff --git a/archivebox/tests/test_cli_run.py b/archivebox/tests/test_cli_run.py
index 5181ffd3..7d025b3a 100644
--- a/archivebox/tests/test_cli_run.py
+++ b/archivebox/tests/test_cli_run.py
@@ -8,7 +8,6 @@ Tests cover:
"""
import json
-import pytest
from archivebox.tests.conftest import (
run_archivebox_cmd,
diff --git a/archivebox/tests/test_cli_run_binary_worker.py b/archivebox/tests/test_cli_run_binary_worker.py
index b7d4fc71..7f509bcd 100644
--- a/archivebox/tests/test_cli_run_binary_worker.py
+++ b/archivebox/tests/test_cli_run_binary_worker.py
@@ -10,11 +10,9 @@ Tests cover:
import json
import sqlite3
-import time
from archivebox.tests.conftest import (
run_archivebox_cmd,
- parse_jsonl_output,
)
diff --git a/archivebox/tests/test_cli_schedule.py b/archivebox/tests/test_cli_schedule.py
index 47e32c98..82c1e0b7 100644
--- a/archivebox/tests/test_cli_schedule.py
+++ b/archivebox/tests/test_cli_schedule.py
@@ -5,7 +5,6 @@ import os
import sqlite3
import subprocess
-from .fixtures import process, disable_extractors_dict
def test_schedule_run_all_enqueues_scheduled_crawl(tmp_path, process, disable_extractors_dict):
diff --git a/archivebox/tests/test_cli_search.py b/archivebox/tests/test_cli_search.py
index 1c567f42..7ae757fc 100644
--- a/archivebox/tests/test_cli_search.py
+++ b/archivebox/tests/test_cli_search.py
@@ -6,9 +6,6 @@ Verify search queries snapshots from DB.
import os
import subprocess
-import sqlite3
-
-from .fixtures import *
def test_search_finds_snapshots(tmp_path, process, disable_extractors_dict):
diff --git a/archivebox/tests/test_cli_server.py b/archivebox/tests/test_cli_server.py
index 003119a3..7e31ac6c 100644
--- a/archivebox/tests/test_cli_server.py
+++ b/archivebox/tests/test_cli_server.py
@@ -6,10 +6,6 @@ Verify server can start (basic smoke tests only, no full server testing).
import os
import subprocess
-import signal
-import time
-
-from .fixtures import *
def test_server_shows_usage_info(tmp_path, process):
diff --git a/archivebox/tests/test_cli_shell.py b/archivebox/tests/test_cli_shell.py
index 0c966c5d..818b9c5c 100644
--- a/archivebox/tests/test_cli_shell.py
+++ b/archivebox/tests/test_cli_shell.py
@@ -7,8 +7,6 @@ Verify shell command starts Django shell (basic smoke tests only).
import os
import subprocess
-from .fixtures import *
-
def test_shell_command_exists(tmp_path, process):
"""Test that shell command is recognized."""
diff --git a/archivebox/tests/test_cli_snapshot.py b/archivebox/tests/test_cli_snapshot.py
index 24f35bf7..a05ecc78 100644
--- a/archivebox/tests/test_cli_snapshot.py
+++ b/archivebox/tests/test_cli_snapshot.py
@@ -9,12 +9,10 @@ Tests cover:
"""
import json
-import pytest
from archivebox.tests.conftest import (
run_archivebox_cmd,
parse_jsonl_output,
- assert_jsonl_contains_type,
create_test_url,
)
diff --git a/archivebox/tests/test_cli_status.py b/archivebox/tests/test_cli_status.py
index 97538f5f..b5eb8dc6 100644
--- a/archivebox/tests/test_cli_status.py
+++ b/archivebox/tests/test_cli_status.py
@@ -5,12 +5,10 @@ Verify status reports accurate collection state from DB and filesystem.
"""
import os
-import subprocess
import sqlite3
+import subprocess
from pathlib import Path
-from .fixtures import *
-
def _find_snapshot_dir(data_dir: Path, snapshot_id: str) -> Path | None:
candidates = {snapshot_id}
diff --git a/archivebox/tests/test_cli_update.py b/archivebox/tests/test_cli_update.py
index 1dc71580..05819c57 100644
--- a/archivebox/tests/test_cli_update.py
+++ b/archivebox/tests/test_cli_update.py
@@ -5,10 +5,8 @@ Verify update drains old dirs, reconciles DB, and queues snapshots.
"""
import os
-import subprocess
import sqlite3
-
-from .fixtures import *
+import subprocess
def test_update_runs_successfully_on_empty_archive(tmp_path, process):
diff --git a/archivebox/tests/test_cli_version.py b/archivebox/tests/test_cli_version.py
index 46382e27..eee2362e 100644
--- a/archivebox/tests/test_cli_version.py
+++ b/archivebox/tests/test_cli_version.py
@@ -11,7 +11,9 @@ import tempfile
import subprocess
from pathlib import Path
-from .fixtures import *
+from .fixtures import process
+
+FIXTURES = (process,)
def _archivebox_cli() -> str:
diff --git a/archivebox/tests/test_config.py b/archivebox/tests/test_config.py
index b9c251c7..49e4da45 100644
--- a/archivebox/tests/test_config.py
+++ b/archivebox/tests/test_config.py
@@ -6,7 +6,6 @@ import subprocess
import pytest
-from .fixtures import process, disable_extractors_dict
def test_config_shows_all_config_values(tmp_path, process):
@@ -49,6 +48,7 @@ def test_config_set_value_writes_to_config_file(tmp_path, process):
capture_output=True,
text=True,
)
+ assert result.returncode == 0, result.stderr
# Read the config file directly to verify it was written
config_file = tmp_path / 'ArchiveBox.conf'
diff --git a/archivebox/tests/test_crawl.py b/archivebox/tests/test_crawl.py
index 1b1acd88..6065d675 100644
--- a/archivebox/tests/test_crawl.py
+++ b/archivebox/tests/test_crawl.py
@@ -4,11 +4,9 @@
import os
import subprocess
import sqlite3
-import json
import pytest
-from .fixtures import process, disable_extractors_dict
def test_crawl_creates_crawl_object(tmp_path, process, disable_extractors_dict):
diff --git a/archivebox/tests/test_extract.py b/archivebox/tests/test_extract.py
index 117c922f..47df599e 100644
--- a/archivebox/tests/test_extract.py
+++ b/archivebox/tests/test_extract.py
@@ -8,7 +8,6 @@ import json
import pytest
-from .fixtures import process, disable_extractors_dict
def test_extract_runs_on_snapshot_id(tmp_path, process, disable_extractors_dict):
@@ -231,6 +230,7 @@ def test_extract_multiple_snapshots(tmp_path, process, disable_extractors_dict):
text=True,
env=disable_extractors_dict,
)
+ assert result.returncode == 0, result.stderr
# Should not error
conn = sqlite3.connect('index.sqlite3')
diff --git a/archivebox/tests/test_extractors.py b/archivebox/tests/test_extractors.py
index 3502c7f4..6e2eb521 100644
--- a/archivebox/tests/test_extractors.py
+++ b/archivebox/tests/test_extractors.py
@@ -1,8 +1,12 @@
-from .fixtures import *
import json as pyjson
import sqlite3
+import subprocess
from pathlib import Path
+from .fixtures import disable_extractors_dict, process
+
+FIXTURES = (disable_extractors_dict, process)
+
def _find_snapshot_dir(data_dir: Path, snapshot_id: str) -> Path | None:
candidates = {snapshot_id}
diff --git a/archivebox/tests/test_hooks.py b/archivebox/tests/test_hooks.py
index 271ac6af..e303a515 100755
--- a/archivebox/tests/test_hooks.py
+++ b/archivebox/tests/test_hooks.py
@@ -16,7 +16,7 @@ import subprocess
import tempfile
import unittest
from pathlib import Path
-from unittest.mock import MagicMock, patch
+from unittest.mock import patch
# Set up Django before importing any Django-dependent modules
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
diff --git a/archivebox/tests/test_init.py b/archivebox/tests/test_init.py
index b9d7e130..3a3697bd 100644
--- a/archivebox/tests/test_init.py
+++ b/archivebox/tests/test_init.py
@@ -3,13 +3,13 @@
import os
import subprocess
-from pathlib import Path
-import json, shutil
import sqlite3
from archivebox.config.common import STORAGE_CONFIG
-from .fixtures import *
+from .fixtures import disable_extractors_dict, process
+
+FIXTURES = (disable_extractors_dict, process)
DIR_PERMISSIONS = STORAGE_CONFIG.OUTPUT_PERMISSIONS.replace('6', '7').replace('4', '5')
@@ -25,6 +25,7 @@ def test_add_link(tmp_path, process, disable_extractors_dict):
os.chdir(tmp_path)
add_process = subprocess.run(['archivebox', 'add', '--index-only', 'https://example.com'],
capture_output=True, env=disable_extractors_dict)
+ assert add_process.returncode == 0, add_process.stderr.decode("utf-8")
# In the new architecture, URLs are saved to source files
# Check that a source file was created with the URL
@@ -41,6 +42,7 @@ def test_add_multiple_urls(tmp_path, process, disable_extractors_dict):
os.chdir(tmp_path)
add_process = subprocess.run(['archivebox', 'add', '--index-only', 'https://example.com', 'https://iana.org'],
capture_output=True, env=disable_extractors_dict)
+ assert add_process.returncode == 0, add_process.stderr.decode("utf-8")
# Check that a source file was created with both URLs
sources_dir = tmp_path / "sources"
@@ -61,6 +63,7 @@ def test_correct_permissions_add_command_results(tmp_path, process, disable_extr
os.chdir(tmp_path)
add_process = subprocess.run(['archivebox', 'add', '--index-only', 'https://example.com'], capture_output=True,
env=disable_extractors_dict)
+ assert add_process.returncode == 0, add_process.stderr.decode("utf-8")
# Check database permissions
assert oct((tmp_path / "index.sqlite3").stat().st_mode)[-3:] in (STORAGE_CONFIG.OUTPUT_PERMISSIONS, DIR_PERMISSIONS)
diff --git a/archivebox/tests/test_install.py b/archivebox/tests/test_install.py
index 3106ddb1..af967500 100644
--- a/archivebox/tests/test_install.py
+++ b/archivebox/tests/test_install.py
@@ -7,7 +7,6 @@ import sqlite3
import pytest
-from .fixtures import process, disable_extractors_dict
class TestInstallDryRun:
diff --git a/archivebox/tests/test_list.py b/archivebox/tests/test_list.py
index d527fa5d..2aaad4fa 100644
--- a/archivebox/tests/test_list.py
+++ b/archivebox/tests/test_list.py
@@ -1,7 +1,9 @@
import json
import subprocess
-from .fixtures import *
+from .fixtures import disable_extractors_dict, process
+
+FIXTURES = (disable_extractors_dict, process)
def test_search_json(process, disable_extractors_dict):
subprocess.run(["archivebox", "add", "--index-only", "https://example.com", "--depth=0"],
diff --git a/archivebox/tests/test_migrations_08_to_09.py b/archivebox/tests/test_migrations_08_to_09.py
index c8de3fcf..21bdd134 100644
--- a/archivebox/tests/test_migrations_08_to_09.py
+++ b/archivebox/tests/test_migrations_08_to_09.py
@@ -10,10 +10,8 @@ Migration tests from 0.8.x to 0.9.x.
- New fields like depth, retry_at, etc.
"""
-import json
import shutil
import sqlite3
-import subprocess
import tempfile
import unittest
from pathlib import Path
@@ -579,7 +577,7 @@ class TestFilesystemMigration08to09(unittest.TestCase):
f"Files lost during migration: {files_before_count} -> {files_after_count}")
# Run update to trigger filesystem reorganization
- print(f"\n[*] Running archivebox update to reorganize filesystem...")
+ print("\n[*] Running archivebox update to reorganize filesystem...")
result = run_archivebox(self.work_dir, ['update'], timeout=120)
self.assertEqual(result.returncode, 0, f"Update failed: {result.stderr}")
@@ -657,7 +655,7 @@ class TestFilesystemMigration08to09(unittest.TestCase):
# CRITICAL: Verify sample files exist in new structure
self.assertGreater(len(new_sample_files), 0,
- f"Sample files not found in new structure")
+ "Sample files not found in new structure")
# Verify new path format
for path_key, file_path in new_sample_files.items():
diff --git a/archivebox/tests/test_recursive_crawl.py b/archivebox/tests/test_recursive_crawl.py
index 1872a617..fc61d228 100644
--- a/archivebox/tests/test_recursive_crawl.py
+++ b/archivebox/tests/test_recursive_crawl.py
@@ -10,7 +10,6 @@ from pathlib import Path
import pytest
-from .fixtures import process, disable_extractors_dict, recursive_test_site
def wait_for_db_condition(timeout, condition, interval=0.5):
@@ -77,7 +76,6 @@ def test_background_hooks_dont_block_parser_extractors(tmp_path, process, recurs
"SAVE_ARCHIVEDOTORG": "false",
"SAVE_TITLE": "false",
"SAVE_FAVICON": "true",
- "SAVE_WGET": "false",
})
proc = subprocess.Popen(
diff --git a/archivebox/tests/test_remove.py b/archivebox/tests/test_remove.py
index f9045bcc..078f4e06 100644
--- a/archivebox/tests/test_remove.py
+++ b/archivebox/tests/test_remove.py
@@ -1,7 +1,10 @@
import os
import sqlite3
+import subprocess
-from .fixtures import *
+from .fixtures import disable_extractors_dict, process
+
+FIXTURES = (disable_extractors_dict, process)
def test_remove_single_snapshot(tmp_path, process, disable_extractors_dict):
"""Test removing a snapshot by URL pattern"""
diff --git a/archivebox/tests/test_schedule.py b/archivebox/tests/test_schedule.py
index 9ec5166a..105308fe 100644
--- a/archivebox/tests/test_schedule.py
+++ b/archivebox/tests/test_schedule.py
@@ -7,7 +7,6 @@ import subprocess
import pytest
-from .fixtures import process
def _fetchone(tmp_path, query):
diff --git a/archivebox/tests/test_schedule_e2e.py b/archivebox/tests/test_schedule_e2e.py
new file mode 100644
index 00000000..3cd22d94
--- /dev/null
+++ b/archivebox/tests/test_schedule_e2e.py
@@ -0,0 +1,420 @@
+#!/usr/bin/env python3
+"""End-to-end tests for scheduling across CLI, server, API, and web UI."""
+
+import os
+import socket
+import sqlite3
+import subprocess
+import sys
+import textwrap
+import time
+from pathlib import Path
+
+import pytest
+import requests
+
+from .conftest import run_python_cwd
+
+
+REPO_ROOT = Path(__file__).resolve().parents[2]
+
+
+def init_archive(cwd: Path) -> None:
+    """Run ``archivebox init --quick`` with the current interpreter in ``cwd``.
+
+    Fails the calling test with the command's stderr when the exit code is
+    non-zero. The command is given 60 seconds to finish.
+    """
+    init_cmd = [sys.executable, '-m', 'archivebox', 'init', '--quick']
+    completed = subprocess.run(init_cmd, cwd=cwd, capture_output=True, text=True, timeout=60)
+    assert completed.returncode == 0, completed.stderr
+
+
+def build_test_env(port: int, **extra: str) -> dict[str, str]:
+    """Build the environment mapping for ArchiveBox subprocesses started by these tests.
+
+    The result is a copy of ``os.environ`` without ``DATA_DIR``, overlaid with
+    the fixed settings below (``SAVE_WGET`` is the only ``SAVE_*`` flag set to
+    ``'True'``), and finally with ``extra`` so callers can override any key.
+
+    Args:
+        port: Port interpolated into ``LISTEN_HOST`` and ``CSRF_TRUSTED_ORIGINS``.
+        **extra: Additional or overriding environment variables.
+
+    Returns:
+        A new ``dict``; ``os.environ`` itself is not modified.
+    """
+    inherited = {name: value for name, value in os.environ.items() if name != 'DATA_DIR'}
+    test_settings = {
+        'LISTEN_HOST': f'archivebox.localhost:{port}',
+        'ALLOWED_HOSTS': '*',
+        'CSRF_TRUSTED_ORIGINS': f'http://admin.archivebox.localhost:{port}',
+        'PUBLIC_ADD_VIEW': 'True',
+        'USE_COLOR': 'False',
+        'SHOW_PROGRESS': 'False',
+        'TIMEOUT': '20',
+        'URL_ALLOWLIST': r'127\.0\.0\.1[:/].*',
+        'SAVE_ARCHIVEDOTORG': 'False',
+        'SAVE_TITLE': 'False',
+        'SAVE_FAVICON': 'False',
+        'SAVE_WARC': 'False',
+        'SAVE_PDF': 'False',
+        'SAVE_SCREENSHOT': 'False',
+        'SAVE_DOM': 'False',
+        'SAVE_SINGLEFILE': 'False',
+        'SAVE_READABILITY': 'False',
+        'SAVE_MERCURY': 'False',
+        'SAVE_GIT': 'False',
+        'SAVE_YTDLP': 'False',
+        'SAVE_HEADERS': 'False',
+        'SAVE_HTMLTOTEXT': 'False',
+        'SAVE_WGET': 'True',
+        'USE_CHROME': 'False',
+    }
+    # Later mappings win: fixed test settings override the inherited
+    # environment, and caller-supplied extras override both.
+    return {**inherited, **test_settings, **extra}
+
+
+def get_free_port() -> int:
+    """Return a TCP port number that the OS assigned on 127.0.0.1.
+
+    Binding to port 0 lets the OS pick the port. The probing socket is closed
+    before returning, so the port is not held open for the caller.
+    """
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
+        probe.bind(('127.0.0.1', 0))
+        _host, assigned_port = probe.getsockname()
+    return assigned_port
+
+
+def start_server(cwd: Path, env: dict[str, str], port: int) -> None:
+    """Run ``archivebox server --daemonize 127.0.0.1:<port>`` in ``cwd``.
+
+    The command runs with the current interpreter and the given ``env``, and
+    is given 60 seconds to return. Fails the calling test with the command's
+    stderr when the exit code is non-zero.
+    """
+    server_cmd = [sys.executable, '-m', 'archivebox', 'server', '--daemonize', f'127.0.0.1:{port}']
+    completed = subprocess.run(server_cmd, cwd=cwd, env=env, capture_output=True, text=True, timeout=60)
+    assert completed.returncode == 0, completed.stderr
+
+
+def stop_server(cwd: Path) -> None:
+    """Run a script in ``cwd`` that calls ``stop_existing_supervisord_process()``.
+
+    The script sets up Django with ``archivebox.settings`` first and is passed
+    to ``run_python_cwd`` with a 30 second timeout. Presumably this stops the
+    supervisord started by ``start_server`` -- confirm against
+    ``archivebox.workers.supervisord_util``.
+
+    The helper's return value is not inspected here, so this function does
+    not itself report a failed shutdown.
+    """
+    script = textwrap.dedent(
+        """
+        import os
+        os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
+        import django
+        django.setup()
+        from archivebox.workers.supervisord_util import stop_existing_supervisord_process
+        stop_existing_supervisord_process()
+        print('stopped')
+        """
+    )
+    run_python_cwd(script, cwd=cwd, timeout=30)
+
+
+def wait_for_http(port: int, host: str, path: str = '/', timeout: int = 30) -> requests.Response:
+    """Poll ``http://127.0.0.1:<port><path>`` until it answers without a server error.
+
+    Args:
+        port: Local port to connect to.
+        host: Value sent as the ``Host`` header.
+        path: Request path appended to the base URL.
+        timeout: Overall number of seconds to keep polling.
+
+    Returns:
+        The first response whose status code is below 500. Redirects are not
+        followed, so this may be a 3xx response.
+
+    Raises:
+        AssertionError: If no such response arrives before the deadline. The
+            message carries the last request exception or the last 5xx status.
+    """
+    # Monotonic clock: the deadline must not move if the wall clock is adjusted.
+    deadline = time.monotonic() + timeout
+    last_failure = None
+    while time.monotonic() < deadline:
+        try:
+            response = requests.get(
+                f'http://127.0.0.1:{port}{path}',
+                headers={'Host': host},
+                timeout=2,
+                allow_redirects=False,
+            )
+        except requests.RequestException as exc:
+            last_failure = exc
+        else:
+            if response.status_code < 500:
+                return response
+            # Remember the 5xx status so a timeout does not report "None".
+            last_failure = f'HTTP {response.status_code}'
+        time.sleep(0.5)
+    raise AssertionError(f'Timed out waiting for HTTP on {host}: {last_failure}')
+
+
+def make_latest_schedule_due(cwd: Path) -> None:
+    """Backdate the template crawl of the newest schedule by two days.
+
+    Works directly on ``cwd/index.sqlite3``. It sets ``created_at`` and
+    ``modified_at`` of the ``crawls_crawl`` row referenced by ``template_id``
+    of the most recently created ``crawls_crawlschedule`` row to two days
+    before now. Presumably this makes a daily schedule count as due --
+    confirm against the scheduler logic.
+    """
+    backdate_template_sql = """
+        UPDATE crawls_crawl
+        SET created_at = datetime('now', '-2 day'),
+            modified_at = datetime('now', '-2 day')
+        WHERE id = (
+            SELECT template_id
+            FROM crawls_crawlschedule
+            ORDER BY created_at DESC
+            LIMIT 1
+        )
+    """
+    db = sqlite3.connect(cwd / 'index.sqlite3')
+    try:
+        db.execute(backdate_template_sql)
+        db.commit()
+    finally:
+        # Close the connection whether or not the update succeeded.
+        db.close()
+
+
+def get_snapshot_file_text(cwd: Path, url: str) -> str:
+    """Return the text of one captured file from the newest snapshot of ``url``.
+
+    Runs a script in ``cwd`` via ``run_python_cwd``. The script:
+
+    * loads the most recently created ``Snapshot`` with that URL and asserts
+      that it exists and that its status is ``'sealed'``;
+    * collects files under the snapshot's output directory, trying the
+      preferred glob patterns in order (wget, then trafilatura, then defuddle);
+    * if none match, falls back to any ``.html``/``.htm``/``.txt`` file outside
+      the top-level ``responses`` directory, skipping ``stdout.log``,
+      ``stderr.log`` and ``cmd.sh``;
+    * prints the first collected file, read with decoding errors ignored.
+
+    Returns:
+        The script's stdout as returned by ``run_python_cwd``.
+
+    Raises:
+        AssertionError: With the script's stderr, if the script exits non-zero
+            (for example because one of its own assertions failed).
+    """
+    # ``{url!r}`` embeds the URL as a Python string literal in the script;
+    # doubled braces are literal braces for the script's own f-string.
+    script = textwrap.dedent(
+        f"""
+        import os
+        from pathlib import Path
+
+        os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
+        import django
+        django.setup()
+
+        from archivebox.core.models import Snapshot
+
+        snapshot = Snapshot.objects.filter(url={url!r}).order_by('-created_at').first()
+        assert snapshot is not None, 'missing snapshot'
+        assert snapshot.status == 'sealed', snapshot.status
+
+        snapshot_dir = Path(snapshot.output_dir)
+        candidates = []
+        preferred_patterns = (
+            'wget/**/index.html',
+            'wget/**/*.html',
+            'trafilatura/content.html',
+            'trafilatura/content.txt',
+            'defuddle/content.html',
+            'defuddle/content.txt',
+        )
+        for pattern in preferred_patterns:
+            for candidate in snapshot_dir.glob(pattern):
+                if candidate.is_file():
+                    candidates.append(candidate)
+
+        if not candidates:
+            for candidate in snapshot_dir.rglob('*'):
+                if not candidate.is_file():
+                    continue
+                rel = candidate.relative_to(snapshot_dir)
+                if rel.parts and rel.parts[0] == 'responses':
+                    continue
+                if candidate.suffix not in ('.html', '.htm', '.txt'):
+                    continue
+                if candidate.name in ('stdout.log', 'stderr.log', 'cmd.sh'):
+                    continue
+                candidates.append(candidate)
+
+        assert candidates, f'no captured html/txt files found in {{snapshot_dir}}'
+        print(candidates[0].read_text(errors='ignore'))
+        """
+    )
+    stdout, stderr, code = run_python_cwd(script, cwd=cwd, timeout=60)
+    # Surface the script's stderr as the assertion message on failure.
+    assert code == 0, stderr
+    return stdout
+
+
+def wait_for_snapshot_capture(cwd: Path, url: str, timeout: int = 180) -> str:
+    """Retry ``get_snapshot_file_text`` until it succeeds or ``timeout`` seconds pass.
+
+    Args:
+        cwd: Directory passed through to ``get_snapshot_file_text``.
+        url: Snapshot URL passed through to ``get_snapshot_file_text``.
+        timeout: Overall number of seconds to keep retrying.
+
+    Returns:
+        The text returned by the first successful ``get_snapshot_file_text`` call.
+
+    Raises:
+        AssertionError: If every attempt failed before the deadline. The last
+            attempt's ``AssertionError`` is attached as the cause.
+    """
+    # Monotonic clock: the deadline must not move if the wall clock is adjusted.
+    deadline = time.monotonic() + timeout
+    last_error = None
+    while time.monotonic() < deadline:
+        try:
+            return get_snapshot_file_text(cwd, url)
+        except AssertionError as err:
+            # Only assertion failures are retried; other errors propagate.
+            last_error = err
+        time.sleep(2)
+    # Chain the last probe failure so its traceback is shown with the timeout.
+    raise AssertionError(f'timed out waiting for captured content for {url}: {last_error}') from last_error
+
+
+def get_counts(cwd: Path, scheduled_url: str, one_shot_url: str) -> tuple[int, int, int]:
+    """Count snapshot and scheduled-crawl rows in ``cwd/index.sqlite3``.
+
+    Returns:
+        A tuple of three counts, in this order:
+        ``core_snapshot`` rows whose ``url`` is ``scheduled_url``;
+        ``core_snapshot`` rows whose ``url`` is ``one_shot_url``;
+        ``crawls_crawl`` rows with a non-NULL ``schedule_id`` whose ``urls``
+        equals ``scheduled_url``.
+    """
+    snapshots_by_url_sql = "SELECT COUNT(*) FROM core_snapshot WHERE url = ?"
+    scheduled_crawls_by_url_sql = """
+        SELECT COUNT(*)
+        FROM crawls_crawl
+        WHERE schedule_id IS NOT NULL
+          AND urls = ?
+    """
+    db = sqlite3.connect(cwd / 'index.sqlite3')
+
+    def count_rows(sql: str, url: str) -> int:
+        # Each query selects a single COUNT(*) column.
+        return db.execute(sql, (url,)).fetchone()[0]
+
+    try:
+        return (
+            count_rows(snapshots_by_url_sql, scheduled_url),
+            count_rows(snapshots_by_url_sql, one_shot_url),
+            count_rows(scheduled_crawls_by_url_sql, scheduled_url),
+        )
+    finally:
+        db.close()
+
+
+def create_admin_and_token(cwd: Path) -> str:
+    """Ensure the ``apitestadmin`` superuser exists and return a new API token.
+
+    Runs a script in ``cwd`` via ``run_python_cwd``. The script gets or creates
+    the user, forces ``is_staff``/``is_superuser`` on, sets the password to
+    ``testpass123``, creates an ``APIToken`` for that user expiring one day
+    from now, and prints ``token.token``.
+
+    Returns:
+        The last line of the script's stripped stdout, so any lines printed
+        before the token are ignored.
+
+    Raises:
+        AssertionError: With the script's stderr, if the script exits non-zero.
+    """
+    script = textwrap.dedent(
+        """
+        import os
+        from datetime import timedelta
+        from django.utils import timezone
+
+        os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
+        import django
+        django.setup()
+
+        from django.contrib.auth import get_user_model
+        from archivebox.api.models import APIToken
+
+        User = get_user_model()
+        user, _ = User.objects.get_or_create(
+            username='apitestadmin',
+            defaults={
+                'email': 'apitestadmin@example.com',
+                'is_staff': True,
+                'is_superuser': True,
+            },
+        )
+        user.is_staff = True
+        user.is_superuser = True
+        user.set_password('testpass123')
+        user.save()
+
+        token = APIToken.objects.create(
+            created_by=user,
+            expires=timezone.now() + timedelta(days=1),
+        )
+        print(token.token)
+        """
+    )
+    stdout, stderr, code = run_python_cwd(script, cwd=cwd, timeout=60)
+    assert code == 0, stderr
+    # The token is the script's final print; take only the last output line.
+    return stdout.strip().splitlines()[-1]
+
+
+@pytest.mark.timeout(180)
+def test_server_processes_due_cli_schedule_and_saves_real_content(tmp_path, recursive_test_site):
+    """A CLI-created schedule that has been backdated is captured by the running server.
+
+    Creates a daily schedule for the fixture site's root URL, backdates its
+    template crawl, starts the daemonized server, and waits for captured text
+    of that URL containing both 'Root' and 'About'.
+    """
+    os.chdir(tmp_path)
+    init_archive(tmp_path)
+
+    port = get_free_port()
+    env = build_test_env(port)
+
+    schedule_result = subprocess.run(
+        [sys.executable, '-m', 'archivebox', 'schedule', '--every=daily', '--depth=0', recursive_test_site['root_url']],
+        cwd=tmp_path,
+        capture_output=True,
+        text=True,
+        env=env,
+        timeout=60,
+    )
+    assert schedule_result.returncode == 0, schedule_result.stderr
+    assert 'Created scheduled crawl' in schedule_result.stdout
+
+    make_latest_schedule_due(tmp_path)
+
+    try:
+        start_server(tmp_path, env=env, port=port)
+        wait_for_http(port, host=f'web.archivebox.localhost:{port}')
+        # NOTE(review): this 180s wait equals the whole-test timeout in the
+        # decorator above, so the pytest timeout will likely fire before this
+        # helper can raise its own timeout error -- consider a smaller value.
+        captured_text = wait_for_snapshot_capture(tmp_path, recursive_test_site['root_url'], timeout=180)
+        assert 'Root' in captured_text
+        assert 'About' in captured_text
+    finally:
+        # Always attempt to stop the server, even when an assertion above fails.
+        stop_server(tmp_path)
+
+
+@pytest.mark.timeout(180)
+def test_archivebox_add_remains_one_shot_even_when_schedule_is_due(tmp_path, recursive_test_site):
+    """``archivebox add`` captures only its own URL while a backdated schedule exists.
+
+    No server is started. After adding the one-shot URL, the database must
+    hold at least one snapshot of it, no snapshot of the scheduled URL, and
+    exactly one schedule-linked crawl for the scheduled URL.
+    """
+    os.chdir(tmp_path)
+    init_archive(tmp_path)
+
+    port = get_free_port()
+    env = build_test_env(port)
+    scheduled_url = recursive_test_site['root_url']
+    one_shot_url = recursive_test_site['child_urls'][0]
+
+    schedule_result = subprocess.run(
+        [sys.executable, '-m', 'archivebox', 'schedule', '--every=daily', '--depth=0', scheduled_url],
+        cwd=tmp_path,
+        capture_output=True,
+        text=True,
+        env=env,
+        timeout=60,
+    )
+    assert schedule_result.returncode == 0, schedule_result.stderr
+
+    make_latest_schedule_due(tmp_path)
+
+    add_result = subprocess.run(
+        [sys.executable, '-m', 'archivebox', 'add', '--depth=0', '--plugins=wget', one_shot_url],
+        cwd=tmp_path,
+        capture_output=True,
+        text=True,
+        env=env,
+        timeout=120,
+    )
+    assert add_result.returncode == 0, add_result.stderr
+    captured_text = wait_for_snapshot_capture(tmp_path, one_shot_url, timeout=120)
+    # NOTE(review): 'Deep About' contains 'About', so the first clause is
+    # redundant; this is equivalent to `'About' in captured_text`.
+    assert 'Deep About' in captured_text or 'About' in captured_text
+
+    scheduled_snapshots, one_shot_snapshots, scheduled_crawls = get_counts(tmp_path, scheduled_url, one_shot_url)
+    assert one_shot_snapshots >= 1
+    assert scheduled_snapshots == 0
+    assert scheduled_crawls == 1  # template only, no materialized scheduled run
+
+
+@pytest.mark.timeout(180)
+def test_schedule_rest_api_works_over_running_server(tmp_path, recursive_test_site):
+    """POSTing to ``/api/v1/cli/schedule`` on a running server creates one schedule.
+
+    Authenticates with a token from ``create_admin_and_token`` and expects a
+    200 JSON response reporting success and exactly one created schedule id.
+    """
+    os.chdir(tmp_path)
+    init_archive(tmp_path)
+
+    port = get_free_port()
+    env = build_test_env(port)
+    api_token = create_admin_and_token(tmp_path)
+
+    try:
+        start_server(tmp_path, env=env, port=port)
+        wait_for_http(port, host=f'api.archivebox.localhost:{port}', path='/api/v1/docs')
+
+        # Connect to 127.0.0.1 directly and select the API site via the Host header.
+        response = requests.post(
+            f'http://127.0.0.1:{port}/api/v1/cli/schedule',
+            headers={
+                'Host': f'api.archivebox.localhost:{port}',
+                'X-ArchiveBox-API-Key': api_token,
+            },
+            json={
+                'every': 'daily',
+                'import_path': recursive_test_site['root_url'],
+                'quiet': True,
+            },
+            timeout=10,
+        )
+
+        assert response.status_code == 200, response.text
+        payload = response.json()
+        assert payload['success'] is True
+        assert payload['result_format'] == 'json'
+        assert len(payload['result']['created_schedule_ids']) == 1
+    finally:
+        # Always attempt to stop the server, even when an assertion above fails.
+        stop_server(tmp_path)
+
+
+@pytest.mark.timeout(180)
+def test_schedule_web_ui_post_works_over_running_server(tmp_path, recursive_test_site):
+    """POSTing the ``/add/`` form with a schedule stores a schedule and its crawl.
+
+    Submits the form without credentials (``PUBLIC_ADD_VIEW`` is 'True'),
+    expects a redirect, then checks the newest ``crawls_crawlschedule`` row
+    joined to its crawl for the submitted schedule, URL and tag.
+    """
+    os.chdir(tmp_path)
+    init_archive(tmp_path)
+
+    port = get_free_port()
+    # NOTE(review): build_test_env already sets PUBLIC_ADD_VIEW='True'; passing
+    # it again only makes this test's dependency on it explicit.
+    env = build_test_env(port, PUBLIC_ADD_VIEW='True')
+
+    try:
+        start_server(tmp_path, env=env, port=port)
+        wait_for_http(port, host=f'web.archivebox.localhost:{port}', path='/add/')
+
+        response = requests.post(
+            f'http://127.0.0.1:{port}/add/',
+            headers={'Host': f'web.archivebox.localhost:{port}'},
+            data={
+                'url': recursive_test_site['root_url'],
+                'depth': '0',
+                'schedule': 'daily',
+                'tag': 'web-ui',
+                'notes': 'created from web ui',
+            },
+            timeout=10,
+            allow_redirects=False,
+        )
+
+        # Redirects are not followed, so a successful submit shows up as 302/303.
+        assert response.status_code in (302, 303), response.text
+
+        conn = sqlite3.connect(tmp_path / 'index.sqlite3')
+        try:
+            row = conn.execute(
+                """
+                SELECT cs.schedule, c.urls, c.tags_str
+                FROM crawls_crawlschedule cs
+                JOIN crawls_crawl c ON c.schedule_id = cs.id
+                ORDER BY cs.created_at DESC
+                LIMIT 1
+                """
+            ).fetchone()
+        finally:
+            conn.close()
+
+        assert row == ('daily', recursive_test_site['root_url'], 'web-ui')
+    finally:
+        # Always attempt to stop the server, even when an assertion above fails.
+        stop_server(tmp_path)
diff --git a/archivebox/tests/test_search.py b/archivebox/tests/test_search.py
index 31d944db..9b141be8 100644
--- a/archivebox/tests/test_search.py
+++ b/archivebox/tests/test_search.py
@@ -3,12 +3,9 @@
import os
import subprocess
-import sqlite3
-import json
import pytest
-from .fixtures import process, disable_extractors_dict
def test_search_returns_snapshots(tmp_path, process, disable_extractors_dict):
diff --git a/archivebox/tests/test_snapshot.py b/archivebox/tests/test_snapshot.py
index 8d2fc3fc..46b4f09b 100644
--- a/archivebox/tests/test_snapshot.py
+++ b/archivebox/tests/test_snapshot.py
@@ -6,13 +6,11 @@ import subprocess
import sqlite3
from archivebox.machine.models import Process
from datetime import datetime
-from pathlib import Path
from urllib.parse import urlparse
import uuid
import pytest
-from .fixtures import process, disable_extractors_dict
def test_snapshot_creates_snapshot_with_correct_url(tmp_path, process, disable_extractors_dict):
@@ -46,9 +44,7 @@ def test_snapshot_creates_snapshot_with_correct_url(tmp_path, process, disable_e
snapshot_id_raw, snapshot_created_at, snapshot_url, crawl_id = snapshot_row
snapshot_id = str(uuid.UUID(snapshot_id_raw))
- crawl_id, crawl_created_at, crawl_urls, crawl_created_by_id = crawl_row
username = user_row[0]
- crawl_date_str = datetime.fromisoformat(crawl_created_at).strftime('%Y%m%d')
snapshot_date_str = datetime.fromisoformat(snapshot_created_at).strftime('%Y%m%d')
domain = urlparse(snapshot_url).hostname or 'unknown'
diff --git a/archivebox/tests/test_status.py b/archivebox/tests/test_status.py
index 2599f053..9035374d 100644
--- a/archivebox/tests/test_status.py
+++ b/archivebox/tests/test_status.py
@@ -3,11 +3,9 @@
import os
import subprocess
-import sqlite3
import pytest
-from .fixtures import process, disable_extractors_dict
def test_status_shows_index_info(tmp_path, process):
diff --git a/archivebox/tests/test_title.py b/archivebox/tests/test_title.py
index d43ae954..883a4a8c 100644
--- a/archivebox/tests/test_title.py
+++ b/archivebox/tests/test_title.py
@@ -1,7 +1,10 @@
import os
import sqlite3
+import subprocess
-from .fixtures import *
+from .fixtures import disable_extractors_dict, process
+
+FIXTURES = (disable_extractors_dict, process)
def test_title_is_extracted(tmp_path, process, disable_extractors_dict):
"""Test that title is extracted from the page."""
diff --git a/archivebox/tests/test_update.py b/archivebox/tests/test_update.py
index 6054f207..e866d811 100644
--- a/archivebox/tests/test_update.py
+++ b/archivebox/tests/test_update.py
@@ -1,7 +1,10 @@
import json
import sqlite3
+import subprocess
-from .fixtures import *
+from .fixtures import disable_extractors_dict, process
+
+FIXTURES = (disable_extractors_dict, process)
def test_update_imports_orphaned_snapshots(tmp_path, process, disable_extractors_dict):
"""Test that archivebox update imports real legacy archive directories."""
diff --git a/archivebox/tests/test_version.py b/archivebox/tests/test_version.py
index 38fa2ba0..7ad7705d 100644
--- a/archivebox/tests/test_version.py
+++ b/archivebox/tests/test_version.py
@@ -3,11 +3,9 @@
import os
import subprocess
-import json
import pytest
-from .fixtures import process, disable_extractors_dict
class TestVersionQuiet:
diff --git a/archivebox/tests/test_worker_config_propagation.py b/archivebox/tests/test_worker_config_propagation.py
index dbb1bfe3..32eb2759 100644
--- a/archivebox/tests/test_worker_config_propagation.py
+++ b/archivebox/tests/test_worker_config_propagation.py
@@ -18,11 +18,9 @@ Config priority order (highest to lowest):
"""
import os
-import json
import sys
import tempfile
import subprocess
-import time
from pathlib import Path
@@ -45,7 +43,7 @@ def test_config_propagation_through_worker_hierarchy():
data_dir.mkdir()
print(f"\n{'='*80}")
- print(f"Test: Config Propagation Through Worker Hierarchy")
+ print("Test: Config Propagation Through Worker Hierarchy")
print(f"DATA_DIR: {data_dir}")
print(f"{'='*80}\n")
@@ -63,7 +61,7 @@ def test_config_propagation_through_worker_hierarchy():
timeout=60,
)
assert result.returncode == 0, f"Init failed: {result.stderr.decode()}"
- print(f"✓ Archive initialized\n")
+ print("✓ Archive initialized\n")
# Step 2: Write custom config to ArchiveBox.conf
print("Step 2: Write custom config to ArchiveBox.conf")
@@ -90,7 +88,7 @@ SAVE_TITLE = True
SAVE_FAVICON = True
SAVE_SCREENSHOT = True
""")
- print(f"✓ Wrote config file with TIMEOUT=999, all plugins enabled\n")
+ print("✓ Wrote config file with TIMEOUT=999, all plugins enabled\n")
# Step 2.5: Set Machine.config values
print("Step 2.5: Set Machine.config with custom binary path")
@@ -123,7 +121,7 @@ print(f"Machine {{machine.hostname}} config updated")
timeout=30,
)
assert result.returncode == 0, f"Set machine config failed: {result.stderr.decode()}"
- print(f"✓ Set Machine.config with CUSTOM_MACHINE_KEY=from_machine_config, WGET_BINARY=/custom/machine/wget\n")
+ print("✓ Set Machine.config with CUSTOM_MACHINE_KEY=from_machine_config, WGET_BINARY=/custom/machine/wget\n")
# Step 3: Create Crawl via Django ORM with custom crawl.config
print("Step 3: Create Crawl with custom crawl.config JSON")
@@ -421,7 +419,7 @@ def test_config_environment_variable_parsing():
data_dir.mkdir()
print(f"\n{'='*80}")
- print(f"Test: Config Environment Variable Parsing")
+ print("Test: Config Environment Variable Parsing")
print(f"DATA_DIR: {data_dir}")
print(f"{'='*80}\n")
@@ -557,7 +555,7 @@ def test_parent_environment_preserved_in_hooks():
data_dir.mkdir()
print(f"\n{'='*80}")
- print(f"Test: Parent Environment Preserved in Hooks")
+ print("Test: Parent Environment Preserved in Hooks")
print(f"DATA_DIR: {data_dir}")
print(f"{'='*80}\n")
@@ -575,7 +573,7 @@ def test_parent_environment_preserved_in_hooks():
timeout=60,
)
assert result.returncode == 0, f"Init failed: {result.stderr.decode()}"
- print(f"✓ Archive initialized\n")
+ print("✓ Archive initialized\n")
# Create snapshot
print("Step 2: Create Snapshot")
@@ -635,7 +633,6 @@ print(snapshot.id)
timeout=120,
)
- stdout = result.stdout.decode()
stderr = result.stderr.decode()
print("\n--- SnapshotWorker stderr (first 50 lines) ---")
@@ -760,7 +757,7 @@ def test_config_auto_fetch_relationships():
data_dir.mkdir()
print(f"\n{'='*80}")
- print(f"Test: Config Auto-Fetch Relationships")
+ print("Test: Config Auto-Fetch Relationships")
print(f"DATA_DIR: {data_dir}")
print(f"{'='*80}\n")
@@ -778,7 +775,7 @@ def test_config_auto_fetch_relationships():
timeout=60,
)
assert result.returncode == 0, f"Init failed: {result.stderr.decode()}"
- print(f"✓ Archive initialized\n")
+ print("✓ Archive initialized\n")
# Create objects with config at each level
print("Step 2: Create Crawl -> Snapshot -> ArchiveResult with config at each level")
@@ -906,7 +903,7 @@ def test_config_precedence_with_environment_vars():
data_dir.mkdir()
print(f"\n{'='*80}")
- print(f"Test: Config Precedence with Environment Variables")
+ print("Test: Config Precedence with Environment Variables")
print(f"DATA_DIR: {data_dir}")
print(f"{'='*80}\n")
@@ -1006,7 +1003,7 @@ def test_new_environment_variables_added():
data_dir.mkdir()
print(f"\n{'='*80}")
- print(f"Test: New Environment Variables Added to Config")
+ print("Test: New Environment Variables Added to Config")
print(f"DATA_DIR: {data_dir}")
print(f"{'='*80}\n")
diff --git a/archivebox/workers/orchestrator.py b/archivebox/workers/orchestrator.py
index d969acc9..9720cde4 100644
--- a/archivebox/workers/orchestrator.py
+++ b/archivebox/workers/orchestrator.py
@@ -94,10 +94,10 @@ class Orchestrator:
self.POLL_INTERVAL = 0.25
# Exit quickly once idle in foreground mode
self.IDLE_TIMEOUT = 1
-
+
def __repr__(self) -> str:
return f'[underline]Orchestrator[/underline]\\[pid={self.pid}]'
-
+
@classmethod
def is_running(cls) -> bool:
"""Check if an orchestrator is already running."""
@@ -223,7 +223,7 @@ class Orchestrator:
process_type=Process.TypeChoices.WORKER,
status=Process.StatusChoices.RUNNING,
)
-
+
def should_spawn_worker(self, WorkerClass: Type[Worker], queue_count: int) -> bool:
"""Determine if we should spawn a new worker."""
if queue_count == 0:
@@ -253,7 +253,7 @@ class Orchestrator:
return False
return True
-
+
def spawn_worker(self, WorkerClass: Type[Worker]) -> int | None:
"""Spawn a new worker process. Returns PID or None if spawn failed."""
try:
@@ -286,7 +286,10 @@ class Orchestrator:
print(f'[yellow]DEBUG spawn_worker: elapsed={elapsed:.1f}s pid={pid} orchestrator_id={self.db_process.id}[/yellow]')
print(f'[yellow] Found {len(all_procs)} Process records for pid={pid}[/yellow]')
for p in all_procs:
- print(f'[yellow] -> type={p.process_type} status={p.status} parent_id={p.parent_id} match={p.parent_id == self.db_process.id}[/yellow]')
+ print(
+ f'[yellow] -> type={p.process_type} status={p.status} '
+ f'parent_id={p.parent_id} match={p.parent_id == self.db_process.id}[/yellow]'
+ )
worker_process = Process.objects.filter(
pid=pid,
@@ -324,7 +327,7 @@ class Orchestrator:
error=e,
)
return None
-
+
def check_queues_and_spawn_workers(self) -> dict[str, int]:
"""
Check Binary and Crawl queues and spawn workers as needed.
@@ -584,11 +587,11 @@ class Orchestrator:
def has_pending_work(self, queue_sizes: dict[str, int]) -> bool:
"""Check if any queue has pending work."""
return any(count > 0 for count in queue_sizes.values())
-
+
def has_running_workers(self) -> bool:
"""Check if any workers are still running."""
return self.get_total_worker_count() > 0
-
+
def has_future_work(self) -> bool:
"""Check if there's work scheduled for the future (retry_at > now) in Crawl queue."""
from archivebox.crawls.models import Crawl
@@ -605,38 +608,38 @@ class Orchestrator:
qs = qs.filter(id=self.crawl_id)
return qs.count() > 0
-
+
def on_tick(self, queue_sizes: dict[str, int]) -> None:
"""Called each orchestrator tick. Override for custom behavior."""
# Tick logging suppressed to reduce noise
pass
-
+
def on_idle(self) -> None:
"""Called when orchestrator is idle (no work, no workers)."""
# Idle logging suppressed to reduce noise
pass
-
+
def should_exit(self, queue_sizes: dict[str, int]) -> bool:
"""Determine if orchestrator should exit."""
if not self.exit_on_idle:
return False
-
+
if self.IDLE_TIMEOUT == 0:
return False
-
+
# Don't exit if there's pending or future work
if self.has_pending_work(queue_sizes):
return False
-
+
if self.has_running_workers():
return False
-
+
if self.has_future_work():
return False
-
+
# Exit after idle timeout
return self.idle_count >= self.IDLE_TIMEOUT
-
+
def runloop(self) -> None:
"""Main orchestrator loop."""
from rich.live import Live
@@ -702,7 +705,7 @@ class Orchestrator:
os.close(devnull_fd)
os.close(stdout_for_restore)
os.close(stderr_for_restore)
- except:
+ except OSError:
pass
# stdout_for_console is closed by orchestrator_console
@@ -1132,7 +1135,6 @@ class Orchestrator:
# Count hooks by status for debugging
queued = snapshot.archiveresult_set.filter(status='queued').count()
- started = snapshot.archiveresult_set.filter(status='started').count()
# Find currently running hook (ordered by hook_name to get lowest step number)
current_ar = snapshot.archiveresult_set.filter(status='started').order_by('hook_name').first()
@@ -1211,7 +1213,7 @@ class Orchestrator:
for snapshot_id in list(snapshot_progress.keys()):
if snapshot_id not in active_ids:
progress_layout.log_event(
- f"Snapshot completed/removed",
+ "Snapshot completed/removed",
style="blue"
)
if snapshot_id in snapshot_progress:
@@ -1263,7 +1265,7 @@ class Orchestrator:
raise
else:
self.on_shutdown()
-
+
def start(self) -> int:
"""
Fork orchestrator as a background process.
@@ -1285,7 +1287,7 @@ class Orchestrator:
pid=proc.pid,
)
return proc.pid
-
+
@classmethod
def get_or_start(cls, exit_on_idle: bool = True) -> 'Orchestrator':
"""
@@ -1296,6 +1298,6 @@ class Orchestrator:
print('[grey53]👨✈️ Orchestrator already running[/grey53]')
# Return a placeholder - actual orchestrator is in another process
return cls(exit_on_idle=exit_on_idle)
-
+
orchestrator = cls(exit_on_idle=exit_on_idle)
return orchestrator
diff --git a/archivebox/workers/supervisord_util.py b/archivebox/workers/supervisord_util.py
index f4d7aa02..b85865cc 100644
--- a/archivebox/workers/supervisord_util.py
+++ b/archivebox/workers/supervisord_util.py
@@ -2,7 +2,6 @@ __package__ = 'archivebox.workers'
import sys
import time
-import signal
import socket
import psutil
import shutil
@@ -42,7 +41,7 @@ ORCHESTRATOR_WORKER = {
SERVER_WORKER = lambda host, port: {
"name": "worker_daphne",
- "command": f"daphne --bind={host} --port={port} --application-close-timeout=600 archivebox.core.asgi:application",
+ "command": f"{sys.executable} -m daphne --bind={host} --port={port} --application-close-timeout=600 archivebox.core.asgi:application",
"autostart": "false",
"autorestart": "true",
"stdout_logfile": "logs/worker_daphne.log",
@@ -513,8 +512,6 @@ def watch_worker(supervisor, daemon_name, interval=5):
def start_server_workers(host='0.0.0.0', port='8000', daemonize=False):
- global _supervisord_proc
-
supervisor = get_or_create_supervisord_process(daemonize=daemonize)
bg_workers = [
@@ -551,8 +548,6 @@ def start_server_workers(host='0.0.0.0', port='8000', daemonize=False):
def start_cli_workers(watch=False):
- global _supervisord_proc
-
supervisor = get_or_create_supervisord_process(daemonize=False)
start_worker(supervisor, ORCHESTRATOR_WORKER)
diff --git a/archivebox/workers/tests/test_orchestrator.py b/archivebox/workers/tests/test_orchestrator.py
index 79d37f95..ac8e23a6 100644
--- a/archivebox/workers/tests/test_orchestrator.py
+++ b/archivebox/workers/tests/test_orchestrator.py
@@ -10,9 +10,7 @@ Tests cover:
"""
import os
-import tempfile
import time
-from pathlib import Path
from datetime import timedelta
from unittest.mock import patch, MagicMock
@@ -217,7 +215,6 @@ class TestOrchestratorWithProcess(TestCase):
def test_orchestrator_scoped_worker_count(self):
"""Orchestrator with crawl_id should count only descendant workers."""
- import time
from archivebox.machine.models import Process, Machine
machine = Machine.current()
diff --git a/archivebox/workers/worker.py b/archivebox/workers/worker.py
index 37a920b7..a344f6a2 100644
--- a/archivebox/workers/worker.py
+++ b/archivebox/workers/worker.py
@@ -13,13 +13,10 @@ __package__ = 'archivebox.workers'
import os
import time
-import traceback
-from typing import ClassVar, Any
-from datetime import timedelta
+from typing import TYPE_CHECKING, Any, ClassVar
from pathlib import Path
from multiprocessing import cpu_count
-from django.db.models import QuerySet
from django.utils import timezone
from django.conf import settings
@@ -28,6 +25,9 @@ from rich import print
from archivebox.misc.logging_util import log_worker_event
+if TYPE_CHECKING:
+ from archivebox.machine.models import Process
+
CPU_COUNT = cpu_count()
@@ -314,7 +314,10 @@ class Worker:
process.kill(signal_num=signal.SIGKILL)
log_worker_event(
worker_type=worker_type,
- event=f'⚠ Sent SIGKILL to {hook_name} + {len(children_pids) if children_pids else 0} children (exceeded timeout)',
+ event=(
+ f'⚠ Sent SIGKILL to {hook_name} + '
+ f'{len(children_pids) if children_pids else 0} children (exceeded timeout)'
+ ),
indent_level=indent_level,
pid=self.pid,
)
@@ -341,7 +344,6 @@ class Worker:
from archivebox.machine.models import Process, Machine
from archivebox.config.configset import get_config
from pathlib import Path
- from django.conf import settings
import sys
refresh_machine_config = bool(
@@ -552,7 +554,7 @@ class CrawlWorker(Worker):
# Check if crawl is done
if self._is_crawl_finished():
- print(f'🔄 Crawl finished, sealing...', file=sys.stderr)
+ print('🔄 Crawl finished, sealing...', file=sys.stderr)
self.crawl.sm.seal()
break
@@ -813,7 +815,8 @@ class SnapshotWorker(Worker):
is_background = is_background_hook(hook_name)
# Create ArchiveResult for THIS HOOK (not per plugin)
- # One plugin can have multiple hooks (e.g., chrome/on_Snapshot__20_launch_chrome.js, chrome/on_Snapshot__21_navigate_chrome.js)
+ # One plugin can have multiple hooks
+ # (e.g., chrome/on_Snapshot__20_launch_chrome.js, chrome/on_Snapshot__21_navigate_chrome.js)
# Unique key = (snapshot, plugin, hook_name) for idempotency
ar, created = ArchiveResult.objects.get_or_create(
snapshot=self.snapshot,
@@ -868,7 +871,7 @@ class SnapshotWorker(Worker):
self.snapshot.sm.seal()
self.snapshot.refresh_from_db()
- except Exception as e:
+ except Exception:
# Mark snapshot as sealed even on error (still triggers cleanup)
self._finalize_background_hooks()
self.snapshot.sm.seal()
@@ -1019,7 +1022,6 @@ class SnapshotWorker(Worker):
self.background_processes = {}
# Update background results now that hooks are done
- from archivebox.core.models import ArchiveResult
bg_results = self.snapshot.archiveresult_set.filter(
hook_name__contains='.bg.',
@@ -1034,7 +1036,6 @@ class SnapshotWorker(Worker):
if not self.background_processes:
return
- from archivebox.core.models import ArchiveResult
for hook_name, process in list(self.background_processes.items()):
exit_code = process.poll()
@@ -1165,7 +1166,6 @@ class BinaryWorker(Worker):
def runloop(self) -> None:
"""Install binary(ies)."""
- import sys
self.on_startup()
@@ -1216,7 +1216,7 @@ class BinaryWorker(Worker):
except Exception as e:
log_worker_event(
worker_type='BinaryWorker',
- event=f'Failed to install binary',
+ event='Failed to install binary',
indent_level=1,
pid=self.pid,
error=e,
diff --git a/bin/test.sh b/bin/test.sh
index 7690d375..7567a56c 100755
--- a/bin/test.sh
+++ b/bin/test.sh
@@ -14,5 +14,5 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. && pwd )"
source "$DIR/.venv/bin/activate"
-pytest -s --basetemp=tests/data "$@"
+pytest -s --basetemp=archivebox/tests/data "$@"
exec ./bin/test_plugins.sh
diff --git a/docs b/docs
index a9e347fa..be25d9bf 160000
--- a/docs
+++ b/docs
@@ -1 +1 @@
-Subproject commit a9e347fac6fb37f7c5194379aca8aca44839f446
+Subproject commit be25d9bfa2d0f98b6b5b788c43d9629d1b31d217