From 934e02695bd6eac3e317f5c1d0d5138a5cb3317e Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sun, 15 Mar 2026 18:45:29 -0700 Subject: [PATCH] fix lint --- archivebox/api/models.py | 1 - archivebox/api/tests.py | 17 +- archivebox/api/v1_auth.py | 3 - archivebox/api/v1_crawls.py | 1 - archivebox/base_models/admin.py | 2 +- archivebox/base_models/models.py | 5 - archivebox/cli/archivebox_add.py | 10 +- archivebox/cli/archivebox_config.py | 2 +- archivebox/cli/archivebox_extract.py | 1 - archivebox/cli/archivebox_install.py | 2 - archivebox/cli/archivebox_persona.py | 9 +- archivebox/cli/archivebox_pluginmap.py | 1 - archivebox/cli/archivebox_search.py | 3 +- archivebox/cli/archivebox_server.py | 11 +- archivebox/cli/archivebox_status.py | 6 +- archivebox/cli/archivebox_update.py | 2 - archivebox/cli/archivebox_version.py | 2 +- archivebox/cli/tests.py | 34 +- archivebox/cli/tests_piping.py | 17 +- archivebox/config/__init__.py | 5 +- archivebox/config/collection.py | 1 - archivebox/config/configset.py | 21 +- archivebox/config/django.py | 4 +- archivebox/config/permissions.py | 1 - archivebox/config/views.py | 47 +- archivebox/core/admin_site.py | 3 +- archivebox/core/admin_snapshots.py | 6 +- archivebox/core/apps.py | 2 +- archivebox/core/asgi.py | 3 +- archivebox/core/forms.py | 5 +- archivebox/core/middleware.py | 1 - .../migrations/0006_auto_20201012_1520.py | 4 - .../core/migrations/0007_archiveresult.py | 2 +- .../core/migrations/0023_upgrade_to_0_9_0.py | 1 - .../migrations/0024_assign_default_crawl.py | 1 - .../0027_copy_archiveresult_to_process.py | 2 +- archivebox/core/models.py | 75 ++-- archivebox/core/settings.py | 13 +- archivebox/core/settings_logging.py | 2 - archivebox/core/tests.py | 18 +- archivebox/core/views.py | 29 +- archivebox/core/widgets.py | 3 - archivebox/core/wsgi.py | 3 +- archivebox/crawls/admin.py | 6 - archivebox/crawls/models.py | 58 ++- archivebox/hooks.py | 8 +- archivebox/ldap/auth.py | 1 - archivebox/machine/models.py | 26 +- .../machine/tests/test_machine_models.py | 4 - archivebox/mcp/server.py | 6 +- archivebox/misc/checks.py | 1 - archivebox/misc/folders.py | 1 - archivebox/misc/logging_util.py | 3 +- archivebox/misc/monkey_patches.py | 14 +- archivebox/misc/progress_layout.py | 1 - archivebox/misc/system.py | 5 +- archivebox/personas/admin.py | 1 - archivebox/personas/models.py | 151 ++++++- archivebox/personas/tests.py | 1 - archivebox/personas/views.py | 1 - archivebox/search/__init__.py | 5 +- archivebox/tests/conftest.py | 3 +- archivebox/tests/test_add.py | 9 +- archivebox/tests/test_admin_views.py | 2 +- archivebox/tests/test_auth_ldap.py | 10 +- archivebox/tests/test_cli_add.py | 29 +- archivebox/tests/test_cli_archiveresult.py | 1 - archivebox/tests/test_cli_config.py | 3 - archivebox/tests/test_cli_crawl.py | 3 - archivebox/tests/test_cli_extract.py | 4 +- archivebox/tests/test_cli_help.py | 2 - archivebox/tests/test_cli_init.py | 5 +- archivebox/tests/test_cli_install.py | 4 +- archivebox/tests/test_cli_manage.py | 3 - archivebox/tests/test_cli_remove.py | 5 +- archivebox/tests/test_cli_run.py | 1 - .../tests/test_cli_run_binary_worker.py | 2 - archivebox/tests/test_cli_schedule.py | 1 - archivebox/tests/test_cli_search.py | 3 - archivebox/tests/test_cli_server.py | 4 - archivebox/tests/test_cli_shell.py | 2 - archivebox/tests/test_cli_snapshot.py | 2 - archivebox/tests/test_cli_status.py | 4 +- archivebox/tests/test_cli_update.py | 4 +- archivebox/tests/test_cli_version.py | 4 +- archivebox/tests/test_config.py | 2 +- archivebox/tests/test_crawl.py | 2 - archivebox/tests/test_extract.py | 2 +- archivebox/tests/test_extractors.py | 6 +- archivebox/tests/test_hooks.py | 2 +- archivebox/tests/test_init.py | 9 +- archivebox/tests/test_install.py | 1 - archivebox/tests/test_list.py | 4 +- archivebox/tests/test_migrations_08_to_09.py | 6 +- archivebox/tests/test_recursive_crawl.py | 2 - archivebox/tests/test_remove.py | 5 +- archivebox/tests/test_schedule.py | 1 - archivebox/tests/test_schedule_e2e.py | 420 ++++++++++++++++++ archivebox/tests/test_search.py | 3 - archivebox/tests/test_snapshot.py | 4 - archivebox/tests/test_status.py | 2 - archivebox/tests/test_title.py | 5 +- archivebox/tests/test_update.py | 5 +- archivebox/tests/test_version.py | 2 - .../tests/test_worker_config_propagation.py | 25 +- archivebox/workers/orchestrator.py | 48 +- archivebox/workers/supervisord_util.py | 7 +- archivebox/workers/tests/test_orchestrator.py | 3 - archivebox/workers/worker.py | 26 +- bin/test.sh | 2 +- docs | 2 +- 111 files changed, 919 insertions(+), 461 deletions(-) create mode 100644 archivebox/tests/test_schedule_e2e.py diff --git a/archivebox/api/models.py b/archivebox/api/models.py index 50d5bcc8..29f99913 100755 --- a/archivebox/api/models.py +++ b/archivebox/api/models.py @@ -2,7 +2,6 @@ __package__ = 'archivebox.api' import secrets from archivebox.uuid_compat import uuid7 -from datetime import timedelta from django.conf import settings from django.db import models diff --git a/archivebox/api/tests.py b/archivebox/api/tests.py index ee566a63..0dba652c 100644 --- a/archivebox/api/tests.py +++ b/archivebox/api/tests.py @@ -1,16 +1,17 @@ -import os -import django +import importlib from io import StringIO from types import SimpleNamespace -os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings') -django.setup() +from archivebox.config.django import setup_django -from django.contrib.auth.models import User -from django.test import TestCase +setup_django() -from archivebox.api.v1_cli import ScheduleCommandSchema, cli_schedule -from archivebox.crawls.models import CrawlSchedule +User = importlib.import_module('django.contrib.auth.models').User +TestCase = importlib.import_module('django.test').TestCase +api_v1_cli = importlib.import_module('archivebox.api.v1_cli') +ScheduleCommandSchema = api_v1_cli.ScheduleCommandSchema +cli_schedule = api_v1_cli.cli_schedule +CrawlSchedule = importlib.import_module('archivebox.crawls.models').CrawlSchedule class CLIScheduleAPITests(TestCase): diff --git a/archivebox/api/v1_auth.py b/archivebox/api/v1_auth.py index cc82c371..a77124cf 100644 --- a/archivebox/api/v1_auth.py +++ b/archivebox/api/v1_auth.py @@ -3,10 +3,7 @@ __package__ = 'archivebox.api' from typing import Optional from ninja import Router, Schema -from django.utils import timezone -from datetime import timedelta -from archivebox.api.models import APIToken from archivebox.api.auth import auth_using_token, auth_using_password, get_or_create_api_token diff --git a/archivebox/api/v1_crawls.py b/archivebox/api/v1_crawls.py index fe268a3c..ca1b0d87 100644 --- a/archivebox/api/v1_crawls.py +++ b/archivebox/api/v1_crawls.py @@ -5,7 +5,6 @@ from typing import List, Optional from datetime import datetime from django.utils import timezone -from django.db.models import Q from django.contrib.auth import get_user_model from ninja import Router, Schema diff --git a/archivebox/base_models/admin.py b/archivebox/base_models/admin.py index 3c4fa643..0d172fca 100644 --- a/archivebox/base_models/admin.py +++ b/archivebox/base_models/admin.py @@ -6,7 +6,7 @@ import json from django import forms from django.contrib import admin -from django.utils.html import format_html, mark_safe +from django.utils.html import mark_safe from django_object_actions import DjangoObjectActions diff --git a/archivebox/base_models/models.py b/archivebox/base_models/models.py index c036edd1..02cf144b 100755 --- a/archivebox/base_models/models.py +++ b/archivebox/base_models/models.py @@ -2,12 +2,9 @@ __package__ = 'archivebox.base_models' -from uuid import UUID from archivebox.uuid_compat import uuid7 -from typing import ClassVar from pathlib import Path -from django.contrib import admin from django.db import models from django.db.models import F from django.utils import timezone @@ -17,8 +14,6 @@ from django.conf import settings from django_stubs_ext.db.models import TypedModelMeta -from archivebox import DATA_DIR -from archivebox.misc.hashing import get_dir_info def get_or_create_system_user_pk(username='system'): diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py index 46ae23a9..a1eecf79 100644 --- a/archivebox/cli/archivebox_add.py +++ b/archivebox/cli/archivebox_add.py @@ -57,6 +57,7 @@ def add(urls: str | list[str], from archivebox.core.models import Snapshot from archivebox.crawls.models import Crawl from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.personas.models import Persona from archivebox.workers.orchestrator import Orchestrator from archivebox.misc.logging_util import printable_filesize from archivebox.misc.system import get_dir_size @@ -79,11 +80,15 @@ def add(urls: str | list[str], # Read URLs directly into crawl urls_content = sources_file.read_text() + persona_name = (persona or 'Default').strip() or 'Default' + persona_obj, _ = Persona.objects.get_or_create(name=persona_name) + persona_obj.ensure_dirs() crawl = Crawl.objects.create( urls=urls_content, max_depth=depth, tags_str=tag, + persona_id=persona_obj.id, label=f'{USER}@{HOSTNAME} $ {cmd_str} [{timestamp}]', created_by_id=created_by_id, config={ @@ -91,7 +96,7 @@ def add(urls: str | list[str], 'INDEX_ONLY': index_only, 'OVERWRITE': overwrite, 'PLUGINS': plugins, - 'DEFAULT_PERSONA': persona or 'Default', + 'DEFAULT_PERSONA': persona_name, 'PARSER': parser, } ) @@ -135,8 +140,7 @@ def add(urls: str | list[str], print('[yellow]\\[*] URLs queued. Orchestrator will process them (run `archivebox server` if not already running).[/yellow]') else: # Foreground mode: run full orchestrator until all work is done - print(f'[green]\\[*] Starting orchestrator to process crawl...[/green]') - from archivebox.workers.orchestrator import Orchestrator + print('[green]\\[*] Starting orchestrator to process crawl...[/green]') orchestrator = Orchestrator(exit_on_idle=True, crawl_id=str(crawl.id)) orchestrator.runloop() # Block until complete diff --git a/archivebox/cli/archivebox_config.py b/archivebox/cli/archivebox_config.py index 751a85ea..c96c0bde 100644 --- a/archivebox/cli/archivebox_config.py +++ b/archivebox/cli/archivebox_config.py @@ -94,7 +94,7 @@ def config(*keys, # Display all plugin config in single [PLUGINS] section if plugin_keys: - print(f'[grey53]\\[PLUGINS][/grey53]') + print('[grey53]\\[PLUGINS][/grey53]') print(benedict(plugin_keys).to_toml(encoder=CustomTOMLEncoder()).strip().replace('\n\n', '\n')) print('[grey53]################################################################[/grey53]') diff --git a/archivebox/cli/archivebox_extract.py b/archivebox/cli/archivebox_extract.py index 9142fbf8..900c0bef 100644 --- a/archivebox/cli/archivebox_extract.py +++ b/archivebox/cli/archivebox_extract.py @@ -31,7 +31,6 @@ __package__ = 'archivebox.cli' __command__ = 'archivebox extract' import sys -from typing import Optional, List import rich_click as click diff --git a/archivebox/cli/archivebox_install.py b/archivebox/cli/archivebox_install.py index 3c8a4e35..8a91e8d2 100755 --- a/archivebox/cli/archivebox_install.py +++ b/archivebox/cli/archivebox_install.py @@ -3,8 +3,6 @@ __package__ = 'archivebox.cli' import os -import sys -import shutil import rich_click as click from rich import print diff --git a/archivebox/cli/archivebox_persona.py b/archivebox/cli/archivebox_persona.py index 1e1d4e60..cc0b95ae 100644 --- a/archivebox/cli/archivebox_persona.py +++ b/archivebox/cli/archivebox_persona.py @@ -410,7 +410,6 @@ def create_personas( """ from archivebox.misc.jsonl import write_record from archivebox.personas.models import Persona - from archivebox.config.constants import CONSTANTS is_tty = sys.stdout.isatty() name_list = list(names) if names else [] @@ -493,10 +492,10 @@ def create_personas( 'SingletonLock', 'SingletonSocket', 'SingletonCookie', ), ) - rprint(f'[green]Copied browser profile to persona[/green]', file=sys.stderr) + rprint('[green]Copied browser profile to persona[/green]', file=sys.stderr) # Extract cookies via CDP - rprint(f'[dim]Extracting cookies via CDP...[/dim]', file=sys.stderr) + rprint('[dim]Extracting cookies via CDP...[/dim]', file=sys.stderr) if extract_cookies_via_cdp( persona_chrome_dir, @@ -506,8 +505,8 @@ def create_personas( ): rprint(f'[green]Extracted cookies to {cookies_file}[/green]', file=sys.stderr) else: - rprint(f'[yellow]Could not extract cookies automatically.[/yellow]', file=sys.stderr) - rprint(f'[dim]You can manually export cookies using a browser extension.[/dim]', file=sys.stderr) + rprint('[yellow]Could not extract cookies automatically.[/yellow]', file=sys.stderr) + rprint('[dim]You can manually export cookies using a browser extension.[/dim]', file=sys.stderr) except Exception as e: rprint(f'[red]Failed to copy browser profile: {e}[/red]', file=sys.stderr) diff --git a/archivebox/cli/archivebox_pluginmap.py b/archivebox/cli/archivebox_pluginmap.py index fe280faa..a6d132ac 100644 --- a/archivebox/cli/archivebox_pluginmap.py +++ b/archivebox/cli/archivebox_pluginmap.py @@ -3,7 +3,6 @@ __package__ = 'archivebox.cli' from typing import Optional -from pathlib import Path import rich_click as click diff --git a/archivebox/cli/archivebox_search.py b/archivebox/cli/archivebox_search.py index b066b474..009afa36 100644 --- a/archivebox/cli/archivebox_search.py +++ b/archivebox/cli/archivebox_search.py @@ -4,7 +4,7 @@ __package__ = 'archivebox.cli' __command__ = 'archivebox search' from pathlib import Path -from typing import Optional, List, Any +from typing import Optional, List import rich_click as click from rich import print @@ -71,7 +71,6 @@ def search(filter_patterns: list[str] | None=None, csv: str | None=None, with_headers: bool=False): """List, filter, and export information about archive entries""" - from archivebox.core.models import Snapshot if with_headers and not (json or html or csv): stderr('[X] --with-headers requires --json, --html or --csv\n', color='red') diff --git a/archivebox/cli/archivebox_server.py b/archivebox/cli/archivebox_server.py index 6e6401cd..d3a31a3c 100644 --- a/archivebox/cli/archivebox_server.py +++ b/archivebox/cli/archivebox_server.py @@ -99,7 +99,6 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,), get_existing_supervisord_process, get_worker, start_server_workers, - tail_multiple_worker_logs, is_port_in_use, ) from archivebox.workers.orchestrator import Orchestrator @@ -108,14 +107,14 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,), if is_port_in_use(host, int(port)): print(f'[red][X] Error: Port {port} is already in use[/red]') print(f' Another process (possibly daphne) is already listening on {host}:{port}') - print(f' Stop the conflicting process or choose a different port') + print(' Stop the conflicting process or choose a different port') sys.exit(1) # Check if orchestrator is already running for this data directory if Orchestrator.is_running(): - print(f'[red][X] Error: ArchiveBox orchestrator is already running for this data directory[/red]') - print(f' Stop the existing orchestrator before starting a new server') - print(f' To stop: pkill -f "archivebox manage orchestrator"') + print('[red][X] Error: ArchiveBox orchestrator is already running for this data directory[/red]') + print(' Stop the existing orchestrator before starting a new server') + print(' To stop: pkill -f "archivebox manage orchestrator"') sys.exit(1) # Check if supervisord is already running @@ -129,7 +128,7 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,), print('[red][X] Error: ArchiveBox server is already running[/red]') print(f' [green]√[/green] Web server (worker_daphne) is RUNNING on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]') if orchestrator_proc and orchestrator_proc.get('statename') == 'RUNNING': - print(f' [green]√[/green] Background worker (worker_orchestrator) is RUNNING') + print(' [green]√[/green] Background worker (worker_orchestrator) is RUNNING') print() print('[yellow]To stop the existing server, run:[/yellow]') print(' pkill -f "archivebox server"') diff --git a/archivebox/cli/archivebox_status.py b/archivebox/cli/archivebox_status.py index c0622f0d..424de1ef 100644 --- a/archivebox/cli/archivebox_status.py +++ b/archivebox/cli/archivebox_status.py @@ -128,13 +128,13 @@ def status(out_dir: Path=DATA_DIR) -> None: if not snapshot.downloaded_at: continue print( - '[grey53] ' + ( + '[grey53] ' f' > {str(snapshot.downloaded_at)[:16]} ' f'[{snapshot.num_outputs} {("X", "√")[snapshot.is_archived]} {printable_filesize(snapshot.archive_size)}] ' f'"{snapshot.title}": {snapshot.url}' - )[:SHELL_CONFIG.TERM_WIDTH] - + '[grey53]', + '[/grey53]' + )[:SHELL_CONFIG.TERM_WIDTH], ) print('[grey53] ...') diff --git a/archivebox/cli/archivebox_update.py b/archivebox/cli/archivebox_update.py index a3601bd0..9a8fd8e0 100644 --- a/archivebox/cli/archivebox_update.py +++ b/archivebox/cli/archivebox_update.py @@ -36,8 +36,6 @@ def update(filter_patterns: Iterable[str] = (), from archivebox.config.django import setup_django setup_django() - from archivebox.core.models import Snapshot - from django.utils import timezone from django.core.management import call_command # Run migrations first to ensure DB schema is up-to-date diff --git a/archivebox/cli/archivebox_version.py b/archivebox/cli/archivebox_version.py index 4f80bfe2..c89298f9 100755 --- a/archivebox/cli/archivebox_version.py +++ b/archivebox/cli/archivebox_version.py @@ -6,7 +6,7 @@ import sys import os import platform from pathlib import Path -from typing import Iterable, Optional +from typing import Iterable import rich_click as click diff --git a/archivebox/cli/tests.py b/archivebox/cli/tests.py index 27dec785..ab3b7a8e 100644 --- a/archivebox/cli/tests.py +++ b/archivebox/cli/tests.py @@ -3,13 +3,13 @@ __package__ = 'archivebox.cli' +import importlib import os -import sys import shutil +import sys import unittest -from pathlib import Path - from contextlib import contextmanager +from pathlib import Path TEST_CONFIG = { 'USE_COLOR': 'False', @@ -30,18 +30,15 @@ TEST_CONFIG = { DATA_DIR = 'data.tests' os.environ.update(TEST_CONFIG) -from ..main import init -from archivebox.config.constants import ( - SQL_INDEX_FILENAME, - JSON_INDEX_FILENAME, - HTML_INDEX_FILENAME, -) - -from . import ( - archivebox_init, - archivebox_add, - archivebox_remove, -) +init = importlib.import_module('archivebox.main').init +constants = importlib.import_module('archivebox.config.constants') +SQL_INDEX_FILENAME = constants.SQL_INDEX_FILENAME +JSON_INDEX_FILENAME = constants.JSON_INDEX_FILENAME +HTML_INDEX_FILENAME = constants.HTML_INDEX_FILENAME +archivebox_init = importlib.import_module('archivebox.cli.archivebox_init') +archivebox_add = importlib.import_module('archivebox.cli.archivebox_add') +archivebox_remove = importlib.import_module('archivebox.cli.archivebox_remove') +parse_json_main_index = importlib.import_module('archivebox.misc.legacy').parse_json_main_index HIDE_CLI_OUTPUT = True @@ -68,6 +65,13 @@ stdout = sys.stdout stderr = sys.stderr +def load_main_index(*, out_dir: str): + index_path = Path(out_dir) / JSON_INDEX_FILENAME + if not index_path.exists(): + raise FileNotFoundError(index_path) + return list(parse_json_main_index(Path(out_dir))) + + @contextmanager def output_hidden(show_failing=True): if not HIDE_CLI_OUTPUT: diff --git a/archivebox/cli/tests_piping.py b/archivebox/cli/tests_piping.py index 9f8e8c02..623c2567 100644 --- a/archivebox/cli/tests_piping.py +++ b/archivebox/cli/tests_piping.py @@ -23,7 +23,6 @@ Each command should: __package__ = 'archivebox.cli' import os -import sys import json import shutil import tempfile @@ -101,7 +100,7 @@ class TestJSONLParsing(unittest.TestCase): def test_parse_jsonl_with_id(self): """JSONL with id field should be recognized.""" - from archivebox.misc.jsonl import parse_line, TYPE_SNAPSHOT + from archivebox.misc.jsonl import parse_line line = '{"type": "Snapshot", "id": "abc123", "url": "https://example.com"}' result = parse_line(line) @@ -576,8 +575,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase): """ from archivebox.core.models import Snapshot from archivebox.misc.jsonl import ( - read_args_or_stdin, write_record, - TYPE_SNAPSHOT + read_args_or_stdin, TYPE_SNAPSHOT ) from archivebox.base_models.models import get_or_create_system_user_pk @@ -608,7 +606,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase): Test: archivebox snapshot URL | archivebox extract Extract should accept JSONL output from snapshot command. """ - from archivebox.core.models import Snapshot, ArchiveResult + from archivebox.core.models import Snapshot from archivebox.misc.jsonl import ( read_args_or_stdin, TYPE_SNAPSHOT @@ -783,7 +781,6 @@ class TestParserPluginWorkflows(unittest.TestCase): Test: archivebox crawl --plugin=parse_html_urls URL | archivebox snapshot | archivebox extract """ from archivebox.hooks import collect_urls_from_plugins - from archivebox.misc.jsonl import TYPE_SNAPSHOT # Create mock output directory snapshot_dir = Path(self.test_dir) / 'archive' / 'html-parser-test' @@ -938,7 +935,6 @@ class TestPassThroughBehavior(unittest.TestCase): def test_crawl_passes_through_other_types(self): """crawl create should pass through records with other types.""" - from archivebox.misc.jsonl import TYPE_CRAWL # Input: a Tag record (not a Crawl or URL) tag_record = {'type': 'Tag', 'id': 'test-tag', 'name': 'example'} @@ -946,8 +942,9 @@ class TestPassThroughBehavior(unittest.TestCase): # Mock stdin with both records stdin = StringIO( - json.dumps(tag_record) + '\n' + - json.dumps(url_record) + json.dumps(tag_record) + + '\n' + + json.dumps(url_record) ) stdin.isatty = lambda: False @@ -964,7 +961,7 @@ class TestPassThroughBehavior(unittest.TestCase): def test_snapshot_passes_through_crawl(self): """snapshot create should pass through Crawl records.""" - from archivebox.misc.jsonl import TYPE_CRAWL, TYPE_SNAPSHOT + from archivebox.misc.jsonl import TYPE_CRAWL crawl_record = { 'type': TYPE_CRAWL, diff --git a/archivebox/config/__init__.py b/archivebox/config/__init__.py index 246a2e0c..0033269c 100644 --- a/archivebox/config/__init__.py +++ b/archivebox/config/__init__.py @@ -8,10 +8,6 @@ and other modules that expect to import config values directly. __package__ = 'archivebox.config' __order__ = 200 -import shutil -from pathlib import Path -from typing import Dict, List, Optional - from .paths import ( PACKAGE_DIR, # noqa DATA_DIR, # noqa @@ -31,6 +27,7 @@ def _get_config(): from .common import ARCHIVING_CONFIG, STORAGE_CONFIG return ARCHIVING_CONFIG, STORAGE_CONFIG + # Direct exports (evaluated at import time for backwards compat) # These are recalculated each time the module attribute is accessed diff --git a/archivebox/config/collection.py b/archivebox/config/collection.py index 46b591fe..51af4ab6 100644 --- a/archivebox/config/collection.py +++ b/archivebox/config/collection.py @@ -9,7 +9,6 @@ from configparser import ConfigParser from benedict import benedict -import archivebox from archivebox.config.constants import CONSTANTS diff --git a/archivebox/config/configset.py b/archivebox/config/configset.py index e284d44b..39b8f51a 100644 --- a/archivebox/config/configset.py +++ b/archivebox/config/configset.py @@ -11,10 +11,10 @@ __package__ = "archivebox.config" import os import json from pathlib import Path -from typing import Any, Dict, Optional, List, Type, Tuple, TYPE_CHECKING, cast +from typing import Any, Dict, Optional, Type, Tuple from configparser import ConfigParser -from pydantic import Field, ConfigDict +from pydantic import ConfigDict from pydantic_settings import BaseSettings, PydanticBaseSettingsSource @@ -166,6 +166,23 @@ def get_config( if user is None and crawl and hasattr(crawl, "created_by"): user = crawl.created_by + + if persona is None and crawl is not None: + try: + from archivebox.personas.models import Persona + + persona_id = getattr(crawl, "persona_id", None) + if persona_id: + persona = Persona.objects.filter(id=persona_id).first() + + if persona is None: + crawl_config = getattr(crawl, "config", None) or {} + default_persona_name = crawl_config.get("DEFAULT_PERSONA") + if default_persona_name: + persona, _ = Persona.objects.get_or_create(name=str(default_persona_name).strip() or "Default") + persona.ensure_dirs() + except Exception: + pass from archivebox.config.constants import CONSTANTS from archivebox.config.common import ( SHELL_CONFIG, diff --git a/archivebox/config/django.py b/archivebox/config/django.py index 75cc5539..09ddcfd2 100644 --- a/archivebox/config/django.py +++ b/archivebox/config/django.py @@ -100,9 +100,11 @@ def setup_django(check_db=False, in_memory_db=False) -> None: return from django.conf import settings + from archivebox.core.settings_logging import ERROR_LOG as DEFAULT_ERROR_LOG # log startup message to the error log - with open(settings.ERROR_LOG, "a", encoding='utf-8') as f: + error_log = getattr(settings, 'ERROR_LOG', DEFAULT_ERROR_LOG) + with open(error_log, "a", encoding='utf-8') as f: command = ' '.join(sys.argv) ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S') f.write(f"\n> {command}; TS={ts} VERSION={CONSTANTS.VERSION} IN_DOCKER={SHELL_CONFIG.IN_DOCKER} IS_TTY={SHELL_CONFIG.IS_TTY}\n") diff --git a/archivebox/config/permissions.py b/archivebox/config/permissions.py index 08d81ce6..b8a5f557 100644 --- a/archivebox/config/permissions.py +++ b/archivebox/config/permissions.py @@ -46,7 +46,6 @@ if RUNNING_AS_UID == 0: # if we are running as root it's really hard to figure out what the correct archivebox user should be # as a last resort instead of setting DATA_DIR ownership to 0:0 (which breaks it for non-root users) # check if 911:911 archivebox user exists on host system, and use it instead of 0 - import pwd if pwd.getpwuid(DEFAULT_PUID).pw_name == 'archivebox': FALLBACK_UID = DEFAULT_PUID FALLBACK_GID = DEFAULT_PGID diff --git a/archivebox/config/views.py b/archivebox/config/views.py index 316e1aa3..1e3e8f5e 100644 --- a/archivebox/config/views.py +++ b/archivebox/config/views.py @@ -3,7 +3,6 @@ __package__ = 'archivebox.config' import os import shutil import inspect -from pathlib import Path from typing import Any, List, Dict, cast from benedict import benedict @@ -30,11 +29,11 @@ KNOWN_BINARIES = [ ] -def obj_to_yaml(obj: Any, indent: int=0) -> str: +def obj_to_yaml(obj: Any, indent: int = 0) -> str: indent_str = " " * indent if indent == 0: indent_str = '\n' # put extra newline between top-level entries - + if isinstance(obj, dict): if not obj: return "{}" @@ -42,7 +41,7 @@ def obj_to_yaml(obj: Any, indent: int=0) -> str: for key, value in obj.items(): result += f"{indent_str}{key}:{obj_to_yaml(value, indent + 1)}\n" return result - + elif isinstance(obj, list): if not obj: return "[]" @@ -50,16 +49,16 @@ def obj_to_yaml(obj: Any, indent: int=0) -> str: for item in obj: result += f"{indent_str}- {obj_to_yaml(item, indent + 1).lstrip()}\n" return result.rstrip() - + elif isinstance(obj, str): if "\n" in obj: return f" |\n{indent_str} " + obj.replace("\n", f"\n{indent_str} ") else: return f" {obj}" - + elif isinstance(obj, (int, float, bool)): return f" {str(obj)}" - + elif callable(obj): source = '\n'.join( '' if 'def ' in line else line @@ -67,7 +66,7 @@ def obj_to_yaml(obj: Any, indent: int=0) -> str: if line.strip() ).split('lambda: ')[-1].rstrip(',') return f" {indent_str} " + source.replace("\n", f"\n{indent_str} ") - + else: return f" {str(obj)}" @@ -75,7 +74,7 @@ def obj_to_yaml(obj: Any, indent: int=0) -> str: def get_detected_binaries() -> Dict[str, Dict[str, Any]]: """Detect available binaries using shutil.which.""" binaries = {} - + for name in KNOWN_BINARIES: path = shutil.which(name) if path: @@ -85,7 +84,7 @@ def get_detected_binaries() -> Dict[str, Dict[str, Any]]: 'version': None, # Could add version detection later 'is_available': True, } - + return binaries @@ -144,19 +143,19 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext: # Get binaries from database (previously detected/installed) db_binaries = {b.name: b for b in Binary.objects.all()} - - # Get currently detectable binaries + + # Get currently detectable binaries detected = get_detected_binaries() - + # Merge and display all_binary_names = sorted(set(list(db_binaries.keys()) + list(detected.keys()))) - + for name in all_binary_names: db_binary = db_binaries.get(name) detected_binary = detected.get(name) - + rows['Binary Name'].append(ItemLink(name, key=name)) - + if db_binary: rows['Found Version'].append(f'✅ {db_binary.version}' if db_binary.version else '✅ found') rows['Provided By'].append(db_binary.binprovider or 'PATH') @@ -175,6 +174,7 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext: table=rows, ) + @render_with_item_view def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext: @@ -203,7 +203,7 @@ def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext: ) except Binary.DoesNotExist: pass - + # Try to detect from PATH path = shutil.which(key) if path: @@ -224,7 +224,7 @@ def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext: }, ], ) - + return ItemContext( slug=key, title=key, @@ -286,6 +286,7 @@ def plugins_list_view(request: HttpRequest, **kwargs) -> TableContext: table=rows, ) + @render_with_item_view def plugin_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext: import json @@ -314,7 +315,10 @@ def plugin_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext: # Add config.json data if available if plugin.get('config'): config_json = json.dumps(plugin['config'], indent=2) - fields["config.json"] = mark_safe(f'
{config_json}
') + fields["config.json"] = mark_safe( + '
{config_json}
' + ) # Also extract and display individual config properties for easier viewing if 'properties' in plugin['config']: @@ -322,7 +326,6 @@ def plugin_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext: properties_summary = [] for prop_name, prop_info in config_properties.items(): prop_type = prop_info.get('type', 'unknown') - prop_default = prop_info.get('default', 'N/A') prop_desc = prop_info.get('description', '') properties_summary.append(f"• {prop_name} ({prop_type}): {prop_desc}") @@ -365,7 +368,7 @@ def worker_list_view(request: HttpRequest, **kwargs) -> TableContext: title="No running worker processes", table=rows, ) - + all_config_entries = cast(List[Dict[str, Any]], supervisor.getAllConfigInfo() or []) all_config = {config["name"]: benedict(config) for config in all_config_entries} @@ -514,7 +517,7 @@ def log_list_view(request: HttpRequest, **kwargs) -> TableContext: @render_with_item_view def log_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext: assert request.user.is_superuser, "Must be a superuser to view configuration settings." - + log_file = [logfile for logfile in CONSTANTS.LOGS_DIR.glob('*.log') if key in logfile.name][0] log_text = log_file.read_text() diff --git a/archivebox/core/admin_site.py b/archivebox/core/admin_site.py index ce4ca437..ab5fc144 100644 --- a/archivebox/core/admin_site.py +++ b/archivebox/core/admin_site.py @@ -1,8 +1,8 @@ __package__ = 'archivebox.core' from django.contrib import admin +from admin_data_views.admin import get_app_list, admin_data_index_view, get_admin_data_urls, get_urls -import archivebox class ArchiveBoxAdmin(admin.AdminSite): site_header = 'ArchiveBox' @@ -20,7 +20,6 @@ archivebox_admin = ArchiveBoxAdmin() # patch admin with methods to add data views (implemented by admin_data_views package) # https://github.com/MrThearMan/django-admin-data-views # https://mrthearman.github.io/django-admin-data-views/setup/ -from admin_data_views.admin import get_app_list, admin_data_index_view, get_admin_data_urls, get_urls archivebox_admin.get_app_list = get_app_list.__get__(archivebox_admin, ArchiveBoxAdmin) archivebox_admin.admin_data_index_view = admin_data_index_view.__get__(archivebox_admin, ArchiveBoxAdmin) # type: ignore archivebox_admin.get_admin_data_urls = get_admin_data_urls.__get__(archivebox_admin, ArchiveBoxAdmin) # type: ignore diff --git a/archivebox/core/admin_snapshots.py b/archivebox/core/admin_snapshots.py index bc1093c9..85024ed5 100644 --- a/archivebox/core/admin_snapshots.py +++ b/archivebox/core/admin_snapshots.py @@ -26,7 +26,7 @@ from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin from archivebox.workers.tasks import bg_archive_snapshots, bg_add from archivebox.core.models import Tag, Snapshot, ArchiveResult -from archivebox.core.admin_archiveresults import ArchiveResultInline, render_archiveresults_list +from archivebox.core.admin_archiveresults import render_archiveresults_list from archivebox.core.widgets import TagEditorWidget, InlineTagEditorWidget @@ -712,8 +712,6 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin): description="🔁 Redo Failed" ) def update_snapshots(self, request, queryset): - count = queryset.count() - queued = bg_archive_snapshots(queryset, kwargs={"overwrite": False, "out_dir": DATA_DIR}) messages.success( @@ -741,8 +739,6 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin): description="🔄 Redo" ) def overwrite_snapshots(self, request, queryset): - count = queryset.count() - queued = bg_archive_snapshots(queryset, kwargs={"overwrite": True, "out_dir": DATA_DIR}) messages.success( diff --git a/archivebox/core/apps.py b/archivebox/core/apps.py index 713d34d9..d6703b3f 100644 --- a/archivebox/core/apps.py +++ b/archivebox/core/apps.py @@ -60,7 +60,7 @@ class CoreConfig(AppConfig): from archivebox.workers.orchestrator import Orchestrator Process.cleanup_stale_running() - machine = Machine.current() + Machine.current() if not Orchestrator.is_running(): Orchestrator(exit_on_idle=False).start() diff --git a/archivebox/core/asgi.py b/archivebox/core/asgi.py index 4963169f..1253fbb0 100644 --- a/archivebox/core/asgi.py +++ b/archivebox/core/asgi.py @@ -8,11 +8,10 @@ https://docs.djangoproject.com/en/stable/howto/deployment/asgi/ """ from archivebox.config.django import setup_django +from django.core.asgi import get_asgi_application setup_django(in_memory_db=False, check_db=True) -from django.core.asgi import get_asgi_application - # Standard Django ASGI application (no websockets/channels needed) application = get_asgi_application() diff --git a/archivebox/core/forms.py b/archivebox/core/forms.py index a1a83ed7..cc4f62b3 100644 --- a/archivebox/core/forms.py +++ b/archivebox/core/forms.py @@ -6,6 +6,7 @@ from archivebox.misc.util import URL_REGEX from taggit.utils import edit_string_for_tags, parse_tags from archivebox.base_models.admin import KeyValueWidget from archivebox.crawls.schedule_utils import validate_schedule +from archivebox.hooks import get_plugins DEPTH_CHOICES = ( ('0', 'depth = 0 (archive just these URLs)'), @@ -15,7 +16,6 @@ DEPTH_CHOICES = ( ('4', 'depth = 4 (+ URLs four hops away)'), ) -from archivebox.hooks import get_plugins def get_plugin_choices(): """Get available extractor plugins from discovered hooks.""" @@ -210,15 +210,18 @@ class AddLinkForm(forms.Form): return schedule + class TagWidgetMixin: def format_value(self, value): if value is not None and not isinstance(value, str): value = edit_string_for_tags(value) return super().format_value(value) + class TagWidget(TagWidgetMixin, forms.TextInput): pass + class TagField(forms.CharField): widget = TagWidget diff --git a/archivebox/core/middleware.py b/archivebox/core/middleware.py index 2003b478..7594eb8d 100644 --- a/archivebox/core/middleware.py +++ b/archivebox/core/middleware.py @@ -17,7 +17,6 @@ from archivebox.config import VERSION from archivebox.config.version import get_COMMIT_HASH from archivebox.core.host_utils import ( build_admin_url, - build_api_url, build_web_url, get_api_host, get_admin_host, diff --git a/archivebox/core/migrations/0006_auto_20201012_1520.py b/archivebox/core/migrations/0006_auto_20201012_1520.py index dc96c8da..0f5df6a9 100644 --- a/archivebox/core/migrations/0006_auto_20201012_1520.py +++ b/archivebox/core/migrations/0006_auto_20201012_1520.py @@ -7,10 +7,8 @@ def forwards_func(apps, schema_editor): SnapshotModel = apps.get_model("core", "Snapshot") TagModel = apps.get_model("core", "Tag") - db_alias = schema_editor.connection.alias snapshots = SnapshotModel.objects.all() for snapshot in snapshots: - tags = snapshot.tags tag_set = ( set(tag.strip() for tag in (snapshot.tags_old or '').split(',')) ) @@ -23,9 +21,7 @@ def forwards_func(apps, schema_editor): def reverse_func(apps, schema_editor): SnapshotModel = apps.get_model("core", "Snapshot") - TagModel = apps.get_model("core", "Tag") - db_alias = schema_editor.connection.alias snapshots = SnapshotModel.objects.all() for snapshot in snapshots: tags = snapshot.tags.values_list("name", flat=True) diff --git a/archivebox/core/migrations/0007_archiveresult.py b/archivebox/core/migrations/0007_archiveresult.py index c052f9ce..9cf5e75d 100644 --- a/archivebox/core/migrations/0007_archiveresult.py +++ b/archivebox/core/migrations/0007_archiveresult.py @@ -43,7 +43,7 @@ def forwards_func(apps, schema_editor): try: with open(out_dir / "index.json", "r") as f: fs_index = json.load(f) - except Exception as e: + except Exception: continue history = fs_index["history"] diff --git a/archivebox/core/migrations/0023_upgrade_to_0_9_0.py b/archivebox/core/migrations/0023_upgrade_to_0_9_0.py index c32c31b3..a95cc007 100644 --- a/archivebox/core/migrations/0023_upgrade_to_0_9_0.py +++ b/archivebox/core/migrations/0023_upgrade_to_0_9_0.py @@ -234,7 +234,6 @@ def upgrade_core_tables(apps, schema_editor): tag_has_data = cursor.fetchone()[0] > 0 if tag_has_data: - tag_cols = get_table_columns('core_tag') cursor.execute("PRAGMA table_info(core_tag)") tag_id_type = None for row in cursor.fetchall(): diff --git a/archivebox/core/migrations/0024_assign_default_crawl.py b/archivebox/core/migrations/0024_assign_default_crawl.py index ddd3c87b..fc435608 100644 --- a/archivebox/core/migrations/0024_assign_default_crawl.py +++ b/archivebox/core/migrations/0024_assign_default_crawl.py @@ -2,7 +2,6 @@ # Creates a default crawl for v0.7.2 migrated snapshots and makes crawl_id NOT NULL from django.db import migrations, models -import uuid def create_default_crawl_and_assign_snapshots(apps, schema_editor): diff --git a/archivebox/core/migrations/0027_copy_archiveresult_to_process.py b/archivebox/core/migrations/0027_copy_archiveresult_to_process.py index 8ac9d889..a26caa10 100644 --- a/archivebox/core/migrations/0027_copy_archiveresult_to_process.py +++ b/archivebox/core/migrations/0027_copy_archiveresult_to_process.py @@ -347,7 +347,7 @@ def copy_archiveresult_data_to_process(apps, schema_editor): migrated_count += 1 if i == 0: - print(f'DEBUG 0027: Linked ArchiveResult to Process') + print('DEBUG 0027: Linked ArchiveResult to Process') except Exception as e: print(f'✗ Error migrating ArchiveResult {ar_id}: {e}') diff --git a/archivebox/core/models.py b/archivebox/core/models.py index f9c6cc5f..8a6dac92 100755 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -1,6 +1,6 @@ __package__ = 'archivebox.core' -from typing import Optional, Dict, Iterable, Any, List, TYPE_CHECKING +from typing import Optional, Dict, Iterable, Any, List from archivebox.uuid_compat import uuid7 from datetime import datetime, timedelta from django_stubs_ext.db.models import TypedModelMeta @@ -12,19 +12,18 @@ from pathlib import Path from statemachine import State, registry from django.db import models -from django.db.models import QuerySet, Value, Case, When, IntegerField +from django.db.models import QuerySet from django.utils.functional import cached_property from django.utils.text import slugify from django.utils import timezone from django.core.cache import cache -from django.urls import reverse, reverse_lazy +from django.urls import reverse_lazy from django.contrib import admin from django.conf import settings from archivebox.config import CONSTANTS from archivebox.misc.system import get_dir_size, atomic_write -from archivebox.misc.util import parse_date, base_url, domain as url_domain, to_json, ts_to_date_str, urlencode, htmlencode, urldecode -from archivebox.misc.hashing import get_dir_info +from archivebox.misc.util import parse_date, domain as url_domain, to_json, ts_to_date_str, urlencode, htmlencode, urldecode from archivebox.hooks import ( get_plugins, get_plugin_name, get_plugin_icon, ) @@ -186,7 +185,7 @@ class SnapshotQuerySet(models.QuerySet): for pattern in patterns: try: qsearch |= query_search_index(pattern) - except: + except BaseException: raise SystemExit(2) return self.all() & qsearch @@ -344,8 +343,6 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea @property def process_set(self): """Get all Process objects related to this snapshot's ArchiveResults.""" - import json - import json from archivebox.machine.models import Process return Process.objects.filter(archiveresult__snapshot_id=self.id) @@ -458,13 +455,13 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea if not old_dir.exists() or old_dir == new_dir: # No migration needed - print(f"[DEBUG _fs_migrate] Returning None (early return)") + print("[DEBUG _fs_migrate] Returning None (early return)") return None if new_dir.exists(): # New directory already exists (files already copied), but we still need cleanup # Return cleanup info so old directory can be cleaned up - print(f"[DEBUG _fs_migrate] Returning cleanup info (new_dir exists)") + print("[DEBUG _fs_migrate] Returning cleanup info (new_dir exists)") return (old_dir, new_dir) new_dir.mkdir(parents=True, exist_ok=True) @@ -499,7 +496,6 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea # Schedule cleanup AFTER transaction commits successfully # This ensures DB changes are committed before we delete old files - from django.db import transaction transaction.on_commit(lambda: self._cleanup_old_migration_dir(old_dir, new_dir)) # Return cleanup info for manual cleanup if needed (when called directly) @@ -594,8 +590,8 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea domain = self.extract_domain_from_url(self.url) return ( - CONSTANTS.DATA_DIR / 'users' / username / 'snapshots' / - date_str / domain / str(self.id) + CONSTANTS.DATA_DIR / 'users' / username / 'snapshots' + / date_str / domain / str(self.id) ) else: # Unknown version - use current @@ -670,7 +666,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea print(f"[DEBUG load_from_directory] Found via fuzzy match: {snapshot.timestamp}") return snapshot elif candidates.count() > 1: - print(f"[DEBUG load_from_directory] Multiple fuzzy matches, using first") + print("[DEBUG load_from_directory] Multiple fuzzy matches, using first") return candidates.first() print(f"[DEBUG load_from_directory] NOT FOUND (fuzzy): {url} @ {timestamp}") return None @@ -767,7 +763,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea ts_int = int(float(ts)) # 1995-01-01 to 2035-12-31 return 788918400 <= ts_int <= 2082758400 - except: + except (TypeError, ValueError, OverflowError): return False index_valid = is_valid_timestamp(index_timestamp) if index_timestamp else False @@ -850,7 +846,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea try: with open(json_path) as f: index_data = json.load(f) - except: + except (OSError, TypeError, ValueError, json.JSONDecodeError): pass # Merge title @@ -929,7 +925,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea if result_data.get('start_ts'): try: start_ts = parser.parse(result_data['start_ts']) - except: + except (TypeError, ValueError, OverflowError): pass if (plugin, start_ts) in existing: @@ -940,7 +936,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea if result_data.get('end_ts'): try: end_ts = parser.parse(result_data['end_ts']) - except: + except (TypeError, ValueError, OverflowError): pass # Support both 'output' (legacy) and 'output_str' (new JSONL) field names @@ -957,7 +953,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea start_ts=start_ts, end_ts=end_ts, ) - except: + except Exception: pass def write_index_json(self): @@ -1176,7 +1172,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea try: shutil.move(str(snapshot_dir), str(dest)) - except: + except Exception: pass @classmethod @@ -1208,7 +1204,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea try: cls._merge_snapshots(snapshots) merged += 1 - except: + except Exception: pass return merged @@ -1244,7 +1240,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea try: shutil.rmtree(dup_dir) - except: + except Exception: pass # Merge tags @@ -1615,7 +1611,6 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea """ import re from django.utils import timezone - from archivebox.misc.util import parse_date from archivebox.base_models.models import get_or_create_system_user_pk from archivebox.config.common import GENERAL_CONFIG @@ -2125,7 +2120,6 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea def to_dict(self, extended: bool = False) -> Dict[str, Any]: """Convert Snapshot to a dictionary (replacement for Link._asdict())""" - from archivebox.misc.util import ts_to_date_str from archivebox.core.host_utils import build_snapshot_url result = { @@ -2283,9 +2277,9 @@ class SnapshotMachine(BaseStateMachine): # Tick Event (polled by workers) tick = ( - queued.to.itself(unless='can_start') | - queued.to(started, cond='can_start') | - started.to(sealed, cond='is_finished') + queued.to.itself(unless='can_start') + | queued.to(started, cond='can_start') + | started.to(sealed, cond='is_finished') ) # Manual event (can also be triggered by last ArchiveResult finishing) @@ -2783,7 +2777,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi Updates status/output fields, queues discovered URLs, and triggers indexing. """ from django.utils import timezone - from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR, run_hook, is_background_hook + from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR, run_hook from archivebox.config.configset import get_config # Get merged config with proper context @@ -3190,16 +3184,16 @@ class ArchiveResultMachine(BaseStateMachine): # queued → skipped (if exceeded max attempts) # started → backoff → started (retry) tick = ( - queued.to(skipped, cond='is_exceeded_max_attempts') | # Check skip first - queued.to.itself(unless='can_start') | - queued.to(started, cond='can_start') | - started.to(succeeded, cond='is_succeeded') | - started.to(failed, cond='is_failed') | - started.to(skipped, cond='is_skipped') | - started.to(backoff, cond='is_backoff') | - backoff.to(skipped, cond='is_exceeded_max_attempts') | # Check skip from backoff too - backoff.to.itself(unless='can_start') | - backoff.to(started, cond='can_start') + queued.to(skipped, cond='is_exceeded_max_attempts') # Check skip first + | queued.to.itself(unless='can_start') + | queued.to(started, cond='can_start') + | started.to(succeeded, cond='is_succeeded') + | started.to(failed, cond='is_failed') + | started.to(skipped, cond='is_skipped') + | started.to(backoff, cond='is_backoff') + | backoff.to(skipped, cond='is_exceeded_max_attempts') # Check skip from backoff too + | backoff.to.itself(unless='can_start') + | backoff.to(started, cond='can_start') # Removed redundant transitions: backoff.to(succeeded/failed/skipped) # Reason: backoff should always retry→started, then started→final states ) @@ -3241,8 +3235,8 @@ class ArchiveResultMachine(BaseStateMachine): """Check if we should backoff and retry later.""" # Backoff if status is still started (plugin didn't complete) and output_str is empty return ( - self.archiveresult.status == ArchiveResult.StatusChoices.STARTED and - not self.archiveresult.output_str + self.archiveresult.status == ArchiveResult.StatusChoices.STARTED + and not self.archiveresult.output_str ) def is_finished(self) -> bool: @@ -3286,7 +3280,6 @@ class ArchiveResultMachine(BaseStateMachine): @started.enter def enter_started(self): - from archivebox.machine.models import NetworkInterface # Update Process with network interface if self.archiveresult.process_id: diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py index 2dec9a03..ff1127bd 100644 --- a/archivebox/core/settings.py +++ b/archivebox/core/settings.py @@ -6,6 +6,7 @@ import inspect from pathlib import Path +from django.conf.locale.en import formats as en_formats # type: ignore from django.utils.crypto import get_random_string import archivebox @@ -13,6 +14,7 @@ import archivebox from archivebox.config import DATA_DIR, PACKAGE_DIR, ARCHIVE_DIR, CONSTANTS # noqa from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG, STORAGE_CONFIG # noqa from archivebox.core.host_utils import normalize_base_url, get_admin_base_url, get_api_base_url +from .settings_logging import SETTINGS_LOGGING IS_MIGRATING = "makemigrations" in sys.argv[:3] or "migrate" in sys.argv[:3] @@ -54,8 +56,8 @@ INSTALLED_APPS = [ "django.contrib.staticfiles", "django.contrib.admin", # 3rd-party apps from PyPI - "signal_webhooks", # handles REST API outbound webhooks https://github.com/MrThearMan/django-signal-webhooks - "django_object_actions", # provides easy Django Admin action buttons on change views https://github.com/crccheck/django-object-actions + "signal_webhooks", # handles REST API outbound webhooks + "django_object_actions", # provides easy Django Admin action buttons on change views # Our ArchiveBox-provided apps (use fully qualified names) # NOTE: Order matters! Apps with migrations that depend on other apps must come AFTER their dependencies # "archivebox.config", # ArchiveBox config settings (no models, not a real Django app) @@ -117,7 +119,6 @@ try: try: # Try to import django-auth-ldap (will fail if not installed) - import django_auth_ldap from django_auth_ldap.config import LDAPSearch import ldap @@ -414,9 +415,6 @@ DATETIME_FORMAT = "Y-m-d h:i:s A" SHORT_DATETIME_FORMAT = "Y-m-d h:i:s A" TIME_ZONE = CONSTANTS.TIMEZONE # django convention is TIME_ZONE, archivebox config uses TIMEZONE, they are equivalent - -from django.conf.locale.en import formats as en_formats # type: ignore - en_formats.DATETIME_FORMAT = DATETIME_FORMAT # monkey patch en_format default with our preferred format en_formats.SHORT_DATETIME_FORMAT = SHORT_DATETIME_FORMAT @@ -425,9 +423,6 @@ en_formats.SHORT_DATETIME_FORMAT = SHORT_DATETIME_FORMAT ### Logging Settings ################################################################################ - -from .settings_logging import SETTINGS_LOGGING, LOGS_DIR, ERROR_LOG - LOGGING = SETTINGS_LOGGING diff --git a/archivebox/core/settings_logging.py b/archivebox/core/settings_logging.py index 6c2cfd52..0d3a2dd5 100644 --- a/archivebox/core/settings_logging.py +++ b/archivebox/core/settings_logging.py @@ -5,8 +5,6 @@ import os import tempfile import logging -import pydantic -import django.template from archivebox.config import CONSTANTS diff --git a/archivebox/core/tests.py b/archivebox/core/tests.py index 56060ae6..6690cefb 100644 --- a/archivebox/core/tests.py +++ b/archivebox/core/tests.py @@ -1,5 +1,6 @@ """Tests for the core views, especially AddView.""" +import importlib import os import django from unittest.mock import patch @@ -8,13 +9,14 @@ from unittest.mock import patch os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings') django.setup() -from django.test import TestCase, Client -from django.contrib.auth.models import User -from django.urls import reverse - -from archivebox.crawls.models import Crawl, CrawlSchedule -from archivebox.core.models import Tag -from archivebox.config.common import SERVER_CONFIG +TestCase = importlib.import_module('django.test').TestCase +Client = importlib.import_module('django.test').Client +User = importlib.import_module('django.contrib.auth.models').User +reverse = importlib.import_module('django.urls').reverse +Crawl = importlib.import_module('archivebox.crawls.models').Crawl +CrawlSchedule = importlib.import_module('archivebox.crawls.models').CrawlSchedule +Tag = importlib.import_module('archivebox.core.models').Tag +SERVER_CONFIG = importlib.import_module('archivebox.config.common').SERVER_CONFIG class AddViewTests(TestCase): @@ -252,7 +254,7 @@ class AddViewTests(TestCase): def test_add_staff_admin_custom_config_is_allowed(self): """Admin users can override crawl config.""" self.client.logout() - admin_user = User.objects.create_user( + User.objects.create_user( username='adminuser', password='adminpass123', email='admin@example.com', diff --git a/archivebox/core/views.py b/archivebox/core/views.py index fb7fabe7..3bc903e2 100644 --- a/archivebox/core/views.py +++ b/archivebox/core/views.py @@ -10,7 +10,7 @@ from pathlib import Path from urllib.parse import urlparse from django.shortcuts import render, redirect -from django.http import HttpRequest, HttpResponse, Http404, HttpResponseForbidden +from django.http import JsonResponse, HttpRequest, HttpResponse, Http404, HttpResponseForbidden from django.utils.html import format_html, mark_safe from django.views import View from django.views.generic.list import ListView @@ -24,9 +24,8 @@ from django.utils.decorators import method_decorator from admin_data_views.typing import TableContext, ItemContext from admin_data_views.utils import render_with_table_view, render_with_item_view, ItemLink -import archivebox from archivebox.config import CONSTANTS, CONSTANTS_CONFIG, DATA_DIR, VERSION -from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG, ARCHIVING_CONFIG +from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG from archivebox.config.configset import get_flat_config, get_config, get_all_configs from archivebox.misc.util import base_url, htmlencode, ts_to_date_str, urldecode from archivebox.misc.serve_static import serve_static_with_byterange_support @@ -35,6 +34,9 @@ from archivebox.search import query_search_index from archivebox.core.models import Snapshot from archivebox.core.host_utils import build_snapshot_url +from archivebox.core.forms import AddLinkForm +from archivebox.crawls.models import Crawl +from archivebox.hooks import get_enabled_plugins, get_plugin_name def _files_index_target(snapshot: Snapshot, archivefile: str | None) -> str: @@ -49,12 +51,6 @@ def _files_index_target(snapshot: Snapshot, archivefile: str | None) -> str: return target -from archivebox.core.forms import AddLinkForm -from archivebox.crawls.models import Crawl -from archivebox.hooks import get_enabled_plugins, get_plugin_name - - - class HomepageView(View): def get(self, request): if request.user.is_authenticated: @@ -1066,10 +1062,6 @@ class HealthCheckView(View): status=200 ) - -import json -from django.http import JsonResponse - def live_progress_view(request): """Simple JSON endpoint for live progress status - used by admin progress monitor.""" try: @@ -1077,7 +1069,6 @@ def live_progress_view(request): from archivebox.crawls.models import Crawl from archivebox.core.models import Snapshot, ArchiveResult from archivebox.machine.models import Process, Machine - from django.db.models import Case, When, Value, IntegerField # Get orchestrator status orchestrator_running = Orchestrator.is_running() @@ -1133,7 +1124,6 @@ def live_progress_view(request): }) # Build hierarchical active crawls with nested snapshots and archive results - from django.db.models import Prefetch running_workers = Process.objects.filter( machine=machine, @@ -1387,7 +1377,7 @@ def find_config_default(key: str) -> str: return default_val def find_config_type(key: str) -> str: - from typing import get_type_hints, ClassVar + from typing import ClassVar CONFIGS = get_all_configs() for config in CONFIGS.values(): @@ -1430,7 +1420,6 @@ def key_is_safe(key: str) -> bool: def find_config_source(key: str, merged_config: dict) -> str: """Determine where a config value comes from.""" - import os from archivebox.machine.models import Machine # Check if it's from archivebox.machine.config @@ -1464,12 +1453,11 @@ def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext: # Get merged config that includes Machine.config overrides try: from archivebox.machine.models import Machine - machine = Machine.current() + Machine.current() merged_config = get_config() - except Exception as e: + except Exception: # Fallback if Machine model not available merged_config = get_config() - machine = None rows = { "Section": [], @@ -1525,7 +1513,6 @@ def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext: @render_with_item_view def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemContext: - import os from archivebox.machine.models import Machine from archivebox.config.configset import BaseConfigSet diff --git a/archivebox/core/widgets.py b/archivebox/core/widgets.py index bbbceaa7..1fbefd0c 100644 --- a/archivebox/core/widgets.py +++ b/archivebox/core/widgets.py @@ -343,20 +343,17 @@ class InlineTagEditorWidget(TagEditorWidget): snapshot_id = snapshot_id or self.snapshot_id # Parse value to get list of tag dicts with id and name - tags = [] tag_data = [] if value: if hasattr(value, 'all'): # QuerySet for tag in value.all(): tag_data.append({'id': tag.pk, 'name': tag.name}) tag_data.sort(key=lambda x: x['name'].lower()) - tags = [t['name'] for t in tag_data] elif isinstance(value, (list, tuple)): if value and hasattr(value[0], 'name'): for tag in value: tag_data.append({'id': tag.pk, 'name': tag.name}) tag_data.sort(key=lambda x: x['name'].lower()) - tags = [t['name'] for t in tag_data] widget_id_raw = f"inline_tags_{snapshot_id}" if snapshot_id else (attrs.get('id', name) if attrs else name) widget_id = self._normalize_id(widget_id_raw) diff --git a/archivebox/core/wsgi.py b/archivebox/core/wsgi.py index aa26ad94..00d224ea 100644 --- a/archivebox/core/wsgi.py +++ b/archivebox/core/wsgi.py @@ -9,9 +9,8 @@ https://docs.djangoproject.com/en/2.1/howto/deployment/wsgi/ import archivebox # noqa from archivebox.config.django import setup_django +from django.core.wsgi import get_wsgi_application setup_django(in_memory_db=False, check_db=True) -from django.core.wsgi import get_wsgi_application - application = get_wsgi_application() diff --git a/archivebox/crawls/admin.py b/archivebox/crawls/admin.py index 01b18375..0539c6e0 100644 --- a/archivebox/crawls/admin.py +++ b/archivebox/crawls/admin.py @@ -1,17 +1,11 @@ __package__ = 'archivebox.crawls' -import json -from pathlib import Path from django import forms from django.utils.html import format_html, format_html_join, mark_safe from django.contrib import admin, messages -from django.urls import path -from django.http import JsonResponse -from django.views.decorators.http import require_POST from django.db.models import Count, Q -from archivebox import DATA_DIR from django_object_actions import action diff --git a/archivebox/crawls/models.py b/archivebox/crawls/models.py index d7d54d64..7417ee4b 100755 --- a/archivebox/crawls/models.py +++ b/archivebox/crawls/models.py @@ -1,12 +1,11 @@ __package__ = 'archivebox.crawls' -from typing import TYPE_CHECKING, Iterable +from typing import TYPE_CHECKING from datetime import timedelta from archivebox.uuid_compat import uuid7 from pathlib import Path from django.db import models -from django.db.models import QuerySet from django.core.validators import MaxValueValidator, MinValueValidator from django.conf import settings from django.urls import reverse_lazy @@ -15,13 +14,12 @@ from django_stubs_ext.db.models import TypedModelMeta from statemachine import State, registry from rich import print -from archivebox.config import CONSTANTS from archivebox.base_models.models import ModelWithUUID, ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, get_or_create_system_user_pk from archivebox.workers.models import ModelWithStateMachine, BaseStateMachine from archivebox.crawls.schedule_utils import next_run_for_schedule, validate_schedule if TYPE_CHECKING: - from archivebox.core.models import Snapshot, ArchiveResult + from archivebox.core.models import Snapshot class CrawlSchedule(ModelWithUUID, ModelWithNotes): @@ -111,7 +109,6 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith label = models.CharField(max_length=64, blank=True, null=False, default='') notes = models.TextField(blank=True, null=False, default='') schedule = models.ForeignKey(CrawlSchedule, on_delete=models.SET_NULL, null=True, blank=True, editable=True) - output_dir = models.CharField(max_length=512, null=False, blank=True, default='') status = ModelWithStateMachine.StatusField(choices=ModelWithStateMachine.StatusChoices, default=ModelWithStateMachine.StatusChoices.QUEUED) retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now) @@ -252,6 +249,22 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith return system_url return None + def resolve_persona(self): + from archivebox.personas.models import Persona + + if self.persona_id: + persona = Persona.objects.filter(id=self.persona_id).first() + if persona is None: + raise Persona.DoesNotExist(f'Crawl {self.id} references missing Persona {self.persona_id}') + return persona + + default_persona_name = str((self.config or {}).get('DEFAULT_PERSONA') or '').strip() + if default_persona_name: + persona, _ = Persona.objects.get_or_create(name=default_persona_name or 'Default') + return persona + + return None + def add_url(self, entry: dict) -> bool: """ @@ -391,7 +404,10 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith f.flush() def get_runtime_config(): - return get_config(crawl=self) + config = get_config(crawl=self) + if persona_runtime_overrides: + config.update(persona_runtime_overrides) + return config system_task = self.get_system_task() if system_task == 'archivebox://update': @@ -402,6 +418,15 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith machine = Machine.current() declared_binary_names: set[str] = set() + persona_runtime_overrides: dict[str, str] = {} + persona = self.resolve_persona() + if persona: + base_runtime_config = get_config(crawl=self, persona=persona) + chrome_binary = str(base_runtime_config.get('CHROME_BINARY') or '') + persona_runtime_overrides = persona.prepare_runtime_for_crawl( + crawl=self, + chrome_binary=chrome_binary, + ) def install_declared_binaries(binary_names: set[str]) -> None: if not binary_names: @@ -563,7 +588,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith # Discover and run on_Crawl hooks with open(debug_log, 'a') as f: - f.write(f'Discovering Crawl hooks...\n') + f.write('Discovering Crawl hooks...\n') f.flush() hooks = discover_hooks('Crawl', config=get_runtime_config()) with open(debug_log, 'a') as f: @@ -588,17 +613,17 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith print(f'[yellow]⚠️ Removed {leaked_count} leaked snapshot(s) created during system crawl {system_task}[/yellow]') with open(debug_log, 'a') as f: f.write(f'Skipping snapshot creation for system crawl: {system_task}\n') - f.write(f'=== Crawl.run() complete ===\n\n') + f.write('=== Crawl.run() complete ===\n\n') f.flush() return None with open(debug_log, 'a') as f: - f.write(f'Creating snapshots from URLs...\n') + f.write('Creating snapshots from URLs...\n') f.flush() created_snapshots = self.create_snapshots_from_urls() with open(debug_log, 'a') as f: f.write(f'Created {len(created_snapshots)} snapshots\n') - f.write(f'=== Crawl.run() complete ===\n\n') + f.write('=== Crawl.run() complete ===\n\n') f.flush() # Return first snapshot for this crawl (newly created or existing) @@ -647,6 +672,10 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith for pid_file in self.output_dir.glob('**/*.pid'): pid_file.unlink(missing_ok=True) + persona = self.resolve_persona() + if persona: + persona.cleanup_runtime_for_crawl(self) + # Run on_CrawlEnd hooks from archivebox.config.configset import get_config config = get_config(crawl=self) @@ -715,9 +744,9 @@ class CrawlMachine(BaseStateMachine): # Tick Event (polled by workers) tick = ( - queued.to.itself(unless='can_start') | - queued.to(started, cond='can_start') | - started.to(sealed, cond='is_finished') + queued.to.itself(unless='can_start') + | queued.to(started, cond='can_start') + | started.to(sealed, cond='is_finished') ) # Manual event (triggered by last Snapshot sealing) @@ -740,7 +769,6 @@ class CrawlMachine(BaseStateMachine): @started.enter def enter_started(self): import sys - from archivebox.core.models import Snapshot print(f'[cyan]🔄 CrawlMachine.enter_started() - creating snapshots for {self.crawl.id}[/cyan]', file=sys.stderr) @@ -758,7 +786,7 @@ class CrawlMachine(BaseStateMachine): ) else: # No snapshots (system crawl like archivebox://install) - print(f'[cyan]🔄 No snapshots created, sealing crawl immediately[/cyan]', file=sys.stderr) + print('[cyan]🔄 No snapshots created, sealing crawl immediately[/cyan]', file=sys.stderr) # Seal immediately since there's no work to do self.seal() diff --git a/archivebox/hooks.py b/archivebox/hooks.py index 84112390..962bc200 100644 --- a/archivebox/hooks.py +++ b/archivebox/hooks.py @@ -56,16 +56,18 @@ __package__ = 'archivebox' import os import json -import time from functools import lru_cache from pathlib import Path -from typing import List, Dict, Any, Optional, TypedDict +from typing import TYPE_CHECKING, List, Dict, Any, Optional, TypedDict from abx_plugins import get_plugins_dir from django.conf import settings from django.utils.safestring import mark_safe from archivebox.config.constants import CONSTANTS +if TYPE_CHECKING: + from archivebox.machine.models import Process + # Plugin directories BUILTIN_PLUGINS_DIR = Path(get_plugins_dir()).resolve() @@ -266,9 +268,7 @@ def run_hook( """ from archivebox.machine.models import Process, Machine from archivebox.config.constants import CONSTANTS - import time import sys - start_time = time.time() # Auto-detect timeout from plugin config if not explicitly provided if timeout is None: diff --git a/archivebox/ldap/auth.py b/archivebox/ldap/auth.py index 3958ff09..aa7fc651 100644 --- a/archivebox/ldap/auth.py +++ b/archivebox/ldap/auth.py @@ -9,7 +9,6 @@ __package__ = "archivebox.ldap" from typing import TYPE_CHECKING if TYPE_CHECKING: - from django.contrib.auth.models import User from django_auth_ldap.backend import LDAPBackend as BaseLDAPBackend else: try: diff --git a/archivebox/machine/models.py b/archivebox/machine/models.py index f92ac02b..f3a0f0da 100755 --- a/archivebox/machine/models.py +++ b/archivebox/machine/models.py @@ -10,6 +10,7 @@ from datetime import timedelta, datetime from statemachine import State, registry from django.db import models +from django.db.models import QuerySet from django.utils import timezone from django.utils.functional import cached_property @@ -197,7 +198,6 @@ class NetworkInterface(ModelWithHealthStats): class BinaryManager(models.Manager): def get_from_db_or_cache(self, name: str, abspath: str = '', version: str = '', sha256: str = '', binprovider: str = 'env') -> 'Binary': """Get or create an Binary record from the database or cache.""" - global _CURRENT_BINARIES cached = _CURRENT_BINARIES.get(name) if cached and timezone.now() < cached.modified_at + timedelta(seconds=BINARY_RECHECK_INTERVAL): return cached @@ -583,7 +583,6 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine): Called by state machine if needed (not typically used for binaries since installations are foreground, but included for consistency). """ - from pathlib import Path # Kill any background binary installation hooks using Process records # (rarely used since binary installations are typically foreground) @@ -1026,9 +1025,11 @@ class Process(models.Model): # Check cache validity if _CURRENT_PROCESS: # Verify: same PID, same machine, cache not expired - if (_CURRENT_PROCESS.pid == current_pid and - _CURRENT_PROCESS.machine_id == machine.id and - timezone.now() < _CURRENT_PROCESS.modified_at + timedelta(seconds=PROCESS_RECHECK_INTERVAL)): + if ( + _CURRENT_PROCESS.pid == current_pid + and _CURRENT_PROCESS.machine_id == machine.id + and timezone.now() < _CURRENT_PROCESS.modified_at + timedelta(seconds=PROCESS_RECHECK_INTERVAL) + ): _CURRENT_PROCESS.ensure_log_files() return _CURRENT_PROCESS _CURRENT_PROCESS = None @@ -1111,7 +1112,6 @@ class Process(models.Model): machine = machine or Machine.current() # Debug logging - import sys # print(f"DEBUG _find_parent_process: my_pid={os.getpid()}, ppid={ppid}", file=sys.stderr) # Get parent process start time from OS @@ -1630,7 +1630,6 @@ class Process(models.Model): self (updated with pid, started_at, etc.) """ import subprocess - import time # Validate pwd is set (required for output files) if not self.pwd: @@ -1846,7 +1845,6 @@ class Process(models.Model): Returns: True if process was terminated, False if already dead """ - import time import signal proc = self.proc @@ -2199,8 +2197,8 @@ class BinaryMachine(BaseStateMachine): # Tick Event - install happens during transition tick = ( - queued.to.itself(unless='can_install') | - queued.to(installed, cond='can_install', on='on_install') + queued.to.itself(unless='can_install') + | queued.to(installed, cond='can_install', on='on_install') ) def can_install(self) -> bool: @@ -2303,10 +2301,10 @@ class ProcessMachine(BaseStateMachine): # Tick Event - transitions based on conditions tick = ( - queued.to.itself(unless='can_start') | - queued.to(running, cond='can_start') | - running.to.itself(unless='is_exited') | - running.to(exited, cond='is_exited') + queued.to.itself(unless='can_start') + | queued.to(running, cond='can_start') + | running.to.itself(unless='is_exited') + | running.to(exited, cond='is_exited') ) # Additional events (for explicit control) diff --git a/archivebox/machine/tests/test_machine_models.py b/archivebox/machine/tests/test_machine_models.py index 6a1d4514..983770d4 100644 --- a/archivebox/machine/tests/test_machine_models.py +++ b/archivebox/machine/tests/test_machine_models.py @@ -12,8 +12,6 @@ Tests cover: """ import os -import sys -from pathlib import Path from datetime import timedelta from unittest.mock import patch @@ -29,7 +27,6 @@ from archivebox.machine.models import ( BinaryMachine, ProcessMachine, MACHINE_RECHECK_INTERVAL, - PROCESS_RECHECK_INTERVAL, PID_REUSE_WINDOW, ) @@ -323,7 +320,6 @@ class TestProcessModel(TestCase): def test_process_update_and_requeue(self): """Process.update_and_requeue() should update fields and save.""" process = Process.objects.create(machine=self.machine, cmd=['test']) - old_modified = process.modified_at process.update_and_requeue( status=Process.StatusChoices.RUNNING, diff --git a/archivebox/mcp/server.py b/archivebox/mcp/server.py index a8abf996..025c3eee 100644 --- a/archivebox/mcp/server.py +++ b/archivebox/mcp/server.py @@ -1,5 +1,3 @@ -__package__ = 'archivebox.mcp' - """ Model Context Protocol (MCP) server implementation for ArchiveBox. @@ -10,9 +8,7 @@ Click command metadata. Handles JSON-RPC 2.0 requests over stdio transport. import sys import json import traceback -from typing import Any, Dict, List, Optional -from io import StringIO -from contextlib import redirect_stdout, redirect_stderr +from typing import Optional import click from click.testing import CliRunner diff --git a/archivebox/misc/checks.py b/archivebox/misc/checks.py index bf97e838..91d4c081 100644 --- a/archivebox/misc/checks.py +++ b/archivebox/misc/checks.py @@ -225,7 +225,6 @@ def check_tmp_dir(tmp_dir=None, throw=False, quiet=False, must_exist=True): def check_lib_dir(lib_dir: Path | None = None, throw=False, quiet=False, must_exist=True): - import archivebox from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP from archivebox.misc.logging import STDERR from archivebox.misc.logging_util import pretty_path diff --git a/archivebox/misc/folders.py b/archivebox/misc/folders.py index dd134dc1..dd8bbc1f 100644 --- a/archivebox/misc/folders.py +++ b/archivebox/misc/folders.py @@ -35,7 +35,6 @@ def fix_invalid_folder_locations(out_dir: Path = DATA_DIR) -> Tuple[List[str], L with open(index_path, 'r') as f: data = json.load(f) timestamp = data.get('timestamp') - url = data.get('url') except Exception: continue diff --git a/archivebox/misc/logging_util.py b/archivebox/misc/logging_util.py index 7e5b707c..c00071f6 100644 --- a/archivebox/misc/logging_util.py +++ b/archivebox/misc/logging_util.py @@ -21,13 +21,12 @@ if TYPE_CHECKING: from rich import print from rich.panel import Panel -from django.core.management.base import DjangoHelpFormatter from archivebox.config import CONSTANTS, DATA_DIR, VERSION from archivebox.config.common import SHELL_CONFIG from archivebox.misc.system import get_dir_size from archivebox.misc.util import enforce_types -from archivebox.misc.logging import ANSI, stderr +from archivebox.misc.logging import ANSI @dataclass class RuntimeStats: diff --git a/archivebox/misc/monkey_patches.py b/archivebox/misc/monkey_patches.py index 2bfb7924..9ee755c4 100644 --- a/archivebox/misc/monkey_patches.py +++ b/archivebox/misc/monkey_patches.py @@ -1,16 +1,18 @@ __package__ = 'archivebox' -import django -import pydantic +import datetime +import warnings + +import benedict +from daphne import access import django_stubs_ext +from django.utils import timezone django_stubs_ext.monkeypatch() # monkey patch django timezone to add back utc (it was removed in Django 5.0) -import datetime -from django.utils import timezone timezone.utc = datetime.timezone.utc # monkey patch django-signals-webhooks to change how it shows up in Admin UI @@ -26,12 +28,9 @@ timezone.utc = datetime.timezone.utc # Hide site-packages/sonic/client.py:115: SyntaxWarning # https://github.com/xmonader/python-sonic-client/pull/18 -import warnings # noqa warnings.filterwarnings("ignore", category=SyntaxWarning, module='sonic') # Make daphne log requests quieter and esier to read -from daphne import access # noqa - class ModifiedAccessLogGenerator(access.AccessLogGenerator): """Clutge workaround until daphne uses the Python logging framework. https://github.com/django/daphne/pull/473/files""" @@ -68,5 +67,4 @@ access.AccessLogGenerator.write_entry = ModifiedAccessLogGenerator.write_entry # # fix benedict objects to pretty-print/repr more nicely with rich # https://stackoverflow.com/a/79048811/2156113 # https://rich.readthedocs.io/en/stable/pretty.html#rich-repr-protocol -import benedict # noqa benedict.benedict.__rich_repr__ = lambda self: (dict(self),) # type: ignore diff --git a/archivebox/misc/progress_layout.py b/archivebox/misc/progress_layout.py index eb6fdb3a..1263856b 100644 --- a/archivebox/misc/progress_layout.py +++ b/archivebox/misc/progress_layout.py @@ -135,7 +135,6 @@ class ProcessLogPanel: if line: log_lines.append(Text(line, style="cyan")) - compact = self.compact if self.compact is not None else self._is_background_hook() max_body = max(1, self.max_lines - len(header_lines)) if not log_lines: log_lines = [] diff --git a/archivebox/misc/system.py b/archivebox/misc/system.py index a1a55d9b..6804c210 100644 --- a/archivebox/misc/system.py +++ b/archivebox/misc/system.py @@ -4,10 +4,11 @@ __package__ = 'archivebox.misc' import os import signal import shutil +import sys from json import dump from pathlib import Path -from typing import Optional, Union, Set, Tuple +from typing import Optional, Union, Tuple from subprocess import _mswindows, PIPE, Popen, CalledProcessError, CompletedProcess, TimeoutExpired from atomicwrites import atomic_write as lib_atomic_write @@ -58,7 +59,7 @@ def run(cmd, *args, input=None, capture_output=True, timeout=None, check=False, # far into the TimeoutExpired exception. process.wait() raise - except: # Including KeyboardInterrupt, communicate handled that. + except BaseException: # Including KeyboardInterrupt, communicate handled that. process.kill() # We don't call process.wait() as .__exit__ does that for us. raise diff --git a/archivebox/personas/admin.py b/archivebox/personas/admin.py index 8c38f3f3..b97a94f6 100644 --- a/archivebox/personas/admin.py +++ b/archivebox/personas/admin.py @@ -1,3 +1,2 @@ -from django.contrib import admin # Register your models here. diff --git a/archivebox/personas/models.py b/archivebox/personas/models.py index 4be5cfb3..ba30d587 100644 --- a/archivebox/personas/models.py +++ b/archivebox/personas/models.py @@ -11,8 +11,12 @@ Each persona has its own: __package__ = 'archivebox.personas' +import shutil +import subprocess +import sys +from contextlib import contextmanager from pathlib import Path -from typing import TYPE_CHECKING, Iterator +from typing import TYPE_CHECKING from django.db import models from django.conf import settings @@ -21,8 +25,32 @@ from django.utils import timezone from archivebox.base_models.models import ModelWithConfig, get_or_create_system_user_pk from archivebox.uuid_compat import uuid7 +try: + import fcntl +except ImportError: # pragma: no cover + fcntl = None + if TYPE_CHECKING: - from django.db.models import QuerySet + pass + + +VOLATILE_PROFILE_DIR_NAMES = { + 'Cache', + 'Code Cache', + 'GPUCache', + 'ShaderCache', + 'Service Worker', + 'GCM Store', + 'Crashpad', + 'BrowserMetrics', +} + +VOLATILE_PROFILE_FILE_NAMES = { + 'BrowserMetrics-spare.pma', + 'SingletonCookie', + 'SingletonLock', + 'SingletonSocket', +} class Persona(ModelWithConfig): @@ -120,37 +148,118 @@ class Persona(ModelWithConfig): (self.path / 'chrome_extensions').mkdir(parents=True, exist_ok=True) (self.path / 'chrome_downloads').mkdir(parents=True, exist_ok=True) - def cleanup_chrome(self) -> bool: - """ - Clean up Chrome state files (SingletonLock, etc.) for this persona. - - Returns: - True if cleanup was performed, False if no cleanup needed - """ + def cleanup_chrome_profile(self, profile_dir: Path) -> bool: + """Remove volatile Chrome state that should never be reused across launches.""" cleaned = False - chrome_dir = self.path / 'chrome_user_data' - if not chrome_dir.exists(): + if not profile_dir.exists(): return False - # Clean up SingletonLock files - for lock_file in chrome_dir.glob('**/SingletonLock'): - try: - lock_file.unlink() - cleaned = True - except OSError: - pass + for path in profile_dir.rglob('*'): + if path.name in VOLATILE_PROFILE_FILE_NAMES: + try: + path.unlink() + cleaned = True + except OSError: + pass - # Clean up SingletonSocket files - for socket_file in chrome_dir.glob('**/SingletonSocket'): + for dirname in VOLATILE_PROFILE_DIR_NAMES: + for path in profile_dir.rglob(dirname): + if not path.is_dir(): + continue + shutil.rmtree(path, ignore_errors=True) + cleaned = True + + for path in profile_dir.rglob('*.log'): try: - socket_file.unlink() + path.unlink() cleaned = True except OSError: pass return cleaned + def cleanup_chrome(self) -> bool: + """Clean up volatile Chrome state for this persona's base profile.""" + return self.cleanup_chrome_profile(self.path / 'chrome_user_data') + + @contextmanager + def lock_runtime_for_crawl(self): + lock_path = self.path / '.archivebox-crawl-profile.lock' + lock_path.parent.mkdir(parents=True, exist_ok=True) + + with lock_path.open('w') as lock_file: + if fcntl is not None: + fcntl.flock(lock_file.fileno(), fcntl.LOCK_EX) + try: + yield + finally: + if fcntl is not None: + fcntl.flock(lock_file.fileno(), fcntl.LOCK_UN) + + def runtime_root_for_crawl(self, crawl) -> Path: + return Path(crawl.output_dir) / '.persona' / self.name + + def runtime_profile_dir_for_crawl(self, crawl) -> Path: + return self.runtime_root_for_crawl(crawl) / 'chrome_user_data' + + def runtime_downloads_dir_for_crawl(self, crawl) -> Path: + return self.runtime_root_for_crawl(crawl) / 'chrome_downloads' + + def copy_chrome_profile(self, source_dir: Path, destination_dir: Path) -> None: + destination_dir.parent.mkdir(parents=True, exist_ok=True) + shutil.rmtree(destination_dir, ignore_errors=True) + destination_dir.mkdir(parents=True, exist_ok=True) + + copy_cmd: list[str] | None = None + source_contents = f'{source_dir}/.' + + if sys.platform == 'darwin': + copy_cmd = ['cp', '-cR', source_contents, str(destination_dir)] + elif sys.platform.startswith('linux'): + copy_cmd = ['cp', '-a', source_contents, str(destination_dir)] + + if copy_cmd: + result = subprocess.run(copy_cmd, capture_output=True, text=True) + if result.returncode == 0: + return + + shutil.rmtree(destination_dir, ignore_errors=True) + destination_dir.mkdir(parents=True, exist_ok=True) + + shutil.copytree(source_dir, destination_dir, symlinks=True, dirs_exist_ok=True) + + def prepare_runtime_for_crawl(self, crawl, chrome_binary: str = '') -> dict[str, str]: + self.ensure_dirs() + + template_dir = Path(self.CHROME_USER_DATA_DIR) + runtime_root = self.runtime_root_for_crawl(crawl) + runtime_profile_dir = self.runtime_profile_dir_for_crawl(crawl) + runtime_downloads_dir = self.runtime_downloads_dir_for_crawl(crawl) + + with self.lock_runtime_for_crawl(): + if not runtime_profile_dir.exists(): + if template_dir.exists() and any(template_dir.iterdir()): + self.copy_chrome_profile(template_dir, runtime_profile_dir) + else: + runtime_profile_dir.mkdir(parents=True, exist_ok=True) + + runtime_downloads_dir.mkdir(parents=True, exist_ok=True) + self.cleanup_chrome_profile(runtime_profile_dir) + + (runtime_root / 'persona_name.txt').write_text(self.name) + (runtime_root / 'template_dir.txt').write_text(str(template_dir)) + if chrome_binary: + (runtime_root / 'chrome_binary.txt').write_text(chrome_binary) + + return { + 'CHROME_USER_DATA_DIR': str(runtime_profile_dir), + 'CHROME_DOWNLOADS_DIR': str(runtime_downloads_dir), + } + + def cleanup_runtime_for_crawl(self, crawl) -> None: + shutil.rmtree(Path(crawl.output_dir) / '.persona', ignore_errors=True) + @classmethod def get_or_create_default(cls) -> 'Persona': """Get or create the Default persona.""" diff --git a/archivebox/personas/tests.py b/archivebox/personas/tests.py index 7ce503c2..49290204 100644 --- a/archivebox/personas/tests.py +++ b/archivebox/personas/tests.py @@ -1,3 +1,2 @@ -from django.test import TestCase # Create your tests here. diff --git a/archivebox/personas/views.py b/archivebox/personas/views.py index 91ea44a2..b8e4ee02 100644 --- a/archivebox/personas/views.py +++ b/archivebox/personas/views.py @@ -1,3 +1,2 @@ -from django.shortcuts import render # Create your views here. diff --git a/archivebox/search/__init__.py b/archivebox/search/__init__.py index b98f7f95..13ce44a1 100644 --- a/archivebox/search/__init__.py +++ b/archivebox/search/__init__.py @@ -14,7 +14,7 @@ Search backends must provide a search.py module with: __package__ = 'archivebox.search' -from typing import TYPE_CHECKING, Any, Optional +from typing import Any, Optional from django.db.models import QuerySet @@ -22,9 +22,6 @@ from archivebox.misc.util import enforce_types from archivebox.misc.logging import stderr from archivebox.config.common import SEARCH_BACKEND_CONFIG -if TYPE_CHECKING: - from archivebox.core.models import Snapshot - # Cache discovered backends to avoid repeated filesystem scans _search_backends_cache: Optional[dict] = None diff --git a/archivebox/tests/conftest.py b/archivebox/tests/conftest.py index 69740e16..28f58062 100644 --- a/archivebox/tests/conftest.py +++ b/archivebox/tests/conftest.py @@ -1,7 +1,6 @@ """archivebox/tests/conftest.py - Pytest fixtures for CLI tests.""" import os -import shutil import sys import subprocess import textwrap @@ -13,6 +12,8 @@ import pytest from archivebox.uuid_compat import uuid7 +pytest_plugins = ["archivebox.tests.fixtures"] + # ============================================================================= # CLI Helpers (defined before fixtures that use them) diff --git a/archivebox/tests/test_add.py b/archivebox/tests/test_add.py index 0fb4271a..39d423e3 100644 --- a/archivebox/tests/test_add.py +++ b/archivebox/tests/test_add.py @@ -1,9 +1,6 @@ -import subprocess -import json -import sqlite3 import os - -from .fixtures import * +import sqlite3 +import subprocess def test_depth_flag_is_accepted(process, disable_extractors_dict): arg_process = subprocess.run(["archivebox", "add", "--index-only", "https://example.com", "--depth=0"], @@ -31,7 +28,7 @@ def test_depth_flag_fails_if_it_is_not_0_or_1(process, disable_extractors_dict): def test_depth_flag_0_creates_source_file(tmp_path, process, disable_extractors_dict): os.chdir(tmp_path) - arg_process = subprocess.run( + subprocess.run( ["archivebox", "add", "--index-only", "--depth=0", "https://example.com"], capture_output=True, env=disable_extractors_dict, diff --git a/archivebox/tests/test_admin_views.py b/archivebox/tests/test_admin_views.py index 707822cb..c1bfb3bd 100644 --- a/archivebox/tests/test_admin_views.py +++ b/archivebox/tests/test_admin_views.py @@ -9,7 +9,7 @@ Tests cover: """ import pytest -from django.test import TestCase, Client, override_settings +from django.test import override_settings from django.urls import reverse from django.contrib.auth import get_user_model diff --git a/archivebox/tests/test_auth_ldap.py b/archivebox/tests/test_auth_ldap.py index 7b25f0cf..cec866c9 100644 --- a/archivebox/tests/test_auth_ldap.py +++ b/archivebox/tests/test_auth_ldap.py @@ -9,7 +9,7 @@ import os import sys import tempfile import unittest -from pathlib import Path +from importlib.util import find_spec class TestLDAPConfig(unittest.TestCase): @@ -100,13 +100,7 @@ class TestLDAPIntegration(unittest.TestCase): def test_django_settings_with_ldap_library_check(self): """Test that Django settings check for LDAP libraries when enabled.""" - # Try to import django-auth-ldap to see if it's available - try: - import django_auth_ldap - import ldap - ldap_available = True - except ImportError: - ldap_available = False + ldap_available = find_spec("django_auth_ldap") is not None and find_spec("ldap") is not None # If LDAP libraries are not available, settings should handle gracefully if not ldap_available: diff --git a/archivebox/tests/test_cli_add.py b/archivebox/tests/test_cli_add.py index 7d325e61..a34a4879 100644 --- a/archivebox/tests/test_cli_add.py +++ b/archivebox/tests/test_cli_add.py @@ -5,11 +5,8 @@ Verify add creates snapshots in DB, crawls, source files, and archive directorie """ import os -import subprocess import sqlite3 -from pathlib import Path - -from .fixtures import * +import subprocess def test_add_single_url_creates_snapshot_in_db(tmp_path, process, disable_extractors_dict): @@ -169,6 +166,30 @@ def test_add_with_tags(tmp_path, process, disable_extractors_dict): assert 'test' in tags_str or 'example' in tags_str +def test_add_records_selected_persona_on_crawl(tmp_path, process, disable_extractors_dict): + """Test add persists the selected persona so browser config derives from it later.""" + os.chdir(tmp_path) + result = subprocess.run( + ['archivebox', 'add', '--index-only', '--depth=0', '--persona=Default', 'https://example.com'], + capture_output=True, + env=disable_extractors_dict, + ) + + assert result.returncode == 0 + + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + persona_id, default_persona = c.execute( + "SELECT persona_id, json_extract(config, '$.DEFAULT_PERSONA') FROM crawls_crawl LIMIT 1" + ).fetchone() + conn.close() + + assert persona_id + assert default_persona == 'Default' + assert (tmp_path / "personas" / "Default" / "chrome_user_data").is_dir() + assert (tmp_path / "personas" / "Default" / "chrome_extensions").is_dir() + + def test_add_duplicate_url_creates_separate_crawls(tmp_path, process, disable_extractors_dict): """Test that adding the same URL twice creates separate crawls and snapshots. diff --git a/archivebox/tests/test_cli_archiveresult.py b/archivebox/tests/test_cli_archiveresult.py index de016010..ff884675 100644 --- a/archivebox/tests/test_cli_archiveresult.py +++ b/archivebox/tests/test_cli_archiveresult.py @@ -9,7 +9,6 @@ Tests cover: """ import json -import pytest from archivebox.tests.conftest import ( run_archivebox_cmd, diff --git a/archivebox/tests/test_cli_config.py b/archivebox/tests/test_cli_config.py index 87f7412c..351f14d0 100644 --- a/archivebox/tests/test_cli_config.py +++ b/archivebox/tests/test_cli_config.py @@ -6,9 +6,6 @@ Verify config reads/writes ArchiveBox.conf file correctly. import os import subprocess -from pathlib import Path - -from .fixtures import * def test_config_displays_all_config(tmp_path, process): diff --git a/archivebox/tests/test_cli_crawl.py b/archivebox/tests/test_cli_crawl.py index 891f4114..c641a842 100644 --- a/archivebox/tests/test_cli_crawl.py +++ b/archivebox/tests/test_cli_crawl.py @@ -9,14 +9,11 @@ Tests cover: """ import json -import pytest from archivebox.tests.conftest import ( run_archivebox_cmd, parse_jsonl_output, - assert_jsonl_contains_type, create_test_url, - create_test_crawl_json, ) diff --git a/archivebox/tests/test_cli_extract.py b/archivebox/tests/test_cli_extract.py index 19b0d834..f1980f6b 100644 --- a/archivebox/tests/test_cli_extract.py +++ b/archivebox/tests/test_cli_extract.py @@ -5,10 +5,8 @@ Verify extract re-runs extractors on existing snapshots. """ import os -import subprocess import sqlite3 - -from .fixtures import * +import subprocess def test_extract_runs_on_existing_snapshots(tmp_path, process, disable_extractors_dict): diff --git a/archivebox/tests/test_cli_help.py b/archivebox/tests/test_cli_help.py index ccf580b5..be4918dc 100644 --- a/archivebox/tests/test_cli_help.py +++ b/archivebox/tests/test_cli_help.py @@ -7,8 +7,6 @@ Verify command runs successfully and produces output. import os import subprocess -from .fixtures import * - def test_help_runs_successfully(tmp_path): """Test that help command runs and produces output.""" diff --git a/archivebox/tests/test_cli_init.py b/archivebox/tests/test_cli_init.py index 5761ce5b..e6ce1ef6 100644 --- a/archivebox/tests/test_cli_init.py +++ b/archivebox/tests/test_cli_init.py @@ -5,14 +5,11 @@ Verify init creates correct database schema, filesystem structure, and config. """ import os -import subprocess import sqlite3 -from pathlib import Path +import subprocess from archivebox.config.common import STORAGE_CONFIG -from .fixtures import * - DIR_PERMISSIONS = STORAGE_CONFIG.OUTPUT_PERMISSIONS.replace('6', '7').replace('4', '5') diff --git a/archivebox/tests/test_cli_install.py b/archivebox/tests/test_cli_install.py index d839772f..c7738468 100644 --- a/archivebox/tests/test_cli_install.py +++ b/archivebox/tests/test_cli_install.py @@ -5,12 +5,10 @@ Verify install detects and records binary dependencies in DB. """ import os -import subprocess import sqlite3 +import subprocess from pathlib import Path -from .fixtures import * - def test_install_runs_successfully(tmp_path, process): """Test that install command runs without error.""" diff --git a/archivebox/tests/test_cli_manage.py b/archivebox/tests/test_cli_manage.py index ada5e657..70555c44 100644 --- a/archivebox/tests/test_cli_manage.py +++ b/archivebox/tests/test_cli_manage.py @@ -6,9 +6,6 @@ Verify manage command runs Django management commands. import os import subprocess -import sqlite3 - -from .fixtures import * def test_manage_help_works(tmp_path, process): diff --git a/archivebox/tests/test_cli_remove.py b/archivebox/tests/test_cli_remove.py index 7fa66209..5558e576 100644 --- a/archivebox/tests/test_cli_remove.py +++ b/archivebox/tests/test_cli_remove.py @@ -5,11 +5,8 @@ Verify remove deletes snapshots from DB and filesystem. """ import os -import subprocess import sqlite3 -from pathlib import Path - -from .fixtures import * +import subprocess def test_remove_deletes_snapshot_from_db(tmp_path, process, disable_extractors_dict): diff --git a/archivebox/tests/test_cli_run.py b/archivebox/tests/test_cli_run.py index 5181ffd3..7d025b3a 100644 --- a/archivebox/tests/test_cli_run.py +++ b/archivebox/tests/test_cli_run.py @@ -8,7 +8,6 @@ Tests cover: """ import json -import pytest from archivebox.tests.conftest import ( run_archivebox_cmd, diff --git a/archivebox/tests/test_cli_run_binary_worker.py b/archivebox/tests/test_cli_run_binary_worker.py index b7d4fc71..7f509bcd 100644 --- a/archivebox/tests/test_cli_run_binary_worker.py +++ b/archivebox/tests/test_cli_run_binary_worker.py @@ -10,11 +10,9 @@ Tests cover: import json import sqlite3 -import time from archivebox.tests.conftest import ( run_archivebox_cmd, - parse_jsonl_output, ) diff --git a/archivebox/tests/test_cli_schedule.py b/archivebox/tests/test_cli_schedule.py index 47e32c98..82c1e0b7 100644 --- a/archivebox/tests/test_cli_schedule.py +++ b/archivebox/tests/test_cli_schedule.py @@ -5,7 +5,6 @@ import os import sqlite3 import subprocess -from .fixtures import process, disable_extractors_dict def test_schedule_run_all_enqueues_scheduled_crawl(tmp_path, process, disable_extractors_dict): diff --git a/archivebox/tests/test_cli_search.py b/archivebox/tests/test_cli_search.py index 1c567f42..7ae757fc 100644 --- a/archivebox/tests/test_cli_search.py +++ b/archivebox/tests/test_cli_search.py @@ -6,9 +6,6 @@ Verify search queries snapshots from DB. import os import subprocess -import sqlite3 - -from .fixtures import * def test_search_finds_snapshots(tmp_path, process, disable_extractors_dict): diff --git a/archivebox/tests/test_cli_server.py b/archivebox/tests/test_cli_server.py index 003119a3..7e31ac6c 100644 --- a/archivebox/tests/test_cli_server.py +++ b/archivebox/tests/test_cli_server.py @@ -6,10 +6,6 @@ Verify server can start (basic smoke tests only, no full server testing). import os import subprocess -import signal -import time - -from .fixtures import * def test_server_shows_usage_info(tmp_path, process): diff --git a/archivebox/tests/test_cli_shell.py b/archivebox/tests/test_cli_shell.py index 0c966c5d..818b9c5c 100644 --- a/archivebox/tests/test_cli_shell.py +++ b/archivebox/tests/test_cli_shell.py @@ -7,8 +7,6 @@ Verify shell command starts Django shell (basic smoke tests only). import os import subprocess -from .fixtures import * - def test_shell_command_exists(tmp_path, process): """Test that shell command is recognized.""" diff --git a/archivebox/tests/test_cli_snapshot.py b/archivebox/tests/test_cli_snapshot.py index 24f35bf7..a05ecc78 100644 --- a/archivebox/tests/test_cli_snapshot.py +++ b/archivebox/tests/test_cli_snapshot.py @@ -9,12 +9,10 @@ Tests cover: """ import json -import pytest from archivebox.tests.conftest import ( run_archivebox_cmd, parse_jsonl_output, - assert_jsonl_contains_type, create_test_url, ) diff --git a/archivebox/tests/test_cli_status.py b/archivebox/tests/test_cli_status.py index 97538f5f..b5eb8dc6 100644 --- a/archivebox/tests/test_cli_status.py +++ b/archivebox/tests/test_cli_status.py @@ -5,12 +5,10 @@ Verify status reports accurate collection state from DB and filesystem. """ import os -import subprocess import sqlite3 +import subprocess from pathlib import Path -from .fixtures import * - def _find_snapshot_dir(data_dir: Path, snapshot_id: str) -> Path | None: candidates = {snapshot_id} diff --git a/archivebox/tests/test_cli_update.py b/archivebox/tests/test_cli_update.py index 1dc71580..05819c57 100644 --- a/archivebox/tests/test_cli_update.py +++ b/archivebox/tests/test_cli_update.py @@ -5,10 +5,8 @@ Verify update drains old dirs, reconciles DB, and queues snapshots. """ import os -import subprocess import sqlite3 - -from .fixtures import * +import subprocess def test_update_runs_successfully_on_empty_archive(tmp_path, process): diff --git a/archivebox/tests/test_cli_version.py b/archivebox/tests/test_cli_version.py index 46382e27..eee2362e 100644 --- a/archivebox/tests/test_cli_version.py +++ b/archivebox/tests/test_cli_version.py @@ -11,7 +11,9 @@ import tempfile import subprocess from pathlib import Path -from .fixtures import * +from .fixtures import process + +FIXTURES = (process,) def _archivebox_cli() -> str: diff --git a/archivebox/tests/test_config.py b/archivebox/tests/test_config.py index b9c251c7..49e4da45 100644 --- a/archivebox/tests/test_config.py +++ b/archivebox/tests/test_config.py @@ -6,7 +6,6 @@ import subprocess import pytest -from .fixtures import process, disable_extractors_dict def test_config_shows_all_config_values(tmp_path, process): @@ -49,6 +48,7 @@ def test_config_set_value_writes_to_config_file(tmp_path, process): capture_output=True, text=True, ) + assert result.returncode == 0, result.stderr # Read the config file directly to verify it was written config_file = tmp_path / 'ArchiveBox.conf' diff --git a/archivebox/tests/test_crawl.py b/archivebox/tests/test_crawl.py index 1b1acd88..6065d675 100644 --- a/archivebox/tests/test_crawl.py +++ b/archivebox/tests/test_crawl.py @@ -4,11 +4,9 @@ import os import subprocess import sqlite3 -import json import pytest -from .fixtures import process, disable_extractors_dict def test_crawl_creates_crawl_object(tmp_path, process, disable_extractors_dict): diff --git a/archivebox/tests/test_extract.py b/archivebox/tests/test_extract.py index 117c922f..47df599e 100644 --- a/archivebox/tests/test_extract.py +++ b/archivebox/tests/test_extract.py @@ -8,7 +8,6 @@ import json import pytest -from .fixtures import process, disable_extractors_dict def test_extract_runs_on_snapshot_id(tmp_path, process, disable_extractors_dict): @@ -231,6 +230,7 @@ def test_extract_multiple_snapshots(tmp_path, process, disable_extractors_dict): text=True, env=disable_extractors_dict, ) + assert result.returncode == 0, result.stderr # Should not error conn = sqlite3.connect('index.sqlite3') diff --git a/archivebox/tests/test_extractors.py b/archivebox/tests/test_extractors.py index 3502c7f4..6e2eb521 100644 --- a/archivebox/tests/test_extractors.py +++ b/archivebox/tests/test_extractors.py @@ -1,8 +1,12 @@ -from .fixtures import * import json as pyjson import sqlite3 +import subprocess from pathlib import Path +from .fixtures import disable_extractors_dict, process + +FIXTURES = (disable_extractors_dict, process) + def _find_snapshot_dir(data_dir: Path, snapshot_id: str) -> Path | None: candidates = {snapshot_id} diff --git a/archivebox/tests/test_hooks.py b/archivebox/tests/test_hooks.py index 271ac6af..e303a515 100755 --- a/archivebox/tests/test_hooks.py +++ b/archivebox/tests/test_hooks.py @@ -16,7 +16,7 @@ import subprocess import tempfile import unittest from pathlib import Path -from unittest.mock import MagicMock, patch +from unittest.mock import patch # Set up Django before importing any Django-dependent modules os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings') diff --git a/archivebox/tests/test_init.py b/archivebox/tests/test_init.py index b9d7e130..3a3697bd 100644 --- a/archivebox/tests/test_init.py +++ b/archivebox/tests/test_init.py @@ -3,13 +3,13 @@ import os import subprocess -from pathlib import Path -import json, shutil import sqlite3 from archivebox.config.common import STORAGE_CONFIG -from .fixtures import * +from .fixtures import disable_extractors_dict, process + +FIXTURES = (disable_extractors_dict, process) DIR_PERMISSIONS = STORAGE_CONFIG.OUTPUT_PERMISSIONS.replace('6', '7').replace('4', '5') @@ -25,6 +25,7 @@ def test_add_link(tmp_path, process, disable_extractors_dict): os.chdir(tmp_path) add_process = subprocess.run(['archivebox', 'add', '--index-only', 'https://example.com'], capture_output=True, env=disable_extractors_dict) + assert add_process.returncode == 0, add_process.stderr.decode("utf-8") # In the new architecture, URLs are saved to source files # Check that a source file was created with the URL @@ -41,6 +42,7 @@ def test_add_multiple_urls(tmp_path, process, disable_extractors_dict): os.chdir(tmp_path) add_process = subprocess.run(['archivebox', 'add', '--index-only', 'https://example.com', 'https://iana.org'], capture_output=True, env=disable_extractors_dict) + assert add_process.returncode == 0, add_process.stderr.decode("utf-8") # Check that a source file was created with both URLs sources_dir = tmp_path / "sources" @@ -61,6 +63,7 @@ def test_correct_permissions_add_command_results(tmp_path, process, disable_extr os.chdir(tmp_path) add_process = subprocess.run(['archivebox', 'add', '--index-only', 'https://example.com'], capture_output=True, env=disable_extractors_dict) + assert add_process.returncode == 0, add_process.stderr.decode("utf-8") # Check database permissions assert oct((tmp_path / "index.sqlite3").stat().st_mode)[-3:] in (STORAGE_CONFIG.OUTPUT_PERMISSIONS, DIR_PERMISSIONS) diff --git a/archivebox/tests/test_install.py b/archivebox/tests/test_install.py index 3106ddb1..af967500 100644 --- a/archivebox/tests/test_install.py +++ b/archivebox/tests/test_install.py @@ -7,7 +7,6 @@ import sqlite3 import pytest -from .fixtures import process, disable_extractors_dict class TestInstallDryRun: diff --git a/archivebox/tests/test_list.py b/archivebox/tests/test_list.py index d527fa5d..2aaad4fa 100644 --- a/archivebox/tests/test_list.py +++ b/archivebox/tests/test_list.py @@ -1,7 +1,9 @@ import json import subprocess -from .fixtures import * +from .fixtures import disable_extractors_dict, process + +FIXTURES = (disable_extractors_dict, process) def test_search_json(process, disable_extractors_dict): subprocess.run(["archivebox", "add", "--index-only", "https://example.com", "--depth=0"], diff --git a/archivebox/tests/test_migrations_08_to_09.py b/archivebox/tests/test_migrations_08_to_09.py index c8de3fcf..21bdd134 100644 --- a/archivebox/tests/test_migrations_08_to_09.py +++ b/archivebox/tests/test_migrations_08_to_09.py @@ -10,10 +10,8 @@ Migration tests from 0.8.x to 0.9.x. - New fields like depth, retry_at, etc. """ -import json import shutil import sqlite3 -import subprocess import tempfile import unittest from pathlib import Path @@ -579,7 +577,7 @@ class TestFilesystemMigration08to09(unittest.TestCase): f"Files lost during migration: {files_before_count} -> {files_after_count}") # Run update to trigger filesystem reorganization - print(f"\n[*] Running archivebox update to reorganize filesystem...") + print("\n[*] Running archivebox update to reorganize filesystem...") result = run_archivebox(self.work_dir, ['update'], timeout=120) self.assertEqual(result.returncode, 0, f"Update failed: {result.stderr}") @@ -657,7 +655,7 @@ class TestFilesystemMigration08to09(unittest.TestCase): # CRITICAL: Verify sample files exist in new structure self.assertGreater(len(new_sample_files), 0, - f"Sample files not found in new structure") + "Sample files not found in new structure") # Verify new path format for path_key, file_path in new_sample_files.items(): diff --git a/archivebox/tests/test_recursive_crawl.py b/archivebox/tests/test_recursive_crawl.py index 1872a617..fc61d228 100644 --- a/archivebox/tests/test_recursive_crawl.py +++ b/archivebox/tests/test_recursive_crawl.py @@ -10,7 +10,6 @@ from pathlib import Path import pytest -from .fixtures import process, disable_extractors_dict, recursive_test_site def wait_for_db_condition(timeout, condition, interval=0.5): @@ -77,7 +76,6 @@ def test_background_hooks_dont_block_parser_extractors(tmp_path, process, recurs "SAVE_ARCHIVEDOTORG": "false", "SAVE_TITLE": "false", "SAVE_FAVICON": "true", - "SAVE_WGET": "false", }) proc = subprocess.Popen( diff --git a/archivebox/tests/test_remove.py b/archivebox/tests/test_remove.py index f9045bcc..078f4e06 100644 --- a/archivebox/tests/test_remove.py +++ b/archivebox/tests/test_remove.py @@ -1,7 +1,10 @@ import os import sqlite3 +import subprocess -from .fixtures import * +from .fixtures import disable_extractors_dict, process + +FIXTURES = (disable_extractors_dict, process) def test_remove_single_snapshot(tmp_path, process, disable_extractors_dict): """Test removing a snapshot by URL pattern""" diff --git a/archivebox/tests/test_schedule.py b/archivebox/tests/test_schedule.py index 9ec5166a..105308fe 100644 --- a/archivebox/tests/test_schedule.py +++ b/archivebox/tests/test_schedule.py @@ -7,7 +7,6 @@ import subprocess import pytest -from .fixtures import process def _fetchone(tmp_path, query): diff --git a/archivebox/tests/test_schedule_e2e.py b/archivebox/tests/test_schedule_e2e.py new file mode 100644 index 00000000..3cd22d94 --- /dev/null +++ b/archivebox/tests/test_schedule_e2e.py @@ -0,0 +1,420 @@ +#!/usr/bin/env python3 +"""End-to-end tests for scheduling across CLI, server, API, and web UI.""" + +import os +import socket +import sqlite3 +import subprocess +import sys +import textwrap +import time +from pathlib import Path + +import pytest +import requests + +from .conftest import run_python_cwd + + +REPO_ROOT = Path(__file__).resolve().parents[2] + + +def init_archive(cwd: Path) -> None: + result = subprocess.run( + [sys.executable, '-m', 'archivebox', 'init', '--quick'], + cwd=cwd, + capture_output=True, + text=True, + timeout=60, + ) + assert result.returncode == 0, result.stderr + + +def build_test_env(port: int, **extra: str) -> dict[str, str]: + env = os.environ.copy() + env.pop('DATA_DIR', None) + env.update({ + 'LISTEN_HOST': f'archivebox.localhost:{port}', + 'ALLOWED_HOSTS': '*', + 'CSRF_TRUSTED_ORIGINS': f'http://admin.archivebox.localhost:{port}', + 'PUBLIC_ADD_VIEW': 'True', + 'USE_COLOR': 'False', + 'SHOW_PROGRESS': 'False', + 'TIMEOUT': '20', + 'URL_ALLOWLIST': r'127\.0\.0\.1[:/].*', + 'SAVE_ARCHIVEDOTORG': 'False', + 'SAVE_TITLE': 'False', + 'SAVE_FAVICON': 'False', + 'SAVE_WARC': 'False', + 'SAVE_PDF': 'False', + 'SAVE_SCREENSHOT': 'False', + 'SAVE_DOM': 'False', + 'SAVE_SINGLEFILE': 'False', + 'SAVE_READABILITY': 'False', + 'SAVE_MERCURY': 'False', + 'SAVE_GIT': 'False', + 'SAVE_YTDLP': 'False', + 'SAVE_HEADERS': 'False', + 'SAVE_HTMLTOTEXT': 'False', + 'SAVE_WGET': 'True', + 'USE_CHROME': 'False', + }) + env.update(extra) + return env + + +def get_free_port() -> int: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: + sock.bind(('127.0.0.1', 0)) + return sock.getsockname()[1] + + +def start_server(cwd: Path, env: dict[str, str], port: int) -> None: + result = subprocess.run( + [sys.executable, '-m', 'archivebox', 'server', '--daemonize', f'127.0.0.1:{port}'], + cwd=cwd, + capture_output=True, + text=True, + env=env, + timeout=60, + ) + assert result.returncode == 0, result.stderr + + +def stop_server(cwd: Path) -> None: + script = textwrap.dedent( + """ + import os + os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings') + import django + django.setup() + from archivebox.workers.supervisord_util import stop_existing_supervisord_process + stop_existing_supervisord_process() + print('stopped') + """ + ) + run_python_cwd(script, cwd=cwd, timeout=30) + + +def wait_for_http(port: int, host: str, path: str = '/', timeout: int = 30) -> requests.Response: + deadline = time.time() + timeout + last_exc = None + while time.time() < deadline: + try: + response = requests.get( + f'http://127.0.0.1:{port}{path}', + headers={'Host': host}, + timeout=2, + allow_redirects=False, + ) + if response.status_code < 500: + return response + except requests.RequestException as exc: + last_exc = exc + time.sleep(0.5) + raise AssertionError(f'Timed out waiting for HTTP on {host}: {last_exc}') + + +def make_latest_schedule_due(cwd: Path) -> None: + conn = sqlite3.connect(cwd / 'index.sqlite3') + try: + conn.execute( + """ + UPDATE crawls_crawl + SET created_at = datetime('now', '-2 day'), + modified_at = datetime('now', '-2 day') + WHERE id = ( + SELECT template_id + FROM crawls_crawlschedule + ORDER BY created_at DESC + LIMIT 1 + ) + """ + ) + conn.commit() + finally: + conn.close() + + +def get_snapshot_file_text(cwd: Path, url: str) -> str: + script = textwrap.dedent( + f""" + import os + from pathlib import Path + + os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings') + import django + django.setup() + + from archivebox.core.models import Snapshot + + snapshot = Snapshot.objects.filter(url={url!r}).order_by('-created_at').first() + assert snapshot is not None, 'missing snapshot' + assert snapshot.status == 'sealed', snapshot.status + + snapshot_dir = Path(snapshot.output_dir) + candidates = [] + preferred_patterns = ( + 'wget/**/index.html', + 'wget/**/*.html', + 'trafilatura/content.html', + 'trafilatura/content.txt', + 'defuddle/content.html', + 'defuddle/content.txt', + ) + for pattern in preferred_patterns: + for candidate in snapshot_dir.glob(pattern): + if candidate.is_file(): + candidates.append(candidate) + + if not candidates: + for candidate in snapshot_dir.rglob('*'): + if not candidate.is_file(): + continue + rel = candidate.relative_to(snapshot_dir) + if rel.parts and rel.parts[0] == 'responses': + continue + if candidate.suffix not in ('.html', '.htm', '.txt'): + continue + if candidate.name in ('stdout.log', 'stderr.log', 'cmd.sh'): + continue + candidates.append(candidate) + + assert candidates, f'no captured html/txt files found in {{snapshot_dir}}' + print(candidates[0].read_text(errors='ignore')) + """ + ) + stdout, stderr, code = run_python_cwd(script, cwd=cwd, timeout=60) + assert code == 0, stderr + return stdout + + +def wait_for_snapshot_capture(cwd: Path, url: str, timeout: int = 180) -> str: + deadline = time.time() + timeout + last_error = None + while time.time() < deadline: + try: + return get_snapshot_file_text(cwd, url) + except AssertionError as err: + last_error = err + time.sleep(2) + raise AssertionError(f'timed out waiting for captured content for {url}: {last_error}') + + +def get_counts(cwd: Path, scheduled_url: str, one_shot_url: str) -> tuple[int, int, int]: + conn = sqlite3.connect(cwd / 'index.sqlite3') + try: + scheduled_snapshots = conn.execute( + "SELECT COUNT(*) FROM core_snapshot WHERE url = ?", + (scheduled_url,), + ).fetchone()[0] + one_shot_snapshots = conn.execute( + "SELECT COUNT(*) FROM core_snapshot WHERE url = ?", + (one_shot_url,), + ).fetchone()[0] + scheduled_crawls = conn.execute( + """ + SELECT COUNT(*) + FROM crawls_crawl + WHERE schedule_id IS NOT NULL + AND urls = ? + """, + (scheduled_url,), + ).fetchone()[0] + return scheduled_snapshots, one_shot_snapshots, scheduled_crawls + finally: + conn.close() + + +def create_admin_and_token(cwd: Path) -> str: + script = textwrap.dedent( + """ + import os + from datetime import timedelta + from django.utils import timezone + + os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings') + import django + django.setup() + + from django.contrib.auth import get_user_model + from archivebox.api.models import APIToken + + User = get_user_model() + user, _ = User.objects.get_or_create( + username='apitestadmin', + defaults={ + 'email': 'apitestadmin@example.com', + 'is_staff': True, + 'is_superuser': True, + }, + ) + user.is_staff = True + user.is_superuser = True + user.set_password('testpass123') + user.save() + + token = APIToken.objects.create( + created_by=user, + expires=timezone.now() + timedelta(days=1), + ) + print(token.token) + """ + ) + stdout, stderr, code = run_python_cwd(script, cwd=cwd, timeout=60) + assert code == 0, stderr + return stdout.strip().splitlines()[-1] + + +@pytest.mark.timeout(180) +def test_server_processes_due_cli_schedule_and_saves_real_content(tmp_path, recursive_test_site): + os.chdir(tmp_path) + init_archive(tmp_path) + + port = get_free_port() + env = build_test_env(port) + + schedule_result = subprocess.run( + [sys.executable, '-m', 'archivebox', 'schedule', '--every=daily', '--depth=0', recursive_test_site['root_url']], + cwd=tmp_path, + capture_output=True, + text=True, + env=env, + timeout=60, + ) + assert schedule_result.returncode == 0, schedule_result.stderr + assert 'Created scheduled crawl' in schedule_result.stdout + + make_latest_schedule_due(tmp_path) + + try: + start_server(tmp_path, env=env, port=port) + wait_for_http(port, host=f'web.archivebox.localhost:{port}') + captured_text = wait_for_snapshot_capture(tmp_path, recursive_test_site['root_url'], timeout=180) + assert 'Root' in captured_text + assert 'About' in captured_text + finally: + stop_server(tmp_path) + + +@pytest.mark.timeout(180) +def test_archivebox_add_remains_one_shot_even_when_schedule_is_due(tmp_path, recursive_test_site): + os.chdir(tmp_path) + init_archive(tmp_path) + + port = get_free_port() + env = build_test_env(port) + scheduled_url = recursive_test_site['root_url'] + one_shot_url = recursive_test_site['child_urls'][0] + + schedule_result = subprocess.run( + [sys.executable, '-m', 'archivebox', 'schedule', '--every=daily', '--depth=0', scheduled_url], + cwd=tmp_path, + capture_output=True, + text=True, + env=env, + timeout=60, + ) + assert schedule_result.returncode == 0, schedule_result.stderr + + make_latest_schedule_due(tmp_path) + + add_result = subprocess.run( + [sys.executable, '-m', 'archivebox', 'add', '--depth=0', '--plugins=wget', one_shot_url], + cwd=tmp_path, + capture_output=True, + text=True, + env=env, + timeout=120, + ) + assert add_result.returncode == 0, add_result.stderr + captured_text = wait_for_snapshot_capture(tmp_path, one_shot_url, timeout=120) + assert 'Deep About' in captured_text or 'About' in captured_text + + scheduled_snapshots, one_shot_snapshots, scheduled_crawls = get_counts(tmp_path, scheduled_url, one_shot_url) + assert one_shot_snapshots >= 1 + assert scheduled_snapshots == 0 + assert scheduled_crawls == 1 # template only, no materialized scheduled run + + +@pytest.mark.timeout(180) +def test_schedule_rest_api_works_over_running_server(tmp_path, recursive_test_site): + os.chdir(tmp_path) + init_archive(tmp_path) + + port = get_free_port() + env = build_test_env(port) + api_token = create_admin_and_token(tmp_path) + + try: + start_server(tmp_path, env=env, port=port) + wait_for_http(port, host=f'api.archivebox.localhost:{port}', path='/api/v1/docs') + + response = requests.post( + f'http://127.0.0.1:{port}/api/v1/cli/schedule', + headers={ + 'Host': f'api.archivebox.localhost:{port}', + 'X-ArchiveBox-API-Key': api_token, + }, + json={ + 'every': 'daily', + 'import_path': recursive_test_site['root_url'], + 'quiet': True, + }, + timeout=10, + ) + + assert response.status_code == 200, response.text + payload = response.json() + assert payload['success'] is True + assert payload['result_format'] == 'json' + assert len(payload['result']['created_schedule_ids']) == 1 + finally: + stop_server(tmp_path) + + +@pytest.mark.timeout(180) +def test_schedule_web_ui_post_works_over_running_server(tmp_path, recursive_test_site): + os.chdir(tmp_path) + init_archive(tmp_path) + + port = get_free_port() + env = build_test_env(port, PUBLIC_ADD_VIEW='True') + + try: + start_server(tmp_path, env=env, port=port) + wait_for_http(port, host=f'web.archivebox.localhost:{port}', path='/add/') + + response = requests.post( + f'http://127.0.0.1:{port}/add/', + headers={'Host': f'web.archivebox.localhost:{port}'}, + data={ + 'url': recursive_test_site['root_url'], + 'depth': '0', + 'schedule': 'daily', + 'tag': 'web-ui', + 'notes': 'created from web ui', + }, + timeout=10, + allow_redirects=False, + ) + + assert response.status_code in (302, 303), response.text + + conn = sqlite3.connect(tmp_path / 'index.sqlite3') + try: + row = conn.execute( + """ + SELECT cs.schedule, c.urls, c.tags_str + FROM crawls_crawlschedule cs + JOIN crawls_crawl c ON c.schedule_id = cs.id + ORDER BY cs.created_at DESC + LIMIT 1 + """ + ).fetchone() + finally: + conn.close() + + assert row == ('daily', recursive_test_site['root_url'], 'web-ui') + finally: + stop_server(tmp_path) diff --git a/archivebox/tests/test_search.py b/archivebox/tests/test_search.py index 31d944db..9b141be8 100644 --- a/archivebox/tests/test_search.py +++ b/archivebox/tests/test_search.py @@ -3,12 +3,9 @@ import os import subprocess -import sqlite3 -import json import pytest -from .fixtures import process, disable_extractors_dict def test_search_returns_snapshots(tmp_path, process, disable_extractors_dict): diff --git a/archivebox/tests/test_snapshot.py b/archivebox/tests/test_snapshot.py index 8d2fc3fc..46b4f09b 100644 --- a/archivebox/tests/test_snapshot.py +++ b/archivebox/tests/test_snapshot.py @@ -6,13 +6,11 @@ import subprocess import sqlite3 from archivebox.machine.models import Process from datetime import datetime -from pathlib import Path from urllib.parse import urlparse import uuid import pytest -from .fixtures import process, disable_extractors_dict def test_snapshot_creates_snapshot_with_correct_url(tmp_path, process, disable_extractors_dict): @@ -46,9 +44,7 @@ def test_snapshot_creates_snapshot_with_correct_url(tmp_path, process, disable_e snapshot_id_raw, snapshot_created_at, snapshot_url, crawl_id = snapshot_row snapshot_id = str(uuid.UUID(snapshot_id_raw)) - crawl_id, crawl_created_at, crawl_urls, crawl_created_by_id = crawl_row username = user_row[0] - crawl_date_str = datetime.fromisoformat(crawl_created_at).strftime('%Y%m%d') snapshot_date_str = datetime.fromisoformat(snapshot_created_at).strftime('%Y%m%d') domain = urlparse(snapshot_url).hostname or 'unknown' diff --git a/archivebox/tests/test_status.py b/archivebox/tests/test_status.py index 2599f053..9035374d 100644 --- a/archivebox/tests/test_status.py +++ b/archivebox/tests/test_status.py @@ -3,11 +3,9 @@ import os import subprocess -import sqlite3 import pytest -from .fixtures import process, disable_extractors_dict def test_status_shows_index_info(tmp_path, process): diff --git a/archivebox/tests/test_title.py b/archivebox/tests/test_title.py index d43ae954..883a4a8c 100644 --- a/archivebox/tests/test_title.py +++ b/archivebox/tests/test_title.py @@ -1,7 +1,10 @@ import os import sqlite3 +import subprocess -from .fixtures import * +from .fixtures import disable_extractors_dict, process + +FIXTURES = (disable_extractors_dict, process) def test_title_is_extracted(tmp_path, process, disable_extractors_dict): """Test that title is extracted from the page.""" diff --git a/archivebox/tests/test_update.py b/archivebox/tests/test_update.py index 6054f207..e866d811 100644 --- a/archivebox/tests/test_update.py +++ b/archivebox/tests/test_update.py @@ -1,7 +1,10 @@ import json import sqlite3 +import subprocess -from .fixtures import * +from .fixtures import disable_extractors_dict, process + +FIXTURES = (disable_extractors_dict, process) def test_update_imports_orphaned_snapshots(tmp_path, process, disable_extractors_dict): """Test that archivebox update imports real legacy archive directories.""" diff --git a/archivebox/tests/test_version.py b/archivebox/tests/test_version.py index 38fa2ba0..7ad7705d 100644 --- a/archivebox/tests/test_version.py +++ b/archivebox/tests/test_version.py @@ -3,11 +3,9 @@ import os import subprocess -import json import pytest -from .fixtures import process, disable_extractors_dict class TestVersionQuiet: diff --git a/archivebox/tests/test_worker_config_propagation.py b/archivebox/tests/test_worker_config_propagation.py index dbb1bfe3..32eb2759 100644 --- a/archivebox/tests/test_worker_config_propagation.py +++ b/archivebox/tests/test_worker_config_propagation.py @@ -18,11 +18,9 @@ Config priority order (highest to lowest): """ import os -import json import sys import tempfile import subprocess -import time from pathlib import Path @@ -45,7 +43,7 @@ def test_config_propagation_through_worker_hierarchy(): data_dir.mkdir() print(f"\n{'='*80}") - print(f"Test: Config Propagation Through Worker Hierarchy") + print("Test: Config Propagation Through Worker Hierarchy") print(f"DATA_DIR: {data_dir}") print(f"{'='*80}\n") @@ -63,7 +61,7 @@ def test_config_propagation_through_worker_hierarchy(): timeout=60, ) assert result.returncode == 0, f"Init failed: {result.stderr.decode()}" - print(f"✓ Archive initialized\n") + print("✓ Archive initialized\n") # Step 2: Write custom config to ArchiveBox.conf print("Step 2: Write custom config to ArchiveBox.conf") @@ -90,7 +88,7 @@ SAVE_TITLE = True SAVE_FAVICON = True SAVE_SCREENSHOT = True """) - print(f"✓ Wrote config file with TIMEOUT=999, all plugins enabled\n") + print("✓ Wrote config file with TIMEOUT=999, all plugins enabled\n") # Step 2.5: Set Machine.config values print("Step 2.5: Set Machine.config with custom binary path") @@ -123,7 +121,7 @@ print(f"Machine {{machine.hostname}} config updated") timeout=30, ) assert result.returncode == 0, f"Set machine config failed: {result.stderr.decode()}" - print(f"✓ Set Machine.config with CUSTOM_MACHINE_KEY=from_machine_config, WGET_BINARY=/custom/machine/wget\n") + print("✓ Set Machine.config with CUSTOM_MACHINE_KEY=from_machine_config, WGET_BINARY=/custom/machine/wget\n") # Step 3: Create Crawl via Django ORM with custom crawl.config print("Step 3: Create Crawl with custom crawl.config JSON") @@ -421,7 +419,7 @@ def test_config_environment_variable_parsing(): data_dir.mkdir() print(f"\n{'='*80}") - print(f"Test: Config Environment Variable Parsing") + print("Test: Config Environment Variable Parsing") print(f"DATA_DIR: {data_dir}") print(f"{'='*80}\n") @@ -557,7 +555,7 @@ def test_parent_environment_preserved_in_hooks(): data_dir.mkdir() print(f"\n{'='*80}") - print(f"Test: Parent Environment Preserved in Hooks") + print("Test: Parent Environment Preserved in Hooks") print(f"DATA_DIR: {data_dir}") print(f"{'='*80}\n") @@ -575,7 +573,7 @@ def test_parent_environment_preserved_in_hooks(): timeout=60, ) assert result.returncode == 0, f"Init failed: {result.stderr.decode()}" - print(f"✓ Archive initialized\n") + print("✓ Archive initialized\n") # Create snapshot print("Step 2: Create Snapshot") @@ -635,7 +633,6 @@ print(snapshot.id) timeout=120, ) - stdout = result.stdout.decode() stderr = result.stderr.decode() print("\n--- SnapshotWorker stderr (first 50 lines) ---") @@ -760,7 +757,7 @@ def test_config_auto_fetch_relationships(): data_dir.mkdir() print(f"\n{'='*80}") - print(f"Test: Config Auto-Fetch Relationships") + print("Test: Config Auto-Fetch Relationships") print(f"DATA_DIR: {data_dir}") print(f"{'='*80}\n") @@ -778,7 +775,7 @@ def test_config_auto_fetch_relationships(): timeout=60, ) assert result.returncode == 0, f"Init failed: {result.stderr.decode()}" - print(f"✓ Archive initialized\n") + print("✓ Archive initialized\n") # Create objects with config at each level print("Step 2: Create Crawl -> Snapshot -> ArchiveResult with config at each level") @@ -906,7 +903,7 @@ def test_config_precedence_with_environment_vars(): data_dir.mkdir() print(f"\n{'='*80}") - print(f"Test: Config Precedence with Environment Variables") + print("Test: Config Precedence with Environment Variables") print(f"DATA_DIR: {data_dir}") print(f"{'='*80}\n") @@ -1006,7 +1003,7 @@ def test_new_environment_variables_added(): data_dir.mkdir() print(f"\n{'='*80}") - print(f"Test: New Environment Variables Added to Config") + print("Test: New Environment Variables Added to Config") print(f"DATA_DIR: {data_dir}") print(f"{'='*80}\n") diff --git a/archivebox/workers/orchestrator.py b/archivebox/workers/orchestrator.py index d969acc9..9720cde4 100644 --- a/archivebox/workers/orchestrator.py +++ b/archivebox/workers/orchestrator.py @@ -94,10 +94,10 @@ class Orchestrator: self.POLL_INTERVAL = 0.25 # Exit quickly once idle in foreground mode self.IDLE_TIMEOUT = 1 - + def __repr__(self) -> str: return f'[underline]Orchestrator[/underline]\\[pid={self.pid}]' - + @classmethod def is_running(cls) -> bool: """Check if an orchestrator is already running.""" @@ -223,7 +223,7 @@ class Orchestrator: process_type=Process.TypeChoices.WORKER, status=Process.StatusChoices.RUNNING, ) - + def should_spawn_worker(self, WorkerClass: Type[Worker], queue_count: int) -> bool: """Determine if we should spawn a new worker.""" if queue_count == 0: @@ -253,7 +253,7 @@ class Orchestrator: return False return True - + def spawn_worker(self, WorkerClass: Type[Worker]) -> int | None: """Spawn a new worker process. Returns PID or None if spawn failed.""" try: @@ -286,7 +286,10 @@ class Orchestrator: print(f'[yellow]DEBUG spawn_worker: elapsed={elapsed:.1f}s pid={pid} orchestrator_id={self.db_process.id}[/yellow]') print(f'[yellow] Found {len(all_procs)} Process records for pid={pid}[/yellow]') for p in all_procs: - print(f'[yellow] -> type={p.process_type} status={p.status} parent_id={p.parent_id} match={p.parent_id == self.db_process.id}[/yellow]') + print( + f'[yellow] -> type={p.process_type} status={p.status} ' + f'parent_id={p.parent_id} match={p.parent_id == self.db_process.id}[/yellow]' + ) worker_process = Process.objects.filter( pid=pid, @@ -324,7 +327,7 @@ class Orchestrator: error=e, ) return None - + def check_queues_and_spawn_workers(self) -> dict[str, int]: """ Check Binary and Crawl queues and spawn workers as needed. @@ -584,11 +587,11 @@ class Orchestrator: def has_pending_work(self, queue_sizes: dict[str, int]) -> bool: """Check if any queue has pending work.""" return any(count > 0 for count in queue_sizes.values()) - + def has_running_workers(self) -> bool: """Check if any workers are still running.""" return self.get_total_worker_count() > 0 - + def has_future_work(self) -> bool: """Check if there's work scheduled for the future (retry_at > now) in Crawl queue.""" from archivebox.crawls.models import Crawl @@ -605,38 +608,38 @@ class Orchestrator: qs = qs.filter(id=self.crawl_id) return qs.count() > 0 - + def on_tick(self, queue_sizes: dict[str, int]) -> None: """Called each orchestrator tick. Override for custom behavior.""" # Tick logging suppressed to reduce noise pass - + def on_idle(self) -> None: """Called when orchestrator is idle (no work, no workers).""" # Idle logging suppressed to reduce noise pass - + def should_exit(self, queue_sizes: dict[str, int]) -> bool: """Determine if orchestrator should exit.""" if not self.exit_on_idle: return False - + if self.IDLE_TIMEOUT == 0: return False - + # Don't exit if there's pending or future work if self.has_pending_work(queue_sizes): return False - + if self.has_running_workers(): return False - + if self.has_future_work(): return False - + # Exit after idle timeout return self.idle_count >= self.IDLE_TIMEOUT - + def runloop(self) -> None: """Main orchestrator loop.""" from rich.live import Live @@ -702,7 +705,7 @@ class Orchestrator: os.close(devnull_fd) os.close(stdout_for_restore) os.close(stderr_for_restore) - except: + except OSError: pass # stdout_for_console is closed by orchestrator_console @@ -1132,7 +1135,6 @@ class Orchestrator: # Count hooks by status for debugging queued = snapshot.archiveresult_set.filter(status='queued').count() - started = snapshot.archiveresult_set.filter(status='started').count() # Find currently running hook (ordered by hook_name to get lowest step number) current_ar = snapshot.archiveresult_set.filter(status='started').order_by('hook_name').first() @@ -1211,7 +1213,7 @@ class Orchestrator: for snapshot_id in list(snapshot_progress.keys()): if snapshot_id not in active_ids: progress_layout.log_event( - f"Snapshot completed/removed", + "Snapshot completed/removed", style="blue" ) if snapshot_id in snapshot_progress: @@ -1263,7 +1265,7 @@ class Orchestrator: raise else: self.on_shutdown() - + def start(self) -> int: """ Fork orchestrator as a background process. @@ -1285,7 +1287,7 @@ class Orchestrator: pid=proc.pid, ) return proc.pid - + @classmethod def get_or_start(cls, exit_on_idle: bool = True) -> 'Orchestrator': """ @@ -1296,6 +1298,6 @@ class Orchestrator: print('[grey53]👨‍✈️ Orchestrator already running[/grey53]') # Return a placeholder - actual orchestrator is in another process return cls(exit_on_idle=exit_on_idle) - + orchestrator = cls(exit_on_idle=exit_on_idle) return orchestrator diff --git a/archivebox/workers/supervisord_util.py b/archivebox/workers/supervisord_util.py index f4d7aa02..b85865cc 100644 --- a/archivebox/workers/supervisord_util.py +++ b/archivebox/workers/supervisord_util.py @@ -2,7 +2,6 @@ __package__ = 'archivebox.workers' import sys import time -import signal import socket import psutil import shutil @@ -42,7 +41,7 @@ ORCHESTRATOR_WORKER = { SERVER_WORKER = lambda host, port: { "name": "worker_daphne", - "command": f"daphne --bind={host} --port={port} --application-close-timeout=600 archivebox.core.asgi:application", + "command": f"{sys.executable} -m daphne --bind={host} --port={port} --application-close-timeout=600 archivebox.core.asgi:application", "autostart": "false", "autorestart": "true", "stdout_logfile": "logs/worker_daphne.log", @@ -513,8 +512,6 @@ def watch_worker(supervisor, daemon_name, interval=5): def start_server_workers(host='0.0.0.0', port='8000', daemonize=False): - global _supervisord_proc - supervisor = get_or_create_supervisord_process(daemonize=daemonize) bg_workers = [ @@ -551,8 +548,6 @@ def start_server_workers(host='0.0.0.0', port='8000', daemonize=False): def start_cli_workers(watch=False): - global _supervisord_proc - supervisor = get_or_create_supervisord_process(daemonize=False) start_worker(supervisor, ORCHESTRATOR_WORKER) diff --git a/archivebox/workers/tests/test_orchestrator.py b/archivebox/workers/tests/test_orchestrator.py index 79d37f95..ac8e23a6 100644 --- a/archivebox/workers/tests/test_orchestrator.py +++ b/archivebox/workers/tests/test_orchestrator.py @@ -10,9 +10,7 @@ Tests cover: """ import os -import tempfile import time -from pathlib import Path from datetime import timedelta from unittest.mock import patch, MagicMock @@ -217,7 +215,6 @@ class TestOrchestratorWithProcess(TestCase): def test_orchestrator_scoped_worker_count(self): """Orchestrator with crawl_id should count only descendant workers.""" - import time from archivebox.machine.models import Process, Machine machine = Machine.current() diff --git a/archivebox/workers/worker.py b/archivebox/workers/worker.py index 37a920b7..a344f6a2 100644 --- a/archivebox/workers/worker.py +++ b/archivebox/workers/worker.py @@ -13,13 +13,10 @@ __package__ = 'archivebox.workers' import os import time -import traceback -from typing import ClassVar, Any -from datetime import timedelta +from typing import TYPE_CHECKING, Any, ClassVar from pathlib import Path from multiprocessing import cpu_count -from django.db.models import QuerySet from django.utils import timezone from django.conf import settings @@ -28,6 +25,9 @@ from rich import print from archivebox.misc.logging_util import log_worker_event +if TYPE_CHECKING: + from archivebox.machine.models import Process + CPU_COUNT = cpu_count() @@ -314,7 +314,10 @@ class Worker: process.kill(signal_num=signal.SIGKILL) log_worker_event( worker_type=worker_type, - event=f'⚠ Sent SIGKILL to {hook_name} + {len(children_pids) if children_pids else 0} children (exceeded timeout)', + event=( + f'⚠ Sent SIGKILL to {hook_name} + ' + f'{len(children_pids) if children_pids else 0} children (exceeded timeout)' + ), indent_level=indent_level, pid=self.pid, ) @@ -341,7 +344,6 @@ class Worker: from archivebox.machine.models import Process, Machine from archivebox.config.configset import get_config from pathlib import Path - from django.conf import settings import sys refresh_machine_config = bool( @@ -552,7 +554,7 @@ class CrawlWorker(Worker): # Check if crawl is done if self._is_crawl_finished(): - print(f'🔄 Crawl finished, sealing...', file=sys.stderr) + print('🔄 Crawl finished, sealing...', file=sys.stderr) self.crawl.sm.seal() break @@ -813,7 +815,8 @@ class SnapshotWorker(Worker): is_background = is_background_hook(hook_name) # Create ArchiveResult for THIS HOOK (not per plugin) - # One plugin can have multiple hooks (e.g., chrome/on_Snapshot__20_launch_chrome.js, chrome/on_Snapshot__21_navigate_chrome.js) + # One plugin can have multiple hooks + # (e.g., chrome/on_Snapshot__20_launch_chrome.js, chrome/on_Snapshot__21_navigate_chrome.js) # Unique key = (snapshot, plugin, hook_name) for idempotency ar, created = ArchiveResult.objects.get_or_create( snapshot=self.snapshot, @@ -868,7 +871,7 @@ class SnapshotWorker(Worker): self.snapshot.sm.seal() self.snapshot.refresh_from_db() - except Exception as e: + except Exception: # Mark snapshot as sealed even on error (still triggers cleanup) self._finalize_background_hooks() self.snapshot.sm.seal() @@ -1019,7 +1022,6 @@ class SnapshotWorker(Worker): self.background_processes = {} # Update background results now that hooks are done - from archivebox.core.models import ArchiveResult bg_results = self.snapshot.archiveresult_set.filter( hook_name__contains='.bg.', @@ -1034,7 +1036,6 @@ class SnapshotWorker(Worker): if not self.background_processes: return - from archivebox.core.models import ArchiveResult for hook_name, process in list(self.background_processes.items()): exit_code = process.poll() @@ -1165,7 +1166,6 @@ class BinaryWorker(Worker): def runloop(self) -> None: """Install binary(ies).""" - import sys self.on_startup() @@ -1216,7 +1216,7 @@ class BinaryWorker(Worker): except Exception as e: log_worker_event( worker_type='BinaryWorker', - event=f'Failed to install binary', + event='Failed to install binary', indent_level=1, pid=self.pid, error=e, diff --git a/bin/test.sh b/bin/test.sh index 7690d375..7567a56c 100755 --- a/bin/test.sh +++ b/bin/test.sh @@ -14,5 +14,5 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. && pwd )" source "$DIR/.venv/bin/activate" -pytest -s --basetemp=tests/data "$@" +pytest -s --basetemp=archivebox/tests/data "$@" exec ./bin/test_plugins.sh diff --git a/docs b/docs index a9e347fa..be25d9bf 160000 --- a/docs +++ b/docs @@ -1 +1 @@ -Subproject commit a9e347fac6fb37f7c5194379aca8aca44839f446 +Subproject commit be25d9bfa2d0f98b6b5b788c43d9629d1b31d217