mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
fix lint
This commit is contained in:
@@ -2,7 +2,6 @@ __package__ = 'archivebox.api'
|
||||
|
||||
import secrets
|
||||
from archivebox.uuid_compat import uuid7
|
||||
from datetime import timedelta
|
||||
|
||||
from django.conf import settings
|
||||
from django.db import models
|
||||
|
||||
@@ -1,16 +1,17 @@
|
||||
import os
|
||||
import django
|
||||
import importlib
|
||||
from io import StringIO
|
||||
from types import SimpleNamespace
|
||||
|
||||
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
|
||||
django.setup()
|
||||
from archivebox.config.django import setup_django
|
||||
|
||||
from django.contrib.auth.models import User
|
||||
from django.test import TestCase
|
||||
setup_django()
|
||||
|
||||
from archivebox.api.v1_cli import ScheduleCommandSchema, cli_schedule
|
||||
from archivebox.crawls.models import CrawlSchedule
|
||||
User = importlib.import_module('django.contrib.auth.models').User
|
||||
TestCase = importlib.import_module('django.test').TestCase
|
||||
api_v1_cli = importlib.import_module('archivebox.api.v1_cli')
|
||||
ScheduleCommandSchema = api_v1_cli.ScheduleCommandSchema
|
||||
cli_schedule = api_v1_cli.cli_schedule
|
||||
CrawlSchedule = importlib.import_module('archivebox.crawls.models').CrawlSchedule
|
||||
|
||||
|
||||
class CLIScheduleAPITests(TestCase):
|
||||
|
||||
@@ -3,10 +3,7 @@ __package__ = 'archivebox.api'
|
||||
from typing import Optional
|
||||
|
||||
from ninja import Router, Schema
|
||||
from django.utils import timezone
|
||||
from datetime import timedelta
|
||||
|
||||
from archivebox.api.models import APIToken
|
||||
from archivebox.api.auth import auth_using_token, auth_using_password, get_or_create_api_token
|
||||
|
||||
|
||||
|
||||
@@ -5,7 +5,6 @@ from typing import List, Optional
|
||||
from datetime import datetime
|
||||
from django.utils import timezone
|
||||
|
||||
from django.db.models import Q
|
||||
from django.contrib.auth import get_user_model
|
||||
|
||||
from ninja import Router, Schema
|
||||
|
||||
@@ -6,7 +6,7 @@ import json
|
||||
|
||||
from django import forms
|
||||
from django.contrib import admin
|
||||
from django.utils.html import format_html, mark_safe
|
||||
from django.utils.html import mark_safe
|
||||
from django_object_actions import DjangoObjectActions
|
||||
|
||||
|
||||
|
||||
@@ -2,12 +2,9 @@
|
||||
|
||||
__package__ = 'archivebox.base_models'
|
||||
|
||||
from uuid import UUID
|
||||
from archivebox.uuid_compat import uuid7
|
||||
from typing import ClassVar
|
||||
from pathlib import Path
|
||||
|
||||
from django.contrib import admin
|
||||
from django.db import models
|
||||
from django.db.models import F
|
||||
from django.utils import timezone
|
||||
@@ -17,8 +14,6 @@ from django.conf import settings
|
||||
|
||||
from django_stubs_ext.db.models import TypedModelMeta
|
||||
|
||||
from archivebox import DATA_DIR
|
||||
from archivebox.misc.hashing import get_dir_info
|
||||
|
||||
|
||||
def get_or_create_system_user_pk(username='system'):
|
||||
|
||||
@@ -57,6 +57,7 @@ def add(urls: str | list[str],
|
||||
from archivebox.core.models import Snapshot
|
||||
from archivebox.crawls.models import Crawl
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
from archivebox.personas.models import Persona
|
||||
from archivebox.workers.orchestrator import Orchestrator
|
||||
from archivebox.misc.logging_util import printable_filesize
|
||||
from archivebox.misc.system import get_dir_size
|
||||
@@ -79,11 +80,15 @@ def add(urls: str | list[str],
|
||||
|
||||
# Read URLs directly into crawl
|
||||
urls_content = sources_file.read_text()
|
||||
persona_name = (persona or 'Default').strip() or 'Default'
|
||||
persona_obj, _ = Persona.objects.get_or_create(name=persona_name)
|
||||
persona_obj.ensure_dirs()
|
||||
|
||||
crawl = Crawl.objects.create(
|
||||
urls=urls_content,
|
||||
max_depth=depth,
|
||||
tags_str=tag,
|
||||
persona_id=persona_obj.id,
|
||||
label=f'{USER}@{HOSTNAME} $ {cmd_str} [{timestamp}]',
|
||||
created_by_id=created_by_id,
|
||||
config={
|
||||
@@ -91,7 +96,7 @@ def add(urls: str | list[str],
|
||||
'INDEX_ONLY': index_only,
|
||||
'OVERWRITE': overwrite,
|
||||
'PLUGINS': plugins,
|
||||
'DEFAULT_PERSONA': persona or 'Default',
|
||||
'DEFAULT_PERSONA': persona_name,
|
||||
'PARSER': parser,
|
||||
}
|
||||
)
|
||||
@@ -135,8 +140,7 @@ def add(urls: str | list[str],
|
||||
print('[yellow]\\[*] URLs queued. Orchestrator will process them (run `archivebox server` if not already running).[/yellow]')
|
||||
else:
|
||||
# Foreground mode: run full orchestrator until all work is done
|
||||
print(f'[green]\\[*] Starting orchestrator to process crawl...[/green]')
|
||||
from archivebox.workers.orchestrator import Orchestrator
|
||||
print('[green]\\[*] Starting orchestrator to process crawl...[/green]')
|
||||
orchestrator = Orchestrator(exit_on_idle=True, crawl_id=str(crawl.id))
|
||||
orchestrator.runloop() # Block until complete
|
||||
|
||||
|
||||
@@ -94,7 +94,7 @@ def config(*keys,
|
||||
|
||||
# Display all plugin config in single [PLUGINS] section
|
||||
if plugin_keys:
|
||||
print(f'[grey53]\\[PLUGINS][/grey53]')
|
||||
print('[grey53]\\[PLUGINS][/grey53]')
|
||||
print(benedict(plugin_keys).to_toml(encoder=CustomTOMLEncoder()).strip().replace('\n\n', '\n'))
|
||||
print('[grey53]################################################################[/grey53]')
|
||||
|
||||
|
||||
@@ -31,7 +31,6 @@ __package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox extract'
|
||||
|
||||
import sys
|
||||
from typing import Optional, List
|
||||
|
||||
import rich_click as click
|
||||
|
||||
|
||||
@@ -3,8 +3,6 @@
|
||||
__package__ = 'archivebox.cli'
|
||||
|
||||
import os
|
||||
import sys
|
||||
import shutil
|
||||
|
||||
import rich_click as click
|
||||
from rich import print
|
||||
|
||||
@@ -410,7 +410,6 @@ def create_personas(
|
||||
"""
|
||||
from archivebox.misc.jsonl import write_record
|
||||
from archivebox.personas.models import Persona
|
||||
from archivebox.config.constants import CONSTANTS
|
||||
|
||||
is_tty = sys.stdout.isatty()
|
||||
name_list = list(names) if names else []
|
||||
@@ -493,10 +492,10 @@ def create_personas(
|
||||
'SingletonLock', 'SingletonSocket', 'SingletonCookie',
|
||||
),
|
||||
)
|
||||
rprint(f'[green]Copied browser profile to persona[/green]', file=sys.stderr)
|
||||
rprint('[green]Copied browser profile to persona[/green]', file=sys.stderr)
|
||||
|
||||
# Extract cookies via CDP
|
||||
rprint(f'[dim]Extracting cookies via CDP...[/dim]', file=sys.stderr)
|
||||
rprint('[dim]Extracting cookies via CDP...[/dim]', file=sys.stderr)
|
||||
|
||||
if extract_cookies_via_cdp(
|
||||
persona_chrome_dir,
|
||||
@@ -506,8 +505,8 @@ def create_personas(
|
||||
):
|
||||
rprint(f'[green]Extracted cookies to {cookies_file}[/green]', file=sys.stderr)
|
||||
else:
|
||||
rprint(f'[yellow]Could not extract cookies automatically.[/yellow]', file=sys.stderr)
|
||||
rprint(f'[dim]You can manually export cookies using a browser extension.[/dim]', file=sys.stderr)
|
||||
rprint('[yellow]Could not extract cookies automatically.[/yellow]', file=sys.stderr)
|
||||
rprint('[dim]You can manually export cookies using a browser extension.[/dim]', file=sys.stderr)
|
||||
|
||||
except Exception as e:
|
||||
rprint(f'[red]Failed to copy browser profile: {e}[/red]', file=sys.stderr)
|
||||
|
||||
@@ -3,7 +3,6 @@
|
||||
__package__ = 'archivebox.cli'
|
||||
|
||||
from typing import Optional
|
||||
from pathlib import Path
|
||||
|
||||
import rich_click as click
|
||||
|
||||
|
||||
@@ -4,7 +4,7 @@ __package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox search'
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Optional, List, Any
|
||||
from typing import Optional, List
|
||||
|
||||
import rich_click as click
|
||||
from rich import print
|
||||
@@ -71,7 +71,6 @@ def search(filter_patterns: list[str] | None=None,
|
||||
csv: str | None=None,
|
||||
with_headers: bool=False):
|
||||
"""List, filter, and export information about archive entries"""
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
if with_headers and not (json or html or csv):
|
||||
stderr('[X] --with-headers requires --json, --html or --csv\n', color='red')
|
||||
|
||||
@@ -99,7 +99,6 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
|
||||
get_existing_supervisord_process,
|
||||
get_worker,
|
||||
start_server_workers,
|
||||
tail_multiple_worker_logs,
|
||||
is_port_in_use,
|
||||
)
|
||||
from archivebox.workers.orchestrator import Orchestrator
|
||||
@@ -108,14 +107,14 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
|
||||
if is_port_in_use(host, int(port)):
|
||||
print(f'[red][X] Error: Port {port} is already in use[/red]')
|
||||
print(f' Another process (possibly daphne) is already listening on {host}:{port}')
|
||||
print(f' Stop the conflicting process or choose a different port')
|
||||
print(' Stop the conflicting process or choose a different port')
|
||||
sys.exit(1)
|
||||
|
||||
# Check if orchestrator is already running for this data directory
|
||||
if Orchestrator.is_running():
|
||||
print(f'[red][X] Error: ArchiveBox orchestrator is already running for this data directory[/red]')
|
||||
print(f' Stop the existing orchestrator before starting a new server')
|
||||
print(f' To stop: pkill -f "archivebox manage orchestrator"')
|
||||
print('[red][X] Error: ArchiveBox orchestrator is already running for this data directory[/red]')
|
||||
print(' Stop the existing orchestrator before starting a new server')
|
||||
print(' To stop: pkill -f "archivebox manage orchestrator"')
|
||||
sys.exit(1)
|
||||
|
||||
# Check if supervisord is already running
|
||||
@@ -129,7 +128,7 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
|
||||
print('[red][X] Error: ArchiveBox server is already running[/red]')
|
||||
print(f' [green]√[/green] Web server (worker_daphne) is RUNNING on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
|
||||
if orchestrator_proc and orchestrator_proc.get('statename') == 'RUNNING':
|
||||
print(f' [green]√[/green] Background worker (worker_orchestrator) is RUNNING')
|
||||
print(' [green]√[/green] Background worker (worker_orchestrator) is RUNNING')
|
||||
print()
|
||||
print('[yellow]To stop the existing server, run:[/yellow]')
|
||||
print(' pkill -f "archivebox server"')
|
||||
|
||||
@@ -128,13 +128,13 @@ def status(out_dir: Path=DATA_DIR) -> None:
|
||||
if not snapshot.downloaded_at:
|
||||
continue
|
||||
print(
|
||||
'[grey53] ' +
|
||||
(
|
||||
'[grey53] '
|
||||
f' > {str(snapshot.downloaded_at)[:16]} '
|
||||
f'[{snapshot.num_outputs} {("X", "√")[snapshot.is_archived]} {printable_filesize(snapshot.archive_size)}] '
|
||||
f'"{snapshot.title}": {snapshot.url}'
|
||||
)[:SHELL_CONFIG.TERM_WIDTH]
|
||||
+ '[grey53]',
|
||||
'[/grey53]'
|
||||
)[:SHELL_CONFIG.TERM_WIDTH],
|
||||
)
|
||||
print('[grey53] ...')
|
||||
|
||||
|
||||
@@ -36,8 +36,6 @@ def update(filter_patterns: Iterable[str] = (),
|
||||
from archivebox.config.django import setup_django
|
||||
setup_django()
|
||||
|
||||
from archivebox.core.models import Snapshot
|
||||
from django.utils import timezone
|
||||
from django.core.management import call_command
|
||||
|
||||
# Run migrations first to ensure DB schema is up-to-date
|
||||
|
||||
@@ -6,7 +6,7 @@ import sys
|
||||
import os
|
||||
import platform
|
||||
from pathlib import Path
|
||||
from typing import Iterable, Optional
|
||||
from typing import Iterable
|
||||
|
||||
import rich_click as click
|
||||
|
||||
|
||||
@@ -3,13 +3,13 @@
|
||||
__package__ = 'archivebox.cli'
|
||||
|
||||
|
||||
import importlib
|
||||
import os
|
||||
import sys
|
||||
import shutil
|
||||
import sys
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
|
||||
from contextlib import contextmanager
|
||||
from pathlib import Path
|
||||
|
||||
TEST_CONFIG = {
|
||||
'USE_COLOR': 'False',
|
||||
@@ -30,18 +30,15 @@ TEST_CONFIG = {
|
||||
DATA_DIR = 'data.tests'
|
||||
os.environ.update(TEST_CONFIG)
|
||||
|
||||
from ..main import init
|
||||
from archivebox.config.constants import (
|
||||
SQL_INDEX_FILENAME,
|
||||
JSON_INDEX_FILENAME,
|
||||
HTML_INDEX_FILENAME,
|
||||
)
|
||||
|
||||
from . import (
|
||||
archivebox_init,
|
||||
archivebox_add,
|
||||
archivebox_remove,
|
||||
)
|
||||
init = importlib.import_module('archivebox.main').init
|
||||
constants = importlib.import_module('archivebox.config.constants')
|
||||
SQL_INDEX_FILENAME = constants.SQL_INDEX_FILENAME
|
||||
JSON_INDEX_FILENAME = constants.JSON_INDEX_FILENAME
|
||||
HTML_INDEX_FILENAME = constants.HTML_INDEX_FILENAME
|
||||
archivebox_init = importlib.import_module('archivebox.cli.archivebox_init')
|
||||
archivebox_add = importlib.import_module('archivebox.cli.archivebox_add')
|
||||
archivebox_remove = importlib.import_module('archivebox.cli.archivebox_remove')
|
||||
parse_json_main_index = importlib.import_module('archivebox.misc.legacy').parse_json_main_index
|
||||
|
||||
HIDE_CLI_OUTPUT = True
|
||||
|
||||
@@ -68,6 +65,13 @@ stdout = sys.stdout
|
||||
stderr = sys.stderr
|
||||
|
||||
|
||||
def load_main_index(*, out_dir: str):
|
||||
index_path = Path(out_dir) / JSON_INDEX_FILENAME
|
||||
if not index_path.exists():
|
||||
raise FileNotFoundError(index_path)
|
||||
return list(parse_json_main_index(Path(out_dir)))
|
||||
|
||||
|
||||
@contextmanager
|
||||
def output_hidden(show_failing=True):
|
||||
if not HIDE_CLI_OUTPUT:
|
||||
|
||||
@@ -23,7 +23,6 @@ Each command should:
|
||||
__package__ = 'archivebox.cli'
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import shutil
|
||||
import tempfile
|
||||
@@ -101,7 +100,7 @@ class TestJSONLParsing(unittest.TestCase):
|
||||
|
||||
def test_parse_jsonl_with_id(self):
|
||||
"""JSONL with id field should be recognized."""
|
||||
from archivebox.misc.jsonl import parse_line, TYPE_SNAPSHOT
|
||||
from archivebox.misc.jsonl import parse_line
|
||||
|
||||
line = '{"type": "Snapshot", "id": "abc123", "url": "https://example.com"}'
|
||||
result = parse_line(line)
|
||||
@@ -576,8 +575,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
|
||||
"""
|
||||
from archivebox.core.models import Snapshot
|
||||
from archivebox.misc.jsonl import (
|
||||
read_args_or_stdin, write_record,
|
||||
TYPE_SNAPSHOT
|
||||
read_args_or_stdin, TYPE_SNAPSHOT
|
||||
)
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
|
||||
@@ -608,7 +606,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
|
||||
Test: archivebox snapshot URL | archivebox extract
|
||||
Extract should accept JSONL output from snapshot command.
|
||||
"""
|
||||
from archivebox.core.models import Snapshot, ArchiveResult
|
||||
from archivebox.core.models import Snapshot
|
||||
from archivebox.misc.jsonl import (
|
||||
read_args_or_stdin,
|
||||
TYPE_SNAPSHOT
|
||||
@@ -783,7 +781,6 @@ class TestParserPluginWorkflows(unittest.TestCase):
|
||||
Test: archivebox crawl --plugin=parse_html_urls URL | archivebox snapshot | archivebox extract
|
||||
"""
|
||||
from archivebox.hooks import collect_urls_from_plugins
|
||||
from archivebox.misc.jsonl import TYPE_SNAPSHOT
|
||||
|
||||
# Create mock output directory
|
||||
snapshot_dir = Path(self.test_dir) / 'archive' / 'html-parser-test'
|
||||
@@ -938,7 +935,6 @@ class TestPassThroughBehavior(unittest.TestCase):
|
||||
|
||||
def test_crawl_passes_through_other_types(self):
|
||||
"""crawl create should pass through records with other types."""
|
||||
from archivebox.misc.jsonl import TYPE_CRAWL
|
||||
|
||||
# Input: a Tag record (not a Crawl or URL)
|
||||
tag_record = {'type': 'Tag', 'id': 'test-tag', 'name': 'example'}
|
||||
@@ -946,8 +942,9 @@ class TestPassThroughBehavior(unittest.TestCase):
|
||||
|
||||
# Mock stdin with both records
|
||||
stdin = StringIO(
|
||||
json.dumps(tag_record) + '\n' +
|
||||
json.dumps(url_record)
|
||||
json.dumps(tag_record)
|
||||
+ '\n'
|
||||
+ json.dumps(url_record)
|
||||
)
|
||||
stdin.isatty = lambda: False
|
||||
|
||||
@@ -964,7 +961,7 @@ class TestPassThroughBehavior(unittest.TestCase):
|
||||
|
||||
def test_snapshot_passes_through_crawl(self):
|
||||
"""snapshot create should pass through Crawl records."""
|
||||
from archivebox.misc.jsonl import TYPE_CRAWL, TYPE_SNAPSHOT
|
||||
from archivebox.misc.jsonl import TYPE_CRAWL
|
||||
|
||||
crawl_record = {
|
||||
'type': TYPE_CRAWL,
|
||||
|
||||
@@ -8,10 +8,6 @@ and other modules that expect to import config values directly.
|
||||
__package__ = 'archivebox.config'
|
||||
__order__ = 200
|
||||
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
from .paths import (
|
||||
PACKAGE_DIR, # noqa
|
||||
DATA_DIR, # noqa
|
||||
@@ -31,6 +27,7 @@ def _get_config():
|
||||
from .common import ARCHIVING_CONFIG, STORAGE_CONFIG
|
||||
return ARCHIVING_CONFIG, STORAGE_CONFIG
|
||||
|
||||
|
||||
# Direct exports (evaluated at import time for backwards compat)
|
||||
# These are recalculated each time the module attribute is accessed
|
||||
|
||||
|
||||
@@ -9,7 +9,6 @@ from configparser import ConfigParser
|
||||
|
||||
from benedict import benedict
|
||||
|
||||
import archivebox
|
||||
|
||||
from archivebox.config.constants import CONSTANTS
|
||||
|
||||
|
||||
@@ -11,10 +11,10 @@ __package__ = "archivebox.config"
|
||||
import os
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Optional, List, Type, Tuple, TYPE_CHECKING, cast
|
||||
from typing import Any, Dict, Optional, Type, Tuple
|
||||
from configparser import ConfigParser
|
||||
|
||||
from pydantic import Field, ConfigDict
|
||||
from pydantic import ConfigDict
|
||||
from pydantic_settings import BaseSettings, PydanticBaseSettingsSource
|
||||
|
||||
|
||||
@@ -166,6 +166,23 @@ def get_config(
|
||||
|
||||
if user is None and crawl and hasattr(crawl, "created_by"):
|
||||
user = crawl.created_by
|
||||
|
||||
if persona is None and crawl is not None:
|
||||
try:
|
||||
from archivebox.personas.models import Persona
|
||||
|
||||
persona_id = getattr(crawl, "persona_id", None)
|
||||
if persona_id:
|
||||
persona = Persona.objects.filter(id=persona_id).first()
|
||||
|
||||
if persona is None:
|
||||
crawl_config = getattr(crawl, "config", None) or {}
|
||||
default_persona_name = crawl_config.get("DEFAULT_PERSONA")
|
||||
if default_persona_name:
|
||||
persona, _ = Persona.objects.get_or_create(name=str(default_persona_name).strip() or "Default")
|
||||
persona.ensure_dirs()
|
||||
except Exception:
|
||||
pass
|
||||
from archivebox.config.constants import CONSTANTS
|
||||
from archivebox.config.common import (
|
||||
SHELL_CONFIG,
|
||||
|
||||
@@ -100,9 +100,11 @@ def setup_django(check_db=False, in_memory_db=False) -> None:
|
||||
return
|
||||
|
||||
from django.conf import settings
|
||||
from archivebox.core.settings_logging import ERROR_LOG as DEFAULT_ERROR_LOG
|
||||
|
||||
# log startup message to the error log
|
||||
with open(settings.ERROR_LOG, "a", encoding='utf-8') as f:
|
||||
error_log = getattr(settings, 'ERROR_LOG', DEFAULT_ERROR_LOG)
|
||||
with open(error_log, "a", encoding='utf-8') as f:
|
||||
command = ' '.join(sys.argv)
|
||||
ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S')
|
||||
f.write(f"\n> {command}; TS={ts} VERSION={CONSTANTS.VERSION} IN_DOCKER={SHELL_CONFIG.IN_DOCKER} IS_TTY={SHELL_CONFIG.IS_TTY}\n")
|
||||
|
||||
@@ -46,7 +46,6 @@ if RUNNING_AS_UID == 0:
|
||||
# if we are running as root it's really hard to figure out what the correct archivebox user should be
|
||||
# as a last resort instead of setting DATA_DIR ownership to 0:0 (which breaks it for non-root users)
|
||||
# check if 911:911 archivebox user exists on host system, and use it instead of 0
|
||||
import pwd
|
||||
if pwd.getpwuid(DEFAULT_PUID).pw_name == 'archivebox':
|
||||
FALLBACK_UID = DEFAULT_PUID
|
||||
FALLBACK_GID = DEFAULT_PGID
|
||||
|
||||
@@ -3,7 +3,6 @@ __package__ = 'archivebox.config'
|
||||
import os
|
||||
import shutil
|
||||
import inspect
|
||||
from pathlib import Path
|
||||
from typing import Any, List, Dict, cast
|
||||
from benedict import benedict
|
||||
|
||||
@@ -30,11 +29,11 @@ KNOWN_BINARIES = [
|
||||
]
|
||||
|
||||
|
||||
def obj_to_yaml(obj: Any, indent: int=0) -> str:
|
||||
def obj_to_yaml(obj: Any, indent: int = 0) -> str:
|
||||
indent_str = " " * indent
|
||||
if indent == 0:
|
||||
indent_str = '\n' # put extra newline between top-level entries
|
||||
|
||||
|
||||
if isinstance(obj, dict):
|
||||
if not obj:
|
||||
return "{}"
|
||||
@@ -42,7 +41,7 @@ def obj_to_yaml(obj: Any, indent: int=0) -> str:
|
||||
for key, value in obj.items():
|
||||
result += f"{indent_str}{key}:{obj_to_yaml(value, indent + 1)}\n"
|
||||
return result
|
||||
|
||||
|
||||
elif isinstance(obj, list):
|
||||
if not obj:
|
||||
return "[]"
|
||||
@@ -50,16 +49,16 @@ def obj_to_yaml(obj: Any, indent: int=0) -> str:
|
||||
for item in obj:
|
||||
result += f"{indent_str}- {obj_to_yaml(item, indent + 1).lstrip()}\n"
|
||||
return result.rstrip()
|
||||
|
||||
|
||||
elif isinstance(obj, str):
|
||||
if "\n" in obj:
|
||||
return f" |\n{indent_str} " + obj.replace("\n", f"\n{indent_str} ")
|
||||
else:
|
||||
return f" {obj}"
|
||||
|
||||
|
||||
elif isinstance(obj, (int, float, bool)):
|
||||
return f" {str(obj)}"
|
||||
|
||||
|
||||
elif callable(obj):
|
||||
source = '\n'.join(
|
||||
'' if 'def ' in line else line
|
||||
@@ -67,7 +66,7 @@ def obj_to_yaml(obj: Any, indent: int=0) -> str:
|
||||
if line.strip()
|
||||
).split('lambda: ')[-1].rstrip(',')
|
||||
return f" {indent_str} " + source.replace("\n", f"\n{indent_str} ")
|
||||
|
||||
|
||||
else:
|
||||
return f" {str(obj)}"
|
||||
|
||||
@@ -75,7 +74,7 @@ def obj_to_yaml(obj: Any, indent: int=0) -> str:
|
||||
def get_detected_binaries() -> Dict[str, Dict[str, Any]]:
|
||||
"""Detect available binaries using shutil.which."""
|
||||
binaries = {}
|
||||
|
||||
|
||||
for name in KNOWN_BINARIES:
|
||||
path = shutil.which(name)
|
||||
if path:
|
||||
@@ -85,7 +84,7 @@ def get_detected_binaries() -> Dict[str, Dict[str, Any]]:
|
||||
'version': None, # Could add version detection later
|
||||
'is_available': True,
|
||||
}
|
||||
|
||||
|
||||
return binaries
|
||||
|
||||
|
||||
@@ -144,19 +143,19 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
|
||||
|
||||
# Get binaries from database (previously detected/installed)
|
||||
db_binaries = {b.name: b for b in Binary.objects.all()}
|
||||
|
||||
# Get currently detectable binaries
|
||||
|
||||
# Get currently detectable binaries
|
||||
detected = get_detected_binaries()
|
||||
|
||||
|
||||
# Merge and display
|
||||
all_binary_names = sorted(set(list(db_binaries.keys()) + list(detected.keys())))
|
||||
|
||||
|
||||
for name in all_binary_names:
|
||||
db_binary = db_binaries.get(name)
|
||||
detected_binary = detected.get(name)
|
||||
|
||||
|
||||
rows['Binary Name'].append(ItemLink(name, key=name))
|
||||
|
||||
|
||||
if db_binary:
|
||||
rows['Found Version'].append(f'✅ {db_binary.version}' if db_binary.version else '✅ found')
|
||||
rows['Provided By'].append(db_binary.binprovider or 'PATH')
|
||||
@@ -175,6 +174,7 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
|
||||
table=rows,
|
||||
)
|
||||
|
||||
|
||||
@render_with_item_view
|
||||
def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
|
||||
|
||||
@@ -203,7 +203,7 @@ def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
|
||||
)
|
||||
except Binary.DoesNotExist:
|
||||
pass
|
||||
|
||||
|
||||
# Try to detect from PATH
|
||||
path = shutil.which(key)
|
||||
if path:
|
||||
@@ -224,7 +224,7 @@ def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
|
||||
},
|
||||
],
|
||||
)
|
||||
|
||||
|
||||
return ItemContext(
|
||||
slug=key,
|
||||
title=key,
|
||||
@@ -286,6 +286,7 @@ def plugins_list_view(request: HttpRequest, **kwargs) -> TableContext:
|
||||
table=rows,
|
||||
)
|
||||
|
||||
|
||||
@render_with_item_view
|
||||
def plugin_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
|
||||
import json
|
||||
@@ -314,7 +315,10 @@ def plugin_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
|
||||
# Add config.json data if available
|
||||
if plugin.get('config'):
|
||||
config_json = json.dumps(plugin['config'], indent=2)
|
||||
fields["config.json"] = mark_safe(f'<pre style="max-height: 600px; overflow-y: auto; background: #f5f5f5; padding: 10px; border-radius: 4px;"><code>{config_json}</code></pre>')
|
||||
fields["config.json"] = mark_safe(
|
||||
'<pre style="max-height: 600px; overflow-y: auto; background: #f5f5f5; '
|
||||
f'padding: 10px; border-radius: 4px;"><code>{config_json}</code></pre>'
|
||||
)
|
||||
|
||||
# Also extract and display individual config properties for easier viewing
|
||||
if 'properties' in plugin['config']:
|
||||
@@ -322,7 +326,6 @@ def plugin_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
|
||||
properties_summary = []
|
||||
for prop_name, prop_info in config_properties.items():
|
||||
prop_type = prop_info.get('type', 'unknown')
|
||||
prop_default = prop_info.get('default', 'N/A')
|
||||
prop_desc = prop_info.get('description', '')
|
||||
properties_summary.append(f"• {prop_name} ({prop_type}): {prop_desc}")
|
||||
|
||||
@@ -365,7 +368,7 @@ def worker_list_view(request: HttpRequest, **kwargs) -> TableContext:
|
||||
title="No running worker processes",
|
||||
table=rows,
|
||||
)
|
||||
|
||||
|
||||
all_config_entries = cast(List[Dict[str, Any]], supervisor.getAllConfigInfo() or [])
|
||||
all_config = {config["name"]: benedict(config) for config in all_config_entries}
|
||||
|
||||
@@ -514,7 +517,7 @@ def log_list_view(request: HttpRequest, **kwargs) -> TableContext:
|
||||
@render_with_item_view
|
||||
def log_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
|
||||
assert request.user.is_superuser, "Must be a superuser to view configuration settings."
|
||||
|
||||
|
||||
log_file = [logfile for logfile in CONSTANTS.LOGS_DIR.glob('*.log') if key in logfile.name][0]
|
||||
|
||||
log_text = log_file.read_text()
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
__package__ = 'archivebox.core'
|
||||
|
||||
from django.contrib import admin
|
||||
from admin_data_views.admin import get_app_list, admin_data_index_view, get_admin_data_urls, get_urls
|
||||
|
||||
import archivebox
|
||||
|
||||
class ArchiveBoxAdmin(admin.AdminSite):
|
||||
site_header = 'ArchiveBox'
|
||||
@@ -20,7 +20,6 @@ archivebox_admin = ArchiveBoxAdmin()
|
||||
# patch admin with methods to add data views (implemented by admin_data_views package)
|
||||
# https://github.com/MrThearMan/django-admin-data-views
|
||||
# https://mrthearman.github.io/django-admin-data-views/setup/
|
||||
from admin_data_views.admin import get_app_list, admin_data_index_view, get_admin_data_urls, get_urls
|
||||
archivebox_admin.get_app_list = get_app_list.__get__(archivebox_admin, ArchiveBoxAdmin)
|
||||
archivebox_admin.admin_data_index_view = admin_data_index_view.__get__(archivebox_admin, ArchiveBoxAdmin) # type: ignore
|
||||
archivebox_admin.get_admin_data_urls = get_admin_data_urls.__get__(archivebox_admin, ArchiveBoxAdmin) # type: ignore
|
||||
|
||||
@@ -26,7 +26,7 @@ from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin
|
||||
from archivebox.workers.tasks import bg_archive_snapshots, bg_add
|
||||
|
||||
from archivebox.core.models import Tag, Snapshot, ArchiveResult
|
||||
from archivebox.core.admin_archiveresults import ArchiveResultInline, render_archiveresults_list
|
||||
from archivebox.core.admin_archiveresults import render_archiveresults_list
|
||||
from archivebox.core.widgets import TagEditorWidget, InlineTagEditorWidget
|
||||
|
||||
|
||||
@@ -712,8 +712,6 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
|
||||
description="🔁 Redo Failed"
|
||||
)
|
||||
def update_snapshots(self, request, queryset):
|
||||
count = queryset.count()
|
||||
|
||||
queued = bg_archive_snapshots(queryset, kwargs={"overwrite": False, "out_dir": DATA_DIR})
|
||||
|
||||
messages.success(
|
||||
@@ -741,8 +739,6 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
|
||||
description="🔄 Redo"
|
||||
)
|
||||
def overwrite_snapshots(self, request, queryset):
|
||||
count = queryset.count()
|
||||
|
||||
queued = bg_archive_snapshots(queryset, kwargs={"overwrite": True, "out_dir": DATA_DIR})
|
||||
|
||||
messages.success(
|
||||
|
||||
@@ -60,7 +60,7 @@ class CoreConfig(AppConfig):
|
||||
from archivebox.workers.orchestrator import Orchestrator
|
||||
|
||||
Process.cleanup_stale_running()
|
||||
machine = Machine.current()
|
||||
Machine.current()
|
||||
|
||||
if not Orchestrator.is_running():
|
||||
Orchestrator(exit_on_idle=False).start()
|
||||
|
||||
@@ -8,11 +8,10 @@ https://docs.djangoproject.com/en/stable/howto/deployment/asgi/
|
||||
"""
|
||||
|
||||
from archivebox.config.django import setup_django
|
||||
from django.core.asgi import get_asgi_application
|
||||
|
||||
setup_django(in_memory_db=False, check_db=True)
|
||||
|
||||
from django.core.asgi import get_asgi_application
|
||||
|
||||
# Standard Django ASGI application (no websockets/channels needed)
|
||||
application = get_asgi_application()
|
||||
|
||||
|
||||
@@ -6,6 +6,7 @@ from archivebox.misc.util import URL_REGEX
|
||||
from taggit.utils import edit_string_for_tags, parse_tags
|
||||
from archivebox.base_models.admin import KeyValueWidget
|
||||
from archivebox.crawls.schedule_utils import validate_schedule
|
||||
from archivebox.hooks import get_plugins
|
||||
|
||||
DEPTH_CHOICES = (
|
||||
('0', 'depth = 0 (archive just these URLs)'),
|
||||
@@ -15,7 +16,6 @@ DEPTH_CHOICES = (
|
||||
('4', 'depth = 4 (+ URLs four hops away)'),
|
||||
)
|
||||
|
||||
from archivebox.hooks import get_plugins
|
||||
|
||||
def get_plugin_choices():
|
||||
"""Get available extractor plugins from discovered hooks."""
|
||||
@@ -210,15 +210,18 @@ class AddLinkForm(forms.Form):
|
||||
|
||||
return schedule
|
||||
|
||||
|
||||
class TagWidgetMixin:
|
||||
def format_value(self, value):
|
||||
if value is not None and not isinstance(value, str):
|
||||
value = edit_string_for_tags(value)
|
||||
return super().format_value(value)
|
||||
|
||||
|
||||
class TagWidget(TagWidgetMixin, forms.TextInput):
|
||||
pass
|
||||
|
||||
|
||||
class TagField(forms.CharField):
|
||||
widget = TagWidget
|
||||
|
||||
|
||||
@@ -17,7 +17,6 @@ from archivebox.config import VERSION
|
||||
from archivebox.config.version import get_COMMIT_HASH
|
||||
from archivebox.core.host_utils import (
|
||||
build_admin_url,
|
||||
build_api_url,
|
||||
build_web_url,
|
||||
get_api_host,
|
||||
get_admin_host,
|
||||
|
||||
@@ -7,10 +7,8 @@ def forwards_func(apps, schema_editor):
|
||||
SnapshotModel = apps.get_model("core", "Snapshot")
|
||||
TagModel = apps.get_model("core", "Tag")
|
||||
|
||||
db_alias = schema_editor.connection.alias
|
||||
snapshots = SnapshotModel.objects.all()
|
||||
for snapshot in snapshots:
|
||||
tags = snapshot.tags
|
||||
tag_set = (
|
||||
set(tag.strip() for tag in (snapshot.tags_old or '').split(','))
|
||||
)
|
||||
@@ -23,9 +21,7 @@ def forwards_func(apps, schema_editor):
|
||||
|
||||
def reverse_func(apps, schema_editor):
|
||||
SnapshotModel = apps.get_model("core", "Snapshot")
|
||||
TagModel = apps.get_model("core", "Tag")
|
||||
|
||||
db_alias = schema_editor.connection.alias
|
||||
snapshots = SnapshotModel.objects.all()
|
||||
for snapshot in snapshots:
|
||||
tags = snapshot.tags.values_list("name", flat=True)
|
||||
|
||||
@@ -43,7 +43,7 @@ def forwards_func(apps, schema_editor):
|
||||
try:
|
||||
with open(out_dir / "index.json", "r") as f:
|
||||
fs_index = json.load(f)
|
||||
except Exception as e:
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
history = fs_index["history"]
|
||||
|
||||
@@ -234,7 +234,6 @@ def upgrade_core_tables(apps, schema_editor):
|
||||
tag_has_data = cursor.fetchone()[0] > 0
|
||||
|
||||
if tag_has_data:
|
||||
tag_cols = get_table_columns('core_tag')
|
||||
cursor.execute("PRAGMA table_info(core_tag)")
|
||||
tag_id_type = None
|
||||
for row in cursor.fetchall():
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
# Creates a default crawl for v0.7.2 migrated snapshots and makes crawl_id NOT NULL
|
||||
|
||||
from django.db import migrations, models
|
||||
import uuid
|
||||
|
||||
|
||||
def create_default_crawl_and_assign_snapshots(apps, schema_editor):
|
||||
|
||||
@@ -347,7 +347,7 @@ def copy_archiveresult_data_to_process(apps, schema_editor):
|
||||
migrated_count += 1
|
||||
|
||||
if i == 0:
|
||||
print(f'DEBUG 0027: Linked ArchiveResult to Process')
|
||||
print('DEBUG 0027: Linked ArchiveResult to Process')
|
||||
|
||||
except Exception as e:
|
||||
print(f'✗ Error migrating ArchiveResult {ar_id}: {e}')
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
__package__ = 'archivebox.core'
|
||||
|
||||
from typing import Optional, Dict, Iterable, Any, List, TYPE_CHECKING
|
||||
from typing import Optional, Dict, Iterable, Any, List
|
||||
from archivebox.uuid_compat import uuid7
|
||||
from datetime import datetime, timedelta
|
||||
from django_stubs_ext.db.models import TypedModelMeta
|
||||
@@ -12,19 +12,18 @@ from pathlib import Path
|
||||
from statemachine import State, registry
|
||||
|
||||
from django.db import models
|
||||
from django.db.models import QuerySet, Value, Case, When, IntegerField
|
||||
from django.db.models import QuerySet
|
||||
from django.utils.functional import cached_property
|
||||
from django.utils.text import slugify
|
||||
from django.utils import timezone
|
||||
from django.core.cache import cache
|
||||
from django.urls import reverse, reverse_lazy
|
||||
from django.urls import reverse_lazy
|
||||
from django.contrib import admin
|
||||
from django.conf import settings
|
||||
|
||||
from archivebox.config import CONSTANTS
|
||||
from archivebox.misc.system import get_dir_size, atomic_write
|
||||
from archivebox.misc.util import parse_date, base_url, domain as url_domain, to_json, ts_to_date_str, urlencode, htmlencode, urldecode
|
||||
from archivebox.misc.hashing import get_dir_info
|
||||
from archivebox.misc.util import parse_date, domain as url_domain, to_json, ts_to_date_str, urlencode, htmlencode, urldecode
|
||||
from archivebox.hooks import (
|
||||
get_plugins, get_plugin_name, get_plugin_icon,
|
||||
)
|
||||
@@ -186,7 +185,7 @@ class SnapshotQuerySet(models.QuerySet):
|
||||
for pattern in patterns:
|
||||
try:
|
||||
qsearch |= query_search_index(pattern)
|
||||
except:
|
||||
except BaseException:
|
||||
raise SystemExit(2)
|
||||
return self.all() & qsearch
|
||||
|
||||
@@ -344,8 +343,6 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
@property
|
||||
def process_set(self):
|
||||
"""Get all Process objects related to this snapshot's ArchiveResults."""
|
||||
import json
|
||||
import json
|
||||
from archivebox.machine.models import Process
|
||||
return Process.objects.filter(archiveresult__snapshot_id=self.id)
|
||||
|
||||
@@ -458,13 +455,13 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
|
||||
if not old_dir.exists() or old_dir == new_dir:
|
||||
# No migration needed
|
||||
print(f"[DEBUG _fs_migrate] Returning None (early return)")
|
||||
print("[DEBUG _fs_migrate] Returning None (early return)")
|
||||
return None
|
||||
|
||||
if new_dir.exists():
|
||||
# New directory already exists (files already copied), but we still need cleanup
|
||||
# Return cleanup info so old directory can be cleaned up
|
||||
print(f"[DEBUG _fs_migrate] Returning cleanup info (new_dir exists)")
|
||||
print("[DEBUG _fs_migrate] Returning cleanup info (new_dir exists)")
|
||||
return (old_dir, new_dir)
|
||||
|
||||
new_dir.mkdir(parents=True, exist_ok=True)
|
||||
@@ -499,7 +496,6 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
|
||||
# Schedule cleanup AFTER transaction commits successfully
|
||||
# This ensures DB changes are committed before we delete old files
|
||||
from django.db import transaction
|
||||
transaction.on_commit(lambda: self._cleanup_old_migration_dir(old_dir, new_dir))
|
||||
|
||||
# Return cleanup info for manual cleanup if needed (when called directly)
|
||||
@@ -594,8 +590,8 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
domain = self.extract_domain_from_url(self.url)
|
||||
|
||||
return (
|
||||
CONSTANTS.DATA_DIR / 'users' / username / 'snapshots' /
|
||||
date_str / domain / str(self.id)
|
||||
CONSTANTS.DATA_DIR / 'users' / username / 'snapshots'
|
||||
/ date_str / domain / str(self.id)
|
||||
)
|
||||
else:
|
||||
# Unknown version - use current
|
||||
@@ -670,7 +666,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
print(f"[DEBUG load_from_directory] Found via fuzzy match: {snapshot.timestamp}")
|
||||
return snapshot
|
||||
elif candidates.count() > 1:
|
||||
print(f"[DEBUG load_from_directory] Multiple fuzzy matches, using first")
|
||||
print("[DEBUG load_from_directory] Multiple fuzzy matches, using first")
|
||||
return candidates.first()
|
||||
print(f"[DEBUG load_from_directory] NOT FOUND (fuzzy): {url} @ {timestamp}")
|
||||
return None
|
||||
@@ -767,7 +763,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
ts_int = int(float(ts))
|
||||
# 1995-01-01 to 2035-12-31
|
||||
return 788918400 <= ts_int <= 2082758400
|
||||
except:
|
||||
except (TypeError, ValueError, OverflowError):
|
||||
return False
|
||||
|
||||
index_valid = is_valid_timestamp(index_timestamp) if index_timestamp else False
|
||||
@@ -850,7 +846,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
try:
|
||||
with open(json_path) as f:
|
||||
index_data = json.load(f)
|
||||
except:
|
||||
except (OSError, TypeError, ValueError, json.JSONDecodeError):
|
||||
pass
|
||||
|
||||
# Merge title
|
||||
@@ -929,7 +925,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
if result_data.get('start_ts'):
|
||||
try:
|
||||
start_ts = parser.parse(result_data['start_ts'])
|
||||
except:
|
||||
except (TypeError, ValueError, OverflowError):
|
||||
pass
|
||||
|
||||
if (plugin, start_ts) in existing:
|
||||
@@ -940,7 +936,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
if result_data.get('end_ts'):
|
||||
try:
|
||||
end_ts = parser.parse(result_data['end_ts'])
|
||||
except:
|
||||
except (TypeError, ValueError, OverflowError):
|
||||
pass
|
||||
|
||||
# Support both 'output' (legacy) and 'output_str' (new JSONL) field names
|
||||
@@ -957,7 +953,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
start_ts=start_ts,
|
||||
end_ts=end_ts,
|
||||
)
|
||||
except:
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
def write_index_json(self):
|
||||
@@ -1176,7 +1172,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
|
||||
try:
|
||||
shutil.move(str(snapshot_dir), str(dest))
|
||||
except:
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
@classmethod
|
||||
@@ -1208,7 +1204,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
try:
|
||||
cls._merge_snapshots(snapshots)
|
||||
merged += 1
|
||||
except:
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return merged
|
||||
@@ -1244,7 +1240,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
|
||||
try:
|
||||
shutil.rmtree(dup_dir)
|
||||
except:
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Merge tags
|
||||
@@ -1615,7 +1611,6 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
"""
|
||||
import re
|
||||
from django.utils import timezone
|
||||
from archivebox.misc.util import parse_date
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
from archivebox.config.common import GENERAL_CONFIG
|
||||
|
||||
@@ -2125,7 +2120,6 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
|
||||
def to_dict(self, extended: bool = False) -> Dict[str, Any]:
|
||||
"""Convert Snapshot to a dictionary (replacement for Link._asdict())"""
|
||||
from archivebox.misc.util import ts_to_date_str
|
||||
from archivebox.core.host_utils import build_snapshot_url
|
||||
|
||||
result = {
|
||||
@@ -2283,9 +2277,9 @@ class SnapshotMachine(BaseStateMachine):
|
||||
|
||||
# Tick Event (polled by workers)
|
||||
tick = (
|
||||
queued.to.itself(unless='can_start') |
|
||||
queued.to(started, cond='can_start') |
|
||||
started.to(sealed, cond='is_finished')
|
||||
queued.to.itself(unless='can_start')
|
||||
| queued.to(started, cond='can_start')
|
||||
| started.to(sealed, cond='is_finished')
|
||||
)
|
||||
|
||||
# Manual event (can also be triggered by last ArchiveResult finishing)
|
||||
@@ -2783,7 +2777,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
Updates status/output fields, queues discovered URLs, and triggers indexing.
|
||||
"""
|
||||
from django.utils import timezone
|
||||
from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR, run_hook, is_background_hook
|
||||
from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR, run_hook
|
||||
from archivebox.config.configset import get_config
|
||||
|
||||
# Get merged config with proper context
|
||||
@@ -3190,16 +3184,16 @@ class ArchiveResultMachine(BaseStateMachine):
|
||||
# queued → skipped (if exceeded max attempts)
|
||||
# started → backoff → started (retry)
|
||||
tick = (
|
||||
queued.to(skipped, cond='is_exceeded_max_attempts') | # Check skip first
|
||||
queued.to.itself(unless='can_start') |
|
||||
queued.to(started, cond='can_start') |
|
||||
started.to(succeeded, cond='is_succeeded') |
|
||||
started.to(failed, cond='is_failed') |
|
||||
started.to(skipped, cond='is_skipped') |
|
||||
started.to(backoff, cond='is_backoff') |
|
||||
backoff.to(skipped, cond='is_exceeded_max_attempts') | # Check skip from backoff too
|
||||
backoff.to.itself(unless='can_start') |
|
||||
backoff.to(started, cond='can_start')
|
||||
queued.to(skipped, cond='is_exceeded_max_attempts') # Check skip first
|
||||
| queued.to.itself(unless='can_start')
|
||||
| queued.to(started, cond='can_start')
|
||||
| started.to(succeeded, cond='is_succeeded')
|
||||
| started.to(failed, cond='is_failed')
|
||||
| started.to(skipped, cond='is_skipped')
|
||||
| started.to(backoff, cond='is_backoff')
|
||||
| backoff.to(skipped, cond='is_exceeded_max_attempts') # Check skip from backoff too
|
||||
| backoff.to.itself(unless='can_start')
|
||||
| backoff.to(started, cond='can_start')
|
||||
# Removed redundant transitions: backoff.to(succeeded/failed/skipped)
|
||||
# Reason: backoff should always retry→started, then started→final states
|
||||
)
|
||||
@@ -3241,8 +3235,8 @@ class ArchiveResultMachine(BaseStateMachine):
|
||||
"""Check if we should backoff and retry later."""
|
||||
# Backoff if status is still started (plugin didn't complete) and output_str is empty
|
||||
return (
|
||||
self.archiveresult.status == ArchiveResult.StatusChoices.STARTED and
|
||||
not self.archiveresult.output_str
|
||||
self.archiveresult.status == ArchiveResult.StatusChoices.STARTED
|
||||
and not self.archiveresult.output_str
|
||||
)
|
||||
|
||||
def is_finished(self) -> bool:
|
||||
@@ -3286,7 +3280,6 @@ class ArchiveResultMachine(BaseStateMachine):
|
||||
|
||||
@started.enter
|
||||
def enter_started(self):
|
||||
from archivebox.machine.models import NetworkInterface
|
||||
|
||||
# Update Process with network interface
|
||||
if self.archiveresult.process_id:
|
||||
|
||||
@@ -6,6 +6,7 @@ import inspect
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from django.conf.locale.en import formats as en_formats # type: ignore
|
||||
from django.utils.crypto import get_random_string
|
||||
|
||||
import archivebox
|
||||
@@ -13,6 +14,7 @@ import archivebox
|
||||
from archivebox.config import DATA_DIR, PACKAGE_DIR, ARCHIVE_DIR, CONSTANTS # noqa
|
||||
from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG, STORAGE_CONFIG # noqa
|
||||
from archivebox.core.host_utils import normalize_base_url, get_admin_base_url, get_api_base_url
|
||||
from .settings_logging import SETTINGS_LOGGING
|
||||
|
||||
|
||||
IS_MIGRATING = "makemigrations" in sys.argv[:3] or "migrate" in sys.argv[:3]
|
||||
@@ -54,8 +56,8 @@ INSTALLED_APPS = [
|
||||
"django.contrib.staticfiles",
|
||||
"django.contrib.admin",
|
||||
# 3rd-party apps from PyPI
|
||||
"signal_webhooks", # handles REST API outbound webhooks https://github.com/MrThearMan/django-signal-webhooks
|
||||
"django_object_actions", # provides easy Django Admin action buttons on change views https://github.com/crccheck/django-object-actions
|
||||
"signal_webhooks", # handles REST API outbound webhooks
|
||||
"django_object_actions", # provides easy Django Admin action buttons on change views
|
||||
# Our ArchiveBox-provided apps (use fully qualified names)
|
||||
# NOTE: Order matters! Apps with migrations that depend on other apps must come AFTER their dependencies
|
||||
# "archivebox.config", # ArchiveBox config settings (no models, not a real Django app)
|
||||
@@ -117,7 +119,6 @@ try:
|
||||
|
||||
try:
|
||||
# Try to import django-auth-ldap (will fail if not installed)
|
||||
import django_auth_ldap
|
||||
from django_auth_ldap.config import LDAPSearch
|
||||
import ldap
|
||||
|
||||
@@ -414,9 +415,6 @@ DATETIME_FORMAT = "Y-m-d h:i:s A"
|
||||
SHORT_DATETIME_FORMAT = "Y-m-d h:i:s A"
|
||||
TIME_ZONE = CONSTANTS.TIMEZONE # django convention is TIME_ZONE, archivebox config uses TIMEZONE, they are equivalent
|
||||
|
||||
|
||||
from django.conf.locale.en import formats as en_formats # type: ignore
|
||||
|
||||
en_formats.DATETIME_FORMAT = DATETIME_FORMAT # monkey patch en_format default with our preferred format
|
||||
en_formats.SHORT_DATETIME_FORMAT = SHORT_DATETIME_FORMAT
|
||||
|
||||
@@ -425,9 +423,6 @@ en_formats.SHORT_DATETIME_FORMAT = SHORT_DATETIME_FORMAT
|
||||
### Logging Settings
|
||||
################################################################################
|
||||
|
||||
|
||||
from .settings_logging import SETTINGS_LOGGING, LOGS_DIR, ERROR_LOG
|
||||
|
||||
LOGGING = SETTINGS_LOGGING
|
||||
|
||||
|
||||
|
||||
@@ -5,8 +5,6 @@ import os
|
||||
import tempfile
|
||||
import logging
|
||||
|
||||
import pydantic
|
||||
import django.template
|
||||
|
||||
from archivebox.config import CONSTANTS
|
||||
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
"""Tests for the core views, especially AddView."""
|
||||
|
||||
import importlib
|
||||
import os
|
||||
import django
|
||||
from unittest.mock import patch
|
||||
@@ -8,13 +9,14 @@ from unittest.mock import patch
|
||||
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
|
||||
django.setup()
|
||||
|
||||
from django.test import TestCase, Client
|
||||
from django.contrib.auth.models import User
|
||||
from django.urls import reverse
|
||||
|
||||
from archivebox.crawls.models import Crawl, CrawlSchedule
|
||||
from archivebox.core.models import Tag
|
||||
from archivebox.config.common import SERVER_CONFIG
|
||||
TestCase = importlib.import_module('django.test').TestCase
|
||||
Client = importlib.import_module('django.test').Client
|
||||
User = importlib.import_module('django.contrib.auth.models').User
|
||||
reverse = importlib.import_module('django.urls').reverse
|
||||
Crawl = importlib.import_module('archivebox.crawls.models').Crawl
|
||||
CrawlSchedule = importlib.import_module('archivebox.crawls.models').CrawlSchedule
|
||||
Tag = importlib.import_module('archivebox.core.models').Tag
|
||||
SERVER_CONFIG = importlib.import_module('archivebox.config.common').SERVER_CONFIG
|
||||
|
||||
|
||||
class AddViewTests(TestCase):
|
||||
@@ -252,7 +254,7 @@ class AddViewTests(TestCase):
|
||||
def test_add_staff_admin_custom_config_is_allowed(self):
|
||||
"""Admin users can override crawl config."""
|
||||
self.client.logout()
|
||||
admin_user = User.objects.create_user(
|
||||
User.objects.create_user(
|
||||
username='adminuser',
|
||||
password='adminpass123',
|
||||
email='admin@example.com',
|
||||
|
||||
@@ -10,7 +10,7 @@ from pathlib import Path
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from django.shortcuts import render, redirect
|
||||
from django.http import HttpRequest, HttpResponse, Http404, HttpResponseForbidden
|
||||
from django.http import JsonResponse, HttpRequest, HttpResponse, Http404, HttpResponseForbidden
|
||||
from django.utils.html import format_html, mark_safe
|
||||
from django.views import View
|
||||
from django.views.generic.list import ListView
|
||||
@@ -24,9 +24,8 @@ from django.utils.decorators import method_decorator
|
||||
from admin_data_views.typing import TableContext, ItemContext
|
||||
from admin_data_views.utils import render_with_table_view, render_with_item_view, ItemLink
|
||||
|
||||
import archivebox
|
||||
from archivebox.config import CONSTANTS, CONSTANTS_CONFIG, DATA_DIR, VERSION
|
||||
from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG, ARCHIVING_CONFIG
|
||||
from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG
|
||||
from archivebox.config.configset import get_flat_config, get_config, get_all_configs
|
||||
from archivebox.misc.util import base_url, htmlencode, ts_to_date_str, urldecode
|
||||
from archivebox.misc.serve_static import serve_static_with_byterange_support
|
||||
@@ -35,6 +34,9 @@ from archivebox.search import query_search_index
|
||||
|
||||
from archivebox.core.models import Snapshot
|
||||
from archivebox.core.host_utils import build_snapshot_url
|
||||
from archivebox.core.forms import AddLinkForm
|
||||
from archivebox.crawls.models import Crawl
|
||||
from archivebox.hooks import get_enabled_plugins, get_plugin_name
|
||||
|
||||
|
||||
def _files_index_target(snapshot: Snapshot, archivefile: str | None) -> str:
|
||||
@@ -49,12 +51,6 @@ def _files_index_target(snapshot: Snapshot, archivefile: str | None) -> str:
|
||||
return target
|
||||
|
||||
|
||||
from archivebox.core.forms import AddLinkForm
|
||||
from archivebox.crawls.models import Crawl
|
||||
from archivebox.hooks import get_enabled_plugins, get_plugin_name
|
||||
|
||||
|
||||
|
||||
class HomepageView(View):
|
||||
def get(self, request):
|
||||
if request.user.is_authenticated:
|
||||
@@ -1066,10 +1062,6 @@ class HealthCheckView(View):
|
||||
status=200
|
||||
)
|
||||
|
||||
|
||||
import json
|
||||
from django.http import JsonResponse
|
||||
|
||||
def live_progress_view(request):
|
||||
"""Simple JSON endpoint for live progress status - used by admin progress monitor."""
|
||||
try:
|
||||
@@ -1077,7 +1069,6 @@ def live_progress_view(request):
|
||||
from archivebox.crawls.models import Crawl
|
||||
from archivebox.core.models import Snapshot, ArchiveResult
|
||||
from archivebox.machine.models import Process, Machine
|
||||
from django.db.models import Case, When, Value, IntegerField
|
||||
|
||||
# Get orchestrator status
|
||||
orchestrator_running = Orchestrator.is_running()
|
||||
@@ -1133,7 +1124,6 @@ def live_progress_view(request):
|
||||
})
|
||||
|
||||
# Build hierarchical active crawls with nested snapshots and archive results
|
||||
from django.db.models import Prefetch
|
||||
|
||||
running_workers = Process.objects.filter(
|
||||
machine=machine,
|
||||
@@ -1387,7 +1377,7 @@ def find_config_default(key: str) -> str:
|
||||
return default_val
|
||||
|
||||
def find_config_type(key: str) -> str:
|
||||
from typing import get_type_hints, ClassVar
|
||||
from typing import ClassVar
|
||||
CONFIGS = get_all_configs()
|
||||
|
||||
for config in CONFIGS.values():
|
||||
@@ -1430,7 +1420,6 @@ def key_is_safe(key: str) -> bool:
|
||||
|
||||
def find_config_source(key: str, merged_config: dict) -> str:
|
||||
"""Determine where a config value comes from."""
|
||||
import os
|
||||
from archivebox.machine.models import Machine
|
||||
|
||||
# Check if it's from archivebox.machine.config
|
||||
@@ -1464,12 +1453,11 @@ def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
|
||||
# Get merged config that includes Machine.config overrides
|
||||
try:
|
||||
from archivebox.machine.models import Machine
|
||||
machine = Machine.current()
|
||||
Machine.current()
|
||||
merged_config = get_config()
|
||||
except Exception as e:
|
||||
except Exception:
|
||||
# Fallback if Machine model not available
|
||||
merged_config = get_config()
|
||||
machine = None
|
||||
|
||||
rows = {
|
||||
"Section": [],
|
||||
@@ -1525,7 +1513,6 @@ def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
|
||||
|
||||
@render_with_item_view
|
||||
def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
|
||||
import os
|
||||
from archivebox.machine.models import Machine
|
||||
from archivebox.config.configset import BaseConfigSet
|
||||
|
||||
|
||||
@@ -343,20 +343,17 @@ class InlineTagEditorWidget(TagEditorWidget):
|
||||
snapshot_id = snapshot_id or self.snapshot_id
|
||||
|
||||
# Parse value to get list of tag dicts with id and name
|
||||
tags = []
|
||||
tag_data = []
|
||||
if value:
|
||||
if hasattr(value, 'all'): # QuerySet
|
||||
for tag in value.all():
|
||||
tag_data.append({'id': tag.pk, 'name': tag.name})
|
||||
tag_data.sort(key=lambda x: x['name'].lower())
|
||||
tags = [t['name'] for t in tag_data]
|
||||
elif isinstance(value, (list, tuple)):
|
||||
if value and hasattr(value[0], 'name'):
|
||||
for tag in value:
|
||||
tag_data.append({'id': tag.pk, 'name': tag.name})
|
||||
tag_data.sort(key=lambda x: x['name'].lower())
|
||||
tags = [t['name'] for t in tag_data]
|
||||
|
||||
widget_id_raw = f"inline_tags_{snapshot_id}" if snapshot_id else (attrs.get('id', name) if attrs else name)
|
||||
widget_id = self._normalize_id(widget_id_raw)
|
||||
|
||||
@@ -9,9 +9,8 @@ https://docs.djangoproject.com/en/2.1/howto/deployment/wsgi/
|
||||
|
||||
import archivebox # noqa
|
||||
from archivebox.config.django import setup_django
|
||||
from django.core.wsgi import get_wsgi_application
|
||||
|
||||
setup_django(in_memory_db=False, check_db=True)
|
||||
|
||||
from django.core.wsgi import get_wsgi_application
|
||||
|
||||
application = get_wsgi_application()
|
||||
|
||||
@@ -1,17 +1,11 @@
|
||||
__package__ = 'archivebox.crawls'
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
from django import forms
|
||||
from django.utils.html import format_html, format_html_join, mark_safe
|
||||
from django.contrib import admin, messages
|
||||
from django.urls import path
|
||||
from django.http import JsonResponse
|
||||
from django.views.decorators.http import require_POST
|
||||
from django.db.models import Count, Q
|
||||
|
||||
from archivebox import DATA_DIR
|
||||
|
||||
from django_object_actions import action
|
||||
|
||||
|
||||
@@ -1,12 +1,11 @@
|
||||
__package__ = 'archivebox.crawls'
|
||||
|
||||
from typing import TYPE_CHECKING, Iterable
|
||||
from typing import TYPE_CHECKING
|
||||
from datetime import timedelta
|
||||
from archivebox.uuid_compat import uuid7
|
||||
from pathlib import Path
|
||||
|
||||
from django.db import models
|
||||
from django.db.models import QuerySet
|
||||
from django.core.validators import MaxValueValidator, MinValueValidator
|
||||
from django.conf import settings
|
||||
from django.urls import reverse_lazy
|
||||
@@ -15,13 +14,12 @@ from django_stubs_ext.db.models import TypedModelMeta
|
||||
from statemachine import State, registry
|
||||
from rich import print
|
||||
|
||||
from archivebox.config import CONSTANTS
|
||||
from archivebox.base_models.models import ModelWithUUID, ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, get_or_create_system_user_pk
|
||||
from archivebox.workers.models import ModelWithStateMachine, BaseStateMachine
|
||||
from archivebox.crawls.schedule_utils import next_run_for_schedule, validate_schedule
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from archivebox.core.models import Snapshot, ArchiveResult
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
|
||||
class CrawlSchedule(ModelWithUUID, ModelWithNotes):
|
||||
@@ -111,7 +109,6 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
label = models.CharField(max_length=64, blank=True, null=False, default='')
|
||||
notes = models.TextField(blank=True, null=False, default='')
|
||||
schedule = models.ForeignKey(CrawlSchedule, on_delete=models.SET_NULL, null=True, blank=True, editable=True)
|
||||
output_dir = models.CharField(max_length=512, null=False, blank=True, default='')
|
||||
|
||||
status = ModelWithStateMachine.StatusField(choices=ModelWithStateMachine.StatusChoices, default=ModelWithStateMachine.StatusChoices.QUEUED)
|
||||
retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
|
||||
@@ -252,6 +249,22 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
return system_url
|
||||
return None
|
||||
|
||||
def resolve_persona(self):
|
||||
from archivebox.personas.models import Persona
|
||||
|
||||
if self.persona_id:
|
||||
persona = Persona.objects.filter(id=self.persona_id).first()
|
||||
if persona is None:
|
||||
raise Persona.DoesNotExist(f'Crawl {self.id} references missing Persona {self.persona_id}')
|
||||
return persona
|
||||
|
||||
default_persona_name = str((self.config or {}).get('DEFAULT_PERSONA') or '').strip()
|
||||
if default_persona_name:
|
||||
persona, _ = Persona.objects.get_or_create(name=default_persona_name or 'Default')
|
||||
return persona
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def add_url(self, entry: dict) -> bool:
|
||||
"""
|
||||
@@ -391,7 +404,10 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
f.flush()
|
||||
|
||||
def get_runtime_config():
|
||||
return get_config(crawl=self)
|
||||
config = get_config(crawl=self)
|
||||
if persona_runtime_overrides:
|
||||
config.update(persona_runtime_overrides)
|
||||
return config
|
||||
|
||||
system_task = self.get_system_task()
|
||||
if system_task == 'archivebox://update':
|
||||
@@ -402,6 +418,15 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
|
||||
machine = Machine.current()
|
||||
declared_binary_names: set[str] = set()
|
||||
persona_runtime_overrides: dict[str, str] = {}
|
||||
persona = self.resolve_persona()
|
||||
if persona:
|
||||
base_runtime_config = get_config(crawl=self, persona=persona)
|
||||
chrome_binary = str(base_runtime_config.get('CHROME_BINARY') or '')
|
||||
persona_runtime_overrides = persona.prepare_runtime_for_crawl(
|
||||
crawl=self,
|
||||
chrome_binary=chrome_binary,
|
||||
)
|
||||
|
||||
def install_declared_binaries(binary_names: set[str]) -> None:
|
||||
if not binary_names:
|
||||
@@ -563,7 +588,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
|
||||
# Discover and run on_Crawl hooks
|
||||
with open(debug_log, 'a') as f:
|
||||
f.write(f'Discovering Crawl hooks...\n')
|
||||
f.write('Discovering Crawl hooks...\n')
|
||||
f.flush()
|
||||
hooks = discover_hooks('Crawl', config=get_runtime_config())
|
||||
with open(debug_log, 'a') as f:
|
||||
@@ -588,17 +613,17 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
print(f'[yellow]⚠️ Removed {leaked_count} leaked snapshot(s) created during system crawl {system_task}[/yellow]')
|
||||
with open(debug_log, 'a') as f:
|
||||
f.write(f'Skipping snapshot creation for system crawl: {system_task}\n')
|
||||
f.write(f'=== Crawl.run() complete ===\n\n')
|
||||
f.write('=== Crawl.run() complete ===\n\n')
|
||||
f.flush()
|
||||
return None
|
||||
|
||||
with open(debug_log, 'a') as f:
|
||||
f.write(f'Creating snapshots from URLs...\n')
|
||||
f.write('Creating snapshots from URLs...\n')
|
||||
f.flush()
|
||||
created_snapshots = self.create_snapshots_from_urls()
|
||||
with open(debug_log, 'a') as f:
|
||||
f.write(f'Created {len(created_snapshots)} snapshots\n')
|
||||
f.write(f'=== Crawl.run() complete ===\n\n')
|
||||
f.write('=== Crawl.run() complete ===\n\n')
|
||||
f.flush()
|
||||
|
||||
# Return first snapshot for this crawl (newly created or existing)
|
||||
@@ -647,6 +672,10 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
for pid_file in self.output_dir.glob('**/*.pid'):
|
||||
pid_file.unlink(missing_ok=True)
|
||||
|
||||
persona = self.resolve_persona()
|
||||
if persona:
|
||||
persona.cleanup_runtime_for_crawl(self)
|
||||
|
||||
# Run on_CrawlEnd hooks
|
||||
from archivebox.config.configset import get_config
|
||||
config = get_config(crawl=self)
|
||||
@@ -715,9 +744,9 @@ class CrawlMachine(BaseStateMachine):
|
||||
|
||||
# Tick Event (polled by workers)
|
||||
tick = (
|
||||
queued.to.itself(unless='can_start') |
|
||||
queued.to(started, cond='can_start') |
|
||||
started.to(sealed, cond='is_finished')
|
||||
queued.to.itself(unless='can_start')
|
||||
| queued.to(started, cond='can_start')
|
||||
| started.to(sealed, cond='is_finished')
|
||||
)
|
||||
|
||||
# Manual event (triggered by last Snapshot sealing)
|
||||
@@ -740,7 +769,6 @@ class CrawlMachine(BaseStateMachine):
|
||||
@started.enter
|
||||
def enter_started(self):
|
||||
import sys
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
print(f'[cyan]🔄 CrawlMachine.enter_started() - creating snapshots for {self.crawl.id}[/cyan]', file=sys.stderr)
|
||||
|
||||
@@ -758,7 +786,7 @@ class CrawlMachine(BaseStateMachine):
|
||||
)
|
||||
else:
|
||||
# No snapshots (system crawl like archivebox://install)
|
||||
print(f'[cyan]🔄 No snapshots created, sealing crawl immediately[/cyan]', file=sys.stderr)
|
||||
print('[cyan]🔄 No snapshots created, sealing crawl immediately[/cyan]', file=sys.stderr)
|
||||
# Seal immediately since there's no work to do
|
||||
self.seal()
|
||||
|
||||
|
||||
@@ -56,16 +56,18 @@ __package__ = 'archivebox'
|
||||
|
||||
import os
|
||||
import json
|
||||
import time
|
||||
from functools import lru_cache
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Any, Optional, TypedDict
|
||||
from typing import TYPE_CHECKING, List, Dict, Any, Optional, TypedDict
|
||||
|
||||
from abx_plugins import get_plugins_dir
|
||||
from django.conf import settings
|
||||
from django.utils.safestring import mark_safe
|
||||
from archivebox.config.constants import CONSTANTS
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from archivebox.machine.models import Process
|
||||
|
||||
|
||||
# Plugin directories
|
||||
BUILTIN_PLUGINS_DIR = Path(get_plugins_dir()).resolve()
|
||||
@@ -266,9 +268,7 @@ def run_hook(
|
||||
"""
|
||||
from archivebox.machine.models import Process, Machine
|
||||
from archivebox.config.constants import CONSTANTS
|
||||
import time
|
||||
import sys
|
||||
start_time = time.time()
|
||||
|
||||
# Auto-detect timeout from plugin config if not explicitly provided
|
||||
if timeout is None:
|
||||
|
||||
@@ -9,7 +9,6 @@ __package__ = "archivebox.ldap"
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from django.contrib.auth.models import User
|
||||
from django_auth_ldap.backend import LDAPBackend as BaseLDAPBackend
|
||||
else:
|
||||
try:
|
||||
|
||||
@@ -10,6 +10,7 @@ from datetime import timedelta, datetime
|
||||
from statemachine import State, registry
|
||||
|
||||
from django.db import models
|
||||
from django.db.models import QuerySet
|
||||
from django.utils import timezone
|
||||
from django.utils.functional import cached_property
|
||||
|
||||
@@ -197,7 +198,6 @@ class NetworkInterface(ModelWithHealthStats):
|
||||
class BinaryManager(models.Manager):
|
||||
def get_from_db_or_cache(self, name: str, abspath: str = '', version: str = '', sha256: str = '', binprovider: str = 'env') -> 'Binary':
|
||||
"""Get or create an Binary record from the database or cache."""
|
||||
global _CURRENT_BINARIES
|
||||
cached = _CURRENT_BINARIES.get(name)
|
||||
if cached and timezone.now() < cached.modified_at + timedelta(seconds=BINARY_RECHECK_INTERVAL):
|
||||
return cached
|
||||
@@ -583,7 +583,6 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine):
|
||||
Called by state machine if needed (not typically used for binaries
|
||||
since installations are foreground, but included for consistency).
|
||||
"""
|
||||
from pathlib import Path
|
||||
|
||||
# Kill any background binary installation hooks using Process records
|
||||
# (rarely used since binary installations are typically foreground)
|
||||
@@ -1026,9 +1025,11 @@ class Process(models.Model):
|
||||
# Check cache validity
|
||||
if _CURRENT_PROCESS:
|
||||
# Verify: same PID, same machine, cache not expired
|
||||
if (_CURRENT_PROCESS.pid == current_pid and
|
||||
_CURRENT_PROCESS.machine_id == machine.id and
|
||||
timezone.now() < _CURRENT_PROCESS.modified_at + timedelta(seconds=PROCESS_RECHECK_INTERVAL)):
|
||||
if (
|
||||
_CURRENT_PROCESS.pid == current_pid
|
||||
and _CURRENT_PROCESS.machine_id == machine.id
|
||||
and timezone.now() < _CURRENT_PROCESS.modified_at + timedelta(seconds=PROCESS_RECHECK_INTERVAL)
|
||||
):
|
||||
_CURRENT_PROCESS.ensure_log_files()
|
||||
return _CURRENT_PROCESS
|
||||
_CURRENT_PROCESS = None
|
||||
@@ -1111,7 +1112,6 @@ class Process(models.Model):
|
||||
machine = machine or Machine.current()
|
||||
|
||||
# Debug logging
|
||||
import sys
|
||||
# print(f"DEBUG _find_parent_process: my_pid={os.getpid()}, ppid={ppid}", file=sys.stderr)
|
||||
|
||||
# Get parent process start time from OS
|
||||
@@ -1630,7 +1630,6 @@ class Process(models.Model):
|
||||
self (updated with pid, started_at, etc.)
|
||||
"""
|
||||
import subprocess
|
||||
import time
|
||||
|
||||
# Validate pwd is set (required for output files)
|
||||
if not self.pwd:
|
||||
@@ -1846,7 +1845,6 @@ class Process(models.Model):
|
||||
Returns:
|
||||
True if process was terminated, False if already dead
|
||||
"""
|
||||
import time
|
||||
import signal
|
||||
|
||||
proc = self.proc
|
||||
@@ -2199,8 +2197,8 @@ class BinaryMachine(BaseStateMachine):
|
||||
|
||||
# Tick Event - install happens during transition
|
||||
tick = (
|
||||
queued.to.itself(unless='can_install') |
|
||||
queued.to(installed, cond='can_install', on='on_install')
|
||||
queued.to.itself(unless='can_install')
|
||||
| queued.to(installed, cond='can_install', on='on_install')
|
||||
)
|
||||
|
||||
def can_install(self) -> bool:
|
||||
@@ -2303,10 +2301,10 @@ class ProcessMachine(BaseStateMachine):
|
||||
|
||||
# Tick Event - transitions based on conditions
|
||||
tick = (
|
||||
queued.to.itself(unless='can_start') |
|
||||
queued.to(running, cond='can_start') |
|
||||
running.to.itself(unless='is_exited') |
|
||||
running.to(exited, cond='is_exited')
|
||||
queued.to.itself(unless='can_start')
|
||||
| queued.to(running, cond='can_start')
|
||||
| running.to.itself(unless='is_exited')
|
||||
| running.to(exited, cond='is_exited')
|
||||
)
|
||||
|
||||
# Additional events (for explicit control)
|
||||
|
||||
@@ -12,8 +12,6 @@ Tests cover:
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from datetime import timedelta
|
||||
from unittest.mock import patch
|
||||
|
||||
@@ -29,7 +27,6 @@ from archivebox.machine.models import (
|
||||
BinaryMachine,
|
||||
ProcessMachine,
|
||||
MACHINE_RECHECK_INTERVAL,
|
||||
PROCESS_RECHECK_INTERVAL,
|
||||
PID_REUSE_WINDOW,
|
||||
)
|
||||
|
||||
@@ -323,7 +320,6 @@ class TestProcessModel(TestCase):
|
||||
def test_process_update_and_requeue(self):
|
||||
"""Process.update_and_requeue() should update fields and save."""
|
||||
process = Process.objects.create(machine=self.machine, cmd=['test'])
|
||||
old_modified = process.modified_at
|
||||
|
||||
process.update_and_requeue(
|
||||
status=Process.StatusChoices.RUNNING,
|
||||
|
||||
@@ -1,5 +1,3 @@
|
||||
__package__ = 'archivebox.mcp'
|
||||
|
||||
"""
|
||||
Model Context Protocol (MCP) server implementation for ArchiveBox.
|
||||
|
||||
@@ -10,9 +8,7 @@ Click command metadata. Handles JSON-RPC 2.0 requests over stdio transport.
|
||||
import sys
|
||||
import json
|
||||
import traceback
|
||||
from typing import Any, Dict, List, Optional
|
||||
from io import StringIO
|
||||
from contextlib import redirect_stdout, redirect_stderr
|
||||
from typing import Optional
|
||||
|
||||
import click
|
||||
from click.testing import CliRunner
|
||||
|
||||
@@ -225,7 +225,6 @@ def check_tmp_dir(tmp_dir=None, throw=False, quiet=False, must_exist=True):
|
||||
|
||||
|
||||
def check_lib_dir(lib_dir: Path | None = None, throw=False, quiet=False, must_exist=True):
|
||||
import archivebox
|
||||
from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP
|
||||
from archivebox.misc.logging import STDERR
|
||||
from archivebox.misc.logging_util import pretty_path
|
||||
|
||||
@@ -35,7 +35,6 @@ def fix_invalid_folder_locations(out_dir: Path = DATA_DIR) -> Tuple[List[str], L
|
||||
with open(index_path, 'r') as f:
|
||||
data = json.load(f)
|
||||
timestamp = data.get('timestamp')
|
||||
url = data.get('url')
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
|
||||
@@ -21,13 +21,12 @@ if TYPE_CHECKING:
|
||||
|
||||
from rich import print
|
||||
from rich.panel import Panel
|
||||
from django.core.management.base import DjangoHelpFormatter
|
||||
|
||||
from archivebox.config import CONSTANTS, DATA_DIR, VERSION
|
||||
from archivebox.config.common import SHELL_CONFIG
|
||||
from archivebox.misc.system import get_dir_size
|
||||
from archivebox.misc.util import enforce_types
|
||||
from archivebox.misc.logging import ANSI, stderr
|
||||
from archivebox.misc.logging import ANSI
|
||||
|
||||
@dataclass
|
||||
class RuntimeStats:
|
||||
|
||||
@@ -1,16 +1,18 @@
|
||||
__package__ = 'archivebox'
|
||||
|
||||
import django
|
||||
import pydantic
|
||||
|
||||
import datetime
|
||||
import warnings
|
||||
|
||||
import benedict
|
||||
from daphne import access
|
||||
import django_stubs_ext
|
||||
from django.utils import timezone
|
||||
|
||||
django_stubs_ext.monkeypatch()
|
||||
|
||||
|
||||
# monkey patch django timezone to add back utc (it was removed in Django 5.0)
|
||||
import datetime
|
||||
from django.utils import timezone
|
||||
timezone.utc = datetime.timezone.utc
|
||||
|
||||
# monkey patch django-signals-webhooks to change how it shows up in Admin UI
|
||||
@@ -26,12 +28,9 @@ timezone.utc = datetime.timezone.utc
|
||||
|
||||
# Hide site-packages/sonic/client.py:115: SyntaxWarning
|
||||
# https://github.com/xmonader/python-sonic-client/pull/18
|
||||
import warnings # noqa
|
||||
warnings.filterwarnings("ignore", category=SyntaxWarning, module='sonic')
|
||||
|
||||
# Make daphne log requests quieter and easier to read
|
||||
from daphne import access # noqa
|
||||
|
||||
class ModifiedAccessLogGenerator(access.AccessLogGenerator):
|
||||
"""Clutge workaround until daphne uses the Python logging framework. https://github.com/django/daphne/pull/473/files"""
|
||||
|
||||
@@ -68,5 +67,4 @@ access.AccessLogGenerator.write_entry = ModifiedAccessLogGenerator.write_entry #
|
||||
# fix benedict objects to pretty-print/repr more nicely with rich
|
||||
# https://stackoverflow.com/a/79048811/2156113
|
||||
# https://rich.readthedocs.io/en/stable/pretty.html#rich-repr-protocol
|
||||
import benedict # noqa
|
||||
benedict.benedict.__rich_repr__ = lambda self: (dict(self),) # type: ignore
|
||||
|
||||
@@ -135,7 +135,6 @@ class ProcessLogPanel:
|
||||
if line:
|
||||
log_lines.append(Text(line, style="cyan"))
|
||||
|
||||
compact = self.compact if self.compact is not None else self._is_background_hook()
|
||||
max_body = max(1, self.max_lines - len(header_lines))
|
||||
if not log_lines:
|
||||
log_lines = []
|
||||
|
||||
@@ -4,10 +4,11 @@ __package__ = 'archivebox.misc'
|
||||
import os
|
||||
import signal
|
||||
import shutil
|
||||
import sys
|
||||
|
||||
from json import dump
|
||||
from pathlib import Path
|
||||
from typing import Optional, Union, Set, Tuple
|
||||
from typing import Optional, Union, Tuple
|
||||
from subprocess import _mswindows, PIPE, Popen, CalledProcessError, CompletedProcess, TimeoutExpired
|
||||
|
||||
from atomicwrites import atomic_write as lib_atomic_write
|
||||
@@ -58,7 +59,7 @@ def run(cmd, *args, input=None, capture_output=True, timeout=None, check=False,
|
||||
# far into the TimeoutExpired exception.
|
||||
process.wait()
|
||||
raise
|
||||
except: # Including KeyboardInterrupt, communicate handled that.
|
||||
except BaseException: # Including KeyboardInterrupt, communicate handled that.
|
||||
process.kill()
|
||||
# We don't call process.wait() as .__exit__ does that for us.
|
||||
raise
|
||||
|
||||
@@ -1,3 +1,2 @@
|
||||
from django.contrib import admin
|
||||
|
||||
# Register your models here.
|
||||
|
||||
@@ -11,8 +11,12 @@ Each persona has its own:
|
||||
|
||||
__package__ = 'archivebox.personas'
|
||||
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
from contextlib import contextmanager
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Iterator
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from django.db import models
|
||||
from django.conf import settings
|
||||
@@ -21,8 +25,32 @@ from django.utils import timezone
|
||||
from archivebox.base_models.models import ModelWithConfig, get_or_create_system_user_pk
|
||||
from archivebox.uuid_compat import uuid7
|
||||
|
||||
try:
|
||||
import fcntl
|
||||
except ImportError: # pragma: no cover
|
||||
fcntl = None
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from django.db.models import QuerySet
|
||||
pass
|
||||
|
||||
|
||||
VOLATILE_PROFILE_DIR_NAMES = {
|
||||
'Cache',
|
||||
'Code Cache',
|
||||
'GPUCache',
|
||||
'ShaderCache',
|
||||
'Service Worker',
|
||||
'GCM Store',
|
||||
'Crashpad',
|
||||
'BrowserMetrics',
|
||||
}
|
||||
|
||||
VOLATILE_PROFILE_FILE_NAMES = {
|
||||
'BrowserMetrics-spare.pma',
|
||||
'SingletonCookie',
|
||||
'SingletonLock',
|
||||
'SingletonSocket',
|
||||
}
|
||||
|
||||
|
||||
class Persona(ModelWithConfig):
|
||||
@@ -120,37 +148,118 @@ class Persona(ModelWithConfig):
|
||||
(self.path / 'chrome_extensions').mkdir(parents=True, exist_ok=True)
|
||||
(self.path / 'chrome_downloads').mkdir(parents=True, exist_ok=True)
|
||||
|
||||
def cleanup_chrome(self) -> bool:
|
||||
"""
|
||||
Clean up Chrome state files (SingletonLock, etc.) for this persona.
|
||||
|
||||
Returns:
|
||||
True if cleanup was performed, False if no cleanup needed
|
||||
"""
|
||||
def cleanup_chrome_profile(self, profile_dir: Path) -> bool:
|
||||
"""Remove volatile Chrome state that should never be reused across launches."""
|
||||
cleaned = False
|
||||
chrome_dir = self.path / 'chrome_user_data'
|
||||
|
||||
if not chrome_dir.exists():
|
||||
if not profile_dir.exists():
|
||||
return False
|
||||
|
||||
# Clean up SingletonLock files
|
||||
for lock_file in chrome_dir.glob('**/SingletonLock'):
|
||||
try:
|
||||
lock_file.unlink()
|
||||
cleaned = True
|
||||
except OSError:
|
||||
pass
|
||||
for path in profile_dir.rglob('*'):
|
||||
if path.name in VOLATILE_PROFILE_FILE_NAMES:
|
||||
try:
|
||||
path.unlink()
|
||||
cleaned = True
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
# Clean up SingletonSocket files
|
||||
for socket_file in chrome_dir.glob('**/SingletonSocket'):
|
||||
for dirname in VOLATILE_PROFILE_DIR_NAMES:
|
||||
for path in profile_dir.rglob(dirname):
|
||||
if not path.is_dir():
|
||||
continue
|
||||
shutil.rmtree(path, ignore_errors=True)
|
||||
cleaned = True
|
||||
|
||||
for path in profile_dir.rglob('*.log'):
|
||||
try:
|
||||
socket_file.unlink()
|
||||
path.unlink()
|
||||
cleaned = True
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
return cleaned
|
||||
|
||||
def cleanup_chrome(self) -> bool:
    """Clean volatile Chrome state out of this persona's base profile.

    Delegates to ``cleanup_chrome_profile`` on ``self.path / 'chrome_user_data'``
    and returns whatever that call returns.
    """
    base_profile_dir = self.path / 'chrome_user_data'
    return self.cleanup_chrome_profile(base_profile_dir)
|
||||
|
||||
@contextmanager
def lock_runtime_for_crawl(self):
    """Context manager that serialises work on this persona via a lock file.

    Opens ``.archivebox-crawl-profile.lock`` inside ``self.path`` (creating the
    directory first) and, when the ``fcntl`` module is available, holds an
    exclusive ``flock`` on it for the duration of the ``with`` body. When
    ``fcntl`` is ``None`` the file is still opened but no lock is taken.
    """
    lock_path = self.path / '.archivebox-crawl-profile.lock'
    lock_path.parent.mkdir(parents=True, exist_ok=True)

    can_flock = fcntl is not None
    with lock_path.open('w') as handle:
        if can_flock:
            fcntl.flock(handle.fileno(), fcntl.LOCK_EX)
        try:
            yield
        finally:
            # Release explicitly even though closing the file follows right after.
            if can_flock:
                fcntl.flock(handle.fileno(), fcntl.LOCK_UN)
|
||||
|
||||
def runtime_root_for_crawl(self, crawl) -> Path:
    """Return ``<crawl.output_dir>/.persona/<self.name>`` (the path is not created here)."""
    persona_area = Path(crawl.output_dir) / '.persona'
    return persona_area / self.name
|
||||
|
||||
def runtime_profile_dir_for_crawl(self, crawl) -> Path:
    """Return the ``chrome_user_data`` directory under this persona's per-crawl runtime root."""
    crawl_root = self.runtime_root_for_crawl(crawl)
    return crawl_root / 'chrome_user_data'
|
||||
|
||||
def runtime_downloads_dir_for_crawl(self, crawl) -> Path:
    """Return the ``chrome_downloads`` directory under this persona's per-crawl runtime root."""
    crawl_root = self.runtime_root_for_crawl(crawl)
    return crawl_root / 'chrome_downloads'
|
||||
|
||||
def copy_chrome_profile(self, source_dir: Path, destination_dir: Path) -> None:
    """Replace ``destination_dir`` with a copy of the contents of ``source_dir``.

    Any existing ``destination_dir`` is removed first. On macOS and Linux the
    copy is attempted with the system ``cp`` (``cp -cR`` / ``cp -a``); if that
    command cannot be started or exits non-zero, or on any other platform,
    the copy is done with ``shutil.copytree`` (symlinks preserved as symlinks).

    Args:
        source_dir: Profile directory to copy from.
        destination_dir: Directory to (re)create and copy into.

    Raises:
        Whatever ``shutil.copytree`` raises if the fallback copy itself fails.
    """

    def reset_destination() -> None:
        # Start from an empty directory so stale or partially-copied files never survive.
        shutil.rmtree(destination_dir, ignore_errors=True)
        destination_dir.mkdir(parents=True, exist_ok=True)

    destination_dir.parent.mkdir(parents=True, exist_ok=True)
    reset_destination()

    copy_cmd: list[str] | None = None
    # The trailing "/." makes cp copy the directory's contents rather than the directory itself.
    source_contents = f'{source_dir}/.'

    if sys.platform == 'darwin':
        copy_cmd = ['cp', '-cR', source_contents, str(destination_dir)]
    elif sys.platform.startswith('linux'):
        copy_cmd = ['cp', '-a', source_contents, str(destination_dir)]

    if copy_cmd:
        try:
            # Output is captured only to keep it off the console; it is never read,
            # so it is left as bytes to avoid decode errors on odd filenames.
            result = subprocess.run(copy_cmd, capture_output=True)
        except OSError:
            # cp is missing or not executable: fall through to the pure-Python copy.
            result = None

        if result is not None and result.returncode == 0:
            return

        # Discard anything a failed cp may have left behind before retrying.
        reset_destination()

    shutil.copytree(source_dir, destination_dir, symlinks=True, dirs_exist_ok=True)
|
||||
|
||||
def prepare_runtime_for_crawl(self, crawl, chrome_binary: str = '') -> dict[str, str]:
    """Set up this persona's per-crawl Chrome runtime directories and describe them.

    While holding ``lock_runtime_for_crawl()``:
      * if the runtime profile dir does not exist yet, it is copied from the
        template dir (``self.CHROME_USER_DATA_DIR``) when that exists and is
        non-empty, otherwise created empty;
      * the runtime downloads dir is created;
      * ``cleanup_chrome_profile`` is run on the runtime profile dir;
      * ``persona_name.txt``, ``template_dir.txt`` and, when ``chrome_binary``
        is non-empty, ``chrome_binary.txt`` are written into the runtime root.

    Returns:
        A dict with ``CHROME_USER_DATA_DIR`` and ``CHROME_DOWNLOADS_DIR`` set
        to the runtime profile and downloads paths (as strings).
    """
    self.ensure_dirs()

    template_dir = Path(self.CHROME_USER_DATA_DIR)
    root = self.runtime_root_for_crawl(crawl)
    profile_dir = self.runtime_profile_dir_for_crawl(crawl)
    downloads_dir = self.runtime_downloads_dir_for_crawl(crawl)

    with self.lock_runtime_for_crawl():
        if not profile_dir.exists():
            template_has_content = template_dir.exists() and any(template_dir.iterdir())
            if template_has_content:
                self.copy_chrome_profile(template_dir, profile_dir)
            else:
                profile_dir.mkdir(parents=True, exist_ok=True)

        downloads_dir.mkdir(parents=True, exist_ok=True)
        self.cleanup_chrome_profile(profile_dir)

        # Small marker files recording where this runtime profile came from.
        markers = {
            'persona_name.txt': self.name,
            'template_dir.txt': str(template_dir),
        }
        if chrome_binary:
            markers['chrome_binary.txt'] = chrome_binary
        for filename, contents in markers.items():
            (root / filename).write_text(contents)

    return {
        'CHROME_USER_DATA_DIR': str(profile_dir),
        'CHROME_DOWNLOADS_DIR': str(downloads_dir),
    }
|
||||
|
||||
def cleanup_runtime_for_crawl(self, crawl) -> None:
    """Delete the crawl's entire ``.persona`` directory, ignoring any errors.

    Note that this removes ``<crawl.output_dir>/.persona`` as a whole, not
    only the subdirectory named after this persona.
    """
    persona_area = Path(crawl.output_dir) / '.persona'
    shutil.rmtree(persona_area, ignore_errors=True)
|
||||
|
||||
@classmethod
|
||||
def get_or_create_default(cls) -> 'Persona':
|
||||
"""Get or create the Default persona."""
|
||||
|
||||
@@ -1,3 +1,2 @@
|
||||
from django.test import TestCase
|
||||
|
||||
# Create your tests here.
|
||||
|
||||
@@ -1,3 +1,2 @@
|
||||
from django.shortcuts import render
|
||||
|
||||
# Create your views here.
|
||||
|
||||
@@ -14,7 +14,7 @@ Search backends must provide a search.py module with:
|
||||
|
||||
__package__ = 'archivebox.search'
|
||||
|
||||
from typing import TYPE_CHECKING, Any, Optional
|
||||
from typing import Any, Optional
|
||||
|
||||
from django.db.models import QuerySet
|
||||
|
||||
@@ -22,9 +22,6 @@ from archivebox.misc.util import enforce_types
|
||||
from archivebox.misc.logging import stderr
|
||||
from archivebox.config.common import SEARCH_BACKEND_CONFIG
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
|
||||
# Cache discovered backends to avoid repeated filesystem scans
|
||||
_search_backends_cache: Optional[dict] = None
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
"""archivebox/tests/conftest.py - Pytest fixtures for CLI tests."""
|
||||
|
||||
import os
|
||||
import shutil
|
||||
import sys
|
||||
import subprocess
|
||||
import textwrap
|
||||
@@ -13,6 +12,8 @@ import pytest
|
||||
|
||||
from archivebox.uuid_compat import uuid7
|
||||
|
||||
pytest_plugins = ["archivebox.tests.fixtures"]
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# CLI Helpers (defined before fixtures that use them)
|
||||
|
||||
@@ -1,9 +1,6 @@
|
||||
import subprocess
|
||||
import json
|
||||
import sqlite3
|
||||
import os
|
||||
|
||||
from .fixtures import *
|
||||
import sqlite3
|
||||
import subprocess
|
||||
|
||||
def test_depth_flag_is_accepted(process, disable_extractors_dict):
|
||||
arg_process = subprocess.run(["archivebox", "add", "--index-only", "https://example.com", "--depth=0"],
|
||||
@@ -31,7 +28,7 @@ def test_depth_flag_fails_if_it_is_not_0_or_1(process, disable_extractors_dict):
|
||||
|
||||
def test_depth_flag_0_creates_source_file(tmp_path, process, disable_extractors_dict):
|
||||
os.chdir(tmp_path)
|
||||
arg_process = subprocess.run(
|
||||
subprocess.run(
|
||||
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
|
||||
@@ -9,7 +9,7 @@ Tests cover:
|
||||
"""
|
||||
|
||||
import pytest
|
||||
from django.test import TestCase, Client, override_settings
|
||||
from django.test import override_settings
|
||||
from django.urls import reverse
|
||||
from django.contrib.auth import get_user_model
|
||||
|
||||
|
||||
@@ -9,7 +9,7 @@ import os
|
||||
import sys
|
||||
import tempfile
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
from importlib.util import find_spec
|
||||
|
||||
|
||||
class TestLDAPConfig(unittest.TestCase):
|
||||
@@ -100,13 +100,7 @@ class TestLDAPIntegration(unittest.TestCase):
|
||||
|
||||
def test_django_settings_with_ldap_library_check(self):
|
||||
"""Test that Django settings check for LDAP libraries when enabled."""
|
||||
# Try to import django-auth-ldap to see if it's available
|
||||
try:
|
||||
import django_auth_ldap
|
||||
import ldap
|
||||
ldap_available = True
|
||||
except ImportError:
|
||||
ldap_available = False
|
||||
ldap_available = find_spec("django_auth_ldap") is not None and find_spec("ldap") is not None
|
||||
|
||||
# If LDAP libraries are not available, settings should handle gracefully
|
||||
if not ldap_available:
|
||||
|
||||
@@ -5,11 +5,8 @@ Verify add creates snapshots in DB, crawls, source files, and archive directorie
|
||||
"""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
|
||||
from .fixtures import *
|
||||
import subprocess
|
||||
|
||||
|
||||
def test_add_single_url_creates_snapshot_in_db(tmp_path, process, disable_extractors_dict):
|
||||
@@ -169,6 +166,30 @@ def test_add_with_tags(tmp_path, process, disable_extractors_dict):
|
||||
assert 'test' in tags_str or 'example' in tags_str
|
||||
|
||||
|
||||
def test_add_records_selected_persona_on_crawl(tmp_path, process, disable_extractors_dict):
    """Test add persists the selected persona so browser config derives from it later."""
    os.chdir(tmp_path)
    result = subprocess.run(
        ['archivebox', 'add', '--index-only', '--depth=0', '--persona=Default', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    # Surface the CLI's stderr on failure, like the other add/extract tests do.
    assert result.returncode == 0, result.stderr.decode('utf-8', errors='replace')

    conn = sqlite3.connect("index.sqlite3")
    try:
        row = conn.execute(
            "SELECT persona_id, json_extract(config, '$.DEFAULT_PERSONA') FROM crawls_crawl LIMIT 1"
        ).fetchone()
    finally:
        # Always release the database handle, even if the query itself fails.
        conn.close()

    # Fail with a readable assertion instead of a TypeError when no crawl exists.
    assert row is not None, 'archivebox add did not create any crawl row'
    persona_id, default_persona = row

    assert persona_id
    assert default_persona == 'Default'
    assert (tmp_path / "personas" / "Default" / "chrome_user_data").is_dir()
    assert (tmp_path / "personas" / "Default" / "chrome_extensions").is_dir()
|
||||
|
||||
|
||||
def test_add_duplicate_url_creates_separate_crawls(tmp_path, process, disable_extractors_dict):
|
||||
"""Test that adding the same URL twice creates separate crawls and snapshots.
|
||||
|
||||
|
||||
@@ -9,7 +9,6 @@ Tests cover:
|
||||
"""
|
||||
|
||||
import json
|
||||
import pytest
|
||||
|
||||
from archivebox.tests.conftest import (
|
||||
run_archivebox_cmd,
|
||||
|
||||
@@ -6,9 +6,6 @@ Verify config reads/writes ArchiveBox.conf file correctly.
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
from .fixtures import *
|
||||
|
||||
|
||||
def test_config_displays_all_config(tmp_path, process):
|
||||
|
||||
@@ -9,14 +9,11 @@ Tests cover:
|
||||
"""
|
||||
|
||||
import json
|
||||
import pytest
|
||||
|
||||
from archivebox.tests.conftest import (
|
||||
run_archivebox_cmd,
|
||||
parse_jsonl_output,
|
||||
assert_jsonl_contains_type,
|
||||
create_test_url,
|
||||
create_test_crawl_json,
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -5,10 +5,8 @@ Verify extract re-runs extractors on existing snapshots.
|
||||
"""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sqlite3
|
||||
|
||||
from .fixtures import *
|
||||
import subprocess
|
||||
|
||||
|
||||
def test_extract_runs_on_existing_snapshots(tmp_path, process, disable_extractors_dict):
|
||||
|
||||
@@ -7,8 +7,6 @@ Verify command runs successfully and produces output.
|
||||
import os
|
||||
import subprocess
|
||||
|
||||
from .fixtures import *
|
||||
|
||||
|
||||
def test_help_runs_successfully(tmp_path):
|
||||
"""Test that help command runs and produces output."""
|
||||
|
||||
@@ -5,14 +5,11 @@ Verify init creates correct database schema, filesystem structure, and config.
|
||||
"""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
import subprocess
|
||||
|
||||
from archivebox.config.common import STORAGE_CONFIG
|
||||
|
||||
from .fixtures import *
|
||||
|
||||
|
||||
DIR_PERMISSIONS = STORAGE_CONFIG.OUTPUT_PERMISSIONS.replace('6', '7').replace('4', '5')
|
||||
|
||||
|
||||
@@ -5,12 +5,10 @@ Verify install detects and records binary dependencies in DB.
|
||||
"""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sqlite3
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
from .fixtures import *
|
||||
|
||||
|
||||
def test_install_runs_successfully(tmp_path, process):
|
||||
"""Test that install command runs without error."""
|
||||
|
||||
@@ -6,9 +6,6 @@ Verify manage command runs Django management commands.
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sqlite3
|
||||
|
||||
from .fixtures import *
|
||||
|
||||
|
||||
def test_manage_help_works(tmp_path, process):
|
||||
|
||||
@@ -5,11 +5,8 @@ Verify remove deletes snapshots from DB and filesystem.
|
||||
"""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
|
||||
from .fixtures import *
|
||||
import subprocess
|
||||
|
||||
|
||||
def test_remove_deletes_snapshot_from_db(tmp_path, process, disable_extractors_dict):
|
||||
|
||||
@@ -8,7 +8,6 @@ Tests cover:
|
||||
"""
|
||||
|
||||
import json
|
||||
import pytest
|
||||
|
||||
from archivebox.tests.conftest import (
|
||||
run_archivebox_cmd,
|
||||
|
||||
@@ -10,11 +10,9 @@ Tests cover:
|
||||
|
||||
import json
|
||||
import sqlite3
|
||||
import time
|
||||
|
||||
from archivebox.tests.conftest import (
|
||||
run_archivebox_cmd,
|
||||
parse_jsonl_output,
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -5,7 +5,6 @@ import os
|
||||
import sqlite3
|
||||
import subprocess
|
||||
|
||||
from .fixtures import process, disable_extractors_dict
|
||||
|
||||
|
||||
def test_schedule_run_all_enqueues_scheduled_crawl(tmp_path, process, disable_extractors_dict):
|
||||
|
||||
@@ -6,9 +6,6 @@ Verify search queries snapshots from DB.
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sqlite3
|
||||
|
||||
from .fixtures import *
|
||||
|
||||
|
||||
def test_search_finds_snapshots(tmp_path, process, disable_extractors_dict):
|
||||
|
||||
@@ -6,10 +6,6 @@ Verify server can start (basic smoke tests only, no full server testing).
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import signal
|
||||
import time
|
||||
|
||||
from .fixtures import *
|
||||
|
||||
|
||||
def test_server_shows_usage_info(tmp_path, process):
|
||||
|
||||
@@ -7,8 +7,6 @@ Verify shell command starts Django shell (basic smoke tests only).
|
||||
import os
|
||||
import subprocess
|
||||
|
||||
from .fixtures import *
|
||||
|
||||
|
||||
def test_shell_command_exists(tmp_path, process):
|
||||
"""Test that shell command is recognized."""
|
||||
|
||||
@@ -9,12 +9,10 @@ Tests cover:
|
||||
"""
|
||||
|
||||
import json
|
||||
import pytest
|
||||
|
||||
from archivebox.tests.conftest import (
|
||||
run_archivebox_cmd,
|
||||
parse_jsonl_output,
|
||||
assert_jsonl_contains_type,
|
||||
create_test_url,
|
||||
)
|
||||
|
||||
|
||||
@@ -5,12 +5,10 @@ Verify status reports accurate collection state from DB and filesystem.
|
||||
"""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sqlite3
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
from .fixtures import *
|
||||
|
||||
|
||||
def _find_snapshot_dir(data_dir: Path, snapshot_id: str) -> Path | None:
|
||||
candidates = {snapshot_id}
|
||||
|
||||
@@ -5,10 +5,8 @@ Verify update drains old dirs, reconciles DB, and queues snapshots.
|
||||
"""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sqlite3
|
||||
|
||||
from .fixtures import *
|
||||
import subprocess
|
||||
|
||||
|
||||
def test_update_runs_successfully_on_empty_archive(tmp_path, process):
|
||||
|
||||
@@ -11,7 +11,9 @@ import tempfile
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
from .fixtures import *
|
||||
from .fixtures import process
|
||||
|
||||
FIXTURES = (process,)
|
||||
|
||||
|
||||
def _archivebox_cli() -> str:
|
||||
|
||||
@@ -6,7 +6,6 @@ import subprocess
|
||||
|
||||
import pytest
|
||||
|
||||
from .fixtures import process, disable_extractors_dict
|
||||
|
||||
|
||||
def test_config_shows_all_config_values(tmp_path, process):
|
||||
@@ -49,6 +48,7 @@ def test_config_set_value_writes_to_config_file(tmp_path, process):
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
assert result.returncode == 0, result.stderr
|
||||
|
||||
# Read the config file directly to verify it was written
|
||||
config_file = tmp_path / 'ArchiveBox.conf'
|
||||
|
||||
@@ -4,11 +4,9 @@
|
||||
import os
|
||||
import subprocess
|
||||
import sqlite3
|
||||
import json
|
||||
|
||||
import pytest
|
||||
|
||||
from .fixtures import process, disable_extractors_dict
|
||||
|
||||
|
||||
def test_crawl_creates_crawl_object(tmp_path, process, disable_extractors_dict):
|
||||
|
||||
@@ -8,7 +8,6 @@ import json
|
||||
|
||||
import pytest
|
||||
|
||||
from .fixtures import process, disable_extractors_dict
|
||||
|
||||
|
||||
def test_extract_runs_on_snapshot_id(tmp_path, process, disable_extractors_dict):
|
||||
@@ -231,6 +230,7 @@ def test_extract_multiple_snapshots(tmp_path, process, disable_extractors_dict):
|
||||
text=True,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
assert result.returncode == 0, result.stderr
|
||||
|
||||
# Should not error
|
||||
conn = sqlite3.connect('index.sqlite3')
|
||||
|
||||
@@ -1,8 +1,12 @@
|
||||
from .fixtures import *
|
||||
import json as pyjson
|
||||
import sqlite3
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
from .fixtures import disable_extractors_dict, process
|
||||
|
||||
FIXTURES = (disable_extractors_dict, process)
|
||||
|
||||
|
||||
def _find_snapshot_dir(data_dir: Path, snapshot_id: str) -> Path | None:
|
||||
candidates = {snapshot_id}
|
||||
|
||||
@@ -16,7 +16,7 @@ import subprocess
|
||||
import tempfile
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
from unittest.mock import MagicMock, patch
|
||||
from unittest.mock import patch
|
||||
|
||||
# Set up Django before importing any Django-dependent modules
|
||||
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
|
||||
|
||||
@@ -3,13 +3,13 @@
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
import json, shutil
|
||||
import sqlite3
|
||||
|
||||
from archivebox.config.common import STORAGE_CONFIG
|
||||
|
||||
from .fixtures import *
|
||||
from .fixtures import disable_extractors_dict, process
|
||||
|
||||
FIXTURES = (disable_extractors_dict, process)
|
||||
|
||||
DIR_PERMISSIONS = STORAGE_CONFIG.OUTPUT_PERMISSIONS.replace('6', '7').replace('4', '5')
|
||||
|
||||
@@ -25,6 +25,7 @@ def test_add_link(tmp_path, process, disable_extractors_dict):
|
||||
os.chdir(tmp_path)
|
||||
add_process = subprocess.run(['archivebox', 'add', '--index-only', 'https://example.com'],
|
||||
capture_output=True, env=disable_extractors_dict)
|
||||
assert add_process.returncode == 0, add_process.stderr.decode("utf-8")
|
||||
|
||||
# In the new architecture, URLs are saved to source files
|
||||
# Check that a source file was created with the URL
|
||||
@@ -41,6 +42,7 @@ def test_add_multiple_urls(tmp_path, process, disable_extractors_dict):
|
||||
os.chdir(tmp_path)
|
||||
add_process = subprocess.run(['archivebox', 'add', '--index-only', 'https://example.com', 'https://iana.org'],
|
||||
capture_output=True, env=disable_extractors_dict)
|
||||
assert add_process.returncode == 0, add_process.stderr.decode("utf-8")
|
||||
|
||||
# Check that a source file was created with both URLs
|
||||
sources_dir = tmp_path / "sources"
|
||||
@@ -61,6 +63,7 @@ def test_correct_permissions_add_command_results(tmp_path, process, disable_extr
|
||||
os.chdir(tmp_path)
|
||||
add_process = subprocess.run(['archivebox', 'add', '--index-only', 'https://example.com'], capture_output=True,
|
||||
env=disable_extractors_dict)
|
||||
assert add_process.returncode == 0, add_process.stderr.decode("utf-8")
|
||||
|
||||
# Check database permissions
|
||||
assert oct((tmp_path / "index.sqlite3").stat().st_mode)[-3:] in (STORAGE_CONFIG.OUTPUT_PERMISSIONS, DIR_PERMISSIONS)
|
||||
|
||||
@@ -7,7 +7,6 @@ import sqlite3
|
||||
|
||||
import pytest
|
||||
|
||||
from .fixtures import process, disable_extractors_dict
|
||||
|
||||
|
||||
class TestInstallDryRun:
|
||||
|
||||
@@ -1,7 +1,9 @@
|
||||
import json
|
||||
import subprocess
|
||||
|
||||
from .fixtures import *
|
||||
from .fixtures import disable_extractors_dict, process
|
||||
|
||||
FIXTURES = (disable_extractors_dict, process)
|
||||
|
||||
def test_search_json(process, disable_extractors_dict):
|
||||
subprocess.run(["archivebox", "add", "--index-only", "https://example.com", "--depth=0"],
|
||||
|
||||
@@ -10,10 +10,8 @@ Migration tests from 0.8.x to 0.9.x.
|
||||
- New fields like depth, retry_at, etc.
|
||||
"""
|
||||
|
||||
import json
|
||||
import shutil
|
||||
import sqlite3
|
||||
import subprocess
|
||||
import tempfile
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
@@ -579,7 +577,7 @@ class TestFilesystemMigration08to09(unittest.TestCase):
|
||||
f"Files lost during migration: {files_before_count} -> {files_after_count}")
|
||||
|
||||
# Run update to trigger filesystem reorganization
|
||||
print(f"\n[*] Running archivebox update to reorganize filesystem...")
|
||||
print("\n[*] Running archivebox update to reorganize filesystem...")
|
||||
result = run_archivebox(self.work_dir, ['update'], timeout=120)
|
||||
self.assertEqual(result.returncode, 0, f"Update failed: {result.stderr}")
|
||||
|
||||
@@ -657,7 +655,7 @@ class TestFilesystemMigration08to09(unittest.TestCase):
|
||||
|
||||
# CRITICAL: Verify sample files exist in new structure
|
||||
self.assertGreater(len(new_sample_files), 0,
|
||||
f"Sample files not found in new structure")
|
||||
"Sample files not found in new structure")
|
||||
|
||||
# Verify new path format
|
||||
for path_key, file_path in new_sample_files.items():
|
||||
|
||||
@@ -10,7 +10,6 @@ from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from .fixtures import process, disable_extractors_dict, recursive_test_site
|
||||
|
||||
|
||||
def wait_for_db_condition(timeout, condition, interval=0.5):
|
||||
@@ -77,7 +76,6 @@ def test_background_hooks_dont_block_parser_extractors(tmp_path, process, recurs
|
||||
"SAVE_ARCHIVEDOTORG": "false",
|
||||
"SAVE_TITLE": "false",
|
||||
"SAVE_FAVICON": "true",
|
||||
"SAVE_WGET": "false",
|
||||
})
|
||||
|
||||
proc = subprocess.Popen(
|
||||
|
||||
@@ -1,7 +1,10 @@
|
||||
import os
|
||||
import sqlite3
|
||||
import subprocess
|
||||
|
||||
from .fixtures import *
|
||||
from .fixtures import disable_extractors_dict, process
|
||||
|
||||
FIXTURES = (disable_extractors_dict, process)
|
||||
|
||||
def test_remove_single_snapshot(tmp_path, process, disable_extractors_dict):
|
||||
"""Test removing a snapshot by URL pattern"""
|
||||
|
||||
@@ -7,7 +7,6 @@ import subprocess
|
||||
|
||||
import pytest
|
||||
|
||||
from .fixtures import process
|
||||
|
||||
|
||||
def _fetchone(tmp_path, query):
|
||||
|
||||
420
archivebox/tests/test_schedule_e2e.py
Normal file
420
archivebox/tests/test_schedule_e2e.py
Normal file
@@ -0,0 +1,420 @@
|
||||
#!/usr/bin/env python3
|
||||
"""End-to-end tests for scheduling across CLI, server, API, and web UI."""
|
||||
|
||||
import os
|
||||
import socket
|
||||
import sqlite3
|
||||
import subprocess
|
||||
import sys
|
||||
import textwrap
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
import requests
|
||||
|
||||
from .conftest import run_python_cwd
|
||||
|
||||
|
||||
REPO_ROOT = Path(__file__).resolve().parents[2]
|
||||
|
||||
|
||||
def init_archive(cwd: Path) -> None:
    """Initialise a fresh ArchiveBox collection inside *cwd*.

    Runs ``archivebox init --quick`` with the current interpreter and fails
    the calling test, using the captured stderr as the assertion message,
    when the command exits with a non-zero status.
    """
    init_cmd = [sys.executable, '-m', 'archivebox', 'init', '--quick']
    completed = subprocess.run(
        init_cmd,
        cwd=cwd,
        capture_output=True,
        text=True,
        timeout=60,
    )
    assert completed.returncode == 0, completed.stderr
|
||||
|
||||
|
||||
def build_test_env(port: int, **extra: str) -> dict[str, str]:
    """Build the environment used for ArchiveBox subprocesses in these tests.

    Starts from a copy of the current process environment without
    ``DATA_DIR``, layers the fixed test configuration on top (server host
    names derived from *port*, every extractor switched off except wget),
    and finally applies any caller-supplied *extra* overrides.

    The process-wide ``os.environ`` is not modified.
    """
    inherited = {key: value for key, value in os.environ.items() if key != 'DATA_DIR'}
    test_config = {
        'LISTEN_HOST': f'archivebox.localhost:{port}',
        'ALLOWED_HOSTS': '*',
        'CSRF_TRUSTED_ORIGINS': f'http://admin.archivebox.localhost:{port}',
        'PUBLIC_ADD_VIEW': 'True',
        'USE_COLOR': 'False',
        'SHOW_PROGRESS': 'False',
        'TIMEOUT': '20',
        'URL_ALLOWLIST': r'127\.0\.0\.1[:/].*',
        'SAVE_ARCHIVEDOTORG': 'False',
        'SAVE_TITLE': 'False',
        'SAVE_FAVICON': 'False',
        'SAVE_WARC': 'False',
        'SAVE_PDF': 'False',
        'SAVE_SCREENSHOT': 'False',
        'SAVE_DOM': 'False',
        'SAVE_SINGLEFILE': 'False',
        'SAVE_READABILITY': 'False',
        'SAVE_MERCURY': 'False',
        'SAVE_GIT': 'False',
        'SAVE_YTDLP': 'False',
        'SAVE_HEADERS': 'False',
        'SAVE_HTMLTOTEXT': 'False',
        'SAVE_WGET': 'True',
        'USE_CHROME': 'False',
    }
    # Later mappings win: fixed test config overrides the inherited
    # environment, and explicit keyword overrides beat both.
    return {**inherited, **test_config, **extra}
|
||||
|
||||
|
||||
def get_free_port() -> int:
    """Return a TCP port number that the OS reported as free on 127.0.0.1.

    Binding to port 0 lets the OS pick an unused port. The probing socket
    is closed before this function returns.
    """
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
        probe.bind(('127.0.0.1', 0))
        _host, chosen_port = probe.getsockname()
    return chosen_port
|
||||
|
||||
|
||||
def start_server(cwd: Path, env: dict[str, str], port: int) -> None:
    """Launch ``archivebox server --daemonize`` bound to 127.0.0.1:*port*.

    The command is run from *cwd* with the supplied *env*; the calling test
    fails with the captured stderr if the launcher exits non-zero.
    """
    bind_addr = f'127.0.0.1:{port}'
    server_cmd = [sys.executable, '-m', 'archivebox', 'server', '--daemonize', bind_addr]
    completed = subprocess.run(
        server_cmd,
        cwd=cwd,
        env=env,
        capture_output=True,
        text=True,
        timeout=60,
    )
    assert completed.returncode == 0, completed.stderr
|
||||
|
||||
|
||||
def stop_server(cwd: Path) -> None:
    """Stop the supervisord process for the collection in *cwd*.

    Runs a small Django-bootstrapping script through ``run_python_cwd`` that
    calls ``stop_existing_supervisord_process()``. The helper's result is
    intentionally not inspected here.
    """
    shutdown_script = textwrap.dedent(
        """
        import os
        os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
        import django
        django.setup()
        from archivebox.workers.supervisord_util import stop_existing_supervisord_process
        stop_existing_supervisord_process()
        print('stopped')
        """
    )
    run_python_cwd(shutdown_script, cwd=cwd, timeout=30)
|
||||
|
||||
|
||||
def wait_for_http(port: int, host: str, path: str = '/', timeout: int = 30) -> requests.Response:
    """Poll the local server until it answers with a non-5xx response.

    Args:
        port: TCP port on 127.0.0.1 to connect to.
        host: Value sent in the ``Host`` header of each request.
        path: URL path to request.
        timeout: Overall time budget in seconds.

    Returns:
        The first response whose status code is below 500. Redirects are
        not followed, so a 3xx response is returned as-is.

    Raises:
        AssertionError: If no such response arrives before the deadline.
            The message includes the last connection error or the last
            5xx status that was seen.
    """
    # A monotonic clock keeps the deadline immune to wall-clock adjustments.
    deadline = time.monotonic() + timeout
    last_problem = None
    while time.monotonic() < deadline:
        try:
            response = requests.get(
                f'http://127.0.0.1:{port}{path}',
                headers={'Host': host},
                timeout=2,
                allow_redirects=False,
            )
        except requests.RequestException as exc:
            last_problem = exc
        else:
            if response.status_code < 500:
                return response
            # Remember server-side failures too, so the timeout message
            # does not end in an unhelpful "None".
            last_problem = f'HTTP {response.status_code}'
        time.sleep(0.5)
    raise AssertionError(f'Timed out waiting for HTTP on {host}: {last_problem}')
|
||||
|
||||
|
||||
def make_latest_schedule_due(cwd: Path) -> None:
    """Backdate the template crawl of the newest schedule by two days.

    Operates directly on ``index.sqlite3`` in *cwd*: the crawl row referenced
    by the most recently created ``crawls_crawlschedule`` entry gets its
    ``created_at`` and ``modified_at`` set to two days in the past.
    """
    backdate_sql = """
        UPDATE crawls_crawl
        SET created_at = datetime('now', '-2 day'),
            modified_at = datetime('now', '-2 day')
        WHERE id = (
            SELECT template_id
            FROM crawls_crawlschedule
            ORDER BY created_at DESC
            LIMIT 1
        )
    """
    db = sqlite3.connect(cwd / 'index.sqlite3')
    try:
        db.execute(backdate_sql)
        db.commit()
    finally:
        # Always release the file handle, even if the UPDATE fails.
        db.close()
|
||||
|
||||
|
||||
def get_snapshot_file_text(cwd: Path, url: str) -> str:
    """Return the text of one captured file from the newest snapshot of *url*.

    A helper script is run through ``run_python_cwd`` in *cwd*. The script
    loads the most recent ``Snapshot`` for the URL, requires it to be in the
    ``sealed`` state, and prints the first matching captured file: preferred
    wget/trafilatura/defuddle outputs first, otherwise any ``.html``/``.htm``/
    ``.txt`` file outside ``responses/`` that is not a log or ``cmd.sh``.

    Raises:
        AssertionError: If the helper script exits non-zero (its stderr is
            used as the message).
    """
    # Doubled braces below are literal braces for the inner f-string that
    # runs inside the helper process; only {url!r} is filled in here.
    lookup_script = textwrap.dedent(
        f"""
        import os
        from pathlib import Path

        os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
        import django
        django.setup()

        from archivebox.core.models import Snapshot

        snapshot = Snapshot.objects.filter(url={url!r}).order_by('-created_at').first()
        assert snapshot is not None, 'missing snapshot'
        assert snapshot.status == 'sealed', snapshot.status

        snapshot_dir = Path(snapshot.output_dir)
        candidates = []
        preferred_patterns = (
            'wget/**/index.html',
            'wget/**/*.html',
            'trafilatura/content.html',
            'trafilatura/content.txt',
            'defuddle/content.html',
            'defuddle/content.txt',
        )
        for pattern in preferred_patterns:
            for candidate in snapshot_dir.glob(pattern):
                if candidate.is_file():
                    candidates.append(candidate)

        if not candidates:
            for candidate in snapshot_dir.rglob('*'):
                if not candidate.is_file():
                    continue
                rel = candidate.relative_to(snapshot_dir)
                if rel.parts and rel.parts[0] == 'responses':
                    continue
                if candidate.suffix not in ('.html', '.htm', '.txt'):
                    continue
                if candidate.name in ('stdout.log', 'stderr.log', 'cmd.sh'):
                    continue
                candidates.append(candidate)

        assert candidates, f'no captured html/txt files found in {{snapshot_dir}}'
        print(candidates[0].read_text(errors='ignore'))
        """
    )
    captured_out, captured_err, exit_code = run_python_cwd(lookup_script, cwd=cwd, timeout=60)
    assert exit_code == 0, captured_err
    return captured_out
|
||||
|
||||
|
||||
def wait_for_snapshot_capture(cwd: Path, url: str, timeout: int = 180) -> str:
    """Poll until captured content for *url* is available and return it.

    Repeatedly calls ``get_snapshot_file_text`` and retries every two
    seconds while it raises ``AssertionError``.

    Args:
        cwd: Collection directory passed through to the lookup helper.
        url: Snapshot URL to look up.
        timeout: Overall time budget in seconds.

    Returns:
        The text returned by the first successful lookup.

    Raises:
        AssertionError: If no lookup succeeds before the deadline; the
            message includes the last lookup failure.
    """
    # A monotonic clock keeps the deadline immune to wall-clock adjustments.
    deadline = time.monotonic() + timeout
    last_error = None
    while time.monotonic() < deadline:
        try:
            return get_snapshot_file_text(cwd, url)
        except AssertionError as err:
            last_error = err
            time.sleep(2)
    raise AssertionError(f'timed out waiting for captured content for {url}: {last_error}')
|
||||
|
||||
|
||||
def get_counts(cwd: Path, scheduled_url: str, one_shot_url: str) -> tuple[int, int, int]:
    """Count snapshots and scheduled crawls recorded in ``index.sqlite3``.

    Returns:
        A ``(scheduled_snapshots, one_shot_snapshots, scheduled_crawls)``
        tuple: snapshot rows whose URL equals *scheduled_url*, snapshot rows
        whose URL equals *one_shot_url*, and crawl rows that have a
        ``schedule_id`` and whose ``urls`` equals *scheduled_url*.
    """
    snapshot_count_sql = "SELECT COUNT(*) FROM core_snapshot WHERE url = ?"
    scheduled_crawl_sql = """
        SELECT COUNT(*)
        FROM crawls_crawl
        WHERE schedule_id IS NOT NULL
          AND urls = ?
    """
    db = sqlite3.connect(cwd / 'index.sqlite3')
    try:
        def count(sql: str, url: str) -> int:
            # Every query here is a single-parameter COUNT(*).
            return db.execute(sql, (url,)).fetchone()[0]

        return (
            count(snapshot_count_sql, scheduled_url),
            count(snapshot_count_sql, one_shot_url),
            count(scheduled_crawl_sql, scheduled_url),
        )
    finally:
        db.close()
|
||||
|
||||
|
||||
def create_admin_and_token(cwd: Path) -> str:
    """Create (or update) an admin user and return a fresh API token for it.

    A helper script is run through ``run_python_cwd`` in *cwd*. It ensures a
    staff/superuser account named ``apitestadmin`` exists with a known
    password, creates an ``APIToken`` expiring one day from now, and prints
    the token value.

    Returns:
        The last line of the helper's stdout, which is the token string.

    Raises:
        AssertionError: If the helper script exits non-zero (its stderr is
            used as the message).
    """
    provisioning_script = textwrap.dedent(
        """
        import os
        from datetime import timedelta
        from django.utils import timezone

        os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
        import django
        django.setup()

        from django.contrib.auth import get_user_model
        from archivebox.api.models import APIToken

        User = get_user_model()
        user, _ = User.objects.get_or_create(
            username='apitestadmin',
            defaults={
                'email': 'apitestadmin@example.com',
                'is_staff': True,
                'is_superuser': True,
            },
        )
        user.is_staff = True
        user.is_superuser = True
        user.set_password('testpass123')
        user.save()

        token = APIToken.objects.create(
            created_by=user,
            expires=timezone.now() + timedelta(days=1),
        )
        print(token.token)
        """
    )
    captured_out, captured_err, exit_code = run_python_cwd(provisioning_script, cwd=cwd, timeout=60)
    assert exit_code == 0, captured_err
    # Anything printed before the token is skipped by taking only the
    # final line of output.
    output_lines = captured_out.strip().splitlines()
    return output_lines[-1]
|
||||
|
||||
|
||||
@pytest.mark.timeout(180)
def test_server_processes_due_cli_schedule_and_saves_real_content(tmp_path, recursive_test_site):
    """A due schedule created via the CLI is run by the server and captures page content."""
    os.chdir(tmp_path)
    init_archive(tmp_path)

    port = get_free_port()
    env = build_test_env(port)
    root_url = recursive_test_site['root_url']

    schedule_cmd = [sys.executable, '-m', 'archivebox', 'schedule', '--every=daily', '--depth=0', root_url]
    schedule_result = subprocess.run(
        schedule_cmd,
        cwd=tmp_path,
        env=env,
        capture_output=True,
        text=True,
        timeout=60,
    )
    assert schedule_result.returncode == 0, schedule_result.stderr
    assert 'Created scheduled crawl' in schedule_result.stdout

    make_latest_schedule_due(tmp_path)

    try:
        start_server(tmp_path, env=env, port=port)
        wait_for_http(port, host=f'web.archivebox.localhost:{port}')
        # NOTE(review): this 180s wait equals the pytest timeout mark above,
        # so the mark will presumably fire first on a hang; confirm intended.
        captured_text = wait_for_snapshot_capture(tmp_path, root_url, timeout=180)
        assert 'Root' in captured_text
        assert 'About' in captured_text
    finally:
        # Always shut the daemonized server down, even on failure.
        stop_server(tmp_path)
|
||||
|
||||
|
||||
@pytest.mark.timeout(180)
def test_archivebox_add_remains_one_shot_even_when_schedule_is_due(tmp_path, recursive_test_site):
    """``archivebox add`` archives only its own URL and does not run a due schedule."""
    os.chdir(tmp_path)
    init_archive(tmp_path)

    port = get_free_port()
    env = build_test_env(port)
    scheduled_url = recursive_test_site['root_url']
    one_shot_url = recursive_test_site['child_urls'][0]

    def run_archivebox_cli(args, timeout):
        # Both CLI invocations share the same cwd/env/capture settings.
        return subprocess.run(
            [sys.executable, '-m', 'archivebox', *args],
            cwd=tmp_path,
            env=env,
            capture_output=True,
            text=True,
            timeout=timeout,
        )

    schedule_result = run_archivebox_cli(['schedule', '--every=daily', '--depth=0', scheduled_url], timeout=60)
    assert schedule_result.returncode == 0, schedule_result.stderr

    make_latest_schedule_due(tmp_path)

    add_result = run_archivebox_cli(['add', '--depth=0', '--plugins=wget', one_shot_url], timeout=120)
    assert add_result.returncode == 0, add_result.stderr

    captured_text = wait_for_snapshot_capture(tmp_path, one_shot_url, timeout=120)
    assert 'Deep About' in captured_text or 'About' in captured_text

    scheduled_snapshots, one_shot_snapshots, scheduled_crawls = get_counts(tmp_path, scheduled_url, one_shot_url)
    assert one_shot_snapshots >= 1
    assert scheduled_snapshots == 0
    assert scheduled_crawls == 1  # template only, no materialized scheduled run
|
||||
|
||||
|
||||
@pytest.mark.timeout(180)
def test_schedule_rest_api_works_over_running_server(tmp_path, recursive_test_site):
    """POSTing to ``/api/v1/cli/schedule`` on a live server creates exactly one schedule."""
    os.chdir(tmp_path)
    init_archive(tmp_path)

    port = get_free_port()
    env = build_test_env(port)
    api_token = create_admin_and_token(tmp_path)
    api_host = f'api.archivebox.localhost:{port}'

    try:
        start_server(tmp_path, env=env, port=port)
        wait_for_http(port, host=api_host, path='/api/v1/docs')

        request_headers = {
            'Host': api_host,
            'X-ArchiveBox-API-Key': api_token,
        }
        request_body = {
            'every': 'daily',
            'import_path': recursive_test_site['root_url'],
            'quiet': True,
        }
        response = requests.post(
            f'http://127.0.0.1:{port}/api/v1/cli/schedule',
            headers=request_headers,
            json=request_body,
            timeout=10,
        )

        assert response.status_code == 200, response.text
        payload = response.json()
        assert payload['success'] is True
        assert payload['result_format'] == 'json'
        assert len(payload['result']['created_schedule_ids']) == 1
    finally:
        # Always shut the daemonized server down, even on failure.
        stop_server(tmp_path)
|
||||
|
||||
|
||||
@pytest.mark.timeout(180)
def test_schedule_web_ui_post_works_over_running_server(tmp_path, recursive_test_site):
    """Submitting the web ``/add/`` form with a schedule stores a matching schedule and crawl."""
    os.chdir(tmp_path)
    init_archive(tmp_path)

    port = get_free_port()
    env = build_test_env(port, PUBLIC_ADD_VIEW='True')
    web_host = f'web.archivebox.localhost:{port}'
    root_url = recursive_test_site['root_url']

    latest_schedule_sql = """
        SELECT cs.schedule, c.urls, c.tags_str
        FROM crawls_crawlschedule cs
        JOIN crawls_crawl c ON c.schedule_id = cs.id
        ORDER BY cs.created_at DESC
        LIMIT 1
    """

    try:
        start_server(tmp_path, env=env, port=port)
        wait_for_http(port, host=web_host, path='/add/')

        form_fields = {
            'url': root_url,
            'depth': '0',
            'schedule': 'daily',
            'tag': 'web-ui',
            'notes': 'created from web ui',
        }
        response = requests.post(
            f'http://127.0.0.1:{port}/add/',
            headers={'Host': web_host},
            data=form_fields,
            timeout=10,
            allow_redirects=False,
        )

        # A successful form submission answers with a redirect.
        assert response.status_code in (302, 303), response.text

        db = sqlite3.connect(tmp_path / 'index.sqlite3')
        try:
            row = db.execute(latest_schedule_sql).fetchone()
        finally:
            db.close()

        assert row == ('daily', root_url, 'web-ui')
    finally:
        # Always shut the daemonized server down, even on failure.
        stop_server(tmp_path)
|
||||
@@ -3,12 +3,9 @@
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sqlite3
|
||||
import json
|
||||
|
||||
import pytest
|
||||
|
||||
from .fixtures import process, disable_extractors_dict
|
||||
|
||||
|
||||
def test_search_returns_snapshots(tmp_path, process, disable_extractors_dict):
|
||||
|
||||
@@ -6,13 +6,11 @@ import subprocess
|
||||
import sqlite3
|
||||
from archivebox.machine.models import Process
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from urllib.parse import urlparse
|
||||
import uuid
|
||||
|
||||
import pytest
|
||||
|
||||
from .fixtures import process, disable_extractors_dict
|
||||
|
||||
|
||||
def test_snapshot_creates_snapshot_with_correct_url(tmp_path, process, disable_extractors_dict):
|
||||
@@ -46,9 +44,7 @@ def test_snapshot_creates_snapshot_with_correct_url(tmp_path, process, disable_e
|
||||
|
||||
snapshot_id_raw, snapshot_created_at, snapshot_url, crawl_id = snapshot_row
|
||||
snapshot_id = str(uuid.UUID(snapshot_id_raw))
|
||||
crawl_id, crawl_created_at, crawl_urls, crawl_created_by_id = crawl_row
|
||||
username = user_row[0]
|
||||
crawl_date_str = datetime.fromisoformat(crawl_created_at).strftime('%Y%m%d')
|
||||
snapshot_date_str = datetime.fromisoformat(snapshot_created_at).strftime('%Y%m%d')
|
||||
domain = urlparse(snapshot_url).hostname or 'unknown'
|
||||
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user