This commit is contained in:
Nick Sweeting
2026-03-15 18:45:29 -07:00
parent f97725d16f
commit 934e02695b
111 changed files with 919 additions and 461 deletions

View File

@@ -2,7 +2,6 @@ __package__ = 'archivebox.api'
import secrets
from archivebox.uuid_compat import uuid7
from datetime import timedelta
from django.conf import settings
from django.db import models

View File

@@ -1,16 +1,17 @@
import os
import django
import importlib
from io import StringIO
from types import SimpleNamespace
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
django.setup()
from archivebox.config.django import setup_django
from django.contrib.auth.models import User
from django.test import TestCase
setup_django()
from archivebox.api.v1_cli import ScheduleCommandSchema, cli_schedule
from archivebox.crawls.models import CrawlSchedule
User = importlib.import_module('django.contrib.auth.models').User
TestCase = importlib.import_module('django.test').TestCase
api_v1_cli = importlib.import_module('archivebox.api.v1_cli')
ScheduleCommandSchema = api_v1_cli.ScheduleCommandSchema
cli_schedule = api_v1_cli.cli_schedule
CrawlSchedule = importlib.import_module('archivebox.crawls.models').CrawlSchedule
class CLIScheduleAPITests(TestCase):

View File

@@ -3,10 +3,7 @@ __package__ = 'archivebox.api'
from typing import Optional
from ninja import Router, Schema
from django.utils import timezone
from datetime import timedelta
from archivebox.api.models import APIToken
from archivebox.api.auth import auth_using_token, auth_using_password, get_or_create_api_token

View File

@@ -5,7 +5,6 @@ from typing import List, Optional
from datetime import datetime
from django.utils import timezone
from django.db.models import Q
from django.contrib.auth import get_user_model
from ninja import Router, Schema

View File

@@ -6,7 +6,7 @@ import json
from django import forms
from django.contrib import admin
from django.utils.html import format_html, mark_safe
from django.utils.html import mark_safe
from django_object_actions import DjangoObjectActions

View File

@@ -2,12 +2,9 @@
__package__ = 'archivebox.base_models'
from uuid import UUID
from archivebox.uuid_compat import uuid7
from typing import ClassVar
from pathlib import Path
from django.contrib import admin
from django.db import models
from django.db.models import F
from django.utils import timezone
@@ -17,8 +14,6 @@ from django.conf import settings
from django_stubs_ext.db.models import TypedModelMeta
from archivebox import DATA_DIR
from archivebox.misc.hashing import get_dir_info
def get_or_create_system_user_pk(username='system'):

View File

@@ -57,6 +57,7 @@ def add(urls: str | list[str],
from archivebox.core.models import Snapshot
from archivebox.crawls.models import Crawl
from archivebox.base_models.models import get_or_create_system_user_pk
from archivebox.personas.models import Persona
from archivebox.workers.orchestrator import Orchestrator
from archivebox.misc.logging_util import printable_filesize
from archivebox.misc.system import get_dir_size
@@ -79,11 +80,15 @@ def add(urls: str | list[str],
# Read URLs directly into crawl
urls_content = sources_file.read_text()
persona_name = (persona or 'Default').strip() or 'Default'
persona_obj, _ = Persona.objects.get_or_create(name=persona_name)
persona_obj.ensure_dirs()
crawl = Crawl.objects.create(
urls=urls_content,
max_depth=depth,
tags_str=tag,
persona_id=persona_obj.id,
label=f'{USER}@{HOSTNAME} $ {cmd_str} [{timestamp}]',
created_by_id=created_by_id,
config={
@@ -91,7 +96,7 @@ def add(urls: str | list[str],
'INDEX_ONLY': index_only,
'OVERWRITE': overwrite,
'PLUGINS': plugins,
'DEFAULT_PERSONA': persona or 'Default',
'DEFAULT_PERSONA': persona_name,
'PARSER': parser,
}
)
@@ -135,8 +140,7 @@ def add(urls: str | list[str],
print('[yellow]\\[*] URLs queued. Orchestrator will process them (run `archivebox server` if not already running).[/yellow]')
else:
# Foreground mode: run full orchestrator until all work is done
print(f'[green]\\[*] Starting orchestrator to process crawl...[/green]')
from archivebox.workers.orchestrator import Orchestrator
print('[green]\\[*] Starting orchestrator to process crawl...[/green]')
orchestrator = Orchestrator(exit_on_idle=True, crawl_id=str(crawl.id))
orchestrator.runloop() # Block until complete

View File

@@ -94,7 +94,7 @@ def config(*keys,
# Display all plugin config in single [PLUGINS] section
if plugin_keys:
print(f'[grey53]\\[PLUGINS][/grey53]')
print('[grey53]\\[PLUGINS][/grey53]')
print(benedict(plugin_keys).to_toml(encoder=CustomTOMLEncoder()).strip().replace('\n\n', '\n'))
print('[grey53]################################################################[/grey53]')

View File

@@ -31,7 +31,6 @@ __package__ = 'archivebox.cli'
__command__ = 'archivebox extract'
import sys
from typing import Optional, List
import rich_click as click

View File

@@ -3,8 +3,6 @@
__package__ = 'archivebox.cli'
import os
import sys
import shutil
import rich_click as click
from rich import print

View File

@@ -410,7 +410,6 @@ def create_personas(
"""
from archivebox.misc.jsonl import write_record
from archivebox.personas.models import Persona
from archivebox.config.constants import CONSTANTS
is_tty = sys.stdout.isatty()
name_list = list(names) if names else []
@@ -493,10 +492,10 @@ def create_personas(
'SingletonLock', 'SingletonSocket', 'SingletonCookie',
),
)
rprint(f'[green]Copied browser profile to persona[/green]', file=sys.stderr)
rprint('[green]Copied browser profile to persona[/green]', file=sys.stderr)
# Extract cookies via CDP
rprint(f'[dim]Extracting cookies via CDP...[/dim]', file=sys.stderr)
rprint('[dim]Extracting cookies via CDP...[/dim]', file=sys.stderr)
if extract_cookies_via_cdp(
persona_chrome_dir,
@@ -506,8 +505,8 @@ def create_personas(
):
rprint(f'[green]Extracted cookies to {cookies_file}[/green]', file=sys.stderr)
else:
rprint(f'[yellow]Could not extract cookies automatically.[/yellow]', file=sys.stderr)
rprint(f'[dim]You can manually export cookies using a browser extension.[/dim]', file=sys.stderr)
rprint('[yellow]Could not extract cookies automatically.[/yellow]', file=sys.stderr)
rprint('[dim]You can manually export cookies using a browser extension.[/dim]', file=sys.stderr)
except Exception as e:
rprint(f'[red]Failed to copy browser profile: {e}[/red]', file=sys.stderr)

View File

@@ -3,7 +3,6 @@
__package__ = 'archivebox.cli'
from typing import Optional
from pathlib import Path
import rich_click as click

View File

@@ -4,7 +4,7 @@ __package__ = 'archivebox.cli'
__command__ = 'archivebox search'
from pathlib import Path
from typing import Optional, List, Any
from typing import Optional, List
import rich_click as click
from rich import print
@@ -71,7 +71,6 @@ def search(filter_patterns: list[str] | None=None,
csv: str | None=None,
with_headers: bool=False):
"""List, filter, and export information about archive entries"""
from archivebox.core.models import Snapshot
if with_headers and not (json or html or csv):
stderr('[X] --with-headers requires --json, --html or --csv\n', color='red')

View File

@@ -99,7 +99,6 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
get_existing_supervisord_process,
get_worker,
start_server_workers,
tail_multiple_worker_logs,
is_port_in_use,
)
from archivebox.workers.orchestrator import Orchestrator
@@ -108,14 +107,14 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
if is_port_in_use(host, int(port)):
print(f'[red][X] Error: Port {port} is already in use[/red]')
print(f' Another process (possibly daphne) is already listening on {host}:{port}')
print(f' Stop the conflicting process or choose a different port')
print(' Stop the conflicting process or choose a different port')
sys.exit(1)
# Check if orchestrator is already running for this data directory
if Orchestrator.is_running():
print(f'[red][X] Error: ArchiveBox orchestrator is already running for this data directory[/red]')
print(f' Stop the existing orchestrator before starting a new server')
print(f' To stop: pkill -f "archivebox manage orchestrator"')
print('[red][X] Error: ArchiveBox orchestrator is already running for this data directory[/red]')
print(' Stop the existing orchestrator before starting a new server')
print(' To stop: pkill -f "archivebox manage orchestrator"')
sys.exit(1)
# Check if supervisord is already running
@@ -129,7 +128,7 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
print('[red][X] Error: ArchiveBox server is already running[/red]')
print(f' [green]√[/green] Web server (worker_daphne) is RUNNING on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
if orchestrator_proc and orchestrator_proc.get('statename') == 'RUNNING':
print(f' [green]√[/green] Background worker (worker_orchestrator) is RUNNING')
print(' [green]√[/green] Background worker (worker_orchestrator) is RUNNING')
print()
print('[yellow]To stop the existing server, run:[/yellow]')
print(' pkill -f "archivebox server"')

View File

@@ -128,13 +128,13 @@ def status(out_dir: Path=DATA_DIR) -> None:
if not snapshot.downloaded_at:
continue
print(
'[grey53] ' +
(
'[grey53] '
f' > {str(snapshot.downloaded_at)[:16]} '
f'[{snapshot.num_outputs} {("X", "")[snapshot.is_archived]} {printable_filesize(snapshot.archive_size)}] '
f'"{snapshot.title}": {snapshot.url}'
)[:SHELL_CONFIG.TERM_WIDTH]
+ '[grey53]',
'[/grey53]'
)[:SHELL_CONFIG.TERM_WIDTH],
)
print('[grey53] ...')

View File

@@ -36,8 +36,6 @@ def update(filter_patterns: Iterable[str] = (),
from archivebox.config.django import setup_django
setup_django()
from archivebox.core.models import Snapshot
from django.utils import timezone
from django.core.management import call_command
# Run migrations first to ensure DB schema is up-to-date

View File

@@ -6,7 +6,7 @@ import sys
import os
import platform
from pathlib import Path
from typing import Iterable, Optional
from typing import Iterable
import rich_click as click

View File

@@ -3,13 +3,13 @@
__package__ = 'archivebox.cli'
import importlib
import os
import sys
import shutil
import sys
import unittest
from pathlib import Path
from contextlib import contextmanager
from pathlib import Path
TEST_CONFIG = {
'USE_COLOR': 'False',
@@ -30,18 +30,15 @@ TEST_CONFIG = {
DATA_DIR = 'data.tests'
os.environ.update(TEST_CONFIG)
from ..main import init
from archivebox.config.constants import (
SQL_INDEX_FILENAME,
JSON_INDEX_FILENAME,
HTML_INDEX_FILENAME,
)
from . import (
archivebox_init,
archivebox_add,
archivebox_remove,
)
init = importlib.import_module('archivebox.main').init
constants = importlib.import_module('archivebox.config.constants')
SQL_INDEX_FILENAME = constants.SQL_INDEX_FILENAME
JSON_INDEX_FILENAME = constants.JSON_INDEX_FILENAME
HTML_INDEX_FILENAME = constants.HTML_INDEX_FILENAME
archivebox_init = importlib.import_module('archivebox.cli.archivebox_init')
archivebox_add = importlib.import_module('archivebox.cli.archivebox_add')
archivebox_remove = importlib.import_module('archivebox.cli.archivebox_remove')
parse_json_main_index = importlib.import_module('archivebox.misc.legacy').parse_json_main_index
HIDE_CLI_OUTPUT = True
@@ -68,6 +65,13 @@ stdout = sys.stdout
stderr = sys.stderr
def load_main_index(*, out_dir: str):
index_path = Path(out_dir) / JSON_INDEX_FILENAME
if not index_path.exists():
raise FileNotFoundError(index_path)
return list(parse_json_main_index(Path(out_dir)))
@contextmanager
def output_hidden(show_failing=True):
if not HIDE_CLI_OUTPUT:

View File

@@ -23,7 +23,6 @@ Each command should:
__package__ = 'archivebox.cli'
import os
import sys
import json
import shutil
import tempfile
@@ -101,7 +100,7 @@ class TestJSONLParsing(unittest.TestCase):
def test_parse_jsonl_with_id(self):
"""JSONL with id field should be recognized."""
from archivebox.misc.jsonl import parse_line, TYPE_SNAPSHOT
from archivebox.misc.jsonl import parse_line
line = '{"type": "Snapshot", "id": "abc123", "url": "https://example.com"}'
result = parse_line(line)
@@ -576,8 +575,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
"""
from archivebox.core.models import Snapshot
from archivebox.misc.jsonl import (
read_args_or_stdin, write_record,
TYPE_SNAPSHOT
read_args_or_stdin, TYPE_SNAPSHOT
)
from archivebox.base_models.models import get_or_create_system_user_pk
@@ -608,7 +606,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
Test: archivebox snapshot URL | archivebox extract
Extract should accept JSONL output from snapshot command.
"""
from archivebox.core.models import Snapshot, ArchiveResult
from archivebox.core.models import Snapshot
from archivebox.misc.jsonl import (
read_args_or_stdin,
TYPE_SNAPSHOT
@@ -783,7 +781,6 @@ class TestParserPluginWorkflows(unittest.TestCase):
Test: archivebox crawl --plugin=parse_html_urls URL | archivebox snapshot | archivebox extract
"""
from archivebox.hooks import collect_urls_from_plugins
from archivebox.misc.jsonl import TYPE_SNAPSHOT
# Create mock output directory
snapshot_dir = Path(self.test_dir) / 'archive' / 'html-parser-test'
@@ -938,7 +935,6 @@ class TestPassThroughBehavior(unittest.TestCase):
def test_crawl_passes_through_other_types(self):
"""crawl create should pass through records with other types."""
from archivebox.misc.jsonl import TYPE_CRAWL
# Input: a Tag record (not a Crawl or URL)
tag_record = {'type': 'Tag', 'id': 'test-tag', 'name': 'example'}
@@ -946,8 +942,9 @@ class TestPassThroughBehavior(unittest.TestCase):
# Mock stdin with both records
stdin = StringIO(
json.dumps(tag_record) + '\n' +
json.dumps(url_record)
json.dumps(tag_record)
+ '\n'
+ json.dumps(url_record)
)
stdin.isatty = lambda: False
@@ -964,7 +961,7 @@ class TestPassThroughBehavior(unittest.TestCase):
def test_snapshot_passes_through_crawl(self):
"""snapshot create should pass through Crawl records."""
from archivebox.misc.jsonl import TYPE_CRAWL, TYPE_SNAPSHOT
from archivebox.misc.jsonl import TYPE_CRAWL
crawl_record = {
'type': TYPE_CRAWL,

View File

@@ -8,10 +8,6 @@ and other modules that expect to import config values directly.
__package__ = 'archivebox.config'
__order__ = 200
import shutil
from pathlib import Path
from typing import Dict, List, Optional
from .paths import (
PACKAGE_DIR, # noqa
DATA_DIR, # noqa
@@ -31,6 +27,7 @@ def _get_config():
from .common import ARCHIVING_CONFIG, STORAGE_CONFIG
return ARCHIVING_CONFIG, STORAGE_CONFIG
# Direct exports (evaluated at import time for backwards compat)
# These are recalculated each time the module attribute is accessed

View File

@@ -9,7 +9,6 @@ from configparser import ConfigParser
from benedict import benedict
import archivebox
from archivebox.config.constants import CONSTANTS

View File

@@ -11,10 +11,10 @@ __package__ = "archivebox.config"
import os
import json
from pathlib import Path
from typing import Any, Dict, Optional, List, Type, Tuple, TYPE_CHECKING, cast
from typing import Any, Dict, Optional, Type, Tuple
from configparser import ConfigParser
from pydantic import Field, ConfigDict
from pydantic import ConfigDict
from pydantic_settings import BaseSettings, PydanticBaseSettingsSource
@@ -166,6 +166,23 @@ def get_config(
if user is None and crawl and hasattr(crawl, "created_by"):
user = crawl.created_by
if persona is None and crawl is not None:
try:
from archivebox.personas.models import Persona
persona_id = getattr(crawl, "persona_id", None)
if persona_id:
persona = Persona.objects.filter(id=persona_id).first()
if persona is None:
crawl_config = getattr(crawl, "config", None) or {}
default_persona_name = crawl_config.get("DEFAULT_PERSONA")
if default_persona_name:
persona, _ = Persona.objects.get_or_create(name=str(default_persona_name).strip() or "Default")
persona.ensure_dirs()
except Exception:
pass
from archivebox.config.constants import CONSTANTS
from archivebox.config.common import (
SHELL_CONFIG,

View File

@@ -100,9 +100,11 @@ def setup_django(check_db=False, in_memory_db=False) -> None:
return
from django.conf import settings
from archivebox.core.settings_logging import ERROR_LOG as DEFAULT_ERROR_LOG
# log startup message to the error log
with open(settings.ERROR_LOG, "a", encoding='utf-8') as f:
error_log = getattr(settings, 'ERROR_LOG', DEFAULT_ERROR_LOG)
with open(error_log, "a", encoding='utf-8') as f:
command = ' '.join(sys.argv)
ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S')
f.write(f"\n> {command}; TS={ts} VERSION={CONSTANTS.VERSION} IN_DOCKER={SHELL_CONFIG.IN_DOCKER} IS_TTY={SHELL_CONFIG.IS_TTY}\n")

View File

@@ -46,7 +46,6 @@ if RUNNING_AS_UID == 0:
# if we are running as root it's really hard to figure out what the correct archivebox user should be
# as a last resort instead of setting DATA_DIR ownership to 0:0 (which breaks it for non-root users)
# check if 911:911 archivebox user exists on host system, and use it instead of 0
import pwd
if pwd.getpwuid(DEFAULT_PUID).pw_name == 'archivebox':
FALLBACK_UID = DEFAULT_PUID
FALLBACK_GID = DEFAULT_PGID

View File

@@ -3,7 +3,6 @@ __package__ = 'archivebox.config'
import os
import shutil
import inspect
from pathlib import Path
from typing import Any, List, Dict, cast
from benedict import benedict
@@ -30,11 +29,11 @@ KNOWN_BINARIES = [
]
def obj_to_yaml(obj: Any, indent: int=0) -> str:
def obj_to_yaml(obj: Any, indent: int = 0) -> str:
indent_str = " " * indent
if indent == 0:
indent_str = '\n' # put extra newline between top-level entries
if isinstance(obj, dict):
if not obj:
return "{}"
@@ -42,7 +41,7 @@ def obj_to_yaml(obj: Any, indent: int=0) -> str:
for key, value in obj.items():
result += f"{indent_str}{key}:{obj_to_yaml(value, indent + 1)}\n"
return result
elif isinstance(obj, list):
if not obj:
return "[]"
@@ -50,16 +49,16 @@ def obj_to_yaml(obj: Any, indent: int=0) -> str:
for item in obj:
result += f"{indent_str}- {obj_to_yaml(item, indent + 1).lstrip()}\n"
return result.rstrip()
elif isinstance(obj, str):
if "\n" in obj:
return f" |\n{indent_str} " + obj.replace("\n", f"\n{indent_str} ")
else:
return f" {obj}"
elif isinstance(obj, (int, float, bool)):
return f" {str(obj)}"
elif callable(obj):
source = '\n'.join(
'' if 'def ' in line else line
@@ -67,7 +66,7 @@ def obj_to_yaml(obj: Any, indent: int=0) -> str:
if line.strip()
).split('lambda: ')[-1].rstrip(',')
return f" {indent_str} " + source.replace("\n", f"\n{indent_str} ")
else:
return f" {str(obj)}"
@@ -75,7 +74,7 @@ def obj_to_yaml(obj: Any, indent: int=0) -> str:
def get_detected_binaries() -> Dict[str, Dict[str, Any]]:
"""Detect available binaries using shutil.which."""
binaries = {}
for name in KNOWN_BINARIES:
path = shutil.which(name)
if path:
@@ -85,7 +84,7 @@ def get_detected_binaries() -> Dict[str, Dict[str, Any]]:
'version': None, # Could add version detection later
'is_available': True,
}
return binaries
@@ -144,19 +143,19 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
# Get binaries from database (previously detected/installed)
db_binaries = {b.name: b for b in Binary.objects.all()}
# Get currently detectable binaries
# Get currently detectable binaries
detected = get_detected_binaries()
# Merge and display
all_binary_names = sorted(set(list(db_binaries.keys()) + list(detected.keys())))
for name in all_binary_names:
db_binary = db_binaries.get(name)
detected_binary = detected.get(name)
rows['Binary Name'].append(ItemLink(name, key=name))
if db_binary:
rows['Found Version'].append(f'{db_binary.version}' if db_binary.version else '✅ found')
rows['Provided By'].append(db_binary.binprovider or 'PATH')
@@ -175,6 +174,7 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
table=rows,
)
@render_with_item_view
def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
@@ -203,7 +203,7 @@ def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
)
except Binary.DoesNotExist:
pass
# Try to detect from PATH
path = shutil.which(key)
if path:
@@ -224,7 +224,7 @@ def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
},
],
)
return ItemContext(
slug=key,
title=key,
@@ -286,6 +286,7 @@ def plugins_list_view(request: HttpRequest, **kwargs) -> TableContext:
table=rows,
)
@render_with_item_view
def plugin_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
import json
@@ -314,7 +315,10 @@ def plugin_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
# Add config.json data if available
if plugin.get('config'):
config_json = json.dumps(plugin['config'], indent=2)
fields["config.json"] = mark_safe(f'<pre style="max-height: 600px; overflow-y: auto; background: #f5f5f5; padding: 10px; border-radius: 4px;"><code>{config_json}</code></pre>')
fields["config.json"] = mark_safe(
'<pre style="max-height: 600px; overflow-y: auto; background: #f5f5f5; '
f'padding: 10px; border-radius: 4px;"><code>{config_json}</code></pre>'
)
# Also extract and display individual config properties for easier viewing
if 'properties' in plugin['config']:
@@ -322,7 +326,6 @@ def plugin_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
properties_summary = []
for prop_name, prop_info in config_properties.items():
prop_type = prop_info.get('type', 'unknown')
prop_default = prop_info.get('default', 'N/A')
prop_desc = prop_info.get('description', '')
properties_summary.append(f"{prop_name} ({prop_type}): {prop_desc}")
@@ -365,7 +368,7 @@ def worker_list_view(request: HttpRequest, **kwargs) -> TableContext:
title="No running worker processes",
table=rows,
)
all_config_entries = cast(List[Dict[str, Any]], supervisor.getAllConfigInfo() or [])
all_config = {config["name"]: benedict(config) for config in all_config_entries}
@@ -514,7 +517,7 @@ def log_list_view(request: HttpRequest, **kwargs) -> TableContext:
@render_with_item_view
def log_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
assert request.user.is_superuser, "Must be a superuser to view configuration settings."
log_file = [logfile for logfile in CONSTANTS.LOGS_DIR.glob('*.log') if key in logfile.name][0]
log_text = log_file.read_text()

View File

@@ -1,8 +1,8 @@
__package__ = 'archivebox.core'
from django.contrib import admin
from admin_data_views.admin import get_app_list, admin_data_index_view, get_admin_data_urls, get_urls
import archivebox
class ArchiveBoxAdmin(admin.AdminSite):
site_header = 'ArchiveBox'
@@ -20,7 +20,6 @@ archivebox_admin = ArchiveBoxAdmin()
# patch admin with methods to add data views (implemented by admin_data_views package)
# https://github.com/MrThearMan/django-admin-data-views
# https://mrthearman.github.io/django-admin-data-views/setup/
from admin_data_views.admin import get_app_list, admin_data_index_view, get_admin_data_urls, get_urls
archivebox_admin.get_app_list = get_app_list.__get__(archivebox_admin, ArchiveBoxAdmin)
archivebox_admin.admin_data_index_view = admin_data_index_view.__get__(archivebox_admin, ArchiveBoxAdmin) # type: ignore
archivebox_admin.get_admin_data_urls = get_admin_data_urls.__get__(archivebox_admin, ArchiveBoxAdmin) # type: ignore

View File

@@ -26,7 +26,7 @@ from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin
from archivebox.workers.tasks import bg_archive_snapshots, bg_add
from archivebox.core.models import Tag, Snapshot, ArchiveResult
from archivebox.core.admin_archiveresults import ArchiveResultInline, render_archiveresults_list
from archivebox.core.admin_archiveresults import render_archiveresults_list
from archivebox.core.widgets import TagEditorWidget, InlineTagEditorWidget
@@ -712,8 +712,6 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
description="🔁 Redo Failed"
)
def update_snapshots(self, request, queryset):
count = queryset.count()
queued = bg_archive_snapshots(queryset, kwargs={"overwrite": False, "out_dir": DATA_DIR})
messages.success(
@@ -741,8 +739,6 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
description="🔄 Redo"
)
def overwrite_snapshots(self, request, queryset):
count = queryset.count()
queued = bg_archive_snapshots(queryset, kwargs={"overwrite": True, "out_dir": DATA_DIR})
messages.success(

View File

@@ -60,7 +60,7 @@ class CoreConfig(AppConfig):
from archivebox.workers.orchestrator import Orchestrator
Process.cleanup_stale_running()
machine = Machine.current()
Machine.current()
if not Orchestrator.is_running():
Orchestrator(exit_on_idle=False).start()

View File

@@ -8,11 +8,10 @@ https://docs.djangoproject.com/en/stable/howto/deployment/asgi/
"""
from archivebox.config.django import setup_django
from django.core.asgi import get_asgi_application
setup_django(in_memory_db=False, check_db=True)
from django.core.asgi import get_asgi_application
# Standard Django ASGI application (no websockets/channels needed)
application = get_asgi_application()

View File

@@ -6,6 +6,7 @@ from archivebox.misc.util import URL_REGEX
from taggit.utils import edit_string_for_tags, parse_tags
from archivebox.base_models.admin import KeyValueWidget
from archivebox.crawls.schedule_utils import validate_schedule
from archivebox.hooks import get_plugins
DEPTH_CHOICES = (
('0', 'depth = 0 (archive just these URLs)'),
@@ -15,7 +16,6 @@ DEPTH_CHOICES = (
('4', 'depth = 4 (+ URLs four hops away)'),
)
from archivebox.hooks import get_plugins
def get_plugin_choices():
"""Get available extractor plugins from discovered hooks."""
@@ -210,15 +210,18 @@ class AddLinkForm(forms.Form):
return schedule
class TagWidgetMixin:
def format_value(self, value):
if value is not None and not isinstance(value, str):
value = edit_string_for_tags(value)
return super().format_value(value)
class TagWidget(TagWidgetMixin, forms.TextInput):
pass
class TagField(forms.CharField):
widget = TagWidget

View File

@@ -17,7 +17,6 @@ from archivebox.config import VERSION
from archivebox.config.version import get_COMMIT_HASH
from archivebox.core.host_utils import (
build_admin_url,
build_api_url,
build_web_url,
get_api_host,
get_admin_host,

View File

@@ -7,10 +7,8 @@ def forwards_func(apps, schema_editor):
SnapshotModel = apps.get_model("core", "Snapshot")
TagModel = apps.get_model("core", "Tag")
db_alias = schema_editor.connection.alias
snapshots = SnapshotModel.objects.all()
for snapshot in snapshots:
tags = snapshot.tags
tag_set = (
set(tag.strip() for tag in (snapshot.tags_old or '').split(','))
)
@@ -23,9 +21,7 @@ def forwards_func(apps, schema_editor):
def reverse_func(apps, schema_editor):
SnapshotModel = apps.get_model("core", "Snapshot")
TagModel = apps.get_model("core", "Tag")
db_alias = schema_editor.connection.alias
snapshots = SnapshotModel.objects.all()
for snapshot in snapshots:
tags = snapshot.tags.values_list("name", flat=True)

View File

@@ -43,7 +43,7 @@ def forwards_func(apps, schema_editor):
try:
with open(out_dir / "index.json", "r") as f:
fs_index = json.load(f)
except Exception as e:
except Exception:
continue
history = fs_index["history"]

View File

@@ -234,7 +234,6 @@ def upgrade_core_tables(apps, schema_editor):
tag_has_data = cursor.fetchone()[0] > 0
if tag_has_data:
tag_cols = get_table_columns('core_tag')
cursor.execute("PRAGMA table_info(core_tag)")
tag_id_type = None
for row in cursor.fetchall():

View File

@@ -2,7 +2,6 @@
# Creates a default crawl for v0.7.2 migrated snapshots and makes crawl_id NOT NULL
from django.db import migrations, models
import uuid
def create_default_crawl_and_assign_snapshots(apps, schema_editor):

View File

@@ -347,7 +347,7 @@ def copy_archiveresult_data_to_process(apps, schema_editor):
migrated_count += 1
if i == 0:
print(f'DEBUG 0027: Linked ArchiveResult to Process')
print('DEBUG 0027: Linked ArchiveResult to Process')
except Exception as e:
print(f'✗ Error migrating ArchiveResult {ar_id}: {e}')

View File

@@ -1,6 +1,6 @@
__package__ = 'archivebox.core'
from typing import Optional, Dict, Iterable, Any, List, TYPE_CHECKING
from typing import Optional, Dict, Iterable, Any, List
from archivebox.uuid_compat import uuid7
from datetime import datetime, timedelta
from django_stubs_ext.db.models import TypedModelMeta
@@ -12,19 +12,18 @@ from pathlib import Path
from statemachine import State, registry
from django.db import models
from django.db.models import QuerySet, Value, Case, When, IntegerField
from django.db.models import QuerySet
from django.utils.functional import cached_property
from django.utils.text import slugify
from django.utils import timezone
from django.core.cache import cache
from django.urls import reverse, reverse_lazy
from django.urls import reverse_lazy
from django.contrib import admin
from django.conf import settings
from archivebox.config import CONSTANTS
from archivebox.misc.system import get_dir_size, atomic_write
from archivebox.misc.util import parse_date, base_url, domain as url_domain, to_json, ts_to_date_str, urlencode, htmlencode, urldecode
from archivebox.misc.hashing import get_dir_info
from archivebox.misc.util import parse_date, domain as url_domain, to_json, ts_to_date_str, urlencode, htmlencode, urldecode
from archivebox.hooks import (
get_plugins, get_plugin_name, get_plugin_icon,
)
@@ -186,7 +185,7 @@ class SnapshotQuerySet(models.QuerySet):
for pattern in patterns:
try:
qsearch |= query_search_index(pattern)
except:
except BaseException:
raise SystemExit(2)
return self.all() & qsearch
@@ -344,8 +343,6 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
@property
def process_set(self):
"""Get all Process objects related to this snapshot's ArchiveResults."""
import json
import json
from archivebox.machine.models import Process
return Process.objects.filter(archiveresult__snapshot_id=self.id)
@@ -458,13 +455,13 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
if not old_dir.exists() or old_dir == new_dir:
# No migration needed
print(f"[DEBUG _fs_migrate] Returning None (early return)")
print("[DEBUG _fs_migrate] Returning None (early return)")
return None
if new_dir.exists():
# New directory already exists (files already copied), but we still need cleanup
# Return cleanup info so old directory can be cleaned up
print(f"[DEBUG _fs_migrate] Returning cleanup info (new_dir exists)")
print("[DEBUG _fs_migrate] Returning cleanup info (new_dir exists)")
return (old_dir, new_dir)
new_dir.mkdir(parents=True, exist_ok=True)
@@ -499,7 +496,6 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
# Schedule cleanup AFTER transaction commits successfully
# This ensures DB changes are committed before we delete old files
from django.db import transaction
transaction.on_commit(lambda: self._cleanup_old_migration_dir(old_dir, new_dir))
# Return cleanup info for manual cleanup if needed (when called directly)
@@ -594,8 +590,8 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
domain = self.extract_domain_from_url(self.url)
return (
CONSTANTS.DATA_DIR / 'users' / username / 'snapshots' /
date_str / domain / str(self.id)
CONSTANTS.DATA_DIR / 'users' / username / 'snapshots'
/ date_str / domain / str(self.id)
)
else:
# Unknown version - use current
@@ -670,7 +666,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
print(f"[DEBUG load_from_directory] Found via fuzzy match: {snapshot.timestamp}")
return snapshot
elif candidates.count() > 1:
print(f"[DEBUG load_from_directory] Multiple fuzzy matches, using first")
print("[DEBUG load_from_directory] Multiple fuzzy matches, using first")
return candidates.first()
print(f"[DEBUG load_from_directory] NOT FOUND (fuzzy): {url} @ {timestamp}")
return None
@@ -767,7 +763,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
ts_int = int(float(ts))
# 1995-01-01 to 2035-12-31
return 788918400 <= ts_int <= 2082758400
except:
except (TypeError, ValueError, OverflowError):
return False
index_valid = is_valid_timestamp(index_timestamp) if index_timestamp else False
@@ -850,7 +846,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
try:
with open(json_path) as f:
index_data = json.load(f)
except:
except (OSError, TypeError, ValueError, json.JSONDecodeError):
pass
# Merge title
@@ -929,7 +925,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
if result_data.get('start_ts'):
try:
start_ts = parser.parse(result_data['start_ts'])
except:
except (TypeError, ValueError, OverflowError):
pass
if (plugin, start_ts) in existing:
@@ -940,7 +936,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
if result_data.get('end_ts'):
try:
end_ts = parser.parse(result_data['end_ts'])
except:
except (TypeError, ValueError, OverflowError):
pass
# Support both 'output' (legacy) and 'output_str' (new JSONL) field names
@@ -957,7 +953,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
start_ts=start_ts,
end_ts=end_ts,
)
except:
except Exception:
pass
def write_index_json(self):
@@ -1176,7 +1172,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
try:
shutil.move(str(snapshot_dir), str(dest))
except:
except Exception:
pass
@classmethod
@@ -1208,7 +1204,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
try:
cls._merge_snapshots(snapshots)
merged += 1
except:
except Exception:
pass
return merged
@@ -1244,7 +1240,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
try:
shutil.rmtree(dup_dir)
except:
except Exception:
pass
# Merge tags
@@ -1615,7 +1611,6 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
"""
import re
from django.utils import timezone
from archivebox.misc.util import parse_date
from archivebox.base_models.models import get_or_create_system_user_pk
from archivebox.config.common import GENERAL_CONFIG
@@ -2125,7 +2120,6 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
def to_dict(self, extended: bool = False) -> Dict[str, Any]:
"""Convert Snapshot to a dictionary (replacement for Link._asdict())"""
from archivebox.misc.util import ts_to_date_str
from archivebox.core.host_utils import build_snapshot_url
result = {
@@ -2283,9 +2277,9 @@ class SnapshotMachine(BaseStateMachine):
# Tick Event (polled by workers)
tick = (
queued.to.itself(unless='can_start') |
queued.to(started, cond='can_start') |
started.to(sealed, cond='is_finished')
queued.to.itself(unless='can_start')
| queued.to(started, cond='can_start')
| started.to(sealed, cond='is_finished')
)
# Manual event (can also be triggered by last ArchiveResult finishing)
@@ -2783,7 +2777,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
Updates status/output fields, queues discovered URLs, and triggers indexing.
"""
from django.utils import timezone
from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR, run_hook, is_background_hook
from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR, run_hook
from archivebox.config.configset import get_config
# Get merged config with proper context
@@ -3190,16 +3184,16 @@ class ArchiveResultMachine(BaseStateMachine):
# queued → skipped (if exceeded max attempts)
# started → backoff → started (retry)
tick = (
queued.to(skipped, cond='is_exceeded_max_attempts') | # Check skip first
queued.to.itself(unless='can_start') |
queued.to(started, cond='can_start') |
started.to(succeeded, cond='is_succeeded') |
started.to(failed, cond='is_failed') |
started.to(skipped, cond='is_skipped') |
started.to(backoff, cond='is_backoff') |
backoff.to(skipped, cond='is_exceeded_max_attempts') | # Check skip from backoff too
backoff.to.itself(unless='can_start') |
backoff.to(started, cond='can_start')
queued.to(skipped, cond='is_exceeded_max_attempts') # Check skip first
| queued.to.itself(unless='can_start')
| queued.to(started, cond='can_start')
| started.to(succeeded, cond='is_succeeded')
| started.to(failed, cond='is_failed')
| started.to(skipped, cond='is_skipped')
| started.to(backoff, cond='is_backoff')
| backoff.to(skipped, cond='is_exceeded_max_attempts') # Check skip from backoff too
| backoff.to.itself(unless='can_start')
| backoff.to(started, cond='can_start')
# Removed redundant transitions: backoff.to(succeeded/failed/skipped)
# Reason: backoff should always retry→started, then started→final states
)
@@ -3241,8 +3235,8 @@ class ArchiveResultMachine(BaseStateMachine):
"""Check if we should backoff and retry later."""
# Backoff if status is still started (plugin didn't complete) and output_str is empty
return (
self.archiveresult.status == ArchiveResult.StatusChoices.STARTED and
not self.archiveresult.output_str
self.archiveresult.status == ArchiveResult.StatusChoices.STARTED
and not self.archiveresult.output_str
)
def is_finished(self) -> bool:
@@ -3286,7 +3280,6 @@ class ArchiveResultMachine(BaseStateMachine):
@started.enter
def enter_started(self):
from archivebox.machine.models import NetworkInterface
# Update Process with network interface
if self.archiveresult.process_id:

View File

@@ -6,6 +6,7 @@ import inspect
from pathlib import Path
from django.conf.locale.en import formats as en_formats # type: ignore
from django.utils.crypto import get_random_string
import archivebox
@@ -13,6 +14,7 @@ import archivebox
from archivebox.config import DATA_DIR, PACKAGE_DIR, ARCHIVE_DIR, CONSTANTS # noqa
from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG, STORAGE_CONFIG # noqa
from archivebox.core.host_utils import normalize_base_url, get_admin_base_url, get_api_base_url
from .settings_logging import SETTINGS_LOGGING
IS_MIGRATING = "makemigrations" in sys.argv[:3] or "migrate" in sys.argv[:3]
@@ -54,8 +56,8 @@ INSTALLED_APPS = [
"django.contrib.staticfiles",
"django.contrib.admin",
# 3rd-party apps from PyPI
"signal_webhooks", # handles REST API outbound webhooks https://github.com/MrThearMan/django-signal-webhooks
"django_object_actions", # provides easy Django Admin action buttons on change views https://github.com/crccheck/django-object-actions
"signal_webhooks", # handles REST API outbound webhooks
"django_object_actions", # provides easy Django Admin action buttons on change views
# Our ArchiveBox-provided apps (use fully qualified names)
# NOTE: Order matters! Apps with migrations that depend on other apps must come AFTER their dependencies
# "archivebox.config", # ArchiveBox config settings (no models, not a real Django app)
@@ -117,7 +119,6 @@ try:
try:
# Try to import django-auth-ldap (will fail if not installed)
import django_auth_ldap
from django_auth_ldap.config import LDAPSearch
import ldap
@@ -414,9 +415,6 @@ DATETIME_FORMAT = "Y-m-d h:i:s A"
SHORT_DATETIME_FORMAT = "Y-m-d h:i:s A"
TIME_ZONE = CONSTANTS.TIMEZONE # django convention is TIME_ZONE, archivebox config uses TIMEZONE, they are equivalent
from django.conf.locale.en import formats as en_formats # type: ignore
en_formats.DATETIME_FORMAT = DATETIME_FORMAT # monkey patch en_format default with our preferred format
en_formats.SHORT_DATETIME_FORMAT = SHORT_DATETIME_FORMAT
@@ -425,9 +423,6 @@ en_formats.SHORT_DATETIME_FORMAT = SHORT_DATETIME_FORMAT
### Logging Settings
################################################################################
from .settings_logging import SETTINGS_LOGGING, LOGS_DIR, ERROR_LOG
LOGGING = SETTINGS_LOGGING

View File

@@ -5,8 +5,6 @@ import os
import tempfile
import logging
import pydantic
import django.template
from archivebox.config import CONSTANTS

View File

@@ -1,5 +1,6 @@
"""Tests for the core views, especially AddView."""
import importlib
import os
import django
from unittest.mock import patch
@@ -8,13 +9,14 @@ from unittest.mock import patch
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
django.setup()
from django.test import TestCase, Client
from django.contrib.auth.models import User
from django.urls import reverse
from archivebox.crawls.models import Crawl, CrawlSchedule
from archivebox.core.models import Tag
from archivebox.config.common import SERVER_CONFIG
TestCase = importlib.import_module('django.test').TestCase
Client = importlib.import_module('django.test').Client
User = importlib.import_module('django.contrib.auth.models').User
reverse = importlib.import_module('django.urls').reverse
Crawl = importlib.import_module('archivebox.crawls.models').Crawl
CrawlSchedule = importlib.import_module('archivebox.crawls.models').CrawlSchedule
Tag = importlib.import_module('archivebox.core.models').Tag
SERVER_CONFIG = importlib.import_module('archivebox.config.common').SERVER_CONFIG
class AddViewTests(TestCase):
@@ -252,7 +254,7 @@ class AddViewTests(TestCase):
def test_add_staff_admin_custom_config_is_allowed(self):
"""Admin users can override crawl config."""
self.client.logout()
admin_user = User.objects.create_user(
User.objects.create_user(
username='adminuser',
password='adminpass123',
email='admin@example.com',

View File

@@ -10,7 +10,7 @@ from pathlib import Path
from urllib.parse import urlparse
from django.shortcuts import render, redirect
from django.http import HttpRequest, HttpResponse, Http404, HttpResponseForbidden
from django.http import JsonResponse, HttpRequest, HttpResponse, Http404, HttpResponseForbidden
from django.utils.html import format_html, mark_safe
from django.views import View
from django.views.generic.list import ListView
@@ -24,9 +24,8 @@ from django.utils.decorators import method_decorator
from admin_data_views.typing import TableContext, ItemContext
from admin_data_views.utils import render_with_table_view, render_with_item_view, ItemLink
import archivebox
from archivebox.config import CONSTANTS, CONSTANTS_CONFIG, DATA_DIR, VERSION
from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG, ARCHIVING_CONFIG
from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG
from archivebox.config.configset import get_flat_config, get_config, get_all_configs
from archivebox.misc.util import base_url, htmlencode, ts_to_date_str, urldecode
from archivebox.misc.serve_static import serve_static_with_byterange_support
@@ -35,6 +34,9 @@ from archivebox.search import query_search_index
from archivebox.core.models import Snapshot
from archivebox.core.host_utils import build_snapshot_url
from archivebox.core.forms import AddLinkForm
from archivebox.crawls.models import Crawl
from archivebox.hooks import get_enabled_plugins, get_plugin_name
def _files_index_target(snapshot: Snapshot, archivefile: str | None) -> str:
@@ -49,12 +51,6 @@ def _files_index_target(snapshot: Snapshot, archivefile: str | None) -> str:
return target
from archivebox.core.forms import AddLinkForm
from archivebox.crawls.models import Crawl
from archivebox.hooks import get_enabled_plugins, get_plugin_name
class HomepageView(View):
def get(self, request):
if request.user.is_authenticated:
@@ -1066,10 +1062,6 @@ class HealthCheckView(View):
status=200
)
import json
from django.http import JsonResponse
def live_progress_view(request):
"""Simple JSON endpoint for live progress status - used by admin progress monitor."""
try:
@@ -1077,7 +1069,6 @@ def live_progress_view(request):
from archivebox.crawls.models import Crawl
from archivebox.core.models import Snapshot, ArchiveResult
from archivebox.machine.models import Process, Machine
from django.db.models import Case, When, Value, IntegerField
# Get orchestrator status
orchestrator_running = Orchestrator.is_running()
@@ -1133,7 +1124,6 @@ def live_progress_view(request):
})
# Build hierarchical active crawls with nested snapshots and archive results
from django.db.models import Prefetch
running_workers = Process.objects.filter(
machine=machine,
@@ -1387,7 +1377,7 @@ def find_config_default(key: str) -> str:
return default_val
def find_config_type(key: str) -> str:
from typing import get_type_hints, ClassVar
from typing import ClassVar
CONFIGS = get_all_configs()
for config in CONFIGS.values():
@@ -1430,7 +1420,6 @@ def key_is_safe(key: str) -> bool:
def find_config_source(key: str, merged_config: dict) -> str:
"""Determine where a config value comes from."""
import os
from archivebox.machine.models import Machine
# Check if it's from archivebox.machine.config
@@ -1464,12 +1453,11 @@ def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
# Get merged config that includes Machine.config overrides
try:
from archivebox.machine.models import Machine
machine = Machine.current()
Machine.current()
merged_config = get_config()
except Exception as e:
except Exception:
# Fallback if Machine model not available
merged_config = get_config()
machine = None
rows = {
"Section": [],
@@ -1525,7 +1513,6 @@ def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
@render_with_item_view
def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
import os
from archivebox.machine.models import Machine
from archivebox.config.configset import BaseConfigSet

View File

@@ -343,20 +343,17 @@ class InlineTagEditorWidget(TagEditorWidget):
snapshot_id = snapshot_id or self.snapshot_id
# Parse value to get list of tag dicts with id and name
tags = []
tag_data = []
if value:
if hasattr(value, 'all'): # QuerySet
for tag in value.all():
tag_data.append({'id': tag.pk, 'name': tag.name})
tag_data.sort(key=lambda x: x['name'].lower())
tags = [t['name'] for t in tag_data]
elif isinstance(value, (list, tuple)):
if value and hasattr(value[0], 'name'):
for tag in value:
tag_data.append({'id': tag.pk, 'name': tag.name})
tag_data.sort(key=lambda x: x['name'].lower())
tags = [t['name'] for t in tag_data]
widget_id_raw = f"inline_tags_{snapshot_id}" if snapshot_id else (attrs.get('id', name) if attrs else name)
widget_id = self._normalize_id(widget_id_raw)

View File

@@ -9,9 +9,8 @@ https://docs.djangoproject.com/en/2.1/howto/deployment/wsgi/
import archivebox # noqa
from archivebox.config.django import setup_django
from django.core.wsgi import get_wsgi_application
setup_django(in_memory_db=False, check_db=True)
from django.core.wsgi import get_wsgi_application
application = get_wsgi_application()

View File

@@ -1,17 +1,11 @@
__package__ = 'archivebox.crawls'
import json
from pathlib import Path
from django import forms
from django.utils.html import format_html, format_html_join, mark_safe
from django.contrib import admin, messages
from django.urls import path
from django.http import JsonResponse
from django.views.decorators.http import require_POST
from django.db.models import Count, Q
from archivebox import DATA_DIR
from django_object_actions import action

View File

@@ -1,12 +1,11 @@
__package__ = 'archivebox.crawls'
from typing import TYPE_CHECKING, Iterable
from typing import TYPE_CHECKING
from datetime import timedelta
from archivebox.uuid_compat import uuid7
from pathlib import Path
from django.db import models
from django.db.models import QuerySet
from django.core.validators import MaxValueValidator, MinValueValidator
from django.conf import settings
from django.urls import reverse_lazy
@@ -15,13 +14,12 @@ from django_stubs_ext.db.models import TypedModelMeta
from statemachine import State, registry
from rich import print
from archivebox.config import CONSTANTS
from archivebox.base_models.models import ModelWithUUID, ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, get_or_create_system_user_pk
from archivebox.workers.models import ModelWithStateMachine, BaseStateMachine
from archivebox.crawls.schedule_utils import next_run_for_schedule, validate_schedule
if TYPE_CHECKING:
from archivebox.core.models import Snapshot, ArchiveResult
from archivebox.core.models import Snapshot
class CrawlSchedule(ModelWithUUID, ModelWithNotes):
@@ -111,7 +109,6 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
label = models.CharField(max_length=64, blank=True, null=False, default='')
notes = models.TextField(blank=True, null=False, default='')
schedule = models.ForeignKey(CrawlSchedule, on_delete=models.SET_NULL, null=True, blank=True, editable=True)
output_dir = models.CharField(max_length=512, null=False, blank=True, default='')
status = ModelWithStateMachine.StatusField(choices=ModelWithStateMachine.StatusChoices, default=ModelWithStateMachine.StatusChoices.QUEUED)
retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
@@ -252,6 +249,22 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
return system_url
return None
def resolve_persona(self):
from archivebox.personas.models import Persona
if self.persona_id:
persona = Persona.objects.filter(id=self.persona_id).first()
if persona is None:
raise Persona.DoesNotExist(f'Crawl {self.id} references missing Persona {self.persona_id}')
return persona
default_persona_name = str((self.config or {}).get('DEFAULT_PERSONA') or '').strip()
if default_persona_name:
persona, _ = Persona.objects.get_or_create(name=default_persona_name or 'Default')
return persona
return None
def add_url(self, entry: dict) -> bool:
"""
@@ -391,7 +404,10 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
f.flush()
def get_runtime_config():
return get_config(crawl=self)
config = get_config(crawl=self)
if persona_runtime_overrides:
config.update(persona_runtime_overrides)
return config
system_task = self.get_system_task()
if system_task == 'archivebox://update':
@@ -402,6 +418,15 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
machine = Machine.current()
declared_binary_names: set[str] = set()
persona_runtime_overrides: dict[str, str] = {}
persona = self.resolve_persona()
if persona:
base_runtime_config = get_config(crawl=self, persona=persona)
chrome_binary = str(base_runtime_config.get('CHROME_BINARY') or '')
persona_runtime_overrides = persona.prepare_runtime_for_crawl(
crawl=self,
chrome_binary=chrome_binary,
)
def install_declared_binaries(binary_names: set[str]) -> None:
if not binary_names:
@@ -563,7 +588,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
# Discover and run on_Crawl hooks
with open(debug_log, 'a') as f:
f.write(f'Discovering Crawl hooks...\n')
f.write('Discovering Crawl hooks...\n')
f.flush()
hooks = discover_hooks('Crawl', config=get_runtime_config())
with open(debug_log, 'a') as f:
@@ -588,17 +613,17 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
print(f'[yellow]⚠️ Removed {leaked_count} leaked snapshot(s) created during system crawl {system_task}[/yellow]')
with open(debug_log, 'a') as f:
f.write(f'Skipping snapshot creation for system crawl: {system_task}\n')
f.write(f'=== Crawl.run() complete ===\n\n')
f.write('=== Crawl.run() complete ===\n\n')
f.flush()
return None
with open(debug_log, 'a') as f:
f.write(f'Creating snapshots from URLs...\n')
f.write('Creating snapshots from URLs...\n')
f.flush()
created_snapshots = self.create_snapshots_from_urls()
with open(debug_log, 'a') as f:
f.write(f'Created {len(created_snapshots)} snapshots\n')
f.write(f'=== Crawl.run() complete ===\n\n')
f.write('=== Crawl.run() complete ===\n\n')
f.flush()
# Return first snapshot for this crawl (newly created or existing)
@@ -647,6 +672,10 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
for pid_file in self.output_dir.glob('**/*.pid'):
pid_file.unlink(missing_ok=True)
persona = self.resolve_persona()
if persona:
persona.cleanup_runtime_for_crawl(self)
# Run on_CrawlEnd hooks
from archivebox.config.configset import get_config
config = get_config(crawl=self)
@@ -715,9 +744,9 @@ class CrawlMachine(BaseStateMachine):
# Tick Event (polled by workers)
tick = (
queued.to.itself(unless='can_start') |
queued.to(started, cond='can_start') |
started.to(sealed, cond='is_finished')
queued.to.itself(unless='can_start')
| queued.to(started, cond='can_start')
| started.to(sealed, cond='is_finished')
)
# Manual event (triggered by last Snapshot sealing)
@@ -740,7 +769,6 @@ class CrawlMachine(BaseStateMachine):
@started.enter
def enter_started(self):
import sys
from archivebox.core.models import Snapshot
print(f'[cyan]🔄 CrawlMachine.enter_started() - creating snapshots for {self.crawl.id}[/cyan]', file=sys.stderr)
@@ -758,7 +786,7 @@ class CrawlMachine(BaseStateMachine):
)
else:
# No snapshots (system crawl like archivebox://install)
print(f'[cyan]🔄 No snapshots created, sealing crawl immediately[/cyan]', file=sys.stderr)
print('[cyan]🔄 No snapshots created, sealing crawl immediately[/cyan]', file=sys.stderr)
# Seal immediately since there's no work to do
self.seal()

View File

@@ -56,16 +56,18 @@ __package__ = 'archivebox'
import os
import json
import time
from functools import lru_cache
from pathlib import Path
from typing import List, Dict, Any, Optional, TypedDict
from typing import TYPE_CHECKING, List, Dict, Any, Optional, TypedDict
from abx_plugins import get_plugins_dir
from django.conf import settings
from django.utils.safestring import mark_safe
from archivebox.config.constants import CONSTANTS
if TYPE_CHECKING:
from archivebox.machine.models import Process
# Plugin directories
BUILTIN_PLUGINS_DIR = Path(get_plugins_dir()).resolve()
@@ -266,9 +268,7 @@ def run_hook(
"""
from archivebox.machine.models import Process, Machine
from archivebox.config.constants import CONSTANTS
import time
import sys
start_time = time.time()
# Auto-detect timeout from plugin config if not explicitly provided
if timeout is None:

View File

@@ -9,7 +9,6 @@ __package__ = "archivebox.ldap"
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from django.contrib.auth.models import User
from django_auth_ldap.backend import LDAPBackend as BaseLDAPBackend
else:
try:

View File

@@ -10,6 +10,7 @@ from datetime import timedelta, datetime
from statemachine import State, registry
from django.db import models
from django.db.models import QuerySet
from django.utils import timezone
from django.utils.functional import cached_property
@@ -197,7 +198,6 @@ class NetworkInterface(ModelWithHealthStats):
class BinaryManager(models.Manager):
def get_from_db_or_cache(self, name: str, abspath: str = '', version: str = '', sha256: str = '', binprovider: str = 'env') -> 'Binary':
"""Get or create an Binary record from the database or cache."""
global _CURRENT_BINARIES
cached = _CURRENT_BINARIES.get(name)
if cached and timezone.now() < cached.modified_at + timedelta(seconds=BINARY_RECHECK_INTERVAL):
return cached
@@ -583,7 +583,6 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine):
Called by state machine if needed (not typically used for binaries
since installations are foreground, but included for consistency).
"""
from pathlib import Path
# Kill any background binary installation hooks using Process records
# (rarely used since binary installations are typically foreground)
@@ -1026,9 +1025,11 @@ class Process(models.Model):
# Check cache validity
if _CURRENT_PROCESS:
# Verify: same PID, same machine, cache not expired
if (_CURRENT_PROCESS.pid == current_pid and
_CURRENT_PROCESS.machine_id == machine.id and
timezone.now() < _CURRENT_PROCESS.modified_at + timedelta(seconds=PROCESS_RECHECK_INTERVAL)):
if (
_CURRENT_PROCESS.pid == current_pid
and _CURRENT_PROCESS.machine_id == machine.id
and timezone.now() < _CURRENT_PROCESS.modified_at + timedelta(seconds=PROCESS_RECHECK_INTERVAL)
):
_CURRENT_PROCESS.ensure_log_files()
return _CURRENT_PROCESS
_CURRENT_PROCESS = None
@@ -1111,7 +1112,6 @@ class Process(models.Model):
machine = machine or Machine.current()
# Debug logging
import sys
# print(f"DEBUG _find_parent_process: my_pid={os.getpid()}, ppid={ppid}", file=sys.stderr)
# Get parent process start time from OS
@@ -1630,7 +1630,6 @@ class Process(models.Model):
self (updated with pid, started_at, etc.)
"""
import subprocess
import time
# Validate pwd is set (required for output files)
if not self.pwd:
@@ -1846,7 +1845,6 @@ class Process(models.Model):
Returns:
True if process was terminated, False if already dead
"""
import time
import signal
proc = self.proc
@@ -2199,8 +2197,8 @@ class BinaryMachine(BaseStateMachine):
# Tick Event - install happens during transition
tick = (
queued.to.itself(unless='can_install') |
queued.to(installed, cond='can_install', on='on_install')
queued.to.itself(unless='can_install')
| queued.to(installed, cond='can_install', on='on_install')
)
def can_install(self) -> bool:
@@ -2303,10 +2301,10 @@ class ProcessMachine(BaseStateMachine):
# Tick Event - transitions based on conditions
tick = (
queued.to.itself(unless='can_start') |
queued.to(running, cond='can_start') |
running.to.itself(unless='is_exited') |
running.to(exited, cond='is_exited')
queued.to.itself(unless='can_start')
| queued.to(running, cond='can_start')
| running.to.itself(unless='is_exited')
| running.to(exited, cond='is_exited')
)
# Additional events (for explicit control)

View File

@@ -12,8 +12,6 @@ Tests cover:
"""
import os
import sys
from pathlib import Path
from datetime import timedelta
from unittest.mock import patch
@@ -29,7 +27,6 @@ from archivebox.machine.models import (
BinaryMachine,
ProcessMachine,
MACHINE_RECHECK_INTERVAL,
PROCESS_RECHECK_INTERVAL,
PID_REUSE_WINDOW,
)
@@ -323,7 +320,6 @@ class TestProcessModel(TestCase):
def test_process_update_and_requeue(self):
"""Process.update_and_requeue() should update fields and save."""
process = Process.objects.create(machine=self.machine, cmd=['test'])
old_modified = process.modified_at
process.update_and_requeue(
status=Process.StatusChoices.RUNNING,

View File

@@ -1,5 +1,3 @@
__package__ = 'archivebox.mcp'
"""
Model Context Protocol (MCP) server implementation for ArchiveBox.
@@ -10,9 +8,7 @@ Click command metadata. Handles JSON-RPC 2.0 requests over stdio transport.
import sys
import json
import traceback
from typing import Any, Dict, List, Optional
from io import StringIO
from contextlib import redirect_stdout, redirect_stderr
from typing import Optional
import click
from click.testing import CliRunner

View File

@@ -225,7 +225,6 @@ def check_tmp_dir(tmp_dir=None, throw=False, quiet=False, must_exist=True):
def check_lib_dir(lib_dir: Path | None = None, throw=False, quiet=False, must_exist=True):
import archivebox
from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP
from archivebox.misc.logging import STDERR
from archivebox.misc.logging_util import pretty_path

View File

@@ -35,7 +35,6 @@ def fix_invalid_folder_locations(out_dir: Path = DATA_DIR) -> Tuple[List[str], L
with open(index_path, 'r') as f:
data = json.load(f)
timestamp = data.get('timestamp')
url = data.get('url')
except Exception:
continue

View File

@@ -21,13 +21,12 @@ if TYPE_CHECKING:
from rich import print
from rich.panel import Panel
from django.core.management.base import DjangoHelpFormatter
from archivebox.config import CONSTANTS, DATA_DIR, VERSION
from archivebox.config.common import SHELL_CONFIG
from archivebox.misc.system import get_dir_size
from archivebox.misc.util import enforce_types
from archivebox.misc.logging import ANSI, stderr
from archivebox.misc.logging import ANSI
@dataclass
class RuntimeStats:

View File

@@ -1,16 +1,18 @@
__package__ = 'archivebox'
import django
import pydantic
import datetime
import warnings
import benedict
from daphne import access
import django_stubs_ext
from django.utils import timezone
django_stubs_ext.monkeypatch()
# monkey patch django timezone to add back utc (it was removed in Django 5.0)
import datetime
from django.utils import timezone
timezone.utc = datetime.timezone.utc
# monkey patch django-signals-webhooks to change how it shows up in Admin UI
@@ -26,12 +28,9 @@ timezone.utc = datetime.timezone.utc
# Hide site-packages/sonic/client.py:115: SyntaxWarning
# https://github.com/xmonader/python-sonic-client/pull/18
import warnings # noqa
warnings.filterwarnings("ignore", category=SyntaxWarning, module='sonic')
# Make daphne log requests quieter and esier to read
from daphne import access # noqa
class ModifiedAccessLogGenerator(access.AccessLogGenerator):
"""Clutge workaround until daphne uses the Python logging framework. https://github.com/django/daphne/pull/473/files"""
@@ -68,5 +67,4 @@ access.AccessLogGenerator.write_entry = ModifiedAccessLogGenerator.write_entry #
# fix benedict objects to pretty-print/repr more nicely with rich
# https://stackoverflow.com/a/79048811/2156113
# https://rich.readthedocs.io/en/stable/pretty.html#rich-repr-protocol
import benedict # noqa
benedict.benedict.__rich_repr__ = lambda self: (dict(self),) # type: ignore

View File

@@ -135,7 +135,6 @@ class ProcessLogPanel:
if line:
log_lines.append(Text(line, style="cyan"))
compact = self.compact if self.compact is not None else self._is_background_hook()
max_body = max(1, self.max_lines - len(header_lines))
if not log_lines:
log_lines = []

View File

@@ -4,10 +4,11 @@ __package__ = 'archivebox.misc'
import os
import signal
import shutil
import sys
from json import dump
from pathlib import Path
from typing import Optional, Union, Set, Tuple
from typing import Optional, Union, Tuple
from subprocess import _mswindows, PIPE, Popen, CalledProcessError, CompletedProcess, TimeoutExpired
from atomicwrites import atomic_write as lib_atomic_write
@@ -58,7 +59,7 @@ def run(cmd, *args, input=None, capture_output=True, timeout=None, check=False,
# far into the TimeoutExpired exception.
process.wait()
raise
except: # Including KeyboardInterrupt, communicate handled that.
except BaseException: # Including KeyboardInterrupt, communicate handled that.
process.kill()
# We don't call process.wait() as .__exit__ does that for us.
raise

View File

@@ -1,3 +1,2 @@
from django.contrib import admin
# Register your models here.

View File

@@ -11,8 +11,12 @@ Each persona has its own:
__package__ = 'archivebox.personas'
import shutil
import subprocess
import sys
from contextlib import contextmanager
from pathlib import Path
from typing import TYPE_CHECKING, Iterator
from typing import TYPE_CHECKING
from django.db import models
from django.conf import settings
@@ -21,8 +25,32 @@ from django.utils import timezone
from archivebox.base_models.models import ModelWithConfig, get_or_create_system_user_pk
from archivebox.uuid_compat import uuid7
try:
import fcntl
except ImportError: # pragma: no cover
fcntl = None
if TYPE_CHECKING:
from django.db.models import QuerySet
pass
VOLATILE_PROFILE_DIR_NAMES = {
'Cache',
'Code Cache',
'GPUCache',
'ShaderCache',
'Service Worker',
'GCM Store',
'Crashpad',
'BrowserMetrics',
}
VOLATILE_PROFILE_FILE_NAMES = {
'BrowserMetrics-spare.pma',
'SingletonCookie',
'SingletonLock',
'SingletonSocket',
}
class Persona(ModelWithConfig):
@@ -120,37 +148,118 @@ class Persona(ModelWithConfig):
(self.path / 'chrome_extensions').mkdir(parents=True, exist_ok=True)
(self.path / 'chrome_downloads').mkdir(parents=True, exist_ok=True)
def cleanup_chrome(self) -> bool:
"""
Clean up Chrome state files (SingletonLock, etc.) for this persona.
Returns:
True if cleanup was performed, False if no cleanup needed
"""
def cleanup_chrome_profile(self, profile_dir: Path) -> bool:
"""Remove volatile Chrome state that should never be reused across launches."""
cleaned = False
chrome_dir = self.path / 'chrome_user_data'
if not chrome_dir.exists():
if not profile_dir.exists():
return False
# Clean up SingletonLock files
for lock_file in chrome_dir.glob('**/SingletonLock'):
try:
lock_file.unlink()
cleaned = True
except OSError:
pass
for path in profile_dir.rglob('*'):
if path.name in VOLATILE_PROFILE_FILE_NAMES:
try:
path.unlink()
cleaned = True
except OSError:
pass
# Clean up SingletonSocket files
for socket_file in chrome_dir.glob('**/SingletonSocket'):
for dirname in VOLATILE_PROFILE_DIR_NAMES:
for path in profile_dir.rglob(dirname):
if not path.is_dir():
continue
shutil.rmtree(path, ignore_errors=True)
cleaned = True
for path in profile_dir.rglob('*.log'):
try:
socket_file.unlink()
path.unlink()
cleaned = True
except OSError:
pass
return cleaned
def cleanup_chrome(self) -> bool:
"""Clean up volatile Chrome state for this persona's base profile."""
return self.cleanup_chrome_profile(self.path / 'chrome_user_data')
@contextmanager
def lock_runtime_for_crawl(self):
    """Hold an exclusive advisory lock while a crawl sets up / uses this persona's runtime profile.

    Uses fcntl.flock on a lock file under the persona dir; on platforms
    without fcntl (e.g. Windows) this degrades to a no-op lock.
    """
    lockfile_path = self.path / '.archivebox-crawl-profile.lock'
    lockfile_path.parent.mkdir(parents=True, exist_ok=True)
    with lockfile_path.open('w') as lockfile:
        have_flock = fcntl is not None
        if have_flock:
            fcntl.flock(lockfile.fileno(), fcntl.LOCK_EX)
        try:
            yield
        finally:
            if have_flock:
                fcntl.flock(lockfile.fileno(), fcntl.LOCK_UN)
def runtime_root_for_crawl(self, crawl) -> Path:
    """Per-crawl scratch root for this persona, under the crawl's output dir."""
    crawl_dir = Path(crawl.output_dir)
    return crawl_dir / '.persona' / self.name
def runtime_profile_dir_for_crawl(self, crawl) -> Path:
    """Location of the disposable chrome_user_data copy used by a single crawl."""
    scratch_root = self.runtime_root_for_crawl(crawl)
    return scratch_root / 'chrome_user_data'
def runtime_downloads_dir_for_crawl(self, crawl) -> Path:
    """Location of the per-crawl Chrome downloads directory."""
    scratch_root = self.runtime_root_for_crawl(crawl)
    return scratch_root / 'chrome_downloads'
def copy_chrome_profile(self, source_dir: Path, destination_dir: Path) -> None:
    """Replace destination_dir with a fresh copy of source_dir's contents.

    Tries a fast platform-native `cp` first (`cp -cR` on macOS, `cp -a` on
    Linux); if that is unavailable or fails, falls back to shutil.copytree.
    """

    def reset_destination() -> None:
        # Always start from an empty dir so stale files never survive a copy.
        shutil.rmtree(destination_dir, ignore_errors=True)
        destination_dir.mkdir(parents=True, exist_ok=True)

    destination_dir.parent.mkdir(parents=True, exist_ok=True)
    reset_destination()

    source_contents = f'{source_dir}/.'
    native_cmd: list[str] | None = None
    if sys.platform == 'darwin':
        native_cmd = ['cp', '-cR', source_contents, str(destination_dir)]
    elif sys.platform.startswith('linux'):
        native_cmd = ['cp', '-a', source_contents, str(destination_dir)]

    if native_cmd:
        outcome = subprocess.run(native_cmd, capture_output=True, text=True)
        if outcome.returncode == 0:
            return
        # Native copy failed part-way; wipe partial output before the fallback.
        reset_destination()

    shutil.copytree(source_dir, destination_dir, symlinks=True, dirs_exist_ok=True)
def prepare_runtime_for_crawl(self, crawl, chrome_binary: str = '') -> dict[str, str]:
    """Materialize a disposable per-crawl copy of this persona's Chrome profile.

    On first use for a crawl, copies the persona's template profile
    (CHROME_USER_DATA_DIR) into the crawl's output dir, then scrubs volatile
    Chrome state from the copy on every call.

    Args:
        crawl: object exposing output_dir (the crawl this runtime is for).
        chrome_binary: optional Chrome binary path, recorded as a breadcrumb.

    Returns:
        Config overrides (CHROME_USER_DATA_DIR / CHROME_DOWNLOADS_DIR)
        pointing Chrome at the per-crawl runtime copies.
    """
    self.ensure_dirs()
    template_dir = Path(self.CHROME_USER_DATA_DIR)
    runtime_root = self.runtime_root_for_crawl(crawl)
    runtime_profile_dir = self.runtime_profile_dir_for_crawl(crawl)
    runtime_downloads_dir = self.runtime_downloads_dir_for_crawl(crawl)
    # Serialize setup so concurrent crawls can't copy/clean the same profile at once.
    with self.lock_runtime_for_crawl():
        if not runtime_profile_dir.exists():
            # Only copy when the template has content; otherwise start with an empty profile.
            if template_dir.exists() and any(template_dir.iterdir()):
                self.copy_chrome_profile(template_dir, runtime_profile_dir)
            else:
                runtime_profile_dir.mkdir(parents=True, exist_ok=True)
        runtime_downloads_dir.mkdir(parents=True, exist_ok=True)
        # Remove singleton locks/caches even when reusing an existing runtime copy.
        self.cleanup_chrome_profile(runtime_profile_dir)
        # Breadcrumb files for debugging which persona/template/binary produced this profile.
        (runtime_root / 'persona_name.txt').write_text(self.name)
        (runtime_root / 'template_dir.txt').write_text(str(template_dir))
        if chrome_binary:
            (runtime_root / 'chrome_binary.txt').write_text(chrome_binary)
    return {
        'CHROME_USER_DATA_DIR': str(runtime_profile_dir),
        'CHROME_DOWNLOADS_DIR': str(runtime_downloads_dir),
    }
def cleanup_runtime_for_crawl(self, crawl) -> None:
    """Delete the crawl's entire .persona scratch tree (best-effort, errors ignored)."""
    scratch_tree = Path(crawl.output_dir) / '.persona'
    shutil.rmtree(scratch_tree, ignore_errors=True)
@classmethod
def get_or_create_default(cls) -> 'Persona':
"""Get or create the Default persona."""

View File

@@ -1,3 +1,2 @@
from django.test import TestCase
# Create your tests here.

View File

@@ -1,3 +1,2 @@
from django.shortcuts import render
# Create your views here.

View File

@@ -14,7 +14,7 @@ Search backends must provide a search.py module with:
__package__ = 'archivebox.search'
from typing import TYPE_CHECKING, Any, Optional
from typing import Any, Optional
from django.db.models import QuerySet
@@ -22,9 +22,6 @@ from archivebox.misc.util import enforce_types
from archivebox.misc.logging import stderr
from archivebox.config.common import SEARCH_BACKEND_CONFIG
if TYPE_CHECKING:
from archivebox.core.models import Snapshot
# Cache discovered backends to avoid repeated filesystem scans
_search_backends_cache: Optional[dict] = None

View File

@@ -1,7 +1,6 @@
"""archivebox/tests/conftest.py - Pytest fixtures for CLI tests."""
import os
import shutil
import sys
import subprocess
import textwrap
@@ -13,6 +12,8 @@ import pytest
from archivebox.uuid_compat import uuid7
pytest_plugins = ["archivebox.tests.fixtures"]
# =============================================================================
# CLI Helpers (defined before fixtures that use them)

View File

@@ -1,9 +1,6 @@
import subprocess
import json
import sqlite3
import os
from .fixtures import *
import sqlite3
import subprocess
def test_depth_flag_is_accepted(process, disable_extractors_dict):
arg_process = subprocess.run(["archivebox", "add", "--index-only", "https://example.com", "--depth=0"],
@@ -31,7 +28,7 @@ def test_depth_flag_fails_if_it_is_not_0_or_1(process, disable_extractors_dict):
def test_depth_flag_0_creates_source_file(tmp_path, process, disable_extractors_dict):
os.chdir(tmp_path)
arg_process = subprocess.run(
subprocess.run(
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
capture_output=True,
env=disable_extractors_dict,

View File

@@ -9,7 +9,7 @@ Tests cover:
"""
import pytest
from django.test import TestCase, Client, override_settings
from django.test import override_settings
from django.urls import reverse
from django.contrib.auth import get_user_model

View File

@@ -9,7 +9,7 @@ import os
import sys
import tempfile
import unittest
from pathlib import Path
from importlib.util import find_spec
class TestLDAPConfig(unittest.TestCase):
@@ -100,13 +100,7 @@ class TestLDAPIntegration(unittest.TestCase):
def test_django_settings_with_ldap_library_check(self):
"""Test that Django settings check for LDAP libraries when enabled."""
# Try to import django-auth-ldap to see if it's available
try:
import django_auth_ldap
import ldap
ldap_available = True
except ImportError:
ldap_available = False
ldap_available = find_spec("django_auth_ldap") is not None and find_spec("ldap") is not None
# If LDAP libraries are not available, settings should handle gracefully
if not ldap_available:

View File

@@ -5,11 +5,8 @@ Verify add creates snapshots in DB, crawls, source files, and archive directorie
"""
import os
import subprocess
import sqlite3
from pathlib import Path
from .fixtures import *
import subprocess
def test_add_single_url_creates_snapshot_in_db(tmp_path, process, disable_extractors_dict):
@@ -169,6 +166,30 @@ def test_add_with_tags(tmp_path, process, disable_extractors_dict):
assert 'test' in tags_str or 'example' in tags_str
def test_add_records_selected_persona_on_crawl(tmp_path, process, disable_extractors_dict):
"""Test add persists the selected persona so browser config derives from it later."""
os.chdir(tmp_path)
result = subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', '--persona=Default', 'https://example.com'],
capture_output=True,
env=disable_extractors_dict,
)
assert result.returncode == 0
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
persona_id, default_persona = c.execute(
"SELECT persona_id, json_extract(config, '$.DEFAULT_PERSONA') FROM crawls_crawl LIMIT 1"
).fetchone()
conn.close()
assert persona_id
assert default_persona == 'Default'
assert (tmp_path / "personas" / "Default" / "chrome_user_data").is_dir()
assert (tmp_path / "personas" / "Default" / "chrome_extensions").is_dir()
def test_add_duplicate_url_creates_separate_crawls(tmp_path, process, disable_extractors_dict):
"""Test that adding the same URL twice creates separate crawls and snapshots.

View File

@@ -9,7 +9,6 @@ Tests cover:
"""
import json
import pytest
from archivebox.tests.conftest import (
run_archivebox_cmd,

View File

@@ -6,9 +6,6 @@ Verify config reads/writes ArchiveBox.conf file correctly.
import os
import subprocess
from pathlib import Path
from .fixtures import *
def test_config_displays_all_config(tmp_path, process):

View File

@@ -9,14 +9,11 @@ Tests cover:
"""
import json
import pytest
from archivebox.tests.conftest import (
run_archivebox_cmd,
parse_jsonl_output,
assert_jsonl_contains_type,
create_test_url,
create_test_crawl_json,
)

View File

@@ -5,10 +5,8 @@ Verify extract re-runs extractors on existing snapshots.
"""
import os
import subprocess
import sqlite3
from .fixtures import *
import subprocess
def test_extract_runs_on_existing_snapshots(tmp_path, process, disable_extractors_dict):

View File

@@ -7,8 +7,6 @@ Verify command runs successfully and produces output.
import os
import subprocess
from .fixtures import *
def test_help_runs_successfully(tmp_path):
"""Test that help command runs and produces output."""

View File

@@ -5,14 +5,11 @@ Verify init creates correct database schema, filesystem structure, and config.
"""
import os
import subprocess
import sqlite3
from pathlib import Path
import subprocess
from archivebox.config.common import STORAGE_CONFIG
from .fixtures import *
DIR_PERMISSIONS = STORAGE_CONFIG.OUTPUT_PERMISSIONS.replace('6', '7').replace('4', '5')

View File

@@ -5,12 +5,10 @@ Verify install detects and records binary dependencies in DB.
"""
import os
import subprocess
import sqlite3
import subprocess
from pathlib import Path
from .fixtures import *
def test_install_runs_successfully(tmp_path, process):
"""Test that install command runs without error."""

View File

@@ -6,9 +6,6 @@ Verify manage command runs Django management commands.
import os
import subprocess
import sqlite3
from .fixtures import *
def test_manage_help_works(tmp_path, process):

View File

@@ -5,11 +5,8 @@ Verify remove deletes snapshots from DB and filesystem.
"""
import os
import subprocess
import sqlite3
from pathlib import Path
from .fixtures import *
import subprocess
def test_remove_deletes_snapshot_from_db(tmp_path, process, disable_extractors_dict):

View File

@@ -8,7 +8,6 @@ Tests cover:
"""
import json
import pytest
from archivebox.tests.conftest import (
run_archivebox_cmd,

View File

@@ -10,11 +10,9 @@ Tests cover:
import json
import sqlite3
import time
from archivebox.tests.conftest import (
run_archivebox_cmd,
parse_jsonl_output,
)

View File

@@ -5,7 +5,6 @@ import os
import sqlite3
import subprocess
from .fixtures import process, disable_extractors_dict
def test_schedule_run_all_enqueues_scheduled_crawl(tmp_path, process, disable_extractors_dict):

View File

@@ -6,9 +6,6 @@ Verify search queries snapshots from DB.
import os
import subprocess
import sqlite3
from .fixtures import *
def test_search_finds_snapshots(tmp_path, process, disable_extractors_dict):

View File

@@ -6,10 +6,6 @@ Verify server can start (basic smoke tests only, no full server testing).
import os
import subprocess
import signal
import time
from .fixtures import *
def test_server_shows_usage_info(tmp_path, process):

View File

@@ -7,8 +7,6 @@ Verify shell command starts Django shell (basic smoke tests only).
import os
import subprocess
from .fixtures import *
def test_shell_command_exists(tmp_path, process):
"""Test that shell command is recognized."""

View File

@@ -9,12 +9,10 @@ Tests cover:
"""
import json
import pytest
from archivebox.tests.conftest import (
run_archivebox_cmd,
parse_jsonl_output,
assert_jsonl_contains_type,
create_test_url,
)

View File

@@ -5,12 +5,10 @@ Verify status reports accurate collection state from DB and filesystem.
"""
import os
import subprocess
import sqlite3
import subprocess
from pathlib import Path
from .fixtures import *
def _find_snapshot_dir(data_dir: Path, snapshot_id: str) -> Path | None:
candidates = {snapshot_id}

View File

@@ -5,10 +5,8 @@ Verify update drains old dirs, reconciles DB, and queues snapshots.
"""
import os
import subprocess
import sqlite3
from .fixtures import *
import subprocess
def test_update_runs_successfully_on_empty_archive(tmp_path, process):

View File

@@ -11,7 +11,9 @@ import tempfile
import subprocess
from pathlib import Path
from .fixtures import *
from .fixtures import process
FIXTURES = (process,)
def _archivebox_cli() -> str:

View File

@@ -6,7 +6,6 @@ import subprocess
import pytest
from .fixtures import process, disable_extractors_dict
def test_config_shows_all_config_values(tmp_path, process):
@@ -49,6 +48,7 @@ def test_config_set_value_writes_to_config_file(tmp_path, process):
capture_output=True,
text=True,
)
assert result.returncode == 0, result.stderr
# Read the config file directly to verify it was written
config_file = tmp_path / 'ArchiveBox.conf'

View File

@@ -4,11 +4,9 @@
import os
import subprocess
import sqlite3
import json
import pytest
from .fixtures import process, disable_extractors_dict
def test_crawl_creates_crawl_object(tmp_path, process, disable_extractors_dict):

View File

@@ -8,7 +8,6 @@ import json
import pytest
from .fixtures import process, disable_extractors_dict
def test_extract_runs_on_snapshot_id(tmp_path, process, disable_extractors_dict):
@@ -231,6 +230,7 @@ def test_extract_multiple_snapshots(tmp_path, process, disable_extractors_dict):
text=True,
env=disable_extractors_dict,
)
assert result.returncode == 0, result.stderr
# Should not error
conn = sqlite3.connect('index.sqlite3')

View File

@@ -1,8 +1,12 @@
from .fixtures import *
import json as pyjson
import sqlite3
import subprocess
from pathlib import Path
from .fixtures import disable_extractors_dict, process
FIXTURES = (disable_extractors_dict, process)
def _find_snapshot_dir(data_dir: Path, snapshot_id: str) -> Path | None:
candidates = {snapshot_id}

View File

@@ -16,7 +16,7 @@ import subprocess
import tempfile
import unittest
from pathlib import Path
from unittest.mock import MagicMock, patch
from unittest.mock import patch
# Set up Django before importing any Django-dependent modules
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')

View File

@@ -3,13 +3,13 @@
import os
import subprocess
from pathlib import Path
import json, shutil
import sqlite3
from archivebox.config.common import STORAGE_CONFIG
from .fixtures import *
from .fixtures import disable_extractors_dict, process
FIXTURES = (disable_extractors_dict, process)
DIR_PERMISSIONS = STORAGE_CONFIG.OUTPUT_PERMISSIONS.replace('6', '7').replace('4', '5')
@@ -25,6 +25,7 @@ def test_add_link(tmp_path, process, disable_extractors_dict):
os.chdir(tmp_path)
add_process = subprocess.run(['archivebox', 'add', '--index-only', 'https://example.com'],
capture_output=True, env=disable_extractors_dict)
assert add_process.returncode == 0, add_process.stderr.decode("utf-8")
# In the new architecture, URLs are saved to source files
# Check that a source file was created with the URL
@@ -41,6 +42,7 @@ def test_add_multiple_urls(tmp_path, process, disable_extractors_dict):
os.chdir(tmp_path)
add_process = subprocess.run(['archivebox', 'add', '--index-only', 'https://example.com', 'https://iana.org'],
capture_output=True, env=disable_extractors_dict)
assert add_process.returncode == 0, add_process.stderr.decode("utf-8")
# Check that a source file was created with both URLs
sources_dir = tmp_path / "sources"
@@ -61,6 +63,7 @@ def test_correct_permissions_add_command_results(tmp_path, process, disable_extr
os.chdir(tmp_path)
add_process = subprocess.run(['archivebox', 'add', '--index-only', 'https://example.com'], capture_output=True,
env=disable_extractors_dict)
assert add_process.returncode == 0, add_process.stderr.decode("utf-8")
# Check database permissions
assert oct((tmp_path / "index.sqlite3").stat().st_mode)[-3:] in (STORAGE_CONFIG.OUTPUT_PERMISSIONS, DIR_PERMISSIONS)

View File

@@ -7,7 +7,6 @@ import sqlite3
import pytest
from .fixtures import process, disable_extractors_dict
class TestInstallDryRun:

View File

@@ -1,7 +1,9 @@
import json
import subprocess
from .fixtures import *
from .fixtures import disable_extractors_dict, process
FIXTURES = (disable_extractors_dict, process)
def test_search_json(process, disable_extractors_dict):
subprocess.run(["archivebox", "add", "--index-only", "https://example.com", "--depth=0"],

View File

@@ -10,10 +10,8 @@ Migration tests from 0.8.x to 0.9.x.
- New fields like depth, retry_at, etc.
"""
import json
import shutil
import sqlite3
import subprocess
import tempfile
import unittest
from pathlib import Path
@@ -579,7 +577,7 @@ class TestFilesystemMigration08to09(unittest.TestCase):
f"Files lost during migration: {files_before_count} -> {files_after_count}")
# Run update to trigger filesystem reorganization
print(f"\n[*] Running archivebox update to reorganize filesystem...")
print("\n[*] Running archivebox update to reorganize filesystem...")
result = run_archivebox(self.work_dir, ['update'], timeout=120)
self.assertEqual(result.returncode, 0, f"Update failed: {result.stderr}")
@@ -657,7 +655,7 @@ class TestFilesystemMigration08to09(unittest.TestCase):
# CRITICAL: Verify sample files exist in new structure
self.assertGreater(len(new_sample_files), 0,
f"Sample files not found in new structure")
"Sample files not found in new structure")
# Verify new path format
for path_key, file_path in new_sample_files.items():

View File

@@ -10,7 +10,6 @@ from pathlib import Path
import pytest
from .fixtures import process, disable_extractors_dict, recursive_test_site
def wait_for_db_condition(timeout, condition, interval=0.5):
@@ -77,7 +76,6 @@ def test_background_hooks_dont_block_parser_extractors(tmp_path, process, recurs
"SAVE_ARCHIVEDOTORG": "false",
"SAVE_TITLE": "false",
"SAVE_FAVICON": "true",
"SAVE_WGET": "false",
})
proc = subprocess.Popen(

View File

@@ -1,7 +1,10 @@
import os
import sqlite3
import subprocess
from .fixtures import *
from .fixtures import disable_extractors_dict, process
FIXTURES = (disable_extractors_dict, process)
def test_remove_single_snapshot(tmp_path, process, disable_extractors_dict):
"""Test removing a snapshot by URL pattern"""

View File

@@ -7,7 +7,6 @@ import subprocess
import pytest
from .fixtures import process
def _fetchone(tmp_path, query):

View File

@@ -0,0 +1,420 @@
#!/usr/bin/env python3
"""End-to-end tests for scheduling across CLI, server, API, and web UI."""
import os
import socket
import sqlite3
import subprocess
import sys
import textwrap
import time
from pathlib import Path
import pytest
import requests
from .conftest import run_python_cwd
REPO_ROOT = Path(__file__).resolve().parents[2]
def init_archive(cwd: Path) -> None:
    """Run `archivebox init --quick` inside *cwd* and assert it succeeded."""
    cmd = [sys.executable, '-m', 'archivebox', 'init', '--quick']
    proc = subprocess.run(
        cmd,
        cwd=cwd,
        capture_output=True,
        text=True,
        timeout=60,
    )
    assert proc.returncode == 0, proc.stderr
def build_test_env(port: int, **extra: str) -> dict[str, str]:
    """Return a copy of os.environ configured for an isolated test server on *port*.

    Drops any inherited DATA_DIR, disables every extractor except wget,
    and allows overriding individual keys via **extra.
    """
    overrides = {
        'LISTEN_HOST': f'archivebox.localhost:{port}',
        'ALLOWED_HOSTS': '*',
        'CSRF_TRUSTED_ORIGINS': f'http://admin.archivebox.localhost:{port}',
        'PUBLIC_ADD_VIEW': 'True',
        'USE_COLOR': 'False',
        'SHOW_PROGRESS': 'False',
        'TIMEOUT': '20',
        'URL_ALLOWLIST': r'127\.0\.0\.1[:/].*',
        'SAVE_ARCHIVEDOTORG': 'False',
        'SAVE_TITLE': 'False',
        'SAVE_FAVICON': 'False',
        'SAVE_WARC': 'False',
        'SAVE_PDF': 'False',
        'SAVE_SCREENSHOT': 'False',
        'SAVE_DOM': 'False',
        'SAVE_SINGLEFILE': 'False',
        'SAVE_READABILITY': 'False',
        'SAVE_MERCURY': 'False',
        'SAVE_GIT': 'False',
        'SAVE_YTDLP': 'False',
        'SAVE_HEADERS': 'False',
        'SAVE_HTMLTOTEXT': 'False',
        'SAVE_WGET': 'True',
        'USE_CHROME': 'False',
    }
    env = dict(os.environ)
    env.pop('DATA_DIR', None)
    env.update(overrides)
    env.update(extra)
    return env
def get_free_port() -> int:
    """Ask the OS for a currently-unused TCP port on 127.0.0.1."""
    probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        probe.bind(('127.0.0.1', 0))
        return probe.getsockname()[1]
    finally:
        probe.close()
def start_server(cwd: Path, env: dict[str, str], port: int) -> None:
    """Launch a daemonized archivebox server on 127.0.0.1:*port* and assert startup succeeded."""
    cmd = [sys.executable, '-m', 'archivebox', 'server', '--daemonize', f'127.0.0.1:{port}']
    proc = subprocess.run(
        cmd,
        cwd=cwd,
        capture_output=True,
        text=True,
        env=env,
        timeout=60,
    )
    assert proc.returncode == 0, proc.stderr
def stop_server(cwd: Path) -> None:
    """Stop the daemonized archivebox server by shutting down its supervisord.

    Runs a small Django-bootstrapped script inside *cwd* so the supervisord
    helper resolves the pidfile/config belonging to this collection.
    """
    script = textwrap.dedent(
        """
        import os
        os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
        import django
        django.setup()
        from archivebox.workers.supervisord_util import stop_existing_supervisord_process
        stop_existing_supervisord_process()
        print('stopped')
        """
    )
    # Return value intentionally ignored: teardown is best-effort.
    run_python_cwd(script, cwd=cwd, timeout=30)
def wait_for_http(port: int, host: str, path: str = '/', timeout: int = 30) -> requests.Response:
    """Poll http://127.0.0.1:{port}{path} until a non-5xx response arrives.

    Sends the Host header explicitly so virtual-host routing is exercised.
    Raises AssertionError if *timeout* seconds pass without a usable response.
    """
    give_up_at = time.time() + timeout
    last_exc = None
    while time.time() < give_up_at:
        try:
            response = requests.get(
                f'http://127.0.0.1:{port}{path}',
                headers={'Host': host},
                timeout=2,
                allow_redirects=False,
            )
        except requests.RequestException as exc:
            last_exc = exc
        else:
            if response.status_code < 500:
                return response
        time.sleep(0.5)
    raise AssertionError(f'Timed out waiting for HTTP on {host}: {last_exc}')
def make_latest_schedule_due(cwd: Path) -> None:
    """Backdate the newest schedule's template crawl so the scheduler sees it as due now."""
    db = sqlite3.connect(cwd / 'index.sqlite3')
    try:
        db.execute(
            """
            UPDATE crawls_crawl
            SET created_at = datetime('now', '-2 day'),
                modified_at = datetime('now', '-2 day')
            WHERE id = (
                SELECT template_id
                FROM crawls_crawlschedule
                ORDER BY created_at DESC
                LIMIT 1
            )
            """
        )
        db.commit()
    finally:
        db.close()
def get_snapshot_file_text(cwd: Path, url: str) -> str:
    """Return the text of the best captured HTML/txt file for the newest sealed snapshot of *url*.

    Runs a Django-bootstrapped script inside *cwd* that locates the snapshot,
    asserts it is sealed, and prints the first matching captured file's text.
    Raises AssertionError (via the embedded asserts) if the snapshot is
    missing, not sealed yet, or has no captured files.
    """
    # NOTE: the script is an f-string — only {url!r} is interpolated here;
    # {{snapshot_dir}} is escaped so it is formatted inside the subprocess.
    script = textwrap.dedent(
        f"""
        import os
        from pathlib import Path
        os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
        import django
        django.setup()
        from archivebox.core.models import Snapshot
        snapshot = Snapshot.objects.filter(url={url!r}).order_by('-created_at').first()
        assert snapshot is not None, 'missing snapshot'
        assert snapshot.status == 'sealed', snapshot.status
        snapshot_dir = Path(snapshot.output_dir)
        candidates = []
        preferred_patterns = (
            'wget/**/index.html',
            'wget/**/*.html',
            'trafilatura/content.html',
            'trafilatura/content.txt',
            'defuddle/content.html',
            'defuddle/content.txt',
        )
        for pattern in preferred_patterns:
            for candidate in snapshot_dir.glob(pattern):
                if candidate.is_file():
                    candidates.append(candidate)
        if not candidates:
            for candidate in snapshot_dir.rglob('*'):
                if not candidate.is_file():
                    continue
                rel = candidate.relative_to(snapshot_dir)
                if rel.parts and rel.parts[0] == 'responses':
                    continue
                if candidate.suffix not in ('.html', '.htm', '.txt'):
                    continue
                if candidate.name in ('stdout.log', 'stderr.log', 'cmd.sh'):
                    continue
                candidates.append(candidate)
        assert candidates, f'no captured html/txt files found in {{snapshot_dir}}'
        print(candidates[0].read_text(errors='ignore'))
        """
    )
    stdout, stderr, code = run_python_cwd(script, cwd=cwd, timeout=60)
    assert code == 0, stderr
    return stdout
def wait_for_snapshot_capture(cwd: Path, url: str, timeout: int = 180) -> str:
    """Poll until the snapshot for *url* is sealed and its captured text can be read."""
    last_error = None
    give_up_at = time.time() + timeout
    while time.time() < give_up_at:
        try:
            captured = get_snapshot_file_text(cwd, url)
        except AssertionError as err:
            last_error = err
            time.sleep(2)
        else:
            return captured
    raise AssertionError(f'timed out waiting for captured content for {url}: {last_error}')
def get_counts(cwd: Path, scheduled_url: str, one_shot_url: str) -> tuple[int, int, int]:
    """Count snapshots per URL plus schedule-linked crawls for the scheduled URL.

    Returns (scheduled_snapshots, one_shot_snapshots, scheduled_crawls).
    """
    db = sqlite3.connect(cwd / 'index.sqlite3')
    try:
        snapshot_count_sql = "SELECT COUNT(*) FROM core_snapshot WHERE url = ?"
        scheduled_snapshots = db.execute(snapshot_count_sql, (scheduled_url,)).fetchone()[0]
        one_shot_snapshots = db.execute(snapshot_count_sql, (one_shot_url,)).fetchone()[0]
        scheduled_crawls = db.execute(
            """
            SELECT COUNT(*)
            FROM crawls_crawl
            WHERE schedule_id IS NOT NULL
              AND urls = ?
            """,
            (scheduled_url,),
        ).fetchone()[0]
        return scheduled_snapshots, one_shot_snapshots, scheduled_crawls
    finally:
        db.close()
def create_admin_and_token(cwd: Path) -> str:
    """Create (or refresh) superuser 'apitestadmin' and return a fresh 1-day API token.

    Runs a Django-bootstrapped script inside *cwd* so the user/token are
    written to this collection's database.
    """
    script = textwrap.dedent(
        """
        import os
        from datetime import timedelta
        from django.utils import timezone
        os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
        import django
        django.setup()
        from django.contrib.auth import get_user_model
        from archivebox.api.models import APIToken
        User = get_user_model()
        user, _ = User.objects.get_or_create(
            username='apitestadmin',
            defaults={
                'email': 'apitestadmin@example.com',
                'is_staff': True,
                'is_superuser': True,
            },
        )
        user.is_staff = True
        user.is_superuser = True
        user.set_password('testpass123')
        user.save()
        token = APIToken.objects.create(
            created_by=user,
            expires=timezone.now() + timedelta(days=1),
        )
        print(token.token)
        """
    )
    stdout, stderr, code = run_python_cwd(script, cwd=cwd, timeout=60)
    assert code == 0, stderr
    # The token is the last stdout line; earlier lines may be setup noise.
    return stdout.strip().splitlines()[-1]
@pytest.mark.timeout(180)
def test_server_processes_due_cli_schedule_and_saves_real_content(tmp_path, recursive_test_site):
    """End-to-end: a due CLI-created daily schedule is picked up by the running
    server and produces a sealed snapshot with real captured page content."""
    os.chdir(tmp_path)
    init_archive(tmp_path)
    port = get_free_port()
    env = build_test_env(port)
    # Register a daily schedule for the local test site via the CLI.
    schedule_result = subprocess.run(
        [sys.executable, '-m', 'archivebox', 'schedule', '--every=daily', '--depth=0', recursive_test_site['root_url']],
        cwd=tmp_path,
        capture_output=True,
        text=True,
        env=env,
        timeout=60,
    )
    assert schedule_result.returncode == 0, schedule_result.stderr
    assert 'Created scheduled crawl' in schedule_result.stdout
    # Backdate the schedule so the server's scheduler treats it as due immediately.
    make_latest_schedule_due(tmp_path)
    try:
        start_server(tmp_path, env=env, port=port)
        wait_for_http(port, host=f'web.archivebox.localhost:{port}')
        # The server should materialize the due schedule and capture the test site.
        captured_text = wait_for_snapshot_capture(tmp_path, recursive_test_site['root_url'], timeout=180)
        assert 'Root' in captured_text
        assert 'About' in captured_text
    finally:
        stop_server(tmp_path)
@pytest.mark.timeout(180)
def test_archivebox_add_remains_one_shot_even_when_schedule_is_due(tmp_path, recursive_test_site):
    """`archivebox add` must not trigger due schedules as a side effect:
    only the one-shot URL gets archived, the due schedule stays untouched."""
    os.chdir(tmp_path)
    init_archive(tmp_path)
    port = get_free_port()
    env = build_test_env(port)
    scheduled_url = recursive_test_site['root_url']
    one_shot_url = recursive_test_site['child_urls'][0]
    # Create a schedule and make it due — but never start the server/scheduler.
    schedule_result = subprocess.run(
        [sys.executable, '-m', 'archivebox', 'schedule', '--every=daily', '--depth=0', scheduled_url],
        cwd=tmp_path,
        capture_output=True,
        text=True,
        env=env,
        timeout=60,
    )
    assert schedule_result.returncode == 0, schedule_result.stderr
    make_latest_schedule_due(tmp_path)
    # Run a one-shot add; it should archive only its own URL.
    add_result = subprocess.run(
        [sys.executable, '-m', 'archivebox', 'add', '--depth=0', '--plugins=wget', one_shot_url],
        cwd=tmp_path,
        capture_output=True,
        text=True,
        env=env,
        timeout=120,
    )
    assert add_result.returncode == 0, add_result.stderr
    captured_text = wait_for_snapshot_capture(tmp_path, one_shot_url, timeout=120)
    assert 'Deep About' in captured_text or 'About' in captured_text
    scheduled_snapshots, one_shot_snapshots, scheduled_crawls = get_counts(tmp_path, scheduled_url, one_shot_url)
    assert one_shot_snapshots >= 1
    assert scheduled_snapshots == 0
    assert scheduled_crawls == 1  # template only, no materialized scheduled run
@pytest.mark.timeout(180)
def test_schedule_rest_api_works_over_running_server(tmp_path, recursive_test_site):
    """POST /api/v1/cli/schedule with a valid API token creates exactly one schedule."""
    os.chdir(tmp_path)
    init_archive(tmp_path)
    port = get_free_port()
    env = build_test_env(port)
    # The token must exist in the DB before the server starts so the request authenticates.
    api_token = create_admin_and_token(tmp_path)
    try:
        start_server(tmp_path, env=env, port=port)
        wait_for_http(port, host=f'api.archivebox.localhost:{port}', path='/api/v1/docs')
        response = requests.post(
            f'http://127.0.0.1:{port}/api/v1/cli/schedule',
            headers={
                'Host': f'api.archivebox.localhost:{port}',
                'X-ArchiveBox-API-Key': api_token,
            },
            json={
                'every': 'daily',
                'import_path': recursive_test_site['root_url'],
                'quiet': True,
            },
            timeout=10,
        )
        assert response.status_code == 200, response.text
        payload = response.json()
        assert payload['success'] is True
        assert payload['result_format'] == 'json'
        assert len(payload['result']['created_schedule_ids']) == 1
    finally:
        stop_server(tmp_path)
@pytest.mark.timeout(180)
def test_schedule_web_ui_post_works_over_running_server(tmp_path, recursive_test_site):
    """Posting the /add/ form with schedule=daily creates a CrawlSchedule and its template crawl."""
    os.chdir(tmp_path)
    init_archive(tmp_path)
    port = get_free_port()
    # PUBLIC_ADD_VIEW lets the form be posted without logging in.
    env = build_test_env(port, PUBLIC_ADD_VIEW='True')
    try:
        start_server(tmp_path, env=env, port=port)
        wait_for_http(port, host=f'web.archivebox.localhost:{port}', path='/add/')
        response = requests.post(
            f'http://127.0.0.1:{port}/add/',
            headers={'Host': f'web.archivebox.localhost:{port}'},
            data={
                'url': recursive_test_site['root_url'],
                'depth': '0',
                'schedule': 'daily',
                'tag': 'web-ui',
                'notes': 'created from web ui',
            },
            timeout=10,
            allow_redirects=False,
        )
        # A successful form post redirects rather than re-rendering the form.
        assert response.status_code in (302, 303), response.text
        # Verify the schedule and its template crawl landed in the DB as submitted.
        conn = sqlite3.connect(tmp_path / 'index.sqlite3')
        try:
            row = conn.execute(
                """
                SELECT cs.schedule, c.urls, c.tags_str
                FROM crawls_crawlschedule cs
                JOIN crawls_crawl c ON c.schedule_id = cs.id
                ORDER BY cs.created_at DESC
                LIMIT 1
                """
            ).fetchone()
        finally:
            conn.close()
        assert row == ('daily', recursive_test_site['root_url'], 'web-ui')
    finally:
        stop_server(tmp_path)

View File

@@ -3,12 +3,9 @@
import os
import subprocess
import sqlite3
import json
import pytest
from .fixtures import process, disable_extractors_dict
def test_search_returns_snapshots(tmp_path, process, disable_extractors_dict):

View File

@@ -6,13 +6,11 @@ import subprocess
import sqlite3
from archivebox.machine.models import Process
from datetime import datetime
from pathlib import Path
from urllib.parse import urlparse
import uuid
import pytest
from .fixtures import process, disable_extractors_dict
def test_snapshot_creates_snapshot_with_correct_url(tmp_path, process, disable_extractors_dict):
@@ -46,9 +44,7 @@ def test_snapshot_creates_snapshot_with_correct_url(tmp_path, process, disable_e
snapshot_id_raw, snapshot_created_at, snapshot_url, crawl_id = snapshot_row
snapshot_id = str(uuid.UUID(snapshot_id_raw))
crawl_id, crawl_created_at, crawl_urls, crawl_created_by_id = crawl_row
username = user_row[0]
crawl_date_str = datetime.fromisoformat(crawl_created_at).strftime('%Y%m%d')
snapshot_date_str = datetime.fromisoformat(snapshot_created_at).strftime('%Y%m%d')
domain = urlparse(snapshot_url).hostname or 'unknown'

Some files were not shown because too many files have changed in this diff Show More