bump package versions

This commit is contained in:
Nick Sweeting
2026-03-15 20:47:28 -07:00
parent bc21d4bfdb
commit 9de084da65
32 changed files with 469 additions and 711 deletions

View File

@@ -69,7 +69,7 @@ class ModelWithNotes(models.Model):
"""Mixin for models with a notes field."""
notes = models.TextField(blank=True, null=False, default='')
class Meta:
class Meta(TypedModelMeta):
abstract = True
@@ -78,7 +78,7 @@ class ModelWithHealthStats(models.Model):
num_uses_failed = models.PositiveIntegerField(default=0)
num_uses_succeeded = models.PositiveIntegerField(default=0)
class Meta:
class Meta(TypedModelMeta):
abstract = True
@property
@@ -96,7 +96,7 @@ class ModelWithConfig(models.Model):
"""Mixin for models with a JSON config field."""
config = models.JSONField(default=dict, null=True, blank=True, editable=True)
class Meta:
class Meta(TypedModelMeta):
abstract = True

View File

@@ -297,6 +297,7 @@ def pluginmap(
if not quiet:
# Show diagram if this model has one
if info.get('diagram'):
assert info['diagram'] is not None
prnt(Panel(
info['diagram'],
title=f'[bold green]{info["machine"]}[/bold green]',

View File

@@ -69,8 +69,8 @@ def list_processes(
for process in queryset:
if is_tty:
binary_name_str = process.binary.name if process.binary else 'unknown'
exit_code = process.returncode if process.returncode is not None else '?'
status_color = 'green' if process.returncode == 0 else 'red' if process.returncode else 'yellow'
exit_code = process.exit_code if process.exit_code is not None else '?'
status_color = 'green' if process.exit_code == 0 else 'red' if process.exit_code else 'yellow'
rprint(f'[{status_color}]exit={exit_code:3}[/{status_color}] [cyan]{binary_name_str:15}[/cyan] [dim]{process.id}[/dim]')
else:
write_record(process.to_json())

View File

@@ -208,7 +208,7 @@ def search(filter_patterns: list[str] | None=None,
else:
from archivebox.misc.logging_util import printable_folders
# Convert to dict for printable_folders
folders: dict[str, Snapshot | None] = {snapshot.output_dir: snapshot for snapshot in snapshots}
folders: dict[str, Snapshot | None] = {str(snapshot.output_dir): snapshot for snapshot in snapshots}
output = printable_folders(folders, with_headers)
# Structured exports must be written directly to stdout.

View File

@@ -11,6 +11,8 @@ import unittest
from contextlib import contextmanager
from pathlib import Path
from archivebox.config.constants import CONSTANTS
TEST_CONFIG = {
'USE_COLOR': 'False',
'SHOW_PROGRESS': 'False',
@@ -31,10 +33,9 @@ DATA_DIR = 'data.tests'
os.environ.update(TEST_CONFIG)
init = importlib.import_module('archivebox.main').init
constants = importlib.import_module('archivebox.config.constants')
SQL_INDEX_FILENAME = constants.SQL_INDEX_FILENAME
JSON_INDEX_FILENAME = constants.JSON_INDEX_FILENAME
HTML_INDEX_FILENAME = constants.HTML_INDEX_FILENAME
SQL_INDEX_FILENAME = CONSTANTS.SQL_INDEX_FILENAME
JSON_INDEX_FILENAME = CONSTANTS.JSON_INDEX_FILENAME
HTML_INDEX_FILENAME = CONSTANTS.HTML_INDEX_FILENAME
archivebox_init = importlib.import_module('archivebox.cli.archivebox_init')
archivebox_add = importlib.import_module('archivebox.cli.archivebox_add')
archivebox_remove = importlib.import_module('archivebox.cli.archivebox_remove')

View File

@@ -68,6 +68,15 @@ def require(value: T | None) -> T:
return value
class MockTTYStringIO(StringIO):
    """An in-memory text stream whose isatty() answer is fixed at construction.

    Used by tests to simulate either piped stdin (is_tty=False) or an
    interactive terminal (is_tty=True) without monkeypatching StringIO.
    """

    def __init__(self, initial_value: str = '', *, is_tty: bool):
        # Delegate buffer setup to StringIO, then remember the TTY answer.
        super().__init__(initial_value)
        self._is_tty = is_tty

    def isatty(self) -> bool:
        # Report the configured value instead of probing a real terminal.
        return self._is_tty
# =============================================================================
# JSONL Utility Tests
# =============================================================================
@@ -176,10 +185,7 @@ class TestReadArgsOrStdin(unittest.TestCase):
from archivebox.misc.jsonl import read_args_or_stdin
stdin_content = 'https://example1.com\nhttps://example2.com\n'
stream = StringIO(stdin_content)
# Mock isatty to return False (simulating piped input)
stream.isatty = lambda: False
stream = MockTTYStringIO(stdin_content, is_tty=False)
records = list(read_args_or_stdin((), stream=stream))
@@ -192,8 +198,7 @@ class TestReadArgsOrStdin(unittest.TestCase):
from archivebox.misc.jsonl import read_args_or_stdin
stdin_content = '{"type": "Snapshot", "url": "https://example.com", "tags": "test"}\n'
stream = StringIO(stdin_content)
stream.isatty = lambda: False
stream = MockTTYStringIO(stdin_content, is_tty=False)
records = list(read_args_or_stdin((), stream=stream))
@@ -206,8 +211,7 @@ class TestReadArgsOrStdin(unittest.TestCase):
from archivebox.misc.jsonl import read_args_or_stdin, TYPE_CRAWL
stdin_content = '{"type": "Crawl", "id": "abc123", "urls": "https://example.com\\nhttps://foo.com"}\n'
stream = StringIO(stdin_content)
stream.isatty = lambda: False
stream = MockTTYStringIO(stdin_content, is_tty=False)
records = list(read_args_or_stdin((), stream=stream))
@@ -219,8 +223,7 @@ class TestReadArgsOrStdin(unittest.TestCase):
"""Should not read from TTY stdin (would block)."""
from archivebox.misc.jsonl import read_args_or_stdin
stream = StringIO('https://example.com')
stream.isatty = lambda: True # Simulate TTY
stream = MockTTYStringIO('https://example.com', is_tty=True)
records = list(read_args_or_stdin((), stream=stream))
self.assertEqual(len(records), 0)
@@ -297,8 +300,7 @@ class TestSnapshotCommand(unittest.TestCase):
"""snapshot should accept Crawl JSONL as input."""
from archivebox.misc.jsonl import read_args_or_stdin, TYPE_CRAWL
stdin = StringIO('{"type": "Crawl", "id": "abc123", "urls": "https://example.com"}\n')
stdin.isatty = lambda: False
stdin = MockTTYStringIO('{"type": "Crawl", "id": "abc123", "urls": "https://example.com"}\n', is_tty=False)
records = list(read_args_or_stdin((), stream=stdin))
@@ -311,8 +313,7 @@ class TestSnapshotCommand(unittest.TestCase):
"""snapshot should accept JSONL with tags and other metadata."""
from archivebox.misc.jsonl import read_args_or_stdin
stdin = StringIO('{"type": "Snapshot", "url": "https://example.com", "tags": "tag1,tag2", "title": "Test"}\n')
stdin.isatty = lambda: False
stdin = MockTTYStringIO('{"type": "Snapshot", "url": "https://example.com", "tags": "tag1,tag2", "title": "Test"}\n', is_tty=False)
records = list(read_args_or_stdin((), stream=stdin))
@@ -353,8 +354,7 @@ class TestArchiveResultCommand(unittest.TestCase):
"""archiveresult should accept JSONL Snapshot records."""
from archivebox.misc.jsonl import read_args_or_stdin, TYPE_SNAPSHOT
stdin = StringIO('{"type": "Snapshot", "id": "abc123", "url": "https://example.com"}\n')
stdin.isatty = lambda: False
stdin = MockTTYStringIO('{"type": "Snapshot", "id": "abc123", "url": "https://example.com"}\n', is_tty=False)
records = list(read_args_or_stdin((), stream=stdin))
@@ -461,395 +461,6 @@ class TestURLCollection(unittest.TestCase):
self.assertEqual(len(urls), 0)
# =============================================================================
# Integration Tests
# =============================================================================
class TestPipingWorkflowIntegration(unittest.TestCase):
"""
Integration tests for the complete piping workflow.
These tests require Django to be set up and use the actual database.
"""
# NOTE(review): indentation appears to have been stripped by the diff/export
# tooling; the original file presumably used standard 4-space nesting — confirm
# against the repository before reusing this text as code.
@classmethod
def setUpClass(cls):
"""Set up Django and test database."""
# Isolated temp DATA_DIR per test class so archives/databases don't collide.
cls.test_dir = tempfile.mkdtemp()
os.environ['DATA_DIR'] = cls.test_dir
# Initialize Django
from archivebox.config.django import setup_django
setup_django()
# Initialize the archive
from archivebox.cli.archivebox_init import init
init()
@classmethod
def tearDownClass(cls):
"""Clean up test database."""
# ignore_errors: best-effort cleanup; a leftover temp dir must not fail the suite.
shutil.rmtree(cls.test_dir, ignore_errors=True)
def test_crawl_creates_and_outputs_jsonl(self):
"""
Test: archivebox crawl URL1 URL2 URL3
Should create a single Crawl with all URLs and output JSONL when piped.
"""
from archivebox.crawls.models import Crawl
from archivebox.misc.jsonl import TYPE_CRAWL
from archivebox.base_models.models import get_or_create_system_user_pk
created_by_id = get_or_create_system_user_pk()
# Create crawl with multiple URLs (as newline-separated string)
urls = 'https://test-crawl-1.example.com\nhttps://test-crawl-2.example.com'
crawl = require(Crawl.from_json({'urls': urls}, overrides={'created_by_id': created_by_id}))
self.assertIsNotNone(crawl.id)
self.assertEqual(crawl.urls, urls)
self.assertEqual(crawl.status, 'queued')
# Verify URLs list
urls_list = crawl.get_urls_list()
self.assertEqual(len(urls_list), 2)
self.assertIn('https://test-crawl-1.example.com', urls_list)
self.assertIn('https://test-crawl-2.example.com', urls_list)
# Verify output format
output = crawl.to_json()
self.assertEqual(output['type'], TYPE_CRAWL)
self.assertIn('id', output)
self.assertEqual(output['urls'], urls)
self.assertIn('schema_version', output)
def test_snapshot_accepts_crawl_jsonl(self):
"""
Test: archivebox crawl URL | archivebox snapshot
Snapshot should accept Crawl JSONL and create Snapshots for each URL.
"""
from archivebox.crawls.models import Crawl
from archivebox.core.models import Snapshot
from archivebox.misc.jsonl import (
read_args_or_stdin,
TYPE_CRAWL, TYPE_SNAPSHOT
)
from archivebox.base_models.models import get_or_create_system_user_pk
created_by_id = get_or_create_system_user_pk()
# Step 1: Create crawl (simulating 'archivebox crawl')
urls = 'https://crawl-to-snap-1.example.com\nhttps://crawl-to-snap-2.example.com'
crawl = require(Crawl.from_json({'urls': urls}, overrides={'created_by_id': created_by_id}))
crawl_output = crawl.to_json()
# Step 2: Parse crawl output as snapshot input
stdin = StringIO(json.dumps(crawl_output) + '\n')
# NOTE(review): monkeypatching isatty here; the surrounding commit replaces
# this pattern with MockTTYStringIO elsewhere — these tests are being deleted
# by the commit, so presumably no update is needed; verify.
stdin.isatty = lambda: False
records = list(read_args_or_stdin((), stream=stdin))
self.assertEqual(len(records), 1)
self.assertEqual(records[0]['type'], TYPE_CRAWL)
# Step 3: Create snapshots from crawl URLs
created_snapshots = []
for url in crawl.get_urls_list():
snapshot = Snapshot.from_json({'url': url}, overrides={'created_by_id': created_by_id})
if snapshot:
created_snapshots.append(snapshot)
self.assertEqual(len(created_snapshots), 2)
# Verify snapshot output
for snapshot in created_snapshots:
output = snapshot.to_json()
self.assertEqual(output['type'], TYPE_SNAPSHOT)
self.assertIn(output['url'], [
'https://crawl-to-snap-1.example.com',
'https://crawl-to-snap-2.example.com'
])
def test_snapshot_creates_and_outputs_jsonl(self):
"""
Test: archivebox snapshot URL
Should create a Snapshot and output JSONL when piped.
"""
from archivebox.core.models import Snapshot
from archivebox.misc.jsonl import (
read_args_or_stdin, TYPE_SNAPSHOT
)
from archivebox.base_models.models import get_or_create_system_user_pk
created_by_id = get_or_create_system_user_pk()
# Simulate input
url = 'https://test-snapshot-1.example.com'
records = list(read_args_or_stdin((url,)))
self.assertEqual(len(records), 1)
self.assertEqual(records[0]['url'], url)
# Create snapshot
overrides = {'created_by_id': created_by_id}
snapshot = require(Snapshot.from_json(records[0], overrides=overrides))
self.assertIsNotNone(snapshot.id)
self.assertEqual(snapshot.url, url)
# Verify output format
output = snapshot.to_json()
self.assertEqual(output['type'], TYPE_SNAPSHOT)
self.assertIn('id', output)
self.assertEqual(output['url'], url)
def test_extract_accepts_snapshot_from_previous_command(self):
"""
Test: archivebox snapshot URL | archivebox extract
Extract should accept JSONL output from snapshot command.
"""
from archivebox.core.models import Snapshot
from archivebox.misc.jsonl import (
read_args_or_stdin,
TYPE_SNAPSHOT
)
from archivebox.base_models.models import get_or_create_system_user_pk
created_by_id = get_or_create_system_user_pk()
# Step 1: Create snapshot (simulating 'archivebox snapshot')
url = 'https://test-extract-1.example.com'
overrides = {'created_by_id': created_by_id}
snapshot = require(Snapshot.from_json({'url': url}, overrides=overrides))
snapshot_output = snapshot.to_json()
# Step 2: Parse snapshot output as extract input
stdin = StringIO(json.dumps(snapshot_output) + '\n')
stdin.isatty = lambda: False
records = list(read_args_or_stdin((), stream=stdin))
self.assertEqual(len(records), 1)
self.assertEqual(records[0]['type'], TYPE_SNAPSHOT)
self.assertEqual(records[0]['id'], str(snapshot.id))
# Step 3: Gather snapshot IDs (as extract does)
snapshot_ids = set()
for record in records:
if record.get('type') == TYPE_SNAPSHOT and record.get('id'):
snapshot_ids.add(record['id'])
self.assertIn(str(snapshot.id), snapshot_ids)
def test_full_pipeline_crawl_snapshot_extract(self):
"""
Test: archivebox crawl URL | archivebox snapshot | archivebox extract
This is equivalent to: archivebox add --depth=0 URL
"""
from archivebox.crawls.models import Crawl
from archivebox.core.models import Snapshot
from archivebox.misc.jsonl import (
read_args_or_stdin,
TYPE_CRAWL, TYPE_SNAPSHOT
)
from archivebox.base_models.models import get_or_create_system_user_pk
created_by_id = get_or_create_system_user_pk()
# === archivebox crawl https://example.com ===
url = 'https://test-pipeline-full.example.com'
crawl = require(Crawl.from_json({'url': url}, overrides={'created_by_id': created_by_id}))
crawl_jsonl = json.dumps(crawl.to_json())
# === | archivebox snapshot ===
stdin = StringIO(crawl_jsonl + '\n')
stdin.isatty = lambda: False
records = list(read_args_or_stdin((), stream=stdin))
self.assertEqual(len(records), 1)
self.assertEqual(records[0]['type'], TYPE_CRAWL)
# Create snapshots from crawl
created_snapshots = []
for record in records:
if record.get('type') == TYPE_CRAWL:
crawl_id = record.get('id')
if crawl_id:
# Re-fetch from the DB to mirror what the real snapshot command does.
db_crawl = Crawl.objects.get(id=crawl_id)
for crawl_url in db_crawl.get_urls_list():
snapshot = Snapshot.from_json({'url': crawl_url}, overrides={'created_by_id': created_by_id})
if snapshot:
created_snapshots.append(snapshot)
self.assertEqual(len(created_snapshots), 1)
self.assertEqual(created_snapshots[0].url, url)
# === | archivebox extract ===
snapshot_jsonl_lines = [json.dumps(s.to_json()) for s in created_snapshots]
stdin = StringIO('\n'.join(snapshot_jsonl_lines) + '\n')
stdin.isatty = lambda: False
records = list(read_args_or_stdin((), stream=stdin))
self.assertEqual(len(records), 1)
self.assertEqual(records[0]['type'], TYPE_SNAPSHOT)
self.assertEqual(records[0]['id'], str(created_snapshots[0].id))
class TestDepthWorkflows(unittest.TestCase):
"""Test various depth crawl workflows."""
# NOTE(review): indentation appears stripped by the diff/export tooling;
# confirm against the repository before treating this text as runnable code.
@classmethod
def setUpClass(cls):
"""Set up Django and test database."""
# Fresh temp DATA_DIR so this class's archive state is isolated.
cls.test_dir = tempfile.mkdtemp()
os.environ['DATA_DIR'] = cls.test_dir
from archivebox.config.django import setup_django
setup_django()
from archivebox.cli.archivebox_init import init
init()
@classmethod
def tearDownClass(cls):
"""Clean up test database."""
shutil.rmtree(cls.test_dir, ignore_errors=True)
def test_depth_0_workflow(self):
"""
Test: archivebox crawl URL | archivebox snapshot | archivebox extract
Depth 0: Only archive the specified URL, no recursive crawling.
"""
from archivebox.crawls.models import Crawl
from archivebox.core.models import Snapshot
from archivebox.base_models.models import get_or_create_system_user_pk
created_by_id = get_or_create_system_user_pk()
# Create crawl with depth 0
url = 'https://depth0-test.example.com'
crawl = require(Crawl.from_json({'url': url, 'max_depth': 0}, overrides={'created_by_id': created_by_id}))
self.assertEqual(crawl.max_depth, 0)
# Create snapshot
snapshot = require(Snapshot.from_json({'url': url}, overrides={'created_by_id': created_by_id}))
self.assertEqual(snapshot.url, url)
def test_depth_metadata_in_crawl(self):
"""Test that depth metadata is stored in Crawl."""
from archivebox.crawls.models import Crawl
from archivebox.base_models.models import get_or_create_system_user_pk
created_by_id = get_or_create_system_user_pk()
# Create crawl with depth
crawl = require(Crawl.from_json(
{'url': 'https://depth-meta-test.example.com', 'max_depth': 2},
overrides={'created_by_id': created_by_id}
))
self.assertEqual(crawl.max_depth, 2)
# Verify in JSONL output — max_depth must round-trip through to_json().
output = crawl.to_json()
self.assertEqual(output['max_depth'], 2)
class TestParserPluginWorkflows(unittest.TestCase):
"""Test workflows with specific parser plugins."""
# NOTE(review): indentation appears stripped by the diff/export tooling;
# confirm against the repository before treating this text as runnable code.
@classmethod
def setUpClass(cls):
"""Set up Django and test database."""
cls.test_dir = tempfile.mkdtemp()
os.environ['DATA_DIR'] = cls.test_dir
from archivebox.config.django import setup_django
setup_django()
from archivebox.cli.archivebox_init import init
init()
@classmethod
def tearDownClass(cls):
"""Clean up test database."""
shutil.rmtree(cls.test_dir, ignore_errors=True)
def test_html_parser_workflow(self):
"""
Test: archivebox crawl --plugin=parse_html_urls URL | archivebox snapshot | archivebox extract
"""
from archivebox.hooks import collect_urls_from_plugins
# Create mock output directory
# Layout mirrors a real snapshot dir: <snapshot>/<plugin>/urls.jsonl
snapshot_dir = Path(self.test_dir) / 'archive' / 'html-parser-test'
snapshot_dir.mkdir(parents=True, exist_ok=True)
(snapshot_dir / 'parse_html_urls').mkdir(exist_ok=True)
(snapshot_dir / 'parse_html_urls' / 'urls.jsonl').write_text(
'{"url": "https://html-discovered.com", "title": "HTML Link"}\n'
)
# Collect URLs
discovered = collect_urls_from_plugins(snapshot_dir)
self.assertEqual(len(discovered), 1)
self.assertEqual(discovered[0]['url'], 'https://html-discovered.com')
# Each discovered record is tagged with the plugin that produced it.
self.assertEqual(discovered[0]['plugin'], 'parse_html_urls')
def test_rss_parser_workflow(self):
"""
Test: archivebox crawl --plugin=parse_rss_urls URL | archivebox snapshot | archivebox extract
"""
from archivebox.hooks import collect_urls_from_plugins
# Create mock output directory
snapshot_dir = Path(self.test_dir) / 'archive' / 'rss-parser-test'
snapshot_dir.mkdir(parents=True, exist_ok=True)
(snapshot_dir / 'parse_rss_urls').mkdir(exist_ok=True)
(snapshot_dir / 'parse_rss_urls' / 'urls.jsonl').write_text(
'{"url": "https://rss-item-1.com", "title": "RSS Item 1"}\n'
'{"url": "https://rss-item-2.com", "title": "RSS Item 2"}\n'
)
# Collect URLs
discovered = collect_urls_from_plugins(snapshot_dir)
self.assertEqual(len(discovered), 2)
self.assertTrue(all(d['plugin'] == 'parse_rss_urls' for d in discovered))
def test_multiple_parsers_dedupe(self):
"""
Multiple parsers may discover the same URL - should be deduplicated.
"""
from archivebox.hooks import collect_urls_from_plugins
# Create mock output with duplicate URLs from different parsers
snapshot_dir = Path(self.test_dir) / 'archive' / 'dedupe-test'
snapshot_dir.mkdir(parents=True, exist_ok=True)
(snapshot_dir / 'parse_html_urls').mkdir(exist_ok=True)
(snapshot_dir / 'parse_html_urls' / 'urls.jsonl').write_text(
'{"url": "https://same-url.com"}\n'
)
(snapshot_dir / 'wget').mkdir(exist_ok=True)
(snapshot_dir / 'wget' / 'urls.jsonl').write_text(
'{"url": "https://same-url.com"}\n' # Same URL, different extractor
)
# Collect URLs
all_discovered = collect_urls_from_plugins(snapshot_dir)
# Both entries are returned (deduplication happens at the crawl command level)
self.assertEqual(len(all_discovered), 2)
# Verify both extractors found the same URL
urls = {d['url'] for d in all_discovered}
self.assertEqual(urls, {'https://same-url.com'})
class TestEdgeCases(unittest.TestCase):
"""Test edge cases and error handling."""
@@ -858,8 +469,7 @@ class TestEdgeCases(unittest.TestCase):
from archivebox.misc.jsonl import read_args_or_stdin
# Empty args, TTY stdin (should not block)
stdin = StringIO('')
stdin.isatty = lambda: True
stdin = MockTTYStringIO('', is_tty=True)
records = list(read_args_or_stdin((), stream=stdin))
self.assertEqual(len(records), 0)
@@ -868,12 +478,12 @@ class TestEdgeCases(unittest.TestCase):
"""Should skip malformed JSONL lines."""
from archivebox.misc.jsonl import read_args_or_stdin
stdin = StringIO(
stdin = MockTTYStringIO(
'{"url": "https://good.com"}\n'
'not valid json\n'
'{"url": "https://also-good.com"}\n'
'{"url": "https://also-good.com"}\n',
is_tty=False,
)
stdin.isatty = lambda: False
records = list(read_args_or_stdin((), stream=stdin))
@@ -885,12 +495,12 @@ class TestEdgeCases(unittest.TestCase):
"""Should handle mixed URLs and JSONL."""
from archivebox.misc.jsonl import read_args_or_stdin
stdin = StringIO(
stdin = MockTTYStringIO(
'https://plain-url.com\n'
'{"type": "Snapshot", "url": "https://jsonl-url.com", "tags": "test"}\n'
'01234567-89ab-cdef-0123-456789abcdef\n' # UUID
'01234567-89ab-cdef-0123-456789abcdef\n', # UUID
is_tty=False,
)
stdin.isatty = lambda: False
records = list(read_args_or_stdin((), stream=stdin))
@@ -942,12 +552,12 @@ class TestPassThroughBehavior(unittest.TestCase):
url_record = {'url': 'https://example.com'}
# Mock stdin with both records
stdin = StringIO(
stdin = MockTTYStringIO(
json.dumps(tag_record)
+ '\n'
+ json.dumps(url_record)
+ json.dumps(url_record),
is_tty=False,
)
stdin.isatty = lambda: False
# The Tag should be passed through, the URL should create a Crawl
# (This is a unit test of the pass-through logic)

View File

@@ -5,6 +5,7 @@ import pwd
import sys
import socket
import platform
from typing import cast
from rich import print
@@ -32,7 +33,7 @@ EGID = os.getegid()
SUDO_UID = int(os.environ.get('SUDO_UID', 0))
SUDO_GID = int(os.environ.get('SUDO_GID', 0))
USER: str = Path('~').expanduser().resolve().name
HOSTNAME: str = max([socket.gethostname(), platform.node()], key=len)
HOSTNAME: str = cast(str, max([socket.gethostname(), platform.node()], key=len))
IS_ROOT = RUNNING_AS_UID == 0
IN_DOCKER = os.environ.get('IN_DOCKER', False) in ('1', 'true', 'True', 'TRUE', 'yes')

View File

@@ -33,6 +33,11 @@ def is_superuser(request: HttpRequest) -> bool:
return bool(getattr(request.user, 'is_superuser', False))
def format_parsed_datetime(value: object) -> str:
# Format any parse_date()-able value as "YYYY-MM-DD HH:MM:SS".
# Returns "" when parse_date() yields a falsy result (e.g. unparseable input);
# callers use this for supervisor timestamps and file mtimes — see the
# worker/log views below that the surrounding diff rewires to call this helper.
parsed = parse_date(value)
return parsed.strftime("%Y-%m-%d %H:%M:%S") if parsed else ""
def obj_to_yaml(obj: Any, indent: int = 0) -> str:
indent_str = " " * indent
if indent == 0:
@@ -412,7 +417,7 @@ def worker_list_view(request: HttpRequest, **kwargs) -> TableContext:
rows["Name"].append(ItemLink(proc_name, key=proc_name))
rows["State"].append(str(proc_data.get("statename") or ""))
rows['PID'].append(proc_description.replace('pid ', ''))
rows["Started"].append(parse_date(proc_start).strftime("%Y-%m-%d %H:%M:%S") if proc_start else '')
rows["Started"].append(format_parsed_datetime(proc_start))
rows["Command"].append(str(proc_config.get("command") or ""))
rows["Logfile"].append(
format_html(
@@ -458,7 +463,8 @@ def worker_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
relevant_config = CONFIG_FILE.read_text()
relevant_logs = str(supervisor.readLog(0, 10_000_000))
start_ts = [line for line in relevant_logs.split("\n") if "RPC interface 'supervisor' initialized" in line][-1].split(",", 1)[0]
uptime = str(timezone.now() - parse_date(start_ts)).split(".")[0]
start_dt = parse_date(start_ts)
uptime = str(timezone.now() - start_dt).split(".")[0] if start_dt else ""
supervisor_state = supervisor.getState()
proc: Dict[str, object] = {
@@ -485,8 +491,8 @@ def worker_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
"Command": str(proc.get("name") or ""),
"PID": str(proc.get("pid") or ""),
"State": str(proc.get("statename") or ""),
"Started": parse_date(proc.get("start")).strftime("%Y-%m-%d %H:%M:%S") if proc.get("start") else "",
"Stopped": parse_date(proc.get("stop")).strftime("%Y-%m-%d %H:%M:%S") if proc.get("stop") else "",
"Started": format_parsed_datetime(proc.get("start")),
"Stopped": format_parsed_datetime(proc.get("stop")),
"Exit Status": str(proc.get("exitstatus") or ""),
"Logfile": str(proc.get("stdout_logfile") or ""),
"Uptime": str(str(proc.get("description") or "").split("uptime ", 1)[-1]),
@@ -524,7 +530,7 @@ def log_list_view(request: HttpRequest, **kwargs) -> TableContext:
for logfile in log_files:
st = logfile.stat()
rows["Name"].append(ItemLink("logs" + str(logfile).rsplit("/logs", 1)[-1], key=logfile.name))
rows["Last Updated"].append(parse_date(st.st_mtime).strftime("%Y-%m-%d %H:%M:%S"))
rows["Last Updated"].append(format_parsed_datetime(st.st_mtime))
rows["Size"].append(f'{st.st_size//1000} kb')
with open(logfile, 'rb') as f:
@@ -557,7 +563,7 @@ def log_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
"fields": {
"Path": str(log_file),
"Size": f"{log_stat.st_size//1000} kb",
"Last Updated": parse_date(log_stat.st_mtime).strftime("%Y-%m-%d %H:%M:%S"),
"Last Updated": format_parsed_datetime(log_stat.st_mtime),
"Tail": "\n".join(log_text[-10_000:].split("\n")[-20:]),
"Full Log": log_text,
},

View File

@@ -1,7 +1,20 @@
__package__ = 'archivebox.core'
from typing import TYPE_CHECKING, Any
from django.contrib import admin
from admin_data_views.admin import get_app_list, admin_data_index_view, get_admin_data_urls, get_urls
from admin_data_views.admin import (
admin_data_index_view as adv_admin_data_index_view,
get_admin_data_urls as adv_get_admin_data_urls,
get_app_list as adv_get_app_list,
)
if TYPE_CHECKING:
from django.http import HttpRequest
from django.template.response import TemplateResponse
from django.urls import URLPattern, URLResolver
from admin_data_views.typing import AppDict
class ArchiveBoxAdmin(admin.AdminSite):
@@ -10,6 +23,20 @@ class ArchiveBoxAdmin(admin.AdminSite):
site_title = 'Admin'
namespace = 'admin'
def get_app_list(self, request: 'HttpRequest', app_label: str | None = None) -> list['AppDict']:
if app_label is None:
return adv_get_app_list(self, request)
return adv_get_app_list(self, request, app_label)
def admin_data_index_view(self, request: 'HttpRequest', **kwargs: Any) -> 'TemplateResponse':
return adv_admin_data_index_view(self, request, **kwargs)
def get_admin_data_urls(self) -> list['URLResolver | URLPattern']:
return adv_get_admin_data_urls(self)
def get_urls(self) -> list['URLResolver | URLPattern']:
return self.get_admin_data_urls() + super().get_urls()
archivebox_admin = ArchiveBoxAdmin()
# Note: delete_selected is enabled per-model via actions = ['delete_selected'] in each ModelAdmin
@@ -17,13 +44,6 @@ archivebox_admin = ArchiveBoxAdmin()
# patch admin with methods to add data views (implemented by admin_data_views package)
# https://github.com/MrThearMan/django-admin-data-views
# https://mrthearman.github.io/django-admin-data-views/setup/
archivebox_admin.get_app_list = get_app_list.__get__(archivebox_admin, ArchiveBoxAdmin)
archivebox_admin.admin_data_index_view = admin_data_index_view.__get__(archivebox_admin, ArchiveBoxAdmin) # type: ignore
archivebox_admin.get_admin_data_urls = get_admin_data_urls.__get__(archivebox_admin, ArchiveBoxAdmin) # type: ignore
archivebox_admin.get_urls = get_urls(archivebox_admin.get_urls).__get__(archivebox_admin, ArchiveBoxAdmin)
############### Admin Data View sections are defined in settings.ADMIN_DATA_VIEWS #########

View File

@@ -1,9 +1,9 @@
__package__ = 'archivebox.core'
from typing import Optional, Dict, Iterable, Any, List
from typing import Optional, Dict, Iterable, Any, List, Sequence, cast
import uuid
from archivebox.uuid_compat import uuid7
from datetime import datetime, timedelta
from django_stubs_ext.db.models import TypedModelMeta
import os
import json
@@ -20,6 +20,7 @@ from django.core.cache import cache
from django.urls import reverse_lazy
from django.contrib import admin
from django.conf import settings
from django.utils.safestring import mark_safe
from archivebox.config import CONSTANTS
from archivebox.misc.system import get_dir_size, atomic_write
@@ -51,7 +52,7 @@ class Tag(ModelWithUUID):
snapshot_set: models.Manager['Snapshot']
class Meta(TypedModelMeta):
class Meta(ModelWithUUID.Meta):
app_label = 'core'
verbose_name = "Tag"
verbose_name_plural = "Tags"
@@ -88,7 +89,7 @@ class Tag(ModelWithUUID):
@property
def api_url(self) -> str:
return reverse_lazy('api-1:get_tag', args=[self.id])
return str(reverse_lazy('api-1:get_tag', args=[self.id]))
def to_json(self) -> dict:
"""
@@ -104,7 +105,7 @@ class Tag(ModelWithUUID):
}
@staticmethod
def from_json(record: Dict[str, Any], overrides: Dict[str, Any] = None):
def from_json(record: Dict[str, Any], overrides: Dict[str, Any] | None = None):
"""
Create/update Tag from JSON dict.
@@ -259,7 +260,7 @@ class SnapshotQuerySet(models.QuerySet):
})
class SnapshotManager(models.Manager.from_queryset(SnapshotQuerySet)):
class SnapshotManager(models.Manager.from_queryset(SnapshotQuerySet)): # ty: ignore[unsupported-base]
"""Manager for Snapshot model - uses SnapshotQuerySet for chainable methods"""
def filter(self, *args, **kwargs):
@@ -283,8 +284,8 @@ class SnapshotManager(models.Manager.from_queryset(SnapshotQuerySet)):
from django.db import transaction
if atomic:
with transaction.atomic():
return self.delete()
return self.delete()
return self.get_queryset().delete()
return self.get_queryset().delete()
class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine):
@@ -318,10 +319,20 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
StatusChoices = ModelWithStateMachine.StatusChoices
active_state = StatusChoices.STARTED
crawl_id: uuid.UUID
parent_snapshot_id: uuid.UUID | None
_prefetched_objects_cache: dict[str, Any]
objects = SnapshotManager()
archiveresult_set: models.Manager['ArchiveResult']
class Meta(TypedModelMeta):
class Meta(
ModelWithOutputDir.Meta,
ModelWithConfig.Meta,
ModelWithNotes.Meta,
ModelWithHealthStats.Meta,
ModelWithStateMachine.Meta,
):
app_label = 'core'
verbose_name = "Snapshot"
verbose_name_plural = "Snapshots"
@@ -663,6 +674,8 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
candidates = cls.objects.filter(url=url, timestamp__startswith=timestamp)
if candidates.count() == 1:
snapshot = candidates.first()
if snapshot is None:
return None
print(f"[DEBUG load_from_directory] Found via fuzzy match: {snapshot.timestamp}")
return snapshot
elif candidates.count() > 1:
@@ -751,14 +764,16 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
)
@staticmethod
def _select_best_timestamp(index_timestamp: str, folder_name: str) -> Optional[str]:
def _select_best_timestamp(index_timestamp: object | None, folder_name: str) -> Optional[str]:
"""
Select best timestamp from index.json vs folder name.
Validates range (1995-2035).
Prefers index.json if valid.
"""
def is_valid_timestamp(ts):
def is_valid_timestamp(ts: object | None) -> bool:
if not isinstance(ts, (str, int, float)):
return False
try:
ts_int = int(float(ts))
# 1995-01-01 to 2035-12-31
@@ -769,12 +784,11 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
index_valid = is_valid_timestamp(index_timestamp) if index_timestamp else False
folder_valid = is_valid_timestamp(folder_name)
if index_valid:
return str(int(float(index_timestamp)))
elif folder_valid:
return str(int(float(folder_name)))
else:
return None
if index_valid and index_timestamp is not None:
return str(int(float(str(index_timestamp))))
if folder_valid:
return str(int(float(str(folder_name))))
return None
@classmethod
def _ensure_unique_timestamp(cls, url: str, timestamp: str) -> str:
@@ -1039,7 +1053,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
)
index_path = Path(self.output_dir) / CONSTANTS.JSONL_INDEX_FILENAME
result = {
result: dict[str, Any] = {
'snapshot': None,
'archive_results': [],
'binaries': [],
@@ -1210,7 +1224,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
return merged
@classmethod
def _merge_snapshots(cls, snapshots: list['Snapshot']):
def _merge_snapshots(cls, snapshots: Sequence['Snapshot']):
"""
Merge exact duplicates.
Keep oldest, union files + ArchiveResults.
@@ -1271,19 +1285,21 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
@admin.display(description='Tags')
def tags_str(self, nocache=True) -> str | None:
calc_tags_str = lambda: ','.join(sorted(tag.name for tag in self.tags.all()))
if hasattr(self, '_prefetched_objects_cache') and 'tags' in self._prefetched_objects_cache:
prefetched_cache = getattr(self, '_prefetched_objects_cache', {})
if 'tags' in prefetched_cache:
return calc_tags_str()
cache_key = f'{self.pk}-tags'
return cache.get_or_set(cache_key, calc_tags_str) if not nocache else calc_tags_str()
def icons(self, path: Optional[str] = None) -> str:
"""Generate HTML icons showing which extractor plugins have succeeded for this snapshot"""
from django.utils.html import format_html, mark_safe
from django.utils.html import format_html
cache_key = f'result_icons:{self.pk}:{(self.downloaded_at or self.modified_at or self.created_at or self.bookmarked_at).timestamp()}'
def calc_icons():
if hasattr(self, '_prefetched_objects_cache') and 'archiveresult_set' in self._prefetched_objects_cache:
prefetched_cache = getattr(self, '_prefetched_objects_cache', {})
if 'archiveresult_set' in prefetched_cache:
archive_results = {r.plugin: r for r in self.archiveresult_set.all() if r.status == "succeeded" and (r.output_files or r.output_str)}
else:
# Filter for results that have either output_files or output_str
@@ -1331,7 +1347,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
@property
def api_url(self) -> str:
return reverse_lazy('api-1:get_snapshot', args=[self.id])
return str(reverse_lazy('api-1:get_snapshot', args=[self.id]))
def get_absolute_url(self):
return f'/{self.archive_path}'
@@ -1341,23 +1357,28 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
return url_domain(self.url)
@property
def output_dir(self):
def title_stripped(self) -> str:
return (self.title or '').strip()
@property
def output_dir(self) -> Path:
"""The filesystem path to the snapshot's output directory."""
import os
current_path = self.get_storage_path_for_version(self.fs_version)
if current_path.exists():
return str(current_path)
return current_path
# Check for backwards-compat symlink
old_path = CONSTANTS.ARCHIVE_DIR / self.timestamp
if old_path.is_symlink():
return str(Path(os.readlink(old_path)).resolve())
link_target = Path(os.readlink(old_path))
return (old_path.parent / link_target).resolve() if not link_target.is_absolute() else link_target.resolve()
elif old_path.exists():
return str(old_path)
return old_path
return str(current_path)
return current_path
def ensure_legacy_archive_symlink(self) -> None:
"""Ensure the legacy archive/<timestamp> path resolves to this snapshot."""
@@ -1405,7 +1426,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
date_base = crawl.created_at or self.created_at or timezone.now()
date_str = date_base.strftime('%Y%m%d')
domain = self.extract_domain_from_url(self.url)
username = crawl.created_by.username if crawl.created_by_id else 'system'
username = crawl.created_by.username if getattr(crawl, 'created_by_id', None) else 'system'
crawl_dir = DATA_DIR / 'users' / username / 'crawls' / date_str / domain / str(crawl.id)
link_path = crawl_dir / 'snapshots' / domain / str(self.id)
@@ -1591,7 +1612,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
}
@staticmethod
def from_json(record: Dict[str, Any], overrides: Dict[str, Any] = None, queue_for_extraction: bool = True):
def from_json(record: Dict[str, Any], overrides: Dict[str, Any] | None = None, queue_for_extraction: bool = True):
"""
Create/update Snapshot from JSON dict.
@@ -1859,7 +1880,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
'is_sealed': is_sealed,
}
def retry_failed_archiveresults(self, retry_at: Optional['timezone.datetime'] = None) -> int:
def retry_failed_archiveresults(self, retry_at: Optional[datetime] = None) -> int:
"""
Reset failed/skipped ArchiveResults to queued for retry.
@@ -2163,20 +2184,20 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
cols = cols or ['timestamp', 'is_archived', 'url']
return separator.join(to_json(data.get(col, ''), indent=None).ljust(ljust) for col in cols)
def write_json_details(self, out_dir: Optional[str] = None) -> None:
def write_json_details(self, out_dir: Path | str | None = None) -> None:
"""Write JSON index file for this snapshot to its output directory"""
out_dir = out_dir or self.output_dir
path = Path(out_dir) / CONSTANTS.JSON_INDEX_FILENAME
output_dir = Path(out_dir) if out_dir is not None else self.output_dir
path = output_dir / CONSTANTS.JSON_INDEX_FILENAME
atomic_write(str(path), self.to_dict(extended=True))
def write_html_details(self, out_dir: Optional[str] = None) -> None:
def write_html_details(self, out_dir: Path | str | None = None) -> None:
"""Write HTML detail page for this snapshot to its output directory"""
from django.template.loader import render_to_string
from archivebox.config.common import SERVER_CONFIG
from archivebox.config.configset import get_config
from archivebox.misc.logging_util import printable_filesize
out_dir = out_dir or self.output_dir
output_dir = Path(out_dir) if out_dir is not None else self.output_dir
config = get_config()
SAVE_ARCHIVE_DOT_ORG = config.get('SAVE_ARCHIVE_DOT_ORG', True)
TITLE_LOADING_MSG = 'Not yet archived...'
@@ -2198,12 +2219,12 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
for plugin in preview_priority:
out = outputs_by_plugin.get(plugin)
if out and out.get('path'):
best_preview_path = out['path']
best_preview_path = str(out['path'])
best_result = out
break
if best_preview_path == 'about:blank' and outputs:
best_preview_path = outputs[0].get('path') or 'about:blank'
best_preview_path = str(outputs[0].get('path') or 'about:blank')
best_result = outputs[0]
context = {
**self.to_dict(extended=True),
@@ -2223,7 +2244,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
'archiveresults': outputs,
}
rendered_html = render_to_string('snapshot.html', context)
atomic_write(str(Path(out_dir) / CONSTANTS.HTML_INDEX_FILENAME), rendered_html)
atomic_write(str(output_dir / CONSTANTS.HTML_INDEX_FILENAME), rendered_html)
# =========================================================================
# Helper Methods
@@ -2285,6 +2306,8 @@ class SnapshotMachine(BaseStateMachine):
# Manual event (can also be triggered by last ArchiveResult finishing)
seal = started.to(sealed)
snapshot: Snapshot
def can_start(self) -> bool:
can_start = bool(self.snapshot.url)
return can_start
@@ -2332,7 +2355,7 @@ class SnapshotMachine(BaseStateMachine):
if remaining_active == 0 and crawl.status == crawl.StatusChoices.STARTED:
print(f'[cyan]🔒 All snapshots sealed for crawl {crawl.id}, sealing crawl[/cyan]', file=sys.stderr)
# Seal the parent crawl
crawl.sm.seal()
cast(Any, crawl).sm.seal()
class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithStateMachine):
@@ -2391,7 +2414,15 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
state_field_name = 'status'
active_state = StatusChoices.STARTED
class Meta(TypedModelMeta):
snapshot_id: uuid.UUID
process_id: uuid.UUID | None
class Meta(
ModelWithOutputDir.Meta,
ModelWithConfig.Meta,
ModelWithNotes.Meta,
ModelWithStateMachine.Meta,
):
app_label = 'core'
verbose_name = 'Archive Result'
verbose_name_plural = 'Archive Results Log'
@@ -2442,7 +2473,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
return record
@staticmethod
def from_json(record: Dict[str, Any], overrides: Dict[str, Any] = None):
def from_json(record: Dict[str, Any], overrides: Dict[str, Any] | None = None):
"""
Create/update ArchiveResult from JSON dict.
@@ -2469,7 +2500,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
# Get or create by snapshot_id + plugin
try:
from archivebox.core.models import Snapshot
snapshot = Snapshot.objects.get(id=snapshot_id)
result, _ = ArchiveResult.objects.get_or_create(
@@ -2531,7 +2561,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
@property
def api_url(self) -> str:
return reverse_lazy('api-1:get_archiveresult', args=[self.id])
return str(reverse_lazy('api-1:get_archiveresult', args=[self.id]))
def get_absolute_url(self):
return f'/{self.snapshot.archive_path}/{self.plugin}'
@@ -3198,6 +3228,8 @@ class ArchiveResultMachine(BaseStateMachine):
# Reason: backoff should always retry→started, then started→final states
)
archiveresult: ArchiveResult
def can_start(self) -> bool:
"""Pure function - check if AR can start (has valid URL)."""
return bool(self.archiveresult.snapshot.url)
@@ -3259,7 +3291,7 @@ class ArchiveResultMachine(BaseStateMachine):
process = self.archiveresult.process
# If process is NOT running anymore, reap the background hook
if not process.is_running():
if not process.is_running:
self.archiveresult.update_from_output()
# Check if now in final state after reaping
return self.archiveresult.status in (
@@ -3331,7 +3363,7 @@ class ArchiveResultMachine(BaseStateMachine):
if remaining_active == 0:
print(f'[cyan] 🔒 All archiveresults finished for snapshot {snapshot.url}, sealing snapshot[/cyan]', file=sys.stderr)
# Seal the parent snapshot
snapshot.sm.seal()
cast(Any, snapshot).sm.seal()
@succeeded.enter
def enter_succeeded(self):

View File

@@ -3,6 +3,8 @@ __package__ = "archivebox.core"
import os
import sys
import inspect
import importlib
from typing import Any, cast
from pathlib import Path
@@ -119,8 +121,8 @@ try:
try:
# Try to import django-auth-ldap (will fail if not installed)
from django_auth_ldap.config import LDAPSearch
import ldap
LDAPSearch = importlib.import_module("django_auth_ldap.config").LDAPSearch
ldap = importlib.import_module("ldap")
# Configure LDAP authentication
AUTH_LDAP_SERVER_URI = LDAP_CONFIG.LDAP_SERVER_URI
@@ -130,7 +132,7 @@ try:
# Configure user search
AUTH_LDAP_USER_SEARCH = LDAPSearch(
LDAP_CONFIG.LDAP_USER_BASE,
ldap.SCOPE_SUBTREE,
getattr(ldap, "SCOPE_SUBTREE", 2),
LDAP_CONFIG.LDAP_USER_FILTER,
)
@@ -432,7 +434,7 @@ LOGGING = SETTINGS_LOGGING
# Add default webhook configuration to the User model
SIGNAL_WEBHOOKS_CUSTOM_MODEL = "archivebox.api.models.OutboundWebhook"
SIGNAL_WEBHOOKS = {
SIGNAL_WEBHOOKS: dict[str, object] = {
"HOOKS": {
# ... is a special sigil value that means "use the default autogenerated hooks"
"django.contrib.auth.models.User": ...,
@@ -444,7 +446,8 @@ SIGNAL_WEBHOOKS = {
}
# Avoid background threads touching sqlite connections (especially during tests/migrations).
if DATABASES["default"]["ENGINE"].endswith("sqlite3"):
default_database = cast(dict[str, Any], DATABASES["default"])
if str(default_database["ENGINE"]).endswith("sqlite3"):
SIGNAL_WEBHOOKS["TASK_HANDLER"] = "signal_webhooks.handlers.sync_task_handler"
################################################################################
@@ -551,10 +554,8 @@ if DEBUG_TOOLBAR:
MIDDLEWARE = [*MIDDLEWARE, "debug_toolbar.middleware.DebugToolbarMiddleware"]
if DEBUG:
from django_autotyping.typing import AutotypingSettingsDict
INSTALLED_APPS += ["django_autotyping"]
AUTOTYPING: AutotypingSettingsDict = {
AUTOTYPING = {
"STUBS_GENERATION": {
"LOCAL_STUBS_DIR": PACKAGE_DIR / "typings",
}

View File

@@ -1,5 +1,7 @@
"""Template tags for accessing config values in templates."""
from typing import Any
from django import template
from archivebox.config.configset import get_config as _get_config
@@ -8,7 +10,7 @@ register = template.Library()
@register.simple_tag
def get_config(key: str) -> any:
def get_config(key: str) -> Any:
"""
Get a config value by key.

View File

@@ -4,6 +4,9 @@ import importlib
import os
import django
from unittest.mock import patch
from typing import TypeVar, cast
from django.forms import BaseForm
# Set up Django before importing any Django-dependent modules
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
@@ -18,6 +21,14 @@ CrawlSchedule = importlib.import_module('archivebox.crawls.models').CrawlSchedul
Tag = importlib.import_module('archivebox.core.models').Tag
SERVER_CONFIG = importlib.import_module('archivebox.config.common').SERVER_CONFIG
T = TypeVar('T')
def require(value: T | None) -> T:
if value is None:
raise AssertionError('Expected value to be present')
return value
class AddViewTests(TestCase):
"""Tests for the AddView (crawl creation form)."""
@@ -111,7 +122,7 @@ class AddViewTests(TestCase):
# Check that crawl was created
self.assertEqual(Crawl.objects.count(), 1)
crawl = Crawl.objects.first()
crawl = require(Crawl.objects.first())
self.assertIn('https://example.com', crawl.urls)
self.assertIn('https://example.org', crawl.urls)
@@ -140,8 +151,8 @@ class AddViewTests(TestCase):
self.assertEqual(Crawl.objects.count(), 1)
self.assertEqual(CrawlSchedule.objects.count(), 1)
crawl = Crawl.objects.first()
schedule = CrawlSchedule.objects.first()
crawl = require(Crawl.objects.first())
schedule = require(CrawlSchedule.objects.first())
self.assertEqual(crawl.schedule, schedule)
self.assertEqual(schedule.template, crawl)
@@ -159,7 +170,7 @@ class AddViewTests(TestCase):
self.assertEqual(response.status_code, 302)
schedule = CrawlSchedule.objects.first()
schedule = require(CrawlSchedule.objects.first())
self.assertEqual(schedule.schedule, '0 */6 * * *')
def test_add_crawl_with_plugins(self):
@@ -173,7 +184,7 @@ class AddViewTests(TestCase):
self.assertEqual(response.status_code, 302)
crawl = Crawl.objects.first()
crawl = require(Crawl.objects.first())
plugins = crawl.config.get('PLUGINS', '')
# Should contain the selected plugins
@@ -209,7 +220,7 @@ class AddViewTests(TestCase):
self.assertEqual(response.status_code, 302)
crawl = Crawl.objects.first()
crawl = require(Crawl.objects.first())
config = crawl.config
self.assertEqual(config.get('DEFAULT_PERSONA'), 'CustomPersona')
@@ -236,7 +247,7 @@ class AddViewTests(TestCase):
})
self.assertEqual(response.status_code, 302)
crawl = Crawl.objects.order_by('-created_at').first()
crawl = require(Crawl.objects.order_by('-created_at').first())
self.assertNotIn('YTDLP_ARGS_EXTRA', crawl.config)
def test_add_authenticated_non_admin_custom_config_is_silently_stripped(self):
@@ -248,7 +259,7 @@ class AddViewTests(TestCase):
})
self.assertEqual(response.status_code, 302)
crawl = Crawl.objects.order_by('-created_at').first()
crawl = require(Crawl.objects.order_by('-created_at').first())
self.assertNotIn('YTDLP_ARGS_EXTRA', crawl.config)
def test_add_staff_admin_custom_config_is_allowed(self):
@@ -269,7 +280,7 @@ class AddViewTests(TestCase):
})
self.assertEqual(response.status_code, 302)
crawl = Crawl.objects.order_by('-created_at').first()
crawl = require(Crawl.objects.order_by('-created_at').first())
self.assertEqual(crawl.config.get('YTDLP_ARGS_EXTRA'), ['--exec', 'echo hello'])
def test_add_empty_urls_fails(self):
@@ -281,7 +292,7 @@ class AddViewTests(TestCase):
# Should show form again with errors, not redirect
self.assertEqual(response.status_code, 200)
self.assertFormError(response, 'form', 'url', 'This field is required.')
self.assertFormError(cast(BaseForm, response.context['form']), 'url', 'This field is required.')
def test_add_invalid_urls_fails(self):
"""Test that invalid URLs fail validation."""
@@ -355,7 +366,7 @@ class AddViewTests(TestCase):
self.assertEqual(response.status_code, 302)
crawl = Crawl.objects.first()
crawl = require(Crawl.objects.first())
self.assertEqual(crawl.tags_str, 'tag1,tag2,tag3')
def test_crawl_redirects_to_admin_change_page(self):
@@ -365,7 +376,7 @@ class AddViewTests(TestCase):
'depth': '0',
})
crawl = Crawl.objects.first()
crawl = require(Crawl.objects.first())
expected_redirect = f'/admin/crawls/crawl/{crawl.id}/change/'
self.assertRedirects(response, expected_redirect, fetch_redirect_response=False)

View File

@@ -4,6 +4,7 @@ from django.urls import path, re_path, include
from django.views import static
from django.conf import settings
from django.views.generic.base import RedirectView
from django.http import HttpRequest
from archivebox.misc.serve_static import serve_static
@@ -53,7 +54,7 @@ urlpatterns = [
path("api/", include('archivebox.api.urls'), name='api'),
path('health/', HealthCheckView.as_view(), name='healthcheck'),
path('error/', lambda *_: 1/0), # type: ignore
path('error/', lambda request: _raise_test_error(request)),
# path('jet_api/', include('jet_django.urls')), Enable to use https://www.jetadmin.io/integrations/django
@@ -61,6 +62,10 @@ urlpatterns = [
path('', HomepageView.as_view(), name='Home'),
]
def _raise_test_error(_request: HttpRequest):
raise ZeroDivisionError('Intentional test error route')
if settings.DEBUG_TOOLBAR:
urlpatterns += [path('__debug__/', include("debug_toolbar.urls"))]

View File

@@ -5,13 +5,14 @@ import posixpath
from glob import glob, escape
from django.utils import timezone
import inspect
from typing import Callable, get_type_hints
from typing import Callable, cast, get_type_hints
from pathlib import Path
from urllib.parse import urlparse
from django.shortcuts import render, redirect
from django.http import JsonResponse, HttpRequest, HttpResponse, Http404, HttpResponseForbidden
from django.utils.html import format_html, mark_safe
from django.utils.html import format_html
from django.utils.safestring import mark_safe
from django.views import View
from django.views.generic.list import ListView
from django.views.generic import FormView
@@ -21,7 +22,7 @@ from django.contrib.auth.mixins import UserPassesTestMixin
from django.views.decorators.csrf import csrf_exempt
from django.utils.decorators import method_decorator
from admin_data_views.typing import TableContext, ItemContext
from admin_data_views.typing import TableContext, ItemContext, SectionData
from admin_data_views.utils import render_with_table_view, render_with_item_view, ItemLink
from archivebox.config import CONSTANTS, CONSTANTS_CONFIG, DATA_DIR, VERSION
@@ -854,7 +855,7 @@ class AddView(UserPassesTestMixin, FormView):
def _can_override_crawl_config(self) -> bool:
user = self.request.user
return bool(user.is_authenticated and (user.is_superuser or user.is_staff))
return bool(user.is_authenticated and (getattr(user, 'is_superuser', False) or getattr(user, 'is_staff', False)))
def _get_custom_config_overrides(self, form: AddLinkForm) -> dict:
custom_config = form.cleaned_data.get("config") or {}
@@ -906,7 +907,7 @@ class AddView(UserPassesTestMixin, FormView):
from archivebox.base_models.models import get_or_create_system_user_pk
created_by_id = get_or_create_system_user_pk()
created_by_name = self.request.user.username if self.request.user.is_authenticated else 'web'
created_by_name = getattr(self.request.user, 'username', 'web') if self.request.user.is_authenticated else 'web'
# 1. save the provided urls to sources/2024-11-05__23-59-59__web_ui_add_by_user_<user_pk>.txt
sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__web_ui_add_by_user_{created_by_id}.txt'
@@ -1015,8 +1016,8 @@ class WebAddView(AddView):
return super().dispatch(request, *args, **kwargs)
def get(self, request, url: str):
requested_url = urldecode(url)
def get(self, request: HttpRequest, *args: object, **kwargs: object):
requested_url = urldecode(str(kwargs.get('url') or (args[0] if args else '')))
if not requested_url:
raise Http404
@@ -1025,6 +1026,7 @@ class WebAddView(AddView):
return redirect(f'/{snapshot.url_path}')
add_url = self._normalize_add_url(requested_url)
assert self.form_class is not None
defaults_form = self.form_class()
form_data = {
'url': add_url,
@@ -1045,6 +1047,7 @@ class WebAddView(AddView):
crawl = self._create_crawl_from_form(form)
snapshot = Snapshot.from_json({'url': add_url, 'tags': form.cleaned_data.get('tag', '')}, overrides={'crawl': crawl})
assert snapshot is not None
return redirect(f'/{snapshot.url_path}')
@@ -1385,7 +1388,7 @@ def find_config_type(key: str) -> str:
# Try to get from pydantic model_fields first (more reliable)
if hasattr(config, 'model_fields') and key in config.model_fields:
field = config.model_fields[key]
if hasattr(field, 'annotation'):
if hasattr(field, 'annotation') and field.annotation is not None:
try:
return str(field.annotation.__name__)
except AttributeError:
@@ -1448,7 +1451,7 @@ def find_config_source(key: str, merged_config: dict) -> str:
def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
CONFIGS = get_all_configs()
assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
assert getattr(request.user, 'is_superuser', False), 'Must be a superuser to view configuration settings.'
# Get merged config that includes Machine.config overrides
try:
@@ -1519,7 +1522,7 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
CONFIGS = get_all_configs()
FLAT_CONFIG = get_flat_config()
assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
assert getattr(request.user, 'is_superuser', False), 'Must be a superuser to view configuration settings.'
# Get merged config
merged_config = get_config()
@@ -1575,62 +1578,62 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
section_header = mark_safe(f'[DYNAMIC CONFIG] &nbsp; <b><code style="color: lightgray">{key}</code></b> &nbsp; <small>(read-only, calculated at runtime)</small>')
section_data = cast(SectionData, {
"name": section_header,
"description": None,
"fields": {
'Key': key,
'Type': find_config_type(key),
'Value': final_value,
'Source': find_config_source(key, merged_config),
},
"help_texts": {
'Key': mark_safe(f'''
<a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#{key.lower()}">Documentation</a> &nbsp;
<span style="display: {"inline" if aliases else "none"}">
Aliases: {", ".join(aliases)}
</span>
'''),
'Type': mark_safe(f'''
<a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig+{key}&type=code">
See full definition in <code>archivebox/config</code>...
</a>
'''),
'Value': mark_safe(f'''
{'<b style="color: red">Value is redacted for your security. (Passwords, secrets, API tokens, etc. cannot be viewed in the Web UI)</b><br/><br/>' if not key_is_safe(key) else ''}
<br/><hr/><br/>
<b>Configuration Sources (in priority order):</b><br/><br/>
{sources_html}
<br/><br/>
<p style="display: {"block" if key in FLAT_CONFIG and key not in CONSTANTS_CONFIG else "none"}">
<i>To change this value, edit <code>data/ArchiveBox.conf</code> or run:</i>
<br/><br/>
<code>archivebox config --set {key}="{
val.strip("'")
if (val := find_config_default(key)) else
(str(FLAT_CONFIG[key] if key_is_safe(key) else '********')).strip("'")
}"</code>
</p>
'''),
'Source': mark_safe(f'''
The value shown in the "Value" field comes from the <b>{find_config_source(key, merged_config)}</b> source.
<br/><br/>
Priority order (highest to lowest):
<ol>
<li><b style="color: purple">Machine</b> - Machine-specific overrides (e.g., resolved binary paths)
{f'<br/><a href="{machine_admin_url}">→ Edit <code>{key}</code> in Machine.config for this server</a>' if machine_admin_url else ''}
</li>
<li><b style="color: blue">Environment</b> - Environment variables</li>
<li><b style="color: green">Config File</b> - data/ArchiveBox.conf</li>
<li><b style="color: gray">Default</b> - Default value from code</li>
</ol>
{f'<br/><b>Tip:</b> To override <code>{key}</code> on this machine, <a href="{machine_admin_url}">edit the Machine.config field</a> and add:<br/><code>{{"\\"{key}\\": "your_value_here"}}</code>' if machine_admin_url and key not in CONSTANTS_CONFIG else ''}
'''),
},
})
return ItemContext(
slug=key,
title=key,
data=[
{
"name": section_header,
"description": None,
"fields": {
'Key': key,
'Type': find_config_type(key),
'Value': final_value,
'Source': find_config_source(key, merged_config),
},
"help_texts": {
'Key': mark_safe(f'''
<a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#{key.lower()}">Documentation</a> &nbsp;
<span style="display: {"inline" if aliases else "none"}">
Aliases: {", ".join(aliases)}
</span>
'''),
'Type': mark_safe(f'''
<a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig+{key}&type=code">
See full definition in <code>archivebox/config</code>...
</a>
'''),
'Value': mark_safe(f'''
{'<b style="color: red">Value is redacted for your security. (Passwords, secrets, API tokens, etc. cannot be viewed in the Web UI)</b><br/><br/>' if not key_is_safe(key) else ''}
<br/><hr/><br/>
<b>Configuration Sources (in priority order):</b><br/><br/>
{sources_html}
<br/><br/>
<p style="display: {"block" if key in FLAT_CONFIG and key not in CONSTANTS_CONFIG else "none"}">
<i>To change this value, edit <code>data/ArchiveBox.conf</code> or run:</i>
<br/><br/>
<code>archivebox config --set {key}="{
val.strip("'")
if (val := find_config_default(key)) else
(str(FLAT_CONFIG[key] if key_is_safe(key) else '********')).strip("'")
}"</code>
</p>
'''),
'Source': mark_safe(f'''
The value shown in the "Value" field comes from the <b>{find_config_source(key, merged_config)}</b> source.
<br/><br/>
Priority order (highest to lowest):
<ol>
<li><b style="color: purple">Machine</b> - Machine-specific overrides (e.g., resolved binary paths)
{f'<br/><a href="{machine_admin_url}">→ Edit <code>{key}</code> in Machine.config for this server</a>' if machine_admin_url else ''}
</li>
<li><b style="color: blue">Environment</b> - Environment variables</li>
<li><b style="color: green">Config File</b> - data/ArchiveBox.conf</li>
<li><b style="color: gray">Default</b> - Default value from code</li>
</ol>
{f'<br/><b>💡 Tip:</b> To override <code>{key}</code> on this machine, <a href="{machine_admin_url}">edit the Machine.config field</a> and add:<br/><code>{{"\\"{key}\\": "your_value_here"}}</code>' if machine_admin_url and key not in CONSTANTS_CONFIG else ''}
'''),
},
},
],
data=[section_data],
)

View File

@@ -16,7 +16,7 @@ class TagEditorWidget(forms.Widget):
- Press Enter or Space to create new tags (auto-creates if doesn't exist)
- Uses AJAX for autocomplete and tag creation
"""
template_name = None # We render manually
template_name = "" # We render manually
class Media:
css = {'all': []}

View File

@@ -2,7 +2,8 @@ __package__ = 'archivebox.crawls'
from django import forms
from django.utils.html import format_html, format_html_join, mark_safe
from django.utils.html import format_html, format_html_join
from django.utils.safestring import mark_safe
from django.contrib import admin, messages
from django.db.models import Count, Q

View File

@@ -1,6 +1,7 @@
__package__ = 'archivebox.crawls'
from typing import TYPE_CHECKING
import uuid
from datetime import timedelta
from archivebox.uuid_compat import uuid7
from pathlib import Path
@@ -10,7 +11,6 @@ from django.core.validators import MaxValueValidator, MinValueValidator
from django.conf import settings
from django.urls import reverse_lazy
from django.utils import timezone
from django_stubs_ext.db.models import TypedModelMeta
from statemachine import State, registry
from rich import print
@@ -36,7 +36,7 @@ class CrawlSchedule(ModelWithUUID, ModelWithNotes):
crawl_set: models.Manager['Crawl']
class Meta(TypedModelMeta):
class Meta(ModelWithUUID.Meta, ModelWithNotes.Meta):
app_label = 'crawls'
verbose_name = 'Scheduled Crawl'
verbose_name_plural = 'Scheduled Crawls'
@@ -47,7 +47,7 @@ class CrawlSchedule(ModelWithUUID, ModelWithNotes):
@property
def api_url(self) -> str:
return reverse_lazy('api-1:get_any', args=[self.id])
return str(reverse_lazy('api-1:get_any', args=[self.id]))
def save(self, *args, **kwargs):
self.schedule = (self.schedule or '').strip()
@@ -119,9 +119,17 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
StatusChoices = ModelWithStateMachine.StatusChoices
active_state = StatusChoices.STARTED
schedule_id: uuid.UUID | None
sm: 'CrawlMachine'
snapshot_set: models.Manager['Snapshot']
class Meta(TypedModelMeta):
class Meta(
ModelWithOutputDir.Meta,
ModelWithConfig.Meta,
ModelWithHealthStats.Meta,
ModelWithStateMachine.Meta,
):
app_label = 'crawls'
verbose_name = 'Crawl'
verbose_name_plural = 'Crawls'
@@ -152,7 +160,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
@property
def api_url(self) -> str:
return reverse_lazy('api-1:get_crawl', args=[self.id])
return str(reverse_lazy('api-1:get_crawl', args=[self.id]))
def to_json(self) -> dict:
"""
@@ -172,7 +180,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
}
@staticmethod
def from_json(record: dict, overrides: dict = None):
def from_json(record: dict, overrides: dict | None = None):
"""
Create or get a Crawl from a JSON dict.
@@ -746,6 +754,8 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
# =============================================================================
class CrawlMachine(BaseStateMachine):
crawl: Crawl
"""
State machine for managing Crawl lifecycle.

View File

@@ -1013,7 +1013,7 @@ def get_plugin_icon(plugin: str) -> str:
def process_hook_records(records: List[Dict[str, Any]], overrides: Dict[str, Any] = None) -> Dict[str, int]:
def process_hook_records(records: List[Dict[str, Any]], overrides: Dict[str, Any] | None = None) -> Dict[str, int]:
"""
Process JSONL records from hook output.
Dispatches to Model.from_json() for each record type.

View File

@@ -1,6 +1,7 @@
__package__ = 'archivebox.ideas'
import asyncio
import importlib
import json
import os
import shlex
@@ -13,12 +14,14 @@ from typing import Any, Callable, Mapping, MutableMapping, Optional
from pydantic import BaseModel, Field
try:
from bubus import BaseEvent, EventBus
bubus = importlib.import_module("bubus")
BaseEvent = bubus.BaseEvent
EventBus = bubus.EventBus
except Exception as exc: # pragma: no cover - optional dependency
raise ImportError('ProcessPlugin requires bubus to be installed') from exc
try:
from bubus.service import uuid7str
uuid7str = importlib.import_module("bubus.service").uuid7str
except Exception: # pragma: no cover - optional dependency
from uuid import uuid4 as _uuid4

View File

@@ -6,18 +6,15 @@ This module extends django-auth-ldap to support the LDAP_CREATE_SUPERUSER flag.
__package__ = "archivebox.ldap"
from typing import TYPE_CHECKING
import importlib
if TYPE_CHECKING:
from django_auth_ldap.backend import LDAPBackend as BaseLDAPBackend
else:
try:
from django_auth_ldap.backend import LDAPBackend as BaseLDAPBackend
except ImportError:
# If django-auth-ldap is not installed, create a dummy base class
class BaseLDAPBackend:
"""Dummy LDAP backend when django-auth-ldap is not installed."""
pass
try:
BaseLDAPBackend = importlib.import_module("django_auth_ldap.backend").LDAPBackend
except ImportError:
class BaseLDAPBackend:
"""Dummy LDAP backend when django-auth-ldap is not installed."""
pass
class ArchiveBoxLDAPBackend(BaseLDAPBackend):
@@ -36,7 +33,11 @@ class ArchiveBoxLDAPBackend(BaseLDAPBackend):
"""
from archivebox.config.ldap import LDAP_CONFIG
user = super().authenticate_ldap_user(ldap_user, password)
base_authenticate = getattr(super(), "authenticate_ldap_user", None)
if base_authenticate is None:
return None
user = base_authenticate(ldap_user, password)
if user and LDAP_CONFIG.LDAP_CREATE_SUPERUSER:
# Grant superuser privileges to all LDAP-authenticated users

View File

@@ -1,11 +1,15 @@
from __future__ import annotations
__package__ = 'archivebox.machine'
import os
import sys
import uuid
import socket
from pathlib import Path
from archivebox.uuid_compat import uuid7
from datetime import timedelta, datetime
from typing import TYPE_CHECKING, Any, cast
from statemachine import State, registry
@@ -13,21 +17,31 @@ from django.db import models
from django.db.models import QuerySet
from django.utils import timezone
from django.utils.functional import cached_property
from django_stubs_ext.db.models import TypedModelMeta
from archivebox.base_models.models import ModelWithHealthStats
from archivebox.workers.models import BaseStateMachine, ModelWithStateMachine
from .detect import get_host_guid, get_os_info, get_vm_info, get_host_network, get_host_stats
_psutil: Any | None = None
try:
import psutil
import psutil as _psutil_import
PSUTIL_AVAILABLE = True
except ImportError:
PSUTIL_AVAILABLE = False
else:
_psutil = _psutil_import
_CURRENT_MACHINE = None
_CURRENT_INTERFACE = None
_CURRENT_BINARIES = {}
_CURRENT_PROCESS = None
if TYPE_CHECKING:
import psutil
from archivebox.core.models import ArchiveResult
else:
psutil = cast(Any, _psutil)
_CURRENT_MACHINE: Machine | None = None
_CURRENT_INTERFACE: NetworkInterface | None = None
_CURRENT_BINARIES: dict[str, Binary] = {}
_CURRENT_PROCESS: Process | None = None
MACHINE_RECHECK_INTERVAL = 7 * 24 * 60 * 60
NETWORK_INTERFACE_RECHECK_INTERVAL = 1 * 60 * 60
@@ -64,10 +78,10 @@ class Machine(ModelWithHealthStats):
num_uses_failed = models.PositiveIntegerField(default=0)
num_uses_succeeded = models.PositiveIntegerField(default=0)
objects: MachineManager = MachineManager()
objects = MachineManager() # pyright: ignore[reportIncompatibleVariableOverride]
networkinterface_set: models.Manager['NetworkInterface']
class Meta:
class Meta(ModelWithHealthStats.Meta):
app_label = 'machine'
@classmethod
@@ -127,7 +141,7 @@ class Machine(ModelWithHealthStats):
}
@staticmethod
def from_json(record: dict, overrides: dict = None):
def from_json(record: dict[str, Any], overrides: dict[str, Any] | None = None):
"""
Update Machine config from JSON dict.
@@ -172,9 +186,10 @@ class NetworkInterface(ModelWithHealthStats):
# num_uses_failed = models.PositiveIntegerField(default=0) # from ModelWithHealthStats
# num_uses_succeeded = models.PositiveIntegerField(default=0) # from ModelWithHealthStats
objects: NetworkInterfaceManager = NetworkInterfaceManager()
objects = NetworkInterfaceManager() # pyright: ignore[reportIncompatibleVariableOverride]
machine_id: uuid.UUID
class Meta:
class Meta(ModelWithHealthStats.Meta):
app_label = 'machine'
unique_together = (('machine', 'ip_public', 'ip_local', 'mac_address', 'dns_server'),)
@@ -185,7 +200,7 @@ class NetworkInterface(ModelWithHealthStats):
if timezone.now() < _CURRENT_INTERFACE.modified_at + timedelta(seconds=NETWORK_INTERFACE_RECHECK_INTERVAL):
return _CURRENT_INTERFACE
_CURRENT_INTERFACE = None
machine = Machine.objects.current()
machine = Machine.current()
net_info = get_host_network()
_CURRENT_INTERFACE, _ = cls.objects.update_or_create(
machine=machine, ip_public=net_info.pop('ip_public'), ip_local=net_info.pop('ip_local'),
@@ -202,7 +217,7 @@ class BinaryManager(models.Manager):
if cached and timezone.now() < cached.modified_at + timedelta(seconds=BINARY_RECHECK_INTERVAL):
return cached
_CURRENT_BINARIES[name], _ = self.update_or_create(
machine=Machine.objects.current(), name=name, binprovider=binprovider,
machine=Machine.current(), name=name, binprovider=binprovider,
version=version, abspath=abspath, sha256=sha256,
)
return _CURRENT_BINARIES[name]
@@ -263,12 +278,14 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine):
num_uses_failed = models.PositiveIntegerField(default=0)
num_uses_succeeded = models.PositiveIntegerField(default=0)
state_machine_name: str = 'archivebox.machine.models.BinaryMachine'
machine_id: uuid.UUID
state_machine_name: str | None = 'archivebox.machine.models.BinaryMachine'
active_state: str = StatusChoices.QUEUED
objects: BinaryManager = BinaryManager()
objects = BinaryManager() # pyright: ignore[reportIncompatibleVariableOverride]
class Meta:
class Meta(ModelWithHealthStats.Meta, ModelWithStateMachine.Meta):
app_label = 'machine'
verbose_name = 'Binary'
verbose_name_plural = 'Binaries'
@@ -321,7 +338,7 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine):
}
@staticmethod
def from_json(record: dict, overrides: dict = None):
def from_json(record: dict[str, Any], overrides: dict[str, Any] | None = None):
"""
Create/update Binary from JSON dict.
@@ -418,7 +435,7 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine):
return None
def update_and_requeue(self, **kwargs):
def update_and_requeue(self, **kwargs) -> bool:
"""
Update binary fields and requeue for worker state machine.
@@ -429,6 +446,7 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine):
setattr(self, key, value)
self.modified_at = timezone.now()
self.save()
return True
def _allowed_binproviders(self) -> set[str] | None:
"""Return the allowed binproviders for this binary, or None for wildcard."""
@@ -513,21 +531,14 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine):
plugin_output_dir = output_dir / plugin_name
plugin_output_dir.mkdir(parents=True, exist_ok=True)
# Build kwargs for hook
hook_kwargs = {
'binary_id': str(self.id),
'machine_id': str(self.machine_id),
'name': self.name,
'binproviders': self.binproviders,
}
custom_cmd = None
overrides_json = None
if plugin_name == 'custom':
custom_cmd = self._get_custom_install_command()
if not custom_cmd:
continue
hook_kwargs['custom_cmd'] = custom_cmd
elif self.overrides:
hook_kwargs['overrides'] = json.dumps(self.overrides)
overrides_json = json.dumps(self.overrides)
# Run the hook
process = run_hook(
@@ -535,7 +546,12 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine):
output_dir=plugin_output_dir,
config=config,
timeout=600, # 10 min timeout for binary installation
**hook_kwargs
binary_id=str(self.id),
machine_id=str(self.machine_id),
name=self.name,
binproviders=self.binproviders,
custom_cmd=custom_cmd,
overrides=overrides_json,
)
# Background hook (unlikely for binary installation, but handle it)
@@ -679,7 +695,7 @@ class ProcessManager(models.Manager):
"""Get the Process record for the current OS process."""
return Process.current()
def get_by_pid(self, pid: int, machine: 'Machine' = None) -> 'Process | None':
def get_by_pid(self, pid: int, machine: 'Machine | None' = None) -> 'Process | None':
"""
Find a Process by PID with proper validation against PID reuse.
@@ -880,11 +896,17 @@ class Process(models.Model):
help_text='When to retry this process'
)
machine_id: uuid.UUID
parent_id: uuid.UUID | None
binary_id: uuid.UUID | None
children: models.Manager['Process']
archiveresult: 'ArchiveResult'
state_machine_name: str = 'archivebox.machine.models.ProcessMachine'
objects: ProcessManager = ProcessManager()
objects = ProcessManager() # pyright: ignore[reportIncompatibleVariableOverride]
class Meta:
class Meta(TypedModelMeta):
app_label = 'machine'
verbose_name = 'Process'
verbose_name_plural = 'Processes'
@@ -971,7 +993,7 @@ class Process(models.Model):
return self.parse_records_from_text(stdout or '')
@staticmethod
def from_json(record: dict, overrides: dict = None):
def from_json(record: dict[str, Any], overrides: dict[str, Any] | None = None):
"""
Create/update Process from JSON dict.
@@ -990,7 +1012,7 @@ class Process(models.Model):
pass
return None
def update_and_requeue(self, **kwargs):
def update_and_requeue(self, **kwargs) -> bool:
"""
Update process fields and requeue for worker state machine.
Sets modified_at to ensure workers pick up changes.
@@ -999,6 +1021,7 @@ class Process(models.Model):
setattr(self, key, value)
self.modified_at = timezone.now()
self.save()
return True
# =========================================================================
# Process.current() and hierarchy methods
@@ -1094,7 +1117,7 @@ class Process(models.Model):
return _CURRENT_PROCESS
@classmethod
def _find_parent_process(cls, machine: 'Machine' = None) -> 'Process | None':
def _find_parent_process(cls, machine: 'Machine | None' = None) -> 'Process | None':
"""
Find the parent Process record by looking up PPID.
@@ -1163,7 +1186,7 @@ class Process(models.Model):
return cls.TypeChoices.BINARY
@classmethod
def cleanup_stale_running(cls, machine: 'Machine' = None) -> int:
def cleanup_stale_running(cls, machine: 'Machine | None' = None) -> int:
"""
Mark stale RUNNING processes as EXITED.
@@ -1374,25 +1397,25 @@ class Process(models.Model):
# =========================================================================
@property
def pid_file(self) -> Path:
def pid_file(self) -> Path | None:
"""Path to PID file for this process."""
runtime_dir = self.runtime_dir
return runtime_dir / 'process.pid' if runtime_dir else None
@property
def cmd_file(self) -> Path:
def cmd_file(self) -> Path | None:
"""Path to cmd.sh script for this process."""
runtime_dir = self.runtime_dir
return runtime_dir / 'cmd.sh' if runtime_dir else None
@property
def stdout_file(self) -> Path:
def stdout_file(self) -> Path | None:
"""Path to stdout log."""
runtime_dir = self.runtime_dir
return runtime_dir / 'stdout.log' if runtime_dir else None
@property
def stderr_file(self) -> Path:
def stderr_file(self) -> Path | None:
"""Path to stderr log."""
runtime_dir = self.runtime_dir
return runtime_dir / 'stderr.log' if runtime_dir else None
@@ -1647,6 +1670,8 @@ class Process(models.Model):
stdout_path.parent.mkdir(parents=True, exist_ok=True)
if stderr_path:
stderr_path.parent.mkdir(parents=True, exist_ok=True)
if stdout_path is None or stderr_path is None:
raise RuntimeError('Process log paths could not be determined')
with open(stdout_path, 'a') as out, open(stderr_path, 'a') as err:
proc = subprocess.Popen(
@@ -2006,7 +2031,7 @@ class Process(models.Model):
# =========================================================================
@classmethod
def get_running(cls, process_type: str = None, machine: 'Machine' = None) -> 'QuerySet[Process]':
def get_running(cls, process_type: str | None = None, machine: 'Machine | None' = None) -> 'QuerySet[Process]':
"""
Get all running processes, optionally filtered by type.
@@ -2031,7 +2056,7 @@ class Process(models.Model):
return qs
@classmethod
def get_running_count(cls, process_type: str = None, machine: 'Machine' = None) -> int:
def get_running_count(cls, process_type: str | None = None, machine: 'Machine | None' = None) -> int:
"""
Get count of running processes.
@@ -2041,7 +2066,7 @@ class Process(models.Model):
return cls.get_running(process_type=process_type, machine=machine).count()
@classmethod
def stop_all(cls, process_type: str = None, machine: 'Machine' = None, graceful: bool = True) -> int:
def stop_all(cls, process_type: str | None = None, machine: 'Machine | None' = None, graceful: bool = True) -> int:
"""
Stop all running processes of a given type.
@@ -2064,7 +2089,7 @@ class Process(models.Model):
return stopped
@classmethod
def get_next_worker_id(cls, process_type: str = 'worker', machine: 'Machine' = None) -> int:
def get_next_worker_id(cls, process_type: str = 'worker', machine: 'Machine | None' = None) -> int:
"""
Get the next available worker ID for spawning new workers.
@@ -2190,6 +2215,7 @@ class BinaryMachine(BaseStateMachine):
"""
model_attr_name = 'binary'
binary: Binary
# States
queued = State(value=Binary.StatusChoices.QUEUED, initial=True)
@@ -2293,6 +2319,7 @@ class ProcessMachine(BaseStateMachine):
"""
model_attr_name = 'process'
process: Process
# States
queued = State(value=Process.StatusChoices.QUEUED, initial=True)

View File

@@ -13,6 +13,7 @@ Tests cover:
import os
from datetime import timedelta
from typing import cast
from unittest.mock import patch
import pytest
@@ -20,6 +21,7 @@ from django.test import TestCase
from django.utils import timezone
from archivebox.machine.models import (
BinaryManager,
Machine,
NetworkInterface,
Binary,
@@ -94,7 +96,7 @@ class TestMachineModel(TestCase):
def test_machine_manager_current(self):
    """Machine.current() should return the current machine."""
    # Duplicated pre-refactor call (Machine.objects.current()) removed;
    # the classmethod Machine.current() is the surviving API.
    machine = Machine.current()
    self.assertIsNotNone(machine)
    # Calling it again must yield the same (cached) machine record.
    self.assertEqual(machine.id, Machine.current().id)
@@ -126,7 +128,7 @@ class TestNetworkInterfaceModel(TestCase):
def test_networkinterface_manager_current(self):
    """NetworkInterface.current() should return the current interface."""
    # Duplicated pre-refactor call (NetworkInterface.objects.current())
    # removed; the classmethod NetworkInterface.current() is the survivor.
    interface = NetworkInterface.current()
    self.assertIsNotNone(interface)
@@ -177,7 +179,7 @@ class TestBinaryModel(TestCase):
version='1.21',
)
result = Binary.objects.get_valid_binary('wget')
result = cast(BinaryManager, Binary.objects).get_valid_binary('wget')
self.assertIsNotNone(result)
assert result is not None

View File

@@ -79,8 +79,8 @@ def hint(text: Union[Tuple[str, ...], List[str], str], prefix=' ', config: Op
ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI
if isinstance(text, str):
stderr('{}{lightred}Hint:{reset} {}'.format(prefix, text, **ansi))
stderr(f"{prefix}{ansi['lightred']}Hint:{ansi['reset']} {text}")
else:
stderr('{}{lightred}Hint:{reset} {}'.format(prefix, text[0], **ansi))
stderr(f"{prefix}{ansi['lightred']}Hint:{ansi['reset']} {text[0]}")
for line in text[1:]:
stderr('{} {}'.format(prefix, line))
stderr(f'{prefix} {line}')

View File

@@ -5,6 +5,8 @@ import os
import stat
import posixpath
import mimetypes
import importlib
from collections.abc import Callable
from pathlib import Path
from django.contrib.staticfiles import finders
@@ -69,9 +71,9 @@ mimetypes.add_type("application/xml", ".xml")
mimetypes.add_type("image/svg+xml", ".svg")
try:
import markdown as _markdown
except Exception:
_markdown = None
_markdown = getattr(importlib.import_module('markdown'), 'markdown')
except ImportError:
_markdown: Callable[..., str] | None = None
MARKDOWN_INLINE_LINK_RE = re.compile(r'\[([^\]]+)\]\(([^)\s]+(?:\([^)]*\)[^)\s]*)*)\)')
MARKDOWN_INLINE_IMAGE_RE = re.compile(r'!\[([^\]]*)\]\(([^)]+)\)')
@@ -108,7 +110,7 @@ def _looks_like_markdown(text: str) -> bool:
def _render_markdown_fallback(text: str) -> str:
if _markdown is not None and not HTML_TAG_RE.search(text):
try:
return _markdown.markdown(
return _markdown(
text,
extensions=["extra", "toc", "sane_lists"],
output_format="html",

View File

@@ -1,4 +1,4 @@
from typing import Any, List, Callable
from typing import Any, List, Callable, cast
import json
import ast
@@ -94,7 +94,8 @@ class JSONSchemaWithLambdas(GenerateJsonSchema):
def better_toml_dump_str(val: Any) -> str:
try:
return toml.encoder._dump_str(val) # type: ignore
dump_str = cast(Callable[[Any], str], getattr(toml.encoder, '_dump_str'))
return dump_str(val)
except Exception:
# if we hit any of toml's numerous encoding bugs,
# fall back to using json representation of string
@@ -108,7 +109,8 @@ class CustomTOMLEncoder(toml.encoder.TomlEncoder):
"""
def __init__(self, **kwargs):
    """Register custom dump functions for Path, str, and RegexFlag values.

    Path-like values are emitted as JSON strings; str and RegexFlag go
    through better_toml_dump_str to dodge toml's string-encoding bugs.
    """
    super().__init__(**kwargs)
    # Duplicated pre-refactor registrations (direct self.dump_funcs[...]
    # assignments) removed; the cast keeps the type checker happy about
    # non-str keys in toml's dump_funcs mapping.
    dump_funcs = cast(dict[Any, Callable[[Any], str]], self.dump_funcs)
    dump_funcs[Path] = lambda x: json.dumps(str(x))
    dump_funcs[PosixPath] = lambda x: json.dumps(str(x))
    dump_funcs[str] = better_toml_dump_str
    dump_funcs[re.RegexFlag] = better_toml_dump_str

View File

@@ -16,7 +16,7 @@ from datetime import datetime, timezone
from dateparser import parse as dateparser
from requests.exceptions import RequestException, ReadTimeout
from base32_crockford import encode as base32_encode # type: ignore
from base32_crockford import encode as base32_encode
from w3lib.encoding import html_body_declared_encoding, http_content_type_encoding
try:
import chardet # type:ignore
@@ -200,7 +200,7 @@ def parse_date(date: Any) -> datetime | None:
"""Parse unix timestamps, iso format, and human-readable strings"""
if date is None:
return None # type: ignore
return None
if isinstance(date, datetime):
if date.tzinfo is None:

View File

@@ -16,7 +16,7 @@ import subprocess
import sys
from contextlib import contextmanager
from pathlib import Path
from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Any
from django.db import models
from django.conf import settings
@@ -25,13 +25,18 @@ from django.utils import timezone
from archivebox.base_models.models import ModelWithConfig, get_or_create_system_user_pk
from archivebox.uuid_compat import uuid7
_fcntl: Any | None = None
try:
import fcntl
import fcntl as _fcntl_import
except ImportError: # pragma: no cover
fcntl = None
pass
else:
_fcntl = _fcntl_import
if TYPE_CHECKING:
pass
import fcntl
else:
fcntl = _fcntl
VOLATILE_PROFILE_DIR_NAMES = {
@@ -79,7 +84,7 @@ class Persona(ModelWithConfig):
created_at = models.DateTimeField(default=timezone.now, db_index=True)
created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk)
class Meta:
class Meta(ModelWithConfig.Meta):
app_label = 'personas'
def __str__(self) -> str:

View File

@@ -8,6 +8,7 @@ from django.db import models
from django.core import checks
from django.utils import timezone
from django.utils.functional import classproperty
from django_stubs_ext.db.models import TypedModelMeta
from statemachine import registry, StateMachine, State
@@ -31,7 +32,7 @@ class BaseModelWithStateMachine(models.Model, MachineMixin):
# status: models.CharField
# retry_at: models.DateTimeField
state_machine_name: str | None
state_machine_name: str | None = None
state_field_name: str
state_machine_attr: str = 'sm'
bind_events_as_methods: bool = True
@@ -39,7 +40,7 @@ class BaseModelWithStateMachine(models.Model, MachineMixin):
active_state: ObjectState
retry_at_field_name: str
class Meta:
class Meta(TypedModelMeta):
app_label = 'workers'
abstract = True
@@ -92,7 +93,7 @@ class BaseModelWithStateMachine(models.Model, MachineMixin):
if not found_id_field:
errors.append(checks.Error(
f'{cls.__name__} must have an id field that is a primary key',
hint=f'{cls.__name__}.id = {cls.id!r}',
hint=f'{cls.__name__}.id field missing or not configured as primary key',
obj=cls,
id='workers.E014',
))

View File

@@ -11,14 +11,26 @@ Tests cover:
import os
import time
from datetime import timedelta
from unittest.mock import patch, MagicMock
from datetime import datetime, timedelta
from unittest.mock import patch
from typing import ClassVar
import pytest
from django.test import TestCase
from django.utils import timezone
from archivebox.workers.orchestrator import Orchestrator
from archivebox.workers.worker import Worker
class FakeWorker(Worker):
    """Test double for Worker: a 'crawl' worker whose running-worker list
    is a class attribute that individual tests assign directly."""

    name: ClassVar[str] = 'crawl'
    MAX_CONCURRENT_TASKS: ClassVar[int] = 5
    running_workers: ClassVar[list[dict[str, object]]] = []

    @classmethod
    def get_running_workers(cls) -> list[dict[str, object]]:
        """Return whatever list the test stored in ``running_workers``."""
        return FakeWorker.running_workers
class TestOrchestratorUnit(TestCase):
@@ -99,31 +111,25 @@ class TestOrchestratorUnit(TestCase):
"""should_spawn_worker should return False when queue is empty."""
orchestrator = Orchestrator()
# Create a mock worker class
mock_worker = MagicMock()
mock_worker.get_running_workers.return_value = []
self.assertFalse(orchestrator.should_spawn_worker(mock_worker, 0))
FakeWorker.running_workers = []
self.assertFalse(orchestrator.should_spawn_worker(FakeWorker, 0))
def test_should_spawn_worker_at_limit(self):
"""should_spawn_worker should return False when at per-type limit."""
orchestrator = Orchestrator()
mock_worker = MagicMock()
mock_worker.get_running_workers.return_value = [{}] * orchestrator.MAX_WORKERS_PER_TYPE
self.assertFalse(orchestrator.should_spawn_worker(mock_worker, 10))
running_workers: list[dict[str, object]] = [{'worker_id': worker_id} for worker_id in range(orchestrator.MAX_CRAWL_WORKERS)]
FakeWorker.running_workers = running_workers
self.assertFalse(orchestrator.should_spawn_worker(FakeWorker, 10))
@patch.object(Orchestrator, 'get_total_worker_count')
def test_should_spawn_worker_at_total_limit(self, mock_total):
"""should_spawn_worker should return False when at total limit."""
orchestrator = Orchestrator()
mock_total.return_value = orchestrator.MAX_TOTAL_WORKERS
mock_worker = MagicMock()
mock_worker.get_running_workers.return_value = []
self.assertFalse(orchestrator.should_spawn_worker(mock_worker, 10))
mock_total.return_value = 0
running_workers: list[dict[str, object]] = [{'worker_id': worker_id} for worker_id in range(orchestrator.MAX_CRAWL_WORKERS)]
FakeWorker.running_workers = running_workers
self.assertFalse(orchestrator.should_spawn_worker(FakeWorker, 10))
@patch.object(Orchestrator, 'get_total_worker_count')
def test_should_spawn_worker_success(self, mock_total):
@@ -131,11 +137,8 @@ class TestOrchestratorUnit(TestCase):
orchestrator = Orchestrator()
mock_total.return_value = 0
mock_worker = MagicMock()
mock_worker.get_running_workers.return_value = []
mock_worker.MAX_CONCURRENT_TASKS = 5
self.assertTrue(orchestrator.should_spawn_worker(mock_worker, 10))
FakeWorker.running_workers = []
self.assertTrue(orchestrator.should_spawn_worker(FakeWorker, 10))
@patch.object(Orchestrator, 'get_total_worker_count')
def test_should_spawn_worker_enough_workers(self, mock_total):
@@ -143,12 +146,8 @@ class TestOrchestratorUnit(TestCase):
orchestrator = Orchestrator()
mock_total.return_value = 2
mock_worker = MagicMock()
mock_worker.get_running_workers.return_value = [{}] # 1 worker running
mock_worker.MAX_CONCURRENT_TASKS = 5 # Can handle 5 items
# Queue size (3) <= running_workers (1) * MAX_CONCURRENT_TASKS (5)
self.assertFalse(orchestrator.should_spawn_worker(mock_worker, 3))
FakeWorker.running_workers = [{}] # 1 worker running
self.assertFalse(orchestrator.should_spawn_worker(FakeWorker, 3))
class TestOrchestratorWithProcess(TestCase):
@@ -178,8 +177,10 @@ class TestOrchestratorWithProcess(TestCase):
def test_is_running_with_orchestrator_process(self):
"""is_running should return True when orchestrator Process exists."""
from archivebox.machine.models import Process, Machine
import psutil
machine = Machine.current()
current_proc = psutil.Process(os.getpid())
# Create an orchestrator Process record
proc = Process.objects.create(
@@ -187,8 +188,8 @@ class TestOrchestratorWithProcess(TestCase):
process_type=Process.TypeChoices.ORCHESTRATOR,
status=Process.StatusChoices.RUNNING,
pid=os.getpid(), # Use current PID so it appears alive
started_at=timezone.now(),
cmd=['archivebox', 'manage', 'orchestrator'],
started_at=datetime.fromtimestamp(current_proc.create_time(), tz=timezone.get_current_timezone()),
cmd=current_proc.cmdline(),
)
try:
@@ -393,14 +394,7 @@ class TestProcessLifecycle(TestCase):
def test_process_is_running_property(self):
"""Process.is_running should check actual OS process."""
from archivebox.machine.models import Process
# Create a process with current PID (should be running)
proc = Process.objects.create(
machine=self.machine,
status=Process.StatusChoices.RUNNING,
pid=os.getpid(),
started_at=timezone.now(),
)
proc = Process.current()
# Should be running (current process exists)
self.assertTrue(proc.is_running)