mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
bump package versions
This commit is contained in:
@@ -69,7 +69,7 @@ class ModelWithNotes(models.Model):
|
||||
"""Mixin for models with a notes field."""
|
||||
notes = models.TextField(blank=True, null=False, default='')
|
||||
|
||||
class Meta:
|
||||
class Meta(TypedModelMeta):
|
||||
abstract = True
|
||||
|
||||
|
||||
@@ -78,7 +78,7 @@ class ModelWithHealthStats(models.Model):
|
||||
num_uses_failed = models.PositiveIntegerField(default=0)
|
||||
num_uses_succeeded = models.PositiveIntegerField(default=0)
|
||||
|
||||
class Meta:
|
||||
class Meta(TypedModelMeta):
|
||||
abstract = True
|
||||
|
||||
@property
|
||||
@@ -96,7 +96,7 @@ class ModelWithConfig(models.Model):
|
||||
"""Mixin for models with a JSON config field."""
|
||||
config = models.JSONField(default=dict, null=True, blank=True, editable=True)
|
||||
|
||||
class Meta:
|
||||
class Meta(TypedModelMeta):
|
||||
abstract = True
|
||||
|
||||
|
||||
|
||||
@@ -297,6 +297,7 @@ def pluginmap(
|
||||
if not quiet:
|
||||
# Show diagram if this model has one
|
||||
if info.get('diagram'):
|
||||
assert info['diagram'] is not None
|
||||
prnt(Panel(
|
||||
info['diagram'],
|
||||
title=f'[bold green]{info["machine"]}[/bold green]',
|
||||
|
||||
@@ -69,8 +69,8 @@ def list_processes(
|
||||
for process in queryset:
|
||||
if is_tty:
|
||||
binary_name_str = process.binary.name if process.binary else 'unknown'
|
||||
exit_code = process.returncode if process.returncode is not None else '?'
|
||||
status_color = 'green' if process.returncode == 0 else 'red' if process.returncode else 'yellow'
|
||||
exit_code = process.exit_code if process.exit_code is not None else '?'
|
||||
status_color = 'green' if process.exit_code == 0 else 'red' if process.exit_code else 'yellow'
|
||||
rprint(f'[{status_color}]exit={exit_code:3}[/{status_color}] [cyan]{binary_name_str:15}[/cyan] [dim]{process.id}[/dim]')
|
||||
else:
|
||||
write_record(process.to_json())
|
||||
|
||||
@@ -208,7 +208,7 @@ def search(filter_patterns: list[str] | None=None,
|
||||
else:
|
||||
from archivebox.misc.logging_util import printable_folders
|
||||
# Convert to dict for printable_folders
|
||||
folders: dict[str, Snapshot | None] = {snapshot.output_dir: snapshot for snapshot in snapshots}
|
||||
folders: dict[str, Snapshot | None] = {str(snapshot.output_dir): snapshot for snapshot in snapshots}
|
||||
output = printable_folders(folders, with_headers)
|
||||
|
||||
# Structured exports must be written directly to stdout.
|
||||
|
||||
@@ -11,6 +11,8 @@ import unittest
|
||||
from contextlib import contextmanager
|
||||
from pathlib import Path
|
||||
|
||||
from archivebox.config.constants import CONSTANTS
|
||||
|
||||
TEST_CONFIG = {
|
||||
'USE_COLOR': 'False',
|
||||
'SHOW_PROGRESS': 'False',
|
||||
@@ -31,10 +33,9 @@ DATA_DIR = 'data.tests'
|
||||
os.environ.update(TEST_CONFIG)
|
||||
|
||||
init = importlib.import_module('archivebox.main').init
|
||||
constants = importlib.import_module('archivebox.config.constants')
|
||||
SQL_INDEX_FILENAME = constants.SQL_INDEX_FILENAME
|
||||
JSON_INDEX_FILENAME = constants.JSON_INDEX_FILENAME
|
||||
HTML_INDEX_FILENAME = constants.HTML_INDEX_FILENAME
|
||||
SQL_INDEX_FILENAME = CONSTANTS.SQL_INDEX_FILENAME
|
||||
JSON_INDEX_FILENAME = CONSTANTS.JSON_INDEX_FILENAME
|
||||
HTML_INDEX_FILENAME = CONSTANTS.HTML_INDEX_FILENAME
|
||||
archivebox_init = importlib.import_module('archivebox.cli.archivebox_init')
|
||||
archivebox_add = importlib.import_module('archivebox.cli.archivebox_add')
|
||||
archivebox_remove = importlib.import_module('archivebox.cli.archivebox_remove')
|
||||
|
||||
@@ -68,6 +68,15 @@ def require(value: T | None) -> T:
|
||||
return value
|
||||
|
||||
|
||||
class MockTTYStringIO(StringIO):
|
||||
def __init__(self, initial_value: str = '', *, is_tty: bool):
|
||||
super().__init__(initial_value)
|
||||
self._is_tty = is_tty
|
||||
|
||||
def isatty(self) -> bool:
|
||||
return self._is_tty
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# JSONL Utility Tests
|
||||
# =============================================================================
|
||||
@@ -176,10 +185,7 @@ class TestReadArgsOrStdin(unittest.TestCase):
|
||||
from archivebox.misc.jsonl import read_args_or_stdin
|
||||
|
||||
stdin_content = 'https://example1.com\nhttps://example2.com\n'
|
||||
stream = StringIO(stdin_content)
|
||||
|
||||
# Mock isatty to return False (simulating piped input)
|
||||
stream.isatty = lambda: False
|
||||
stream = MockTTYStringIO(stdin_content, is_tty=False)
|
||||
|
||||
records = list(read_args_or_stdin((), stream=stream))
|
||||
|
||||
@@ -192,8 +198,7 @@ class TestReadArgsOrStdin(unittest.TestCase):
|
||||
from archivebox.misc.jsonl import read_args_or_stdin
|
||||
|
||||
stdin_content = '{"type": "Snapshot", "url": "https://example.com", "tags": "test"}\n'
|
||||
stream = StringIO(stdin_content)
|
||||
stream.isatty = lambda: False
|
||||
stream = MockTTYStringIO(stdin_content, is_tty=False)
|
||||
|
||||
records = list(read_args_or_stdin((), stream=stream))
|
||||
|
||||
@@ -206,8 +211,7 @@ class TestReadArgsOrStdin(unittest.TestCase):
|
||||
from archivebox.misc.jsonl import read_args_or_stdin, TYPE_CRAWL
|
||||
|
||||
stdin_content = '{"type": "Crawl", "id": "abc123", "urls": "https://example.com\\nhttps://foo.com"}\n'
|
||||
stream = StringIO(stdin_content)
|
||||
stream.isatty = lambda: False
|
||||
stream = MockTTYStringIO(stdin_content, is_tty=False)
|
||||
|
||||
records = list(read_args_or_stdin((), stream=stream))
|
||||
|
||||
@@ -219,8 +223,7 @@ class TestReadArgsOrStdin(unittest.TestCase):
|
||||
"""Should not read from TTY stdin (would block)."""
|
||||
from archivebox.misc.jsonl import read_args_or_stdin
|
||||
|
||||
stream = StringIO('https://example.com')
|
||||
stream.isatty = lambda: True # Simulate TTY
|
||||
stream = MockTTYStringIO('https://example.com', is_tty=True)
|
||||
|
||||
records = list(read_args_or_stdin((), stream=stream))
|
||||
self.assertEqual(len(records), 0)
|
||||
@@ -297,8 +300,7 @@ class TestSnapshotCommand(unittest.TestCase):
|
||||
"""snapshot should accept Crawl JSONL as input."""
|
||||
from archivebox.misc.jsonl import read_args_or_stdin, TYPE_CRAWL
|
||||
|
||||
stdin = StringIO('{"type": "Crawl", "id": "abc123", "urls": "https://example.com"}\n')
|
||||
stdin.isatty = lambda: False
|
||||
stdin = MockTTYStringIO('{"type": "Crawl", "id": "abc123", "urls": "https://example.com"}\n', is_tty=False)
|
||||
|
||||
records = list(read_args_or_stdin((), stream=stdin))
|
||||
|
||||
@@ -311,8 +313,7 @@ class TestSnapshotCommand(unittest.TestCase):
|
||||
"""snapshot should accept JSONL with tags and other metadata."""
|
||||
from archivebox.misc.jsonl import read_args_or_stdin
|
||||
|
||||
stdin = StringIO('{"type": "Snapshot", "url": "https://example.com", "tags": "tag1,tag2", "title": "Test"}\n')
|
||||
stdin.isatty = lambda: False
|
||||
stdin = MockTTYStringIO('{"type": "Snapshot", "url": "https://example.com", "tags": "tag1,tag2", "title": "Test"}\n', is_tty=False)
|
||||
|
||||
records = list(read_args_or_stdin((), stream=stdin))
|
||||
|
||||
@@ -353,8 +354,7 @@ class TestArchiveResultCommand(unittest.TestCase):
|
||||
"""archiveresult should accept JSONL Snapshot records."""
|
||||
from archivebox.misc.jsonl import read_args_or_stdin, TYPE_SNAPSHOT
|
||||
|
||||
stdin = StringIO('{"type": "Snapshot", "id": "abc123", "url": "https://example.com"}\n')
|
||||
stdin.isatty = lambda: False
|
||||
stdin = MockTTYStringIO('{"type": "Snapshot", "id": "abc123", "url": "https://example.com"}\n', is_tty=False)
|
||||
|
||||
records = list(read_args_or_stdin((), stream=stdin))
|
||||
|
||||
@@ -461,395 +461,6 @@ class TestURLCollection(unittest.TestCase):
|
||||
self.assertEqual(len(urls), 0)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Integration Tests
|
||||
# =============================================================================
|
||||
|
||||
class TestPipingWorkflowIntegration(unittest.TestCase):
|
||||
"""
|
||||
Integration tests for the complete piping workflow.
|
||||
|
||||
These tests require Django to be set up and use the actual database.
|
||||
"""
|
||||
|
||||
@classmethod
|
||||
def setUpClass(cls):
|
||||
"""Set up Django and test database."""
|
||||
cls.test_dir = tempfile.mkdtemp()
|
||||
os.environ['DATA_DIR'] = cls.test_dir
|
||||
|
||||
# Initialize Django
|
||||
from archivebox.config.django import setup_django
|
||||
setup_django()
|
||||
|
||||
# Initialize the archive
|
||||
from archivebox.cli.archivebox_init import init
|
||||
init()
|
||||
|
||||
@classmethod
|
||||
def tearDownClass(cls):
|
||||
"""Clean up test database."""
|
||||
shutil.rmtree(cls.test_dir, ignore_errors=True)
|
||||
|
||||
def test_crawl_creates_and_outputs_jsonl(self):
|
||||
"""
|
||||
Test: archivebox crawl URL1 URL2 URL3
|
||||
Should create a single Crawl with all URLs and output JSONL when piped.
|
||||
"""
|
||||
from archivebox.crawls.models import Crawl
|
||||
from archivebox.misc.jsonl import TYPE_CRAWL
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
|
||||
created_by_id = get_or_create_system_user_pk()
|
||||
|
||||
# Create crawl with multiple URLs (as newline-separated string)
|
||||
urls = 'https://test-crawl-1.example.com\nhttps://test-crawl-2.example.com'
|
||||
crawl = require(Crawl.from_json({'urls': urls}, overrides={'created_by_id': created_by_id}))
|
||||
self.assertIsNotNone(crawl.id)
|
||||
self.assertEqual(crawl.urls, urls)
|
||||
self.assertEqual(crawl.status, 'queued')
|
||||
|
||||
# Verify URLs list
|
||||
urls_list = crawl.get_urls_list()
|
||||
self.assertEqual(len(urls_list), 2)
|
||||
self.assertIn('https://test-crawl-1.example.com', urls_list)
|
||||
self.assertIn('https://test-crawl-2.example.com', urls_list)
|
||||
|
||||
# Verify output format
|
||||
output = crawl.to_json()
|
||||
self.assertEqual(output['type'], TYPE_CRAWL)
|
||||
self.assertIn('id', output)
|
||||
self.assertEqual(output['urls'], urls)
|
||||
self.assertIn('schema_version', output)
|
||||
|
||||
def test_snapshot_accepts_crawl_jsonl(self):
|
||||
"""
|
||||
Test: archivebox crawl URL | archivebox snapshot
|
||||
Snapshot should accept Crawl JSONL and create Snapshots for each URL.
|
||||
"""
|
||||
from archivebox.crawls.models import Crawl
|
||||
from archivebox.core.models import Snapshot
|
||||
from archivebox.misc.jsonl import (
|
||||
read_args_or_stdin,
|
||||
TYPE_CRAWL, TYPE_SNAPSHOT
|
||||
)
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
|
||||
created_by_id = get_or_create_system_user_pk()
|
||||
|
||||
# Step 1: Create crawl (simulating 'archivebox crawl')
|
||||
urls = 'https://crawl-to-snap-1.example.com\nhttps://crawl-to-snap-2.example.com'
|
||||
crawl = require(Crawl.from_json({'urls': urls}, overrides={'created_by_id': created_by_id}))
|
||||
crawl_output = crawl.to_json()
|
||||
|
||||
# Step 2: Parse crawl output as snapshot input
|
||||
stdin = StringIO(json.dumps(crawl_output) + '\n')
|
||||
stdin.isatty = lambda: False
|
||||
|
||||
records = list(read_args_or_stdin((), stream=stdin))
|
||||
|
||||
self.assertEqual(len(records), 1)
|
||||
self.assertEqual(records[0]['type'], TYPE_CRAWL)
|
||||
|
||||
# Step 3: Create snapshots from crawl URLs
|
||||
created_snapshots = []
|
||||
for url in crawl.get_urls_list():
|
||||
snapshot = Snapshot.from_json({'url': url}, overrides={'created_by_id': created_by_id})
|
||||
if snapshot:
|
||||
created_snapshots.append(snapshot)
|
||||
|
||||
self.assertEqual(len(created_snapshots), 2)
|
||||
|
||||
# Verify snapshot output
|
||||
for snapshot in created_snapshots:
|
||||
output = snapshot.to_json()
|
||||
self.assertEqual(output['type'], TYPE_SNAPSHOT)
|
||||
self.assertIn(output['url'], [
|
||||
'https://crawl-to-snap-1.example.com',
|
||||
'https://crawl-to-snap-2.example.com'
|
||||
])
|
||||
|
||||
def test_snapshot_creates_and_outputs_jsonl(self):
|
||||
"""
|
||||
Test: archivebox snapshot URL
|
||||
Should create a Snapshot and output JSONL when piped.
|
||||
"""
|
||||
from archivebox.core.models import Snapshot
|
||||
from archivebox.misc.jsonl import (
|
||||
read_args_or_stdin, TYPE_SNAPSHOT
|
||||
)
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
|
||||
created_by_id = get_or_create_system_user_pk()
|
||||
|
||||
# Simulate input
|
||||
url = 'https://test-snapshot-1.example.com'
|
||||
records = list(read_args_or_stdin((url,)))
|
||||
|
||||
self.assertEqual(len(records), 1)
|
||||
self.assertEqual(records[0]['url'], url)
|
||||
|
||||
# Create snapshot
|
||||
overrides = {'created_by_id': created_by_id}
|
||||
snapshot = require(Snapshot.from_json(records[0], overrides=overrides))
|
||||
|
||||
self.assertIsNotNone(snapshot.id)
|
||||
self.assertEqual(snapshot.url, url)
|
||||
|
||||
# Verify output format
|
||||
output = snapshot.to_json()
|
||||
self.assertEqual(output['type'], TYPE_SNAPSHOT)
|
||||
self.assertIn('id', output)
|
||||
self.assertEqual(output['url'], url)
|
||||
|
||||
def test_extract_accepts_snapshot_from_previous_command(self):
|
||||
"""
|
||||
Test: archivebox snapshot URL | archivebox extract
|
||||
Extract should accept JSONL output from snapshot command.
|
||||
"""
|
||||
from archivebox.core.models import Snapshot
|
||||
from archivebox.misc.jsonl import (
|
||||
read_args_or_stdin,
|
||||
TYPE_SNAPSHOT
|
||||
)
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
|
||||
created_by_id = get_or_create_system_user_pk()
|
||||
|
||||
# Step 1: Create snapshot (simulating 'archivebox snapshot')
|
||||
url = 'https://test-extract-1.example.com'
|
||||
overrides = {'created_by_id': created_by_id}
|
||||
snapshot = require(Snapshot.from_json({'url': url}, overrides=overrides))
|
||||
snapshot_output = snapshot.to_json()
|
||||
|
||||
# Step 2: Parse snapshot output as extract input
|
||||
stdin = StringIO(json.dumps(snapshot_output) + '\n')
|
||||
stdin.isatty = lambda: False
|
||||
|
||||
records = list(read_args_or_stdin((), stream=stdin))
|
||||
|
||||
self.assertEqual(len(records), 1)
|
||||
self.assertEqual(records[0]['type'], TYPE_SNAPSHOT)
|
||||
self.assertEqual(records[0]['id'], str(snapshot.id))
|
||||
|
||||
# Step 3: Gather snapshot IDs (as extract does)
|
||||
snapshot_ids = set()
|
||||
for record in records:
|
||||
if record.get('type') == TYPE_SNAPSHOT and record.get('id'):
|
||||
snapshot_ids.add(record['id'])
|
||||
|
||||
self.assertIn(str(snapshot.id), snapshot_ids)
|
||||
|
||||
def test_full_pipeline_crawl_snapshot_extract(self):
|
||||
"""
|
||||
Test: archivebox crawl URL | archivebox snapshot | archivebox extract
|
||||
|
||||
This is equivalent to: archivebox add --depth=0 URL
|
||||
"""
|
||||
from archivebox.crawls.models import Crawl
|
||||
from archivebox.core.models import Snapshot
|
||||
from archivebox.misc.jsonl import (
|
||||
read_args_or_stdin,
|
||||
TYPE_CRAWL, TYPE_SNAPSHOT
|
||||
)
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
|
||||
created_by_id = get_or_create_system_user_pk()
|
||||
|
||||
# === archivebox crawl https://example.com ===
|
||||
url = 'https://test-pipeline-full.example.com'
|
||||
crawl = require(Crawl.from_json({'url': url}, overrides={'created_by_id': created_by_id}))
|
||||
crawl_jsonl = json.dumps(crawl.to_json())
|
||||
|
||||
# === | archivebox snapshot ===
|
||||
stdin = StringIO(crawl_jsonl + '\n')
|
||||
stdin.isatty = lambda: False
|
||||
|
||||
records = list(read_args_or_stdin((), stream=stdin))
|
||||
self.assertEqual(len(records), 1)
|
||||
self.assertEqual(records[0]['type'], TYPE_CRAWL)
|
||||
|
||||
# Create snapshots from crawl
|
||||
created_snapshots = []
|
||||
for record in records:
|
||||
if record.get('type') == TYPE_CRAWL:
|
||||
crawl_id = record.get('id')
|
||||
if crawl_id:
|
||||
db_crawl = Crawl.objects.get(id=crawl_id)
|
||||
for crawl_url in db_crawl.get_urls_list():
|
||||
snapshot = Snapshot.from_json({'url': crawl_url}, overrides={'created_by_id': created_by_id})
|
||||
if snapshot:
|
||||
created_snapshots.append(snapshot)
|
||||
|
||||
self.assertEqual(len(created_snapshots), 1)
|
||||
self.assertEqual(created_snapshots[0].url, url)
|
||||
|
||||
# === | archivebox extract ===
|
||||
snapshot_jsonl_lines = [json.dumps(s.to_json()) for s in created_snapshots]
|
||||
stdin = StringIO('\n'.join(snapshot_jsonl_lines) + '\n')
|
||||
stdin.isatty = lambda: False
|
||||
|
||||
records = list(read_args_or_stdin((), stream=stdin))
|
||||
self.assertEqual(len(records), 1)
|
||||
self.assertEqual(records[0]['type'], TYPE_SNAPSHOT)
|
||||
self.assertEqual(records[0]['id'], str(created_snapshots[0].id))
|
||||
|
||||
|
||||
class TestDepthWorkflows(unittest.TestCase):
|
||||
"""Test various depth crawl workflows."""
|
||||
|
||||
@classmethod
|
||||
def setUpClass(cls):
|
||||
"""Set up Django and test database."""
|
||||
cls.test_dir = tempfile.mkdtemp()
|
||||
os.environ['DATA_DIR'] = cls.test_dir
|
||||
|
||||
from archivebox.config.django import setup_django
|
||||
setup_django()
|
||||
|
||||
from archivebox.cli.archivebox_init import init
|
||||
init()
|
||||
|
||||
@classmethod
|
||||
def tearDownClass(cls):
|
||||
"""Clean up test database."""
|
||||
shutil.rmtree(cls.test_dir, ignore_errors=True)
|
||||
|
||||
def test_depth_0_workflow(self):
|
||||
"""
|
||||
Test: archivebox crawl URL | archivebox snapshot | archivebox extract
|
||||
|
||||
Depth 0: Only archive the specified URL, no recursive crawling.
|
||||
"""
|
||||
from archivebox.crawls.models import Crawl
|
||||
from archivebox.core.models import Snapshot
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
|
||||
created_by_id = get_or_create_system_user_pk()
|
||||
|
||||
# Create crawl with depth 0
|
||||
url = 'https://depth0-test.example.com'
|
||||
crawl = require(Crawl.from_json({'url': url, 'max_depth': 0}, overrides={'created_by_id': created_by_id}))
|
||||
|
||||
self.assertEqual(crawl.max_depth, 0)
|
||||
|
||||
# Create snapshot
|
||||
snapshot = require(Snapshot.from_json({'url': url}, overrides={'created_by_id': created_by_id}))
|
||||
self.assertEqual(snapshot.url, url)
|
||||
|
||||
def test_depth_metadata_in_crawl(self):
|
||||
"""Test that depth metadata is stored in Crawl."""
|
||||
from archivebox.crawls.models import Crawl
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
|
||||
created_by_id = get_or_create_system_user_pk()
|
||||
|
||||
# Create crawl with depth
|
||||
crawl = require(Crawl.from_json(
|
||||
{'url': 'https://depth-meta-test.example.com', 'max_depth': 2},
|
||||
overrides={'created_by_id': created_by_id}
|
||||
))
|
||||
|
||||
self.assertEqual(crawl.max_depth, 2)
|
||||
|
||||
# Verify in JSONL output
|
||||
output = crawl.to_json()
|
||||
self.assertEqual(output['max_depth'], 2)
|
||||
|
||||
|
||||
class TestParserPluginWorkflows(unittest.TestCase):
|
||||
"""Test workflows with specific parser plugins."""
|
||||
|
||||
@classmethod
|
||||
def setUpClass(cls):
|
||||
"""Set up Django and test database."""
|
||||
cls.test_dir = tempfile.mkdtemp()
|
||||
os.environ['DATA_DIR'] = cls.test_dir
|
||||
|
||||
from archivebox.config.django import setup_django
|
||||
setup_django()
|
||||
|
||||
from archivebox.cli.archivebox_init import init
|
||||
init()
|
||||
|
||||
@classmethod
|
||||
def tearDownClass(cls):
|
||||
"""Clean up test database."""
|
||||
shutil.rmtree(cls.test_dir, ignore_errors=True)
|
||||
|
||||
def test_html_parser_workflow(self):
|
||||
"""
|
||||
Test: archivebox crawl --plugin=parse_html_urls URL | archivebox snapshot | archivebox extract
|
||||
"""
|
||||
from archivebox.hooks import collect_urls_from_plugins
|
||||
|
||||
# Create mock output directory
|
||||
snapshot_dir = Path(self.test_dir) / 'archive' / 'html-parser-test'
|
||||
snapshot_dir.mkdir(parents=True, exist_ok=True)
|
||||
(snapshot_dir / 'parse_html_urls').mkdir(exist_ok=True)
|
||||
(snapshot_dir / 'parse_html_urls' / 'urls.jsonl').write_text(
|
||||
'{"url": "https://html-discovered.com", "title": "HTML Link"}\n'
|
||||
)
|
||||
|
||||
# Collect URLs
|
||||
discovered = collect_urls_from_plugins(snapshot_dir)
|
||||
|
||||
self.assertEqual(len(discovered), 1)
|
||||
self.assertEqual(discovered[0]['url'], 'https://html-discovered.com')
|
||||
self.assertEqual(discovered[0]['plugin'], 'parse_html_urls')
|
||||
|
||||
def test_rss_parser_workflow(self):
|
||||
"""
|
||||
Test: archivebox crawl --plugin=parse_rss_urls URL | archivebox snapshot | archivebox extract
|
||||
"""
|
||||
from archivebox.hooks import collect_urls_from_plugins
|
||||
|
||||
# Create mock output directory
|
||||
snapshot_dir = Path(self.test_dir) / 'archive' / 'rss-parser-test'
|
||||
snapshot_dir.mkdir(parents=True, exist_ok=True)
|
||||
(snapshot_dir / 'parse_rss_urls').mkdir(exist_ok=True)
|
||||
(snapshot_dir / 'parse_rss_urls' / 'urls.jsonl').write_text(
|
||||
'{"url": "https://rss-item-1.com", "title": "RSS Item 1"}\n'
|
||||
'{"url": "https://rss-item-2.com", "title": "RSS Item 2"}\n'
|
||||
)
|
||||
|
||||
# Collect URLs
|
||||
discovered = collect_urls_from_plugins(snapshot_dir)
|
||||
|
||||
self.assertEqual(len(discovered), 2)
|
||||
self.assertTrue(all(d['plugin'] == 'parse_rss_urls' for d in discovered))
|
||||
|
||||
def test_multiple_parsers_dedupe(self):
|
||||
"""
|
||||
Multiple parsers may discover the same URL - should be deduplicated.
|
||||
"""
|
||||
from archivebox.hooks import collect_urls_from_plugins
|
||||
|
||||
# Create mock output with duplicate URLs from different parsers
|
||||
snapshot_dir = Path(self.test_dir) / 'archive' / 'dedupe-test'
|
||||
snapshot_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
(snapshot_dir / 'parse_html_urls').mkdir(exist_ok=True)
|
||||
(snapshot_dir / 'parse_html_urls' / 'urls.jsonl').write_text(
|
||||
'{"url": "https://same-url.com"}\n'
|
||||
)
|
||||
|
||||
(snapshot_dir / 'wget').mkdir(exist_ok=True)
|
||||
(snapshot_dir / 'wget' / 'urls.jsonl').write_text(
|
||||
'{"url": "https://same-url.com"}\n' # Same URL, different extractor
|
||||
)
|
||||
|
||||
# Collect URLs
|
||||
all_discovered = collect_urls_from_plugins(snapshot_dir)
|
||||
|
||||
# Both entries are returned (deduplication happens at the crawl command level)
|
||||
self.assertEqual(len(all_discovered), 2)
|
||||
|
||||
# Verify both extractors found the same URL
|
||||
urls = {d['url'] for d in all_discovered}
|
||||
self.assertEqual(urls, {'https://same-url.com'})
|
||||
|
||||
|
||||
class TestEdgeCases(unittest.TestCase):
|
||||
"""Test edge cases and error handling."""
|
||||
|
||||
@@ -858,8 +469,7 @@ class TestEdgeCases(unittest.TestCase):
|
||||
from archivebox.misc.jsonl import read_args_or_stdin
|
||||
|
||||
# Empty args, TTY stdin (should not block)
|
||||
stdin = StringIO('')
|
||||
stdin.isatty = lambda: True
|
||||
stdin = MockTTYStringIO('', is_tty=True)
|
||||
|
||||
records = list(read_args_or_stdin((), stream=stdin))
|
||||
self.assertEqual(len(records), 0)
|
||||
@@ -868,12 +478,12 @@ class TestEdgeCases(unittest.TestCase):
|
||||
"""Should skip malformed JSONL lines."""
|
||||
from archivebox.misc.jsonl import read_args_or_stdin
|
||||
|
||||
stdin = StringIO(
|
||||
stdin = MockTTYStringIO(
|
||||
'{"url": "https://good.com"}\n'
|
||||
'not valid json\n'
|
||||
'{"url": "https://also-good.com"}\n'
|
||||
'{"url": "https://also-good.com"}\n',
|
||||
is_tty=False,
|
||||
)
|
||||
stdin.isatty = lambda: False
|
||||
|
||||
records = list(read_args_or_stdin((), stream=stdin))
|
||||
|
||||
@@ -885,12 +495,12 @@ class TestEdgeCases(unittest.TestCase):
|
||||
"""Should handle mixed URLs and JSONL."""
|
||||
from archivebox.misc.jsonl import read_args_or_stdin
|
||||
|
||||
stdin = StringIO(
|
||||
stdin = MockTTYStringIO(
|
||||
'https://plain-url.com\n'
|
||||
'{"type": "Snapshot", "url": "https://jsonl-url.com", "tags": "test"}\n'
|
||||
'01234567-89ab-cdef-0123-456789abcdef\n' # UUID
|
||||
'01234567-89ab-cdef-0123-456789abcdef\n', # UUID
|
||||
is_tty=False,
|
||||
)
|
||||
stdin.isatty = lambda: False
|
||||
|
||||
records = list(read_args_or_stdin((), stream=stdin))
|
||||
|
||||
@@ -942,12 +552,12 @@ class TestPassThroughBehavior(unittest.TestCase):
|
||||
url_record = {'url': 'https://example.com'}
|
||||
|
||||
# Mock stdin with both records
|
||||
stdin = StringIO(
|
||||
stdin = MockTTYStringIO(
|
||||
json.dumps(tag_record)
|
||||
+ '\n'
|
||||
+ json.dumps(url_record)
|
||||
+ json.dumps(url_record),
|
||||
is_tty=False,
|
||||
)
|
||||
stdin.isatty = lambda: False
|
||||
|
||||
# The Tag should be passed through, the URL should create a Crawl
|
||||
# (This is a unit test of the pass-through logic)
|
||||
|
||||
@@ -5,6 +5,7 @@ import pwd
|
||||
import sys
|
||||
import socket
|
||||
import platform
|
||||
from typing import cast
|
||||
|
||||
from rich import print
|
||||
|
||||
@@ -32,7 +33,7 @@ EGID = os.getegid()
|
||||
SUDO_UID = int(os.environ.get('SUDO_UID', 0))
|
||||
SUDO_GID = int(os.environ.get('SUDO_GID', 0))
|
||||
USER: str = Path('~').expanduser().resolve().name
|
||||
HOSTNAME: str = max([socket.gethostname(), platform.node()], key=len)
|
||||
HOSTNAME: str = cast(str, max([socket.gethostname(), platform.node()], key=len))
|
||||
|
||||
IS_ROOT = RUNNING_AS_UID == 0
|
||||
IN_DOCKER = os.environ.get('IN_DOCKER', False) in ('1', 'true', 'True', 'TRUE', 'yes')
|
||||
|
||||
@@ -33,6 +33,11 @@ def is_superuser(request: HttpRequest) -> bool:
|
||||
return bool(getattr(request.user, 'is_superuser', False))
|
||||
|
||||
|
||||
def format_parsed_datetime(value: object) -> str:
|
||||
parsed = parse_date(value)
|
||||
return parsed.strftime("%Y-%m-%d %H:%M:%S") if parsed else ""
|
||||
|
||||
|
||||
def obj_to_yaml(obj: Any, indent: int = 0) -> str:
|
||||
indent_str = " " * indent
|
||||
if indent == 0:
|
||||
@@ -412,7 +417,7 @@ def worker_list_view(request: HttpRequest, **kwargs) -> TableContext:
|
||||
rows["Name"].append(ItemLink(proc_name, key=proc_name))
|
||||
rows["State"].append(str(proc_data.get("statename") or ""))
|
||||
rows['PID'].append(proc_description.replace('pid ', ''))
|
||||
rows["Started"].append(parse_date(proc_start).strftime("%Y-%m-%d %H:%M:%S") if proc_start else '')
|
||||
rows["Started"].append(format_parsed_datetime(proc_start))
|
||||
rows["Command"].append(str(proc_config.get("command") or ""))
|
||||
rows["Logfile"].append(
|
||||
format_html(
|
||||
@@ -458,7 +463,8 @@ def worker_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
|
||||
relevant_config = CONFIG_FILE.read_text()
|
||||
relevant_logs = str(supervisor.readLog(0, 10_000_000))
|
||||
start_ts = [line for line in relevant_logs.split("\n") if "RPC interface 'supervisor' initialized" in line][-1].split(",", 1)[0]
|
||||
uptime = str(timezone.now() - parse_date(start_ts)).split(".")[0]
|
||||
start_dt = parse_date(start_ts)
|
||||
uptime = str(timezone.now() - start_dt).split(".")[0] if start_dt else ""
|
||||
supervisor_state = supervisor.getState()
|
||||
|
||||
proc: Dict[str, object] = {
|
||||
@@ -485,8 +491,8 @@ def worker_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
|
||||
"Command": str(proc.get("name") or ""),
|
||||
"PID": str(proc.get("pid") or ""),
|
||||
"State": str(proc.get("statename") or ""),
|
||||
"Started": parse_date(proc.get("start")).strftime("%Y-%m-%d %H:%M:%S") if proc.get("start") else "",
|
||||
"Stopped": parse_date(proc.get("stop")).strftime("%Y-%m-%d %H:%M:%S") if proc.get("stop") else "",
|
||||
"Started": format_parsed_datetime(proc.get("start")),
|
||||
"Stopped": format_parsed_datetime(proc.get("stop")),
|
||||
"Exit Status": str(proc.get("exitstatus") or ""),
|
||||
"Logfile": str(proc.get("stdout_logfile") or ""),
|
||||
"Uptime": str(str(proc.get("description") or "").split("uptime ", 1)[-1]),
|
||||
@@ -524,7 +530,7 @@ def log_list_view(request: HttpRequest, **kwargs) -> TableContext:
|
||||
for logfile in log_files:
|
||||
st = logfile.stat()
|
||||
rows["Name"].append(ItemLink("logs" + str(logfile).rsplit("/logs", 1)[-1], key=logfile.name))
|
||||
rows["Last Updated"].append(parse_date(st.st_mtime).strftime("%Y-%m-%d %H:%M:%S"))
|
||||
rows["Last Updated"].append(format_parsed_datetime(st.st_mtime))
|
||||
rows["Size"].append(f'{st.st_size//1000} kb')
|
||||
|
||||
with open(logfile, 'rb') as f:
|
||||
@@ -557,7 +563,7 @@ def log_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
|
||||
"fields": {
|
||||
"Path": str(log_file),
|
||||
"Size": f"{log_stat.st_size//1000} kb",
|
||||
"Last Updated": parse_date(log_stat.st_mtime).strftime("%Y-%m-%d %H:%M:%S"),
|
||||
"Last Updated": format_parsed_datetime(log_stat.st_mtime),
|
||||
"Tail": "\n".join(log_text[-10_000:].split("\n")[-20:]),
|
||||
"Full Log": log_text,
|
||||
},
|
||||
|
||||
@@ -1,7 +1,20 @@
|
||||
__package__ = 'archivebox.core'
|
||||
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
from django.contrib import admin
|
||||
from admin_data_views.admin import get_app_list, admin_data_index_view, get_admin_data_urls, get_urls
|
||||
from admin_data_views.admin import (
|
||||
admin_data_index_view as adv_admin_data_index_view,
|
||||
get_admin_data_urls as adv_get_admin_data_urls,
|
||||
get_app_list as adv_get_app_list,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from django.http import HttpRequest
|
||||
from django.template.response import TemplateResponse
|
||||
from django.urls import URLPattern, URLResolver
|
||||
|
||||
from admin_data_views.typing import AppDict
|
||||
|
||||
|
||||
class ArchiveBoxAdmin(admin.AdminSite):
|
||||
@@ -10,6 +23,20 @@ class ArchiveBoxAdmin(admin.AdminSite):
|
||||
site_title = 'Admin'
|
||||
namespace = 'admin'
|
||||
|
||||
def get_app_list(self, request: 'HttpRequest', app_label: str | None = None) -> list['AppDict']:
|
||||
if app_label is None:
|
||||
return adv_get_app_list(self, request)
|
||||
return adv_get_app_list(self, request, app_label)
|
||||
|
||||
def admin_data_index_view(self, request: 'HttpRequest', **kwargs: Any) -> 'TemplateResponse':
|
||||
return adv_admin_data_index_view(self, request, **kwargs)
|
||||
|
||||
def get_admin_data_urls(self) -> list['URLResolver | URLPattern']:
|
||||
return adv_get_admin_data_urls(self)
|
||||
|
||||
def get_urls(self) -> list['URLResolver | URLPattern']:
|
||||
return self.get_admin_data_urls() + super().get_urls()
|
||||
|
||||
|
||||
archivebox_admin = ArchiveBoxAdmin()
|
||||
# Note: delete_selected is enabled per-model via actions = ['delete_selected'] in each ModelAdmin
|
||||
@@ -17,13 +44,6 @@ archivebox_admin = ArchiveBoxAdmin()
|
||||
|
||||
|
||||
|
||||
# patch admin with methods to add data views (implemented by admin_data_views package)
|
||||
# https://github.com/MrThearMan/django-admin-data-views
|
||||
# https://mrthearman.github.io/django-admin-data-views/setup/
|
||||
archivebox_admin.get_app_list = get_app_list.__get__(archivebox_admin, ArchiveBoxAdmin)
|
||||
archivebox_admin.admin_data_index_view = admin_data_index_view.__get__(archivebox_admin, ArchiveBoxAdmin) # type: ignore
|
||||
archivebox_admin.get_admin_data_urls = get_admin_data_urls.__get__(archivebox_admin, ArchiveBoxAdmin) # type: ignore
|
||||
archivebox_admin.get_urls = get_urls(archivebox_admin.get_urls).__get__(archivebox_admin, ArchiveBoxAdmin)
|
||||
############### Admin Data View sections are defined in settings.ADMIN_DATA_VIEWS #########
|
||||
|
||||
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
__package__ = 'archivebox.core'
|
||||
|
||||
from typing import Optional, Dict, Iterable, Any, List
|
||||
from typing import Optional, Dict, Iterable, Any, List, Sequence, cast
|
||||
import uuid
|
||||
from archivebox.uuid_compat import uuid7
|
||||
from datetime import datetime, timedelta
|
||||
from django_stubs_ext.db.models import TypedModelMeta
|
||||
|
||||
import os
|
||||
import json
|
||||
@@ -20,6 +20,7 @@ from django.core.cache import cache
|
||||
from django.urls import reverse_lazy
|
||||
from django.contrib import admin
|
||||
from django.conf import settings
|
||||
from django.utils.safestring import mark_safe
|
||||
|
||||
from archivebox.config import CONSTANTS
|
||||
from archivebox.misc.system import get_dir_size, atomic_write
|
||||
@@ -51,7 +52,7 @@ class Tag(ModelWithUUID):
|
||||
|
||||
snapshot_set: models.Manager['Snapshot']
|
||||
|
||||
class Meta(TypedModelMeta):
|
||||
class Meta(ModelWithUUID.Meta):
|
||||
app_label = 'core'
|
||||
verbose_name = "Tag"
|
||||
verbose_name_plural = "Tags"
|
||||
@@ -88,7 +89,7 @@ class Tag(ModelWithUUID):
|
||||
|
||||
@property
|
||||
def api_url(self) -> str:
|
||||
return reverse_lazy('api-1:get_tag', args=[self.id])
|
||||
return str(reverse_lazy('api-1:get_tag', args=[self.id]))
|
||||
|
||||
def to_json(self) -> dict:
|
||||
"""
|
||||
@@ -104,7 +105,7 @@ class Tag(ModelWithUUID):
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def from_json(record: Dict[str, Any], overrides: Dict[str, Any] = None):
|
||||
def from_json(record: Dict[str, Any], overrides: Dict[str, Any] | None = None):
|
||||
"""
|
||||
Create/update Tag from JSON dict.
|
||||
|
||||
@@ -259,7 +260,7 @@ class SnapshotQuerySet(models.QuerySet):
|
||||
})
|
||||
|
||||
|
||||
class SnapshotManager(models.Manager.from_queryset(SnapshotQuerySet)):
|
||||
class SnapshotManager(models.Manager.from_queryset(SnapshotQuerySet)): # ty: ignore[unsupported-base]
|
||||
"""Manager for Snapshot model - uses SnapshotQuerySet for chainable methods"""
|
||||
|
||||
def filter(self, *args, **kwargs):
|
||||
@@ -283,8 +284,8 @@ class SnapshotManager(models.Manager.from_queryset(SnapshotQuerySet)):
|
||||
from django.db import transaction
|
||||
if atomic:
|
||||
with transaction.atomic():
|
||||
return self.delete()
|
||||
return self.delete()
|
||||
return self.get_queryset().delete()
|
||||
return self.get_queryset().delete()
|
||||
|
||||
|
||||
class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine):
|
||||
@@ -318,10 +319,20 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
StatusChoices = ModelWithStateMachine.StatusChoices
|
||||
active_state = StatusChoices.STARTED
|
||||
|
||||
crawl_id: uuid.UUID
|
||||
parent_snapshot_id: uuid.UUID | None
|
||||
_prefetched_objects_cache: dict[str, Any]
|
||||
|
||||
objects = SnapshotManager()
|
||||
archiveresult_set: models.Manager['ArchiveResult']
|
||||
|
||||
class Meta(TypedModelMeta):
|
||||
class Meta(
|
||||
ModelWithOutputDir.Meta,
|
||||
ModelWithConfig.Meta,
|
||||
ModelWithNotes.Meta,
|
||||
ModelWithHealthStats.Meta,
|
||||
ModelWithStateMachine.Meta,
|
||||
):
|
||||
app_label = 'core'
|
||||
verbose_name = "Snapshot"
|
||||
verbose_name_plural = "Snapshots"
|
||||
@@ -663,6 +674,8 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
candidates = cls.objects.filter(url=url, timestamp__startswith=timestamp)
|
||||
if candidates.count() == 1:
|
||||
snapshot = candidates.first()
|
||||
if snapshot is None:
|
||||
return None
|
||||
print(f"[DEBUG load_from_directory] Found via fuzzy match: {snapshot.timestamp}")
|
||||
return snapshot
|
||||
elif candidates.count() > 1:
|
||||
@@ -751,14 +764,16 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _select_best_timestamp(index_timestamp: str, folder_name: str) -> Optional[str]:
|
||||
def _select_best_timestamp(index_timestamp: object | None, folder_name: str) -> Optional[str]:
|
||||
"""
|
||||
Select best timestamp from index.json vs folder name.
|
||||
|
||||
Validates range (1995-2035).
|
||||
Prefers index.json if valid.
|
||||
"""
|
||||
def is_valid_timestamp(ts):
|
||||
def is_valid_timestamp(ts: object | None) -> bool:
|
||||
if not isinstance(ts, (str, int, float)):
|
||||
return False
|
||||
try:
|
||||
ts_int = int(float(ts))
|
||||
# 1995-01-01 to 2035-12-31
|
||||
@@ -769,12 +784,11 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
index_valid = is_valid_timestamp(index_timestamp) if index_timestamp else False
|
||||
folder_valid = is_valid_timestamp(folder_name)
|
||||
|
||||
if index_valid:
|
||||
return str(int(float(index_timestamp)))
|
||||
elif folder_valid:
|
||||
return str(int(float(folder_name)))
|
||||
else:
|
||||
return None
|
||||
if index_valid and index_timestamp is not None:
|
||||
return str(int(float(str(index_timestamp))))
|
||||
if folder_valid:
|
||||
return str(int(float(str(folder_name))))
|
||||
return None
|
||||
|
||||
@classmethod
|
||||
def _ensure_unique_timestamp(cls, url: str, timestamp: str) -> str:
|
||||
@@ -1039,7 +1053,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
)
|
||||
|
||||
index_path = Path(self.output_dir) / CONSTANTS.JSONL_INDEX_FILENAME
|
||||
result = {
|
||||
result: dict[str, Any] = {
|
||||
'snapshot': None,
|
||||
'archive_results': [],
|
||||
'binaries': [],
|
||||
@@ -1210,7 +1224,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
return merged
|
||||
|
||||
@classmethod
|
||||
def _merge_snapshots(cls, snapshots: list['Snapshot']):
|
||||
def _merge_snapshots(cls, snapshots: Sequence['Snapshot']):
|
||||
"""
|
||||
Merge exact duplicates.
|
||||
Keep oldest, union files + ArchiveResults.
|
||||
@@ -1271,19 +1285,21 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
@admin.display(description='Tags')
|
||||
def tags_str(self, nocache=True) -> str | None:
|
||||
calc_tags_str = lambda: ','.join(sorted(tag.name for tag in self.tags.all()))
|
||||
if hasattr(self, '_prefetched_objects_cache') and 'tags' in self._prefetched_objects_cache:
|
||||
prefetched_cache = getattr(self, '_prefetched_objects_cache', {})
|
||||
if 'tags' in prefetched_cache:
|
||||
return calc_tags_str()
|
||||
cache_key = f'{self.pk}-tags'
|
||||
return cache.get_or_set(cache_key, calc_tags_str) if not nocache else calc_tags_str()
|
||||
|
||||
def icons(self, path: Optional[str] = None) -> str:
|
||||
"""Generate HTML icons showing which extractor plugins have succeeded for this snapshot"""
|
||||
from django.utils.html import format_html, mark_safe
|
||||
from django.utils.html import format_html
|
||||
|
||||
cache_key = f'result_icons:{self.pk}:{(self.downloaded_at or self.modified_at or self.created_at or self.bookmarked_at).timestamp()}'
|
||||
|
||||
def calc_icons():
|
||||
if hasattr(self, '_prefetched_objects_cache') and 'archiveresult_set' in self._prefetched_objects_cache:
|
||||
prefetched_cache = getattr(self, '_prefetched_objects_cache', {})
|
||||
if 'archiveresult_set' in prefetched_cache:
|
||||
archive_results = {r.plugin: r for r in self.archiveresult_set.all() if r.status == "succeeded" and (r.output_files or r.output_str)}
|
||||
else:
|
||||
# Filter for results that have either output_files or output_str
|
||||
@@ -1331,7 +1347,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
|
||||
@property
|
||||
def api_url(self) -> str:
|
||||
return reverse_lazy('api-1:get_snapshot', args=[self.id])
|
||||
return str(reverse_lazy('api-1:get_snapshot', args=[self.id]))
|
||||
|
||||
def get_absolute_url(self):
|
||||
return f'/{self.archive_path}'
|
||||
@@ -1341,23 +1357,28 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
return url_domain(self.url)
|
||||
|
||||
@property
|
||||
def output_dir(self):
|
||||
def title_stripped(self) -> str:
|
||||
return (self.title or '').strip()
|
||||
|
||||
@property
|
||||
def output_dir(self) -> Path:
|
||||
"""The filesystem path to the snapshot's output directory."""
|
||||
import os
|
||||
|
||||
current_path = self.get_storage_path_for_version(self.fs_version)
|
||||
|
||||
if current_path.exists():
|
||||
return str(current_path)
|
||||
return current_path
|
||||
|
||||
# Check for backwards-compat symlink
|
||||
old_path = CONSTANTS.ARCHIVE_DIR / self.timestamp
|
||||
if old_path.is_symlink():
|
||||
return str(Path(os.readlink(old_path)).resolve())
|
||||
link_target = Path(os.readlink(old_path))
|
||||
return (old_path.parent / link_target).resolve() if not link_target.is_absolute() else link_target.resolve()
|
||||
elif old_path.exists():
|
||||
return str(old_path)
|
||||
return old_path
|
||||
|
||||
return str(current_path)
|
||||
return current_path
|
||||
|
||||
def ensure_legacy_archive_symlink(self) -> None:
|
||||
"""Ensure the legacy archive/<timestamp> path resolves to this snapshot."""
|
||||
@@ -1405,7 +1426,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
date_base = crawl.created_at or self.created_at or timezone.now()
|
||||
date_str = date_base.strftime('%Y%m%d')
|
||||
domain = self.extract_domain_from_url(self.url)
|
||||
username = crawl.created_by.username if crawl.created_by_id else 'system'
|
||||
username = crawl.created_by.username if getattr(crawl, 'created_by_id', None) else 'system'
|
||||
|
||||
crawl_dir = DATA_DIR / 'users' / username / 'crawls' / date_str / domain / str(crawl.id)
|
||||
link_path = crawl_dir / 'snapshots' / domain / str(self.id)
|
||||
@@ -1591,7 +1612,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def from_json(record: Dict[str, Any], overrides: Dict[str, Any] = None, queue_for_extraction: bool = True):
|
||||
def from_json(record: Dict[str, Any], overrides: Dict[str, Any] | None = None, queue_for_extraction: bool = True):
|
||||
"""
|
||||
Create/update Snapshot from JSON dict.
|
||||
|
||||
@@ -1859,7 +1880,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
'is_sealed': is_sealed,
|
||||
}
|
||||
|
||||
def retry_failed_archiveresults(self, retry_at: Optional['timezone.datetime'] = None) -> int:
|
||||
def retry_failed_archiveresults(self, retry_at: Optional[datetime] = None) -> int:
|
||||
"""
|
||||
Reset failed/skipped ArchiveResults to queued for retry.
|
||||
|
||||
@@ -2163,20 +2184,20 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
cols = cols or ['timestamp', 'is_archived', 'url']
|
||||
return separator.join(to_json(data.get(col, ''), indent=None).ljust(ljust) for col in cols)
|
||||
|
||||
def write_json_details(self, out_dir: Optional[str] = None) -> None:
|
||||
def write_json_details(self, out_dir: Path | str | None = None) -> None:
|
||||
"""Write JSON index file for this snapshot to its output directory"""
|
||||
out_dir = out_dir or self.output_dir
|
||||
path = Path(out_dir) / CONSTANTS.JSON_INDEX_FILENAME
|
||||
output_dir = Path(out_dir) if out_dir is not None else self.output_dir
|
||||
path = output_dir / CONSTANTS.JSON_INDEX_FILENAME
|
||||
atomic_write(str(path), self.to_dict(extended=True))
|
||||
|
||||
def write_html_details(self, out_dir: Optional[str] = None) -> None:
|
||||
def write_html_details(self, out_dir: Path | str | None = None) -> None:
|
||||
"""Write HTML detail page for this snapshot to its output directory"""
|
||||
from django.template.loader import render_to_string
|
||||
from archivebox.config.common import SERVER_CONFIG
|
||||
from archivebox.config.configset import get_config
|
||||
from archivebox.misc.logging_util import printable_filesize
|
||||
|
||||
out_dir = out_dir or self.output_dir
|
||||
output_dir = Path(out_dir) if out_dir is not None else self.output_dir
|
||||
config = get_config()
|
||||
SAVE_ARCHIVE_DOT_ORG = config.get('SAVE_ARCHIVE_DOT_ORG', True)
|
||||
TITLE_LOADING_MSG = 'Not yet archived...'
|
||||
@@ -2198,12 +2219,12 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
for plugin in preview_priority:
|
||||
out = outputs_by_plugin.get(plugin)
|
||||
if out and out.get('path'):
|
||||
best_preview_path = out['path']
|
||||
best_preview_path = str(out['path'])
|
||||
best_result = out
|
||||
break
|
||||
|
||||
if best_preview_path == 'about:blank' and outputs:
|
||||
best_preview_path = outputs[0].get('path') or 'about:blank'
|
||||
best_preview_path = str(outputs[0].get('path') or 'about:blank')
|
||||
best_result = outputs[0]
|
||||
context = {
|
||||
**self.to_dict(extended=True),
|
||||
@@ -2223,7 +2244,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
'archiveresults': outputs,
|
||||
}
|
||||
rendered_html = render_to_string('snapshot.html', context)
|
||||
atomic_write(str(Path(out_dir) / CONSTANTS.HTML_INDEX_FILENAME), rendered_html)
|
||||
atomic_write(str(output_dir / CONSTANTS.HTML_INDEX_FILENAME), rendered_html)
|
||||
|
||||
# =========================================================================
|
||||
# Helper Methods
|
||||
@@ -2285,6 +2306,8 @@ class SnapshotMachine(BaseStateMachine):
|
||||
# Manual event (can also be triggered by last ArchiveResult finishing)
|
||||
seal = started.to(sealed)
|
||||
|
||||
snapshot: Snapshot
|
||||
|
||||
def can_start(self) -> bool:
|
||||
can_start = bool(self.snapshot.url)
|
||||
return can_start
|
||||
@@ -2332,7 +2355,7 @@ class SnapshotMachine(BaseStateMachine):
|
||||
if remaining_active == 0 and crawl.status == crawl.StatusChoices.STARTED:
|
||||
print(f'[cyan]🔒 All snapshots sealed for crawl {crawl.id}, sealing crawl[/cyan]', file=sys.stderr)
|
||||
# Seal the parent crawl
|
||||
crawl.sm.seal()
|
||||
cast(Any, crawl).sm.seal()
|
||||
|
||||
|
||||
class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithStateMachine):
|
||||
@@ -2391,7 +2414,15 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
state_field_name = 'status'
|
||||
active_state = StatusChoices.STARTED
|
||||
|
||||
class Meta(TypedModelMeta):
|
||||
snapshot_id: uuid.UUID
|
||||
process_id: uuid.UUID | None
|
||||
|
||||
class Meta(
|
||||
ModelWithOutputDir.Meta,
|
||||
ModelWithConfig.Meta,
|
||||
ModelWithNotes.Meta,
|
||||
ModelWithStateMachine.Meta,
|
||||
):
|
||||
app_label = 'core'
|
||||
verbose_name = 'Archive Result'
|
||||
verbose_name_plural = 'Archive Results Log'
|
||||
@@ -2442,7 +2473,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
return record
|
||||
|
||||
@staticmethod
|
||||
def from_json(record: Dict[str, Any], overrides: Dict[str, Any] = None):
|
||||
def from_json(record: Dict[str, Any], overrides: Dict[str, Any] | None = None):
|
||||
"""
|
||||
Create/update ArchiveResult from JSON dict.
|
||||
|
||||
@@ -2469,7 +2500,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
|
||||
# Get or create by snapshot_id + plugin
|
||||
try:
|
||||
from archivebox.core.models import Snapshot
|
||||
snapshot = Snapshot.objects.get(id=snapshot_id)
|
||||
|
||||
result, _ = ArchiveResult.objects.get_or_create(
|
||||
@@ -2531,7 +2561,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
|
||||
@property
|
||||
def api_url(self) -> str:
|
||||
return reverse_lazy('api-1:get_archiveresult', args=[self.id])
|
||||
return str(reverse_lazy('api-1:get_archiveresult', args=[self.id]))
|
||||
|
||||
def get_absolute_url(self):
|
||||
return f'/{self.snapshot.archive_path}/{self.plugin}'
|
||||
@@ -3198,6 +3228,8 @@ class ArchiveResultMachine(BaseStateMachine):
|
||||
# Reason: backoff should always retry→started, then started→final states
|
||||
)
|
||||
|
||||
archiveresult: ArchiveResult
|
||||
|
||||
def can_start(self) -> bool:
|
||||
"""Pure function - check if AR can start (has valid URL)."""
|
||||
return bool(self.archiveresult.snapshot.url)
|
||||
@@ -3259,7 +3291,7 @@ class ArchiveResultMachine(BaseStateMachine):
|
||||
process = self.archiveresult.process
|
||||
|
||||
# If process is NOT running anymore, reap the background hook
|
||||
if not process.is_running():
|
||||
if not process.is_running:
|
||||
self.archiveresult.update_from_output()
|
||||
# Check if now in final state after reaping
|
||||
return self.archiveresult.status in (
|
||||
@@ -3331,7 +3363,7 @@ class ArchiveResultMachine(BaseStateMachine):
|
||||
if remaining_active == 0:
|
||||
print(f'[cyan] 🔒 All archiveresults finished for snapshot {snapshot.url}, sealing snapshot[/cyan]', file=sys.stderr)
|
||||
# Seal the parent snapshot
|
||||
snapshot.sm.seal()
|
||||
cast(Any, snapshot).sm.seal()
|
||||
|
||||
@succeeded.enter
|
||||
def enter_succeeded(self):
|
||||
|
||||
@@ -3,6 +3,8 @@ __package__ = "archivebox.core"
|
||||
import os
|
||||
import sys
|
||||
import inspect
|
||||
import importlib
|
||||
from typing import Any, cast
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
@@ -119,8 +121,8 @@ try:
|
||||
|
||||
try:
|
||||
# Try to import django-auth-ldap (will fail if not installed)
|
||||
from django_auth_ldap.config import LDAPSearch
|
||||
import ldap
|
||||
LDAPSearch = importlib.import_module("django_auth_ldap.config").LDAPSearch
|
||||
ldap = importlib.import_module("ldap")
|
||||
|
||||
# Configure LDAP authentication
|
||||
AUTH_LDAP_SERVER_URI = LDAP_CONFIG.LDAP_SERVER_URI
|
||||
@@ -130,7 +132,7 @@ try:
|
||||
# Configure user search
|
||||
AUTH_LDAP_USER_SEARCH = LDAPSearch(
|
||||
LDAP_CONFIG.LDAP_USER_BASE,
|
||||
ldap.SCOPE_SUBTREE,
|
||||
getattr(ldap, "SCOPE_SUBTREE", 2),
|
||||
LDAP_CONFIG.LDAP_USER_FILTER,
|
||||
)
|
||||
|
||||
@@ -432,7 +434,7 @@ LOGGING = SETTINGS_LOGGING
|
||||
|
||||
# Add default webhook configuration to the User model
|
||||
SIGNAL_WEBHOOKS_CUSTOM_MODEL = "archivebox.api.models.OutboundWebhook"
|
||||
SIGNAL_WEBHOOKS = {
|
||||
SIGNAL_WEBHOOKS: dict[str, object] = {
|
||||
"HOOKS": {
|
||||
# ... is a special sigil value that means "use the default autogenerated hooks"
|
||||
"django.contrib.auth.models.User": ...,
|
||||
@@ -444,7 +446,8 @@ SIGNAL_WEBHOOKS = {
|
||||
}
|
||||
|
||||
# Avoid background threads touching sqlite connections (especially during tests/migrations).
|
||||
if DATABASES["default"]["ENGINE"].endswith("sqlite3"):
|
||||
default_database = cast(dict[str, Any], DATABASES["default"])
|
||||
if str(default_database["ENGINE"]).endswith("sqlite3"):
|
||||
SIGNAL_WEBHOOKS["TASK_HANDLER"] = "signal_webhooks.handlers.sync_task_handler"
|
||||
|
||||
################################################################################
|
||||
@@ -551,10 +554,8 @@ if DEBUG_TOOLBAR:
|
||||
MIDDLEWARE = [*MIDDLEWARE, "debug_toolbar.middleware.DebugToolbarMiddleware"]
|
||||
|
||||
if DEBUG:
|
||||
from django_autotyping.typing import AutotypingSettingsDict
|
||||
|
||||
INSTALLED_APPS += ["django_autotyping"]
|
||||
AUTOTYPING: AutotypingSettingsDict = {
|
||||
AUTOTYPING = {
|
||||
"STUBS_GENERATION": {
|
||||
"LOCAL_STUBS_DIR": PACKAGE_DIR / "typings",
|
||||
}
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
"""Template tags for accessing config values in templates."""
|
||||
|
||||
from typing import Any
|
||||
|
||||
from django import template
|
||||
|
||||
from archivebox.config.configset import get_config as _get_config
|
||||
@@ -8,7 +10,7 @@ register = template.Library()
|
||||
|
||||
|
||||
@register.simple_tag
|
||||
def get_config(key: str) -> any:
|
||||
def get_config(key: str) -> Any:
|
||||
"""
|
||||
Get a config value by key.
|
||||
|
||||
|
||||
@@ -4,6 +4,9 @@ import importlib
|
||||
import os
|
||||
import django
|
||||
from unittest.mock import patch
|
||||
from typing import TypeVar, cast
|
||||
|
||||
from django.forms import BaseForm
|
||||
|
||||
# Set up Django before importing any Django-dependent modules
|
||||
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
|
||||
@@ -18,6 +21,14 @@ CrawlSchedule = importlib.import_module('archivebox.crawls.models').CrawlSchedul
|
||||
Tag = importlib.import_module('archivebox.core.models').Tag
|
||||
SERVER_CONFIG = importlib.import_module('archivebox.config.common').SERVER_CONFIG
|
||||
|
||||
T = TypeVar('T')
|
||||
|
||||
|
||||
def require(value: T | None) -> T:
|
||||
if value is None:
|
||||
raise AssertionError('Expected value to be present')
|
||||
return value
|
||||
|
||||
|
||||
class AddViewTests(TestCase):
|
||||
"""Tests for the AddView (crawl creation form)."""
|
||||
@@ -111,7 +122,7 @@ class AddViewTests(TestCase):
|
||||
|
||||
# Check that crawl was created
|
||||
self.assertEqual(Crawl.objects.count(), 1)
|
||||
crawl = Crawl.objects.first()
|
||||
crawl = require(Crawl.objects.first())
|
||||
|
||||
self.assertIn('https://example.com', crawl.urls)
|
||||
self.assertIn('https://example.org', crawl.urls)
|
||||
@@ -140,8 +151,8 @@ class AddViewTests(TestCase):
|
||||
self.assertEqual(Crawl.objects.count(), 1)
|
||||
self.assertEqual(CrawlSchedule.objects.count(), 1)
|
||||
|
||||
crawl = Crawl.objects.first()
|
||||
schedule = CrawlSchedule.objects.first()
|
||||
crawl = require(Crawl.objects.first())
|
||||
schedule = require(CrawlSchedule.objects.first())
|
||||
|
||||
self.assertEqual(crawl.schedule, schedule)
|
||||
self.assertEqual(schedule.template, crawl)
|
||||
@@ -159,7 +170,7 @@ class AddViewTests(TestCase):
|
||||
|
||||
self.assertEqual(response.status_code, 302)
|
||||
|
||||
schedule = CrawlSchedule.objects.first()
|
||||
schedule = require(CrawlSchedule.objects.first())
|
||||
self.assertEqual(schedule.schedule, '0 */6 * * *')
|
||||
|
||||
def test_add_crawl_with_plugins(self):
|
||||
@@ -173,7 +184,7 @@ class AddViewTests(TestCase):
|
||||
|
||||
self.assertEqual(response.status_code, 302)
|
||||
|
||||
crawl = Crawl.objects.first()
|
||||
crawl = require(Crawl.objects.first())
|
||||
plugins = crawl.config.get('PLUGINS', '')
|
||||
|
||||
# Should contain the selected plugins
|
||||
@@ -209,7 +220,7 @@ class AddViewTests(TestCase):
|
||||
|
||||
self.assertEqual(response.status_code, 302)
|
||||
|
||||
crawl = Crawl.objects.first()
|
||||
crawl = require(Crawl.objects.first())
|
||||
config = crawl.config
|
||||
|
||||
self.assertEqual(config.get('DEFAULT_PERSONA'), 'CustomPersona')
|
||||
@@ -236,7 +247,7 @@ class AddViewTests(TestCase):
|
||||
})
|
||||
|
||||
self.assertEqual(response.status_code, 302)
|
||||
crawl = Crawl.objects.order_by('-created_at').first()
|
||||
crawl = require(Crawl.objects.order_by('-created_at').first())
|
||||
self.assertNotIn('YTDLP_ARGS_EXTRA', crawl.config)
|
||||
|
||||
def test_add_authenticated_non_admin_custom_config_is_silently_stripped(self):
|
||||
@@ -248,7 +259,7 @@ class AddViewTests(TestCase):
|
||||
})
|
||||
|
||||
self.assertEqual(response.status_code, 302)
|
||||
crawl = Crawl.objects.order_by('-created_at').first()
|
||||
crawl = require(Crawl.objects.order_by('-created_at').first())
|
||||
self.assertNotIn('YTDLP_ARGS_EXTRA', crawl.config)
|
||||
|
||||
def test_add_staff_admin_custom_config_is_allowed(self):
|
||||
@@ -269,7 +280,7 @@ class AddViewTests(TestCase):
|
||||
})
|
||||
|
||||
self.assertEqual(response.status_code, 302)
|
||||
crawl = Crawl.objects.order_by('-created_at').first()
|
||||
crawl = require(Crawl.objects.order_by('-created_at').first())
|
||||
self.assertEqual(crawl.config.get('YTDLP_ARGS_EXTRA'), ['--exec', 'echo hello'])
|
||||
|
||||
def test_add_empty_urls_fails(self):
|
||||
@@ -281,7 +292,7 @@ class AddViewTests(TestCase):
|
||||
|
||||
# Should show form again with errors, not redirect
|
||||
self.assertEqual(response.status_code, 200)
|
||||
self.assertFormError(response, 'form', 'url', 'This field is required.')
|
||||
self.assertFormError(cast(BaseForm, response.context['form']), 'url', 'This field is required.')
|
||||
|
||||
def test_add_invalid_urls_fails(self):
|
||||
"""Test that invalid URLs fail validation."""
|
||||
@@ -355,7 +366,7 @@ class AddViewTests(TestCase):
|
||||
|
||||
self.assertEqual(response.status_code, 302)
|
||||
|
||||
crawl = Crawl.objects.first()
|
||||
crawl = require(Crawl.objects.first())
|
||||
self.assertEqual(crawl.tags_str, 'tag1,tag2,tag3')
|
||||
|
||||
def test_crawl_redirects_to_admin_change_page(self):
|
||||
@@ -365,7 +376,7 @@ class AddViewTests(TestCase):
|
||||
'depth': '0',
|
||||
})
|
||||
|
||||
crawl = Crawl.objects.first()
|
||||
crawl = require(Crawl.objects.first())
|
||||
expected_redirect = f'/admin/crawls/crawl/{crawl.id}/change/'
|
||||
|
||||
self.assertRedirects(response, expected_redirect, fetch_redirect_response=False)
|
||||
|
||||
@@ -4,6 +4,7 @@ from django.urls import path, re_path, include
|
||||
from django.views import static
|
||||
from django.conf import settings
|
||||
from django.views.generic.base import RedirectView
|
||||
from django.http import HttpRequest
|
||||
|
||||
from archivebox.misc.serve_static import serve_static
|
||||
|
||||
@@ -53,7 +54,7 @@ urlpatterns = [
|
||||
path("api/", include('archivebox.api.urls'), name='api'),
|
||||
|
||||
path('health/', HealthCheckView.as_view(), name='healthcheck'),
|
||||
path('error/', lambda *_: 1/0), # type: ignore
|
||||
path('error/', lambda request: _raise_test_error(request)),
|
||||
|
||||
# path('jet_api/', include('jet_django.urls')), Enable to use https://www.jetadmin.io/integrations/django
|
||||
|
||||
@@ -61,6 +62,10 @@ urlpatterns = [
|
||||
path('', HomepageView.as_view(), name='Home'),
|
||||
]
|
||||
|
||||
|
||||
def _raise_test_error(_request: HttpRequest):
    """Always raise ``ZeroDivisionError``; backs the intentional ``error/`` route.

    Args:
        _request: The incoming request. It is accepted only so the function
            can be called like a view and is otherwise ignored.

    Raises:
        ZeroDivisionError: Unconditionally, with a message identifying the
            failure as deliberate.
    """
    raise ZeroDivisionError('Intentional test error route')
|
||||
|
||||
if settings.DEBUG_TOOLBAR:
|
||||
urlpatterns += [path('__debug__/', include("debug_toolbar.urls"))]
|
||||
|
||||
|
||||
@@ -5,13 +5,14 @@ import posixpath
|
||||
from glob import glob, escape
|
||||
from django.utils import timezone
|
||||
import inspect
|
||||
from typing import Callable, get_type_hints
|
||||
from typing import Callable, cast, get_type_hints
|
||||
from pathlib import Path
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from django.shortcuts import render, redirect
|
||||
from django.http import JsonResponse, HttpRequest, HttpResponse, Http404, HttpResponseForbidden
|
||||
from django.utils.html import format_html, mark_safe
|
||||
from django.utils.html import format_html
|
||||
from django.utils.safestring import mark_safe
|
||||
from django.views import View
|
||||
from django.views.generic.list import ListView
|
||||
from django.views.generic import FormView
|
||||
@@ -21,7 +22,7 @@ from django.contrib.auth.mixins import UserPassesTestMixin
|
||||
from django.views.decorators.csrf import csrf_exempt
|
||||
from django.utils.decorators import method_decorator
|
||||
|
||||
from admin_data_views.typing import TableContext, ItemContext
|
||||
from admin_data_views.typing import TableContext, ItemContext, SectionData
|
||||
from admin_data_views.utils import render_with_table_view, render_with_item_view, ItemLink
|
||||
|
||||
from archivebox.config import CONSTANTS, CONSTANTS_CONFIG, DATA_DIR, VERSION
|
||||
@@ -854,7 +855,7 @@ class AddView(UserPassesTestMixin, FormView):
|
||||
|
||||
def _can_override_crawl_config(self) -> bool:
|
||||
user = self.request.user
|
||||
return bool(user.is_authenticated and (user.is_superuser or user.is_staff))
|
||||
return bool(user.is_authenticated and (getattr(user, 'is_superuser', False) or getattr(user, 'is_staff', False)))
|
||||
|
||||
def _get_custom_config_overrides(self, form: AddLinkForm) -> dict:
|
||||
custom_config = form.cleaned_data.get("config") or {}
|
||||
@@ -906,7 +907,7 @@ class AddView(UserPassesTestMixin, FormView):
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
created_by_id = get_or_create_system_user_pk()
|
||||
|
||||
created_by_name = self.request.user.username if self.request.user.is_authenticated else 'web'
|
||||
created_by_name = getattr(self.request.user, 'username', 'web') if self.request.user.is_authenticated else 'web'
|
||||
|
||||
# 1. save the provided urls to sources/2024-11-05__23-59-59__web_ui_add_by_user_<user_pk>.txt
|
||||
sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__web_ui_add_by_user_{created_by_id}.txt'
|
||||
@@ -1015,8 +1016,8 @@ class WebAddView(AddView):
|
||||
|
||||
return super().dispatch(request, *args, **kwargs)
|
||||
|
||||
def get(self, request, url: str):
|
||||
requested_url = urldecode(url)
|
||||
def get(self, request: HttpRequest, *args: object, **kwargs: object):
|
||||
requested_url = urldecode(str(kwargs.get('url') or (args[0] if args else '')))
|
||||
if not requested_url:
|
||||
raise Http404
|
||||
|
||||
@@ -1025,6 +1026,7 @@ class WebAddView(AddView):
|
||||
return redirect(f'/{snapshot.url_path}')
|
||||
|
||||
add_url = self._normalize_add_url(requested_url)
|
||||
assert self.form_class is not None
|
||||
defaults_form = self.form_class()
|
||||
form_data = {
|
||||
'url': add_url,
|
||||
@@ -1045,6 +1047,7 @@ class WebAddView(AddView):
|
||||
|
||||
crawl = self._create_crawl_from_form(form)
|
||||
snapshot = Snapshot.from_json({'url': add_url, 'tags': form.cleaned_data.get('tag', '')}, overrides={'crawl': crawl})
|
||||
assert snapshot is not None
|
||||
return redirect(f'/{snapshot.url_path}')
|
||||
|
||||
|
||||
@@ -1385,7 +1388,7 @@ def find_config_type(key: str) -> str:
|
||||
# Try to get from pydantic model_fields first (more reliable)
|
||||
if hasattr(config, 'model_fields') and key in config.model_fields:
|
||||
field = config.model_fields[key]
|
||||
if hasattr(field, 'annotation'):
|
||||
if hasattr(field, 'annotation') and field.annotation is not None:
|
||||
try:
|
||||
return str(field.annotation.__name__)
|
||||
except AttributeError:
|
||||
@@ -1448,7 +1451,7 @@ def find_config_source(key: str, merged_config: dict) -> str:
|
||||
def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
|
||||
CONFIGS = get_all_configs()
|
||||
|
||||
assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
|
||||
assert getattr(request.user, 'is_superuser', False), 'Must be a superuser to view configuration settings.'
|
||||
|
||||
# Get merged config that includes Machine.config overrides
|
||||
try:
|
||||
@@ -1519,7 +1522,7 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
|
||||
CONFIGS = get_all_configs()
|
||||
FLAT_CONFIG = get_flat_config()
|
||||
|
||||
assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
|
||||
assert getattr(request.user, 'is_superuser', False), 'Must be a superuser to view configuration settings.'
|
||||
|
||||
# Get merged config
|
||||
merged_config = get_config()
|
||||
@@ -1575,62 +1578,62 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
|
||||
section_header = mark_safe(f'[DYNAMIC CONFIG] <b><code style="color: lightgray">{key}</code></b> <small>(read-only, calculated at runtime)</small>')
|
||||
|
||||
|
||||
section_data = cast(SectionData, {
|
||||
"name": section_header,
|
||||
"description": None,
|
||||
"fields": {
|
||||
'Key': key,
|
||||
'Type': find_config_type(key),
|
||||
'Value': final_value,
|
||||
'Source': find_config_source(key, merged_config),
|
||||
},
|
||||
"help_texts": {
|
||||
'Key': mark_safe(f'''
|
||||
<a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#{key.lower()}">Documentation</a>
|
||||
<span style="display: {"inline" if aliases else "none"}">
|
||||
Aliases: {", ".join(aliases)}
|
||||
</span>
|
||||
'''),
|
||||
'Type': mark_safe(f'''
|
||||
<a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig+{key}&type=code">
|
||||
See full definition in <code>archivebox/config</code>...
|
||||
</a>
|
||||
'''),
|
||||
'Value': mark_safe(f'''
|
||||
{'<b style="color: red">Value is redacted for your security. (Passwords, secrets, API tokens, etc. cannot be viewed in the Web UI)</b><br/><br/>' if not key_is_safe(key) else ''}
|
||||
<br/><hr/><br/>
|
||||
<b>Configuration Sources (in priority order):</b><br/><br/>
|
||||
{sources_html}
|
||||
<br/><br/>
|
||||
<p style="display: {"block" if key in FLAT_CONFIG and key not in CONSTANTS_CONFIG else "none"}">
|
||||
<i>To change this value, edit <code>data/ArchiveBox.conf</code> or run:</i>
|
||||
<br/><br/>
|
||||
<code>archivebox config --set {key}="{
|
||||
val.strip("'")
|
||||
if (val := find_config_default(key)) else
|
||||
(str(FLAT_CONFIG[key] if key_is_safe(key) else '********')).strip("'")
|
||||
}"</code>
|
||||
</p>
|
||||
'''),
|
||||
'Source': mark_safe(f'''
|
||||
The value shown in the "Value" field comes from the <b>{find_config_source(key, merged_config)}</b> source.
|
||||
<br/><br/>
|
||||
Priority order (highest to lowest):
|
||||
<ol>
|
||||
<li><b style="color: purple">Machine</b> - Machine-specific overrides (e.g., resolved binary paths)
|
||||
{f'<br/><a href="{machine_admin_url}">→ Edit <code>{key}</code> in Machine.config for this server</a>' if machine_admin_url else ''}
|
||||
</li>
|
||||
<li><b style="color: blue">Environment</b> - Environment variables</li>
|
||||
<li><b style="color: green">Config File</b> - data/ArchiveBox.conf</li>
|
||||
<li><b style="color: gray">Default</b> - Default value from code</li>
|
||||
</ol>
|
||||
{f'<br/><b>Tip:</b> To override <code>{key}</code> on this machine, <a href="{machine_admin_url}">edit the Machine.config field</a> and add:<br/><code>{{"\\"{key}\\": "your_value_here"}}</code>' if machine_admin_url and key not in CONSTANTS_CONFIG else ''}
|
||||
'''),
|
||||
},
|
||||
})
|
||||
|
||||
return ItemContext(
|
||||
slug=key,
|
||||
title=key,
|
||||
data=[
|
||||
{
|
||||
"name": section_header,
|
||||
"description": None,
|
||||
"fields": {
|
||||
'Key': key,
|
||||
'Type': find_config_type(key),
|
||||
'Value': final_value,
|
||||
'Source': find_config_source(key, merged_config),
|
||||
},
|
||||
"help_texts": {
|
||||
'Key': mark_safe(f'''
|
||||
<a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#{key.lower()}">Documentation</a>
|
||||
<span style="display: {"inline" if aliases else "none"}">
|
||||
Aliases: {", ".join(aliases)}
|
||||
</span>
|
||||
'''),
|
||||
'Type': mark_safe(f'''
|
||||
<a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig+{key}&type=code">
|
||||
See full definition in <code>archivebox/config</code>...
|
||||
</a>
|
||||
'''),
|
||||
'Value': mark_safe(f'''
|
||||
{'<b style="color: red">Value is redacted for your security. (Passwords, secrets, API tokens, etc. cannot be viewed in the Web UI)</b><br/><br/>' if not key_is_safe(key) else ''}
|
||||
<br/><hr/><br/>
|
||||
<b>Configuration Sources (in priority order):</b><br/><br/>
|
||||
{sources_html}
|
||||
<br/><br/>
|
||||
<p style="display: {"block" if key in FLAT_CONFIG and key not in CONSTANTS_CONFIG else "none"}">
|
||||
<i>To change this value, edit <code>data/ArchiveBox.conf</code> or run:</i>
|
||||
<br/><br/>
|
||||
<code>archivebox config --set {key}="{
|
||||
val.strip("'")
|
||||
if (val := find_config_default(key)) else
|
||||
(str(FLAT_CONFIG[key] if key_is_safe(key) else '********')).strip("'")
|
||||
}"</code>
|
||||
</p>
|
||||
'''),
|
||||
'Source': mark_safe(f'''
|
||||
The value shown in the "Value" field comes from the <b>{find_config_source(key, merged_config)}</b> source.
|
||||
<br/><br/>
|
||||
Priority order (highest to lowest):
|
||||
<ol>
|
||||
<li><b style="color: purple">Machine</b> - Machine-specific overrides (e.g., resolved binary paths)
|
||||
{f'<br/><a href="{machine_admin_url}">→ Edit <code>{key}</code> in Machine.config for this server</a>' if machine_admin_url else ''}
|
||||
</li>
|
||||
<li><b style="color: blue">Environment</b> - Environment variables</li>
|
||||
<li><b style="color: green">Config File</b> - data/ArchiveBox.conf</li>
|
||||
<li><b style="color: gray">Default</b> - Default value from code</li>
|
||||
</ol>
|
||||
{f'<br/><b>💡 Tip:</b> To override <code>{key}</code> on this machine, <a href="{machine_admin_url}">edit the Machine.config field</a> and add:<br/><code>{{"\\"{key}\\": "your_value_here"}}</code>' if machine_admin_url and key not in CONSTANTS_CONFIG else ''}
|
||||
'''),
|
||||
},
|
||||
},
|
||||
],
|
||||
data=[section_data],
|
||||
)
|
||||
|
||||
@@ -16,7 +16,7 @@ class TagEditorWidget(forms.Widget):
|
||||
- Press Enter or Space to create new tags (auto-creates if doesn't exist)
|
||||
- Uses AJAX for autocomplete and tag creation
|
||||
"""
|
||||
template_name = None # We render manually
|
||||
template_name = "" # We render manually
|
||||
|
||||
class Media:
|
||||
css = {'all': []}
|
||||
|
||||
@@ -2,7 +2,8 @@ __package__ = 'archivebox.crawls'
|
||||
|
||||
|
||||
from django import forms
|
||||
from django.utils.html import format_html, format_html_join, mark_safe
|
||||
from django.utils.html import format_html, format_html_join
|
||||
from django.utils.safestring import mark_safe
|
||||
from django.contrib import admin, messages
|
||||
from django.db.models import Count, Q
|
||||
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
__package__ = 'archivebox.crawls'
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
import uuid
|
||||
from datetime import timedelta
|
||||
from archivebox.uuid_compat import uuid7
|
||||
from pathlib import Path
|
||||
@@ -10,7 +11,6 @@ from django.core.validators import MaxValueValidator, MinValueValidator
|
||||
from django.conf import settings
|
||||
from django.urls import reverse_lazy
|
||||
from django.utils import timezone
|
||||
from django_stubs_ext.db.models import TypedModelMeta
|
||||
from statemachine import State, registry
|
||||
from rich import print
|
||||
|
||||
@@ -36,7 +36,7 @@ class CrawlSchedule(ModelWithUUID, ModelWithNotes):
|
||||
|
||||
crawl_set: models.Manager['Crawl']
|
||||
|
||||
class Meta(TypedModelMeta):
|
||||
class Meta(ModelWithUUID.Meta, ModelWithNotes.Meta):
|
||||
app_label = 'crawls'
|
||||
verbose_name = 'Scheduled Crawl'
|
||||
verbose_name_plural = 'Scheduled Crawls'
|
||||
@@ -47,7 +47,7 @@ class CrawlSchedule(ModelWithUUID, ModelWithNotes):
|
||||
|
||||
@property
|
||||
def api_url(self) -> str:
|
||||
return reverse_lazy('api-1:get_any', args=[self.id])
|
||||
return str(reverse_lazy('api-1:get_any', args=[self.id]))
|
||||
|
||||
def save(self, *args, **kwargs):
|
||||
self.schedule = (self.schedule or '').strip()
|
||||
@@ -119,9 +119,17 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
StatusChoices = ModelWithStateMachine.StatusChoices
|
||||
active_state = StatusChoices.STARTED
|
||||
|
||||
schedule_id: uuid.UUID | None
|
||||
sm: 'CrawlMachine'
|
||||
|
||||
snapshot_set: models.Manager['Snapshot']
|
||||
|
||||
class Meta(TypedModelMeta):
|
||||
class Meta(
|
||||
ModelWithOutputDir.Meta,
|
||||
ModelWithConfig.Meta,
|
||||
ModelWithHealthStats.Meta,
|
||||
ModelWithStateMachine.Meta,
|
||||
):
|
||||
app_label = 'crawls'
|
||||
verbose_name = 'Crawl'
|
||||
verbose_name_plural = 'Crawls'
|
||||
@@ -152,7 +160,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
|
||||
@property
|
||||
def api_url(self) -> str:
|
||||
return reverse_lazy('api-1:get_crawl', args=[self.id])
|
||||
return str(reverse_lazy('api-1:get_crawl', args=[self.id]))
|
||||
|
||||
def to_json(self) -> dict:
|
||||
"""
|
||||
@@ -172,7 +180,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def from_json(record: dict, overrides: dict = None):
|
||||
def from_json(record: dict, overrides: dict | None = None):
|
||||
"""
|
||||
Create or get a Crawl from a JSON dict.
|
||||
|
||||
@@ -746,6 +754,8 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
# =============================================================================
|
||||
|
||||
class CrawlMachine(BaseStateMachine):
|
||||
crawl: Crawl
|
||||
|
||||
"""
|
||||
State machine for managing Crawl lifecycle.
|
||||
|
||||
|
||||
@@ -1013,7 +1013,7 @@ def get_plugin_icon(plugin: str) -> str:
|
||||
|
||||
|
||||
|
||||
def process_hook_records(records: List[Dict[str, Any]], overrides: Dict[str, Any] = None) -> Dict[str, int]:
|
||||
def process_hook_records(records: List[Dict[str, Any]], overrides: Dict[str, Any] | None = None) -> Dict[str, int]:
|
||||
"""
|
||||
Process JSONL records from hook output.
|
||||
Dispatches to Model.from_json() for each record type.
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
__package__ = 'archivebox.ideas'
|
||||
|
||||
import asyncio
|
||||
import importlib
|
||||
import json
|
||||
import os
|
||||
import shlex
|
||||
@@ -13,12 +14,14 @@ from typing import Any, Callable, Mapping, MutableMapping, Optional
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
try:
|
||||
from bubus import BaseEvent, EventBus
|
||||
bubus = importlib.import_module("bubus")
|
||||
BaseEvent = bubus.BaseEvent
|
||||
EventBus = bubus.EventBus
|
||||
except Exception as exc: # pragma: no cover - optional dependency
|
||||
raise ImportError('ProcessPlugin requires bubus to be installed') from exc
|
||||
|
||||
try:
|
||||
from bubus.service import uuid7str
|
||||
uuid7str = importlib.import_module("bubus.service").uuid7str
|
||||
except Exception: # pragma: no cover - optional dependency
|
||||
from uuid import uuid4 as _uuid4
|
||||
|
||||
|
||||
@@ -6,18 +6,15 @@ This module extends django-auth-ldap to support the LDAP_CREATE_SUPERUSER flag.
|
||||
|
||||
__package__ = "archivebox.ldap"
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
import importlib
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from django_auth_ldap.backend import LDAPBackend as BaseLDAPBackend
|
||||
else:
|
||||
try:
|
||||
from django_auth_ldap.backend import LDAPBackend as BaseLDAPBackend
|
||||
except ImportError:
|
||||
# If django-auth-ldap is not installed, create a dummy base class
|
||||
class BaseLDAPBackend:
|
||||
"""Dummy LDAP backend when django-auth-ldap is not installed."""
|
||||
pass
|
||||
try:
|
||||
BaseLDAPBackend = importlib.import_module("django_auth_ldap.backend").LDAPBackend
|
||||
except ImportError:
|
||||
class BaseLDAPBackend:
|
||||
"""Dummy LDAP backend when django-auth-ldap is not installed."""
|
||||
|
||||
pass
|
||||
|
||||
|
||||
class ArchiveBoxLDAPBackend(BaseLDAPBackend):
|
||||
@@ -36,7 +33,11 @@ class ArchiveBoxLDAPBackend(BaseLDAPBackend):
|
||||
"""
|
||||
from archivebox.config.ldap import LDAP_CONFIG
|
||||
|
||||
user = super().authenticate_ldap_user(ldap_user, password)
|
||||
base_authenticate = getattr(super(), "authenticate_ldap_user", None)
|
||||
if base_authenticate is None:
|
||||
return None
|
||||
|
||||
user = base_authenticate(ldap_user, password)
|
||||
|
||||
if user and LDAP_CONFIG.LDAP_CREATE_SUPERUSER:
|
||||
# Grant superuser privileges to all LDAP-authenticated users
|
||||
|
||||
@@ -1,11 +1,15 @@
|
||||
from __future__ import annotations
|
||||
|
||||
__package__ = 'archivebox.machine'
|
||||
|
||||
import os
|
||||
import sys
|
||||
import uuid
|
||||
import socket
|
||||
from pathlib import Path
|
||||
from archivebox.uuid_compat import uuid7
|
||||
from datetime import timedelta, datetime
|
||||
from typing import TYPE_CHECKING, Any, cast
|
||||
|
||||
from statemachine import State, registry
|
||||
|
||||
@@ -13,21 +17,31 @@ from django.db import models
|
||||
from django.db.models import QuerySet
|
||||
from django.utils import timezone
|
||||
from django.utils.functional import cached_property
|
||||
from django_stubs_ext.db.models import TypedModelMeta
|
||||
|
||||
from archivebox.base_models.models import ModelWithHealthStats
|
||||
from archivebox.workers.models import BaseStateMachine, ModelWithStateMachine
|
||||
from .detect import get_host_guid, get_os_info, get_vm_info, get_host_network, get_host_stats
|
||||
|
||||
_psutil: Any | None = None
|
||||
try:
|
||||
import psutil
|
||||
import psutil as _psutil_import
|
||||
PSUTIL_AVAILABLE = True
|
||||
except ImportError:
|
||||
PSUTIL_AVAILABLE = False
|
||||
else:
|
||||
_psutil = _psutil_import
|
||||
|
||||
_CURRENT_MACHINE = None
|
||||
_CURRENT_INTERFACE = None
|
||||
_CURRENT_BINARIES = {}
|
||||
_CURRENT_PROCESS = None
|
||||
if TYPE_CHECKING:
|
||||
import psutil
|
||||
from archivebox.core.models import ArchiveResult
|
||||
else:
|
||||
psutil = cast(Any, _psutil)
|
||||
|
||||
_CURRENT_MACHINE: Machine | None = None
|
||||
_CURRENT_INTERFACE: NetworkInterface | None = None
|
||||
_CURRENT_BINARIES: dict[str, Binary] = {}
|
||||
_CURRENT_PROCESS: Process | None = None
|
||||
|
||||
MACHINE_RECHECK_INTERVAL = 7 * 24 * 60 * 60
|
||||
NETWORK_INTERFACE_RECHECK_INTERVAL = 1 * 60 * 60
|
||||
@@ -64,10 +78,10 @@ class Machine(ModelWithHealthStats):
|
||||
num_uses_failed = models.PositiveIntegerField(default=0)
|
||||
num_uses_succeeded = models.PositiveIntegerField(default=0)
|
||||
|
||||
objects: MachineManager = MachineManager()
|
||||
objects = MachineManager() # pyright: ignore[reportIncompatibleVariableOverride]
|
||||
networkinterface_set: models.Manager['NetworkInterface']
|
||||
|
||||
class Meta:
|
||||
class Meta(ModelWithHealthStats.Meta):
|
||||
app_label = 'machine'
|
||||
|
||||
@classmethod
|
||||
@@ -127,7 +141,7 @@ class Machine(ModelWithHealthStats):
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def from_json(record: dict, overrides: dict = None):
|
||||
def from_json(record: dict[str, Any], overrides: dict[str, Any] | None = None):
|
||||
"""
|
||||
Update Machine config from JSON dict.
|
||||
|
||||
@@ -172,9 +186,10 @@ class NetworkInterface(ModelWithHealthStats):
|
||||
# num_uses_failed = models.PositiveIntegerField(default=0) # from ModelWithHealthStats
|
||||
# num_uses_succeeded = models.PositiveIntegerField(default=0) # from ModelWithHealthStats
|
||||
|
||||
objects: NetworkInterfaceManager = NetworkInterfaceManager()
|
||||
objects = NetworkInterfaceManager() # pyright: ignore[reportIncompatibleVariableOverride]
|
||||
machine_id: uuid.UUID
|
||||
|
||||
class Meta:
|
||||
class Meta(ModelWithHealthStats.Meta):
|
||||
app_label = 'machine'
|
||||
unique_together = (('machine', 'ip_public', 'ip_local', 'mac_address', 'dns_server'),)
|
||||
|
||||
@@ -185,7 +200,7 @@ class NetworkInterface(ModelWithHealthStats):
|
||||
if timezone.now() < _CURRENT_INTERFACE.modified_at + timedelta(seconds=NETWORK_INTERFACE_RECHECK_INTERVAL):
|
||||
return _CURRENT_INTERFACE
|
||||
_CURRENT_INTERFACE = None
|
||||
machine = Machine.objects.current()
|
||||
machine = Machine.current()
|
||||
net_info = get_host_network()
|
||||
_CURRENT_INTERFACE, _ = cls.objects.update_or_create(
|
||||
machine=machine, ip_public=net_info.pop('ip_public'), ip_local=net_info.pop('ip_local'),
|
||||
@@ -202,7 +217,7 @@ class BinaryManager(models.Manager):
|
||||
if cached and timezone.now() < cached.modified_at + timedelta(seconds=BINARY_RECHECK_INTERVAL):
|
||||
return cached
|
||||
_CURRENT_BINARIES[name], _ = self.update_or_create(
|
||||
machine=Machine.objects.current(), name=name, binprovider=binprovider,
|
||||
machine=Machine.current(), name=name, binprovider=binprovider,
|
||||
version=version, abspath=abspath, sha256=sha256,
|
||||
)
|
||||
return _CURRENT_BINARIES[name]
|
||||
@@ -263,12 +278,14 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine):
|
||||
num_uses_failed = models.PositiveIntegerField(default=0)
|
||||
num_uses_succeeded = models.PositiveIntegerField(default=0)
|
||||
|
||||
state_machine_name: str = 'archivebox.machine.models.BinaryMachine'
|
||||
machine_id: uuid.UUID
|
||||
|
||||
state_machine_name: str | None = 'archivebox.machine.models.BinaryMachine'
|
||||
active_state: str = StatusChoices.QUEUED
|
||||
|
||||
objects: BinaryManager = BinaryManager()
|
||||
objects = BinaryManager() # pyright: ignore[reportIncompatibleVariableOverride]
|
||||
|
||||
class Meta:
|
||||
class Meta(ModelWithHealthStats.Meta, ModelWithStateMachine.Meta):
|
||||
app_label = 'machine'
|
||||
verbose_name = 'Binary'
|
||||
verbose_name_plural = 'Binaries'
|
||||
@@ -321,7 +338,7 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine):
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def from_json(record: dict, overrides: dict = None):
|
||||
def from_json(record: dict[str, Any], overrides: dict[str, Any] | None = None):
|
||||
"""
|
||||
Create/update Binary from JSON dict.
|
||||
|
||||
@@ -418,7 +435,7 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine):
|
||||
|
||||
return None
|
||||
|
||||
def update_and_requeue(self, **kwargs):
|
||||
def update_and_requeue(self, **kwargs) -> bool:
|
||||
"""
|
||||
Update binary fields and requeue for worker state machine.
|
||||
|
||||
@@ -429,6 +446,7 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine):
|
||||
setattr(self, key, value)
|
||||
self.modified_at = timezone.now()
|
||||
self.save()
|
||||
return True
|
||||
|
||||
def _allowed_binproviders(self) -> set[str] | None:
|
||||
"""Return the allowed binproviders for this binary, or None for wildcard."""
|
||||
@@ -513,21 +531,14 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine):
|
||||
plugin_output_dir = output_dir / plugin_name
|
||||
plugin_output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Build kwargs for hook
|
||||
hook_kwargs = {
|
||||
'binary_id': str(self.id),
|
||||
'machine_id': str(self.machine_id),
|
||||
'name': self.name,
|
||||
'binproviders': self.binproviders,
|
||||
}
|
||||
|
||||
custom_cmd = None
|
||||
overrides_json = None
|
||||
if plugin_name == 'custom':
|
||||
custom_cmd = self._get_custom_install_command()
|
||||
if not custom_cmd:
|
||||
continue
|
||||
hook_kwargs['custom_cmd'] = custom_cmd
|
||||
elif self.overrides:
|
||||
hook_kwargs['overrides'] = json.dumps(self.overrides)
|
||||
overrides_json = json.dumps(self.overrides)
|
||||
|
||||
# Run the hook
|
||||
process = run_hook(
|
||||
@@ -535,7 +546,12 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine):
|
||||
output_dir=plugin_output_dir,
|
||||
config=config,
|
||||
timeout=600, # 10 min timeout for binary installation
|
||||
**hook_kwargs
|
||||
binary_id=str(self.id),
|
||||
machine_id=str(self.machine_id),
|
||||
name=self.name,
|
||||
binproviders=self.binproviders,
|
||||
custom_cmd=custom_cmd,
|
||||
overrides=overrides_json,
|
||||
)
|
||||
|
||||
# Background hook (unlikely for binary installation, but handle it)
|
||||
@@ -679,7 +695,7 @@ class ProcessManager(models.Manager):
|
||||
"""Get the Process record for the current OS process."""
|
||||
return Process.current()
|
||||
|
||||
def get_by_pid(self, pid: int, machine: 'Machine' = None) -> 'Process | None':
|
||||
def get_by_pid(self, pid: int, machine: 'Machine | None' = None) -> 'Process | None':
|
||||
"""
|
||||
Find a Process by PID with proper validation against PID reuse.
|
||||
|
||||
@@ -880,11 +896,17 @@ class Process(models.Model):
|
||||
help_text='When to retry this process'
|
||||
)
|
||||
|
||||
machine_id: uuid.UUID
|
||||
parent_id: uuid.UUID | None
|
||||
binary_id: uuid.UUID | None
|
||||
children: models.Manager['Process']
|
||||
archiveresult: 'ArchiveResult'
|
||||
|
||||
state_machine_name: str = 'archivebox.machine.models.ProcessMachine'
|
||||
|
||||
objects: ProcessManager = ProcessManager()
|
||||
objects = ProcessManager() # pyright: ignore[reportIncompatibleVariableOverride]
|
||||
|
||||
class Meta:
|
||||
class Meta(TypedModelMeta):
|
||||
app_label = 'machine'
|
||||
verbose_name = 'Process'
|
||||
verbose_name_plural = 'Processes'
|
||||
@@ -971,7 +993,7 @@ class Process(models.Model):
|
||||
return self.parse_records_from_text(stdout or '')
|
||||
|
||||
@staticmethod
|
||||
def from_json(record: dict, overrides: dict = None):
|
||||
def from_json(record: dict[str, Any], overrides: dict[str, Any] | None = None):
|
||||
"""
|
||||
Create/update Process from JSON dict.
|
||||
|
||||
@@ -990,7 +1012,7 @@ class Process(models.Model):
|
||||
pass
|
||||
return None
|
||||
|
||||
def update_and_requeue(self, **kwargs):
|
||||
def update_and_requeue(self, **kwargs) -> bool:
|
||||
"""
|
||||
Update process fields and requeue for worker state machine.
|
||||
Sets modified_at to ensure workers pick up changes.
|
||||
@@ -999,6 +1021,7 @@ class Process(models.Model):
|
||||
setattr(self, key, value)
|
||||
self.modified_at = timezone.now()
|
||||
self.save()
|
||||
return True
|
||||
|
||||
# =========================================================================
|
||||
# Process.current() and hierarchy methods
|
||||
@@ -1094,7 +1117,7 @@ class Process(models.Model):
|
||||
return _CURRENT_PROCESS
|
||||
|
||||
@classmethod
|
||||
def _find_parent_process(cls, machine: 'Machine' = None) -> 'Process | None':
|
||||
def _find_parent_process(cls, machine: 'Machine | None' = None) -> 'Process | None':
|
||||
"""
|
||||
Find the parent Process record by looking up PPID.
|
||||
|
||||
@@ -1163,7 +1186,7 @@ class Process(models.Model):
|
||||
return cls.TypeChoices.BINARY
|
||||
|
||||
@classmethod
|
||||
def cleanup_stale_running(cls, machine: 'Machine' = None) -> int:
|
||||
def cleanup_stale_running(cls, machine: 'Machine | None' = None) -> int:
|
||||
"""
|
||||
Mark stale RUNNING processes as EXITED.
|
||||
|
||||
@@ -1374,25 +1397,25 @@ class Process(models.Model):
|
||||
# =========================================================================
|
||||
|
||||
@property
|
||||
def pid_file(self) -> Path:
|
||||
def pid_file(self) -> Path | None:
|
||||
"""Path to PID file for this process."""
|
||||
runtime_dir = self.runtime_dir
|
||||
return runtime_dir / 'process.pid' if runtime_dir else None
|
||||
|
||||
@property
|
||||
def cmd_file(self) -> Path:
|
||||
def cmd_file(self) -> Path | None:
|
||||
"""Path to cmd.sh script for this process."""
|
||||
runtime_dir = self.runtime_dir
|
||||
return runtime_dir / 'cmd.sh' if runtime_dir else None
|
||||
|
||||
@property
|
||||
def stdout_file(self) -> Path:
|
||||
def stdout_file(self) -> Path | None:
|
||||
"""Path to stdout log."""
|
||||
runtime_dir = self.runtime_dir
|
||||
return runtime_dir / 'stdout.log' if runtime_dir else None
|
||||
|
||||
@property
|
||||
def stderr_file(self) -> Path:
|
||||
def stderr_file(self) -> Path | None:
|
||||
"""Path to stderr log."""
|
||||
runtime_dir = self.runtime_dir
|
||||
return runtime_dir / 'stderr.log' if runtime_dir else None
|
||||
@@ -1647,6 +1670,8 @@ class Process(models.Model):
|
||||
stdout_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
if stderr_path:
|
||||
stderr_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
if stdout_path is None or stderr_path is None:
|
||||
raise RuntimeError('Process log paths could not be determined')
|
||||
|
||||
with open(stdout_path, 'a') as out, open(stderr_path, 'a') as err:
|
||||
proc = subprocess.Popen(
|
||||
@@ -2006,7 +2031,7 @@ class Process(models.Model):
|
||||
# =========================================================================
|
||||
|
||||
@classmethod
|
||||
def get_running(cls, process_type: str = None, machine: 'Machine' = None) -> 'QuerySet[Process]':
|
||||
def get_running(cls, process_type: str | None = None, machine: 'Machine | None' = None) -> 'QuerySet[Process]':
|
||||
"""
|
||||
Get all running processes, optionally filtered by type.
|
||||
|
||||
@@ -2031,7 +2056,7 @@ class Process(models.Model):
|
||||
return qs
|
||||
|
||||
@classmethod
|
||||
def get_running_count(cls, process_type: str = None, machine: 'Machine' = None) -> int:
|
||||
def get_running_count(cls, process_type: str | None = None, machine: 'Machine | None' = None) -> int:
|
||||
"""
|
||||
Get count of running processes.
|
||||
|
||||
@@ -2041,7 +2066,7 @@ class Process(models.Model):
|
||||
return cls.get_running(process_type=process_type, machine=machine).count()
|
||||
|
||||
@classmethod
|
||||
def stop_all(cls, process_type: str = None, machine: 'Machine' = None, graceful: bool = True) -> int:
|
||||
def stop_all(cls, process_type: str | None = None, machine: 'Machine | None' = None, graceful: bool = True) -> int:
|
||||
"""
|
||||
Stop all running processes of a given type.
|
||||
|
||||
@@ -2064,7 +2089,7 @@ class Process(models.Model):
|
||||
return stopped
|
||||
|
||||
@classmethod
|
||||
def get_next_worker_id(cls, process_type: str = 'worker', machine: 'Machine' = None) -> int:
|
||||
def get_next_worker_id(cls, process_type: str = 'worker', machine: 'Machine | None' = None) -> int:
|
||||
"""
|
||||
Get the next available worker ID for spawning new workers.
|
||||
|
||||
@@ -2190,6 +2215,7 @@ class BinaryMachine(BaseStateMachine):
|
||||
"""
|
||||
|
||||
model_attr_name = 'binary'
|
||||
binary: Binary
|
||||
|
||||
# States
|
||||
queued = State(value=Binary.StatusChoices.QUEUED, initial=True)
|
||||
@@ -2293,6 +2319,7 @@ class ProcessMachine(BaseStateMachine):
|
||||
"""
|
||||
|
||||
model_attr_name = 'process'
|
||||
process: Process
|
||||
|
||||
# States
|
||||
queued = State(value=Process.StatusChoices.QUEUED, initial=True)
|
||||
|
||||
@@ -13,6 +13,7 @@ Tests cover:
|
||||
|
||||
import os
|
||||
from datetime import timedelta
|
||||
from typing import cast
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
@@ -20,6 +21,7 @@ from django.test import TestCase
|
||||
from django.utils import timezone
|
||||
|
||||
from archivebox.machine.models import (
|
||||
BinaryManager,
|
||||
Machine,
|
||||
NetworkInterface,
|
||||
Binary,
|
||||
@@ -94,7 +96,7 @@ class TestMachineModel(TestCase):
|
||||
|
||||
def test_machine_manager_current(self):
|
||||
"""Machine.objects.current() should return current machine."""
|
||||
machine = Machine.objects.current()
|
||||
machine = Machine.current()
|
||||
self.assertIsNotNone(machine)
|
||||
self.assertEqual(machine.id, Machine.current().id)
|
||||
|
||||
@@ -126,7 +128,7 @@ class TestNetworkInterfaceModel(TestCase):
|
||||
|
||||
def test_networkinterface_manager_current(self):
|
||||
"""NetworkInterface.objects.current() should return current interface."""
|
||||
interface = NetworkInterface.objects.current()
|
||||
interface = NetworkInterface.current()
|
||||
self.assertIsNotNone(interface)
|
||||
|
||||
|
||||
@@ -177,7 +179,7 @@ class TestBinaryModel(TestCase):
|
||||
version='1.21',
|
||||
)
|
||||
|
||||
result = Binary.objects.get_valid_binary('wget')
|
||||
result = cast(BinaryManager, Binary.objects).get_valid_binary('wget')
|
||||
|
||||
self.assertIsNotNone(result)
|
||||
assert result is not None
|
||||
|
||||
@@ -79,8 +79,8 @@ def hint(text: Union[Tuple[str, ...], List[str], str], prefix=' ', config: Op
|
||||
ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI
|
||||
|
||||
if isinstance(text, str):
|
||||
stderr('{}{lightred}Hint:{reset} {}'.format(prefix, text, **ansi))
|
||||
stderr(f"{prefix}{ansi['lightred']}Hint:{ansi['reset']} {text}")
|
||||
else:
|
||||
stderr('{}{lightred}Hint:{reset} {}'.format(prefix, text[0], **ansi))
|
||||
stderr(f"{prefix}{ansi['lightred']}Hint:{ansi['reset']} {text[0]}")
|
||||
for line in text[1:]:
|
||||
stderr('{} {}'.format(prefix, line))
|
||||
stderr(f'{prefix} {line}')
|
||||
|
||||
@@ -5,6 +5,8 @@ import os
|
||||
import stat
|
||||
import posixpath
|
||||
import mimetypes
|
||||
import importlib
|
||||
from collections.abc import Callable
|
||||
from pathlib import Path
|
||||
|
||||
from django.contrib.staticfiles import finders
|
||||
@@ -69,9 +71,9 @@ mimetypes.add_type("application/xml", ".xml")
|
||||
mimetypes.add_type("image/svg+xml", ".svg")
|
||||
|
||||
try:
|
||||
import markdown as _markdown
|
||||
except Exception:
|
||||
_markdown = None
|
||||
_markdown = getattr(importlib.import_module('markdown'), 'markdown')
|
||||
except ImportError:
|
||||
_markdown: Callable[..., str] | None = None
|
||||
|
||||
MARKDOWN_INLINE_LINK_RE = re.compile(r'\[([^\]]+)\]\(([^)\s]+(?:\([^)]*\)[^)\s]*)*)\)')
|
||||
MARKDOWN_INLINE_IMAGE_RE = re.compile(r'!\[([^\]]*)\]\(([^)]+)\)')
|
||||
@@ -108,7 +110,7 @@ def _looks_like_markdown(text: str) -> bool:
|
||||
def _render_markdown_fallback(text: str) -> str:
|
||||
if _markdown is not None and not HTML_TAG_RE.search(text):
|
||||
try:
|
||||
return _markdown.markdown(
|
||||
return _markdown(
|
||||
text,
|
||||
extensions=["extra", "toc", "sane_lists"],
|
||||
output_format="html",
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
from typing import Any, List, Callable
|
||||
from typing import Any, List, Callable, cast
|
||||
|
||||
import json
|
||||
import ast
|
||||
@@ -94,7 +94,8 @@ class JSONSchemaWithLambdas(GenerateJsonSchema):
|
||||
|
||||
def better_toml_dump_str(val: Any) -> str:
|
||||
try:
|
||||
return toml.encoder._dump_str(val) # type: ignore
|
||||
dump_str = cast(Callable[[Any], str], getattr(toml.encoder, '_dump_str'))
|
||||
return dump_str(val)
|
||||
except Exception:
|
||||
# if we hit any of toml's numerous encoding bugs,
|
||||
# fall back to using json representation of string
|
||||
@@ -108,7 +109,8 @@ class CustomTOMLEncoder(toml.encoder.TomlEncoder):
|
||||
"""
|
||||
def __init__(self, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
self.dump_funcs[Path] = lambda x: json.dumps(str(x))
|
||||
self.dump_funcs[PosixPath] = lambda x: json.dumps(str(x))
|
||||
self.dump_funcs[str] = better_toml_dump_str
|
||||
self.dump_funcs[re.RegexFlag] = better_toml_dump_str
|
||||
dump_funcs = cast(dict[Any, Callable[[Any], str]], self.dump_funcs)
|
||||
dump_funcs[Path] = lambda x: json.dumps(str(x))
|
||||
dump_funcs[PosixPath] = lambda x: json.dumps(str(x))
|
||||
dump_funcs[str] = better_toml_dump_str
|
||||
dump_funcs[re.RegexFlag] = better_toml_dump_str
|
||||
|
||||
@@ -16,7 +16,7 @@ from datetime import datetime, timezone
|
||||
from dateparser import parse as dateparser
|
||||
from requests.exceptions import RequestException, ReadTimeout
|
||||
|
||||
from base32_crockford import encode as base32_encode # type: ignore
|
||||
from base32_crockford import encode as base32_encode
|
||||
from w3lib.encoding import html_body_declared_encoding, http_content_type_encoding
|
||||
try:
|
||||
import chardet # type:ignore
|
||||
@@ -200,7 +200,7 @@ def parse_date(date: Any) -> datetime | None:
|
||||
"""Parse unix timestamps, iso format, and human-readable strings"""
|
||||
|
||||
if date is None:
|
||||
return None # type: ignore
|
||||
return None
|
||||
|
||||
if isinstance(date, datetime):
|
||||
if date.tzinfo is None:
|
||||
|
||||
@@ -16,7 +16,7 @@ import subprocess
|
||||
import sys
|
||||
from contextlib import contextmanager
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
from django.db import models
|
||||
from django.conf import settings
|
||||
@@ -25,13 +25,18 @@ from django.utils import timezone
|
||||
from archivebox.base_models.models import ModelWithConfig, get_or_create_system_user_pk
|
||||
from archivebox.uuid_compat import uuid7
|
||||
|
||||
_fcntl: Any | None = None
|
||||
try:
|
||||
import fcntl
|
||||
import fcntl as _fcntl_import
|
||||
except ImportError: # pragma: no cover
|
||||
fcntl = None
|
||||
pass
|
||||
else:
|
||||
_fcntl = _fcntl_import
|
||||
|
||||
if TYPE_CHECKING:
|
||||
pass
|
||||
import fcntl
|
||||
else:
|
||||
fcntl = _fcntl
|
||||
|
||||
|
||||
VOLATILE_PROFILE_DIR_NAMES = {
|
||||
@@ -79,7 +84,7 @@ class Persona(ModelWithConfig):
|
||||
created_at = models.DateTimeField(default=timezone.now, db_index=True)
|
||||
created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk)
|
||||
|
||||
class Meta:
|
||||
class Meta(ModelWithConfig.Meta):
|
||||
app_label = 'personas'
|
||||
|
||||
def __str__(self) -> str:
|
||||
|
||||
@@ -8,6 +8,7 @@ from django.db import models
|
||||
from django.core import checks
|
||||
from django.utils import timezone
|
||||
from django.utils.functional import classproperty
|
||||
from django_stubs_ext.db.models import TypedModelMeta
|
||||
|
||||
from statemachine import registry, StateMachine, State
|
||||
|
||||
@@ -31,7 +32,7 @@ class BaseModelWithStateMachine(models.Model, MachineMixin):
|
||||
# status: models.CharField
|
||||
# retry_at: models.DateTimeField
|
||||
|
||||
state_machine_name: str | None
|
||||
state_machine_name: str | None = None
|
||||
state_field_name: str
|
||||
state_machine_attr: str = 'sm'
|
||||
bind_events_as_methods: bool = True
|
||||
@@ -39,7 +40,7 @@ class BaseModelWithStateMachine(models.Model, MachineMixin):
|
||||
active_state: ObjectState
|
||||
retry_at_field_name: str
|
||||
|
||||
class Meta:
|
||||
class Meta(TypedModelMeta):
|
||||
app_label = 'workers'
|
||||
abstract = True
|
||||
|
||||
@@ -92,7 +93,7 @@ class BaseModelWithStateMachine(models.Model, MachineMixin):
|
||||
if not found_id_field:
|
||||
errors.append(checks.Error(
|
||||
f'{cls.__name__} must have an id field that is a primary key',
|
||||
hint=f'{cls.__name__}.id = {cls.id!r}',
|
||||
hint=f'{cls.__name__}.id field missing or not configured as primary key',
|
||||
obj=cls,
|
||||
id='workers.E014',
|
||||
))
|
||||
|
||||
@@ -11,14 +11,26 @@ Tests cover:
|
||||
|
||||
import os
|
||||
import time
|
||||
from datetime import timedelta
|
||||
from unittest.mock import patch, MagicMock
|
||||
from datetime import datetime, timedelta
|
||||
from unittest.mock import patch
|
||||
from typing import ClassVar
|
||||
|
||||
import pytest
|
||||
from django.test import TestCase
|
||||
from django.utils import timezone
|
||||
|
||||
from archivebox.workers.orchestrator import Orchestrator
|
||||
from archivebox.workers.worker import Worker
|
||||
|
||||
|
||||
class FakeWorker(Worker):
|
||||
name: ClassVar[str] = 'crawl'
|
||||
MAX_CONCURRENT_TASKS: ClassVar[int] = 5
|
||||
running_workers: ClassVar[list[dict[str, object]]] = []
|
||||
|
||||
@classmethod
|
||||
def get_running_workers(cls) -> list[dict[str, object]]:
|
||||
return cls.running_workers
|
||||
|
||||
|
||||
class TestOrchestratorUnit(TestCase):
|
||||
@@ -99,31 +111,25 @@ class TestOrchestratorUnit(TestCase):
|
||||
"""should_spawn_worker should return False when queue is empty."""
|
||||
orchestrator = Orchestrator()
|
||||
|
||||
# Create a mock worker class
|
||||
mock_worker = MagicMock()
|
||||
mock_worker.get_running_workers.return_value = []
|
||||
|
||||
self.assertFalse(orchestrator.should_spawn_worker(mock_worker, 0))
|
||||
FakeWorker.running_workers = []
|
||||
self.assertFalse(orchestrator.should_spawn_worker(FakeWorker, 0))
|
||||
|
||||
def test_should_spawn_worker_at_limit(self):
|
||||
"""should_spawn_worker should return False when at per-type limit."""
|
||||
orchestrator = Orchestrator()
|
||||
|
||||
mock_worker = MagicMock()
|
||||
mock_worker.get_running_workers.return_value = [{}] * orchestrator.MAX_WORKERS_PER_TYPE
|
||||
|
||||
self.assertFalse(orchestrator.should_spawn_worker(mock_worker, 10))
|
||||
running_workers: list[dict[str, object]] = [{'worker_id': worker_id} for worker_id in range(orchestrator.MAX_CRAWL_WORKERS)]
|
||||
FakeWorker.running_workers = running_workers
|
||||
self.assertFalse(orchestrator.should_spawn_worker(FakeWorker, 10))
|
||||
|
||||
@patch.object(Orchestrator, 'get_total_worker_count')
|
||||
def test_should_spawn_worker_at_total_limit(self, mock_total):
|
||||
"""should_spawn_worker should return False when at total limit."""
|
||||
orchestrator = Orchestrator()
|
||||
mock_total.return_value = orchestrator.MAX_TOTAL_WORKERS
|
||||
|
||||
mock_worker = MagicMock()
|
||||
mock_worker.get_running_workers.return_value = []
|
||||
|
||||
self.assertFalse(orchestrator.should_spawn_worker(mock_worker, 10))
|
||||
mock_total.return_value = 0
|
||||
running_workers: list[dict[str, object]] = [{'worker_id': worker_id} for worker_id in range(orchestrator.MAX_CRAWL_WORKERS)]
|
||||
FakeWorker.running_workers = running_workers
|
||||
self.assertFalse(orchestrator.should_spawn_worker(FakeWorker, 10))
|
||||
|
||||
@patch.object(Orchestrator, 'get_total_worker_count')
|
||||
def test_should_spawn_worker_success(self, mock_total):
|
||||
@@ -131,11 +137,8 @@ class TestOrchestratorUnit(TestCase):
|
||||
orchestrator = Orchestrator()
|
||||
mock_total.return_value = 0
|
||||
|
||||
mock_worker = MagicMock()
|
||||
mock_worker.get_running_workers.return_value = []
|
||||
mock_worker.MAX_CONCURRENT_TASKS = 5
|
||||
|
||||
self.assertTrue(orchestrator.should_spawn_worker(mock_worker, 10))
|
||||
FakeWorker.running_workers = []
|
||||
self.assertTrue(orchestrator.should_spawn_worker(FakeWorker, 10))
|
||||
|
||||
@patch.object(Orchestrator, 'get_total_worker_count')
|
||||
def test_should_spawn_worker_enough_workers(self, mock_total):
|
||||
@@ -143,12 +146,8 @@ class TestOrchestratorUnit(TestCase):
|
||||
orchestrator = Orchestrator()
|
||||
mock_total.return_value = 2
|
||||
|
||||
mock_worker = MagicMock()
|
||||
mock_worker.get_running_workers.return_value = [{}] # 1 worker running
|
||||
mock_worker.MAX_CONCURRENT_TASKS = 5 # Can handle 5 items
|
||||
|
||||
# Queue size (3) <= running_workers (1) * MAX_CONCURRENT_TASKS (5)
|
||||
self.assertFalse(orchestrator.should_spawn_worker(mock_worker, 3))
|
||||
FakeWorker.running_workers = [{}] # 1 worker running
|
||||
self.assertFalse(orchestrator.should_spawn_worker(FakeWorker, 3))
|
||||
|
||||
|
||||
class TestOrchestratorWithProcess(TestCase):
|
||||
@@ -178,8 +177,10 @@ class TestOrchestratorWithProcess(TestCase):
|
||||
def test_is_running_with_orchestrator_process(self):
|
||||
"""is_running should return True when orchestrator Process exists."""
|
||||
from archivebox.machine.models import Process, Machine
|
||||
import psutil
|
||||
|
||||
machine = Machine.current()
|
||||
current_proc = psutil.Process(os.getpid())
|
||||
|
||||
# Create an orchestrator Process record
|
||||
proc = Process.objects.create(
|
||||
@@ -187,8 +188,8 @@ class TestOrchestratorWithProcess(TestCase):
|
||||
process_type=Process.TypeChoices.ORCHESTRATOR,
|
||||
status=Process.StatusChoices.RUNNING,
|
||||
pid=os.getpid(), # Use current PID so it appears alive
|
||||
started_at=timezone.now(),
|
||||
cmd=['archivebox', 'manage', 'orchestrator'],
|
||||
started_at=datetime.fromtimestamp(current_proc.create_time(), tz=timezone.get_current_timezone()),
|
||||
cmd=current_proc.cmdline(),
|
||||
)
|
||||
|
||||
try:
|
||||
@@ -393,14 +394,7 @@ class TestProcessLifecycle(TestCase):
|
||||
def test_process_is_running_property(self):
|
||||
"""Process.is_running should check actual OS process."""
|
||||
from archivebox.machine.models import Process
|
||||
|
||||
# Create a process with current PID (should be running)
|
||||
proc = Process.objects.create(
|
||||
machine=self.machine,
|
||||
status=Process.StatusChoices.RUNNING,
|
||||
pid=os.getpid(),
|
||||
started_at=timezone.now(),
|
||||
)
|
||||
proc = Process.current()
|
||||
|
||||
# Should be running (current process exists)
|
||||
self.assertTrue(proc.is_running)
|
||||
|
||||
25
bin/lint.sh
25
bin/lint.sh
@@ -16,18 +16,31 @@ source "$DIR/.venv/bin/activate"
|
||||
|
||||
cd "$DIR"
|
||||
|
||||
FAILED=0
|
||||
|
||||
echo "[*] Running ruff..."
|
||||
ruff check archivebox
|
||||
echo "√ No errors found."
|
||||
if ruff check --fix archivebox; then
|
||||
echo "√ No errors found."
|
||||
else
|
||||
FAILED=1
|
||||
fi
|
||||
|
||||
echo
|
||||
|
||||
echo "[*] Running pyright..."
|
||||
pyright
|
||||
echo "√ No errors found."
|
||||
if pyright; then
|
||||
echo "√ No errors found."
|
||||
else
|
||||
FAILED=1
|
||||
fi
|
||||
|
||||
echo
|
||||
|
||||
echo "[*] Running ty..."
|
||||
ty check archivebox
|
||||
echo "√ No errors found."
|
||||
if ty check --force-exclude --exclude '**/migrations/**' archivebox; then
|
||||
echo "√ No errors found."
|
||||
else
|
||||
FAILED=1
|
||||
fi
|
||||
|
||||
exit "$FAILED"
|
||||
|
||||
@@ -82,7 +82,7 @@ dependencies = [
|
||||
"yt-dlp[default]>=2026.03.13", # for: media extractor
|
||||
### Binary/Package Management
|
||||
"abx-pkg>=1.9.10", # for: detecting, versioning, and installing binaries via apt/brew/pip/npm
|
||||
"abx-plugins>=1.9.11", # shared ArchiveBox plugin package with install_args-only overrides
|
||||
"abx-plugins>=1.9.14", # shared ArchiveBox plugin package with install_args-only overrides
|
||||
"gallery-dl>=1.31.1",
|
||||
### UUID7 backport for Python <3.14
|
||||
"uuid7>=0.1.0; python_version < '3.14'", # provides the uuid_extensions module on Python 3.13
|
||||
@@ -265,6 +265,10 @@ reportMissingTypeStubs = false
|
||||
pythonVersion = "3.13"
|
||||
pythonPlatform = "Linux"
|
||||
|
||||
[tool.ty]
|
||||
environment = { python-version = "3.13", python-platform = "linux" }
|
||||
src = { include = ["archivebox"], exclude = [".venv", "**/*.pyi", "**/__init__.pyi", "**/node_modules", "**/__pycache__", "**/migrations"] }
|
||||
|
||||
|
||||
[project.scripts]
|
||||
archivebox = "archivebox.cli:main"
|
||||
|
||||
Reference in New Issue
Block a user