Nick Sweeting
2025-12-28 17:51:54 -08:00
parent 54f91c1339
commit f0aa19fa7d
157 changed files with 6774 additions and 5061 deletions

View File

@@ -19,7 +19,7 @@ from archivebox.config.permissions import USER, HOSTNAME
 if TYPE_CHECKING:
-    from core.models import Snapshot
+    from archivebox.core.models import Snapshot
 @enforce_types
@@ -53,8 +53,8 @@ def add(urls: str | list[str],
     assert depth in (0, 1, 2, 3, 4), 'Depth must be 0-4'
     # import models once django is set up
-    from core.models import Snapshot
-    from crawls.models import Crawl
+    from archivebox.core.models import Snapshot
+    from archivebox.crawls.models import Crawl
     from archivebox.base_models.models import get_or_create_system_user_pk
     from workers.orchestrator import Orchestrator
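
Note: the recurring change in this commit swaps bare Django-app imports for fully-qualified ones. A minimal sketch of the difference, assuming archivebox is installed as a normal package (the helper function below is hypothetical, for illustration only):

    # Old form only resolves when the archivebox/ source dir itself is on sys.path:
    # from core.models import Snapshot
    # New form resolves from any entry point (CLI, tests, external scripts):
    from archivebox.core.models import Snapshot

    def count_snapshots() -> int:  # hypothetical helper, not from the commit
        return Snapshot.objects.count()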

View File

@@ -66,18 +66,38 @@ def config(*keys,
             raise SystemExit(1)
         else:
             matching_config = FLAT_CONFIG
+        # Display core config sections
         for config_section in CONFIGS.values():
             if hasattr(config_section, 'toml_section_header'):
                 print(f'[grey53]\\[{config_section.toml_section_header}][/grey53]')
             else:
                 print('[grey53]\\[CONSTANTS] # (read-only)[/grey53]')
             kv_in_section = {key: val for key, val in dict(config_section).items() if key in matching_config}
             print(benedict(kv_in_section).to_toml(encoder=CustomTOMLEncoder()).strip().replace('\n\n', '\n'))
             print('[grey53]################################################################[/grey53]')
+        # Display plugin config section
+        from archivebox.hooks import discover_plugin_configs
+        plugin_configs = discover_plugin_configs()
+        plugin_keys = {}
+        # Collect all plugin config keys
+        for plugin_name, schema in plugin_configs.items():
+            if 'properties' not in schema:
+                continue
+            for key in schema['properties'].keys():
+                if key in matching_config:
+                    plugin_keys[key] = matching_config[key]
+        # Display all plugin config in single [PLUGINS] section
+        if plugin_keys:
+            print(f'[grey53]\\[PLUGINS][/grey53]')
+            print(benedict(plugin_keys).to_toml(encoder=CustomTOMLEncoder()).strip().replace('\n\n', '\n'))
+            print('[grey53]################################################################[/grey53]')
         raise SystemExit(not matching_config)
     elif set:
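
Note: the new [PLUGINS] block assumes discover_plugin_configs() returns a JSON-Schema-like mapping per plugin (the 'properties' access above implies this). A hedged sketch of the key-collection step, with hypothetical plugin names and keys:

    # Hypothetical schemas, shaped the way the loop above expects:
    plugin_configs = {
        'wget':  {'properties': {'WGET_BINARY': {'type': 'string'}}},
        'title': {},  # plugin with no config: skipped by the 'properties' check
    }
    matching_config = {'WGET_BINARY': 'wget', 'SECRET_KEY': 'xyz'}

    plugin_keys = {
        key: matching_config[key]
        for schema in plugin_configs.values() if 'properties' in schema
        for key in schema['properties']
        if key in matching_config
    }
    assert plugin_keys == {'WGET_BINARY': 'wget'}  # only plugin-declared keys survive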

View File

@@ -72,11 +72,11 @@ def discover_outlinks(
     from archivebox.misc.jsonl import (
         read_args_or_stdin, write_record,
-        TYPE_SNAPSHOT, get_or_create_snapshot
+        TYPE_SNAPSHOT
     )
     from archivebox.base_models.models import get_or_create_system_user_pk
-    from core.models import Snapshot, ArchiveResult
-    from crawls.models import Crawl
+    from archivebox.core.models import Snapshot, ArchiveResult
+    from archivebox.crawls.models import Crawl
     from archivebox.config import CONSTANTS
     from workers.orchestrator import Orchestrator
@@ -130,8 +130,10 @@ def discover_outlinks(
             record['crawl_id'] = str(crawl.id)
             record['depth'] = record.get('depth', 0)
-            snapshot = get_or_create_snapshot(record, created_by_id=created_by_id)
-            snapshot_ids.append(str(snapshot.id))
+            overrides = {'created_by_id': created_by_id}
+            snapshot = Snapshot.from_jsonl(record, overrides=overrides)
+            if snapshot:
+                snapshot_ids.append(str(snapshot.id))
         except Exception as e:
             rprint(f'[red]Error creating snapshot: {e}[/red]', file=sys.stderr)
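
Note: Snapshot.from_jsonl() replaces the old get_or_create_snapshot() helper here, and evidently can return None for records it rejects (hence the new guard). A hedged sketch of the calling pattern, with `records` standing in for any parsed JSONL iterable:

    snapshot_ids = []
    for record in records:  # e.g. dicts yielded by read_args_or_stdin()
        snapshot = Snapshot.from_jsonl(record, overrides={'created_by_id': created_by_id})
        if snapshot is None:  # invalid/rejected record: skip rather than crash
            continue
        snapshot_ids.append(str(snapshot.id))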
@@ -162,7 +164,6 @@ def discover_outlinks(
                 defaults={
                     'status': ArchiveResult.StatusChoices.QUEUED,
                     'retry_at': timezone.now(),
-                    'created_by_id': snapshot.created_by_id,
                 }
             )
         else:
@@ -229,7 +230,7 @@ def process_crawl_by_id(crawl_id: str) -> int:
     - Transition from started -> sealed (when all snapshots done)
     """
     from rich import print as rprint
-    from crawls.models import Crawl
+    from archivebox.crawls.models import Crawl
     try:
         crawl = Crawl.objects.get(id=crawl_id)
@@ -256,7 +257,7 @@ def is_crawl_id(value: str) -> bool:
     if not uuid_pattern.match(value):
         return False
     # Verify it's actually a Crawl (not a Snapshot or other object)
-    from crawls.models import Crawl
+    from archivebox.crawls.models import Crawl
     return Crawl.objects.filter(id=value).exists()
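
Note: the is_crawl_id()/is_archiveresult_id() helpers in this commit share one shape: a cheap syntactic check first, then a single EXISTS query to disambiguate the model. A generic sketch (the uuid_pattern regex is assumed from the surrounding code, and the parameterized helper is hypothetical):

    import re

    uuid_pattern = re.compile(
        r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$',
        re.IGNORECASE,
    )

    def is_model_id(model, value: str) -> bool:
        # Regex test avoids a DB round-trip for obviously non-UUID arguments.
        if not uuid_pattern.match(value):
            return False
        # EXISTS query confirms the UUID belongs to this model, not a sibling.
        return model.objects.filter(id=value).exists()

    # Usage, with the models from this diff:
    # is_model_id(Crawl, arg) or is_model_id(ArchiveResult, arg)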

View File

@@ -43,7 +43,7 @@ def process_archiveresult_by_id(archiveresult_id: str) -> int:
     Triggers the ArchiveResult's state machine tick() to run the extractor plugin.
     """
     from rich import print as rprint
-    from core.models import ArchiveResult
+    from archivebox.core.models import ArchiveResult
     try:
         archiveresult = ArchiveResult.objects.get(id=archiveresult_id)
@@ -95,7 +95,7 @@ def run_plugins(
         read_args_or_stdin, write_record, archiveresult_to_jsonl,
         TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
     )
-    from core.models import Snapshot, ArchiveResult
+    from archivebox.core.models import Snapshot, ArchiveResult
     from workers.orchestrator import Orchestrator
     is_tty = sys.stdout.isatty()
@@ -155,7 +155,6 @@ def run_plugins(
             defaults={
                 'status': ArchiveResult.StatusChoices.QUEUED,
                 'retry_at': timezone.now(),
-                'created_by_id': snapshot.created_by_id,
             }
         )
         if not created and result.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED]:
@@ -218,7 +217,7 @@ def is_archiveresult_id(value: str) -> bool:
     if not uuid_pattern.match(value):
         return False
     # Verify it's actually an ArchiveResult (not a Snapshot or other object)
-    from core.models import ArchiveResult
+    from archivebox.core.models import ArchiveResult
     return ArchiveResult.objects.filter(id=value).exists()
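
Note: dropping 'created_by_id' from defaults (here and in the discover hunk above) is safe for existing rows because of how Django's get_or_create() treats that dict: defaults are applied only when a new row is inserted, never used for matching or for updating an existing row. Presumably the model now populates created_by on its own; that part is an assumption. The standard semantics, with illustrative lookup kwargs:

    result, created = ArchiveResult.objects.get_or_create(
        snapshot=snapshot,      # lookup fields (illustrative): match an existing row
        extractor=extractor,
        defaults={              # applied ONLY when no matching row exists
            'status': ArchiveResult.StatusChoices.QUEUED,
            'retry_at': timezone.now(),
        },
    )
    # created=True -> new row received the defaults; created=False -> row returned as-is.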

View File

@@ -95,7 +95,7 @@ def init(force: bool=False, quick: bool=False, install: bool=False, setup: bool=
     print()
     print('[dodger_blue3][*] Checking links from indexes and archive folders (safe to Ctrl+C)...[/dodger_blue3]')
-    from core.models import Snapshot
+    from archivebox.core.models import Snapshot
     all_links = Snapshot.objects.none()
     pending_links: dict[str, SnapshotDict] = {}

View File

@@ -42,7 +42,7 @@ def install(dry_run: bool=False) -> None:
     setup_django()
     from django.utils import timezone
-    from crawls.models import Crawl
+    from archivebox.crawls.models import Crawl
     from archivebox.base_models.models import get_or_create_system_user_pk
     # Create a crawl for dependency detection
@@ -70,7 +70,7 @@ def install(dry_run: bool=False) -> None:
     print(f'[+] Crawl status: {crawl.status}, retry_at: {crawl.retry_at}')
     # Verify the crawl is in the queue
-    from crawls.models import Crawl as CrawlModel
+    from archivebox.crawls.models import Crawl as CrawlModel
     queued_crawls = CrawlModel.objects.filter(
         retry_at__lte=timezone.now()
     ).exclude(
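
Note: the queue check relies on the retry_at convention visible throughout this commit: an object is "due" when retry_at <= now() (the .exclude(...) clause is truncated by the hunk boundary and left as-is). A minimal due-work query under that assumption:

    from django.utils import timezone
    from archivebox.crawls.models import Crawl

    # Crawls whose retry_at has come due are eligible for an orchestrator tick.
    due_crawls = Crawl.objects.filter(retry_at__lte=timezone.now())
    for crawl in due_crawls.iterator():
        print(f'[+] due: {crawl.id} status={crawl.status}')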

View File

@@ -71,7 +71,7 @@ def remove(filter_patterns: Iterable[str]=(),
     to_remove = snapshots.count()
     from archivebox.search import flush_search_index
-    from core.models import Snapshot
+    from archivebox.core.models import Snapshot
     flush_search_index(snapshots=snapshots)
     snapshots.delete()

View File

@@ -36,7 +36,7 @@ def get_snapshots(snapshots: Optional[QuerySet]=None,
                   before: Optional[float]=None,
                   out_dir: Path=DATA_DIR) -> QuerySet:
     """Filter and return Snapshots matching the given criteria."""
-    from core.models import Snapshot
+    from archivebox.core.models import Snapshot
     if snapshots:
         result = snapshots
@@ -68,7 +68,7 @@ def search(filter_patterns: list[str] | None=None,
            csv: str | None=None,
            with_headers: bool=False):
     """List, filter, and export information about archive entries"""
-    from core.models import Snapshot
+    from archivebox.core.models import Snapshot
     if with_headers and not (json or html or csv):
         stderr('[X] --with-headers requires --json, --html or --csv\n', color='red')

View File

@@ -46,7 +46,7 @@ def process_snapshot_by_id(snapshot_id: str) -> int:
     - Transition from started -> sealed (when all ArchiveResults done)
     """
     from rich import print as rprint
-    from core.models import Snapshot
+    from archivebox.core.models import Snapshot
     try:
         snapshot = Snapshot.objects.get(id=snapshot_id)
@@ -88,11 +88,11 @@ def create_snapshots(
     from archivebox.misc.jsonl import (
         read_args_or_stdin, write_record, snapshot_to_jsonl,
-        TYPE_SNAPSHOT, TYPE_TAG, get_or_create_snapshot
+        TYPE_SNAPSHOT, TYPE_TAG
     )
     from archivebox.base_models.models import get_or_create_system_user_pk
-    from core.models import Snapshot
-    from crawls.models import Crawl
+    from archivebox.core.models import Snapshot
+    from archivebox.crawls.models import Crawl
     from archivebox.config import CONSTANTS
     created_by_id = created_by_id or get_or_create_system_user_pk()
@@ -137,8 +137,10 @@ def create_snapshots(
                 record['tags'] = tag
             # Get or create the snapshot
-            snapshot = get_or_create_snapshot(record, created_by_id=created_by_id)
-            created_snapshots.append(snapshot)
+            overrides = {'created_by_id': created_by_id}
+            snapshot = Snapshot.from_jsonl(record, overrides=overrides)
+            if snapshot:
+                created_snapshots.append(snapshot)
             # Output JSONL record (only when piped)
             if not is_tty:
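
Note: create_snapshots only emits JSONL when stdout is not a TTY, which is what lets pipelines like `archivebox snapshot URL | archivebox extract` compose. A hedged sketch of that producer side (snapshot_to_jsonl and write_record are the helpers imported above; their exact signatures are assumed):

    import sys

    is_tty = sys.stdout.isatty()
    for record in read_args_or_stdin(args):
        snapshot = Snapshot.from_jsonl(record, overrides={'created_by_id': created_by_id})
        if not snapshot:
            continue
        if not is_tty:
            # Piped: emit a machine-readable record for the next command.
            write_record(snapshot_to_jsonl(snapshot))
        else:
            # Interactive: human-readable output instead of JSONL.
            print(f'[+] {snapshot.url}')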

View File

@@ -21,7 +21,7 @@ def status(out_dir: Path=DATA_DIR) -> None:
     from django.contrib.auth import get_user_model
     from archivebox.misc.db import get_admins
-    from core.models import Snapshot
+    from archivebox.core.models import Snapshot
     User = get_user_model()
     print('[green]\\[*] Scanning archive main index...[/green]')

View File

@@ -36,7 +36,7 @@ def update(filter_patterns: Iterable[str] = (),
     from archivebox.config.django import setup_django
     setup_django()
-    from core.models import Snapshot
+    from archivebox.core.models import Snapshot
     from django.utils import timezone
     while True:
@@ -83,7 +83,7 @@ def import_orphans_from_archive(resume_from: str = None, batch_size: int = 100)
     Skip symlinks (already migrated).
     Create DB records and trigger migration on save().
     """
-    from core.models import Snapshot
+    from archivebox.core.models import Snapshot
     from archivebox.config import CONSTANTS
     from django.db import transaction
@@ -151,7 +151,7 @@ def process_all_db_snapshots(batch_size: int = 100) -> dict:
     Process all snapshots in DB.
     Reconcile index.json and queue for archiving.
     """
-    from core.models import Snapshot
+    from archivebox.core.models import Snapshot
     from django.db import transaction
     from django.utils import timezone
@@ -189,7 +189,7 @@ def process_filtered_snapshots(
     batch_size: int
 ) -> dict:
     """Process snapshots matching filters (DB query only)."""
-    from core.models import Snapshot
+    from archivebox.core.models import Snapshot
     from django.db import transaction
     from django.utils import timezone
     from datetime import datetime

View File

@@ -107,7 +107,7 @@ def version(quiet: bool=False,
     from archivebox.config.django import setup_django
     setup_django()
-    from machine.models import Machine, Binary
+    from archivebox.machine.models import Machine, Binary
     machine = Machine.current()

View File

@@ -542,10 +542,10 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
         Test: archivebox snapshot URL
         Should create a Snapshot and output JSONL when piped.
         """
-        from core.models import Snapshot
+        from archivebox.core.models import Snapshot
         from archivebox.misc.jsonl import (
             read_args_or_stdin, write_record, snapshot_to_jsonl,
-            TYPE_SNAPSHOT, get_or_create_snapshot
+            TYPE_SNAPSHOT
         )
         from archivebox.base_models.models import get_or_create_system_user_pk
@@ -559,7 +559,8 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
         self.assertEqual(records[0]['url'], url)
         # Create snapshot
-        snapshot = get_or_create_snapshot(records[0], created_by_id=created_by_id)
+        overrides = {'created_by_id': created_by_id}
+        snapshot = Snapshot.from_jsonl(records[0], overrides=overrides)
         self.assertIsNotNone(snapshot.id)
         self.assertEqual(snapshot.url, url)
@@ -575,9 +576,9 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
         Test: archivebox snapshot URL | archivebox extract
         Extract should accept JSONL output from snapshot command.
         """
-        from core.models import Snapshot, ArchiveResult
+        from archivebox.core.models import Snapshot, ArchiveResult
         from archivebox.misc.jsonl import (
-            snapshot_to_jsonl, read_args_or_stdin, get_or_create_snapshot,
+            snapshot_to_jsonl, read_args_or_stdin,
             TYPE_SNAPSHOT
         )
         from archivebox.base_models.models import get_or_create_system_user_pk
@@ -586,7 +587,8 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
         # Step 1: Create snapshot (simulating 'archivebox snapshot')
         url = 'https://test-extract-1.example.com'
-        snapshot = get_or_create_snapshot({'url': url}, created_by_id=created_by_id)
+        overrides = {'created_by_id': created_by_id}
+        snapshot = Snapshot.from_jsonl({'url': url}, overrides=overrides)
         snapshot_output = snapshot_to_jsonl(snapshot)
         # Step 2: Parse snapshot output as extract input
@@ -648,7 +650,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
         This is equivalent to: archivebox add URL
         """
-        from core.models import Snapshot
+        from archivebox.core.models import Snapshot
         from archivebox.misc.jsonl import (
             get_or_create_snapshot, snapshot_to_jsonl, read_args_or_stdin,
             TYPE_SNAPSHOT
@@ -682,7 +684,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
         This is equivalent to: archivebox add --depth=1 URL
         """
-        from core.models import Snapshot
+        from archivebox.core.models import Snapshot
         from archivebox.misc.jsonl import (
             get_or_create_snapshot, snapshot_to_jsonl, read_args_or_stdin,
             TYPE_SNAPSHOT
@@ -772,7 +774,7 @@ class TestDepthWorkflows(unittest.TestCase):
         Depth 0: Only archive the specified URL, no crawling.
         """
-        from core.models import Snapshot
+        from archivebox.core.models import Snapshot
         from archivebox.misc.jsonl import get_or_create_snapshot
         from archivebox.base_models.models import get_or_create_system_user_pk