Nick Sweeting
2025-12-28 17:51:54 -08:00
parent 54f91c1339
commit f0aa19fa7d
157 changed files with 6774 additions and 5061 deletions

View File

@@ -19,7 +19,7 @@ from archivebox.config.permissions import USER, HOSTNAME
 if TYPE_CHECKING:
-    from core.models import Snapshot
+    from archivebox.core.models import Snapshot
 @enforce_types
@@ -53,8 +53,8 @@ def add(urls: str | list[str],
     assert depth in (0, 1, 2, 3, 4), 'Depth must be 0-4'
     # import models once django is set up
-    from core.models import Snapshot
-    from crawls.models import Crawl
+    from archivebox.core.models import Snapshot
+    from archivebox.crawls.models import Crawl
     from archivebox.base_models.models import get_or_create_system_user_pk
     from workers.orchestrator import Orchestrator
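
Note: the recurring change in this commit swaps bare Django-app imports for fully-qualified ones. A minimal sketch of the difference, assuming archivebox is installed as a normal package (the helper function below is hypothetical, for illustration only):

    # Old form only resolves when the archivebox/ source dir itself is on sys.path:
    # from core.models import Snapshot
    # New form resolves from any entry point (CLI, tests, external scripts):
    from archivebox.core.models import Snapshot

    def count_snapshots() -> int:  # hypothetical helper, not from the commit
        return Snapshot.objects.count()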

View File

@@ -66,18 +66,38 @@ def config(*keys,
             raise SystemExit(1)
         else:
             matching_config = FLAT_CONFIG
+        # Display core config sections
         for config_section in CONFIGS.values():
             if hasattr(config_section, 'toml_section_header'):
                 print(f'[grey53]\\[{config_section.toml_section_header}][/grey53]')
             else:
                 print('[grey53]\\[CONSTANTS] # (read-only)[/grey53]')
             kv_in_section = {key: val for key, val in dict(config_section).items() if key in matching_config}
             print(benedict(kv_in_section).to_toml(encoder=CustomTOMLEncoder()).strip().replace('\n\n', '\n'))
             print('[grey53]################################################################[/grey53]')
+        # Display plugin config section
+        from archivebox.hooks import discover_plugin_configs
+        plugin_configs = discover_plugin_configs()
+        plugin_keys = {}
+        # Collect all plugin config keys
+        for plugin_name, schema in plugin_configs.items():
+            if 'properties' not in schema:
+                continue
+            for key in schema['properties'].keys():
+                if key in matching_config:
+                    plugin_keys[key] = matching_config[key]
+        # Display all plugin config in single [PLUGINS] section
+        if plugin_keys:
+            print(f'[grey53]\\[PLUGINS][/grey53]')
+            print(benedict(plugin_keys).to_toml(encoder=CustomTOMLEncoder()).strip().replace('\n\n', '\n'))
+            print('[grey53]################################################################[/grey53]')
         raise SystemExit(not matching_config)
     elif set:
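
Note: the new [PLUGINS] block assumes discover_plugin_configs() returns a JSON-Schema-like mapping per plugin (the 'properties' access above implies this). A hedged sketch of the key-collection step, with hypothetical plugin names and keys:

    # Hypothetical schemas, shaped the way the loop above expects:
    plugin_configs = {
        'wget':  {'properties': {'WGET_BINARY': {'type': 'string'}}},
        'title': {},  # plugin with no config: skipped by the 'properties' check
    }
    matching_config = {'WGET_BINARY': 'wget', 'SECRET_KEY': 'xyz'}

    plugin_keys = {
        key: matching_config[key]
        for schema in plugin_configs.values() if 'properties' in schema
        for key in schema['properties']
        if key in matching_config
    }
    assert plugin_keys == {'WGET_BINARY': 'wget'}  # only plugin-declared keys survive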

View File

@@ -72,11 +72,11 @@ def discover_outlinks(
     from archivebox.misc.jsonl import (
         read_args_or_stdin, write_record,
-        TYPE_SNAPSHOT, get_or_create_snapshot
+        TYPE_SNAPSHOT
     )
     from archivebox.base_models.models import get_or_create_system_user_pk
-    from core.models import Snapshot, ArchiveResult
-    from crawls.models import Crawl
+    from archivebox.core.models import Snapshot, ArchiveResult
+    from archivebox.crawls.models import Crawl
     from archivebox.config import CONSTANTS
     from workers.orchestrator import Orchestrator
@@ -130,8 +130,10 @@ def discover_outlinks(
             record['crawl_id'] = str(crawl.id)
             record['depth'] = record.get('depth', 0)
-            snapshot = get_or_create_snapshot(record, created_by_id=created_by_id)
-            snapshot_ids.append(str(snapshot.id))
+            overrides = {'created_by_id': created_by_id}
+            snapshot = Snapshot.from_jsonl(record, overrides=overrides)
+            if snapshot:
+                snapshot_ids.append(str(snapshot.id))
         except Exception as e:
             rprint(f'[red]Error creating snapshot: {e}[/red]', file=sys.stderr)
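
Note: Snapshot.from_jsonl() replaces the old get_or_create_snapshot() helper here, and evidently can return None for records it rejects (hence the new guard). A hedged sketch of the calling pattern, with `records` standing in for any parsed JSONL iterable:

    snapshot_ids = []
    for record in records:  # e.g. dicts yielded by read_args_or_stdin()
        snapshot = Snapshot.from_jsonl(record, overrides={'created_by_id': created_by_id})
        if snapshot is None:  # invalid/rejected record: skip rather than crash
            continue
        snapshot_ids.append(str(snapshot.id))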
@@ -162,7 +164,6 @@ def discover_outlinks(
                 defaults={
                     'status': ArchiveResult.StatusChoices.QUEUED,
                     'retry_at': timezone.now(),
-                    'created_by_id': snapshot.created_by_id,
                 }
             )
         else:
@@ -229,7 +230,7 @@ def process_crawl_by_id(crawl_id: str) -> int:
     - Transition from started -> sealed (when all snapshots done)
     """
     from rich import print as rprint
-    from crawls.models import Crawl
+    from archivebox.crawls.models import Crawl
     try:
         crawl = Crawl.objects.get(id=crawl_id)
@@ -256,7 +257,7 @@ def is_crawl_id(value: str) -> bool:
     if not uuid_pattern.match(value):
         return False
     # Verify it's actually a Crawl (not a Snapshot or other object)
-    from crawls.models import Crawl
+    from archivebox.crawls.models import Crawl
     return Crawl.objects.filter(id=value).exists()
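
Note: the is_crawl_id()/is_archiveresult_id() helpers in this commit share one shape: a cheap syntactic check first, then a single EXISTS query to disambiguate the model. A generic sketch (the uuid_pattern regex is assumed from the surrounding code, and the parameterized helper is hypothetical):

    import re

    uuid_pattern = re.compile(
        r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$',
        re.IGNORECASE,
    )

    def is_model_id(model, value: str) -> bool:
        # Regex test avoids a DB round-trip for obviously non-UUID arguments.
        if not uuid_pattern.match(value):
            return False
        # EXISTS query confirms the UUID belongs to this model, not a sibling.
        return model.objects.filter(id=value).exists()

    # Usage, with the models from this diff:
    # is_model_id(Crawl, arg) or is_model_id(ArchiveResult, arg)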

View File

@@ -43,7 +43,7 @@ def process_archiveresult_by_id(archiveresult_id: str) -> int:
     Triggers the ArchiveResult's state machine tick() to run the extractor plugin.
     """
     from rich import print as rprint
-    from core.models import ArchiveResult
+    from archivebox.core.models import ArchiveResult
     try:
         archiveresult = ArchiveResult.objects.get(id=archiveresult_id)
@@ -95,7 +95,7 @@ def run_plugins(
         read_args_or_stdin, write_record, archiveresult_to_jsonl,
         TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
     )
-    from core.models import Snapshot, ArchiveResult
+    from archivebox.core.models import Snapshot, ArchiveResult
     from workers.orchestrator import Orchestrator
     is_tty = sys.stdout.isatty()
@@ -155,7 +155,6 @@ def run_plugins(
             defaults={
                 'status': ArchiveResult.StatusChoices.QUEUED,
                 'retry_at': timezone.now(),
-                'created_by_id': snapshot.created_by_id,
             }
         )
         if not created and result.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED]:
@@ -218,7 +217,7 @@ def is_archiveresult_id(value: str) -> bool:
     if not uuid_pattern.match(value):
         return False
     # Verify it's actually an ArchiveResult (not a Snapshot or other object)
-    from core.models import ArchiveResult
+    from archivebox.core.models import ArchiveResult
     return ArchiveResult.objects.filter(id=value).exists()
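
Note: dropping 'created_by_id' from defaults (here and in the discover hunk above) is safe for existing rows because of how Django's get_or_create() treats that dict: defaults are applied only when a new row is inserted, never used for matching or for updating an existing row. Presumably the model now populates created_by on its own; that part is an assumption. The standard semantics, with illustrative lookup kwargs:

    result, created = ArchiveResult.objects.get_or_create(
        snapshot=snapshot,      # lookup fields (illustrative): match an existing row
        extractor=extractor,
        defaults={              # applied ONLY when no matching row exists
            'status': ArchiveResult.StatusChoices.QUEUED,
            'retry_at': timezone.now(),
        },
    )
    # created=True -> new row received the defaults; created=False -> row returned as-is.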

View File

@@ -95,7 +95,7 @@ def init(force: bool=False, quick: bool=False, install: bool=False, setup: bool=
     print()
     print('[dodger_blue3][*] Checking links from indexes and archive folders (safe to Ctrl+C)...[/dodger_blue3]')
-    from core.models import Snapshot
+    from archivebox.core.models import Snapshot
     all_links = Snapshot.objects.none()
     pending_links: dict[str, SnapshotDict] = {}

View File

@@ -42,7 +42,7 @@ def install(dry_run: bool=False) -> None:
     setup_django()
     from django.utils import timezone
-    from crawls.models import Crawl
+    from archivebox.crawls.models import Crawl
     from archivebox.base_models.models import get_or_create_system_user_pk
     # Create a crawl for dependency detection
@@ -70,7 +70,7 @@ def install(dry_run: bool=False) -> None:
     print(f'[+] Crawl status: {crawl.status}, retry_at: {crawl.retry_at}')
     # Verify the crawl is in the queue
-    from crawls.models import Crawl as CrawlModel
+    from archivebox.crawls.models import Crawl as CrawlModel
     queued_crawls = CrawlModel.objects.filter(
         retry_at__lte=timezone.now()
     ).exclude(
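
Note: the queue check relies on the retry_at convention visible throughout this commit: an object is "due" when retry_at <= now() (the .exclude(...) clause is truncated by the hunk boundary and left as-is). A minimal due-work query under that assumption:

    from django.utils import timezone
    from archivebox.crawls.models import Crawl

    # Crawls whose retry_at has come due are eligible for an orchestrator tick.
    due_crawls = Crawl.objects.filter(retry_at__lte=timezone.now())
    for crawl in due_crawls.iterator():
        print(f'[+] due: {crawl.id} status={crawl.status}')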

View File

@@ -71,7 +71,7 @@ def remove(filter_patterns: Iterable[str]=(),
     to_remove = snapshots.count()
     from archivebox.search import flush_search_index
-    from core.models import Snapshot
+    from archivebox.core.models import Snapshot
     flush_search_index(snapshots=snapshots)
     snapshots.delete()

View File

@@ -36,7 +36,7 @@ def get_snapshots(snapshots: Optional[QuerySet]=None,
                   before: Optional[float]=None,
                   out_dir: Path=DATA_DIR) -> QuerySet:
     """Filter and return Snapshots matching the given criteria."""
-    from core.models import Snapshot
+    from archivebox.core.models import Snapshot
     if snapshots:
         result = snapshots
@@ -68,7 +68,7 @@ def search(filter_patterns: list[str] | None=None,
            csv: str | None=None,
            with_headers: bool=False):
     """List, filter, and export information about archive entries"""
-    from core.models import Snapshot
+    from archivebox.core.models import Snapshot
     if with_headers and not (json or html or csv):
         stderr('[X] --with-headers requires --json, --html or --csv\n', color='red')

View File

@@ -46,7 +46,7 @@ def process_snapshot_by_id(snapshot_id: str) -> int:
     - Transition from started -> sealed (when all ArchiveResults done)
     """
     from rich import print as rprint
-    from core.models import Snapshot
+    from archivebox.core.models import Snapshot
     try:
         snapshot = Snapshot.objects.get(id=snapshot_id)
@@ -88,11 +88,11 @@ def create_snapshots(
     from archivebox.misc.jsonl import (
         read_args_or_stdin, write_record, snapshot_to_jsonl,
-        TYPE_SNAPSHOT, TYPE_TAG, get_or_create_snapshot
+        TYPE_SNAPSHOT, TYPE_TAG
     )
     from archivebox.base_models.models import get_or_create_system_user_pk
-    from core.models import Snapshot
-    from crawls.models import Crawl
+    from archivebox.core.models import Snapshot
+    from archivebox.crawls.models import Crawl
     from archivebox.config import CONSTANTS
     created_by_id = created_by_id or get_or_create_system_user_pk()
@@ -137,8 +137,10 @@ def create_snapshots(
                 record['tags'] = tag
             # Get or create the snapshot
-            snapshot = get_or_create_snapshot(record, created_by_id=created_by_id)
-            created_snapshots.append(snapshot)
+            overrides = {'created_by_id': created_by_id}
+            snapshot = Snapshot.from_jsonl(record, overrides=overrides)
+            if snapshot:
+                created_snapshots.append(snapshot)
             # Output JSONL record (only when piped)
             if not is_tty:
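
Note: create_snapshots only emits JSONL when stdout is not a TTY, which is what lets pipelines like `archivebox snapshot URL | archivebox extract` compose. A hedged sketch of that producer side (snapshot_to_jsonl and write_record are the helpers imported above; their exact signatures are assumed):

    import sys

    is_tty = sys.stdout.isatty()
    for record in read_args_or_stdin(args):
        snapshot = Snapshot.from_jsonl(record, overrides={'created_by_id': created_by_id})
        if not snapshot:
            continue
        if not is_tty:
            # Piped: emit a machine-readable record for the next command.
            write_record(snapshot_to_jsonl(snapshot))
        else:
            # Interactive: human-readable output instead of JSONL.
            print(f'[+] {snapshot.url}')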

View File

@@ -21,7 +21,7 @@ def status(out_dir: Path=DATA_DIR) -> None:
     from django.contrib.auth import get_user_model
     from archivebox.misc.db import get_admins
-    from core.models import Snapshot
+    from archivebox.core.models import Snapshot
     User = get_user_model()
     print('[green]\\[*] Scanning archive main index...[/green]')

View File

@@ -36,7 +36,7 @@ def update(filter_patterns: Iterable[str] = (),
     from archivebox.config.django import setup_django
     setup_django()
-    from core.models import Snapshot
+    from archivebox.core.models import Snapshot
     from django.utils import timezone
     while True:
@@ -83,7 +83,7 @@ def import_orphans_from_archive(resume_from: str = None, batch_size: int = 100)
     Skip symlinks (already migrated).
     Create DB records and trigger migration on save().
     """
-    from core.models import Snapshot
+    from archivebox.core.models import Snapshot
     from archivebox.config import CONSTANTS
     from django.db import transaction
@@ -151,7 +151,7 @@ def process_all_db_snapshots(batch_size: int = 100) -> dict:
     Process all snapshots in DB.
     Reconcile index.json and queue for archiving.
     """
-    from core.models import Snapshot
+    from archivebox.core.models import Snapshot
     from django.db import transaction
     from django.utils import timezone
@@ -189,7 +189,7 @@ def process_filtered_snapshots(
     batch_size: int
 ) -> dict:
     """Process snapshots matching filters (DB query only)."""
-    from core.models import Snapshot
+    from archivebox.core.models import Snapshot
     from django.db import transaction
     from django.utils import timezone
     from datetime import datetime

View File

@@ -107,7 +107,7 @@ def version(quiet: bool=False,
     from archivebox.config.django import setup_django
     setup_django()
-    from machine.models import Machine, Binary
+    from archivebox.machine.models import Machine, Binary
     machine = Machine.current()

View File

@@ -542,10 +542,10 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
         Test: archivebox snapshot URL
         Should create a Snapshot and output JSONL when piped.
         """
-        from core.models import Snapshot
+        from archivebox.core.models import Snapshot
         from archivebox.misc.jsonl import (
             read_args_or_stdin, write_record, snapshot_to_jsonl,
-            TYPE_SNAPSHOT, get_or_create_snapshot
+            TYPE_SNAPSHOT
         )
         from archivebox.base_models.models import get_or_create_system_user_pk
@@ -559,7 +559,8 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
         self.assertEqual(records[0]['url'], url)
         # Create snapshot
-        snapshot = get_or_create_snapshot(records[0], created_by_id=created_by_id)
+        overrides = {'created_by_id': created_by_id}
+        snapshot = Snapshot.from_jsonl(records[0], overrides=overrides)
         self.assertIsNotNone(snapshot.id)
         self.assertEqual(snapshot.url, url)
@@ -575,9 +576,9 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
         Test: archivebox snapshot URL | archivebox extract
         Extract should accept JSONL output from snapshot command.
         """
-        from core.models import Snapshot, ArchiveResult
+        from archivebox.core.models import Snapshot, ArchiveResult
         from archivebox.misc.jsonl import (
-            snapshot_to_jsonl, read_args_or_stdin, get_or_create_snapshot,
+            snapshot_to_jsonl, read_args_or_stdin,
             TYPE_SNAPSHOT
         )
         from archivebox.base_models.models import get_or_create_system_user_pk
@@ -586,7 +587,8 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
         # Step 1: Create snapshot (simulating 'archivebox snapshot')
         url = 'https://test-extract-1.example.com'
-        snapshot = get_or_create_snapshot({'url': url}, created_by_id=created_by_id)
+        overrides = {'created_by_id': created_by_id}
+        snapshot = Snapshot.from_jsonl({'url': url}, overrides=overrides)
         snapshot_output = snapshot_to_jsonl(snapshot)
         # Step 2: Parse snapshot output as extract input
@@ -648,7 +650,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
         This is equivalent to: archivebox add URL
         """
-        from core.models import Snapshot
+        from archivebox.core.models import Snapshot
         from archivebox.misc.jsonl import (
             get_or_create_snapshot, snapshot_to_jsonl, read_args_or_stdin,
             TYPE_SNAPSHOT
@@ -682,7 +684,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
         This is equivalent to: archivebox add --depth=1 URL
         """
-        from core.models import Snapshot
+        from archivebox.core.models import Snapshot
         from archivebox.misc.jsonl import (
             get_or_create_snapshot, snapshot_to_jsonl, read_args_or_stdin,
             TYPE_SNAPSHOT
@@ -772,7 +774,7 @@ class TestDepthWorkflows(unittest.TestCase):
         Depth 0: Only archive the specified URL, no crawling.
         """
-        from core.models import Snapshot
+        from archivebox.core.models import Snapshot
         from archivebox.misc.jsonl import get_or_create_snapshot
         from archivebox.base_models.models import get_or_create_system_user_pk