use full dotted paths for all archivebox imports, add migrations and more fixes

2026-04-04 14:57:56 +10:00 · 2025-12-29 00:47:08 -08:00
parent 1e4d3ffd11
commit f4e7820533
61 changed files with 1082 additions and 2985 deletions
--- a/archivebox/cli/archivebox_add.py
+++ b/archivebox/cli/archivebox_add.py
@@ -56,7 +56,7 @@ def add(urls: str | list[str],
    from archivebox.core.models import Snapshot
    from archivebox.crawls.models import Crawl
    from archivebox.base_models.models import get_or_create_system_user_pk
-    from workers.orchestrator import Orchestrator
+    from archivebox.workers.orchestrator import Orchestrator

    created_by_id = created_by_id or get_or_create_system_user_pk()

--- a/archivebox/cli/archivebox_crawl.py
+++ b/archivebox/cli/archivebox_crawl.py
@@ -78,7 +78,7 @@ def discover_outlinks(
    from archivebox.core.models import Snapshot, ArchiveResult
    from archivebox.crawls.models import Crawl
    from archivebox.config import CONSTANTS
-    from workers.orchestrator import Orchestrator
+    from archivebox.workers.orchestrator import Orchestrator

    created_by_id = get_or_create_system_user_pk()
    is_tty = sys.stdout.isatty()
--- a/archivebox/cli/archivebox_extract.py
+++ b/archivebox/cli/archivebox_extract.py
@@ -96,7 +96,7 @@ def run_plugins(
        TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
    )
    from archivebox.core.models import Snapshot, ArchiveResult
-    from workers.orchestrator import Orchestrator
+    from archivebox.workers.orchestrator import Orchestrator

    is_tty = sys.stdout.isatty()

--- a/archivebox/cli/archivebox_init.py
+++ b/archivebox/cli/archivebox_init.py
@@ -13,11 +13,9 @@ from archivebox.misc.util import docstring, enforce_types


@enforce_types
-def init(force: bool=False, quick: bool=False, install: bool=False, setup: bool=False) -> None:
+def init(force: bool=False, quick: bool=False, install: bool=False) -> None:
    """Initialize a new ArchiveBox collection in the current directory"""
    
-    install = install or setup
-    
    from archivebox.config import CONSTANTS, VERSION, DATA_DIR
    from archivebox.config.common import SERVER_CONFIG
    from archivebox.config.collection import write_config_file
@@ -128,7 +126,8 @@ def init(force: bool=False, quick: bool=False, install: bool=False, setup: bool=
                print(f'    [yellow]√ Added {len(orphaned_data_dir_links)} orphaned links from existing archive directories.[/yellow]')

            if pending_links:
-                Snapshot.objects.create_from_dicts(list(pending_links.values()))
+                for link_dict in pending_links.values():
+                    Snapshot.from_jsonl(link_dict)

            # Hint for orphaned snapshot directories
            print()
@@ -187,7 +186,6 @@ def init(force: bool=False, quick: bool=False, install: bool=False, setup: bool=
@click.option('--force', '-f', is_flag=True, help='Ignore unrecognized files in current directory and initialize anyway')
@click.option('--quick', '-q', is_flag=True, help='Run any updates or migrations without rechecking all snapshot dirs')
@click.option('--install', '-s', is_flag=True, help='Automatically install dependencies and extras used for archiving')
-@click.option('--setup', '-s', is_flag=True, help='DEPRECATED: equivalent to --install')
@docstring(init.__doc__)
 def main(**kwargs) -> None:
    init(**kwargs)
--- a/archivebox/cli/archivebox_install.py
+++ b/archivebox/cli/archivebox_install.py
@@ -85,7 +85,7 @@ def install(dry_run: bool=False) -> None:
    print()

    # Run the crawl synchronously (this triggers on_Crawl hooks)
-    from workers.orchestrator import Orchestrator
+    from archivebox.workers.orchestrator import Orchestrator
    orchestrator = Orchestrator(exit_on_idle=True)
    orchestrator.runloop()

--- a/archivebox/cli/archivebox_orchestrator.py
+++ b/archivebox/cli/archivebox_orchestrator.py
@@ -37,7 +37,7 @@ def orchestrator(daemon: bool = False, watch: bool = False) -> int:
        0: All work completed successfully
        1: Error occurred
    """
-    from workers.orchestrator import Orchestrator
+    from archivebox.workers.orchestrator import Orchestrator
    
    if Orchestrator.is_running():
        print('[yellow]Orchestrator is already running[/yellow]')
--- a/archivebox/cli/archivebox_server.py
+++ b/archivebox/cli/archivebox_server.py
@@ -74,7 +74,7 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
            tail_multiple_worker_logs,
            is_port_in_use,
        )
-        from workers.orchestrator import Orchestrator
+        from archivebox.workers.orchestrator import Orchestrator
        import sys

        # Check if port is already in use
--- a/archivebox/cli/archivebox_snapshot.py
+++ b/archivebox/cli/archivebox_snapshot.py
@@ -163,7 +163,7 @@ def create_snapshots(

    # If --plugins is passed, run the orchestrator for those plugins
    if plugins:
-        from workers.orchestrator import Orchestrator
+        from archivebox.workers.orchestrator import Orchestrator
        rprint(f'[blue]Running plugins: {plugins or "all"}...[/blue]', file=sys.stderr)
        orchestrator = Orchestrator(exit_on_idle=True)
        orchestrator.runloop()
--- a/archivebox/cli/archivebox_update.py
+++ b/archivebox/cli/archivebox_update.py
@@ -160,7 +160,7 @@ def process_all_db_snapshots(batch_size: int = 100) -> dict:
    total = Snapshot.objects.count()
    print(f'[*] Processing {total} snapshots from database...')

-    for snapshot in Snapshot.objects.iterator():
+    for snapshot in Snapshot.objects.iterator(chunk_size=batch_size):
        # Reconcile index.json with DB
        snapshot.reconcile_with_index_json()

@@ -209,7 +209,7 @@ def process_filtered_snapshots(
    total = snapshots.count()
    print(f'[*] Found {total} matching snapshots')

-    for snapshot in snapshots.iterator():
+    for snapshot in snapshots.iterator(chunk_size=batch_size):
        # Reconcile index.json with DB
        snapshot.reconcile_with_index_json()

--- a/archivebox/cli/tests.py
+++ b/archivebox/cli/tests.py
@@ -17,7 +17,7 @@ TEST_CONFIG = {

    'DATA_DIR': 'data.tests',
    
-    'SAVE_ARCHIVE_DOT_ORG': 'False',
+    'SAVE_ARCHIVEDOTORG': 'False',
    'SAVE_TITLE': 'False',
    
    'USE_CURL': 'False',
--- a/archivebox/cli/tests_piping.py
+++ b/archivebox/cli/tests_piping.py
@@ -32,7 +32,7 @@ from unittest.mock import patch, MagicMock
 TEST_CONFIG = {
    'USE_COLOR': 'False',
    'SHOW_PROGRESS': 'False',
-    'SAVE_ARCHIVE_DOT_ORG': 'False',
+    'SAVE_ARCHIVEDOTORG': 'False',
    'SAVE_TITLE': 'True',  # Fast extractor
    'SAVE_FAVICON': 'False',
    'SAVE_WGET': 'False',