use full dotted paths for all archivebox imports, add migrations and more fixes

This commit is contained in:
Nick Sweeting
2025-12-29 00:47:08 -08:00
parent 1e4d3ffd11
commit f4e7820533
61 changed files with 1082 additions and 2985 deletions

View File

@@ -56,7 +56,7 @@ def add(urls: str | list[str],
from archivebox.core.models import Snapshot
from archivebox.crawls.models import Crawl
from archivebox.base_models.models import get_or_create_system_user_pk
from workers.orchestrator import Orchestrator
from archivebox.workers.orchestrator import Orchestrator
created_by_id = created_by_id or get_or_create_system_user_pk()

View File

@@ -78,7 +78,7 @@ def discover_outlinks(
from archivebox.core.models import Snapshot, ArchiveResult
from archivebox.crawls.models import Crawl
from archivebox.config import CONSTANTS
from workers.orchestrator import Orchestrator
from archivebox.workers.orchestrator import Orchestrator
created_by_id = get_or_create_system_user_pk()
is_tty = sys.stdout.isatty()

View File

@@ -96,7 +96,7 @@ def run_plugins(
TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
)
from archivebox.core.models import Snapshot, ArchiveResult
from workers.orchestrator import Orchestrator
from archivebox.workers.orchestrator import Orchestrator
is_tty = sys.stdout.isatty()

View File

@@ -13,11 +13,9 @@ from archivebox.misc.util import docstring, enforce_types
@enforce_types
def init(force: bool=False, quick: bool=False, install: bool=False, setup: bool=False) -> None:
def init(force: bool=False, quick: bool=False, install: bool=False) -> None:
"""Initialize a new ArchiveBox collection in the current directory"""
install = install or setup
from archivebox.config import CONSTANTS, VERSION, DATA_DIR
from archivebox.config.common import SERVER_CONFIG
from archivebox.config.collection import write_config_file
@@ -128,7 +126,8 @@ def init(force: bool=False, quick: bool=False, install: bool=False, setup: bool=
print(f' [yellow]√ Added {len(orphaned_data_dir_links)} orphaned links from existing archive directories.[/yellow]')
if pending_links:
Snapshot.objects.create_from_dicts(list(pending_links.values()))
for link_dict in pending_links.values():
Snapshot.from_jsonl(link_dict)
# Hint for orphaned snapshot directories
print()
@@ -187,7 +186,6 @@ def init(force: bool=False, quick: bool=False, install: bool=False, setup: bool=
@click.option('--force', '-f', is_flag=True, help='Ignore unrecognized files in current directory and initialize anyway')
@click.option('--quick', '-q', is_flag=True, help='Run any updates or migrations without rechecking all snapshot dirs')
@click.option('--install', '-s', is_flag=True, help='Automatically install dependencies and extras used for archiving')
@click.option('--setup', '-s', is_flag=True, help='DEPRECATED: equivalent to --install')
@docstring(init.__doc__)
def main(**kwargs) -> None:
init(**kwargs)

View File

@@ -85,7 +85,7 @@ def install(dry_run: bool=False) -> None:
print()
# Run the crawl synchronously (this triggers on_Crawl hooks)
from workers.orchestrator import Orchestrator
from archivebox.workers.orchestrator import Orchestrator
orchestrator = Orchestrator(exit_on_idle=True)
orchestrator.runloop()

View File

@@ -37,7 +37,7 @@ def orchestrator(daemon: bool = False, watch: bool = False) -> int:
0: All work completed successfully
1: Error occurred
"""
from workers.orchestrator import Orchestrator
from archivebox.workers.orchestrator import Orchestrator
if Orchestrator.is_running():
print('[yellow]Orchestrator is already running[/yellow]')

View File

@@ -74,7 +74,7 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
tail_multiple_worker_logs,
is_port_in_use,
)
from workers.orchestrator import Orchestrator
from archivebox.workers.orchestrator import Orchestrator
import sys
# Check if port is already in use

View File

@@ -163,7 +163,7 @@ def create_snapshots(
# If --plugins is passed, run the orchestrator for those plugins
if plugins:
from workers.orchestrator import Orchestrator
from archivebox.workers.orchestrator import Orchestrator
rprint(f'[blue]Running plugins: {plugins or "all"}...[/blue]', file=sys.stderr)
orchestrator = Orchestrator(exit_on_idle=True)
orchestrator.runloop()

View File

@@ -160,7 +160,7 @@ def process_all_db_snapshots(batch_size: int = 100) -> dict:
total = Snapshot.objects.count()
print(f'[*] Processing {total} snapshots from database...')
for snapshot in Snapshot.objects.iterator():
for snapshot in Snapshot.objects.iterator(chunk_size=batch_size):
# Reconcile index.json with DB
snapshot.reconcile_with_index_json()
@@ -209,7 +209,7 @@ def process_filtered_snapshots(
total = snapshots.count()
print(f'[*] Found {total} matching snapshots')
for snapshot in snapshots.iterator():
for snapshot in snapshots.iterator(chunk_size=batch_size):
# Reconcile index.json with DB
snapshot.reconcile_with_index_json()

View File

@@ -17,7 +17,7 @@ TEST_CONFIG = {
'DATA_DIR': 'data.tests',
'SAVE_ARCHIVE_DOT_ORG': 'False',
'SAVE_ARCHIVEDOTORG': 'False',
'SAVE_TITLE': 'False',
'USE_CURL': 'False',

View File

@@ -32,7 +32,7 @@ from unittest.mock import patch, MagicMock
TEST_CONFIG = {
'USE_COLOR': 'False',
'SHOW_PROGRESS': 'False',
'SAVE_ARCHIVE_DOT_ORG': 'False',
'SAVE_ARCHIVEDOTORG': 'False',
'SAVE_TITLE': 'True', # Fast extractor
'SAVE_FAVICON': 'False',
'SAVE_WGET': 'False',