fix lint

2026-04-06 07:47:53 +10:00 · 2026-03-15 18:45:29 -07:00
parent f97725d16f
commit 934e02695b
111 changed files with 919 additions and 461 deletions
--- a/archivebox/cli/archivebox_add.py
+++ b/archivebox/cli/archivebox_add.py
@@ -57,6 +57,7 @@ def add(urls: str | list[str],
    from archivebox.core.models import Snapshot
    from archivebox.crawls.models import Crawl
    from archivebox.base_models.models import get_or_create_system_user_pk
+    from archivebox.personas.models import Persona
    from archivebox.workers.orchestrator import Orchestrator
    from archivebox.misc.logging_util import printable_filesize
    from archivebox.misc.system import get_dir_size
@@ -79,11 +80,15 @@ def add(urls: str | list[str],

    # Read URLs directly into crawl
    urls_content = sources_file.read_text()
+    persona_name = (persona or 'Default').strip() or 'Default'
+    persona_obj, _ = Persona.objects.get_or_create(name=persona_name)
+    persona_obj.ensure_dirs()

    crawl = Crawl.objects.create(
        urls=urls_content,
        max_depth=depth,
        tags_str=tag,
+        persona_id=persona_obj.id,
        label=f'{USER}@{HOSTNAME} $ {cmd_str} [{timestamp}]',
        created_by_id=created_by_id,
        config={
@@ -91,7 +96,7 @@ def add(urls: str | list[str],
            'INDEX_ONLY': index_only,
            'OVERWRITE': overwrite,
            'PLUGINS': plugins,
-            'DEFAULT_PERSONA': persona or 'Default',
+            'DEFAULT_PERSONA': persona_name,
            'PARSER': parser,
        }
    )
@@ -135,8 +140,7 @@ def add(urls: str | list[str],
        print('[yellow]\\[*] URLs queued. Orchestrator will process them (run `archivebox server` if not already running).[/yellow]')
    else:
        # Foreground mode: run full orchestrator until all work is done
-        print(f'[green]\\[*] Starting orchestrator to process crawl...[/green]')
-        from archivebox.workers.orchestrator import Orchestrator
+        print('[green]\\[*] Starting orchestrator to process crawl...[/green]')
        orchestrator = Orchestrator(exit_on_idle=True, crawl_id=str(crawl.id))
        orchestrator.runloop()  # Block until complete

--- a/archivebox/cli/archivebox_config.py
+++ b/archivebox/cli/archivebox_config.py
@@ -94,7 +94,7 @@ def config(*keys,

        # Display all plugin config in single [PLUGINS] section
        if plugin_keys:
-            print(f'[grey53]\\[PLUGINS][/grey53]')
+            print('[grey53]\\[PLUGINS][/grey53]')
            print(benedict(plugin_keys).to_toml(encoder=CustomTOMLEncoder()).strip().replace('\n\n', '\n'))
            print('[grey53]################################################################[/grey53]')

--- a/archivebox/cli/archivebox_extract.py
+++ b/archivebox/cli/archivebox_extract.py
@@ -31,7 +31,6 @@ __package__ = 'archivebox.cli'
 __command__ = 'archivebox extract'

 import sys
-from typing import Optional, List

 import rich_click as click

--- a/archivebox/cli/archivebox_install.py
+++ b/archivebox/cli/archivebox_install.py
@@ -3,8 +3,6 @@
 __package__ = 'archivebox.cli'

 import os
-import sys
-import shutil

 import rich_click as click
 from rich import print
--- a/archivebox/cli/archivebox_persona.py
+++ b/archivebox/cli/archivebox_persona.py
@@ -410,7 +410,6 @@ def create_personas(
    """
    from archivebox.misc.jsonl import write_record
    from archivebox.personas.models import Persona
-    from archivebox.config.constants import CONSTANTS

    is_tty = sys.stdout.isatty()
    name_list = list(names) if names else []
@@ -493,10 +492,10 @@ def create_personas(
                        'SingletonLock', 'SingletonSocket', 'SingletonCookie',
                    ),
                )
-                rprint(f'[green]Copied browser profile to persona[/green]', file=sys.stderr)
+                rprint('[green]Copied browser profile to persona[/green]', file=sys.stderr)

                # Extract cookies via CDP
-                rprint(f'[dim]Extracting cookies via CDP...[/dim]', file=sys.stderr)
+                rprint('[dim]Extracting cookies via CDP...[/dim]', file=sys.stderr)

                if extract_cookies_via_cdp(
                    persona_chrome_dir,
@@ -506,8 +505,8 @@ def create_personas(
                ):
                    rprint(f'[green]Extracted cookies to {cookies_file}[/green]', file=sys.stderr)
                else:
-                    rprint(f'[yellow]Could not extract cookies automatically.[/yellow]', file=sys.stderr)
-                    rprint(f'[dim]You can manually export cookies using a browser extension.[/dim]', file=sys.stderr)
+                    rprint('[yellow]Could not extract cookies automatically.[/yellow]', file=sys.stderr)
+                    rprint('[dim]You can manually export cookies using a browser extension.[/dim]', file=sys.stderr)

            except Exception as e:
                rprint(f'[red]Failed to copy browser profile: {e}[/red]', file=sys.stderr)
--- a/archivebox/cli/archivebox_pluginmap.py
+++ b/archivebox/cli/archivebox_pluginmap.py
@@ -3,7 +3,6 @@
 __package__ = 'archivebox.cli'

 from typing import Optional
-from pathlib import Path

 import rich_click as click

--- a/archivebox/cli/archivebox_search.py
+++ b/archivebox/cli/archivebox_search.py
@@ -4,7 +4,7 @@ __package__ = 'archivebox.cli'
 __command__ = 'archivebox search'

 from pathlib import Path
-from typing import Optional, List, Any
+from typing import Optional, List

 import rich_click as click
 from rich import print
@@ -71,7 +71,6 @@ def search(filter_patterns: list[str] | None=None,
           csv: str | None=None,
           with_headers: bool=False):
    """List, filter, and export information about archive entries"""
-    from archivebox.core.models import Snapshot

    if with_headers and not (json or html or csv):
        stderr('[X] --with-headers requires --json, --html or --csv\n', color='red')
--- a/archivebox/cli/archivebox_server.py
+++ b/archivebox/cli/archivebox_server.py
@@ -99,7 +99,6 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
            get_existing_supervisord_process,
            get_worker,
            start_server_workers,
-            tail_multiple_worker_logs,
            is_port_in_use,
        )
        from archivebox.workers.orchestrator import Orchestrator
@@ -108,14 +107,14 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
        if is_port_in_use(host, int(port)):
            print(f'[red][X] Error: Port {port} is already in use[/red]')
            print(f'    Another process (possibly daphne) is already listening on {host}:{port}')
-            print(f'    Stop the conflicting process or choose a different port')
+            print('    Stop the conflicting process or choose a different port')
            sys.exit(1)

        # Check if orchestrator is already running for this data directory
        if Orchestrator.is_running():
-            print(f'[red][X] Error: ArchiveBox orchestrator is already running for this data directory[/red]')
-            print(f'    Stop the existing orchestrator before starting a new server')
-            print(f'    To stop: pkill -f "archivebox manage orchestrator"')
+            print('[red][X] Error: ArchiveBox orchestrator is already running for this data directory[/red]')
+            print('    Stop the existing orchestrator before starting a new server')
+            print('    To stop: pkill -f "archivebox manage orchestrator"')
            sys.exit(1)

        # Check if supervisord is already running
@@ -129,7 +128,7 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
                print('[red][X] Error: ArchiveBox server is already running[/red]')
                print(f'    [green]√[/green] Web server (worker_daphne) is RUNNING on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
                if orchestrator_proc and orchestrator_proc.get('statename') == 'RUNNING':
-                    print(f'    [green]√[/green] Background worker (worker_orchestrator) is RUNNING')
+                    print('    [green]√[/green] Background worker (worker_orchestrator) is RUNNING')
                print()
                print('[yellow]To stop the existing server, run:[/yellow]')
                print('    pkill -f "archivebox server"')
--- a/archivebox/cli/archivebox_status.py
+++ b/archivebox/cli/archivebox_status.py
@@ -128,13 +128,13 @@ def status(out_dir: Path=DATA_DIR) -> None:
        if not snapshot.downloaded_at:
            continue
        print(
-            '[grey53] ' +
            (
+                '[grey53] '
                f'   > {str(snapshot.downloaded_at)[:16]} '
                f'[{snapshot.num_outputs} {("X", "√")[snapshot.is_archived]} {printable_filesize(snapshot.archive_size)}] '
                f'"{snapshot.title}": {snapshot.url}'
-            )[:SHELL_CONFIG.TERM_WIDTH]
-            + '[grey53]',
+                '[/grey53]'
+            )[:SHELL_CONFIG.TERM_WIDTH],
        )
    print('[grey53]   ...')

--- a/archivebox/cli/archivebox_update.py
+++ b/archivebox/cli/archivebox_update.py
@@ -36,8 +36,6 @@ def update(filter_patterns: Iterable[str] = (),
    from archivebox.config.django import setup_django
    setup_django()

-    from archivebox.core.models import Snapshot
-    from django.utils import timezone
    from django.core.management import call_command

    # Run migrations first to ensure DB schema is up-to-date
--- a/archivebox/cli/archivebox_version.py
+++ b/archivebox/cli/archivebox_version.py
@@ -6,7 +6,7 @@ import sys
 import os
 import platform
 from pathlib import Path
-from typing import Iterable, Optional
+from typing import Iterable

 import rich_click as click

--- a/archivebox/cli/tests.py
+++ b/archivebox/cli/tests.py
@@ -3,13 +3,13 @@
 __package__ = 'archivebox.cli'


+import importlib
 import os
-import sys
 import shutil
+import sys
 import unittest
-from pathlib import Path
-
 from contextlib import contextmanager
+from pathlib import Path

 TEST_CONFIG = {
    'USE_COLOR': 'False',
@@ -30,18 +30,15 @@ TEST_CONFIG = {
 DATA_DIR = 'data.tests'
 os.environ.update(TEST_CONFIG)

-from ..main import init
-from archivebox.config.constants import (
-    SQL_INDEX_FILENAME,
-    JSON_INDEX_FILENAME,
-    HTML_INDEX_FILENAME,
-)
-
-from . import (
-    archivebox_init,
-    archivebox_add,
-    archivebox_remove,
-)
+init = importlib.import_module('archivebox.main').init
+constants = importlib.import_module('archivebox.config.constants')
+SQL_INDEX_FILENAME = constants.SQL_INDEX_FILENAME
+JSON_INDEX_FILENAME = constants.JSON_INDEX_FILENAME
+HTML_INDEX_FILENAME = constants.HTML_INDEX_FILENAME
+archivebox_init = importlib.import_module('archivebox.cli.archivebox_init')
+archivebox_add = importlib.import_module('archivebox.cli.archivebox_add')
+archivebox_remove = importlib.import_module('archivebox.cli.archivebox_remove')
+parse_json_main_index = importlib.import_module('archivebox.misc.legacy').parse_json_main_index

 HIDE_CLI_OUTPUT = True

@@ -68,6 +65,13 @@ stdout = sys.stdout
 stderr = sys.stderr


+def load_main_index(*, out_dir: str) -> list:  # test helper: read the JSON main index stored under out_dir
+    index_path = Path(out_dir) / JSON_INDEX_FILENAME
+    if not index_path.exists():
+        raise FileNotFoundError(index_path)  # fail loudly rather than hand the parser a directory with no index file
+    return list(parse_json_main_index(Path(out_dir)))  # parser receives the directory (not index_path); list() materializes its result
+
+
@contextmanager
 def output_hidden(show_failing=True):
    if not HIDE_CLI_OUTPUT:
--- a/archivebox/cli/tests_piping.py
+++ b/archivebox/cli/tests_piping.py
@@ -23,7 +23,6 @@ Each command should:
 __package__ = 'archivebox.cli'

 import os
-import sys
 import json
 import shutil
 import tempfile
@@ -101,7 +100,7 @@ class TestJSONLParsing(unittest.TestCase):

    def test_parse_jsonl_with_id(self):
        """JSONL with id field should be recognized."""
-        from archivebox.misc.jsonl import parse_line, TYPE_SNAPSHOT
+        from archivebox.misc.jsonl import parse_line

        line = '{"type": "Snapshot", "id": "abc123", "url": "https://example.com"}'
        result = parse_line(line)
@@ -576,8 +575,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
        """
        from archivebox.core.models import Snapshot
        from archivebox.misc.jsonl import (
-            read_args_or_stdin, write_record,
-            TYPE_SNAPSHOT
+            read_args_or_stdin, TYPE_SNAPSHOT
        )
        from archivebox.base_models.models import get_or_create_system_user_pk

@@ -608,7 +606,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
        Test: archivebox snapshot URL | archivebox extract
        Extract should accept JSONL output from snapshot command.
        """
-        from archivebox.core.models import Snapshot, ArchiveResult
+        from archivebox.core.models import Snapshot
        from archivebox.misc.jsonl import (
            read_args_or_stdin,
            TYPE_SNAPSHOT
@@ -783,7 +781,6 @@ class TestParserPluginWorkflows(unittest.TestCase):
        Test: archivebox crawl --plugin=parse_html_urls URL | archivebox snapshot | archivebox extract
        """
        from archivebox.hooks import collect_urls_from_plugins
-        from archivebox.misc.jsonl import TYPE_SNAPSHOT

        # Create mock output directory
        snapshot_dir = Path(self.test_dir) / 'archive' / 'html-parser-test'
@@ -938,7 +935,6 @@ class TestPassThroughBehavior(unittest.TestCase):

    def test_crawl_passes_through_other_types(self):
        """crawl create should pass through records with other types."""
-        from archivebox.misc.jsonl import TYPE_CRAWL

        # Input: a Tag record (not a Crawl or URL)
        tag_record = {'type': 'Tag', 'id': 'test-tag', 'name': 'example'}
@@ -946,8 +942,9 @@ class TestPassThroughBehavior(unittest.TestCase):

        # Mock stdin with both records
        stdin = StringIO(
-            json.dumps(tag_record) + '\n' +
-            json.dumps(url_record)
+            json.dumps(tag_record)
+            + '\n'
+            + json.dumps(url_record)
        )
        stdin.isatty = lambda: False

@@ -964,7 +961,7 @@ class TestPassThroughBehavior(unittest.TestCase):

    def test_snapshot_passes_through_crawl(self):
        """snapshot create should pass through Crawl records."""
-        from archivebox.misc.jsonl import TYPE_CRAWL, TYPE_SNAPSHOT
+        from archivebox.misc.jsonl import TYPE_CRAWL

        crawl_record = {
            'type': TYPE_CRAWL,