mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
fix lint
This commit is contained in:
@@ -57,6 +57,7 @@ def add(urls: str | list[str],
|
||||
from archivebox.core.models import Snapshot
|
||||
from archivebox.crawls.models import Crawl
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
from archivebox.personas.models import Persona
|
||||
from archivebox.workers.orchestrator import Orchestrator
|
||||
from archivebox.misc.logging_util import printable_filesize
|
||||
from archivebox.misc.system import get_dir_size
|
||||
@@ -79,11 +80,15 @@ def add(urls: str | list[str],
|
||||
|
||||
# Read URLs directly into crawl
|
||||
urls_content = sources_file.read_text()
|
||||
persona_name = (persona or 'Default').strip() or 'Default'
|
||||
persona_obj, _ = Persona.objects.get_or_create(name=persona_name)
|
||||
persona_obj.ensure_dirs()
|
||||
|
||||
crawl = Crawl.objects.create(
|
||||
urls=urls_content,
|
||||
max_depth=depth,
|
||||
tags_str=tag,
|
||||
persona_id=persona_obj.id,
|
||||
label=f'{USER}@{HOSTNAME} $ {cmd_str} [{timestamp}]',
|
||||
created_by_id=created_by_id,
|
||||
config={
|
||||
@@ -91,7 +96,7 @@ def add(urls: str | list[str],
|
||||
'INDEX_ONLY': index_only,
|
||||
'OVERWRITE': overwrite,
|
||||
'PLUGINS': plugins,
|
||||
'DEFAULT_PERSONA': persona or 'Default',
|
||||
'DEFAULT_PERSONA': persona_name,
|
||||
'PARSER': parser,
|
||||
}
|
||||
)
|
||||
@@ -135,8 +140,7 @@ def add(urls: str | list[str],
|
||||
print('[yellow]\\[*] URLs queued. Orchestrator will process them (run `archivebox server` if not already running).[/yellow]')
|
||||
else:
|
||||
# Foreground mode: run full orchestrator until all work is done
|
||||
print(f'[green]\\[*] Starting orchestrator to process crawl...[/green]')
|
||||
from archivebox.workers.orchestrator import Orchestrator
|
||||
print('[green]\\[*] Starting orchestrator to process crawl...[/green]')
|
||||
orchestrator = Orchestrator(exit_on_idle=True, crawl_id=str(crawl.id))
|
||||
orchestrator.runloop() # Block until complete
|
||||
|
||||
|
||||
@@ -94,7 +94,7 @@ def config(*keys,
|
||||
|
||||
# Display all plugin config in single [PLUGINS] section
|
||||
if plugin_keys:
|
||||
print(f'[grey53]\\[PLUGINS][/grey53]')
|
||||
print('[grey53]\\[PLUGINS][/grey53]')
|
||||
print(benedict(plugin_keys).to_toml(encoder=CustomTOMLEncoder()).strip().replace('\n\n', '\n'))
|
||||
print('[grey53]################################################################[/grey53]')
|
||||
|
||||
|
||||
@@ -31,7 +31,6 @@ __package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox extract'
|
||||
|
||||
import sys
|
||||
from typing import Optional, List
|
||||
|
||||
import rich_click as click
|
||||
|
||||
|
||||
@@ -3,8 +3,6 @@
|
||||
__package__ = 'archivebox.cli'
|
||||
|
||||
import os
|
||||
import sys
|
||||
import shutil
|
||||
|
||||
import rich_click as click
|
||||
from rich import print
|
||||
|
||||
@@ -410,7 +410,6 @@ def create_personas(
|
||||
"""
|
||||
from archivebox.misc.jsonl import write_record
|
||||
from archivebox.personas.models import Persona
|
||||
from archivebox.config.constants import CONSTANTS
|
||||
|
||||
is_tty = sys.stdout.isatty()
|
||||
name_list = list(names) if names else []
|
||||
@@ -493,10 +492,10 @@ def create_personas(
|
||||
'SingletonLock', 'SingletonSocket', 'SingletonCookie',
|
||||
),
|
||||
)
|
||||
rprint(f'[green]Copied browser profile to persona[/green]', file=sys.stderr)
|
||||
rprint('[green]Copied browser profile to persona[/green]', file=sys.stderr)
|
||||
|
||||
# Extract cookies via CDP
|
||||
rprint(f'[dim]Extracting cookies via CDP...[/dim]', file=sys.stderr)
|
||||
rprint('[dim]Extracting cookies via CDP...[/dim]', file=sys.stderr)
|
||||
|
||||
if extract_cookies_via_cdp(
|
||||
persona_chrome_dir,
|
||||
@@ -506,8 +505,8 @@ def create_personas(
|
||||
):
|
||||
rprint(f'[green]Extracted cookies to {cookies_file}[/green]', file=sys.stderr)
|
||||
else:
|
||||
rprint(f'[yellow]Could not extract cookies automatically.[/yellow]', file=sys.stderr)
|
||||
rprint(f'[dim]You can manually export cookies using a browser extension.[/dim]', file=sys.stderr)
|
||||
rprint('[yellow]Could not extract cookies automatically.[/yellow]', file=sys.stderr)
|
||||
rprint('[dim]You can manually export cookies using a browser extension.[/dim]', file=sys.stderr)
|
||||
|
||||
except Exception as e:
|
||||
rprint(f'[red]Failed to copy browser profile: {e}[/red]', file=sys.stderr)
|
||||
|
||||
@@ -3,7 +3,6 @@
|
||||
__package__ = 'archivebox.cli'
|
||||
|
||||
from typing import Optional
|
||||
from pathlib import Path
|
||||
|
||||
import rich_click as click
|
||||
|
||||
|
||||
@@ -4,7 +4,7 @@ __package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox search'
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Optional, List, Any
|
||||
from typing import Optional, List
|
||||
|
||||
import rich_click as click
|
||||
from rich import print
|
||||
@@ -71,7 +71,6 @@ def search(filter_patterns: list[str] | None=None,
|
||||
csv: str | None=None,
|
||||
with_headers: bool=False):
|
||||
"""List, filter, and export information about archive entries"""
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
if with_headers and not (json or html or csv):
|
||||
stderr('[X] --with-headers requires --json, --html or --csv\n', color='red')
|
||||
|
||||
@@ -99,7 +99,6 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
|
||||
get_existing_supervisord_process,
|
||||
get_worker,
|
||||
start_server_workers,
|
||||
tail_multiple_worker_logs,
|
||||
is_port_in_use,
|
||||
)
|
||||
from archivebox.workers.orchestrator import Orchestrator
|
||||
@@ -108,14 +107,14 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
|
||||
if is_port_in_use(host, int(port)):
|
||||
print(f'[red][X] Error: Port {port} is already in use[/red]')
|
||||
print(f' Another process (possibly daphne) is already listening on {host}:{port}')
|
||||
print(f' Stop the conflicting process or choose a different port')
|
||||
print(' Stop the conflicting process or choose a different port')
|
||||
sys.exit(1)
|
||||
|
||||
# Check if orchestrator is already running for this data directory
|
||||
if Orchestrator.is_running():
|
||||
print(f'[red][X] Error: ArchiveBox orchestrator is already running for this data directory[/red]')
|
||||
print(f' Stop the existing orchestrator before starting a new server')
|
||||
print(f' To stop: pkill -f "archivebox manage orchestrator"')
|
||||
print('[red][X] Error: ArchiveBox orchestrator is already running for this data directory[/red]')
|
||||
print(' Stop the existing orchestrator before starting a new server')
|
||||
print(' To stop: pkill -f "archivebox manage orchestrator"')
|
||||
sys.exit(1)
|
||||
|
||||
# Check if supervisord is already running
|
||||
@@ -129,7 +128,7 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
|
||||
print('[red][X] Error: ArchiveBox server is already running[/red]')
|
||||
print(f' [green]√[/green] Web server (worker_daphne) is RUNNING on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
|
||||
if orchestrator_proc and orchestrator_proc.get('statename') == 'RUNNING':
|
||||
print(f' [green]√[/green] Background worker (worker_orchestrator) is RUNNING')
|
||||
print(' [green]√[/green] Background worker (worker_orchestrator) is RUNNING')
|
||||
print()
|
||||
print('[yellow]To stop the existing server, run:[/yellow]')
|
||||
print(' pkill -f "archivebox server"')
|
||||
|
||||
@@ -128,13 +128,13 @@ def status(out_dir: Path=DATA_DIR) -> None:
|
||||
if not snapshot.downloaded_at:
|
||||
continue
|
||||
print(
|
||||
'[grey53] ' +
|
||||
(
|
||||
'[grey53] '
|
||||
f' > {str(snapshot.downloaded_at)[:16]} '
|
||||
f'[{snapshot.num_outputs} {("X", "√")[snapshot.is_archived]} {printable_filesize(snapshot.archive_size)}] '
|
||||
f'"{snapshot.title}": {snapshot.url}'
|
||||
)[:SHELL_CONFIG.TERM_WIDTH]
|
||||
+ '[grey53]',
|
||||
'[/grey53]'
|
||||
)[:SHELL_CONFIG.TERM_WIDTH],
|
||||
)
|
||||
print('[grey53] ...')
|
||||
|
||||
|
||||
@@ -36,8 +36,6 @@ def update(filter_patterns: Iterable[str] = (),
|
||||
from archivebox.config.django import setup_django
|
||||
setup_django()
|
||||
|
||||
from archivebox.core.models import Snapshot
|
||||
from django.utils import timezone
|
||||
from django.core.management import call_command
|
||||
|
||||
# Run migrations first to ensure DB schema is up-to-date
|
||||
|
||||
@@ -6,7 +6,7 @@ import sys
|
||||
import os
|
||||
import platform
|
||||
from pathlib import Path
|
||||
from typing import Iterable, Optional
|
||||
from typing import Iterable
|
||||
|
||||
import rich_click as click
|
||||
|
||||
|
||||
@@ -3,13 +3,13 @@
|
||||
__package__ = 'archivebox.cli'
|
||||
|
||||
|
||||
import importlib
|
||||
import os
|
||||
import sys
|
||||
import shutil
|
||||
import sys
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
|
||||
from contextlib import contextmanager
|
||||
from pathlib import Path
|
||||
|
||||
TEST_CONFIG = {
|
||||
'USE_COLOR': 'False',
|
||||
@@ -30,18 +30,15 @@ TEST_CONFIG = {
|
||||
DATA_DIR = 'data.tests'
|
||||
os.environ.update(TEST_CONFIG)
|
||||
|
||||
from ..main import init
|
||||
from archivebox.config.constants import (
|
||||
SQL_INDEX_FILENAME,
|
||||
JSON_INDEX_FILENAME,
|
||||
HTML_INDEX_FILENAME,
|
||||
)
|
||||
|
||||
from . import (
|
||||
archivebox_init,
|
||||
archivebox_add,
|
||||
archivebox_remove,
|
||||
)
|
||||
init = importlib.import_module('archivebox.main').init
|
||||
constants = importlib.import_module('archivebox.config.constants')
|
||||
SQL_INDEX_FILENAME = constants.SQL_INDEX_FILENAME
|
||||
JSON_INDEX_FILENAME = constants.JSON_INDEX_FILENAME
|
||||
HTML_INDEX_FILENAME = constants.HTML_INDEX_FILENAME
|
||||
archivebox_init = importlib.import_module('archivebox.cli.archivebox_init')
|
||||
archivebox_add = importlib.import_module('archivebox.cli.archivebox_add')
|
||||
archivebox_remove = importlib.import_module('archivebox.cli.archivebox_remove')
|
||||
parse_json_main_index = importlib.import_module('archivebox.misc.legacy').parse_json_main_index
|
||||
|
||||
HIDE_CLI_OUTPUT = True
|
||||
|
||||
@@ -68,6 +65,13 @@ stdout = sys.stdout
|
||||
stderr = sys.stderr
|
||||
|
||||
|
||||
def load_main_index(*, out_dir: str):
|
||||
index_path = Path(out_dir) / JSON_INDEX_FILENAME
|
||||
if not index_path.exists():
|
||||
raise FileNotFoundError(index_path)
|
||||
return list(parse_json_main_index(Path(out_dir)))
|
||||
|
||||
|
||||
@contextmanager
|
||||
def output_hidden(show_failing=True):
|
||||
if not HIDE_CLI_OUTPUT:
|
||||
|
||||
@@ -23,7 +23,6 @@ Each command should:
|
||||
__package__ = 'archivebox.cli'
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import shutil
|
||||
import tempfile
|
||||
@@ -101,7 +100,7 @@ class TestJSONLParsing(unittest.TestCase):
|
||||
|
||||
def test_parse_jsonl_with_id(self):
|
||||
"""JSONL with id field should be recognized."""
|
||||
from archivebox.misc.jsonl import parse_line, TYPE_SNAPSHOT
|
||||
from archivebox.misc.jsonl import parse_line
|
||||
|
||||
line = '{"type": "Snapshot", "id": "abc123", "url": "https://example.com"}'
|
||||
result = parse_line(line)
|
||||
@@ -576,8 +575,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
|
||||
"""
|
||||
from archivebox.core.models import Snapshot
|
||||
from archivebox.misc.jsonl import (
|
||||
read_args_or_stdin, write_record,
|
||||
TYPE_SNAPSHOT
|
||||
read_args_or_stdin, TYPE_SNAPSHOT
|
||||
)
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
|
||||
@@ -608,7 +606,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
|
||||
Test: archivebox snapshot URL | archivebox extract
|
||||
Extract should accept JSONL output from snapshot command.
|
||||
"""
|
||||
from archivebox.core.models import Snapshot, ArchiveResult
|
||||
from archivebox.core.models import Snapshot
|
||||
from archivebox.misc.jsonl import (
|
||||
read_args_or_stdin,
|
||||
TYPE_SNAPSHOT
|
||||
@@ -783,7 +781,6 @@ class TestParserPluginWorkflows(unittest.TestCase):
|
||||
Test: archivebox crawl --plugin=parse_html_urls URL | archivebox snapshot | archivebox extract
|
||||
"""
|
||||
from archivebox.hooks import collect_urls_from_plugins
|
||||
from archivebox.misc.jsonl import TYPE_SNAPSHOT
|
||||
|
||||
# Create mock output directory
|
||||
snapshot_dir = Path(self.test_dir) / 'archive' / 'html-parser-test'
|
||||
@@ -938,7 +935,6 @@ class TestPassThroughBehavior(unittest.TestCase):
|
||||
|
||||
def test_crawl_passes_through_other_types(self):
|
||||
"""crawl create should pass through records with other types."""
|
||||
from archivebox.misc.jsonl import TYPE_CRAWL
|
||||
|
||||
# Input: a Tag record (not a Crawl or URL)
|
||||
tag_record = {'type': 'Tag', 'id': 'test-tag', 'name': 'example'}
|
||||
@@ -946,8 +942,9 @@ class TestPassThroughBehavior(unittest.TestCase):
|
||||
|
||||
# Mock stdin with both records
|
||||
stdin = StringIO(
|
||||
json.dumps(tag_record) + '\n' +
|
||||
json.dumps(url_record)
|
||||
json.dumps(tag_record)
|
||||
+ '\n'
|
||||
+ json.dumps(url_record)
|
||||
)
|
||||
stdin.isatty = lambda: False
|
||||
|
||||
@@ -964,7 +961,7 @@ class TestPassThroughBehavior(unittest.TestCase):
|
||||
|
||||
def test_snapshot_passes_through_crawl(self):
|
||||
"""snapshot create should pass through Crawl records."""
|
||||
from archivebox.misc.jsonl import TYPE_CRAWL, TYPE_SNAPSHOT
|
||||
from archivebox.misc.jsonl import TYPE_CRAWL
|
||||
|
||||
crawl_record = {
|
||||
'type': TYPE_CRAWL,
|
||||
|
||||
Reference in New Issue
Block a user