This commit is contained in:
Nick Sweeting
2026-03-15 18:45:29 -07:00
parent f97725d16f
commit 934e02695b
111 changed files with 919 additions and 461 deletions

View File

@@ -57,6 +57,7 @@ def add(urls: str | list[str],
from archivebox.core.models import Snapshot
from archivebox.crawls.models import Crawl
from archivebox.base_models.models import get_or_create_system_user_pk
from archivebox.personas.models import Persona
from archivebox.workers.orchestrator import Orchestrator
from archivebox.misc.logging_util import printable_filesize
from archivebox.misc.system import get_dir_size
@@ -79,11 +80,15 @@ def add(urls: str | list[str],
# Read URLs directly into crawl
urls_content = sources_file.read_text()
persona_name = (persona or 'Default').strip() or 'Default'
persona_obj, _ = Persona.objects.get_or_create(name=persona_name)
persona_obj.ensure_dirs()
crawl = Crawl.objects.create(
urls=urls_content,
max_depth=depth,
tags_str=tag,
persona_id=persona_obj.id,
label=f'{USER}@{HOSTNAME} $ {cmd_str} [{timestamp}]',
created_by_id=created_by_id,
config={
@@ -91,7 +96,7 @@ def add(urls: str | list[str],
'INDEX_ONLY': index_only,
'OVERWRITE': overwrite,
'PLUGINS': plugins,
'DEFAULT_PERSONA': persona or 'Default',
'DEFAULT_PERSONA': persona_name,
'PARSER': parser,
}
)
@@ -135,8 +140,7 @@ def add(urls: str | list[str],
print('[yellow]\\[*] URLs queued. Orchestrator will process them (run `archivebox server` if not already running).[/yellow]')
else:
# Foreground mode: run full orchestrator until all work is done
print(f'[green]\\[*] Starting orchestrator to process crawl...[/green]')
from archivebox.workers.orchestrator import Orchestrator
print('[green]\\[*] Starting orchestrator to process crawl...[/green]')
orchestrator = Orchestrator(exit_on_idle=True, crawl_id=str(crawl.id))
orchestrator.runloop() # Block until complete

View File

@@ -94,7 +94,7 @@ def config(*keys,
# Display all plugin config in single [PLUGINS] section
if plugin_keys:
print(f'[grey53]\\[PLUGINS][/grey53]')
print('[grey53]\\[PLUGINS][/grey53]')
print(benedict(plugin_keys).to_toml(encoder=CustomTOMLEncoder()).strip().replace('\n\n', '\n'))
print('[grey53]################################################################[/grey53]')

View File

@@ -31,7 +31,6 @@ __package__ = 'archivebox.cli'
__command__ = 'archivebox extract'
import sys
from typing import Optional, List
import rich_click as click

View File

@@ -3,8 +3,6 @@
__package__ = 'archivebox.cli'
import os
import sys
import shutil
import rich_click as click
from rich import print

View File

@@ -410,7 +410,6 @@ def create_personas(
"""
from archivebox.misc.jsonl import write_record
from archivebox.personas.models import Persona
from archivebox.config.constants import CONSTANTS
is_tty = sys.stdout.isatty()
name_list = list(names) if names else []
@@ -493,10 +492,10 @@ def create_personas(
'SingletonLock', 'SingletonSocket', 'SingletonCookie',
),
)
rprint(f'[green]Copied browser profile to persona[/green]', file=sys.stderr)
rprint('[green]Copied browser profile to persona[/green]', file=sys.stderr)
# Extract cookies via CDP
rprint(f'[dim]Extracting cookies via CDP...[/dim]', file=sys.stderr)
rprint('[dim]Extracting cookies via CDP...[/dim]', file=sys.stderr)
if extract_cookies_via_cdp(
persona_chrome_dir,
@@ -506,8 +505,8 @@ def create_personas(
):
rprint(f'[green]Extracted cookies to {cookies_file}[/green]', file=sys.stderr)
else:
rprint(f'[yellow]Could not extract cookies automatically.[/yellow]', file=sys.stderr)
rprint(f'[dim]You can manually export cookies using a browser extension.[/dim]', file=sys.stderr)
rprint('[yellow]Could not extract cookies automatically.[/yellow]', file=sys.stderr)
rprint('[dim]You can manually export cookies using a browser extension.[/dim]', file=sys.stderr)
except Exception as e:
rprint(f'[red]Failed to copy browser profile: {e}[/red]', file=sys.stderr)

View File

@@ -3,7 +3,6 @@
__package__ = 'archivebox.cli'
from typing import Optional
from pathlib import Path
import rich_click as click

View File

@@ -4,7 +4,7 @@ __package__ = 'archivebox.cli'
__command__ = 'archivebox search'
from pathlib import Path
from typing import Optional, List, Any
from typing import Optional, List
import rich_click as click
from rich import print
@@ -71,7 +71,6 @@ def search(filter_patterns: list[str] | None=None,
csv: str | None=None,
with_headers: bool=False):
"""List, filter, and export information about archive entries"""
from archivebox.core.models import Snapshot
if with_headers and not (json or html or csv):
stderr('[X] --with-headers requires --json, --html or --csv\n', color='red')

View File

@@ -99,7 +99,6 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
get_existing_supervisord_process,
get_worker,
start_server_workers,
tail_multiple_worker_logs,
is_port_in_use,
)
from archivebox.workers.orchestrator import Orchestrator
@@ -108,14 +107,14 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
if is_port_in_use(host, int(port)):
print(f'[red][X] Error: Port {port} is already in use[/red]')
print(f' Another process (possibly daphne) is already listening on {host}:{port}')
print(f' Stop the conflicting process or choose a different port')
print(' Stop the conflicting process or choose a different port')
sys.exit(1)
# Check if orchestrator is already running for this data directory
if Orchestrator.is_running():
print(f'[red][X] Error: ArchiveBox orchestrator is already running for this data directory[/red]')
print(f' Stop the existing orchestrator before starting a new server')
print(f' To stop: pkill -f "archivebox manage orchestrator"')
print('[red][X] Error: ArchiveBox orchestrator is already running for this data directory[/red]')
print(' Stop the existing orchestrator before starting a new server')
print(' To stop: pkill -f "archivebox manage orchestrator"')
sys.exit(1)
# Check if supervisord is already running
@@ -129,7 +128,7 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
print('[red][X] Error: ArchiveBox server is already running[/red]')
print(f' [green]√[/green] Web server (worker_daphne) is RUNNING on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
if orchestrator_proc and orchestrator_proc.get('statename') == 'RUNNING':
print(f' [green]√[/green] Background worker (worker_orchestrator) is RUNNING')
print(' [green]√[/green] Background worker (worker_orchestrator) is RUNNING')
print()
print('[yellow]To stop the existing server, run:[/yellow]')
print(' pkill -f "archivebox server"')

View File

@@ -128,13 +128,13 @@ def status(out_dir: Path=DATA_DIR) -> None:
if not snapshot.downloaded_at:
continue
print(
'[grey53] ' +
(
'[grey53] '
f' > {str(snapshot.downloaded_at)[:16]} '
f'[{snapshot.num_outputs} {("X", "")[snapshot.is_archived]} {printable_filesize(snapshot.archive_size)}] '
f'"{snapshot.title}": {snapshot.url}'
)[:SHELL_CONFIG.TERM_WIDTH]
+ '[grey53]',
'[/grey53]'
)[:SHELL_CONFIG.TERM_WIDTH],
)
print('[grey53] ...')

View File

@@ -36,8 +36,6 @@ def update(filter_patterns: Iterable[str] = (),
from archivebox.config.django import setup_django
setup_django()
from archivebox.core.models import Snapshot
from django.utils import timezone
from django.core.management import call_command
# Run migrations first to ensure DB schema is up-to-date

View File

@@ -6,7 +6,7 @@ import sys
import os
import platform
from pathlib import Path
from typing import Iterable, Optional
from typing import Iterable
import rich_click as click

View File

@@ -3,13 +3,13 @@
__package__ = 'archivebox.cli'
import importlib
import os
import sys
import shutil
import sys
import unittest
from pathlib import Path
from contextlib import contextmanager
from pathlib import Path
TEST_CONFIG = {
'USE_COLOR': 'False',
@@ -30,18 +30,15 @@ TEST_CONFIG = {
DATA_DIR = 'data.tests'
os.environ.update(TEST_CONFIG)
from ..main import init
from archivebox.config.constants import (
SQL_INDEX_FILENAME,
JSON_INDEX_FILENAME,
HTML_INDEX_FILENAME,
)
from . import (
archivebox_init,
archivebox_add,
archivebox_remove,
)
init = importlib.import_module('archivebox.main').init
constants = importlib.import_module('archivebox.config.constants')
SQL_INDEX_FILENAME = constants.SQL_INDEX_FILENAME
JSON_INDEX_FILENAME = constants.JSON_INDEX_FILENAME
HTML_INDEX_FILENAME = constants.HTML_INDEX_FILENAME
archivebox_init = importlib.import_module('archivebox.cli.archivebox_init')
archivebox_add = importlib.import_module('archivebox.cli.archivebox_add')
archivebox_remove = importlib.import_module('archivebox.cli.archivebox_remove')
parse_json_main_index = importlib.import_module('archivebox.misc.legacy').parse_json_main_index
HIDE_CLI_OUTPUT = True
@@ -68,6 +65,13 @@ stdout = sys.stdout
stderr = sys.stderr
def load_main_index(*, out_dir: str):
    """Return the entries of the JSON main index found in *out_dir* as a list.

    Args:
        out_dir: Directory expected to contain the JSON main index file
            (named by ``JSON_INDEX_FILENAME``). Keyword-only.

    Returns:
        A list built from whatever ``parse_json_main_index`` yields for
        the directory.

    Raises:
        FileNotFoundError: If the index file is not present in *out_dir*;
            the exception argument is the path that was checked.
    """
    data_dir = Path(out_dir)
    index_file = data_dir / JSON_INDEX_FILENAME

    # Check for the file up front so a missing index surfaces as an explicit
    # error instead of depending on how the parser treats an absent file.
    if index_file.exists():
        return list(parse_json_main_index(data_dir))
    raise FileNotFoundError(index_file)
@contextmanager
def output_hidden(show_failing=True):
if not HIDE_CLI_OUTPUT:

View File

@@ -23,7 +23,6 @@ Each command should:
__package__ = 'archivebox.cli'
import os
import sys
import json
import shutil
import tempfile
@@ -101,7 +100,7 @@ class TestJSONLParsing(unittest.TestCase):
def test_parse_jsonl_with_id(self):
"""JSONL with id field should be recognized."""
from archivebox.misc.jsonl import parse_line, TYPE_SNAPSHOT
from archivebox.misc.jsonl import parse_line
line = '{"type": "Snapshot", "id": "abc123", "url": "https://example.com"}'
result = parse_line(line)
@@ -576,8 +575,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
"""
from archivebox.core.models import Snapshot
from archivebox.misc.jsonl import (
read_args_or_stdin, write_record,
TYPE_SNAPSHOT
read_args_or_stdin, TYPE_SNAPSHOT
)
from archivebox.base_models.models import get_or_create_system_user_pk
@@ -608,7 +606,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
Test: archivebox snapshot URL | archivebox extract
Extract should accept JSONL output from snapshot command.
"""
from archivebox.core.models import Snapshot, ArchiveResult
from archivebox.core.models import Snapshot
from archivebox.misc.jsonl import (
read_args_or_stdin,
TYPE_SNAPSHOT
@@ -783,7 +781,6 @@ class TestParserPluginWorkflows(unittest.TestCase):
Test: archivebox crawl --plugin=parse_html_urls URL | archivebox snapshot | archivebox extract
"""
from archivebox.hooks import collect_urls_from_plugins
from archivebox.misc.jsonl import TYPE_SNAPSHOT
# Create mock output directory
snapshot_dir = Path(self.test_dir) / 'archive' / 'html-parser-test'
@@ -938,7 +935,6 @@ class TestPassThroughBehavior(unittest.TestCase):
def test_crawl_passes_through_other_types(self):
"""crawl create should pass through records with other types."""
from archivebox.misc.jsonl import TYPE_CRAWL
# Input: a Tag record (not a Crawl or URL)
tag_record = {'type': 'Tag', 'id': 'test-tag', 'name': 'example'}
@@ -946,8 +942,9 @@ class TestPassThroughBehavior(unittest.TestCase):
# Mock stdin with both records
stdin = StringIO(
json.dumps(tag_record) + '\n' +
json.dumps(url_record)
json.dumps(tag_record)
+ '\n'
+ json.dumps(url_record)
)
stdin.isatty = lambda: False
@@ -964,7 +961,7 @@ class TestPassThroughBehavior(unittest.TestCase):
def test_snapshot_passes_through_crawl(self):
"""snapshot create should pass through Crawl records."""
from archivebox.misc.jsonl import TYPE_CRAWL, TYPE_SNAPSHOT
from archivebox.misc.jsonl import TYPE_CRAWL
crawl_record = {
'type': TYPE_CRAWL,