WIP: checkpoint working tree before rebasing onto dev

This commit is contained in:
Nick Sweeting
2026-03-22 20:23:45 -07:00
parent a6548df8d0
commit f400a2cd67
87 changed files with 12607 additions and 1808 deletions

View File

@@ -47,11 +47,13 @@ def _collect_input_urls(args: tuple[str, ...]) -> list[str]:
def add(urls: str | list[str],
depth: int | str=0,
tag: str='',
url_allowlist: str='',
url_denylist: str='',
parser: str="auto",
plugins: str="",
persona: str='Default',
overwrite: bool=False,
update: bool=not ARCHIVING_CONFIG.ONLY_NEW,
update: bool | None=None,
index_only: bool=False,
bg: bool=False,
created_by_id: int | None=None) -> tuple['Crawl', QuerySet['Snapshot']]:
@@ -85,6 +87,8 @@ def add(urls: str | list[str],
created_by_id = created_by_id or get_or_create_system_user_pk()
started_at = timezone.now()
if update is None:
update = not ARCHIVING_CONFIG.ONLY_NEW
# 1. Save the provided URLs to sources/2024-11-05__23-59-59__cli_add.txt
sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__cli_add.txt'
@@ -120,6 +124,8 @@ def add(urls: str | list[str],
'PLUGINS': plugins,
'DEFAULT_PERSONA': persona_name,
'PARSER': parser,
**({'URL_ALLOWLIST': url_allowlist} if url_allowlist else {}),
**({'URL_DENYLIST': url_denylist} if url_denylist else {}),
}
)
@@ -150,6 +156,9 @@ def add(urls: str | list[str],
snapshot.ensure_crawl_symlink()
return crawl, crawl.snapshot_set.all()
if bg:
crawl.create_snapshots_from_urls()
# 5. Start the crawl runner to process the queue
# The runner will:
# - Process Crawl -> create Snapshots from all URLs
@@ -192,8 +201,7 @@ def add(urls: str | list[str],
except Exception:
rel_output_str = str(crawl.output_dir)
# Build admin URL from SERVER_CONFIG
bind_addr = SERVER_CONFIG.BIND_ADDR
bind_addr = SERVER_CONFIG.BIND_ADDR or '127.0.0.1:8000'
if bind_addr.startswith('http://') or bind_addr.startswith('https://'):
base_url = bind_addr
else:
@@ -218,11 +226,13 @@ def add(urls: str | list[str],
@click.command()
@click.option('--depth', '-d', type=click.Choice([str(i) for i in range(5)]), default='0', help='Recursively archive linked pages up to N hops away')
@click.option('--tag', '-t', default='', help='Comma-separated list of tags to add to each snapshot e.g. tag1,tag2,tag3')
@click.option('--url-allowlist', '--domain-allowlist', default='', help='Comma-separated URL/domain allowlist for this crawl')
@click.option('--url-denylist', '--domain-denylist', default='', help='Comma-separated URL/domain denylist for this crawl')
@click.option('--parser', default='auto', help='Parser for reading input URLs (auto, txt, html, rss, json, jsonl, netscape, ...)')
@click.option('--plugins', '-p', default='', help='Comma-separated list of plugins to run e.g. title,favicon,screenshot,singlefile,...')
@click.option('--persona', default='Default', help='Authentication profile to use when archiving')
@click.option('--overwrite', '-F', is_flag=True, help='Overwrite existing data if URLs have been archived previously')
@click.option('--update', is_flag=True, default=ARCHIVING_CONFIG.ONLY_NEW, help='Retry any previously skipped/failed URLs when re-adding them')
@click.option('--update', is_flag=True, default=None, help='Retry any previously skipped/failed URLs when re-adding them')
@click.option('--index-only', is_flag=True, help='Just add the URLs to the index without archiving them now')
@click.option('--bg', is_flag=True, help='Run archiving in background (queue work and return immediately)')
@click.argument('urls', nargs=-1, type=click.Path())

View File

@@ -42,6 +42,16 @@ from rich import print as rprint
from archivebox.cli.cli_utils import apply_filters
def build_archiveresult_request(snapshot_id: str, plugin: str, hook_name: str = '', status: str = 'queued') -> dict:
    """Build a JSONL request record describing an ArchiveResult to create.

    Args:
        snapshot_id: ID of the parent Snapshot (coerced to str for JSON output).
        plugin: Name of the plugin the result belongs to.
        hook_name: Optional specific hook within the plugin ('' = unspecified).
        status: Initial status for the requested record (defaults to 'queued').

    Returns:
        A plain dict suitable for emitting as a JSONL ArchiveResult request.
    """
    record: dict = {'type': 'ArchiveResult'}
    record['snapshot_id'] = str(snapshot_id)
    record['plugin'] = plugin
    record['hook_name'] = hook_name
    record['status'] = status
    return record
# =============================================================================
# CREATE
# =============================================================================
@@ -52,21 +62,21 @@ def create_archiveresults(
status: str = 'queued',
) -> int:
"""
Create ArchiveResults for Snapshots.
Create ArchiveResult request records for Snapshots.
Reads Snapshot records from stdin and creates ArchiveResult entries.
Reads Snapshot records from stdin and emits ArchiveResult request JSONL.
Pass-through: Non-Snapshot/ArchiveResult records are output unchanged.
If --plugin is specified, only creates results for that plugin.
Otherwise, creates results for all pending plugins.
If --plugin is specified, only emits requests for that plugin.
Otherwise, emits requests for all enabled snapshot hooks.
Exit codes:
0: Success
1: Failure
"""
from django.utils import timezone
from archivebox.config.configset import get_config
from archivebox.hooks import discover_hooks
from archivebox.misc.jsonl import read_stdin, write_record, TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
from archivebox.core.models import Snapshot, ArchiveResult
from archivebox.core.models import Snapshot
is_tty = sys.stdout.isatty()
@@ -135,33 +145,20 @@ def create_archiveresults(
created_count = 0
for snapshot in snapshots:
if plugin:
# Create for specific plugin only
result, created = ArchiveResult.objects.get_or_create(
snapshot=snapshot,
plugin=plugin,
defaults={
'status': status,
'retry_at': timezone.now(),
}
)
if not created and result.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED]:
# Reset for retry
result.status = status
result.retry_at = timezone.now()
result.save()
if not is_tty:
write_record(result.to_json())
write_record(build_archiveresult_request(snapshot.id, plugin, status=status))
created_count += 1
else:
# Create all pending plugins
snapshot.create_pending_archiveresults()
for result in snapshot.archiveresult_set.filter(status=ArchiveResult.StatusChoices.QUEUED):
config = get_config(crawl=snapshot.crawl, snapshot=snapshot)
hooks = discover_hooks('Snapshot', config=config)
for hook_path in hooks:
hook_name = hook_path.name
plugin_name = hook_path.parent.name
if not is_tty:
write_record(result.to_json())
write_record(build_archiveresult_request(snapshot.id, plugin_name, hook_name=hook_name, status=status))
created_count += 1
rprint(f'[green]Created/queued {created_count} archive results[/green]', file=sys.stderr)
rprint(f'[green]Created {created_count} archive result request records[/green]', file=sys.stderr)
return 0
@@ -205,6 +202,7 @@ def list_archiveresults(
'succeeded': 'green',
'failed': 'red',
'skipped': 'dim',
'noresults': 'dim',
'backoff': 'magenta',
}.get(result.status, 'dim')
rprint(f'[{status_color}]{result.status:10}[/{status_color}] {result.plugin:15} [dim]{result.id}[/dim] {result.snapshot.url[:40]}')
@@ -233,8 +231,6 @@ def update_archiveresults(
0: Success
1: No input or error
"""
from django.utils import timezone
from archivebox.misc.jsonl import read_stdin, write_record
from archivebox.core.models import ArchiveResult
@@ -257,7 +253,6 @@ def update_archiveresults(
# Apply updates from CLI flags
if status:
result.status = status
result.retry_at = timezone.now()
result.save()
updated_count += 1

View File

@@ -38,15 +38,16 @@ import rich_click as click
def process_archiveresult_by_id(archiveresult_id: str) -> int:
"""
Run extraction for a single ArchiveResult by ID (used by workers).
Re-run extraction for a single ArchiveResult by ID.
Triggers the ArchiveResult's state machine tick() to run the extractor
plugin, but only after claiming ownership via retry_at. This keeps direct
CLI execution aligned with the worker lifecycle and prevents duplicate hook
runs if another process already owns the same ArchiveResult.
ArchiveResults are projected status rows, not queued work items. Re-running
a single result means resetting that row and queueing its parent snapshot
through the shared crawl runner with the corresponding plugin selected.
"""
from rich import print as rprint
from django.utils import timezone
from archivebox.core.models import ArchiveResult
from archivebox.services.runner import run_crawl
try:
archiveresult = ArchiveResult.objects.get(id=archiveresult_id)
@@ -57,16 +58,27 @@ def process_archiveresult_by_id(archiveresult_id: str) -> int:
rprint(f'[blue]Extracting {archiveresult.plugin} for {archiveresult.snapshot.url}[/blue]', file=sys.stderr)
try:
# Claim-before-tick is the required calling pattern for direct
# state-machine drivers. If another worker already owns this row,
# report that and exit without running duplicate extractor side effects.
if not archiveresult.tick_claimed(lock_seconds=120):
print(f'[yellow]Extraction already claimed by another process: {archiveresult.plugin}[/yellow]')
return 0
archiveresult.reset_for_retry()
snapshot = archiveresult.snapshot
snapshot.status = snapshot.StatusChoices.QUEUED
snapshot.retry_at = timezone.now()
snapshot.save(update_fields=['status', 'retry_at', 'modified_at'])
crawl = snapshot.crawl
if crawl.status != crawl.StatusChoices.STARTED:
crawl.status = crawl.StatusChoices.QUEUED
crawl.retry_at = timezone.now()
crawl.save(update_fields=['status', 'retry_at', 'modified_at'])
run_crawl(str(crawl.id), snapshot_ids=[str(snapshot.id)], selected_plugins=[archiveresult.plugin])
archiveresult.refresh_from_db()
if archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED:
print(f'[green]Extraction succeeded: {archiveresult.output_str}[/green]')
return 0
elif archiveresult.status == ArchiveResult.StatusChoices.NORESULTS:
print(f'[dim]Extraction completed with no results: {archiveresult.output_str}[/dim]')
return 0
elif archiveresult.status == ArchiveResult.StatusChoices.FAILED:
print(f'[red]Extraction failed: {archiveresult.output_str}[/red]', file=sys.stderr)
return 1
@@ -121,8 +133,9 @@ def run_plugins(
rprint('[yellow]No snapshots provided. Pass snapshot IDs as arguments or via stdin.[/yellow]', file=sys.stderr)
return 1
# Gather snapshot IDs to process
# Gather snapshot IDs and optional plugin constraints to process
snapshot_ids = set()
requested_plugins_by_snapshot: dict[str, set[str]] = defaultdict(set)
for record in records:
record_type = record.get('type')
@@ -142,6 +155,9 @@ def run_plugins(
snapshot_id = record.get('snapshot_id')
if snapshot_id:
snapshot_ids.add(snapshot_id)
plugin_name = record.get('plugin')
if plugin_name and not plugins_list:
requested_plugins_by_snapshot[str(snapshot_id)].add(str(plugin_name))
elif 'id' in record:
# Assume it's a snapshot ID
@@ -160,26 +176,15 @@ def run_plugins(
rprint(f'[yellow]Snapshot {snapshot_id} not found[/yellow]', file=sys.stderr)
continue
# Create pending ArchiveResults if needed
if plugins_list:
# Only create for specific plugins
for plugin_name in plugins_list:
result, created = ArchiveResult.objects.get_or_create(
snapshot=snapshot,
plugin=plugin_name,
defaults={
'status': ArchiveResult.StatusChoices.QUEUED,
'retry_at': timezone.now(),
}
)
if not created and result.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED]:
# Reset for retry
result.status = ArchiveResult.StatusChoices.QUEUED
result.retry_at = timezone.now()
result.save()
else:
# Create all pending plugins
snapshot.create_pending_archiveresults()
for plugin_name in requested_plugins_by_snapshot.get(str(snapshot.id), set()):
existing_result = snapshot.archiveresult_set.filter(plugin=plugin_name).order_by('-created_at').first()
if existing_result and existing_result.status in [
ArchiveResult.StatusChoices.FAILED,
ArchiveResult.StatusChoices.SKIPPED,
ArchiveResult.StatusChoices.NORESULTS,
ArchiveResult.StatusChoices.BACKOFF,
]:
existing_result.reset_for_retry()
# Reset snapshot status to allow processing
if snapshot.status == Snapshot.StatusChoices.SEALED:
@@ -207,10 +212,15 @@ def run_plugins(
snapshot_ids_by_crawl[str(snapshot.crawl_id)].add(str(snapshot.id))
for crawl_id, crawl_snapshot_ids in snapshot_ids_by_crawl.items():
selected_plugins = plugins_list or sorted({
plugin
for snapshot_id in crawl_snapshot_ids
for plugin in requested_plugins_by_snapshot.get(str(snapshot_id), set())
}) or None
run_crawl(
crawl_id,
snapshot_ids=sorted(crawl_snapshot_ids),
selected_plugins=plugins_list or None,
selected_plugins=selected_plugins,
)
# Output results as JSONL (when piped) or human-readable (when TTY)

View File

@@ -18,9 +18,13 @@ from archivebox.cli.archivebox_snapshot import list_snapshots
@click.option('--tag', '-t', help='Filter by tag name')
@click.option('--crawl-id', help='Filter by crawl ID')
@click.option('--limit', '-n', type=int, help='Limit number of results')
@click.option('--sort', '-o', type=str, help='Field to sort by, e.g. url, created_at, bookmarked_at, downloaded_at')
@click.option('--csv', '-C', type=str, help='Print output as CSV with the provided fields, e.g.: timestamp,url,title')
@click.option('--with-headers', is_flag=True, help='Include column headers in structured output')
def main(status: Optional[str], url__icontains: Optional[str], url__istartswith: Optional[str],
tag: Optional[str], crawl_id: Optional[str], limit: Optional[int]) -> None:
"""List Snapshots as JSONL."""
tag: Optional[str], crawl_id: Optional[str], limit: Optional[int],
sort: Optional[str], csv: Optional[str], with_headers: bool) -> None:
"""List Snapshots."""
sys.exit(list_snapshots(
status=status,
url__icontains=url__icontains,
@@ -28,6 +32,9 @@ def main(status: Optional[str], url__icontains: Optional[str], url__istartswith:
tag=tag,
crawl_id=crawl_id,
limit=limit,
sort=sort,
csv=csv,
with_headers=with_headers,
))

View File

@@ -42,6 +42,7 @@ import rich_click as click
from rich import print as rprint
from archivebox.cli.cli_utils import apply_filters
from archivebox.personas import importers as persona_importers
# =============================================================================
@@ -440,8 +441,6 @@ def create_personas(
browser_binary = get_browser_binary(import_from)
if browser_binary:
rprint(f'[dim]Using {import_from} binary: {browser_binary}[/dim]', file=sys.stderr)
else:
browser_binary = None
created_count = 0
for name in name_list:
@@ -450,7 +449,7 @@ def create_personas(
continue
# Validate persona name to prevent path traversal
is_valid, error_msg = validate_persona_name(name)
is_valid, error_msg = persona_importers.validate_persona_name(name)
if not is_valid:
rprint(f'[red]Invalid persona name "{name}": {error_msg}[/red]', file=sys.stderr)
continue
@@ -468,49 +467,29 @@ def create_personas(
# Import browser profile if requested
if import_from in CHROMIUM_BROWSERS and source_profile_dir is not None:
persona_chrome_dir = Path(persona.CHROME_USER_DATA_DIR)
# Copy the browser profile
rprint(f'[dim]Copying browser profile to {persona_chrome_dir}...[/dim]', file=sys.stderr)
try:
# Remove existing chrome_user_data if it exists
if persona_chrome_dir.exists():
shutil.rmtree(persona_chrome_dir)
# Copy the profile directory
# We copy the entire user data dir, not just Default profile
shutil.copytree(
source_profile_dir,
persona_chrome_dir,
symlinks=True,
ignore=shutil.ignore_patterns(
'Cache', 'Code Cache', 'GPUCache', 'ShaderCache',
'Service Worker', 'GCM Store', '*.log', 'Crashpad',
'BrowserMetrics', 'BrowserMetrics-spare.pma',
'SingletonLock', 'SingletonSocket', 'SingletonCookie',
),
import_source = persona_importers.resolve_browser_import_source(import_from, profile_dir=profile)
import_result = persona_importers.import_persona_from_source(
persona,
import_source,
copy_profile=True,
import_cookies=True,
capture_storage=False,
)
rprint('[green]Copied browser profile to persona[/green]', file=sys.stderr)
# Extract cookies via CDP
rprint('[dim]Extracting cookies via CDP...[/dim]', file=sys.stderr)
if extract_cookies_via_cdp(
persona_chrome_dir,
cookies_file,
profile_dir=profile,
chrome_binary=browser_binary,
):
rprint(f'[green]Extracted cookies to {cookies_file}[/green]', file=sys.stderr)
else:
rprint('[yellow]Could not extract cookies automatically.[/yellow]', file=sys.stderr)
rprint('[dim]You can manually export cookies using a browser extension.[/dim]', file=sys.stderr)
except Exception as e:
rprint(f'[red]Failed to copy browser profile: {e}[/red]', file=sys.stderr)
rprint(f'[red]Failed to import browser profile: {e}[/red]', file=sys.stderr)
return 1
if import_result.profile_copied:
rprint('[green]Copied browser profile to persona[/green]', file=sys.stderr)
if import_result.cookies_imported:
rprint(f'[green]Extracted cookies to {cookies_file}[/green]', file=sys.stderr)
elif not import_result.profile_copied:
rprint('[yellow]Could not import cookies automatically.[/yellow]', file=sys.stderr)
for warning in import_result.warnings:
rprint(f'[yellow]{warning}[/yellow]', file=sys.stderr)
if not is_tty:
write_record({
'id': str(persona.id) if hasattr(persona, 'id') else None,
@@ -616,7 +595,7 @@ def update_personas(name: Optional[str] = None) -> int:
# Apply updates from CLI flags
if name:
# Validate new name to prevent path traversal
is_valid, error_msg = validate_persona_name(name)
is_valid, error_msg = persona_importers.validate_persona_name(name)
if not is_valid:
rprint(f'[red]Invalid new persona name "{name}": {error_msg}[/red]', file=sys.stderr)
continue

View File

@@ -89,56 +89,6 @@ SNAPSHOT_MACHINE_DIAGRAM = """
└─────────────────────────────────────────────────────────────────────────────┘
"""
ARCHIVERESULT_MACHINE_DIAGRAM = """
┌─────────────────────────────────────────────────────────────────────────────┐
│ ArchiveResultMachine │
├─────────────────────────────────────────────────────────────────────────────┤
│ │
│ ┌─────────────┐ │
│ │ QUEUED │◄─────────────────┐ │
│ │ (initial) │ │ │
│ └──┬───────┬──┘ │ │
│ │ │ │ tick() unless can_start() │
│ │ │ exceeded_max_ │ │
│ │ │ attempts │ │
│ │ ▼ │ │
│ │ ┌──────────┐ │ │
│ │ │ SKIPPED │ │ │
│ │ │ (final) │ │ │
│ │ └──────────┘ │ │
│ │ tick() when │ │
│ │ can_start() │ │
│ ▼ │ │
│ ┌─────────────┐ │ │
│ │ STARTED │──────────────────┘ │
│ │ │◄─────────────────────────────────────────────────┐ │
│ │ enter: │ │ │ │
│ │ result.run()│ tick() unless │ │ │
│ │ (execute │ is_finished() │ │ │
│ │ hook via │──────────────────────┘ │ │
│ │ run_hook())│ │ │
│ └──────┬──────┘ │ │
│ │ │ │
│ │ tick() checks status set by hook output │ │
│ ├─────────────┬─────────────┬─────────────┐ │ │
│ ▼ ▼ ▼ ▼ │ │
│ ┌───────────┐ ┌───────────┐ ┌───────────┐ ┌───────────┐ │ │
│ │ SUCCEEDED │ │ FAILED │ │ SKIPPED │ │ BACKOFF │ │ │
│ │ (final) │ │ (final) │ │ (final) │ │ │ │ │
│ └───────────┘ └───────────┘ └───────────┘ └──┬──────┬─┘ │ │
│ │ │ │ │
│ exceeded_max_ │ │ can_start()│ │
│ attempts │ │ loops back │ │
│ ▼ │ └────────────┘ │
│ ┌──────────┐ │ │
│ │ SKIPPED │◄─┘ │
│ │ (final) │ │
│ └──────────┘ │
│ │
│ Each ArchiveResult runs ONE specific hook (stored in .hook_name field) │
└─────────────────────────────────────────────────────────────────────────────┘
"""
BINARY_MACHINE_DIAGRAM = """
┌─────────────────────────────────────────────────────────────────────────────┐
│ BinaryMachine │
@@ -193,8 +143,8 @@ def pluginmap(
"""
Show a map of all state machines and their associated plugin hooks.
Displays ASCII art diagrams of the core model state machines (Crawl, Snapshot,
ArchiveResult, Binary) and lists all auto-detected on_Modelname_xyz hooks
Displays ASCII art diagrams of the core queued model state machines (Crawl,
Snapshot, Binary) and lists all auto-detected on_Modelname_xyz hooks
that will run for each model's transitions.
"""
from rich.console import Console
@@ -257,17 +207,6 @@ def pluginmap(
prnt(f'[dim]User plugins: {USER_PLUGINS_DIR}[/dim]')
prnt()
# Show diagrams first (unless quiet mode)
if not quiet:
# Show ArchiveResult diagram separately since it's different
prnt(Panel(
ARCHIVERESULT_MACHINE_DIAGRAM,
title='[bold green]ArchiveResultMachine[/bold green]',
border_style='green',
expand=False,
))
prnt()
for event_name, info in model_events.items():
# Discover hooks for this event
hooks = discover_hooks(event_name, filter_disabled=not show_disabled)

View File

@@ -145,17 +145,25 @@ def process_stdin_records() -> int:
try:
archiveresult = ArchiveResult.objects.get(id=record_id)
except ArchiveResult.DoesNotExist:
archiveresult = ArchiveResult.from_json(record)
archiveresult = None
else:
# New archiveresult - create it
archiveresult = ArchiveResult.from_json(record)
archiveresult = None
snapshot_id = record.get('snapshot_id')
plugin_name = record.get('plugin')
snapshot = None
if archiveresult:
archiveresult.retry_at = timezone.now()
if archiveresult.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED, ArchiveResult.StatusChoices.BACKOFF]:
archiveresult.status = ArchiveResult.StatusChoices.QUEUED
archiveresult.save()
if archiveresult.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED, ArchiveResult.StatusChoices.NORESULTS, ArchiveResult.StatusChoices.BACKOFF]:
archiveresult.reset_for_retry()
snapshot = archiveresult.snapshot
plugin_name = plugin_name or archiveresult.plugin
elif snapshot_id:
try:
snapshot = Snapshot.objects.get(id=snapshot_id)
except Snapshot.DoesNotExist:
snapshot = None
if snapshot:
snapshot.retry_at = timezone.now()
if snapshot.status != Snapshot.StatusChoices.STARTED:
snapshot.status = Snapshot.StatusChoices.QUEUED
@@ -167,9 +175,9 @@ def process_stdin_records() -> int:
crawl.save(update_fields=['status', 'retry_at', 'modified_at'])
crawl_id = str(snapshot.crawl_id)
snapshot_ids_by_crawl[crawl_id].add(str(snapshot.id))
if archiveresult.plugin:
plugin_names_by_crawl[crawl_id].add(archiveresult.plugin)
output_records.append(archiveresult.to_json())
if plugin_name:
plugin_names_by_crawl[crawl_id].add(str(plugin_name))
output_records.append(record if not archiveresult else archiveresult.to_json())
queued_count += 1
elif record_type == TYPE_BINARY:
@@ -234,9 +242,11 @@ def run_runner(daemon: bool = False) -> int:
"""
from django.utils import timezone
from archivebox.machine.models import Machine, Process
from archivebox.services.runner import run_pending_crawls
from archivebox.services.runner import recover_orphaned_crawls, recover_orphaned_snapshots, run_pending_crawls
Process.cleanup_stale_running()
recover_orphaned_snapshots()
recover_orphaned_crawls()
Machine.current()
current = Process.current()
if current.process_type != Process.TypeChoices.ORCHESTRATOR:
@@ -305,6 +315,13 @@ def main(daemon: bool, crawl_id: str, snapshot_id: str, binary_id: str):
traceback.print_exc()
sys.exit(1)
if daemon:
if not sys.stdin.isatty():
exit_code = process_stdin_records()
if exit_code != 0:
sys.exit(exit_code)
sys.exit(run_runner(daemon=True))
if not sys.stdin.isatty():
sys.exit(process_stdin_records())
else:

View File

@@ -3,9 +3,7 @@
__package__ = 'archivebox.cli'
from typing import Iterable
import os
import sys
import subprocess
import rich_click as click
from rich import print
@@ -14,6 +12,41 @@ from archivebox.misc.util import docstring, enforce_types
from archivebox.config.common import SERVER_CONFIG
def stop_existing_background_runner(*, machine, process_model, supervisor=None, stop_worker_fn=None, log=print) -> int:
    """Stop any existing orchestrator process so the server can take ownership.

    Args:
        machine: The current Machine record used to scope process lookups.
        process_model: The Process model class (duck-typed: needs
            ``cleanup_stale_running``, ``objects``, ``StatusChoices``, ``TypeChoices``).
        supervisor: Optional supervisord handle; only used together with stop_worker_fn.
        stop_worker_fn: Optional callable ``(supervisor, worker_name)`` to stop a worker.
        log: Logging callable (defaults to ``print``).

    Returns:
        The number of orchestrator processes that were found running (0 if none).
    """
    process_model.cleanup_stale_running(machine=machine)

    orchestrators = list(
        process_model.objects.filter(
            machine=machine,
            status=process_model.StatusChoices.RUNNING,
            process_type=process_model.TypeChoices.ORCHESTRATOR,
        ).order_by('created_at')
    )
    if not orchestrators:
        return 0

    log('[yellow][*] Stopping existing ArchiveBox background runner...[/yellow]')

    # Ask supervisord to stop its runner workers first, when a handle is available.
    # Failures here are best-effort: the direct process kill below is the fallback.
    if supervisor is not None and stop_worker_fn is not None:
        for worker_name in ('worker_runner', 'worker_runner_watch'):
            try:
                stop_worker_fn(supervisor, worker_name)
            except Exception:
                pass

    # Kill each orchestrator's process tree; if that fails, try a plain terminate.
    for runner in orchestrators:
        try:
            runner.kill_tree(graceful_timeout=2.0)
        except Exception:
            try:
                runner.terminate(graceful_timeout=2.0)
            except Exception:
                pass

    process_model.cleanup_stale_running(machine=machine)
    return len(orchestrators)
@enforce_types
def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
reload: bool=False,
@@ -39,25 +72,6 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
if debug or reload:
SHELL_CONFIG.DEBUG = True
if run_in_debug:
os.environ['ARCHIVEBOX_RUNSERVER'] = '1'
if reload:
os.environ['ARCHIVEBOX_AUTORELOAD'] = '1'
from archivebox.config.common import STORAGE_CONFIG
pidfile = str(STORAGE_CONFIG.TMP_DIR / 'runserver.pid')
os.environ['ARCHIVEBOX_RUNSERVER_PIDFILE'] = pidfile
from django.utils.autoreload import DJANGO_AUTORELOAD_ENV
is_reloader_child = os.environ.get(DJANGO_AUTORELOAD_ENV) == 'true'
if not is_reloader_child:
env = os.environ.copy()
subprocess.Popen(
[sys.executable, '-m', 'archivebox', 'manage', 'runner_watch', f'--pidfile={pidfile}'],
env=env,
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
)
from django.contrib.auth.models import User
if not User.objects.filter(is_superuser=True).exclude(username='system').exists():
@@ -81,73 +95,62 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
except IndexError:
pass
from archivebox.workers.supervisord_util import (
get_existing_supervisord_process,
get_worker,
stop_worker,
start_server_workers,
is_port_in_use,
)
from archivebox.machine.models import Machine, Process
# Check if port is already in use
if is_port_in_use(host, int(port)):
print(f'[red][X] Error: Port {port} is already in use[/red]')
print(f' Another process (possibly daphne or runserver) is already listening on {host}:{port}')
print(' Stop the conflicting process or choose a different port')
sys.exit(1)
machine = Machine.current()
stop_existing_background_runner(
machine=machine,
process_model=Process,
supervisor=get_existing_supervisord_process(),
stop_worker_fn=stop_worker,
)
supervisor = get_existing_supervisord_process()
if supervisor:
server_worker_name = 'worker_runserver' if run_in_debug else 'worker_daphne'
server_proc = get_worker(supervisor, server_worker_name)
server_state = server_proc.get('statename') if isinstance(server_proc, dict) else None
if server_state == 'RUNNING':
runner_proc = get_worker(supervisor, 'worker_runner')
runner_watch_proc = get_worker(supervisor, 'worker_runner_watch')
runner_state = runner_proc.get('statename') if isinstance(runner_proc, dict) else None
runner_watch_state = runner_watch_proc.get('statename') if isinstance(runner_watch_proc, dict) else None
print('[red][X] Error: ArchiveBox server is already running[/red]')
print(f' [green]√[/green] Web server ({server_worker_name}) is RUNNING on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
if runner_state == 'RUNNING':
print(' [green]√[/green] Background runner (worker_runner) is RUNNING')
if runner_watch_state == 'RUNNING':
print(' [green]√[/green] Reload watcher (worker_runner_watch) is RUNNING')
print()
print('[yellow]To stop the existing server, run:[/yellow]')
print(' pkill -f "archivebox server"')
print(' pkill -f supervisord')
sys.exit(1)
if run_in_debug:
from django.core.management import call_command
print('[green][+] Starting ArchiveBox webserver in DEBUG mode...[/green]')
print(f' [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
print(f' [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]')
print(' > Writing ArchiveBox error log to ./logs/errors.log')
if not reload:
runserver_args.append('--noreload') # '--insecure'
if nothreading:
runserver_args.append('--nothreading')
call_command("runserver", *runserver_args)
else:
from archivebox.workers.supervisord_util import (
get_existing_supervisord_process,
get_worker,
start_server_workers,
is_port_in_use,
)
from archivebox.machine.models import Machine, Process
# Check if port is already in use
if is_port_in_use(host, int(port)):
print(f'[red][X] Error: Port {port} is already in use[/red]')
print(f' Another process (possibly daphne) is already listening on {host}:{port}')
print(' Stop the conflicting process or choose a different port')
sys.exit(1)
# Check if the background crawl runner is already running for this data directory
if Process.objects.filter(
machine=Machine.current(),
status=Process.StatusChoices.RUNNING,
process_type=Process.TypeChoices.ORCHESTRATOR,
).exists():
print('[red][X] Error: ArchiveBox background runner is already running for this data directory[/red]')
print(' Stop the existing runner before starting a new server')
print(' To stop: pkill -f "archivebox run --daemon"')
sys.exit(1)
# Check if supervisord is already running
supervisor = get_existing_supervisord_process()
if supervisor:
daphne_proc = get_worker(supervisor, 'worker_daphne')
daphne_state = daphne_proc.get('statename') if isinstance(daphne_proc, dict) else None
# If daphne is already running, error out
if daphne_state == 'RUNNING':
runner_proc = get_worker(supervisor, 'worker_runner')
runner_state = runner_proc.get('statename') if isinstance(runner_proc, dict) else None
print('[red][X] Error: ArchiveBox server is already running[/red]')
print(f' [green]√[/green] Web server (worker_daphne) is RUNNING on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
if runner_state == 'RUNNING':
print(' [green]√[/green] Background runner (worker_runner) is RUNNING')
print()
print('[yellow]To stop the existing server, run:[/yellow]')
print(' pkill -f "archivebox server"')
print(' pkill -f supervisord')
sys.exit(1)
# Otherwise, daphne is not running - fall through to start it
# No existing workers found - start new ones
print('[green][+] Starting ArchiveBox webserver...[/green]')
print(f' [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
print(f' [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]')
print(' > Writing ArchiveBox error log to ./logs/errors.log')
print()
start_server_workers(host=host, port=port, daemonize=daemonize)
print("\n[i][green][🟩] ArchiveBox server shut down gracefully.[/green][/i]")
print(f' [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
print(f' [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]')
print(' > Writing ArchiveBox error log to ./logs/errors.log')
print()
start_server_workers(host=host, port=port, daemonize=daemonize, debug=run_in_debug, reload=reload, nothreading=nothreading)
print("\n[i][green][🟩] ArchiveBox server shut down gracefully.[/green][/i]")
@click.command()

View File

@@ -172,6 +172,9 @@ def list_snapshots(
tag: Optional[str] = None,
crawl_id: Optional[str] = None,
limit: Optional[int] = None,
sort: Optional[str] = None,
csv: Optional[str] = None,
with_headers: bool = False,
) -> int:
"""
List Snapshots as JSONL with optional filters.
@@ -182,7 +185,11 @@ def list_snapshots(
from archivebox.misc.jsonl import write_record
from archivebox.core.models import Snapshot
is_tty = sys.stdout.isatty()
if with_headers and not csv:
rprint('[red]--with-headers requires --csv[/red]', file=sys.stderr)
return 2
is_tty = sys.stdout.isatty() and not csv
queryset = Snapshot.objects.all().order_by('-created_at')
@@ -199,7 +206,29 @@ def list_snapshots(
if tag:
queryset = queryset.filter(tags__name__iexact=tag)
if sort:
queryset = queryset.order_by(sort)
count = 0
if csv:
cols = [col.strip() for col in csv.split(',') if col.strip()]
if not cols:
rprint('[red]No CSV columns provided[/red]', file=sys.stderr)
return 2
rows: list[str] = []
if with_headers:
rows.append(','.join(cols))
for snapshot in queryset.iterator(chunk_size=500):
rows.append(snapshot.to_csv(cols=cols, separator=','))
count += 1
output = '\n'.join(rows)
if output:
sys.stdout.write(output)
if not output.endswith('\n'):
sys.stdout.write('\n')
rprint(f'[dim]Listed {count} snapshots[/dim]', file=sys.stderr)
return 0
for snapshot in queryset:
if is_tty:
status_color = {