mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
WIP: checkpoint working tree before rebasing onto dev
This commit is contained in:
@@ -47,11 +47,13 @@ def _collect_input_urls(args: tuple[str, ...]) -> list[str]:
|
||||
def add(urls: str | list[str],
|
||||
depth: int | str=0,
|
||||
tag: str='',
|
||||
url_allowlist: str='',
|
||||
url_denylist: str='',
|
||||
parser: str="auto",
|
||||
plugins: str="",
|
||||
persona: str='Default',
|
||||
overwrite: bool=False,
|
||||
update: bool=not ARCHIVING_CONFIG.ONLY_NEW,
|
||||
update: bool | None=None,
|
||||
index_only: bool=False,
|
||||
bg: bool=False,
|
||||
created_by_id: int | None=None) -> tuple['Crawl', QuerySet['Snapshot']]:
|
||||
@@ -85,6 +87,8 @@ def add(urls: str | list[str],
|
||||
|
||||
created_by_id = created_by_id or get_or_create_system_user_pk()
|
||||
started_at = timezone.now()
|
||||
if update is None:
|
||||
update = not ARCHIVING_CONFIG.ONLY_NEW
|
||||
|
||||
# 1. Save the provided URLs to sources/2024-11-05__23-59-59__cli_add.txt
|
||||
sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__cli_add.txt'
|
||||
@@ -120,6 +124,8 @@ def add(urls: str | list[str],
|
||||
'PLUGINS': plugins,
|
||||
'DEFAULT_PERSONA': persona_name,
|
||||
'PARSER': parser,
|
||||
**({'URL_ALLOWLIST': url_allowlist} if url_allowlist else {}),
|
||||
**({'URL_DENYLIST': url_denylist} if url_denylist else {}),
|
||||
}
|
||||
)
|
||||
|
||||
@@ -150,6 +156,9 @@ def add(urls: str | list[str],
|
||||
snapshot.ensure_crawl_symlink()
|
||||
return crawl, crawl.snapshot_set.all()
|
||||
|
||||
if bg:
|
||||
crawl.create_snapshots_from_urls()
|
||||
|
||||
# 5. Start the crawl runner to process the queue
|
||||
# The runner will:
|
||||
# - Process Crawl -> create Snapshots from all URLs
|
||||
@@ -192,8 +201,7 @@ def add(urls: str | list[str],
|
||||
except Exception:
|
||||
rel_output_str = str(crawl.output_dir)
|
||||
|
||||
# Build admin URL from SERVER_CONFIG
|
||||
bind_addr = SERVER_CONFIG.BIND_ADDR
|
||||
bind_addr = SERVER_CONFIG.BIND_ADDR or '127.0.0.1:8000'
|
||||
if bind_addr.startswith('http://') or bind_addr.startswith('https://'):
|
||||
base_url = bind_addr
|
||||
else:
|
||||
@@ -218,11 +226,13 @@ def add(urls: str | list[str],
|
||||
@click.command()
|
||||
@click.option('--depth', '-d', type=click.Choice([str(i) for i in range(5)]), default='0', help='Recursively archive linked pages up to N hops away')
|
||||
@click.option('--tag', '-t', default='', help='Comma-separated list of tags to add to each snapshot e.g. tag1,tag2,tag3')
|
||||
@click.option('--url-allowlist', '--domain-allowlist', default='', help='Comma-separated URL/domain allowlist for this crawl')
|
||||
@click.option('--url-denylist', '--domain-denylist', default='', help='Comma-separated URL/domain denylist for this crawl')
|
||||
@click.option('--parser', default='auto', help='Parser for reading input URLs (auto, txt, html, rss, json, jsonl, netscape, ...)')
|
||||
@click.option('--plugins', '-p', default='', help='Comma-separated list of plugins to run e.g. title,favicon,screenshot,singlefile,...')
|
||||
@click.option('--persona', default='Default', help='Authentication profile to use when archiving')
|
||||
@click.option('--overwrite', '-F', is_flag=True, help='Overwrite existing data if URLs have been archived previously')
|
||||
@click.option('--update', is_flag=True, default=ARCHIVING_CONFIG.ONLY_NEW, help='Retry any previously skipped/failed URLs when re-adding them')
|
||||
@click.option('--update', is_flag=True, default=None, help='Retry any previously skipped/failed URLs when re-adding them')
|
||||
@click.option('--index-only', is_flag=True, help='Just add the URLs to the index without archiving them now')
|
||||
@click.option('--bg', is_flag=True, help='Run archiving in background (queue work and return immediately)')
|
||||
@click.argument('urls', nargs=-1, type=click.Path())
|
||||
|
||||
@@ -42,6 +42,16 @@ from rich import print as rprint
|
||||
from archivebox.cli.cli_utils import apply_filters
|
||||
|
||||
|
||||
def build_archiveresult_request(snapshot_id: str, plugin: str, hook_name: str = '', status: str = 'queued') -> dict:
    """Build a plain-dict ArchiveResult request record.

    Args:
        snapshot_id: ID of the snapshot the request refers to; coerced with
            ``str()`` so non-string IDs are accepted.
        plugin: Name of the plugin the request is for.
        hook_name: Name of the specific hook, or ``''`` when not specified.
        status: Status value to put on the record (defaults to ``'queued'``).

    Returns:
        A dict with the keys ``type`` (always ``'ArchiveResult'``),
        ``snapshot_id``, ``plugin``, ``hook_name`` and ``status``.
    """
    request_record = {'type': 'ArchiveResult'}
    request_record['snapshot_id'] = str(snapshot_id)
    request_record['plugin'] = plugin
    request_record['hook_name'] = hook_name
    request_record['status'] = status
    return request_record
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# CREATE
|
||||
# =============================================================================
|
||||
@@ -52,21 +62,21 @@ def create_archiveresults(
|
||||
status: str = 'queued',
|
||||
) -> int:
|
||||
"""
|
||||
Create ArchiveResults for Snapshots.
|
||||
Create ArchiveResult request records for Snapshots.
|
||||
|
||||
Reads Snapshot records from stdin and creates ArchiveResult entries.
|
||||
Reads Snapshot records from stdin and emits ArchiveResult request JSONL.
|
||||
Pass-through: Non-Snapshot/ArchiveResult records are output unchanged.
|
||||
If --plugin is specified, only creates results for that plugin.
|
||||
Otherwise, creates results for all pending plugins.
|
||||
If --plugin is specified, only emits requests for that plugin.
|
||||
Otherwise, emits requests for all enabled snapshot hooks.
|
||||
|
||||
Exit codes:
|
||||
0: Success
|
||||
1: Failure
|
||||
"""
|
||||
from django.utils import timezone
|
||||
|
||||
from archivebox.config.configset import get_config
|
||||
from archivebox.hooks import discover_hooks
|
||||
from archivebox.misc.jsonl import read_stdin, write_record, TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
|
||||
from archivebox.core.models import Snapshot, ArchiveResult
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
is_tty = sys.stdout.isatty()
|
||||
|
||||
@@ -135,33 +145,20 @@ def create_archiveresults(
|
||||
created_count = 0
|
||||
for snapshot in snapshots:
|
||||
if plugin:
|
||||
# Create for specific plugin only
|
||||
result, created = ArchiveResult.objects.get_or_create(
|
||||
snapshot=snapshot,
|
||||
plugin=plugin,
|
||||
defaults={
|
||||
'status': status,
|
||||
'retry_at': timezone.now(),
|
||||
}
|
||||
)
|
||||
if not created and result.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED]:
|
||||
# Reset for retry
|
||||
result.status = status
|
||||
result.retry_at = timezone.now()
|
||||
result.save()
|
||||
|
||||
if not is_tty:
|
||||
write_record(result.to_json())
|
||||
write_record(build_archiveresult_request(snapshot.id, plugin, status=status))
|
||||
created_count += 1
|
||||
else:
|
||||
# Create all pending plugins
|
||||
snapshot.create_pending_archiveresults()
|
||||
for result in snapshot.archiveresult_set.filter(status=ArchiveResult.StatusChoices.QUEUED):
|
||||
config = get_config(crawl=snapshot.crawl, snapshot=snapshot)
|
||||
hooks = discover_hooks('Snapshot', config=config)
|
||||
for hook_path in hooks:
|
||||
hook_name = hook_path.name
|
||||
plugin_name = hook_path.parent.name
|
||||
if not is_tty:
|
||||
write_record(result.to_json())
|
||||
write_record(build_archiveresult_request(snapshot.id, plugin_name, hook_name=hook_name, status=status))
|
||||
created_count += 1
|
||||
|
||||
rprint(f'[green]Created/queued {created_count} archive results[/green]', file=sys.stderr)
|
||||
rprint(f'[green]Created {created_count} archive result request records[/green]', file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -205,6 +202,7 @@ def list_archiveresults(
|
||||
'succeeded': 'green',
|
||||
'failed': 'red',
|
||||
'skipped': 'dim',
|
||||
'noresults': 'dim',
|
||||
'backoff': 'magenta',
|
||||
}.get(result.status, 'dim')
|
||||
rprint(f'[{status_color}]{result.status:10}[/{status_color}] {result.plugin:15} [dim]{result.id}[/dim] {result.snapshot.url[:40]}')
|
||||
@@ -233,8 +231,6 @@ def update_archiveresults(
|
||||
0: Success
|
||||
1: No input or error
|
||||
"""
|
||||
from django.utils import timezone
|
||||
|
||||
from archivebox.misc.jsonl import read_stdin, write_record
|
||||
from archivebox.core.models import ArchiveResult
|
||||
|
||||
@@ -257,7 +253,6 @@ def update_archiveresults(
|
||||
# Apply updates from CLI flags
|
||||
if status:
|
||||
result.status = status
|
||||
result.retry_at = timezone.now()
|
||||
|
||||
result.save()
|
||||
updated_count += 1
|
||||
|
||||
@@ -38,15 +38,16 @@ import rich_click as click
|
||||
|
||||
def process_archiveresult_by_id(archiveresult_id: str) -> int:
|
||||
"""
|
||||
Run extraction for a single ArchiveResult by ID (used by workers).
|
||||
Re-run extraction for a single ArchiveResult by ID.
|
||||
|
||||
Triggers the ArchiveResult's state machine tick() to run the extractor
|
||||
plugin, but only after claiming ownership via retry_at. This keeps direct
|
||||
CLI execution aligned with the worker lifecycle and prevents duplicate hook
|
||||
runs if another process already owns the same ArchiveResult.
|
||||
ArchiveResults are projected status rows, not queued work items. Re-running
|
||||
a single result means resetting that row and queueing its parent snapshot
|
||||
through the shared crawl runner with the corresponding plugin selected.
|
||||
"""
|
||||
from rich import print as rprint
|
||||
from django.utils import timezone
|
||||
from archivebox.core.models import ArchiveResult
|
||||
from archivebox.services.runner import run_crawl
|
||||
|
||||
try:
|
||||
archiveresult = ArchiveResult.objects.get(id=archiveresult_id)
|
||||
@@ -57,16 +58,27 @@ def process_archiveresult_by_id(archiveresult_id: str) -> int:
|
||||
rprint(f'[blue]Extracting {archiveresult.plugin} for {archiveresult.snapshot.url}[/blue]', file=sys.stderr)
|
||||
|
||||
try:
|
||||
# Claim-before-tick is the required calling pattern for direct
|
||||
# state-machine drivers. If another worker already owns this row,
|
||||
# report that and exit without running duplicate extractor side effects.
|
||||
if not archiveresult.tick_claimed(lock_seconds=120):
|
||||
print(f'[yellow]Extraction already claimed by another process: {archiveresult.plugin}[/yellow]')
|
||||
return 0
|
||||
archiveresult.reset_for_retry()
|
||||
snapshot = archiveresult.snapshot
|
||||
snapshot.status = snapshot.StatusChoices.QUEUED
|
||||
snapshot.retry_at = timezone.now()
|
||||
snapshot.save(update_fields=['status', 'retry_at', 'modified_at'])
|
||||
|
||||
crawl = snapshot.crawl
|
||||
if crawl.status != crawl.StatusChoices.STARTED:
|
||||
crawl.status = crawl.StatusChoices.QUEUED
|
||||
crawl.retry_at = timezone.now()
|
||||
crawl.save(update_fields=['status', 'retry_at', 'modified_at'])
|
||||
|
||||
run_crawl(str(crawl.id), snapshot_ids=[str(snapshot.id)], selected_plugins=[archiveresult.plugin])
|
||||
archiveresult.refresh_from_db()
|
||||
|
||||
if archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED:
|
||||
print(f'[green]Extraction succeeded: {archiveresult.output_str}[/green]')
|
||||
return 0
|
||||
elif archiveresult.status == ArchiveResult.StatusChoices.NORESULTS:
|
||||
print(f'[dim]Extraction completed with no results: {archiveresult.output_str}[/dim]')
|
||||
return 0
|
||||
elif archiveresult.status == ArchiveResult.StatusChoices.FAILED:
|
||||
print(f'[red]Extraction failed: {archiveresult.output_str}[/red]', file=sys.stderr)
|
||||
return 1
|
||||
@@ -121,8 +133,9 @@ def run_plugins(
|
||||
rprint('[yellow]No snapshots provided. Pass snapshot IDs as arguments or via stdin.[/yellow]', file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Gather snapshot IDs to process
|
||||
# Gather snapshot IDs and optional plugin constraints to process
|
||||
snapshot_ids = set()
|
||||
requested_plugins_by_snapshot: dict[str, set[str]] = defaultdict(set)
|
||||
for record in records:
|
||||
record_type = record.get('type')
|
||||
|
||||
@@ -142,6 +155,9 @@ def run_plugins(
|
||||
snapshot_id = record.get('snapshot_id')
|
||||
if snapshot_id:
|
||||
snapshot_ids.add(snapshot_id)
|
||||
plugin_name = record.get('plugin')
|
||||
if plugin_name and not plugins_list:
|
||||
requested_plugins_by_snapshot[str(snapshot_id)].add(str(plugin_name))
|
||||
|
||||
elif 'id' in record:
|
||||
# Assume it's a snapshot ID
|
||||
@@ -160,26 +176,15 @@ def run_plugins(
|
||||
rprint(f'[yellow]Snapshot {snapshot_id} not found[/yellow]', file=sys.stderr)
|
||||
continue
|
||||
|
||||
# Create pending ArchiveResults if needed
|
||||
if plugins_list:
|
||||
# Only create for specific plugins
|
||||
for plugin_name in plugins_list:
|
||||
result, created = ArchiveResult.objects.get_or_create(
|
||||
snapshot=snapshot,
|
||||
plugin=plugin_name,
|
||||
defaults={
|
||||
'status': ArchiveResult.StatusChoices.QUEUED,
|
||||
'retry_at': timezone.now(),
|
||||
}
|
||||
)
|
||||
if not created and result.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED]:
|
||||
# Reset for retry
|
||||
result.status = ArchiveResult.StatusChoices.QUEUED
|
||||
result.retry_at = timezone.now()
|
||||
result.save()
|
||||
else:
|
||||
# Create all pending plugins
|
||||
snapshot.create_pending_archiveresults()
|
||||
for plugin_name in requested_plugins_by_snapshot.get(str(snapshot.id), set()):
|
||||
existing_result = snapshot.archiveresult_set.filter(plugin=plugin_name).order_by('-created_at').first()
|
||||
if existing_result and existing_result.status in [
|
||||
ArchiveResult.StatusChoices.FAILED,
|
||||
ArchiveResult.StatusChoices.SKIPPED,
|
||||
ArchiveResult.StatusChoices.NORESULTS,
|
||||
ArchiveResult.StatusChoices.BACKOFF,
|
||||
]:
|
||||
existing_result.reset_for_retry()
|
||||
|
||||
# Reset snapshot status to allow processing
|
||||
if snapshot.status == Snapshot.StatusChoices.SEALED:
|
||||
@@ -207,10 +212,15 @@ def run_plugins(
|
||||
snapshot_ids_by_crawl[str(snapshot.crawl_id)].add(str(snapshot.id))
|
||||
|
||||
for crawl_id, crawl_snapshot_ids in snapshot_ids_by_crawl.items():
|
||||
selected_plugins = plugins_list or sorted({
|
||||
plugin
|
||||
for snapshot_id in crawl_snapshot_ids
|
||||
for plugin in requested_plugins_by_snapshot.get(str(snapshot_id), set())
|
||||
}) or None
|
||||
run_crawl(
|
||||
crawl_id,
|
||||
snapshot_ids=sorted(crawl_snapshot_ids),
|
||||
selected_plugins=plugins_list or None,
|
||||
selected_plugins=selected_plugins,
|
||||
)
|
||||
|
||||
# Output results as JSONL (when piped) or human-readable (when TTY)
|
||||
|
||||
@@ -18,9 +18,13 @@ from archivebox.cli.archivebox_snapshot import list_snapshots
|
||||
@click.option('--tag', '-t', help='Filter by tag name')
|
||||
@click.option('--crawl-id', help='Filter by crawl ID')
|
||||
@click.option('--limit', '-n', type=int, help='Limit number of results')
|
||||
@click.option('--sort', '-o', type=str, help='Field to sort by, e.g. url, created_at, bookmarked_at, downloaded_at')
|
||||
@click.option('--csv', '-C', type=str, help='Print output as CSV with the provided fields, e.g.: timestamp,url,title')
|
||||
@click.option('--with-headers', is_flag=True, help='Include column headers in structured output')
|
||||
def main(status: Optional[str], url__icontains: Optional[str], url__istartswith: Optional[str],
|
||||
tag: Optional[str], crawl_id: Optional[str], limit: Optional[int]) -> None:
|
||||
"""List Snapshots as JSONL."""
|
||||
tag: Optional[str], crawl_id: Optional[str], limit: Optional[int],
|
||||
sort: Optional[str], csv: Optional[str], with_headers: bool) -> None:
|
||||
"""List Snapshots."""
|
||||
sys.exit(list_snapshots(
|
||||
status=status,
|
||||
url__icontains=url__icontains,
|
||||
@@ -28,6 +32,9 @@ def main(status: Optional[str], url__icontains: Optional[str], url__istartswith:
|
||||
tag=tag,
|
||||
crawl_id=crawl_id,
|
||||
limit=limit,
|
||||
sort=sort,
|
||||
csv=csv,
|
||||
with_headers=with_headers,
|
||||
))
|
||||
|
||||
|
||||
|
||||
@@ -42,6 +42,7 @@ import rich_click as click
|
||||
from rich import print as rprint
|
||||
|
||||
from archivebox.cli.cli_utils import apply_filters
|
||||
from archivebox.personas import importers as persona_importers
|
||||
|
||||
|
||||
# =============================================================================
|
||||
@@ -440,8 +441,6 @@ def create_personas(
|
||||
browser_binary = get_browser_binary(import_from)
|
||||
if browser_binary:
|
||||
rprint(f'[dim]Using {import_from} binary: {browser_binary}[/dim]', file=sys.stderr)
|
||||
else:
|
||||
browser_binary = None
|
||||
|
||||
created_count = 0
|
||||
for name in name_list:
|
||||
@@ -450,7 +449,7 @@ def create_personas(
|
||||
continue
|
||||
|
||||
# Validate persona name to prevent path traversal
|
||||
is_valid, error_msg = validate_persona_name(name)
|
||||
is_valid, error_msg = persona_importers.validate_persona_name(name)
|
||||
if not is_valid:
|
||||
rprint(f'[red]Invalid persona name "{name}": {error_msg}[/red]', file=sys.stderr)
|
||||
continue
|
||||
@@ -468,49 +467,29 @@ def create_personas(
|
||||
|
||||
# Import browser profile if requested
|
||||
if import_from in CHROMIUM_BROWSERS and source_profile_dir is not None:
|
||||
persona_chrome_dir = Path(persona.CHROME_USER_DATA_DIR)
|
||||
|
||||
# Copy the browser profile
|
||||
rprint(f'[dim]Copying browser profile to {persona_chrome_dir}...[/dim]', file=sys.stderr)
|
||||
|
||||
try:
|
||||
# Remove existing chrome_user_data if it exists
|
||||
if persona_chrome_dir.exists():
|
||||
shutil.rmtree(persona_chrome_dir)
|
||||
|
||||
# Copy the profile directory
|
||||
# We copy the entire user data dir, not just Default profile
|
||||
shutil.copytree(
|
||||
source_profile_dir,
|
||||
persona_chrome_dir,
|
||||
symlinks=True,
|
||||
ignore=shutil.ignore_patterns(
|
||||
'Cache', 'Code Cache', 'GPUCache', 'ShaderCache',
|
||||
'Service Worker', 'GCM Store', '*.log', 'Crashpad',
|
||||
'BrowserMetrics', 'BrowserMetrics-spare.pma',
|
||||
'SingletonLock', 'SingletonSocket', 'SingletonCookie',
|
||||
),
|
||||
import_source = persona_importers.resolve_browser_import_source(import_from, profile_dir=profile)
|
||||
import_result = persona_importers.import_persona_from_source(
|
||||
persona,
|
||||
import_source,
|
||||
copy_profile=True,
|
||||
import_cookies=True,
|
||||
capture_storage=False,
|
||||
)
|
||||
rprint('[green]Copied browser profile to persona[/green]', file=sys.stderr)
|
||||
|
||||
# Extract cookies via CDP
|
||||
rprint('[dim]Extracting cookies via CDP...[/dim]', file=sys.stderr)
|
||||
|
||||
if extract_cookies_via_cdp(
|
||||
persona_chrome_dir,
|
||||
cookies_file,
|
||||
profile_dir=profile,
|
||||
chrome_binary=browser_binary,
|
||||
):
|
||||
rprint(f'[green]Extracted cookies to {cookies_file}[/green]', file=sys.stderr)
|
||||
else:
|
||||
rprint('[yellow]Could not extract cookies automatically.[/yellow]', file=sys.stderr)
|
||||
rprint('[dim]You can manually export cookies using a browser extension.[/dim]', file=sys.stderr)
|
||||
|
||||
except Exception as e:
|
||||
rprint(f'[red]Failed to copy browser profile: {e}[/red]', file=sys.stderr)
|
||||
rprint(f'[red]Failed to import browser profile: {e}[/red]', file=sys.stderr)
|
||||
return 1
|
||||
|
||||
if import_result.profile_copied:
|
||||
rprint('[green]Copied browser profile to persona[/green]', file=sys.stderr)
|
||||
if import_result.cookies_imported:
|
||||
rprint(f'[green]Extracted cookies to {cookies_file}[/green]', file=sys.stderr)
|
||||
elif not import_result.profile_copied:
|
||||
rprint('[yellow]Could not import cookies automatically.[/yellow]', file=sys.stderr)
|
||||
|
||||
for warning in import_result.warnings:
|
||||
rprint(f'[yellow]{warning}[/yellow]', file=sys.stderr)
|
||||
|
||||
if not is_tty:
|
||||
write_record({
|
||||
'id': str(persona.id) if hasattr(persona, 'id') else None,
|
||||
@@ -616,7 +595,7 @@ def update_personas(name: Optional[str] = None) -> int:
|
||||
# Apply updates from CLI flags
|
||||
if name:
|
||||
# Validate new name to prevent path traversal
|
||||
is_valid, error_msg = validate_persona_name(name)
|
||||
is_valid, error_msg = persona_importers.validate_persona_name(name)
|
||||
if not is_valid:
|
||||
rprint(f'[red]Invalid new persona name "{name}": {error_msg}[/red]', file=sys.stderr)
|
||||
continue
|
||||
|
||||
@@ -89,56 +89,6 @@ SNAPSHOT_MACHINE_DIAGRAM = """
|
||||
└─────────────────────────────────────────────────────────────────────────────┘
|
||||
"""
|
||||
|
||||
ARCHIVERESULT_MACHINE_DIAGRAM = """
|
||||
┌─────────────────────────────────────────────────────────────────────────────┐
|
||||
│ ArchiveResultMachine │
|
||||
├─────────────────────────────────────────────────────────────────────────────┤
|
||||
│ │
|
||||
│ ┌─────────────┐ │
|
||||
│ │ QUEUED │◄─────────────────┐ │
|
||||
│ │ (initial) │ │ │
|
||||
│ └──┬───────┬──┘ │ │
|
||||
│ │ │ │ tick() unless can_start() │
|
||||
│ │ │ exceeded_max_ │ │
|
||||
│ │ │ attempts │ │
|
||||
│ │ ▼ │ │
|
||||
│ │ ┌──────────┐ │ │
|
||||
│ │ │ SKIPPED │ │ │
|
||||
│ │ │ (final) │ │ │
|
||||
│ │ └──────────┘ │ │
|
||||
│ │ tick() when │ │
|
||||
│ │ can_start() │ │
|
||||
│ ▼ │ │
|
||||
│ ┌─────────────┐ │ │
|
||||
│ │ STARTED │──────────────────┘ │
|
||||
│ │ │◄─────────────────────────────────────────────────┐ │
|
||||
│ │ enter: │ │ │ │
|
||||
│ │ result.run()│ tick() unless │ │ │
|
||||
│ │ (execute │ is_finished() │ │ │
|
||||
│ │ hook via │──────────────────────┘ │ │
|
||||
│ │ run_hook())│ │ │
|
||||
│ └──────┬──────┘ │ │
|
||||
│ │ │ │
|
||||
│ │ tick() checks status set by hook output │ │
|
||||
│ ├─────────────┬─────────────┬─────────────┐ │ │
|
||||
│ ▼ ▼ ▼ ▼ │ │
|
||||
│ ┌───────────┐ ┌───────────┐ ┌───────────┐ ┌───────────┐ │ │
|
||||
│ │ SUCCEEDED │ │ FAILED │ │ SKIPPED │ │ BACKOFF │ │ │
|
||||
│ │ (final) │ │ (final) │ │ (final) │ │ │ │ │
|
||||
│ └───────────┘ └───────────┘ └───────────┘ └──┬──────┬─┘ │ │
|
||||
│ │ │ │ │
|
||||
│ exceeded_max_ │ │ can_start()│ │
|
||||
│ attempts │ │ loops back │ │
|
||||
│ ▼ │ └────────────┘ │
|
||||
│ ┌──────────┐ │ │
|
||||
│ │ SKIPPED │◄─┘ │
|
||||
│ │ (final) │ │
|
||||
│ └──────────┘ │
|
||||
│ │
|
||||
│ Each ArchiveResult runs ONE specific hook (stored in .hook_name field) │
|
||||
└─────────────────────────────────────────────────────────────────────────────┘
|
||||
"""
|
||||
|
||||
BINARY_MACHINE_DIAGRAM = """
|
||||
┌─────────────────────────────────────────────────────────────────────────────┐
|
||||
│ BinaryMachine │
|
||||
@@ -193,8 +143,8 @@ def pluginmap(
|
||||
"""
|
||||
Show a map of all state machines and their associated plugin hooks.
|
||||
|
||||
Displays ASCII art diagrams of the core model state machines (Crawl, Snapshot,
|
||||
ArchiveResult, Binary) and lists all auto-detected on_Modelname_xyz hooks
|
||||
Displays ASCII art diagrams of the core queued model state machines (Crawl,
|
||||
Snapshot, Binary) and lists all auto-detected on_Modelname_xyz hooks
|
||||
that will run for each model's transitions.
|
||||
"""
|
||||
from rich.console import Console
|
||||
@@ -257,17 +207,6 @@ def pluginmap(
|
||||
prnt(f'[dim]User plugins: {USER_PLUGINS_DIR}[/dim]')
|
||||
prnt()
|
||||
|
||||
# Show diagrams first (unless quiet mode)
|
||||
if not quiet:
|
||||
# Show ArchiveResult diagram separately since it's different
|
||||
prnt(Panel(
|
||||
ARCHIVERESULT_MACHINE_DIAGRAM,
|
||||
title='[bold green]ArchiveResultMachine[/bold green]',
|
||||
border_style='green',
|
||||
expand=False,
|
||||
))
|
||||
prnt()
|
||||
|
||||
for event_name, info in model_events.items():
|
||||
# Discover hooks for this event
|
||||
hooks = discover_hooks(event_name, filter_disabled=not show_disabled)
|
||||
|
||||
@@ -145,17 +145,25 @@ def process_stdin_records() -> int:
|
||||
try:
|
||||
archiveresult = ArchiveResult.objects.get(id=record_id)
|
||||
except ArchiveResult.DoesNotExist:
|
||||
archiveresult = ArchiveResult.from_json(record)
|
||||
archiveresult = None
|
||||
else:
|
||||
# New archiveresult - create it
|
||||
archiveresult = ArchiveResult.from_json(record)
|
||||
archiveresult = None
|
||||
|
||||
snapshot_id = record.get('snapshot_id')
|
||||
plugin_name = record.get('plugin')
|
||||
snapshot = None
|
||||
if archiveresult:
|
||||
archiveresult.retry_at = timezone.now()
|
||||
if archiveresult.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED, ArchiveResult.StatusChoices.BACKOFF]:
|
||||
archiveresult.status = ArchiveResult.StatusChoices.QUEUED
|
||||
archiveresult.save()
|
||||
if archiveresult.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED, ArchiveResult.StatusChoices.NORESULTS, ArchiveResult.StatusChoices.BACKOFF]:
|
||||
archiveresult.reset_for_retry()
|
||||
snapshot = archiveresult.snapshot
|
||||
plugin_name = plugin_name or archiveresult.plugin
|
||||
elif snapshot_id:
|
||||
try:
|
||||
snapshot = Snapshot.objects.get(id=snapshot_id)
|
||||
except Snapshot.DoesNotExist:
|
||||
snapshot = None
|
||||
|
||||
if snapshot:
|
||||
snapshot.retry_at = timezone.now()
|
||||
if snapshot.status != Snapshot.StatusChoices.STARTED:
|
||||
snapshot.status = Snapshot.StatusChoices.QUEUED
|
||||
@@ -167,9 +175,9 @@ def process_stdin_records() -> int:
|
||||
crawl.save(update_fields=['status', 'retry_at', 'modified_at'])
|
||||
crawl_id = str(snapshot.crawl_id)
|
||||
snapshot_ids_by_crawl[crawl_id].add(str(snapshot.id))
|
||||
if archiveresult.plugin:
|
||||
plugin_names_by_crawl[crawl_id].add(archiveresult.plugin)
|
||||
output_records.append(archiveresult.to_json())
|
||||
if plugin_name:
|
||||
plugin_names_by_crawl[crawl_id].add(str(plugin_name))
|
||||
output_records.append(record if not archiveresult else archiveresult.to_json())
|
||||
queued_count += 1
|
||||
|
||||
elif record_type == TYPE_BINARY:
|
||||
@@ -234,9 +242,11 @@ def run_runner(daemon: bool = False) -> int:
|
||||
"""
|
||||
from django.utils import timezone
|
||||
from archivebox.machine.models import Machine, Process
|
||||
from archivebox.services.runner import run_pending_crawls
|
||||
from archivebox.services.runner import recover_orphaned_crawls, recover_orphaned_snapshots, run_pending_crawls
|
||||
|
||||
Process.cleanup_stale_running()
|
||||
recover_orphaned_snapshots()
|
||||
recover_orphaned_crawls()
|
||||
Machine.current()
|
||||
current = Process.current()
|
||||
if current.process_type != Process.TypeChoices.ORCHESTRATOR:
|
||||
@@ -305,6 +315,13 @@ def main(daemon: bool, crawl_id: str, snapshot_id: str, binary_id: str):
|
||||
traceback.print_exc()
|
||||
sys.exit(1)
|
||||
|
||||
if daemon:
|
||||
if not sys.stdin.isatty():
|
||||
exit_code = process_stdin_records()
|
||||
if exit_code != 0:
|
||||
sys.exit(exit_code)
|
||||
sys.exit(run_runner(daemon=True))
|
||||
|
||||
if not sys.stdin.isatty():
|
||||
sys.exit(process_stdin_records())
|
||||
else:
|
||||
|
||||
@@ -3,9 +3,7 @@
|
||||
__package__ = 'archivebox.cli'
|
||||
|
||||
from typing import Iterable
|
||||
import os
|
||||
import sys
|
||||
import subprocess
|
||||
|
||||
import rich_click as click
|
||||
from rich import print
|
||||
@@ -14,6 +12,41 @@ from archivebox.misc.util import docstring, enforce_types
|
||||
from archivebox.config.common import SERVER_CONFIG
|
||||
|
||||
|
||||
def stop_existing_background_runner(*, machine, process_model, supervisor=None, stop_worker_fn=None, log=print) -> int:
    """Stop any existing orchestrator process so the server can take ownership.

    Args:
        machine: Machine record used to scope the process lookup and cleanup.
        process_model: Process model class (or a stand-in) providing
            ``cleanup_stale_running(machine=...)``, ``objects.filter(...)``,
            ``StatusChoices.RUNNING`` and ``TypeChoices.ORCHESTRATOR``.
        supervisor: Optional supervisor handle passed through to
            ``stop_worker_fn``; workers are only stopped when both are given.
        stop_worker_fn: Optional callable invoked as
            ``stop_worker_fn(supervisor, worker_name)``.
        log: Callable that receives status messages.

    Returns:
        The number of RUNNING orchestrator process rows that were found and
        asked to stop (``0`` when there were none). Stopping is best-effort:
        a process that could not be stopped is still counted, but the failure
        is reported through ``log`` instead of being silently dropped.
    """
    process_model.cleanup_stale_running(machine=machine)

    running_runners = list(
        process_model.objects.filter(
            machine=machine,
            status=process_model.StatusChoices.RUNNING,
            process_type=process_model.TypeChoices.ORCHESTRATOR,
        ).order_by('created_at')
    )
    if not running_runners:
        return 0

    log('[yellow][*] Stopping existing ArchiveBox background runner...[/yellow]')

    if supervisor is not None and stop_worker_fn is not None:
        for worker_name in ('worker_runner', 'worker_runner_watch'):
            try:
                stop_worker_fn(supervisor, worker_name)
            except Exception:
                # Best-effort: the processes themselves are killed directly
                # below, so a failure here is not fatal.
                pass

    for proc in running_runners:
        try:
            proc.kill_tree(graceful_timeout=2.0)
        except Exception:
            # Fall back to stopping just the process itself.
            try:
                proc.terminate(graceful_timeout=2.0)
            except Exception as err:
                # Both attempts failed, so the runner may still be alive.
                # Report it rather than hiding it behind the returned count.
                log(
                    f'[yellow][!] Failed to stop background runner process '
                    f'(pid={getattr(proc, "pid", "?")}): {type(err).__name__}[/yellow]'
                )

    process_model.cleanup_stale_running(machine=machine)
    return len(running_runners)
|
||||
|
||||
|
||||
@enforce_types
|
||||
def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
|
||||
reload: bool=False,
|
||||
@@ -39,25 +72,6 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
|
||||
if debug or reload:
|
||||
SHELL_CONFIG.DEBUG = True
|
||||
|
||||
if run_in_debug:
|
||||
os.environ['ARCHIVEBOX_RUNSERVER'] = '1'
|
||||
if reload:
|
||||
os.environ['ARCHIVEBOX_AUTORELOAD'] = '1'
|
||||
from archivebox.config.common import STORAGE_CONFIG
|
||||
pidfile = str(STORAGE_CONFIG.TMP_DIR / 'runserver.pid')
|
||||
os.environ['ARCHIVEBOX_RUNSERVER_PIDFILE'] = pidfile
|
||||
|
||||
from django.utils.autoreload import DJANGO_AUTORELOAD_ENV
|
||||
is_reloader_child = os.environ.get(DJANGO_AUTORELOAD_ENV) == 'true'
|
||||
if not is_reloader_child:
|
||||
env = os.environ.copy()
|
||||
subprocess.Popen(
|
||||
[sys.executable, '-m', 'archivebox', 'manage', 'runner_watch', f'--pidfile={pidfile}'],
|
||||
env=env,
|
||||
stdout=subprocess.DEVNULL,
|
||||
stderr=subprocess.DEVNULL,
|
||||
)
|
||||
|
||||
from django.contrib.auth.models import User
|
||||
|
||||
if not User.objects.filter(is_superuser=True).exclude(username='system').exists():
|
||||
@@ -81,73 +95,62 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
|
||||
except IndexError:
|
||||
pass
|
||||
|
||||
from archivebox.workers.supervisord_util import (
|
||||
get_existing_supervisord_process,
|
||||
get_worker,
|
||||
stop_worker,
|
||||
start_server_workers,
|
||||
is_port_in_use,
|
||||
)
|
||||
from archivebox.machine.models import Machine, Process
|
||||
|
||||
# Check if port is already in use
|
||||
if is_port_in_use(host, int(port)):
|
||||
print(f'[red][X] Error: Port {port} is already in use[/red]')
|
||||
print(f' Another process (possibly daphne or runserver) is already listening on {host}:{port}')
|
||||
print(' Stop the conflicting process or choose a different port')
|
||||
sys.exit(1)
|
||||
|
||||
machine = Machine.current()
|
||||
stop_existing_background_runner(
|
||||
machine=machine,
|
||||
process_model=Process,
|
||||
supervisor=get_existing_supervisord_process(),
|
||||
stop_worker_fn=stop_worker,
|
||||
)
|
||||
|
||||
supervisor = get_existing_supervisord_process()
|
||||
if supervisor:
|
||||
server_worker_name = 'worker_runserver' if run_in_debug else 'worker_daphne'
|
||||
server_proc = get_worker(supervisor, server_worker_name)
|
||||
server_state = server_proc.get('statename') if isinstance(server_proc, dict) else None
|
||||
if server_state == 'RUNNING':
|
||||
runner_proc = get_worker(supervisor, 'worker_runner')
|
||||
runner_watch_proc = get_worker(supervisor, 'worker_runner_watch')
|
||||
runner_state = runner_proc.get('statename') if isinstance(runner_proc, dict) else None
|
||||
runner_watch_state = runner_watch_proc.get('statename') if isinstance(runner_watch_proc, dict) else None
|
||||
print('[red][X] Error: ArchiveBox server is already running[/red]')
|
||||
print(f' [green]√[/green] Web server ({server_worker_name}) is RUNNING on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
|
||||
if runner_state == 'RUNNING':
|
||||
print(' [green]√[/green] Background runner (worker_runner) is RUNNING')
|
||||
if runner_watch_state == 'RUNNING':
|
||||
print(' [green]√[/green] Reload watcher (worker_runner_watch) is RUNNING')
|
||||
print()
|
||||
print('[yellow]To stop the existing server, run:[/yellow]')
|
||||
print(' pkill -f "archivebox server"')
|
||||
print(' pkill -f supervisord')
|
||||
sys.exit(1)
|
||||
|
||||
if run_in_debug:
|
||||
from django.core.management import call_command
|
||||
print('[green][+] Starting ArchiveBox webserver in DEBUG mode...[/green]')
|
||||
print(f' [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
|
||||
print(f' [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]')
|
||||
print(' > Writing ArchiveBox error log to ./logs/errors.log')
|
||||
if not reload:
|
||||
runserver_args.append('--noreload') # '--insecure'
|
||||
if nothreading:
|
||||
runserver_args.append('--nothreading')
|
||||
call_command("runserver", *runserver_args)
|
||||
else:
|
||||
from archivebox.workers.supervisord_util import (
|
||||
get_existing_supervisord_process,
|
||||
get_worker,
|
||||
start_server_workers,
|
||||
is_port_in_use,
|
||||
)
|
||||
from archivebox.machine.models import Machine, Process
|
||||
|
||||
# Check if port is already in use
|
||||
if is_port_in_use(host, int(port)):
|
||||
print(f'[red][X] Error: Port {port} is already in use[/red]')
|
||||
print(f' Another process (possibly daphne) is already listening on {host}:{port}')
|
||||
print(' Stop the conflicting process or choose a different port')
|
||||
sys.exit(1)
|
||||
|
||||
# Check if the background crawl runner is already running for this data directory
|
||||
if Process.objects.filter(
|
||||
machine=Machine.current(),
|
||||
status=Process.StatusChoices.RUNNING,
|
||||
process_type=Process.TypeChoices.ORCHESTRATOR,
|
||||
).exists():
|
||||
print('[red][X] Error: ArchiveBox background runner is already running for this data directory[/red]')
|
||||
print(' Stop the existing runner before starting a new server')
|
||||
print(' To stop: pkill -f "archivebox run --daemon"')
|
||||
sys.exit(1)
|
||||
|
||||
# Check if supervisord is already running
|
||||
supervisor = get_existing_supervisord_process()
|
||||
if supervisor:
|
||||
daphne_proc = get_worker(supervisor, 'worker_daphne')
|
||||
daphne_state = daphne_proc.get('statename') if isinstance(daphne_proc, dict) else None
|
||||
|
||||
# If daphne is already running, error out
|
||||
if daphne_state == 'RUNNING':
|
||||
runner_proc = get_worker(supervisor, 'worker_runner')
|
||||
runner_state = runner_proc.get('statename') if isinstance(runner_proc, dict) else None
|
||||
print('[red][X] Error: ArchiveBox server is already running[/red]')
|
||||
print(f' [green]√[/green] Web server (worker_daphne) is RUNNING on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
|
||||
if runner_state == 'RUNNING':
|
||||
print(' [green]√[/green] Background runner (worker_runner) is RUNNING')
|
||||
print()
|
||||
print('[yellow]To stop the existing server, run:[/yellow]')
|
||||
print(' pkill -f "archivebox server"')
|
||||
print(' pkill -f supervisord')
|
||||
sys.exit(1)
|
||||
# Otherwise, daphne is not running - fall through to start it
|
||||
|
||||
# No existing workers found - start new ones
|
||||
print('[green][+] Starting ArchiveBox webserver...[/green]')
|
||||
print(f' [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
|
||||
print(f' [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]')
|
||||
print(' > Writing ArchiveBox error log to ./logs/errors.log')
|
||||
print()
|
||||
start_server_workers(host=host, port=port, daemonize=daemonize)
|
||||
print("\n[i][green][🟩] ArchiveBox server shut down gracefully.[/green][/i]")
|
||||
print(f' [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
|
||||
print(f' [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]')
|
||||
print(' > Writing ArchiveBox error log to ./logs/errors.log')
|
||||
print()
|
||||
start_server_workers(host=host, port=port, daemonize=daemonize, debug=run_in_debug, reload=reload, nothreading=nothreading)
|
||||
print("\n[i][green][🟩] ArchiveBox server shut down gracefully.[/green][/i]")
|
||||
|
||||
|
||||
@click.command()
|
||||
|
||||
@@ -172,6 +172,9 @@ def list_snapshots(
|
||||
tag: Optional[str] = None,
|
||||
crawl_id: Optional[str] = None,
|
||||
limit: Optional[int] = None,
|
||||
sort: Optional[str] = None,
|
||||
csv: Optional[str] = None,
|
||||
with_headers: bool = False,
|
||||
) -> int:
|
||||
"""
|
||||
List Snapshots as JSONL with optional filters.
|
||||
@@ -182,7 +185,11 @@ def list_snapshots(
|
||||
from archivebox.misc.jsonl import write_record
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
is_tty = sys.stdout.isatty()
|
||||
if with_headers and not csv:
|
||||
rprint('[red]--with-headers requires --csv[/red]', file=sys.stderr)
|
||||
return 2
|
||||
|
||||
is_tty = sys.stdout.isatty() and not csv
|
||||
|
||||
queryset = Snapshot.objects.all().order_by('-created_at')
|
||||
|
||||
@@ -199,7 +206,29 @@ def list_snapshots(
|
||||
if tag:
|
||||
queryset = queryset.filter(tags__name__iexact=tag)
|
||||
|
||||
if sort:
|
||||
queryset = queryset.order_by(sort)
|
||||
|
||||
count = 0
|
||||
if csv:
|
||||
cols = [col.strip() for col in csv.split(',') if col.strip()]
|
||||
if not cols:
|
||||
rprint('[red]No CSV columns provided[/red]', file=sys.stderr)
|
||||
return 2
|
||||
rows: list[str] = []
|
||||
if with_headers:
|
||||
rows.append(','.join(cols))
|
||||
for snapshot in queryset.iterator(chunk_size=500):
|
||||
rows.append(snapshot.to_csv(cols=cols, separator=','))
|
||||
count += 1
|
||||
output = '\n'.join(rows)
|
||||
if output:
|
||||
sys.stdout.write(output)
|
||||
if not output.endswith('\n'):
|
||||
sys.stdout.write('\n')
|
||||
rprint(f'[dim]Listed {count} snapshots[/dim]', file=sys.stderr)
|
||||
return 0
|
||||
|
||||
for snapshot in queryset:
|
||||
if is_tty:
|
||||
status_color = {
|
||||
|
||||
Reference in New Issue
Block a user