Files
ArchiveBox/archivebox/cli/archivebox_machine.py
Claude f3e11b61fd Implement JSONL CLI pipeline architecture (Phases 1-4, 6)
Phase 1: Model Prerequisites
- Add ArchiveResult.from_json() and from_jsonl() methods
- Fix Snapshot.to_json() to use tags_str (consistent with Crawl)

Phase 2: Shared Utilities
- Create archivebox/cli/cli_utils.py with shared apply_filters()
- Update 7 CLI files to import from cli_utils.py instead of duplicating

Phase 3: Pass-Through Behavior
- Add pass-through to crawl create (non-Crawl records pass unchanged)
- Add pass-through to snapshot create (Crawl records + others pass through)
- Add pass-through to archiveresult create (Snapshot records + others)
- Add create-or-update behavior to run command:
  - Records WITHOUT id: Create via Model.from_json()
  - Records WITH id: Lookup existing, re-queue
  - Outputs JSONL of all processed records for chaining

Phase 4: Test Infrastructure
- Create archivebox/tests/conftest.py with pytest-django fixtures
- Include CLI helpers, output assertions, database assertions

Phase 6: Config Update
- Update supervisord_util.py: orchestrator -> run command

This enables Unix-style piping:
  archivebox crawl create URL | archivebox run
  archivebox archiveresult list --status=failed | archivebox run
  curl API | jq transform | archivebox crawl create | archivebox run
2025-12-31 10:07:14 +00:00

100 lines
2.6 KiB
Python

#!/usr/bin/env python3
"""
archivebox machine <action> [--filters]

Manage Machine records (system-managed, mostly read-only).

Machine records track the host machines where ArchiveBox runs.
They are created automatically by the system and are primarily for debugging.

Actions:
    list - List Machines as JSONL (with optional filters)

Examples:
    # List all machines
    archivebox machine list

    # List machines by hostname
    archivebox machine list --hostname__icontains=myserver
"""
# Explicitly set this module's package name to the CLI package.
# NOTE(review): presumably so package-relative behaviour still works when the
# file is executed directly as a script — confirm.
__package__ = 'archivebox.cli'
# Human-readable command string for this module.
# NOTE(review): presumably read by the CLI's command registry/help output —
# the consumer is not visible in this file; confirm.
__command__ = 'archivebox machine'
import sys
from typing import Optional
import rich_click as click
from rich import print as rprint
from archivebox.cli.cli_utils import apply_filters
# =============================================================================
# LIST
# =============================================================================
def list_machines(
    hostname__icontains: Optional[str] = None,
    os_platform: Optional[str] = None,
    limit: Optional[int] = None,
) -> int:
    """
    Print Machine records, optionally narrowed by filters.

    Machines are queried ordered by descending ``created_at`` and then passed
    through ``apply_filters()`` together with ``limit``. When stdout is a
    terminal, each machine is printed as one colourised row (hostname, OS
    platform, id); otherwise each machine is emitted with ``write_record()``
    as a JSONL record. A summary with the number of listed machines is
    always printed to stderr.

    Args:
        hostname__icontains: Value for the ``hostname__icontains`` filter.
        os_platform: Value for the ``os_platform`` filter.
        limit: Result limit forwarded to ``apply_filters()``.

    Exit codes:
        0: Success (even if no results)
    """
    # Imported lazily inside the function, as in the rest of this module.
    from archivebox.misc.jsonl import write_record
    from archivebox.machine.models import Machine

    interactive = sys.stdout.isatty()

    newest_first = Machine.objects.all().order_by('-created_at')
    machines = apply_filters(
        newest_first,
        {
            'hostname__icontains': hostname__icontains,
            'os_platform': os_platform,
        },
        limit=limit,
    )

    # `count` stays 0 when the loop body never runs (no matching machines).
    count = 0
    for count, machine in enumerate(machines, start=1):
        if not interactive:
            # Piped/redirected output: machine-readable JSONL on stdout.
            write_record(machine.to_json())
            continue
        # Interactive terminal: human-readable, colourised row.
        rprint(f'[cyan]{machine.hostname:30}[/cyan] [dim]{machine.os_platform:10}[/dim] {machine.id}')

    # The summary goes to stderr so it never pollutes the JSONL stream on stdout.
    rprint(f'[dim]Listed {count} machines[/dim]', file=sys.stderr)
    return 0
# =============================================================================
# CLI Commands
# =============================================================================
@click.group()
def main():
    """Manage Machine records (read-only, system-managed)."""
    # Intentionally empty: this function only acts as the click group that
    # subcommands are attached to via @main.command(...).
@main.command('list')
@click.option('--hostname__icontains', help='Filter by hostname contains')
@click.option('--os-platform', help='Filter by OS platform')
@click.option('--limit', '-n', type=int, help='Limit number of results')
def list_cmd(
    hostname__icontains: Optional[str],
    os_platform: Optional[str],
    limit: Optional[int],
):
    """List Machines as JSONL."""
    # Thin CLI wrapper: forward the parsed options unchanged and use the
    # helper's return value as the process exit status.
    exit_code = list_machines(
        hostname__icontains=hostname__icontains,
        os_platform=os_platform,
        limit=limit,
    )
    sys.exit(exit_code)
# Script entry point: invoke the click group only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    main()