mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-01-03 01:15:57 +10:00
Phase 1: Model Prerequisites - Add ArchiveResult.from_json() and from_jsonl() methods - Fix Snapshot.to_json() to use tags_str (consistent with Crawl) Phase 2: Shared Utilities - Create archivebox/cli/cli_utils.py with shared apply_filters() - Update 7 CLI files to import from cli_utils.py instead of duplicating Phase 3: Pass-Through Behavior - Add pass-through to crawl create (non-Crawl records pass unchanged) - Add pass-through to snapshot create (Crawl records + others pass through) - Add pass-through to archiveresult create (Snapshot records + others) - Add create-or-update behavior to run command: - Records WITHOUT id: Create via Model.from_json() - Records WITH id: Lookup existing, re-queue - Outputs JSONL of all processed records for chaining Phase 4: Test Infrastructure - Create archivebox/tests/conftest.py with pytest-django fixtures - Include CLI helpers, output assertions, database assertions Phase 6: Config Update - Update supervisord_util.py: orchestrator -> run command This enables Unix-style piping: archivebox crawl create URL | archivebox run archivebox archiveresult list --status=failed | archivebox run curl API | jq transform | archivebox crawl create | archivebox run
47 lines
1.3 KiB
Python
47 lines
1.3 KiB
Python
"""
|
|
Shared CLI utilities for ArchiveBox commands.
|
|
|
|
This module contains common utilities used across multiple CLI commands,
|
|
extracted to avoid code duplication.
|
|
"""
|
|
|
|
__package__ = 'archivebox.cli'
|
|
|
|
from typing import Optional
|
|
|
|
|
|
def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None):
    """
    Apply Django-style filters from CLI kwargs to a QuerySet.

    Supports: --status=queued, --url__icontains=example, --id__in=uuid1,uuid2

    Args:
        queryset: Django QuerySet to filter
        filter_kwargs: Dict of filter key-value pairs from CLI
        limit: Optional limit on results

    Returns:
        Filtered QuerySet

    Example:
        queryset = Snapshot.objects.all()
        filter_kwargs = {'status': 'queued', 'url__icontains': 'example.com'}
        filtered = apply_filters(queryset, filter_kwargs, limit=10)
    """
    def _normalize(lookup, raw):
        # CLI passes __in values as a CSV string; Django expects an iterable.
        if lookup.endswith('__in') and isinstance(raw, str):
            return [part.strip() for part in raw.split(',')]
        return raw

    # Drop unset values and the pagination pseudo-keys, which are not
    # real ORM lookups.
    active = {
        lookup: _normalize(lookup, raw)
        for lookup, raw in filter_kwargs.items()
        if raw is not None and lookup not in ('limit', 'offset')
    }

    if active:
        queryset = queryset.filter(**active)
    if limit:
        # QuerySet slicing translates to SQL LIMIT without evaluating.
        queryset = queryset[:limit]

    return queryset