Files
ArchiveBox/archivebox/cli/archivebox_schedule.py
2026-03-15 18:31:56 -07:00

175 lines
7.2 KiB
Python

#!/usr/bin/env python3
__package__ = 'archivebox.cli'
import rich_click as click
from rich import print
from archivebox.misc.util import enforce_types, docstring
from archivebox.config.common import ARCHIVING_CONFIG
@enforce_types
def schedule(add: bool = False,
             show: bool = False,
             clear: bool = False,
             foreground: bool = False,
             run_all: bool = False,
             quiet: bool = False,
             every: str | None = None,
             tag: str = '',
             depth: int | str = 0,
             overwrite: bool = False,
             update: bool = not ARCHIVING_CONFIG.ONLY_NEW,
             import_path: str | None = None) -> dict[str, object]:
    """Manage database-backed scheduled crawls processed by the orchestrator."""
    # NOTE: the one-line docstring above is passed to `@docstring(...)` on the
    # CLI entry point `main`, so it is kept short; details live in comments.
    #
    # The flags are independent and are applied in this fixed order:
    #   clear -> create (every/add) -> show -> run_all -> foreground -> summary
    #
    # Parameters:
    #   add          create a new schedule (uses 'day' when `every` is not given)
    #   show         print every currently enabled schedule
    #   clear        disable all currently enabled schedules
    #   foreground   run the Orchestrator loop in this process, unless one is
    #                already running
    #   run_all      call `enqueue()` on every enabled schedule right now
    #   quiet        skip the fallback summary that is printed when no action
    #                flag was passed (other output is not suppressed)
    #   every        schedule string, checked with `validate_schedule()`
    #   tag          stored as `tags_str` on the template crawl (import only)
    #   depth        stored as `max_depth` / config DEPTH (import only)
    #   overwrite    stored as config OVERWRITE on the template crawl
    #   update       stored inverted as config ONLY_NEW on the template crawl
    #   import_path  URL(s)/path to crawl; when omitted the schedule targets
    #                'archivebox://update' and `tag`/`depth` are ignored
    #
    # Returns a dict with the keys 'created_schedule_ids', 'disabled_count',
    # 'run_all_enqueued' and 'active_schedule_ids'.

    # Imports are function-local, as in the original.
    # NOTE(review): presumably so Django is configured before models load - confirm.
    from django.utils import timezone
    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.crawls.models import Crawl, CrawlSchedule
    from archivebox.crawls.schedule_utils import validate_schedule
    from archivebox.workers.orchestrator import Orchestrator

    # The CLI passes depth as a string choice, so normalise it once up front.
    depth = int(depth)

    result: dict[str, object] = {
        'created_schedule_ids': [],
        'disabled_count': 0,
        'run_all_enqueued': 0,
        'active_schedule_ids': [],
    }

    def _active_schedules():
        """Return enabled schedules, oldest first, with their template crawl joined."""
        return CrawlSchedule.objects.filter(is_enabled=True).select_related('template').order_by('created_at')

    if clear:
        # Bulk-disable with a single UPDATE; modified_at is set by hand here.
        disabled_count = CrawlSchedule.objects.filter(is_enabled=True).update(
            is_enabled=False,
            modified_at=timezone.now(),
        )
        result['disabled_count'] = disabled_count
        print(f'[green]\\[√] Disabled {disabled_count} scheduled crawl(s).[/green]')

    if every or add:
        schedule_str = (every or 'day').strip()
        # NOTE(review): presumably raises on an invalid schedule string - confirm.
        validate_schedule(schedule_str)

        created_by_id = get_or_create_system_user_pk()
        # With no import_path the schedule is a maintenance "update" job.
        is_update_schedule = not import_path
        template_urls = import_path or 'archivebox://update'
        # NOTE(review): the 64-char cut presumably matches the label column size - confirm.
        template_label = (
            f'Scheduled import: {template_urls}'
            if import_path else
            'Scheduled ArchiveBox update'
        )[:64]
        template_notes = (
            f'Created by archivebox schedule for {template_urls}'
            if import_path else
            'Created by archivebox schedule to queue recurring archivebox://update maintenance crawls.'
        )

        # The template crawl is created SEALED with no retry_at.
        # NOTE(review): presumably so it is never picked up and run itself - confirm.
        template = Crawl.objects.create(
            urls=template_urls,
            max_depth=0 if is_update_schedule else depth,
            tags_str='' if is_update_schedule else tag,
            label=template_label,
            notes=template_notes,
            created_by_id=created_by_id,
            status=Crawl.StatusChoices.SEALED,
            retry_at=None,
            config={
                'ONLY_NEW': not update,
                'OVERWRITE': overwrite,
                'DEPTH': 0 if is_update_schedule else depth,
                'SCHEDULE_KIND': 'update' if is_update_schedule else 'crawl',
            },
        )
        crawl_schedule = CrawlSchedule.objects.create(
            template=template,
            schedule=schedule_str,
            is_enabled=True,
            label=template_label,
            notes=template_notes,
            created_by_id=created_by_id,
        )
        result['created_schedule_ids'] = [str(crawl_schedule.id)]

        schedule_type = 'maintenance update' if is_update_schedule else 'crawl'
        print(f'[green]\\[√] Created scheduled {schedule_type}.[/green]')
        print(f' id={crawl_schedule.id}')
        print(f' every={crawl_schedule.schedule}')
        print(f' next_run={crawl_schedule.next_run_at.isoformat()}')
        if import_path:
            print(f' source={import_path}')

    # Snapshot the enabled schedules once, after any clear/create above, so
    # every later step works from the same list.
    schedules = list(_active_schedules())
    # The loop variable must not be called `schedule`: that is this function's name.
    result['active_schedule_ids'] = [str(active.id) for active in schedules]

    if show:
        if schedules:
            print(f'[green]\\[*] Active scheduled crawls: {len(schedules)}[/green]')
            for scheduled_crawl in schedules:
                template = scheduled_crawl.template
                print(
                    f' - id={scheduled_crawl.id} every={scheduled_crawl.schedule} '
                    f'next_run={scheduled_crawl.next_run_at.isoformat()} '
                    f'source={template.urls.splitlines()[0] if template.urls else ""}'
                )
        else:
            print('[yellow]\\[*] No scheduled crawls are enabled.[/yellow]')

    if run_all:
        # One shared timestamp so every schedule gets the same queued_at.
        now = timezone.now()
        for scheduled_crawl in schedules:
            scheduled_crawl.enqueue(queued_at=now)
        enqueued = len(schedules)
        result['run_all_enqueued'] = enqueued
        print(f'[green]\\[*] Enqueued {enqueued} scheduled crawl(s) immediately.[/green]')
        if enqueued and not Orchestrator.is_running():
            print('[yellow]\\[*] No orchestrator is running yet. Start `archivebox server` or `archivebox schedule --foreground` to process the queued crawls.[/yellow]')

    if foreground:
        # Check first, so "Starting..." is only announced when we really start.
        if Orchestrator.is_running():
            print('[yellow]\\[*] Orchestrator is already running.[/yellow]')
        else:
            print('[green]\\[*] Starting global orchestrator in foreground mode. It will materialize scheduled crawls and process queued work.[/green]')
            orchestrator = Orchestrator(exit_on_idle=False)
            orchestrator.runloop()

    if quiet:
        return result

    # No action flag at all: fall back to a short listing of what is enabled.
    if not any((every, add, show, clear, foreground, run_all)):
        if schedules:
            print('[green]\\[*] Active scheduled crawls:[/green]')
            for scheduled_crawl in schedules:
                print(f' - {scheduled_crawl.id} every={scheduled_crawl.schedule} next_run={scheduled_crawl.next_run_at.isoformat()}')
        else:
            print('[yellow]\\[*] No scheduled crawls are enabled.[/yellow]')

    return result
@click.command()
@click.option('--quiet', '-q', is_flag=True, help="Return structured results without extra summary output")
@click.option('--add', is_flag=True, help='Create a new scheduled crawl')
@click.option('--every', type=str, help='Run on an alias like daily/weekly/monthly or a cron expression such as "0 */6 * * *"')
@click.option('--tag', '-t', default='', help='Comma-separated tags to apply to scheduled crawl snapshots')
@click.option('--depth', type=click.Choice([str(i) for i in range(5)]), default='0', help='Recursively archive linked pages up to N hops away')
@click.option('--overwrite', is_flag=True, help='Overwrite existing data if URLs have been archived previously')
# Declared as an on/off pair with a config-derived default. A bare
# `is_flag=True` option defaults to False, which made the CLI always pass
# update=False and override the `not ARCHIVING_CONFIG.ONLY_NEW` default of
# `schedule()`. `--update` keeps working; `--no-update` is the new opt-out.
@click.option('--update/--no-update', default=not ARCHIVING_CONFIG.ONLY_NEW, help='Retry previously failed/skipped URLs when scheduled crawls run')
@click.option('--clear', is_flag=True, help='Disable all currently enabled schedules')
@click.option('--show', is_flag=True, help='Print all currently enabled schedules')
@click.option('--foreground', '-f', is_flag=True, help='Run the global orchestrator in the foreground (no crontab required)')
@click.option('--run-all', is_flag=True, help='Enqueue all enabled schedules immediately and process them once')
@click.argument('import_path', required=False)
@docstring(schedule.__doc__)
def main(**kwargs):
    """Manage database-backed scheduled crawls processed by the orchestrator."""
    # Click collects every option/argument by name; they map 1:1 onto the
    # keyword parameters of schedule(). The returned result dict is not used
    # by the CLI entry point.
    schedule(**kwargs)


# Allow running this module directly as a script.
if __name__ == '__main__':
    main()