#!/usr/bin/env python3

__package__ = 'archivebox.cli'

import rich_click as click

from rich import print

from archivebox.misc.util import enforce_types, docstring
from archivebox.config.common import ARCHIVING_CONFIG


@enforce_types
def schedule(add: bool = False,
             show: bool = False,
             clear: bool = False,
             foreground: bool = False,
             run_all: bool = False,
             quiet: bool = False,
             every: str | None = None,
             tag: str = '',
             depth: int | str = 0,
             overwrite: bool = False,
             update: bool = not ARCHIVING_CONFIG.ONLY_NEW,
             import_path: str | None = None):
    """Manage database-backed scheduled crawls processed by the crawl runner.

    Depending on the flags given this will: disable all schedules (``clear``),
    create a new schedule (``add``/``every``), list active schedules (``show``),
    enqueue every enabled schedule immediately (``run_all``), and/or run the
    crawl runner in the foreground (``foreground``). With no action flags it
    just prints the active schedules.

    Returns a dict summarizing what was done:
    ``created_schedule_ids``, ``disabled_count``, ``run_all_enqueued``,
    ``active_schedule_ids``.
    """
    # Deferred imports: Django models can only be imported once settings are
    # configured, which happens before this command body runs.
    from django.utils import timezone

    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.crawls.models import Crawl, CrawlSchedule
    from archivebox.crawls.schedule_utils import validate_schedule
    from archivebox.services.runner import run_pending_crawls

    # The click option passes depth as a str choice ('0'..'4'); normalize to int.
    depth = int(depth)

    result: dict[str, object] = {
        'created_schedule_ids': [],
        'disabled_count': 0,
        'run_all_enqueued': 0,
        'active_schedule_ids': [],
    }

    def _active_schedules():
        # All enabled schedules, oldest first; prefetch the template Crawl
        # since callers below read template.urls.
        return CrawlSchedule.objects.filter(is_enabled=True).select_related('template').order_by('created_at')

    if clear:
        # Soft-disable rather than delete, so history is preserved.
        disabled_count = CrawlSchedule.objects.filter(is_enabled=True).update(
            is_enabled=False,
            modified_at=timezone.now(),
        )
        result['disabled_count'] = disabled_count
        print(f'[green]\\[√] Disabled {disabled_count} scheduled crawl(s).[/green]')

    if every or add:
        # --every implies --add; default cadence is daily.
        schedule_str = (every or 'day').strip()
        validate_schedule(schedule_str)  # raises on an invalid alias/cron expression
        created_by_id = get_or_create_system_user_pk()

        # Without an import_path this becomes a recurring maintenance update
        # (archivebox://update) instead of a crawl of a user-supplied source.
        is_update_schedule = not import_path
        template_urls = import_path or 'archivebox://update'
        template_label = (
            f'Scheduled import: {template_urls}'
            if import_path
            else 'Scheduled ArchiveBox update'
        )[:64]  # Crawl.label column is capped at 64 chars
        template_notes = (
            f'Created by archivebox schedule for {template_urls}'
            if import_path
            else 'Created by archivebox schedule to queue recurring archivebox://update maintenance crawls.'
        )

        # The template Crawl is SEALED (never run directly); the scheduler
        # clones it into fresh Crawls each time the schedule fires.
        template = Crawl.objects.create(
            urls=template_urls,
            max_depth=0 if is_update_schedule else depth,
            tags_str='' if is_update_schedule else tag,
            label=template_label,
            notes=template_notes,
            created_by_id=created_by_id,
            status=Crawl.StatusChoices.SEALED,
            retry_at=None,
            config={
                'ONLY_NEW': not update,
                'OVERWRITE': overwrite,
                'DEPTH': 0 if is_update_schedule else depth,
                'SCHEDULE_KIND': 'update' if is_update_schedule else 'crawl',
            },
        )
        crawl_schedule = CrawlSchedule.objects.create(
            template=template,
            schedule=schedule_str,
            is_enabled=True,
            label=template_label,
            notes=template_notes,
            created_by_id=created_by_id,
        )
        result['created_schedule_ids'] = [str(crawl_schedule.id)]

        schedule_type = 'maintenance update' if is_update_schedule else 'crawl'
        print(f'[green]\\[√] Created scheduled {schedule_type}.[/green]')
        print(f' id={crawl_schedule.id}')
        print(f' every={crawl_schedule.schedule}')
        print(f' next_run={crawl_schedule.next_run_at.isoformat()}')
        if import_path:
            print(f' source={import_path}')

    # Re-query AFTER any clear/add above so the snapshot reflects this call's
    # own changes. (Loop variable renamed from `schedule` to avoid shadowing
    # this function's name.)
    schedules = list(_active_schedules())
    result['active_schedule_ids'] = [str(crawl_sched.id) for crawl_sched in schedules]

    if show:
        if schedules:
            print(f'[green]\\[*] Active scheduled crawls: {len(schedules)}[/green]')
            for scheduled_crawl in schedules:
                template = scheduled_crawl.template
                print(
                    f' - id={scheduled_crawl.id} every={scheduled_crawl.schedule} '
                    f'next_run={scheduled_crawl.next_run_at.isoformat()} '
                    f'source={template.urls.splitlines()[0] if template.urls else ""}'
                )
        else:
            print('[yellow]\\[*] No scheduled crawls are enabled.[/yellow]')

    if run_all:
        # Queue every enabled schedule now; a runner (server / run --daemon /
        # --foreground below) must pick the queued crawls up.
        enqueued = 0
        now = timezone.now()
        for scheduled_crawl in schedules:
            scheduled_crawl.enqueue(queued_at=now)
            enqueued += 1
        result['run_all_enqueued'] = enqueued
        print(f'[green]\\[*] Enqueued {enqueued} scheduled crawl(s) immediately.[/green]')
        if enqueued:
            print('[yellow]\\[*] Start `archivebox server`, `archivebox run --daemon`, or `archivebox schedule --foreground` to process the queued crawls.[/yellow]')

    if foreground:
        print('[green]\\[*] Starting global crawl runner in foreground mode. It will materialize scheduled crawls and process queued work.[/green]')
        # NOTE(review): daemon=True appears to mean "loop until interrupted"
        # (blocking in this terminal), not OS daemonization — confirm against
        # archivebox.services.runner.run_pending_crawls.
        run_pending_crawls(daemon=True)

    if quiet:
        return result

    # No action flags at all: default to a summary listing of what's enabled.
    if not any((every, add, show, clear, foreground, run_all)):
        if schedules:
            print('[green]\\[*] Active scheduled crawls:[/green]')
            for scheduled_crawl in schedules:
                print(f' - {scheduled_crawl.id} every={scheduled_crawl.schedule} next_run={scheduled_crawl.next_run_at.isoformat()}')
        else:
            print('[yellow]\\[*] No scheduled crawls are enabled.[/yellow]')

    return result


@click.command()
@click.option('--quiet', '-q', is_flag=True, help="Return structured results without extra summary output")
@click.option('--add', is_flag=True, help='Create a new scheduled crawl')
@click.option('--every', type=str, help='Run on an alias like daily/weekly/monthly or a cron expression such as "0 */6 * * *"')
@click.option('--tag', '-t', default='', help='Comma-separated tags to apply to scheduled crawl snapshots')
@click.option('--depth', type=click.Choice([str(i) for i in range(5)]), default='0', help='Recursively archive linked pages up to N hops away')
@click.option('--overwrite', is_flag=True, help='Overwrite existing data if URLs have been archived previously')
@click.option('--update', is_flag=True, help='Retry previously failed/skipped URLs when scheduled crawls run')
@click.option('--clear', is_flag=True, help='Disable all currently enabled schedules')
@click.option('--show', is_flag=True, help='Print all currently enabled schedules')
@click.option('--foreground', '-f', is_flag=True, help='Run the global crawl runner in the foreground (no crontab required)')
@click.option('--run-all', is_flag=True, help='Enqueue all enabled schedules immediately and process them once')
@click.argument('import_path', required=False)
@docstring(schedule.__doc__)
def main(**kwargs):
    """Manage database-backed scheduled crawls processed by the crawl runner."""
    schedule(**kwargs)


if __name__ == '__main__':
    main()