# ArchiveBox/archivebox/cli/archivebox_schedule.py

#!/usr/bin/env python3

# Explicitly declare the package this module belongs to.
# NOTE(review): presumably needed so package imports resolve when this file is
# executed directly as a script (see the __main__ guard below) — confirm.
__package__ = 'archivebox.cli'

import rich_click as click
from rich import print

from archivebox.misc.util import enforce_types, docstring
from archivebox.config.common import ARCHIVING_CONFIG


@enforce_types
def schedule(add: bool = False,
            show: bool = False,
            clear: bool = False,
            foreground: bool = False,
            run_all: bool = False,
            quiet: bool = False,
            every: str | None = None,
            tag: str = '',
            depth: int | str = 0,
            overwrite: bool = False,
            update: bool = not ARCHIVING_CONFIG.ONLY_NEW,
            import_path: str | None = None):
    """Manage database-backed scheduled crawls processed by the crawl runner."""
    # NOTE: the docstring above is intentionally left as a single line: main()
    # passes schedule.__doc__ to @docstring (presumably to reuse it as the CLI
    # help text), so the detailed contract is documented in comments instead.
    #
    # The steps below run in this fixed order, each gated by its own flag:
    #   1. clear       -> disable every currently enabled CrawlSchedule
    #   2. every / add -> create a sealed template Crawl plus an enabled
    #                     CrawlSchedule pointing at it
    #   3. show        -> print a detailed listing of the enabled schedules
    #   4. run_all     -> enqueue every enabled schedule right now
    #   5. foreground  -> call run_pending_crawls(daemon=True)
    #   6. no action flag at all -> print a compact listing (unless quiet)
    #
    # Arguments:
    #   add          create a schedule; the interval falls back to 'day' when
    #                `every` is not given
    #   show         print the enabled schedules with their first source URL
    #   clear        disable all enabled schedules; runs before creation, so a
    #                schedule created in the same call stays enabled
    #   foreground   hand control to run_pending_crawls(daemon=True) at the end
    #   run_all      call .enqueue(queued_at=now) on every enabled schedule
    #   quiet        only skips the fallback listing of step 6; output of the
    #                explicitly requested steps is still printed
    #   every        schedule string; stripped, then checked by validate_schedule
    #   tag, depth   stored on the template crawl only when import_path is given
    #                (update schedules always use '' and 0)
    #   overwrite    stored in the template config as OVERWRITE
    #   update       stored inverted in the template config as ONLY_NEW
    #   import_path  URL(s)/path used as the template crawl's urls; when empty
    #                the template uses 'archivebox://update' instead
    #
    # Returns a dict with the keys 'created_schedule_ids', 'disabled_count',
    # 'run_all_enqueued' and 'active_schedule_ids'.
    #
    # Raises ValueError if `depth` cannot be converted with int(); anything
    # validate_schedule raises for a rejected `every` value propagates as-is.

    # Imported lazily inside the function rather than at module import time.
    from django.utils import timezone
    from rich.markup import escape

    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.crawls.models import Crawl, CrawlSchedule
    from archivebox.crawls.schedule_utils import validate_schedule
    from archivebox.services.runner import run_pending_crawls

    # depth may arrive as a string (the CLI declares it as a click.Choice of str).
    depth = int(depth)
    result: dict[str, object] = {
        'created_schedule_ids': [],
        'disabled_count': 0,
        'run_all_enqueued': 0,
        'active_schedule_ids': [],
    }

    def _active_schedules():
        # Enabled schedules, oldest first, with their template crawl joined in.
        return CrawlSchedule.objects.filter(is_enabled=True).select_related('template').order_by('created_at')

    if clear:
        # Bulk update: flips is_enabled off and bumps modified_at in one query.
        disabled_count = CrawlSchedule.objects.filter(is_enabled=True).update(
            is_enabled=False,
            modified_at=timezone.now(),
        )
        result['disabled_count'] = disabled_count
        print(f'[green]\\[√] Disabled {disabled_count} scheduled crawl(s).[/green]')

    if every or add:
        schedule_str = (every or 'day').strip()
        validate_schedule(schedule_str)

        created_by_id = get_or_create_system_user_pk()
        # Without an import_path the schedule is a recurring maintenance update.
        is_update_schedule = not import_path
        template_urls = import_path or 'archivebox://update'
        template_label = (
            f'Scheduled import: {template_urls}'
            if import_path else
            'Scheduled ArchiveBox update'
        )[:64]
        template_notes = (
            f'Created by archivebox schedule for {template_urls}'
            if import_path else
            'Created by archivebox schedule to queue recurring archivebox://update maintenance crawls.'
        )

        # The template is created already SEALED with no retry_at; it only
        # serves as the crawl the schedule points at.
        template = Crawl.objects.create(
            urls=template_urls,
            max_depth=0 if is_update_schedule else depth,
            tags_str='' if is_update_schedule else tag,
            label=template_label,
            notes=template_notes,
            created_by_id=created_by_id,
            status=Crawl.StatusChoices.SEALED,
            retry_at=None,
            config={
                'ONLY_NEW': not update,
                'OVERWRITE': overwrite,
                'DEPTH': 0 if is_update_schedule else depth,
                'SCHEDULE_KIND': 'update' if is_update_schedule else 'crawl',
            },
        )
        crawl_schedule = CrawlSchedule.objects.create(
            template=template,
            schedule=schedule_str,
            is_enabled=True,
            label=template_label,
            notes=template_notes,
            created_by_id=created_by_id,
        )
        result['created_schedule_ids'] = [str(crawl_schedule.id)]

        schedule_type = 'maintenance update' if is_update_schedule else 'crawl'
        print(f'[green]\\[√] Created scheduled {schedule_type}.[/green]')
        print(f'    id={crawl_schedule.id}')
        print(f'    every={crawl_schedule.schedule}')
        print(f'    next_run={crawl_schedule.next_run_at.isoformat()}')
        if import_path:
            # import_path is free-form user input: escape it so bracketed URL
            # parts (e.g. "?a[b]=c") are printed literally instead of being
            # parsed as Rich markup tags.
            print(f'    source={escape(import_path)}')

    # Fetched after clear/create so the listing reflects this call's changes.
    schedules = list(_active_schedules())
    result['active_schedule_ids'] = [str(scheduled_crawl.id) for scheduled_crawl in schedules]

    if show:
        if schedules:
            print(f'[green]\\[*] Active scheduled crawls: {len(schedules)}[/green]')
            for scheduled_crawl in schedules:
                template = scheduled_crawl.template
                # Only the first line of the template's urls is displayed.
                first_url = template.urls.splitlines()[0] if template.urls else ""
                print(
                    f'  - id={scheduled_crawl.id} every={scheduled_crawl.schedule} '
                    f'next_run={scheduled_crawl.next_run_at.isoformat()} '
                    # Escaped for the same reason as import_path above.
                    f'source={escape(first_url)}'
                )
        else:
            print('[yellow]\\[*] No scheduled crawls are enabled.[/yellow]')

    if run_all:
        enqueued = 0
        # One shared timestamp so every schedule is queued at the same instant.
        now = timezone.now()
        for scheduled_crawl in schedules:
            scheduled_crawl.enqueue(queued_at=now)
            enqueued += 1
        result['run_all_enqueued'] = enqueued
        print(f'[green]\\[*] Enqueued {enqueued} scheduled crawl(s) immediately.[/green]')
        if enqueued:
            print('[yellow]\\[*] Start `archivebox server`, `archivebox run --daemon`, or `archivebox schedule --foreground` to process the queued crawls.[/yellow]')

    if foreground:
        print('[green]\\[*] Starting global crawl runner in foreground mode. It will materialize scheduled crawls and process queued work.[/green]')
        run_pending_crawls(daemon=True)

    if quiet:
        return result

    # Fallback: with no action flag at all, print a compact listing.
    if not any((every, add, show, clear, foreground, run_all)):
        if schedules:
            print('[green]\\[*] Active scheduled crawls:[/green]')
            for scheduled_crawl in schedules:
                print(f'  - {scheduled_crawl.id} every={scheduled_crawl.schedule} next_run={scheduled_crawl.next_run_at.isoformat()}')
        else:
            print('[yellow]\\[*] No scheduled crawls are enabled.[/yellow]')

    return result


# CLI entry point: every option/argument below maps 1:1 onto a keyword
# parameter of schedule().
# NOTE(review): --update is a plain flag, so click always passes an explicit
# update=True/False and schedule()'s config-derived default
# (not ARCHIVING_CONFIG.ONLY_NEW) is never used from the CLI — confirm that
# this is intended.
@click.command()
@click.option('--quiet', '-q', is_flag=True, help="Return structured results without extra summary output")
@click.option('--add', is_flag=True, help='Create a new scheduled crawl')
@click.option('--every', type=str, help='Run on an alias like daily/weekly/monthly or a cron expression such as "0 */6 * * *"')
@click.option('--tag', '-t', default='', help='Comma-separated tags to apply to scheduled crawl snapshots')
@click.option('--depth', type=click.Choice([str(i) for i in range(5)]), default='0', help='Recursively archive linked pages up to N hops away')
@click.option('--overwrite', is_flag=True, help='Overwrite existing data if URLs have been archived previously')
@click.option('--update', is_flag=True, help='Retry previously failed/skipped URLs when scheduled crawls run')
@click.option('--clear', is_flag=True, help='Disable all currently enabled schedules')
@click.option('--show', is_flag=True, help='Print all currently enabled schedules')
@click.option('--foreground', '-f', is_flag=True, help='Run the global crawl runner in the foreground (no crontab required)')
# --run-all only enqueues; processing requires a runner (see the hint printed by schedule()).
@click.option('--run-all', is_flag=True, help='Enqueue all enabled schedules immediately (use --foreground or a running server/daemon to process them)')
@click.argument('import_path', required=False)
@docstring(schedule.__doc__)
def main(**kwargs):
    """Manage database-backed scheduled crawls processed by the crawl runner."""
    # The dict returned by schedule() is intentionally not used here; the CLI
    # communicates through the printed output only.
    schedule(**kwargs)


# Run the click command when this file is executed directly as a script.
if __name__ == '__main__':
    main()