ArchiveBox/archivebox/cli/archivebox_schedule.py

#!/usr/bin/env python3

__package__ = "archivebox.cli"

import rich_click as click
from rich import print

from archivebox.misc.util import enforce_types, docstring
from archivebox.config.common import ARCHIVING_CONFIG


@enforce_types
def schedule(
    add: bool = False,
    show: bool = False,
    clear: bool = False,
    foreground: bool = False,
    run_all: bool = False,
    quiet: bool = False,
    every: str | None = None,
    tag: str = "",
    depth: int | str = 0,
    overwrite: bool = False,
    update: bool = not ARCHIVING_CONFIG.ONLY_NEW,
    import_path: str | None = None,
):
    """Manage database-backed scheduled crawls processed by the crawl runner."""
    # NOTE: the docstring is deliberately left as a single line because `main`
    # passes `schedule.__doc__` to its `docstring` decorator (presumably so it
    # becomes the CLI help text); the detailed contract lives in comments here.
    #
    # The requested actions are applied in this fixed order so that flags can
    # be combined in one invocation:
    #   1. clear       -> disable every currently enabled CrawlSchedule
    #   2. every / add -> create a sealed template Crawl plus an enabled
    #                     CrawlSchedule pointing at it (`every` defaults to "day")
    #   3. show        -> print the enabled schedules, including their source
    #   4. run_all     -> call enqueue() on every enabled schedule
    #   5. foreground  -> hand over to run_pending_crawls(daemon=True)
    # When none of those flags are set (and `quiet` is false) a short listing
    # of the enabled schedules is printed instead.
    #
    # Parameters used only when creating a schedule:
    #   import_path -> URL/path stored as the template crawl's `urls`; when it
    #                  is empty the schedule is an "archivebox://update"
    #                  maintenance schedule and `tag` / `depth` are ignored.
    #   tag, depth, overwrite, update -> stored on the template crawl / its
    #                  config (update is stored inverted as ONLY_NEW).
    #
    # Returns a dict with the keys "created_schedule_ids", "disabled_count",
    # "run_all_enqueued" and "active_schedule_ids".

    from django.utils import timezone
    from rich.markup import escape

    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.crawls.models import Crawl, CrawlSchedule
    from archivebox.crawls.schedule_utils import validate_schedule
    from archivebox.services.runner import run_pending_crawls

    # The CLI passes depth as one of the string choices "0".."4".
    depth = int(depth)
    result: dict[str, object] = {
        "created_schedule_ids": [],
        "disabled_count": 0,
        "run_all_enqueued": 0,
        "active_schedule_ids": [],
    }

    if clear:
        # Schedules are disabled rather than deleted.
        disabled_count = CrawlSchedule.objects.filter(is_enabled=True).update(
            is_enabled=False,
            modified_at=timezone.now(),
        )
        result["disabled_count"] = disabled_count
        print(f"[green]\\[√] Disabled {disabled_count} scheduled crawl(s).[/green]")

    if every or add:
        schedule_str = (every or "day").strip()
        # Validate before anything is written to the database.
        validate_schedule(schedule_str)

        created_by_id = get_or_create_system_user_pk()
        # Without an import path the schedule is a maintenance "update" job.
        is_update_schedule = not import_path
        template_urls = import_path or "archivebox://update"
        template_label = (f"Scheduled import: {template_urls}" if import_path else "Scheduled ArchiveBox update")[:64]
        template_notes = (
            f"Created by archivebox schedule for {template_urls}"
            if import_path
            else "Created by archivebox schedule to queue recurring archivebox://update maintenance crawls."
        )

        # The template crawl is created SEALED with no retry_at; it only
        # carries the settings the schedule refers to.
        template = Crawl.objects.create(
            urls=template_urls,
            max_depth=0 if is_update_schedule else depth,
            tags_str="" if is_update_schedule else tag,
            label=template_label,
            notes=template_notes,
            created_by_id=created_by_id,
            status=Crawl.StatusChoices.SEALED,
            retry_at=None,
            config={
                "ONLY_NEW": not update,
                "OVERWRITE": overwrite,
                "DEPTH": 0 if is_update_schedule else depth,
                "SCHEDULE_KIND": "update" if is_update_schedule else "crawl",
            },
        )
        crawl_schedule = CrawlSchedule.objects.create(
            template=template,
            schedule=schedule_str,
            is_enabled=True,
            label=template_label,
            notes=template_notes,
            created_by_id=created_by_id,
        )
        result["created_schedule_ids"] = [str(crawl_schedule.id)]

        schedule_type = "maintenance update" if is_update_schedule else "crawl"
        print(f"[green]\\[√] Created scheduled {schedule_type}.[/green]")
        print(f"    id={crawl_schedule.id}")
        print(f"    every={crawl_schedule.schedule}")
        print(f"    next_run={crawl_schedule.next_run_at.isoformat()}")
        if import_path:
            # import_path is user input: escape it so bracketed text in a URL
            # is printed literally instead of being parsed as rich markup.
            print(f"    source={escape(import_path)}")

    # Snapshot the enabled schedules *after* clear/add so the listing, the
    # run_all step and the returned ids all reflect this invocation's changes.
    schedules = list(
        CrawlSchedule.objects.filter(is_enabled=True).select_related("template").order_by("created_at"),
    )
    result["active_schedule_ids"] = [str(scheduled_crawl.id) for scheduled_crawl in schedules]

    if show:
        if schedules:
            print(f"[green]\\[*] Active scheduled crawls: {len(schedules)}[/green]")
            for scheduled_crawl in schedules:
                stored_urls = scheduled_crawl.template.urls
                # Only the first line of the template's urls is shown.
                source = stored_urls.splitlines()[0] if stored_urls else ""
                print(
                    f"  - id={scheduled_crawl.id} every={scheduled_crawl.schedule} "
                    f"next_run={scheduled_crawl.next_run_at.isoformat()} "
                    # Stored URLs are user data too, so escape rich markup.
                    f"source={escape(source)}",
                )
        else:
            print("[yellow]\\[*] No scheduled crawls are enabled.[/yellow]")

    if run_all:
        # Use one timestamp for the whole batch.
        now = timezone.now()
        for scheduled_crawl in schedules:
            scheduled_crawl.enqueue(queued_at=now)
        enqueued = len(schedules)
        result["run_all_enqueued"] = enqueued
        print(f"[green]\\[*] Enqueued {enqueued} scheduled crawl(s) immediately.[/green]")
        if enqueued:
            print(
                "[yellow]\\[*] Start `archivebox server`, `archivebox run --daemon`, or `archivebox schedule --foreground` to process the queued crawls.[/yellow]",
            )

    if foreground:
        print(
            "[green]\\[*] Starting global crawl runner in foreground mode. It will materialize scheduled crawls and process queued work.[/green]",
        )
        # NOTE(review): presumably blocks until the runner exits - confirm.
        run_pending_crawls(daemon=True)

    if quiet:
        # quiet only suppresses the fallback summary below; the messages for
        # explicitly requested actions above are still printed.
        return result

    if not any((every, add, show, clear, foreground, run_all)):
        # No action requested: fall back to a short listing.
        if schedules:
            print("[green]\\[*] Active scheduled crawls:[/green]")
            for scheduled_crawl in schedules:
                print(f"  - {scheduled_crawl.id} every={scheduled_crawl.schedule} next_run={scheduled_crawl.next_run_at.isoformat()}")
        else:
            print("[yellow]\\[*] No scheduled crawls are enabled.[/yellow]")

    return result


# Click entry point for the schedule command. Every option/argument declared
# below is forwarded as the same-named keyword argument of schedule()
# (`--run-all` arrives as `run_all`, the positional argument as `import_path`).
@click.command()
@click.option("--quiet", "-q", is_flag=True, help="Return structured results without extra summary output")
@click.option("--add", is_flag=True, help="Create a new scheduled crawl")
@click.option("--every", type=str, help='Run on an alias like daily/weekly/monthly or a cron expression such as "0 */6 * * *"')
@click.option("--tag", "-t", default="", help="Comma-separated tags to apply to scheduled crawl snapshots")
# depth is restricted to the string choices "0".."4"; schedule() converts it
# to an int itself.
@click.option(
    "--depth",
    type=click.Choice([str(i) for i in range(5)]),
    default="0",
    help="Recursively archive linked pages up to N hops away",
)
@click.option("--overwrite", is_flag=True, help="Overwrite existing data if URLs have been archived previously")
# NOTE(review): as a plain flag with no explicit default this presumably passes
# update=False when omitted, which would bypass schedule()'s own
# `not ARCHIVING_CONFIG.ONLY_NEW` default - confirm this is intended.
@click.option("--update", is_flag=True, help="Retry previously failed/skipped URLs when scheduled crawls run")
@click.option("--clear", is_flag=True, help="Disable all currently enabled schedules")
@click.option("--show", is_flag=True, help="Print all currently enabled schedules")
@click.option("--foreground", "-f", is_flag=True, help="Run the global crawl runner in the foreground (no crontab required)")
# NOTE(review): the help text says "process them once", but schedule() only
# enqueues the schedules for run_all and then tells the user to start a runner;
# confirm the wording.
@click.option("--run-all", is_flag=True, help="Enqueue all enabled schedules immediately and process them once")
@click.argument("import_path", required=False)
# Receives schedule()'s docstring; what the decorator does with it is defined
# in archivebox.misc.util (presumably it sets the command's help text).
@docstring(schedule.__doc__)
def main(**kwargs):
    """Manage database-backed scheduled crawls processed by the crawl runner."""
    # The result dict returned by schedule() is intentionally not used here.
    schedule(**kwargs)


# Allow running this module directly as a script; invokes the click command.
if __name__ == "__main__":
    main()