# Files
# ArchiveBox/archivebox/workers/tasks.py
# 2026-03-21 11:47:57 -07:00
#
# 88 lines
# 2.7 KiB
# Python
"""
Background task functions for queuing work to the background runner.
These functions queue Snapshots/Crawls for processing by setting their status
to QUEUED so `archivebox run --daemon` or `archivebox server` can pick them up.
NOTE: These functions do NOT start the runner. They assume it's already
running via `archivebox server` or will be run inline by the CLI.
"""
__package__ = 'archivebox.workers'
from django.utils import timezone
def bg_add(add_kwargs: dict) -> int:
    """
    Add URLs and queue them for archiving.

    Args:
        add_kwargs: Keyword arguments forwarded to the ``add`` CLI command.
            Must be a non-empty dict containing a non-empty ``'urls'`` entry.

    Returns:
        The number of snapshots created (0 if ``add`` returned nothing).

    Raises:
        ValueError: If ``add_kwargs`` is falsy or lacks a ``'urls'`` entry.
    """
    # Validate BEFORE the project import and raise ValueError instead of
    # using `assert`: asserts are stripped under `python -O`, which would
    # silently let bad input through to `add()`.
    if not add_kwargs or not add_kwargs.get('urls'):
        raise ValueError("add_kwargs must include a non-empty 'urls' entry")

    from archivebox.cli.archivebox_add import add

    # Copy so the caller's dict is never mutated; force background mode
    # since this runs as a queued task.
    add_kwargs = {**add_kwargs, 'bg': True}
    _, result = add(**add_kwargs)
    return len(result) if result else 0
def bg_archive_snapshots(snapshots, kwargs: dict | None = None) -> int:
    """
    Queue multiple snapshots for archiving via the shared runner loop.

    Each snapshot with a persisted ``id`` is flipped to QUEUED with an
    immediate ``retry_at``; its parent crawl (if any, and not yet SEALED)
    is re-queued the same way so the runner re-visits it.

    Returns the number of snapshots queued.
    """
    from archivebox.core.models import Snapshot
    from archivebox.crawls.models import Crawl

    kwargs = kwargs or {}  # NOTE(review): currently unused — kept for interface compatibility

    total_queued = 0
    for snap in snapshots:
        if not hasattr(snap, 'id'):
            # Unsaved/malformed objects can't be queued by id; skip them.
            continue

        # Mark the snapshot ready for immediate pickup by the runner.
        Snapshot.objects.filter(id=snap.id).update(
            status=Snapshot.StatusChoices.QUEUED,
            retry_at=timezone.now(),
        )

        # Re-queue the parent crawl too, unless it has already been sealed.
        parent_crawl_id = getattr(snap, 'crawl_id', None)
        if parent_crawl_id:
            pending_crawls = Crawl.objects.filter(id=parent_crawl_id).exclude(
                status=Crawl.StatusChoices.SEALED,
            )
            pending_crawls.update(
                status=Crawl.StatusChoices.QUEUED,
                retry_at=timezone.now(),
            )

        total_queued += 1

    return total_queued
def bg_archive_snapshot(snapshot, overwrite: bool = False, methods: list | None = None) -> int:
    """
    Queue a single snapshot for archiving via the shared runner loop.

    NOTE(review): ``overwrite`` and ``methods`` are accepted for interface
    compatibility but are not applied here — confirm whether the runner is
    expected to honor them.

    Returns 1 if queued, 0 otherwise.
    """
    from archivebox.core.models import Snapshot
    from archivebox.crawls.models import Crawl

    # Without a persisted id there is nothing to queue.
    if not hasattr(snapshot, 'id'):
        return 0

    # Mark the snapshot ready for immediate pickup by the runner.
    Snapshot.objects.filter(id=snapshot.id).update(
        status=Snapshot.StatusChoices.QUEUED,
        retry_at=timezone.now(),
    )

    # Re-queue the parent crawl too, unless it has already been sealed.
    parent_crawl_id = getattr(snapshot, 'crawl_id', None)
    if parent_crawl_id:
        pending_crawls = Crawl.objects.filter(id=parent_crawl_id).exclude(
            status=Crawl.StatusChoices.SEALED,
        )
        pending_crawls.update(
            status=Crawl.StatusChoices.QUEUED,
            retry_at=timezone.now(),
        )

    return 1