# Files
# ArchiveBox/archivebox/workers/tasks.py
# 2026-03-21 11:47:57 -07:00
#
# 88 lines
# 2.7 KiB
# Python
"""
Background task functions for queuing work to the background runner.
These functions queue Snapshots/Crawls for processing by setting their status
to QUEUED so `archivebox run --daemon` or `archivebox server` can pick them up.
NOTE: These functions do NOT start the runner. They assume it's already
running via `archivebox server` or will be run inline by the CLI.
"""
__package__ = 'archivebox.workers'
from django.utils import timezone
def bg_add(add_kwargs: dict) -> int:
    """
    Add URLs and queue them for archiving.

    Args:
        add_kwargs: Keyword arguments forwarded to the ``add`` CLI command.
            Must be a non-empty dict containing a non-empty ``'urls'`` entry.

    Returns:
        The number of snapshots created (0 if ``add`` returned nothing).

    Raises:
        ValueError: If ``add_kwargs`` is falsy or lacks a ``'urls'`` entry.
    """
    # Validate BEFORE the project import and raise ValueError instead of
    # using `assert`: asserts are stripped under `python -O`, which would
    # silently let bad input through to `add()`.
    if not add_kwargs or not add_kwargs.get('urls'):
        raise ValueError("add_kwargs must include a non-empty 'urls' entry")

    from archivebox.cli.archivebox_add import add

    # Copy so the caller's dict is never mutated; force background mode
    # since this runs as a queued task.
    add_kwargs = {**add_kwargs, 'bg': True}
    _, result = add(**add_kwargs)
    return len(result) if result else 0
def bg_archive_snapshots(snapshots, kwargs: dict | None = None) -> int:
    """
    Queue multiple snapshots for archiving via the shared runner loop.

    Each snapshot with a persisted ``id`` is flipped to QUEUED with an
    immediate ``retry_at``; its parent crawl (if any, and not yet SEALED)
    is re-queued the same way so the runner re-visits it.

    Returns the number of snapshots queued.
    """
    from archivebox.core.models import Snapshot
    from archivebox.crawls.models import Crawl

    kwargs = kwargs or {}  # NOTE(review): currently unused — kept for interface compatibility

    total_queued = 0
    for snap in snapshots:
        if not hasattr(snap, 'id'):
            # Unsaved/malformed objects can't be queued by id; skip them.
            continue

        # Mark the snapshot ready for immediate pickup by the runner.
        Snapshot.objects.filter(id=snap.id).update(
            status=Snapshot.StatusChoices.QUEUED,
            retry_at=timezone.now(),
        )

        # Re-queue the parent crawl too, unless it has already been sealed.
        parent_crawl_id = getattr(snap, 'crawl_id', None)
        if parent_crawl_id:
            pending_crawls = Crawl.objects.filter(id=parent_crawl_id).exclude(
                status=Crawl.StatusChoices.SEALED,
            )
            pending_crawls.update(
                status=Crawl.StatusChoices.QUEUED,
                retry_at=timezone.now(),
            )

        total_queued += 1

    return total_queued
def bg_archive_snapshot(snapshot, overwrite: bool = False, methods: list | None = None) -> int:
    """
    Queue a single snapshot for archiving via the shared runner loop.

    NOTE(review): ``overwrite`` and ``methods`` are accepted for interface
    compatibility but are not applied here — confirm whether the runner is
    expected to honor them.

    Returns 1 if queued, 0 otherwise.
    """
    from archivebox.core.models import Snapshot
    from archivebox.crawls.models import Crawl

    # Without a persisted id there is nothing to queue.
    if not hasattr(snapshot, 'id'):
        return 0

    # Mark the snapshot ready for immediate pickup by the runner.
    Snapshot.objects.filter(id=snapshot.id).update(
        status=Snapshot.StatusChoices.QUEUED,
        retry_at=timezone.now(),
    )

    # Re-queue the parent crawl too, unless it has already been sealed.
    parent_crawl_id = getattr(snapshot, 'crawl_id', None)
    if parent_crawl_id:
        pending_crawls = Crawl.objects.filter(id=parent_crawl_id).exclude(
            status=Crawl.StatusChoices.SEALED,
        )
        pending_crawls.update(
            status=Crawl.StatusChoices.QUEUED,
            retry_at=timezone.now(),
        )

    return 1