mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
171 lines
7.0 KiB
Python
171 lines
7.0 KiB
Python
#!/usr/bin/env python3
|
|
|
|
__package__ = 'archivebox.cli'
|
|
|
|
import rich_click as click
|
|
from rich import print
|
|
|
|
from archivebox.misc.util import enforce_types, docstring
|
|
from archivebox.config.common import ARCHIVING_CONFIG
|
|
|
|
|
|
@enforce_types
def schedule(add: bool = False,
             show: bool = False,
             clear: bool = False,
             foreground: bool = False,
             run_all: bool = False,
             quiet: bool = False,
             every: str | None = None,
             tag: str = '',
             depth: int | str = 0,
             overwrite: bool = False,
             update: bool = not ARCHIVING_CONFIG.ONLY_NEW,
             import_path: str | None = None):
    """Manage database-backed scheduled crawls processed by the crawl runner."""
    # NOTE: the docstring above is passed to `@docstring(schedule.__doc__)` on the
    # `main` click command below, so it is kept as a single line and the detailed
    # documentation lives in these comments instead.
    #
    # Actions can be combined in one call and always run in this order:
    #   1. clear       -> disable every currently enabled CrawlSchedule
    #   2. every / add -> create a SEALED template Crawl plus a CrawlSchedule using it
    #   3. show        -> print the enabled schedules
    #   4. run_all     -> enqueue every enabled schedule immediately
    #   5. foreground  -> call run_pending_crawls(daemon=True)
    # With none of those flags set, the enabled schedules are listed (unless quiet).
    #
    # Parameters:
    #   add         create a schedule even when `every` is not given (uses 'day').
    #   show        print the enabled schedules including their first source URL.
    #   clear       disable all enabled schedules.
    #   foreground  start the crawl runner from this process.
    #   run_all     enqueue all enabled schedules now.
    #   quiet       skip the fallback listing that is printed when no action flag is set.
    #   every       schedule expression handed to validate_schedule().
    #   tag, depth  stored on the template crawl; only applied when import_path is set.
    #   overwrite   stored in the template config as OVERWRITE.
    #   update      stored (negated) in the template config as ONLY_NEW.
    #   import_path source stored as the template's urls; when omitted the schedule
    #               targets 'archivebox://update' with depth 0 and no tags.
    #
    # Returns a dict with the keys 'created_schedule_ids', 'disabled_count',
    # 'run_all_enqueued' and 'active_schedule_ids'.

    from django.db import transaction
    from django.utils import timezone

    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.crawls.models import Crawl, CrawlSchedule
    from archivebox.crawls.schedule_utils import validate_schedule
    from archivebox.services.runner import run_pending_crawls

    depth = int(depth)

    # Validate user input before any database write: previously an invalid
    # `every` combined with `clear` disabled all schedules and only then failed.
    # (validate_schedule is expected to raise on a bad expression -- confirm in schedule_utils.)
    wants_create = bool(every or add)
    schedule_str = ''
    if wants_create:
        schedule_str = (every or 'day').strip()
        validate_schedule(schedule_str)

    result: dict[str, object] = {
        'created_schedule_ids': [],
        'disabled_count': 0,
        'run_all_enqueued': 0,
        'active_schedule_ids': [],
    }

    def _active_schedules():
        return CrawlSchedule.objects.filter(is_enabled=True).select_related('template').order_by('created_at')

    if clear:
        # Bulk update: returns the number of rows that were switched off.
        disabled_count = CrawlSchedule.objects.filter(is_enabled=True).update(
            is_enabled=False,
            modified_at=timezone.now(),
        )
        result['disabled_count'] = disabled_count
        print(f'[green]\\[√] Disabled {disabled_count} scheduled crawl(s).[/green]')

    if wants_create:
        created_by_id = get_or_create_system_user_pk()
        # Without an import_path the schedule is a recurring maintenance update.
        is_update_schedule = not import_path
        template_urls = import_path or 'archivebox://update'
        template_label = (
            f'Scheduled import: {template_urls}'
            if import_path else
            'Scheduled ArchiveBox update'
        )[:64]
        template_notes = (
            f'Created by archivebox schedule for {template_urls}'
            if import_path else
            'Created by archivebox schedule to queue recurring archivebox://update maintenance crawls.'
        )

        # Create the template and its schedule together so a failure while
        # creating the schedule does not leave an orphaned template crawl behind.
        with transaction.atomic():
            template = Crawl.objects.create(
                urls=template_urls,
                max_depth=0 if is_update_schedule else depth,
                tags_str='' if is_update_schedule else tag,
                label=template_label,
                notes=template_notes,
                created_by_id=created_by_id,
                status=Crawl.StatusChoices.SEALED,
                retry_at=None,
                config={
                    'ONLY_NEW': not update,
                    'OVERWRITE': overwrite,
                    'DEPTH': 0 if is_update_schedule else depth,
                    'SCHEDULE_KIND': 'update' if is_update_schedule else 'crawl',
                },
            )
            crawl_schedule = CrawlSchedule.objects.create(
                template=template,
                schedule=schedule_str,
                is_enabled=True,
                label=template_label,
                notes=template_notes,
                created_by_id=created_by_id,
            )
        result['created_schedule_ids'] = [str(crawl_schedule.id)]

        schedule_type = 'maintenance update' if is_update_schedule else 'crawl'
        print(f'[green]\\[√] Created scheduled {schedule_type}.[/green]')
        print(f' id={crawl_schedule.id}')
        print(f' every={crawl_schedule.schedule}')
        print(f' next_run={crawl_schedule.next_run_at.isoformat()}')
        if import_path:
            print(f' source={import_path}')

    # Snapshot taken after clear/create so it reflects the changes made above.
    schedules = list(_active_schedules())
    result['active_schedule_ids'] = [str(active.id) for active in schedules]

    if show:
        if schedules:
            print(f'[green]\\[*] Active scheduled crawls: {len(schedules)}[/green]')
            for scheduled_crawl in schedules:
                template = scheduled_crawl.template
                print(
                    f' - id={scheduled_crawl.id} every={scheduled_crawl.schedule} '
                    f'next_run={scheduled_crawl.next_run_at.isoformat()} '
                    f'source={template.urls.splitlines()[0] if template.urls else ""}'
                )
        else:
            print('[yellow]\\[*] No scheduled crawls are enabled.[/yellow]')

    if run_all:
        # One shared timestamp so every schedule is queued for the same instant.
        now = timezone.now()
        for scheduled_crawl in schedules:
            scheduled_crawl.enqueue(queued_at=now)
        enqueued = len(schedules)
        result['run_all_enqueued'] = enqueued
        print(f'[green]\\[*] Enqueued {enqueued} scheduled crawl(s) immediately.[/green]')
        if enqueued:
            # Enqueuing alone does not process anything; a runner has to pick the work up.
            print('[yellow]\\[*] Start `archivebox server`, `archivebox run --daemon`, or `archivebox schedule --foreground` to process the queued crawls.[/yellow]')

    if foreground:
        print('[green]\\[*] Starting global crawl runner in foreground mode. It will materialize scheduled crawls and process queued work.[/green]')
        # NOTE(review): presumably blocks until the runner stops -- confirm in services.runner.
        run_pending_crawls(daemon=True)

    if quiet:
        return result

    # No action flag given: fall back to listing the enabled schedules.
    if not any((every, add, show, clear, foreground, run_all)):
        if schedules:
            print('[green]\\[*] Active scheduled crawls:[/green]')
            for scheduled_crawl in schedules:
                print(f' - {scheduled_crawl.id} every={scheduled_crawl.schedule} next_run={scheduled_crawl.next_run_at.isoformat()}')
        else:
            print('[yellow]\\[*] No scheduled crawls are enabled.[/yellow]')

    return result
|
|
|
|
|
|
@click.command()
@click.option('--quiet', '-q', is_flag=True, help="Return structured results without extra summary output")
@click.option('--add', is_flag=True, help='Create a new scheduled crawl')
@click.option('--every', type=str, help='Run on an alias like daily/weekly/monthly or a cron expression such as "0 */6 * * *"')
@click.option('--tag', '-t', default='', help='Comma-separated tags to apply to scheduled crawl snapshots')
@click.option('--depth', type=click.Choice([str(i) for i in range(5)]), default='0', help='Recursively archive linked pages up to N hops away')
@click.option('--overwrite', is_flag=True, help='Overwrite existing data if URLs have been archived previously')
@click.option('--update', is_flag=True, help='Retry previously failed/skipped URLs when scheduled crawls run')
@click.option('--clear', is_flag=True, help='Disable all currently enabled schedules')
@click.option('--show', is_flag=True, help='Print all currently enabled schedules')
@click.option('--foreground', '-f', is_flag=True, help='Run the global crawl runner in the foreground (no crontab required)')
# Help text corrected: schedule() only enqueues for --run-all, it does not process the crawls itself.
@click.option('--run-all', is_flag=True, help='Enqueue all enabled schedules immediately (a crawl runner must be running to process them)')
@click.argument('import_path', required=False)
@docstring(schedule.__doc__)
def main(**kwargs):
    """Manage database-backed scheduled crawls processed by the crawl runner."""
    # CLI entry point: click collects the options/argument declared above into
    # kwargs whose names match the parameters of schedule(), which does the work.
    # The structured result returned by schedule() is intentionally not used here.
    schedule(**kwargs)
|
|
|
|
|
|
# Run the click command when this module is executed directly as a script.
if __name__ == "__main__":
    main()
|