#!/usr/bin/env python3
__package__ = 'archivebox.cli'

import re  # used by the 'domain' LINK_FILTERS pattern below

import rich_click as click

from typing import Iterable

from archivebox.misc.util import enforce_types, docstring
from archivebox.misc.folders import (
    get_indexed_folders,
    get_archived_folders,
    get_unarchived_folders,
    get_present_folders,
    get_valid_folders,
    get_invalid_folders,
    get_duplicate_folders,
    get_orphaned_folders,
    get_corrupted_folders,
    get_unrecognized_folders,
)

# Filter types for URL matching (each maps a CLI pattern to a Django ORM lookup dict)
LINK_FILTERS = {
    'exact':     lambda pattern: {'url': pattern},
    'substring': lambda pattern: {'url__icontains': pattern},
    'regex':     lambda pattern: {'url__iregex': pattern},
    # match the domain under both http:// and https://, with or without a www. prefix
    'domain':    lambda pattern: {'url__iregex': rf'^https?://(www\.)?{re.escape(pattern)}'},
    'tag':       lambda pattern: {'tags__name': pattern},
    'timestamp': lambda pattern: {'timestamp': pattern},
}
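
# For example (illustrative only), each entry expands to an ordinary Django
# queryset lookup that could be splatted into .filter(), assuming the Snapshot
# model's `url` field holds the full URL:
#
#   LINK_FILTERS['substring']('example.com')  ->  {'url__icontains': 'example.com'}
#   Snapshot.objects.filter(**LINK_FILTERS['substring']('example.com'))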


@enforce_types
def update(filter_patterns: Iterable[str]=(),
           only_new: bool=False,
           index_only: bool=False,
           resume: float | None=None,
           overwrite: bool=False,
           before: float | None=None,
           after: float | None=None,
           status: str='indexed',
           filter_type: str='exact',
           plugins: str="",
           max_workers: int=4) -> None:
    """Import any new links from subscriptions and retry any previously failed/skipped links"""
    from rich import print

    from archivebox.config.django import setup_django
    setup_django()

    from django.utils import timezone
    from core.models import Snapshot
    from workers.orchestrator import parallel_archive

    # Get snapshots to update based on filters
    snapshots = Snapshot.objects.all()

    if filter_patterns:
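        # filter_by_patterns is assumed to be a custom Snapshot manager method that
        # applies one LINK_FILTERS-style lookup per pattern (see the mapping above);
        # its exact semantics are defined in core.models, not in this file.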
        snapshots = Snapshot.objects.filter_by_patterns(list(filter_patterns), filter_type)

    if status == 'unarchived':
        snapshots = snapshots.filter(downloaded_at__isnull=True)
    elif status == 'archived':
        snapshots = snapshots.filter(downloaded_at__isnull=False)
    # any other --status choice (indexed, present, valid, ...) currently applies no extra filtering here

    if before or after:
        # use timezone-aware datetimes so comparisons against bookmarked_at behave correctly when USE_TZ is enabled
        from datetime import datetime, timezone as dt_timezone
        if before:
            snapshots = snapshots.filter(bookmarked_at__lt=datetime.fromtimestamp(before, tz=dt_timezone.utc))
        if after:
            snapshots = snapshots.filter(bookmarked_at__gt=datetime.fromtimestamp(after, tz=dt_timezone.utc))

    if resume:
        snapshots = snapshots.filter(timestamp__gte=str(resume))

    snapshot_ids = list(snapshots.values_list('pk', flat=True))
    if not snapshot_ids:
        print('[yellow]No snapshots found matching the given filters[/yellow]')
        return

    print(f'[green]\\[*] Found {len(snapshot_ids)} snapshots to update[/green]')

    if index_only:
        print('[yellow]Index-only mode - skipping archiving[/yellow]')
        return

    methods = plugins.split(',') if plugins else None

    # Queue snapshots for archiving via the state machine system
    # Workers will pick them up and run the plugins
    if len(snapshot_ids) > 1 and max_workers > 1:
        parallel_archive(snapshot_ids, max_workers=max_workers, overwrite=overwrite, methods=methods)
    else:
        # Queue snapshots by setting status to queued
        for snapshot in snapshots:
            Snapshot.objects.filter(id=snapshot.id).update(
                status=Snapshot.StatusChoices.QUEUED,
                retry_at=timezone.now(),
            )
        print(f'[green]Queued {len(snapshot_ids)} snapshots for archiving[/green]')
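
# Note (assumption, for context): once a Snapshot is marked QUEUED with a
# retry_at in the past, a separate worker process is expected to claim it,
# e.g. with a query roughly like:
#   Snapshot.objects.filter(status=Snapshot.StatusChoices.QUEUED, retry_at__lte=timezone.now())
# The actual scheduling lives in workers.orchestrator / the state machine system, not here.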


@click.command()
@click.option('--only-new', is_flag=True, help="Don't attempt to retry previously skipped/failed links when updating")
@click.option('--index-only', is_flag=True, help="Update the main index without archiving any content")
@click.option('--resume', type=float, help='Resume the update process from a given timestamp')
@click.option('--overwrite', '-F', is_flag=True, help='Ignore existing archived content and overwrite with new versions (DANGEROUS)')
@click.option('--before', type=float, help="Update only links bookmarked before the given timestamp")
@click.option('--after', type=float, help="Update only links bookmarked after the given timestamp")
@click.option('--status', type=click.Choice([
        'indexed', 'archived', 'unarchived',
        'present', 'valid', 'invalid',
        'duplicate', 'orphaned', 'corrupted', 'unrecognized',
    ]), default='indexed', help=f'''
Update only links or data directories that have the given status:
    indexed       {get_indexed_folders.__doc__} (the default)
    archived      {get_archived_folders.__doc__}
    unarchived    {get_unarchived_folders.__doc__}
    present       {get_present_folders.__doc__}
    valid         {get_valid_folders.__doc__}
    invalid       {get_invalid_folders.__doc__}
    duplicate     {get_duplicate_folders.__doc__}
    orphaned      {get_orphaned_folders.__doc__}
    corrupted     {get_corrupted_folders.__doc__}
    unrecognized  {get_unrecognized_folders.__doc__}
''')
@click.option('--filter-type', '-t', type=click.Choice([*LINK_FILTERS.keys(), 'search']), default='exact', help='Type of pattern matching to use when filtering URLs')
@click.option('--plugins', '-p', default='', help='Comma-separated list of plugins to use e.g. title,favicon,screenshot,singlefile,...')
@click.option('--max-workers', '-j', type=int, default=4, help='Number of parallel worker processes for archiving')
@click.argument('filter_patterns', nargs=-1)
@docstring(update.__doc__)
def main(**kwargs):
    """Import any new links from subscriptions and retry any previously failed/skipped links"""
    update(**kwargs)


if __name__ == '__main__':
    main()
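
# Example CLI usage (URLs/domains below are purely illustrative):
#
#   archivebox update                                        # re-check every indexed snapshot
#   archivebox update --status=unarchived                    # only snapshots with no archived content yet
#   archivebox update --filter-type=domain example.com       # only snapshots whose URL is on example.com
#   archivebox update -F --plugins=title,favicon,screenshot 'https://example.com/page'
#
# Roughly equivalent programmatic call (same kwargs as the click options above):
#   update(filter_patterns=('https://example.com/page',), filter_type='exact', overwrite=True, plugins='title,favicon,screenshot')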


# LEGACY VERSION:
#
# @enforce_types
# def update(resume: Optional[float]=None,
#            only_new: bool=ARCHIVING_CONFIG.ONLY_NEW,
#            index_only: bool=False,
#            overwrite: bool=False,
#            filter_patterns_str: Optional[str]=None,
#            filter_patterns: Optional[List[str]]=None,
#            filter_type: Optional[str]=None,
#            status: Optional[str]=None,
#            after: Optional[str]=None,
#            before: Optional[str]=None,
#            extractors: str="",
#            out_dir: Path=DATA_DIR) -> List[Link]:
#     """Import any new links from subscriptions and retry any previously failed/skipped links"""
#
#     from core.models import ArchiveResult
#     from .search import index_links
#     # from workers.supervisord_util import start_cli_workers
#
#     check_data_folder()
#     # start_cli_workers()
#
#     new_links: List[Link] = []  # TODO: Remove input argument: only_new
#
#     extractors = extractors.split(",") if extractors else []
#
#     # Step 1: Filter for selected_links
#     print('[*] Finding matching Snapshots to update...')
#     print(f' - Filtering by {" ".join(filter_patterns)} ({filter_type}) {before=} {after=} {status=}...')
#     matching_snapshots = list_links(
#         filter_patterns=filter_patterns,
#         filter_type=filter_type,
#         before=before,
#         after=after,
#     )
#
#     print(f' - Checking {matching_snapshots.count()} snapshot folders for existing data with {status=}...')
#     matching_folders = list_folders(
#         links=matching_snapshots,
#         status=status,
#         out_dir=out_dir,
#     )
#     all_links = (link for link in matching_folders.values() if link)
#
#     print(' - Sorting by most unfinished -> least unfinished + date archived...')
#     all_links = sorted(all_links, key=lambda link: (ArchiveResult.objects.filter(snapshot__url=link.url).count(), link.timestamp))
#
#     if index_only:
#         for link in all_links:
#             write_link_details(link, out_dir=out_dir, skip_sql_index=True)
#         index_links(all_links, out_dir=out_dir)
#         return all_links
#
#     # Step 2: Run the archive methods for each link
#     to_archive = new_links if only_new else all_links
#     if resume:
#         to_archive = [
#             link for link in to_archive
#             if link.timestamp >= str(resume)
#         ]
#
#     if not to_archive:
#         stderr('')
#         stderr(f'[√] Nothing found to resume after {resume}', color='green')
#         return all_links
#
#     archive_kwargs = {
#         "out_dir": out_dir,
#     }
#     if extractors:
#         archive_kwargs["methods"] = extractors
#
#     archive_links(to_archive, overwrite=overwrite, **archive_kwargs)
#
#     # Step 4: Re-write links index with updated titles, icons, and resources
#     all_links = load_main_index(out_dir=out_dir)
#     return all_links