wip major changes

Author: Nick Sweeting
Date: 2025-12-24 20:09:51 -08:00
parent c1335fed37
commit 1915333b81
450 changed files with 35814 additions and 19015 deletions


@@ -8,8 +8,7 @@ import rich_click as click
 from typing import Iterable
 from archivebox.misc.util import enforce_types, docstring
-from archivebox.index import (
-    LINK_FILTERS,
+from archivebox.misc.folders import (
     get_indexed_folders,
     get_archived_folders,
     get_unarchived_folders,
@@ -22,6 +21,16 @@ from archivebox.index import (
     get_unrecognized_folders,
 )
+
+# Filter types for URL matching
+LINK_FILTERS = {
+    'exact': lambda pattern: {'url': pattern},
+    'substring': lambda pattern: {'url__icontains': pattern},
+    'regex': lambda pattern: {'url__iregex': pattern},
+    'domain': lambda pattern: {'url__istartswith': f'http://{pattern}'},
+    'tag': lambda pattern: {'tags__name': pattern},
+    'timestamp': lambda pattern: {'timestamp': pattern},
+}
 
 @enforce_types
 def update(filter_patterns: Iterable[str]=(),
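The LINK_FILTERS table added above maps each --filter-type choice to Django ORM
lookup kwargs. As a rough illustration (not part of this commit), here is how
those lambdas expand, and how a helper like the filter_by_patterns() manager
method called later in this diff could OR them together across multiple
patterns; the real method is not shown here, so treat this as an assumption
about its behavior:

    # Hypothetical sketch: OR-ing LINK_FILTERS lookups across patterns.
    from django.db.models import Q

    def filter_by_patterns_sketch(qs, patterns, filter_type='exact'):
        q = Q()
        for pattern in patterns:
            # e.g. LINK_FILTERS['substring']('example.com')
            #   -> {'url__icontains': 'example.com'}
            q |= Q(**LINK_FILTERS[filter_type](pattern))
        return qs.filter(q)

Note that as written the 'domain' lambda only prepends http://, so snapshots of
https:// URLs would not match a --filter-type=domain pattern; whether that is
handled elsewhere or is a known limitation of this WIP commit is not visible in
this diff.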
@@ -33,15 +42,66 @@ def update(filter_patterns: Iterable[str]=(),
            after: float | None=None,
            status: str='indexed',
            filter_type: str='exact',
-           extract: str="") -> None:
+           plugins: str="",
+           max_workers: int=4) -> None:
     """Import any new links from subscriptions and retry any previously failed/skipped links"""
     from rich import print
     from archivebox.config.django import setup_django
     setup_django()
-    from workers.orchestrator import Orchestrator
-    orchestrator = Orchestrator(exit_on_idle=False)
-    orchestrator.start()
+    from django.utils import timezone
+    from core.models import Snapshot
+    from workers.orchestrator import parallel_archive
+
+    # Get snapshots to update based on filters
+    snapshots = Snapshot.objects.all()
+    if filter_patterns:
+        snapshots = Snapshot.objects.filter_by_patterns(list(filter_patterns), filter_type)
+
+    if status == 'unarchived':
+        snapshots = snapshots.filter(downloaded_at__isnull=True)
+    elif status == 'archived':
+        snapshots = snapshots.filter(downloaded_at__isnull=False)
+
+    if before:
+        from datetime import datetime
+        snapshots = snapshots.filter(bookmarked_at__lt=datetime.fromtimestamp(before))
+    if after:
+        from datetime import datetime
+        snapshots = snapshots.filter(bookmarked_at__gt=datetime.fromtimestamp(after))
+    if resume:
+        snapshots = snapshots.filter(timestamp__gte=str(resume))
+
+    snapshot_ids = list(snapshots.values_list('pk', flat=True))
+    if not snapshot_ids:
+        print('[yellow]No snapshots found matching the given filters[/yellow]')
+        return
+
+    print(f'[green]\\[*] Found {len(snapshot_ids)} snapshots to update[/green]')
+
+    if index_only:
+        print('[yellow]Index-only mode - skipping archiving[/yellow]')
+        return
+
+    methods = plugins.split(',') if plugins else None
+
+    # Queue snapshots for archiving via the state machine system
+    # Workers will pick them up and run the plugins
+    if len(snapshot_ids) > 1 and max_workers > 1:
+        parallel_archive(snapshot_ids, max_workers=max_workers, overwrite=overwrite, methods=methods)
+    else:
+        # Queue snapshots by setting status to queued
+        for snapshot in snapshots:
+            Snapshot.objects.filter(id=snapshot.id).update(
+                status=Snapshot.StatusChoices.QUEUED,
+                retry_at=timezone.now(),
+            )
+
+    print(f'[green]Queued {len(snapshot_ids)} snapshots for archiving[/green]')
 @click.command()
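The queueing branch above never runs plugins itself: it either fans out through
parallel_archive() or just flips rows to QUEUED with retry_at set to now, and
relies on separate worker processes to pick them up. The worker side is not
part of this hunk, but based on the fields used here (status, retry_at), the
consumer presumably polls for due snapshots along these lines (a sketch, not
the actual Orchestrator code):

    # Hypothetical worker-side poll, assuming workers claim snapshots
    # whose retry_at has come due.
    from django.utils import timezone
    from core.models import Snapshot

    def claim_next_snapshot():
        return (
            Snapshot.objects
            .filter(status=Snapshot.StatusChoices.QUEUED,
                    retry_at__lte=timezone.now())
            .order_by('retry_at')
            .first()
        )

One quirk worth flagging for review: the single-worker branch updates rows one
at a time in a loop, where a single snapshots.update(status=..., retry_at=...)
queryset call would queue them all in one SQL statement.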
@@ -71,7 +131,8 @@ Update only links or data directories that have the given status:
     unrecognized  {get_unrecognized_folders.__doc__}
 ''')
 @click.option('--filter-type', '-t', type=click.Choice([*LINK_FILTERS.keys(), 'search']), default='exact', help='Type of pattern matching to use when filtering URLs')
-@click.option('--extract', '-e', default='', help='Comma-separated list of extractors to use e.g. title,favicon,screenshot,singlefile,...')
+@click.option('--plugins', '-p', default='', help='Comma-separated list of plugins to use e.g. title,favicon,screenshot,singlefile,...')
+@click.option('--max-workers', '-j', type=int, default=4, help='Number of parallel worker processes for archiving')
 @click.argument('filter_patterns', nargs=-1)
 @docstring(update.__doc__)
 def main(**kwargs):
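For reference, a direct call with the new parameters (a sketch: the module path
archivebox/cli/archivebox_update.py is inferred from context rather than named
in this extract, and only parameters visible in this diff are passed):

    # Example invocation of the new update() signature.
    from archivebox.cli.archivebox_update import update

    update(
        filter_patterns=['example.com'],
        filter_type='domain',        # LINK_FILTERS['domain'] lookup
        status='unarchived',         # only snapshots never downloaded
        plugins='title,screenshot',  # replaces the removed --extract flag
        max_workers=8,               # fan out via parallel_archive()
    )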