wip major changes

Author: Nick Sweeting
Date: 2025-12-24 20:09:51 -08:00
parent c1335fed37
commit 1915333b81
450 changed files with 35814 additions and 19015 deletions


@@ -8,8 +8,7 @@ import rich_click as click
 from typing import Iterable
 from archivebox.misc.util import enforce_types, docstring
-from archivebox.index import (
-    LINK_FILTERS,
+from archivebox.misc.folders import (
     get_indexed_folders,
     get_archived_folders,
     get_unarchived_folders,
@@ -22,6 +21,16 @@ from archivebox.index import (
     get_unrecognized_folders,
 )
+
+# Filter types for URL matching
+LINK_FILTERS = {
+    'exact': lambda pattern: {'url': pattern},
+    'substring': lambda pattern: {'url__icontains': pattern},
+    'regex': lambda pattern: {'url__iregex': pattern},
+    'domain': lambda pattern: {'url__istartswith': f'http://{pattern}'},
+    'tag': lambda pattern: {'tags__name': pattern},
+    'timestamp': lambda pattern: {'timestamp': pattern},
+}
 
 @enforce_types
 def update(filter_patterns: Iterable[str]=(),
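The LINK_FILTERS table added above maps each --filter-type choice to Django ORM
lookup kwargs. As a rough illustration (not part of this commit), here is how
those lambdas expand, and how a helper like the filter_by_patterns() manager
method called later in this diff could OR them together across multiple
patterns; the real method is not shown here, so treat this as an assumption
about its behavior:

    # Hypothetical sketch: OR-ing LINK_FILTERS lookups across patterns.
    from django.db.models import Q

    def filter_by_patterns_sketch(qs, patterns, filter_type='exact'):
        q = Q()
        for pattern in patterns:
            # e.g. LINK_FILTERS['substring']('example.com')
            #   -> {'url__icontains': 'example.com'}
            q |= Q(**LINK_FILTERS[filter_type](pattern))
        return qs.filter(q)

Note that as written the 'domain' lambda only prepends http://, so snapshots of
https:// URLs would not match a --filter-type=domain pattern; whether that is
handled elsewhere or is a known limitation of this WIP commit is not visible in
this diff.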
@@ -33,15 +42,66 @@ def update(filter_patterns: Iterable[str]=(),
            after: float | None=None,
            status: str='indexed',
            filter_type: str='exact',
-           extract: str="") -> None:
+           plugins: str="",
+           max_workers: int=4) -> None:
     """Import any new links from subscriptions and retry any previously failed/skipped links"""
     from rich import print
     from archivebox.config.django import setup_django
     setup_django()
-    from workers.orchestrator import Orchestrator
-    orchestrator = Orchestrator(exit_on_idle=False)
-    orchestrator.start()
+    from django.utils import timezone
+    from core.models import Snapshot
+    from workers.orchestrator import parallel_archive
+
+    # Get snapshots to update based on filters
+    snapshots = Snapshot.objects.all()
+    if filter_patterns:
+        snapshots = Snapshot.objects.filter_by_patterns(list(filter_patterns), filter_type)
+
+    if status == 'unarchived':
+        snapshots = snapshots.filter(downloaded_at__isnull=True)
+    elif status == 'archived':
+        snapshots = snapshots.filter(downloaded_at__isnull=False)
+
+    if before:
+        from datetime import datetime
+        snapshots = snapshots.filter(bookmarked_at__lt=datetime.fromtimestamp(before))
+    if after:
+        from datetime import datetime
+        snapshots = snapshots.filter(bookmarked_at__gt=datetime.fromtimestamp(after))
+    if resume:
+        snapshots = snapshots.filter(timestamp__gte=str(resume))
+
+    snapshot_ids = list(snapshots.values_list('pk', flat=True))
+    if not snapshot_ids:
+        print('[yellow]No snapshots found matching the given filters[/yellow]')
+        return
+
+    print(f'[green]\\[*] Found {len(snapshot_ids)} snapshots to update[/green]')
+
+    if index_only:
+        print('[yellow]Index-only mode - skipping archiving[/yellow]')
+        return
+
+    methods = plugins.split(',') if plugins else None
+
+    # Queue snapshots for archiving via the state machine system
+    # Workers will pick them up and run the plugins
+    if len(snapshot_ids) > 1 and max_workers > 1:
+        parallel_archive(snapshot_ids, max_workers=max_workers, overwrite=overwrite, methods=methods)
+    else:
+        # Queue snapshots by setting status to queued
+        for snapshot in snapshots:
+            Snapshot.objects.filter(id=snapshot.id).update(
+                status=Snapshot.StatusChoices.QUEUED,
+                retry_at=timezone.now(),
+            )
+
+    print(f'[green]Queued {len(snapshot_ids)} snapshots for archiving[/green]')
 @click.command()
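The queueing branch above never runs plugins itself: it either fans out through
parallel_archive() or just flips rows to QUEUED with retry_at set to now, and
relies on separate worker processes to pick them up. The worker side is not
part of this hunk, but based on the fields used here (status, retry_at), the
consumer presumably polls for due snapshots along these lines (a sketch, not
the actual Orchestrator code):

    # Hypothetical worker-side poll, assuming workers claim snapshots
    # whose retry_at has come due.
    from django.utils import timezone
    from core.models import Snapshot

    def claim_next_snapshot():
        return (
            Snapshot.objects
            .filter(status=Snapshot.StatusChoices.QUEUED,
                    retry_at__lte=timezone.now())
            .order_by('retry_at')
            .first()
        )

One quirk worth flagging for review: the single-worker branch updates rows one
at a time in a loop, where a single snapshots.update(status=..., retry_at=...)
queryset call would queue them all in one SQL statement.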
@@ -71,7 +131,8 @@ Update only links or data directories that have the given status:
     unrecognized  {get_unrecognized_folders.__doc__}
 ''')
 @click.option('--filter-type', '-t', type=click.Choice([*LINK_FILTERS.keys(), 'search']), default='exact', help='Type of pattern matching to use when filtering URLs')
-@click.option('--extract', '-e', default='', help='Comma-separated list of extractors to use e.g. title,favicon,screenshot,singlefile,...')
+@click.option('--plugins', '-p', default='', help='Comma-separated list of plugins to use e.g. title,favicon,screenshot,singlefile,...')
+@click.option('--max-workers', '-j', type=int, default=4, help='Number of parallel worker processes for archiving')
 @click.argument('filter_patterns', nargs=-1)
 @docstring(update.__doc__)
 def main(**kwargs):
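For reference, a direct call with the new parameters (a sketch: the module path
archivebox/cli/archivebox_update.py is inferred from context rather than named
in this extract, and only parameters visible in this diff are passed):

    # Example invocation of the new update() signature.
    from archivebox.cli.archivebox_update import update

    update(
        filter_patterns=['example.com'],
        filter_type='domain',        # LINK_FILTERS['domain'] lookup
        status='unarchived',         # only snapshots never downloaded
        plugins='title,screenshot',  # replaces the removed --extract flag
        max_workers=8,               # fan out via parallel_archive()
    )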