Mirror of https://github.com/ArchiveBox/ArchiveBox.git
(last synced 2026-01-03 01:15:57 +10:00)
wip major changes
This commit is contained in:
@@ -8,8 +8,7 @@ import rich_click as click
|
||||
from typing import Iterable
|
||||
|
||||
from archivebox.misc.util import enforce_types, docstring
|
||||
from archivebox.index import (
|
||||
LINK_FILTERS,
|
||||
from archivebox.misc.folders import (
|
||||
get_indexed_folders,
|
||||
get_archived_folders,
|
||||
get_unarchived_folders,
|
||||
@@ -22,6 +21,16 @@ from archivebox.index import (
|
||||
get_unrecognized_folders,
|
||||
)
|
||||
|
||||
# Mapping of filter-type name -> callable(pattern) -> Django ORM filter kwargs.
# Each value returns a dict suitable for QuerySet.filter(**kwargs), letting the
# CLI translate a --filter-type/-t choice plus a pattern into an ORM lookup.
def _domain_filter(pattern: str) -> dict:
    """Return filter kwargs matching URLs on *pattern*'s domain (any scheme).

    The previous implementation used url__istartswith='http://{pattern}',
    which silently missed every https:// URL; an anchored case-insensitive
    regex matches both schemes. re.escape() keeps dots in the domain literal.
    """
    import re
    return {'url__iregex': rf'^https?://{re.escape(pattern)}'}

LINK_FILTERS = {
    'exact': lambda pattern: {'url': pattern},                 # URL equals pattern exactly
    'substring': lambda pattern: {'url__icontains': pattern},  # URL contains pattern (case-insensitive)
    'regex': lambda pattern: {'url__iregex': pattern},         # URL matches user-supplied regex
    'domain': _domain_filter,                                  # URL is on the given domain (http or https)
    'tag': lambda pattern: {'tags__name': pattern},            # snapshot tagged with pattern
    'timestamp': lambda pattern: {'timestamp': pattern},       # snapshot timestamp equals pattern
}
|
||||
|
||||
|
||||
@enforce_types
|
||||
def update(filter_patterns: Iterable[str]=(),
|
||||
@@ -33,15 +42,66 @@ def update(filter_patterns: Iterable[str]=(),
|
||||
after: float | None=None,
|
||||
status: str='indexed',
|
||||
filter_type: str='exact',
|
||||
extract: str="") -> None:
|
||||
plugins: str="",
|
||||
max_workers: int=4) -> None:
|
||||
"""Import any new links from subscriptions and retry any previously failed/skipped links"""
|
||||
|
||||
from rich import print
|
||||
|
||||
from archivebox.config.django import setup_django
|
||||
setup_django()
|
||||
|
||||
from django.utils import timezone
|
||||
from core.models import Snapshot
|
||||
from workers.orchestrator import parallel_archive
|
||||
|
||||
from workers.orchestrator import Orchestrator
|
||||
orchestrator = Orchestrator(exit_on_idle=False)
|
||||
orchestrator.start()
|
||||
# Get snapshots to update based on filters
|
||||
snapshots = Snapshot.objects.all()
|
||||
|
||||
if filter_patterns:
|
||||
snapshots = Snapshot.objects.filter_by_patterns(list(filter_patterns), filter_type)
|
||||
|
||||
if status == 'unarchived':
|
||||
snapshots = snapshots.filter(downloaded_at__isnull=True)
|
||||
elif status == 'archived':
|
||||
snapshots = snapshots.filter(downloaded_at__isnull=False)
|
||||
|
||||
if before:
|
||||
from datetime import datetime
|
||||
snapshots = snapshots.filter(bookmarked_at__lt=datetime.fromtimestamp(before))
|
||||
if after:
|
||||
from datetime import datetime
|
||||
snapshots = snapshots.filter(bookmarked_at__gt=datetime.fromtimestamp(after))
|
||||
|
||||
if resume:
|
||||
snapshots = snapshots.filter(timestamp__gte=str(resume))
|
||||
|
||||
snapshot_ids = list(snapshots.values_list('pk', flat=True))
|
||||
|
||||
if not snapshot_ids:
|
||||
print('[yellow]No snapshots found matching the given filters[/yellow]')
|
||||
return
|
||||
|
||||
print(f'[green]\\[*] Found {len(snapshot_ids)} snapshots to update[/green]')
|
||||
|
||||
if index_only:
|
||||
print('[yellow]Index-only mode - skipping archiving[/yellow]')
|
||||
return
|
||||
|
||||
methods = plugins.split(',') if plugins else None
|
||||
|
||||
# Queue snapshots for archiving via the state machine system
|
||||
# Workers will pick them up and run the plugins
|
||||
if len(snapshot_ids) > 1 and max_workers > 1:
|
||||
parallel_archive(snapshot_ids, max_workers=max_workers, overwrite=overwrite, methods=methods)
|
||||
else:
|
||||
# Queue snapshots by setting status to queued
|
||||
for snapshot in snapshots:
|
||||
Snapshot.objects.filter(id=snapshot.id).update(
|
||||
status=Snapshot.StatusChoices.QUEUED,
|
||||
retry_at=timezone.now(),
|
||||
)
|
||||
print(f'[green]Queued {len(snapshot_ids)} snapshots for archiving[/green]')
|
||||
|
||||
|
||||
@click.command()
|
||||
@@ -71,7 +131,8 @@ Update only links or data directories that have the given status:
|
||||
unrecognized {get_unrecognized_folders.__doc__}
|
||||
''')
|
||||
@click.option('--filter-type', '-t', type=click.Choice([*LINK_FILTERS.keys(), 'search']), default='exact', help='Type of pattern matching to use when filtering URLs')
|
||||
@click.option('--extract', '-e', default='', help='Comma-separated list of extractors to use e.g. title,favicon,screenshot,singlefile,...')
|
||||
@click.option('--plugins', '-p', default='', help='Comma-separated list of plugins to use e.g. title,favicon,screenshot,singlefile,...')
|
||||
@click.option('--max-workers', '-j', type=int, default=4, help='Number of parallel worker processes for archiving')
|
||||
@click.argument('filter_patterns', nargs=-1)
|
||||
@docstring(update.__doc__)
|
||||
def main(**kwargs):
|
||||
|
||||
Reference in New Issue
Block a user