mirror of https://github.com/ArchiveBox/ArchiveBox.git (synced 2026-04-05 15:27:53 +10:00)
Commit: way better plugin hooks system wip
@@ -186,7 +186,7 @@ def discover_outlinks(
# Collect discovered URLs from urls.jsonl files
# Uses dynamic discovery - any plugin that outputs urls.jsonl is considered a parser
from archivebox.hooks import collect_urls_from_extractors
from archivebox.hooks import collect_urls_from_plugins

discovered_urls = {}
for snapshot_id in snapshot_ids:

@@ -195,7 +195,7 @@ def discover_outlinks(
snapshot_dir = Path(snapshot.output_dir)

# Dynamically collect urls.jsonl from ANY plugin subdirectory
for entry in collect_urls_from_extractors(snapshot_dir):
for entry in collect_urls_from_plugins(snapshot_dir):
url = entry.get('url')
if url and url not in discovered_urls:
# Add metadata for crawl tracking
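collect_urls_from_plugins() is called and tested throughout this commit but its body is not part of the diff. A minimal sketch of the behavior implied by the call sites and the tests further down (file layout and key names here are assumptions, not the actual implementation):

import json
from pathlib import Path

def collect_urls_from_plugins(snapshot_dir: Path) -> list[dict]:
    """Sketch: gather URL entries from any <plugin>/urls.jsonl under a snapshot directory."""
    entries = []
    snapshot_dir = Path(snapshot_dir)
    if not snapshot_dir.exists():
        return entries  # empty or non-existent dirs yield no URLs (see test_collect_urls_empty_dir)
    for urls_file in snapshot_dir.glob('*/urls.jsonl'):
        plugin_name = urls_file.parent.name  # e.g. 'wget', 'parse_html_urls'
        for line in urls_file.read_text().splitlines():
            if not line.strip():
                continue
            entry = json.loads(line)
            entry.setdefault('plugin', plugin_name)  # tag each URL with the plugin that discovered it
            entries.append(entry)
    return entries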
@@ -21,7 +21,6 @@ def init(force: bool=False, quick: bool=False, install: bool=False, setup: bool=
from archivebox.config import CONSTANTS, VERSION, DATA_DIR
from archivebox.config.common import SERVER_CONFIG
from archivebox.config.collection import write_config_file
from archivebox.misc.folders import fix_invalid_folder_locations, get_invalid_folders
from archivebox.misc.legacy import parse_json_main_index, parse_json_links_details, SnapshotDict
from archivebox.misc.db import apply_migrations

@@ -106,17 +105,10 @@ def init(force: bool=False, quick: bool=False, install: bool=False, setup: bool=
print(f' √ Loaded {all_links.count()} links from existing main index.')

if quick:
print(' > Skipping full snapshot directory check (quick mode)')
print(' > Skipping orphan snapshot import (quick mode)')
else:
try:
# Links in data folders that dont match their timestamp
fixed, cant_fix = fix_invalid_folder_locations(DATA_DIR)
if fixed:
print(f' [yellow]√ Fixed {len(fixed)} data directory locations that didn\'t match their link timestamps.[/yellow]')
if cant_fix:
print(f' [red]! Could not fix {len(cant_fix)} data directory locations due to conflicts with existing folders.[/red]')

# Links in JSON index but not in main index
# Import orphaned links from legacy JSON indexes
orphaned_json_links = {
link_dict['url']: link_dict
for link_dict in parse_json_main_index(DATA_DIR)

@@ -126,7 +118,6 @@ def init(force: bool=False, quick: bool=False, install: bool=False, setup: bool=
pending_links.update(orphaned_json_links)
print(f' [yellow]√ Added {len(orphaned_json_links)} orphaned links from existing JSON index...[/yellow]')

# Links in data dir indexes but not in main index
orphaned_data_dir_links = {
link_dict['url']: link_dict
for link_dict in parse_json_links_details(DATA_DIR)

@@ -136,18 +127,13 @@ def init(force: bool=False, quick: bool=False, install: bool=False, setup: bool=
pending_links.update(orphaned_data_dir_links)
print(f' [yellow]√ Added {len(orphaned_data_dir_links)} orphaned links from existing archive directories.[/yellow]')

# Links in invalid/duplicate data dirs
invalid_folders = {
folder: link
for folder, link in get_invalid_folders(all_links, DATA_DIR).items()
}
if invalid_folders:
print(f' [red]! Skipped adding {len(invalid_folders)} invalid link data directories.[/red]')
print(' X ' + '\n X '.join(f'./{Path(folder).relative_to(DATA_DIR)} {link}' for folder, link in invalid_folders.items()))
print()
print(' [violet]Hint:[/violet] For more information about the link data directories that were skipped, run:')
print(' archivebox status')
print(' archivebox list --status=invalid')
if pending_links:
Snapshot.objects.create_from_dicts(list(pending_links.values()))

# Hint for orphaned snapshot directories
print()
print(' [violet]Hint:[/violet] To import orphaned snapshot directories and reconcile filesystem state, run:')
print(' archivebox update')

except (KeyboardInterrupt, SystemExit):
print(file=sys.stderr)

@@ -157,9 +143,6 @@ def init(force: bool=False, quick: bool=False, install: bool=False, setup: bool=
print(' [violet]Hint:[/violet] In the future you can run a quick init without checking dirs like so:', file=sys.stderr)
print(' archivebox init --quick', file=sys.stderr)
raise SystemExit(1)

if pending_links:
Snapshot.objects.create_from_dicts(list(pending_links.values()))

print('\n[green]----------------------------------------------------------------------[/green]')
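The init() changes above appear to move orphan-snapshot import out of archivebox init and into archivebox update, matching the hints printed above. On an existing collection the implied sequence would presumably be:

archivebox init --quick     # fast index check, skips the orphan import
archivebox update           # import orphaned snapshot dirs and reconcile filesystem state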
@@ -22,7 +22,7 @@ def install(dry_run: bool=False) -> None:
from archivebox.cli.archivebox_init import init

if not (os.access(ARCHIVE_DIR, os.R_OK) and ARCHIVE_DIR.is_dir()):
init() # must init full index because we need a db to store InstalledBinary entries in
init() # must init full index because we need a db to store Binary entries in

print('\n[green][+] Detecting ArchiveBox dependencies...[/green]')

@@ -25,10 +25,7 @@ LINK_FILTERS = {
'timestamp': lambda pattern: {'timestamp': pattern},
}

STATUS_CHOICES = [
'indexed', 'archived', 'unarchived', 'present', 'valid', 'invalid',
'duplicate', 'orphaned', 'corrupted', 'unrecognized'
]
STATUS_CHOICES = ['indexed', 'archived', 'unarchived']

@@ -59,45 +56,6 @@ def get_snapshots(snapshots: Optional[QuerySet]=None,
return result

def list_folders(snapshots: QuerySet, status: str, out_dir: Path=DATA_DIR) -> dict[str, Any]:

from archivebox.misc.checks import check_data_folder
from archivebox.misc.folders import (
get_indexed_folders,
get_archived_folders,
get_unarchived_folders,
get_present_folders,
get_valid_folders,
get_invalid_folders,
get_duplicate_folders,
get_orphaned_folders,
get_corrupted_folders,
get_unrecognized_folders,
)

check_data_folder()

STATUS_FUNCTIONS = {
"indexed": get_indexed_folders,
"archived": get_archived_folders,
"unarchived": get_unarchived_folders,
"present": get_present_folders,
"valid": get_valid_folders,
"invalid": get_invalid_folders,
"duplicate": get_duplicate_folders,
"orphaned": get_orphaned_folders,
"corrupted": get_corrupted_folders,
"unrecognized": get_unrecognized_folders,
}

try:
return STATUS_FUNCTIONS[status](snapshots, out_dir=out_dir)
except KeyError:
raise ValueError('Status not recognized.')

@enforce_types
def search(filter_patterns: list[str] | None=None,
filter_type: str='substring',

@@ -110,12 +68,13 @@ def search(filter_patterns: list[str] | None=None,
csv: str | None=None,
with_headers: bool=False):
"""List, filter, and export information about archive entries"""

from core.models import Snapshot

if with_headers and not (json or html or csv):
stderr('[X] --with-headers requires --json, --html or --csv\n', color='red')
raise SystemExit(2)

# Query DB directly - no filesystem scanning
snapshots = get_snapshots(
filter_patterns=list(filter_patterns) if filter_patterns else None,
filter_type=filter_type,

@@ -123,30 +82,27 @@ def search(filter_patterns: list[str] | None=None,
after=after,
)

# Apply status filter
if status == 'archived':
snapshots = snapshots.filter(downloaded_at__isnull=False)
elif status == 'unarchived':
snapshots = snapshots.filter(downloaded_at__isnull=True)
# 'indexed' = all snapshots (no filter)

if sort:
snapshots = snapshots.order_by(sort)

folders = list_folders(
snapshots=snapshots,
status=status,
out_dir=DATA_DIR,
)

# Export to requested format
if json:
from core.models import Snapshot
# Filter for non-None snapshots
valid_snapshots = [s for s in folders.values() if s is not None]
output = Snapshot.objects.filter(pk__in=[s.pk for s in valid_snapshots]).to_json(with_headers=with_headers)
output = snapshots.to_json(with_headers=with_headers)
elif html:
from core.models import Snapshot
valid_snapshots = [s for s in folders.values() if s is not None]
output = Snapshot.objects.filter(pk__in=[s.pk for s in valid_snapshots]).to_html(with_headers=with_headers)
output = snapshots.to_html(with_headers=with_headers)
elif csv:
from core.models import Snapshot
valid_snapshots = [s for s in folders.values() if s is not None]
output = Snapshot.objects.filter(pk__in=[s.pk for s in valid_snapshots]).to_csv(cols=csv.split(','), header=with_headers)
output = snapshots.to_csv(cols=csv.split(','), header=with_headers)
else:
from archivebox.misc.logging_util import printable_folders
# Convert to dict for printable_folders
folders = {s.output_dir: s for s in snapshots}
output = printable_folders(folders, with_headers)

print(output)
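The search()/list path above now works entirely from a single DB queryset rather than scanning folders. get_snapshots() itself is not shown in this diff; a rough sketch of how it is assumed to compose the LINK_FILTERS lookups with the before/after bounds (signature and field choices simplified, not the real code):

from django.db.models import Q

def get_snapshots(snapshots=None, filter_patterns=None, filter_type='substring', before=None, after=None):
    """Sketch only: build one Django queryset from the LINK_FILTERS lookups shown above."""
    from core.models import Snapshot
    qs = snapshots if snapshots is not None else Snapshot.objects.all()
    if filter_patterns:
        q = Q()
        for pattern in filter_patterns:
            q |= Q(**LINK_FILTERS[filter_type](pattern))  # e.g. {'url__icontains': pattern}
        qs = qs.filter(q)
    if before is not None:
        qs = qs.filter(timestamp__lt=str(before))  # exact bound field/conversion is an assumption
    if after is not None:
        qs = qs.filter(timestamp__gt=str(after))
    return qs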
@@ -2,223 +2,284 @@
__package__ = 'archivebox.cli'

import os
import time
import rich_click as click

from typing import Iterable
from pathlib import Path

from archivebox.misc.util import enforce_types, docstring
from archivebox.misc.folders import (
get_indexed_folders,
get_archived_folders,
get_unarchived_folders,
get_present_folders,
get_valid_folders,
get_invalid_folders,
get_duplicate_folders,
get_orphaned_folders,
get_corrupted_folders,
get_unrecognized_folders,
)

# Filter types for URL matching
LINK_FILTERS = {
'exact': lambda pattern: {'url': pattern},
'substring': lambda pattern: {'url__icontains': pattern},
'regex': lambda pattern: {'url__iregex': pattern},
'domain': lambda pattern: {'url__istartswith': f'http://{pattern}'},
'tag': lambda pattern: {'tags__name': pattern},
'timestamp': lambda pattern: {'timestamp': pattern},
}

@enforce_types
def update(filter_patterns: Iterable[str]=(),
only_new: bool=False,
index_only: bool=False,
resume: float | None=None,
overwrite: bool=False,
before: float | None=None,
after: float | None=None,
status: str='indexed',
filter_type: str='exact',
plugins: str="",
max_workers: int=4) -> None:
"""Import any new links from subscriptions and retry any previously failed/skipped links"""

def update(filter_patterns: Iterable[str] = (),
filter_type: str = 'exact',
before: float | None = None,
after: float | None = None,
resume: str | None = None,
batch_size: int = 100,
continuous: bool = False) -> None:
"""
Update snapshots: import orphans, reconcile, and re-run failed extractors.

Three-phase operation:
- Phase 1: Scan archive/ for orphaned snapshots (skip symlinks)
- Phase 2: Process all DB snapshots (reconcile + re-queue for archiving)
- Phase 3: Deduplicate exact duplicates

With filters: Only phase 2 (DB query), no filesystem scan.
Without filters: All phases (full update).
"""

from rich import print

from archivebox.config.django import setup_django
setup_django()

from django.utils import timezone
from core.models import Snapshot
from workers.orchestrator import parallel_archive

# Get snapshots to update based on filters
from django.utils import timezone

while True:
if filter_patterns or before or after:
# Filtered mode: query DB only
print('[*] Processing filtered snapshots from database...')
stats = process_filtered_snapshots(
filter_patterns=filter_patterns,
filter_type=filter_type,
before=before,
after=after,
batch_size=batch_size
)
print_stats(stats)
else:
# Full mode: import orphans + process DB + deduplicate
stats_combined = {'phase1': {}, 'phase2': {}, 'deduplicated': 0}

print('[*] Phase 1: Scanning archive/ for orphaned snapshots...')
stats_combined['phase1'] = import_orphans_from_archive(
resume_from=resume,
batch_size=batch_size
)

print('[*] Phase 2: Processing all database snapshots...')
stats_combined['phase2'] = process_all_db_snapshots(batch_size=batch_size)

print('[*] Phase 3: Deduplicating...')
stats_combined['deduplicated'] = Snapshot.find_and_merge_duplicates()

print_combined_stats(stats_combined)

if not continuous:
break

print('[yellow]Sleeping 60s before next pass...[/yellow]')
time.sleep(60)
resume = None
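Snapshot.find_and_merge_duplicates() (Phase 3 above) is likewise only referenced here, not defined in this diff. A rough, assumed sketch of what such a classmethod could do; which row is kept and how results are merged is not specified by this commit:

from django.db.models import Count

def find_and_merge_duplicates() -> int:
    """Sketch of the assumed Snapshot.find_and_merge_duplicates(): merge rows that share a URL."""
    from core.models import Snapshot
    merged = 0
    dupe_urls = (Snapshot.objects
                 .values('url')
                 .annotate(n=Count('id'))
                 .filter(n__gt=1)
                 .values_list('url', flat=True))
    for url in dupe_urls:
        dupes = list(Snapshot.objects.filter(url=url).order_by('bookmarked_at'))
        keeper, extras = dupes[0], dupes[1:]
        for extra in extras:
            extra.archiveresult_set.update(snapshot=keeper)  # related name is an assumption
            extra.delete()
            merged += 1
    return merged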
def import_orphans_from_archive(resume_from: str = None, batch_size: int = 100) -> dict:
"""
Scan archive/ for orphaned snapshots.
Skip symlinks (already migrated).
Create DB records and trigger migration on save().
"""
from core.models import Snapshot
from archivebox.config import CONSTANTS
from django.db import transaction

stats = {'processed': 0, 'imported': 0, 'migrated': 0, 'invalid': 0}

archive_dir = CONSTANTS.ARCHIVE_DIR
if not archive_dir.exists():
return stats

print('[*] Scanning and sorting by modification time...')

# Scan and sort by mtime (newest first)
# Loading (mtime, path) tuples is fine even for millions (~100MB for 1M entries)
entries = [
(e.stat().st_mtime, e.path)
for e in os.scandir(archive_dir)
if e.is_dir(follow_symlinks=False) # Skip symlinks
]
entries.sort(reverse=True) # Newest first
print(f'[*] Found {len(entries)} directories to check')

for mtime, entry_path in entries:
entry_path = Path(entry_path)

# Resume from timestamp if specified
if resume_from and entry_path.name < resume_from:
continue

stats['processed'] += 1

# Check if already in DB
snapshot = Snapshot.load_from_directory(entry_path)
if snapshot:
continue # Already in DB, skip

# Not in DB - create orphaned snapshot
snapshot = Snapshot.create_from_directory(entry_path)
if not snapshot:
# Invalid directory
Snapshot.move_directory_to_invalid(entry_path)
stats['invalid'] += 1
print(f" [{stats['processed']}] Invalid: {entry_path.name}")
continue

needs_migration = snapshot.fs_migration_needed

snapshot.save() # Creates DB record + triggers migration

stats['imported'] += 1
if needs_migration:
stats['migrated'] += 1
print(f" [{stats['processed']}] Imported + migrated: {entry_path.name}")
else:
print(f" [{stats['processed']}] Imported: {entry_path.name}")

if stats['processed'] % batch_size == 0:
transaction.commit()

transaction.commit()
return stats
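load_from_directory(), create_from_directory(), and move_directory_to_invalid() are Snapshot classmethods that this diff calls but never defines. Their assumed contracts, inferred purely from the call sites above (written as plain stubs for illustration):

from pathlib import Path

def load_from_directory(snapshot_dir: Path):
    """Return the existing Snapshot row that owns this directory, or None if it is orphaned."""

def create_from_directory(snapshot_dir: Path):
    """Build an unsaved Snapshot from the directory's index.json/contents; None if it can't be parsed."""

def move_directory_to_invalid(snapshot_dir: Path) -> None:
    """Move an unparseable directory out of archive/ so later passes skip it."""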
def process_all_db_snapshots(batch_size: int = 100) -> dict:
"""
Process all snapshots in DB.
Reconcile index.json and queue for archiving.
"""
from core.models import Snapshot
from django.db import transaction
from django.utils import timezone

stats = {'processed': 0, 'reconciled': 0, 'queued': 0}

total = Snapshot.objects.count()
print(f'[*] Processing {total} snapshots from database...')

for snapshot in Snapshot.objects.iterator():
# Reconcile index.json with DB
snapshot.reconcile_with_index_json()

# Queue for archiving (state machine will handle it)
snapshot.status = Snapshot.StatusChoices.QUEUED
snapshot.retry_at = timezone.now()
snapshot.save()

stats['reconciled'] += 1
stats['queued'] += 1
stats['processed'] += 1

if stats['processed'] % batch_size == 0:
transaction.commit()
print(f" [{stats['processed']}/{total}] Processed...")

transaction.commit()
return stats
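reconcile_with_index_json() is also assumed rather than shown: presumably it syncs a snapshot's DB row with whatever index.json is left on disk. A minimal sketch of that idea (written as a method body; the field choices are illustrative):

import json
from pathlib import Path

def reconcile_with_index_json(self) -> None:
    """Sketch: copy fields from the on-disk index.json that the DB row is still missing."""
    index_file = Path(self.output_dir) / 'index.json'
    if not index_file.exists():
        return
    try:
        on_disk = json.loads(index_file.read_text())
    except ValueError:
        return
    if not self.title and on_disk.get('title'):
        self.title = on_disk['title']  # prefer existing DB values, only fill gaps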
def process_filtered_snapshots(
filter_patterns: Iterable[str],
filter_type: str,
before: float | None,
after: float | None,
batch_size: int
) -> dict:
"""Process snapshots matching filters (DB query only)."""
from core.models import Snapshot
from django.db import transaction
from django.utils import timezone
from datetime import datetime

stats = {'processed': 0, 'reconciled': 0, 'queued': 0}

snapshots = Snapshot.objects.all()

if filter_patterns:
snapshots = Snapshot.objects.filter_by_patterns(list(filter_patterns), filter_type)

if status == 'unarchived':
snapshots = snapshots.filter(downloaded_at__isnull=True)
elif status == 'archived':
snapshots = snapshots.filter(downloaded_at__isnull=False)

if before:
from datetime import datetime
snapshots = snapshots.filter(bookmarked_at__lt=datetime.fromtimestamp(before))
if after:
from datetime import datetime
snapshots = snapshots.filter(bookmarked_at__gt=datetime.fromtimestamp(after))

if resume:
snapshots = snapshots.filter(timestamp__gte=str(resume))

snapshot_ids = list(snapshots.values_list('pk', flat=True))

if not snapshot_ids:
print('[yellow]No snapshots found matching the given filters[/yellow]')
return

print(f'[green]\\[*] Found {len(snapshot_ids)} snapshots to update[/green]')

if index_only:
print('[yellow]Index-only mode - skipping archiving[/yellow]')
return

methods = plugins.split(',') if plugins else None

# Queue snapshots for archiving via the state machine system
# Workers will pick them up and run the plugins
if len(snapshot_ids) > 1 and max_workers > 1:
parallel_archive(snapshot_ids, max_workers=max_workers, overwrite=overwrite, methods=methods)
else:
# Queue snapshots by setting status to queued
for snapshot in snapshots:
Snapshot.objects.filter(id=snapshot.id).update(
status=Snapshot.StatusChoices.QUEUED,
retry_at=timezone.now(),
)
print(f'[green]Queued {len(snapshot_ids)} snapshots for archiving[/green]')
total = snapshots.count()
print(f'[*] Found {total} matching snapshots')

for snapshot in snapshots.iterator():
# Reconcile index.json with DB
snapshot.reconcile_with_index_json()

# Queue for archiving
snapshot.status = Snapshot.StatusChoices.QUEUED
snapshot.retry_at = timezone.now()
snapshot.save()

stats['reconciled'] += 1
stats['queued'] += 1
stats['processed'] += 1

if stats['processed'] % batch_size == 0:
transaction.commit()
print(f" [{stats['processed']}/{total}] Processed...")

transaction.commit()
return stats

def print_stats(stats: dict):
"""Print statistics for filtered mode."""
from rich import print

print(f"""
[green]Update Complete[/green]
Processed: {stats['processed']}
Reconciled: {stats['reconciled']}
Queued: {stats['queued']}
""")

def print_combined_stats(stats_combined: dict):
"""Print statistics for full mode."""
from rich import print

s1 = stats_combined['phase1']
s2 = stats_combined['phase2']

print(f"""
[green]Archive Update Complete[/green]

Phase 1 (Import Orphans):
Checked: {s1.get('processed', 0)}
Imported: {s1.get('imported', 0)}
Migrated: {s1.get('migrated', 0)}
Invalid: {s1.get('invalid', 0)}

Phase 2 (Process DB):
Processed: {s2.get('processed', 0)}
Reconciled: {s2.get('reconciled', 0)}
Queued: {s2.get('queued', 0)}

Phase 3 (Deduplicate):
Merged: {stats_combined['deduplicated']}
""")

@click.command()
@click.option('--only-new', is_flag=True, help="Don't attempt to retry previously skipped/failed links when updating")
@click.option('--index-only', is_flag=True, help="Update the main index without archiving any content")
@click.option('--resume', type=float, help='Resume the update process from a given timestamp')
@click.option('--overwrite', '-F', is_flag=True, help='Ignore existing archived content and overwrite with new versions (DANGEROUS)')
@click.option('--before', type=float, help="Update only links bookmarked before the given timestamp")
@click.option('--after', type=float, help="Update only links bookmarked after the given timestamp")
@click.option('--status', type=click.Choice([
'indexed', 'archived', 'unarchived',
'present', 'valid', 'invalid',
'duplicate', 'orphaned', 'corrupted', 'unrecognized'
]), default='indexed', help=f'''
Update only links or data directories that have the given status:
indexed {get_indexed_folders.__doc__} (the default)
archived {get_archived_folders.__doc__}
unarchived {get_unarchived_folders.__doc__}

present {get_present_folders.__doc__}
valid {get_valid_folders.__doc__}
invalid {get_invalid_folders.__doc__}

duplicate {get_duplicate_folders.__doc__}
orphaned {get_orphaned_folders.__doc__}
corrupted {get_corrupted_folders.__doc__}
unrecognized {get_unrecognized_folders.__doc__}
''')
@click.option('--filter-type', '-t', type=click.Choice([*LINK_FILTERS.keys(), 'search']), default='exact', help='Type of pattern matching to use when filtering URLs')
@click.option('--plugins', '-p', default='', help='Comma-separated list of plugins to use e.g. title,favicon,screenshot,singlefile,...')
@click.option('--max-workers', '-j', type=int, default=4, help='Number of parallel worker processes for archiving')
@click.option('--resume', type=str, help='Resume from timestamp')
@click.option('--before', type=float, help='Only snapshots before timestamp')
@click.option('--after', type=float, help='Only snapshots after timestamp')
@click.option('--filter-type', '-t', type=click.Choice(['exact', 'substring', 'regex', 'domain', 'tag', 'timestamp']), default='exact')
@click.option('--batch-size', type=int, default=100, help='Commit every N snapshots')
@click.option('--continuous', is_flag=True, help='Run continuously as background worker')
@click.argument('filter_patterns', nargs=-1)
@docstring(update.__doc__)
def main(**kwargs):
"""Import any new links from subscriptions and retry any previously failed/skipped links"""
update(**kwargs)

if __name__ == '__main__':
main()
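Given the new click options above, typical invocations of the reworked command would presumably be:

archivebox update                                   # full pass: import orphans, reconcile, dedupe
archivebox update --continuous                      # keep running as a background worker (60s between passes)
archivebox update --filter-type=domain example.com  # filtered, DB-only pass (batch size defaults to 100)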
# LEGACY VERSION:
# @enforce_types
# def update(resume: Optional[float]=None,
# only_new: bool=ARCHIVING_CONFIG.ONLY_NEW,
# index_only: bool=False,
# overwrite: bool=False,
# filter_patterns_str: Optional[str]=None,
# filter_patterns: Optional[List[str]]=None,
# filter_type: Optional[str]=None,
# status: Optional[str]=None,
# after: Optional[str]=None,
# before: Optional[str]=None,
# extractors: str="",
# out_dir: Path=DATA_DIR) -> List[Link]:
# """Import any new links from subscriptions and retry any previously failed/skipped links"""

# from core.models import ArchiveResult
# from .search import index_links
# # from workers.supervisord_util import start_cli_workers

# check_data_folder()
# # start_cli_workers()
# new_links: List[Link] = [] # TODO: Remove input argument: only_new

# extractors = extractors.split(",") if extractors else []

# # Step 1: Filter for selected_links
# print('[*] Finding matching Snapshots to update...')
# print(f' - Filtering by {" ".join(filter_patterns)} ({filter_type}) {before=} {after=} {status=}...')
# matching_snapshots = list_links(
# filter_patterns=filter_patterns,
# filter_type=filter_type,
# before=before,
# after=after,
# )
# print(f' - Checking {matching_snapshots.count()} snapshot folders for existing data with {status=}...')
# matching_folders = list_folders(
# links=matching_snapshots,
# status=status,
# out_dir=out_dir,
# )
# all_links = (link for link in matching_folders.values() if link)
# print(' - Sorting by most unfinished -> least unfinished + date archived...')
# all_links = sorted(all_links, key=lambda link: (ArchiveResult.objects.filter(snapshot__url=link.url).count(), link.timestamp))

# if index_only:
# for link in all_links:
# write_link_details(link, out_dir=out_dir, skip_sql_index=True)
# index_links(all_links, out_dir=out_dir)
# return all_links

# # Step 2: Run the archive methods for each link
# to_archive = new_links if only_new else all_links
# if resume:
# to_archive = [
# link for link in to_archive
# if link.timestamp >= str(resume)
# ]
# if not to_archive:
# stderr('')
# stderr(f'[√] Nothing found to resume after {resume}', color='green')
# return all_links

# archive_kwargs = {
# "out_dir": out_dir,
# }
# if extractors:
# archive_kwargs["methods"] = extractors

# archive_links(to_archive, overwrite=overwrite, **archive_kwargs)

# # Step 4: Re-write links index with updated titles, icons, and resources
# all_links = load_main_index(out_dir=out_dir)
# return all_links
@@ -107,12 +107,12 @@ def version(quiet: bool=False,
from archivebox.config.django import setup_django
setup_django()

from machine.models import Machine, InstalledBinary
from machine.models import Machine, Binary

machine = Machine.current()

# Get all installed binaries from the database
all_installed = InstalledBinary.objects.filter(
# Get all binaries from the database
all_installed = Binary.objects.filter(
machine=machine
).exclude(abspath='').exclude(abspath__isnull=True).order_by('name')

@@ -134,7 +134,7 @@ def version(quiet: bool=False,
failures.append(installed.name)

# Show hint if no binaries are installed yet
has_any_installed = InstalledBinary.objects.filter(machine=machine).exclude(abspath='').exists()
has_any_installed = Binary.objects.filter(machine=machine).exclude(abspath='').exists()
if not has_any_installed:
prnt()
prnt('', '[grey53]Run [green]archivebox install[/green] to detect and install dependencies.[/grey53]')

@@ -472,25 +472,25 @@ class TestURLCollection(unittest.TestCase):
"""Clean up test directory."""
shutil.rmtree(self.test_dir, ignore_errors=True)

def test_collect_urls_from_extractors(self):
"""Should collect urls.jsonl from all extractor subdirectories."""
from archivebox.hooks import collect_urls_from_extractors
def test_collect_urls_from_plugins(self):
"""Should collect urls.jsonl from all parser plugin subdirectories."""
from archivebox.hooks import collect_urls_from_plugins

urls = collect_urls_from_extractors(self.test_dir)
urls = collect_urls_from_plugins(self.test_dir)

self.assertEqual(len(urls), 4)

# Check that via_extractor is set
extractors = {u['via_extractor'] for u in urls}
self.assertIn('wget', extractors)
self.assertIn('parse_html_urls', extractors)
self.assertNotIn('screenshot', extractors) # No urls.jsonl
# Check that plugin is set
plugins = {u['plugin'] for u in urls}
self.assertIn('wget', plugins)
self.assertIn('parse_html_urls', plugins)
self.assertNotIn('screenshot', plugins) # No urls.jsonl
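For context, a urls.jsonl line of the kind these tests construct would presumably look something like the following; any fields beyond url are whatever the emitting plugin chooses to include:

{"url": "https://example.com/discovered-page", "title": "HTML Link 2", "plugin": "parse_html_urls"}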
def test_collect_urls_preserves_metadata(self):
"""Should preserve metadata from urls.jsonl entries."""
from archivebox.hooks import collect_urls_from_extractors
from archivebox.hooks import collect_urls_from_plugins

urls = collect_urls_from_extractors(self.test_dir)
urls = collect_urls_from_plugins(self.test_dir)

# Find the entry with title
titled = [u for u in urls if u.get('title') == 'HTML Link 2']

@@ -499,10 +499,10 @@ class TestURLCollection(unittest.TestCase):
def test_collect_urls_empty_dir(self):
"""Should handle empty or non-existent directories."""
from archivebox.hooks import collect_urls_from_extractors
from archivebox.hooks import collect_urls_from_plugins

empty_dir = self.test_dir / 'nonexistent'
urls = collect_urls_from_extractors(empty_dir)
urls = collect_urls_from_plugins(empty_dir)

self.assertEqual(len(urls), 0)

@@ -612,7 +612,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
Test: archivebox crawl URL
Should create snapshot, run plugins, output discovered URLs.
"""
from archivebox.hooks import collect_urls_from_extractors
from archivebox.hooks import collect_urls_from_plugins
from archivebox.misc.jsonl import TYPE_SNAPSHOT

# Create a mock snapshot directory with urls.jsonl

@@ -627,7 +627,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
)

# Collect URLs (as crawl does)
discovered = collect_urls_from_extractors(test_snapshot_dir)
discovered = collect_urls_from_plugins(test_snapshot_dir)

self.assertEqual(len(discovered), 2)

@@ -688,7 +688,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
TYPE_SNAPSHOT
)
from archivebox.base_models.models import get_or_create_system_user_pk
from archivebox.hooks import collect_urls_from_extractors
from archivebox.hooks import collect_urls_from_plugins

created_by_id = get_or_create_system_user_pk()

@@ -707,7 +707,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
)

# Step 3: Collect discovered URLs (crawl output)
discovered = collect_urls_from_extractors(snapshot_dir)
discovered = collect_urls_from_plugins(snapshot_dir)
crawl_output = []
for entry in discovered:
entry['type'] = TYPE_SNAPSHOT

@@ -835,7 +835,7 @@ class TestParserPluginWorkflows(unittest.TestCase):
"""
Test: archivebox crawl --plugin=parse_html_urls URL | archivebox snapshot | archivebox extract
"""
from archivebox.hooks import collect_urls_from_extractors
from archivebox.hooks import collect_urls_from_plugins
from archivebox.misc.jsonl import TYPE_SNAPSHOT

# Create mock output directory

@@ -847,17 +847,17 @@ class TestParserPluginWorkflows(unittest.TestCase):
)

# Collect URLs
discovered = collect_urls_from_extractors(snapshot_dir)
discovered = collect_urls_from_plugins(snapshot_dir)

self.assertEqual(len(discovered), 1)
self.assertEqual(discovered[0]['url'], 'https://html-discovered.com')
self.assertEqual(discovered[0]['via_extractor'], 'parse_html_urls')
self.assertEqual(discovered[0]['plugin'], 'parse_html_urls')

def test_rss_parser_workflow(self):
"""
Test: archivebox crawl --plugin=parse_rss_urls URL | archivebox snapshot | archivebox extract
"""
from archivebox.hooks import collect_urls_from_extractors
from archivebox.hooks import collect_urls_from_plugins

# Create mock output directory
snapshot_dir = Path(self.test_dir) / 'archive' / 'rss-parser-test'

@@ -869,16 +869,16 @@ class TestParserPluginWorkflows(unittest.TestCase):
)

# Collect URLs
discovered = collect_urls_from_extractors(snapshot_dir)
discovered = collect_urls_from_plugins(snapshot_dir)

self.assertEqual(len(discovered), 2)
self.assertTrue(all(d['via_extractor'] == 'parse_rss_urls' for d in discovered))
self.assertTrue(all(d['plugin'] == 'parse_rss_urls' for d in discovered))

def test_multiple_parsers_dedupe(self):
"""
Multiple parsers may discover the same URL - should be deduplicated.
"""
from archivebox.hooks import collect_urls_from_extractors
from archivebox.hooks import collect_urls_from_plugins

# Create mock output with duplicate URLs from different parsers
snapshot_dir = Path(self.test_dir) / 'archive' / 'dedupe-test'

@@ -895,7 +895,7 @@ class TestParserPluginWorkflows(unittest.TestCase):
)

# Collect URLs
all_discovered = collect_urls_from_extractors(snapshot_dir)
all_discovered = collect_urls_from_plugins(snapshot_dir)

# Both entries are returned (deduplication happens at the crawl command level)
self.assertEqual(len(all_discovered), 2)
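The piping tests above exercise a crawl -> snapshot -> extract flow in which each stage exchanges JSONL records. A minimal sketch of the hand-off implied by the Step 3 code earlier (TYPE_SNAPSHOT is the only field name this diff confirms; the directory path and the rest are illustrative):

import json
from pathlib import Path
from archivebox.hooks import collect_urls_from_plugins
from archivebox.misc.jsonl import TYPE_SNAPSHOT

snapshot_dir = Path('archive/1234567890')  # hypothetical snapshot output directory

# Tag each discovered URL as a snapshot record and emit one JSON object per line,
# ready to pipe into the next command in the chain.
for entry in collect_urls_from_plugins(snapshot_dir):
    entry['type'] = TYPE_SNAPSHOT
    print(json.dumps(entry))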