From e2cbcd17f63ce26e5f1e8bc67f70265b6d16664b Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 26 Dec 2025 18:22:48 -0800 Subject: [PATCH 1/2] more tests and migrations fixes --- .claude/settings.local.json | 7 +- STORAGE_CAS_PLAN.md | 1341 ----------------- .../BACKGROUND_HOOKS_IMPLEMENTATION_PLAN.md | 863 ++++++----- ...0003_alter_apitoken_created_by_and_more.py | 27 + archivebox/api/models.py | 6 +- archivebox/base_models/models.py | 2 +- ...alter_archiveresult_created_by_and_more.py | 32 + .../migrations/0003_alter_crawl_output_dir.py | 19 + archivebox/plugins/forumdl/config.json | 46 + .../forumdl/on_Crawl__00_validate_forumdl.py | 129 ++ .../forumdl/on_Snapshot__53_forumdl.py | 237 +++ .../plugins/forumdl/templates/embed.html | 40 + .../plugins/forumdl/templates/fullscreen.html | 147 ++ .../plugins/forumdl/templates/icon.html | 1 + .../plugins/forumdl/templates/thumbnail.html | 7 + .../plugins/forumdl/tests/test_forumdl.py | 157 ++ archivebox/plugins/gallerydl/config.json | 13 +- .../on_Crawl__00_validate_gallerydl.py | 6 +- .../gallerydl/on_Snapshot__52_gallerydl.py | 67 +- .../plugins/gallerydl/tests/test_gallerydl.py | 157 ++ archivebox/plugins/papersdl/config.json | 29 + .../on_Snapshot__63_parse_netscape_urls.py | 126 +- .../test_parse_netscape_urls_comprehensive.py | 930 ++++++++++++ .../test_parse_rss_urls_comprehensive.py | 987 ++++++++++++ pyproject.toml | 10 +- uv.lock | 14 + 26 files changed, 3608 insertions(+), 1792 deletions(-) delete mode 100644 STORAGE_CAS_PLAN.md create mode 100644 archivebox/api/migrations/0003_alter_apitoken_created_by_and_more.py create mode 100644 archivebox/core/migrations/0027_alter_archiveresult_created_by_and_more.py create mode 100644 archivebox/crawls/migrations/0003_alter_crawl_output_dir.py create mode 100644 archivebox/plugins/forumdl/config.json create mode 100755 archivebox/plugins/forumdl/on_Crawl__00_validate_forumdl.py create mode 100755 archivebox/plugins/forumdl/on_Snapshot__53_forumdl.py create mode 100644 archivebox/plugins/forumdl/templates/embed.html create mode 100644 archivebox/plugins/forumdl/templates/fullscreen.html create mode 100644 archivebox/plugins/forumdl/templates/icon.html create mode 100644 archivebox/plugins/forumdl/templates/thumbnail.html create mode 100644 archivebox/plugins/forumdl/tests/test_forumdl.py create mode 100644 archivebox/plugins/gallerydl/tests/test_gallerydl.py create mode 100644 archivebox/plugins/papersdl/config.json create mode 100644 archivebox/plugins/parse_netscape_urls/tests/test_parse_netscape_urls_comprehensive.py create mode 100644 archivebox/plugins/parse_rss_urls/tests/test_parse_rss_urls_comprehensive.py diff --git a/.claude/settings.local.json b/.claude/settings.local.json index 70293cbd..bede7ad3 100644 --- a/.claude/settings.local.json +++ b/.claude/settings.local.json @@ -9,7 +9,12 @@ "Bash(pkill:*)", "Bash(python3:*)", "Bash(sqlite3:*)", - "WebFetch(domain:github.com)" + "WebFetch(domain:github.com)", + "Bash(uv add:*)", + "Bash(mkdir:*)", + "Bash(chmod:*)", + "Bash(python -m forum_dl:*)", + "Bash(archivebox manage migrate:*)" ] } } diff --git a/STORAGE_CAS_PLAN.md b/STORAGE_CAS_PLAN.md deleted file mode 100644 index dce230db..00000000 --- a/STORAGE_CAS_PLAN.md +++ /dev/null @@ -1,1341 +0,0 @@ -# Content-Addressable Storage (CAS) with Symlink Farm Architecture - -## Table of Contents -- [Overview](#overview) -- [Architecture Design](#architecture-design) -- [Database Models](#database-models) -- [Storage Backends](#storage-backends) -- [Symlink Farm 
Views](#symlink-farm-views) -- [Automatic Synchronization](#automatic-synchronization) -- [Migration Strategy](#migration-strategy) -- [Verification and Repair](#verification-and-repair) -- [Configuration](#configuration) -- [Workflow Examples](#workflow-examples) -- [Benefits](#benefits) - -## Overview - -### Problem Statement -ArchiveBox currently stores files in a timestamp-based structure: -``` -/data/archive/{timestamp}/{extractor}/filename.ext -``` - -This leads to: -- **Massive duplication**: `jquery.min.js` stored 1000x across different snapshots -- **No S3 support**: Direct filesystem coupling -- **Inflexible organization**: Hard to browse by domain, date, or user - -### Solution: Content-Addressable Storage + Symlink Farm - -**Core Concept:** -1. **Store files once** in content-addressable storage (CAS) by hash -2. **Create symlink farms** in multiple human-readable views -3. **Database as source of truth** with automatic sync -4. **Support S3 and local storage** via django-storages - -**Storage Layout:** -``` -/data/ -├── cas/ # Content-addressable storage (deduplicated) -│ └── sha256/ -│ └── ab/ -│ └── cd/ -│ └── abcdef123... # Actual file (stored once) -│ -├── archive/ # Human-browseable views (all symlinks) -│ ├── by_domain/ -│ │ └── example.com/ -│ │ └── 20241225/ -│ │ └── 019b54ee-28d9-72dc/ -│ │ ├── wget/ -│ │ │ └── index.html -> ../../../../../cas/sha256/ab/cd/abcdef... -│ │ └── singlefile/ -│ │ └── page.html -> ../../../../../cas/sha256/ef/12/ef1234... -│ │ -│ ├── by_date/ -│ │ └── 20241225/ -│ │ └── example.com/ -│ │ └── 019b54ee-28d9-72dc/ -│ │ └── wget/ -│ │ └── index.html -> ../../../../../../cas/sha256/ab/cd/abcdef... -│ │ -│ ├── by_user/ -│ │ └── squash/ -│ │ └── 20241225/ -│ │ └── example.com/ -│ │ └── 019b54ee-28d9-72dc/ -│ │ -│ └── by_timestamp/ # Legacy compatibility -│ └── 1735142400.123/ -│ └── wget/ -│ └── index.html -> ../../../../cas/sha256/ab/cd/abcdef... -``` - -## Architecture Design - -### Core Principles - -1. **Database = Source of Truth**: The `SnapshotFile` model is authoritative -2. **Symlinks = Materialized Views**: Auto-generated from DB, disposable -3. **Atomic Updates**: Symlinks created/deleted with DB transactions -4. **Idempotent**: Operations can be safely retried -5. **Self-Healing**: Automatic detection and repair of drift -6. **Content-Addressable**: Files deduplicated by SHA-256 hash -7. **Storage Agnostic**: Works with local filesystem, S3, Azure, etc. - -### Space Overhead Analysis - -Symlinks are incredibly cheap: -``` -Typical symlink size: -- ext4/XFS: ~60-100 bytes -- ZFS: ~120 bytes -- btrfs: ~80 bytes - -Example calculation: -100,000 files × 4 views = 400,000 symlinks -400,000 symlinks × 100 bytes = 40 MB - -Space saved by deduplication: -- Average 30% duplicate content across archives -- 100GB archive → saves ~30GB -- Symlink overhead: 0.04GB (0.13% of savings!) - -Verdict: Symlinks are FREE compared to deduplication savings -``` - -## Database Models - -### Blob Model - -```python -# archivebox/core/models.py - -class Blob(models.Model): - """ - Immutable content-addressed blob. 
- Stored as: /cas/{hash_algorithm}/{ab}/{cd}/{full_hash} - """ - - # Content identification - hash_algorithm = models.CharField(max_length=16, default='sha256', db_index=True) - hash = models.CharField(max_length=128, db_index=True) - size = models.BigIntegerField() - - # Storage location - storage_backend = models.CharField( - max_length=32, - default='local', - choices=[ - ('local', 'Local Filesystem'), - ('s3', 'S3'), - ('azure', 'Azure Blob Storage'), - ('gcs', 'Google Cloud Storage'), - ], - db_index=True, - ) - - # Metadata - mime_type = models.CharField(max_length=255, blank=True) - created_at = models.DateTimeField(auto_now_add=True, db_index=True) - - # Reference counting (for garbage collection) - ref_count = models.IntegerField(default=0, db_index=True) - - class Meta: - unique_together = [('hash_algorithm', 'hash', 'storage_backend')] - indexes = [ - models.Index(fields=['hash_algorithm', 'hash']), - models.Index(fields=['ref_count']), - models.Index(fields=['storage_backend', 'created_at']), - ] - constraints = [ - # Ensure ref_count is never negative - models.CheckConstraint( - check=models.Q(ref_count__gte=0), - name='blob_ref_count_positive' - ), - ] - - def __str__(self): - return f"Blob({self.hash[:16]}..., refs={self.ref_count})" - - @property - def storage_path(self) -> str: - """Content-addressed path: sha256/ab/cd/abcdef123...""" - h = self.hash - return f"{self.hash_algorithm}/{h[:2]}/{h[2:4]}/{h}" - - def get_file_url(self): - """Get URL to access this blob""" - from django.core.files.storage import default_storage - return default_storage.url(self.storage_path) - - -class SnapshotFile(models.Model): - """ - Links a Snapshot to its files (many-to-many through Blob). - Preserves original path information for backwards compatibility. - """ - - snapshot = models.ForeignKey( - Snapshot, - on_delete=models.CASCADE, - related_name='files' - ) - blob = models.ForeignKey( - Blob, - on_delete=models.PROTECT # PROTECT: can't delete blob while referenced - ) - - # Original path information - extractor = models.CharField(max_length=32) # 'wget', 'singlefile', etc. - relative_path = models.CharField(max_length=512) # 'output.html', 'warc/example.warc.gz' - - # Metadata - created_at = models.DateTimeField(auto_now_add=True, db_index=True) - - class Meta: - unique_together = [('snapshot', 'extractor', 'relative_path')] - indexes = [ - models.Index(fields=['snapshot', 'extractor']), - models.Index(fields=['blob']), - models.Index(fields=['created_at']), - ] - - def __str__(self): - return f"{self.snapshot.id}/{self.extractor}/{self.relative_path}" - - @property - def logical_path(self) -> Path: - """Virtual path as it would appear in old structure""" - return Path(self.snapshot.output_dir) / self.extractor / self.relative_path - - def save(self, *args, **kwargs): - """Override save to ensure paths are normalized""" - # Normalize path (no leading slash, use forward slashes) - self.relative_path = self.relative_path.lstrip('/').replace('\\', '/') - super().save(*args, **kwargs) -``` - -### Updated Snapshot Model - -```python -class Snapshot(ModelWithOutputDir, ...): - # ... existing fields ... - - @property - def output_dir(self) -> Path: - """ - Returns the primary view directory for browsing. - Falls back to legacy if needed. 
- """ - # Try by_timestamp view first (best compatibility) - by_timestamp = CONSTANTS.ARCHIVE_DIR / 'by_timestamp' / self.timestamp - if by_timestamp.exists(): - return by_timestamp - - # Fall back to legacy location (pre-CAS archives) - legacy = CONSTANTS.ARCHIVE_DIR / self.timestamp - if legacy.exists(): - return legacy - - # Default to by_timestamp for new snapshots - return by_timestamp - - def get_output_dir(self, view: str = 'by_timestamp') -> Path: - """Get output directory for a specific view""" - from storage.views import ViewManager - from urllib.parse import urlparse - - if view not in ViewManager.VIEWS: - raise ValueError(f"Unknown view: {view}") - - if view == 'by_domain': - domain = urlparse(self.url).netloc or 'unknown' - date = self.created_at.strftime('%Y%m%d') - return CONSTANTS.ARCHIVE_DIR / 'by_domain' / domain / date / str(self.id) - - elif view == 'by_date': - domain = urlparse(self.url).netloc or 'unknown' - date = self.created_at.strftime('%Y%m%d') - return CONSTANTS.ARCHIVE_DIR / 'by_date' / date / domain / str(self.id) - - elif view == 'by_user': - domain = urlparse(self.url).netloc or 'unknown' - date = self.created_at.strftime('%Y%m%d') - user = self.created_by.username - return CONSTANTS.ARCHIVE_DIR / 'by_user' / user / date / domain / str(self.id) - - elif view == 'by_timestamp': - return CONSTANTS.ARCHIVE_DIR / 'by_timestamp' / self.timestamp - - return self.output_dir -``` - -### Updated ArchiveResult Model - -```python -class ArchiveResult(models.Model): - # ... existing fields ... - - # Note: output_dir field is removed (was deprecated) - # Keep: output (relative path to primary output file) - - @property - def output_files(self): - """Get all files for this extractor""" - return self.snapshot.files.filter(extractor=self.extractor) - - @property - def primary_output_file(self): - """Get the primary output file (e.g., 'output.html')""" - if self.output: - return self.snapshot.files.filter( - extractor=self.extractor, - relative_path=self.output - ).first() - return None -``` - -## Storage Backends - -### Django Storage Configuration - -```python -# settings.py or archivebox/config/settings.py - -# For local development/testing -STORAGES = { - "default": { - "BACKEND": "django.core.files.storage.FileSystemStorage", - "OPTIONS": { - "location": "/data/cas", - "base_url": "/cas/", - }, - }, - "staticfiles": { - "BACKEND": "django.contrib.staticfiles.storage.StaticFilesStorage", - }, -} - -# For production with S3 -STORAGES = { - "default": { - "BACKEND": "storages.backends.s3.S3Storage", - "OPTIONS": { - "bucket_name": "archivebox-blobs", - "region_name": "us-east-1", - "default_acl": "private", - "object_parameters": { - "StorageClass": "INTELLIGENT_TIERING", # Auto-optimize storage costs - }, - }, - }, -} -``` - -### Blob Manager - -```python -# archivebox/storage/ingest.py - -import hashlib -from django.core.files.storage import default_storage -from django.core.files.base import ContentFile -from django.db import transaction -from pathlib import Path -import os - -class BlobManager: - """Manages content-addressed blob storage with deduplication""" - - @staticmethod - def hash_file(file_path: Path, algorithm='sha256') -> str: - """Calculate content hash of a file""" - hasher = hashlib.new(algorithm) - with open(file_path, 'rb') as f: - for chunk in iter(lambda: f.read(65536), b''): - hasher.update(chunk) - return hasher.hexdigest() - - @staticmethod - def ingest_file( - file_path: Path, - snapshot, - extractor: str, - relative_path: str, - mime_type: str = 
'', - create_views: bool = True, - ) -> SnapshotFile: - """ - Ingest a file into blob storage with deduplication. - - Args: - file_path: Path to the file to ingest - snapshot: Snapshot this file belongs to - extractor: Extractor name (wget, singlefile, etc.) - relative_path: Relative path within extractor dir - mime_type: MIME type of the file - create_views: Whether to create symlink views - - Returns: - SnapshotFile reference - """ - from storage.views import ViewManager - - # Calculate hash - file_hash = BlobManager.hash_file(file_path) - file_size = file_path.stat().st_size - - with transaction.atomic(): - # Check if blob already exists (deduplication!) - blob, created = Blob.objects.get_or_create( - hash_algorithm='sha256', - hash=file_hash, - storage_backend='local', - defaults={ - 'size': file_size, - 'mime_type': mime_type, - } - ) - - if created: - # New blob - store in CAS - cas_path = ViewManager.get_cas_path(blob) - cas_path.parent.mkdir(parents=True, exist_ok=True) - - # Use hardlink if possible (instant), copy if not - try: - os.link(file_path, cas_path) - except OSError: - import shutil - shutil.copy2(file_path, cas_path) - - print(f"✓ Stored new blob: {file_hash[:16]}... ({file_size:,} bytes)") - else: - print(f"✓ Deduplicated: {file_hash[:16]}... (saved {file_size:,} bytes)") - - # Increment reference count - blob.ref_count += 1 - blob.save(update_fields=['ref_count']) - - # Create snapshot file reference - snapshot_file, _ = SnapshotFile.objects.get_or_create( - snapshot=snapshot, - extractor=extractor, - relative_path=relative_path, - defaults={'blob': blob} - ) - - # Create symlink views (signal will also do this, but we can force it here) - if create_views: - views = ViewManager.create_symlinks(snapshot_file) - print(f" Created {len(views)} view symlinks") - - return snapshot_file - - @staticmethod - def ingest_directory( - dir_path: Path, - snapshot, - extractor: str - ) -> list[SnapshotFile]: - """Ingest all files from a directory""" - import mimetypes - - snapshot_files = [] - - for file_path in dir_path.rglob('*'): - if file_path.is_file(): - relative_path = str(file_path.relative_to(dir_path)) - mime_type, _ = mimetypes.guess_type(str(file_path)) - - snapshot_file = BlobManager.ingest_file( - file_path, - snapshot, - extractor, - relative_path, - mime_type or '' - ) - snapshot_files.append(snapshot_file) - - return snapshot_files -``` - -## Symlink Farm Views - -### View Classes - -```python -# archivebox/storage/views.py - -from pathlib import Path -from typing import Protocol -from urllib.parse import urlparse -import os -import logging - -logger = logging.getLogger(__name__) - - -class SnapshotView(Protocol): - """Protocol for generating browseable views of snapshots""" - - def get_view_path(self, snapshot_file: SnapshotFile) -> Path: - """Get the human-readable path for this file in this view""" - ... 
- - -class ByDomainView: - """View: /archive/by_domain/{domain}/{YYYYMMDD}/{snapshot_id}/{extractor}/{filename}""" - - def get_view_path(self, snapshot_file: SnapshotFile) -> Path: - snapshot = snapshot_file.snapshot - domain = urlparse(snapshot.url).netloc or 'unknown' - date = snapshot.created_at.strftime('%Y%m%d') - - return ( - CONSTANTS.ARCHIVE_DIR / 'by_domain' / domain / date / - str(snapshot.id) / snapshot_file.extractor / snapshot_file.relative_path - ) - - -class ByDateView: - """View: /archive/by_date/{YYYYMMDD}/{domain}/{snapshot_id}/{extractor}/{filename}""" - - def get_view_path(self, snapshot_file: SnapshotFile) -> Path: - snapshot = snapshot_file.snapshot - domain = urlparse(snapshot.url).netloc or 'unknown' - date = snapshot.created_at.strftime('%Y%m%d') - - return ( - CONSTANTS.ARCHIVE_DIR / 'by_date' / date / domain / - str(snapshot.id) / snapshot_file.extractor / snapshot_file.relative_path - ) - - -class ByUserView: - """View: /archive/by_user/{username}/{YYYYMMDD}/{domain}/{snapshot_id}/{extractor}/{filename}""" - - def get_view_path(self, snapshot_file: SnapshotFile) -> Path: - snapshot = snapshot_file.snapshot - user = snapshot.created_by.username - domain = urlparse(snapshot.url).netloc or 'unknown' - date = snapshot.created_at.strftime('%Y%m%d') - - return ( - CONSTANTS.ARCHIVE_DIR / 'by_user' / user / date / domain / - str(snapshot.id) / snapshot_file.extractor / snapshot_file.relative_path - ) - - -class LegacyTimestampView: - """View: /archive/by_timestamp/{timestamp}/{extractor}/{filename}""" - - def get_view_path(self, snapshot_file: SnapshotFile) -> Path: - snapshot = snapshot_file.snapshot - - return ( - CONSTANTS.ARCHIVE_DIR / 'by_timestamp' / snapshot.timestamp / - snapshot_file.extractor / snapshot_file.relative_path - ) - - -class ViewManager: - """Manages symlink farm views""" - - VIEWS = { - 'by_domain': ByDomainView(), - 'by_date': ByDateView(), - 'by_user': ByUserView(), - 'by_timestamp': LegacyTimestampView(), - } - - @staticmethod - def get_cas_path(blob: Blob) -> Path: - """Get the CAS storage path for a blob""" - h = blob.hash - return ( - CONSTANTS.DATA_DIR / 'cas' / blob.hash_algorithm / - h[:2] / h[2:4] / h - ) - - @staticmethod - def create_symlinks(snapshot_file: SnapshotFile, views: list[str] = None) -> dict[str, Path]: - """ - Create symlinks for all views of a file. - If any operation fails, all are rolled back. 
- """ - from config.common import STORAGE_CONFIG - - if views is None: - views = STORAGE_CONFIG.ENABLED_VIEWS - - cas_path = ViewManager.get_cas_path(snapshot_file.blob) - - # Verify CAS file exists before creating symlinks - if not cas_path.exists(): - raise FileNotFoundError(f"CAS file missing: {cas_path}") - - created = {} - cleanup_on_error = [] - - try: - for view_name in views: - if view_name not in ViewManager.VIEWS: - continue - - view = ViewManager.VIEWS[view_name] - view_path = view.get_view_path(snapshot_file) - - # Create parent directory - view_path.parent.mkdir(parents=True, exist_ok=True) - - # Create relative symlink (more portable) - rel_target = os.path.relpath(cas_path, view_path.parent) - - # Remove existing symlink/file if present - if view_path.exists() or view_path.is_symlink(): - view_path.unlink() - - # Create symlink - view_path.symlink_to(rel_target) - created[view_name] = view_path - cleanup_on_error.append(view_path) - - return created - - except Exception as e: - # Rollback: Remove partially created symlinks - for path in cleanup_on_error: - try: - if path.exists() or path.is_symlink(): - path.unlink() - except Exception as cleanup_error: - logger.error(f"Failed to cleanup {path}: {cleanup_error}") - - raise Exception(f"Failed to create symlinks: {e}") - - @staticmethod - def create_symlinks_idempotent(snapshot_file: SnapshotFile, views: list[str] = None): - """ - Idempotent version - safe to call multiple times. - Returns dict of created symlinks, or empty dict if already correct. - """ - from config.common import STORAGE_CONFIG - - if views is None: - views = STORAGE_CONFIG.ENABLED_VIEWS - - cas_path = ViewManager.get_cas_path(snapshot_file.blob) - needs_update = False - - # Check if all symlinks exist and point to correct target - for view_name in views: - if view_name not in ViewManager.VIEWS: - continue - - view = ViewManager.VIEWS[view_name] - view_path = view.get_view_path(snapshot_file) - - if not view_path.is_symlink(): - needs_update = True - break - - # Check if symlink points to correct target - try: - current_target = view_path.resolve() - if current_target != cas_path: - needs_update = True - break - except Exception: - needs_update = True - break - - if needs_update: - return ViewManager.create_symlinks(snapshot_file, views) - - return {} # Already correct - - @staticmethod - def cleanup_symlinks(snapshot_file: SnapshotFile): - """Remove all symlinks for a file""" - from config.common import STORAGE_CONFIG - - for view_name in STORAGE_CONFIG.ENABLED_VIEWS: - if view_name not in ViewManager.VIEWS: - continue - - view = ViewManager.VIEWS[view_name] - view_path = view.get_view_path(snapshot_file) - - if view_path.exists() or view_path.is_symlink(): - view_path.unlink() - logger.info(f"Removed symlink: {view_path}") -``` - -## Automatic Synchronization - -### Django Signals for Sync - -```python -# archivebox/storage/signals.py - -from django.db.models.signals import post_save, post_delete, pre_delete -from django.dispatch import receiver -from django.db import transaction -from core.models import SnapshotFile, Blob -import logging - -logger = logging.getLogger(__name__) - - -@receiver(post_save, sender=SnapshotFile) -def sync_symlinks_on_save(sender, instance, created, **kwargs): - """ - Automatically create/update symlinks when SnapshotFile is saved. - Runs AFTER transaction commit to ensure DB consistency. 
- """ - from config.common import STORAGE_CONFIG - - if not STORAGE_CONFIG.AUTO_SYNC_SYMLINKS: - return - - if created: - # New file - create all symlinks - try: - from storage.views import ViewManager - views = ViewManager.create_symlinks(instance) - logger.info(f"Created {len(views)} symlinks for {instance.relative_path}") - except Exception as e: - logger.error(f"Failed to create symlinks for {instance.id}: {e}") - # Don't fail the transaction - can be repaired later - - -@receiver(pre_delete, sender=SnapshotFile) -def sync_symlinks_on_delete(sender, instance, **kwargs): - """ - Remove symlinks when SnapshotFile is deleted. - Runs BEFORE deletion so we still have the data. - """ - try: - from storage.views import ViewManager - ViewManager.cleanup_symlinks(instance) - logger.info(f"Removed symlinks for {instance.relative_path}") - except Exception as e: - logger.error(f"Failed to remove symlinks for {instance.id}: {e}") - - -@receiver(post_delete, sender=SnapshotFile) -def cleanup_unreferenced_blob(sender, instance, **kwargs): - """ - Decrement blob reference count and cleanup if no longer referenced. - """ - try: - blob = instance.blob - - # Atomic decrement - from django.db.models import F - Blob.objects.filter(pk=blob.pk).update(ref_count=F('ref_count') - 1) - - # Reload to get updated count - blob.refresh_from_db() - - # Garbage collect if no more references - if blob.ref_count <= 0: - from storage.views import ViewManager - cas_path = ViewManager.get_cas_path(blob) - - if cas_path.exists(): - cas_path.unlink() - logger.info(f"Garbage collected blob {blob.hash[:16]}...") - - blob.delete() - - except Exception as e: - logger.error(f"Failed to cleanup blob: {e}") -``` - -### App Configuration - -```python -# archivebox/storage/apps.py - -from django.apps import AppConfig - -class StorageConfig(AppConfig): - default_auto_field = 'django.db.models.BigAutoField' - name = 'storage' - - def ready(self): - import storage.signals # Register signal handlers -``` - -## Migration Strategy - -### Migration Command - -```python -# archivebox/core/management/commands/migrate_to_cas.py - -from django.core.management.base import BaseCommand -from django.db.models import Q -from core.models import Snapshot -from storage.ingest import BlobManager -from storage.views import ViewManager -from pathlib import Path -import shutil - -class Command(BaseCommand): - help = 'Migrate existing archives to content-addressable storage' - - def add_arguments(self, parser): - parser.add_argument('--dry-run', action='store_true', help='Show what would be done') - parser.add_argument('--views', nargs='+', default=['by_timestamp', 'by_domain', 'by_date']) - parser.add_argument('--cleanup-legacy', action='store_true', help='Delete old files after migration') - parser.add_argument('--batch-size', type=int, default=100) - - def handle(self, *args, **options): - dry_run = options['dry_run'] - views = options['views'] - cleanup = options['cleanup_legacy'] - batch_size = options['batch_size'] - - snapshots = Snapshot.objects.all().order_by('created_at') - total = snapshots.count() - - if dry_run: - self.stdout.write(self.style.WARNING('DRY RUN - No changes will be made')) - - self.stdout.write(f"Found {total} snapshots to migrate") - - total_files = 0 - total_saved = 0 - total_bytes = 0 - error_count = 0 - - for i, snapshot in enumerate(snapshots, 1): - self.stdout.write(f"\n[{i}/{total}] Processing {snapshot.url[:60]}...") - - legacy_dir = CONSTANTS.ARCHIVE_DIR / snapshot.timestamp - - if not legacy_dir.exists(): - 
self.stdout.write(f" Skipping (no legacy dir)") - continue - - # Process each extractor directory - for extractor_dir in legacy_dir.iterdir(): - if not extractor_dir.is_dir(): - continue - - extractor = extractor_dir.name - self.stdout.write(f" Processing extractor: {extractor}") - - if dry_run: - file_count = sum(1 for _ in extractor_dir.rglob('*') if _.is_file()) - self.stdout.write(f" Would ingest {file_count} files") - continue - - # Track blobs before ingestion - blobs_before = Blob.objects.count() - - try: - # Ingest all files from this extractor - ingested = BlobManager.ingest_directory( - extractor_dir, - snapshot, - extractor - ) - - total_files += len(ingested) - - # Calculate deduplication savings - blobs_after = Blob.objects.count() - new_blobs = blobs_after - blobs_before - dedup_count = len(ingested) - new_blobs - - if dedup_count > 0: - dedup_bytes = sum(f.blob.size for f in ingested[-dedup_count:]) - total_saved += dedup_bytes - self.stdout.write( - f" ✓ Ingested {len(ingested)} files " - f"({new_blobs} new, {dedup_count} deduplicated, " - f"saved {dedup_bytes / 1024 / 1024:.1f} MB)" - ) - else: - total_bytes_added = sum(f.blob.size for f in ingested) - total_bytes += total_bytes_added - self.stdout.write( - f" ✓ Ingested {len(ingested)} files " - f"({total_bytes_added / 1024 / 1024:.1f} MB)" - ) - - except Exception as e: - error_count += 1 - self.stdout.write(self.style.ERROR(f" ✗ Error: {e}")) - continue - - # Cleanup legacy files - if cleanup and not dry_run: - try: - shutil.rmtree(legacy_dir) - self.stdout.write(f" Cleaned up legacy dir: {legacy_dir}") - except Exception as e: - self.stdout.write(self.style.WARNING(f" Failed to cleanup: {e}")) - - # Progress update - if i % 10 == 0: - self.stdout.write( - f"\nProgress: {i}/{total} | " - f"Files: {total_files:,} | " - f"Saved: {total_saved / 1024 / 1024:.1f} MB | " - f"Errors: {error_count}" - ) - - # Final summary - self.stdout.write("\n" + "="*80) - self.stdout.write(self.style.SUCCESS("Migration Complete!")) - self.stdout.write(f" Snapshots processed: {total}") - self.stdout.write(f" Files ingested: {total_files:,}") - self.stdout.write(f" Space saved by deduplication: {total_saved / 1024 / 1024:.1f} MB") - self.stdout.write(f" Errors: {error_count}") - self.stdout.write(f" Symlink views created: {', '.join(views)}") -``` - -### Rebuild Views Command - -```python -# archivebox/core/management/commands/rebuild_views.py - -from django.core.management.base import BaseCommand -from core.models import SnapshotFile -from storage.views import ViewManager -import shutil - -class Command(BaseCommand): - help = 'Rebuild symlink farm views from database' - - def add_arguments(self, parser): - parser.add_argument( - '--views', - nargs='+', - default=['by_timestamp', 'by_domain', 'by_date'], - help='Which views to rebuild' - ) - parser.add_argument( - '--clean', - action='store_true', - help='Remove old symlinks before rebuilding' - ) - - def handle(self, *args, **options): - views = options['views'] - clean = options['clean'] - - # Clean old views - if clean: - self.stdout.write("Cleaning old views...") - for view_name in views: - view_dir = CONSTANTS.ARCHIVE_DIR / view_name - if view_dir.exists(): - shutil.rmtree(view_dir) - self.stdout.write(f" Removed {view_dir}") - - # Rebuild all symlinks - total_symlinks = 0 - total_files = SnapshotFile.objects.count() - - self.stdout.write(f"Rebuilding symlinks for {total_files:,} files...") - - for i, snapshot_file in enumerate( - SnapshotFile.objects.select_related('snapshot', 'blob'), 
- 1 - ): - try: - created = ViewManager.create_symlinks(snapshot_file, views=views) - total_symlinks += len(created) - except Exception as e: - self.stdout.write(self.style.ERROR( - f"Failed to create symlinks for {snapshot_file}: {e}" - )) - - if i % 1000 == 0: - self.stdout.write(f" Created {total_symlinks:,} symlinks...") - - self.stdout.write( - self.style.SUCCESS( - f"\n✓ Rebuilt {total_symlinks:,} symlinks across {len(views)} views" - ) - ) -``` - -## Verification and Repair - -### Storage Verification Command - -```python -# archivebox/core/management/commands/verify_storage.py - -from django.core.management.base import BaseCommand -from core.models import SnapshotFile, Blob -from storage.views import ViewManager -from pathlib import Path - -class Command(BaseCommand): - help = 'Verify storage consistency between DB and filesystem' - - def add_arguments(self, parser): - parser.add_argument('--fix', action='store_true', help='Fix issues found') - parser.add_argument('--vacuum', action='store_true', help='Remove orphaned symlinks') - - def handle(self, *args, **options): - fix = options['fix'] - vacuum = options['vacuum'] - - issues = { - 'missing_cas_files': [], - 'missing_symlinks': [], - 'incorrect_symlinks': [], - 'orphaned_symlinks': [], - 'orphaned_blobs': [], - } - - self.stdout.write("Checking database → filesystem consistency...") - - # Check 1: Verify all blobs exist in CAS - self.stdout.write("\n1. Verifying CAS files...") - for blob in Blob.objects.all(): - cas_path = ViewManager.get_cas_path(blob) - if not cas_path.exists(): - issues['missing_cas_files'].append(blob) - self.stdout.write(self.style.ERROR( - f"✗ Missing CAS file: {cas_path} (blob {blob.hash[:16]}...)" - )) - - # Check 2: Verify all SnapshotFiles have correct symlinks - self.stdout.write("\n2. Verifying symlinks...") - total_files = SnapshotFile.objects.count() - - for i, sf in enumerate(SnapshotFile.objects.select_related('blob'), 1): - if i % 100 == 0: - self.stdout.write(f" Checked {i}/{total_files} files...") - - cas_path = ViewManager.get_cas_path(sf.blob) - - for view_name in STORAGE_CONFIG.ENABLED_VIEWS: - view = ViewManager.VIEWS[view_name] - view_path = view.get_view_path(sf) - - if not view_path.exists() and not view_path.is_symlink(): - issues['missing_symlinks'].append((sf, view_name, view_path)) - - if fix: - try: - ViewManager.create_symlinks_idempotent(sf, [view_name]) - self.stdout.write(self.style.SUCCESS( - f"✓ Created missing symlink: {view_path}" - )) - except Exception as e: - self.stdout.write(self.style.ERROR( - f"✗ Failed to create symlink: {e}" - )) - - elif view_path.is_symlink(): - # Verify symlink points to correct CAS file - try: - current_target = view_path.resolve() - if current_target != cas_path: - issues['incorrect_symlinks'].append((sf, view_name, view_path)) - - if fix: - ViewManager.create_symlinks_idempotent(sf, [view_name]) - self.stdout.write(self.style.SUCCESS( - f"✓ Fixed incorrect symlink: {view_path}" - )) - except Exception as e: - self.stdout.write(self.style.ERROR( - f"✗ Broken symlink: {view_path} - {e}" - )) - - # Check 3: Find orphaned symlinks - if vacuum: - self.stdout.write("\n3. 
Checking for orphaned symlinks...") - - # Get all valid view paths from DB - valid_paths = set() - for sf in SnapshotFile.objects.all(): - for view_name in STORAGE_CONFIG.ENABLED_VIEWS: - view = ViewManager.VIEWS[view_name] - valid_paths.add(view.get_view_path(sf)) - - # Scan filesystem for symlinks - for view_name in STORAGE_CONFIG.ENABLED_VIEWS: - view_base = CONSTANTS.ARCHIVE_DIR / view_name - if not view_base.exists(): - continue - - for path in view_base.rglob('*'): - if path.is_symlink() and path not in valid_paths: - issues['orphaned_symlinks'].append(path) - - if fix: - path.unlink() - self.stdout.write(self.style.SUCCESS( - f"✓ Removed orphaned symlink: {path}" - )) - - # Check 4: Find orphaned blobs - self.stdout.write("\n4. Checking for orphaned blobs...") - orphaned_blobs = Blob.objects.filter(ref_count=0) - - for blob in orphaned_blobs: - issues['orphaned_blobs'].append(blob) - - if fix: - cas_path = ViewManager.get_cas_path(blob) - if cas_path.exists(): - cas_path.unlink() - blob.delete() - self.stdout.write(self.style.SUCCESS( - f"✓ Removed orphaned blob: {blob.hash[:16]}..." - )) - - # Summary - self.stdout.write("\n" + "="*80) - self.stdout.write(self.style.WARNING("Storage Verification Summary:")) - self.stdout.write(f" Missing CAS files: {len(issues['missing_cas_files'])}") - self.stdout.write(f" Missing symlinks: {len(issues['missing_symlinks'])}") - self.stdout.write(f" Incorrect symlinks: {len(issues['incorrect_symlinks'])}") - self.stdout.write(f" Orphaned symlinks: {len(issues['orphaned_symlinks'])}") - self.stdout.write(f" Orphaned blobs: {len(issues['orphaned_blobs'])}") - - total_issues = sum(len(v) for v in issues.values()) - - if total_issues == 0: - self.stdout.write(self.style.SUCCESS("\n✓ Storage is consistent!")) - elif fix: - self.stdout.write(self.style.SUCCESS(f"\n✓ Fixed {total_issues} issues")) - else: - self.stdout.write(self.style.WARNING( - f"\n⚠ Found {total_issues} issues. Run with --fix to repair." 
- )) -``` - -## Configuration - -```python -# archivebox/config/common.py - -class StorageConfig(BaseConfigSet): - toml_section_header: str = "STORAGE_CONFIG" - - # Existing fields - TMP_DIR: Path = Field(default=CONSTANTS.DEFAULT_TMP_DIR) - LIB_DIR: Path = Field(default=CONSTANTS.DEFAULT_LIB_DIR) - OUTPUT_PERMISSIONS: str = Field(default="644") - RESTRICT_FILE_NAMES: str = Field(default="windows") - ENFORCE_ATOMIC_WRITES: bool = Field(default=True) - DIR_OUTPUT_PERMISSIONS: str = Field(default="755") - - # New CAS fields - USE_CAS: bool = Field( - default=True, - description="Use content-addressable storage with deduplication" - ) - - ENABLED_VIEWS: list[str] = Field( - default=['by_timestamp', 'by_domain', 'by_date'], - description="Which symlink farm views to maintain" - ) - - AUTO_SYNC_SYMLINKS: bool = Field( - default=True, - description="Automatically create/update symlinks via signals" - ) - - VERIFY_ON_STARTUP: bool = Field( - default=False, - description="Verify storage consistency on startup" - ) - - VERIFY_INTERVAL_HOURS: int = Field( - default=24, - description="Run periodic storage verification (0 to disable)" - ) - - CLEANUP_TEMP_FILES: bool = Field( - default=True, - description="Remove temporary extractor files after ingestion" - ) - - CAS_BACKEND: str = Field( - default='local', - choices=['local', 's3', 'azure', 'gcs'], - description="Storage backend for CAS blobs" - ) -``` - -## Workflow Examples - -### Example 1: Normal Operation - -```python -# Extractor writes files to temporary directory -extractor_dir = Path('/tmp/wget-output') - -# After extraction completes, ingest into CAS -from storage.ingest import BlobManager - -ingested_files = BlobManager.ingest_directory( - extractor_dir, - snapshot, - 'wget' -) - -# Behind the scenes: -# 1. Each file hashed (SHA-256) -# 2. Blob created/found in DB (deduplication) -# 3. File stored in CAS (if new) -# 4. SnapshotFile created in DB -# 5. post_save signal fires -# 6. Symlinks automatically created in all enabled views -# ✓ DB and filesystem in perfect sync -``` - -### Example 2: Browse Archives - -```bash -# User can browse in multiple ways: - -# By domain (great for site collections) -$ ls /data/archive/by_domain/example.com/20241225/ -019b54ee-28d9-72dc/ - -# By date (great for time-based browsing) -$ ls /data/archive/by_date/20241225/ -example.com/ -github.com/ -wikipedia.org/ - -# By user (great for multi-user setups) -$ ls /data/archive/by_user/squash/20241225/ -example.com/ -github.com/ - -# Legacy timestamp (backwards compatibility) -$ ls /data/archive/by_timestamp/1735142400.123/ -wget/ -singlefile/ -screenshot/ -``` - -### Example 3: Crash Recovery - -```python -# System crashes after DB save but before symlinks created -# - DB has SnapshotFile record ✓ -# - Symlinks missing ✗ - -# Next verification run: -$ python -m archivebox verify_storage --fix - -# Output: -# Checking database → filesystem consistency... -# ✗ Missing symlink: /data/archive/by_domain/example.com/.../index.html -# ✓ Created missing symlink -# ✓ Fixed 1 issues - -# Storage is now consistent! -``` - -### Example 4: Migration from Legacy - -```bash -# Migrate all existing archives to CAS -$ python -m archivebox migrate_to_cas --dry-run - -# Output: -# DRY RUN - No changes will be made -# Found 1000 snapshots to migrate -# [1/1000] Processing https://example.com... -# Would ingest wget: 15 files -# Would ingest singlefile: 1 file -# ... 
- -# Run actual migration -$ python -m archivebox migrate_to_cas - -# Output: -# [1/1000] Processing https://example.com... -# ✓ Ingested 15 files (3 new, 12 deduplicated, saved 2.4 MB) -# ... -# Migration Complete! -# Snapshots processed: 1000 -# Files ingested: 45,231 -# Space saved by deduplication: 12.3 GB -``` - -## Benefits - -### Space Savings -- **Massive deduplication**: Common files (jquery, fonts, images) stored once -- **30-70% typical savings** across archives -- **Symlink overhead**: ~0.1% of saved space (negligible) - -### Flexibility -- **Multiple views**: Browse by domain, date, user, timestamp -- **Add views anytime**: Run `rebuild_views` to add new organization -- **No data migration needed**: Just rebuild symlinks - -### S3 Support -- **Use django-storages**: Drop-in S3, Azure, GCS support -- **Hybrid mode**: Hot data local, cold data in S3 -- **Cost optimization**: S3 Intelligent Tiering for automatic cost reduction - -### Data Integrity -- **Database as truth**: Symlinks are disposable, can be rebuilt -- **Automatic sync**: Signals keep symlinks current -- **Self-healing**: Verification detects and fixes drift -- **Atomic operations**: Transaction-safe - -### Backwards Compatibility -- **Legacy view**: `by_timestamp` maintains old structure -- **Gradual migration**: Old and new archives coexist -- **Zero downtime**: Archives keep working during migration - -### Developer Experience -- **Human-browseable**: Easy to inspect and debug -- **Standard tools work**: cp, rsync, tar, zip all work normally -- **Multiple organization schemes**: Find archives multiple ways -- **Easy backups**: Symlinks handled correctly by modern tools - -## Implementation Checklist - -- [ ] Create database models (Blob, SnapshotFile) -- [ ] Create migrations for new models -- [ ] Implement BlobManager (ingest.py) -- [ ] Implement ViewManager (views.py) -- [ ] Implement Django signals (signals.py) -- [ ] Create migrate_to_cas command -- [ ] Create rebuild_views command -- [ ] Create verify_storage command -- [ ] Update Snapshot.output_dir property -- [ ] Update ArchiveResult to use SnapshotFile -- [ ] Add StorageConfig settings -- [ ] Configure django-storages -- [ ] Test with local filesystem -- [ ] Test with S3 -- [ ] Document for users -- [ ] Update backup procedures - -## Future Enhancements - -- [ ] Web UI for browsing CAS blobs -- [ ] API endpoints for file access -- [ ] Content-aware compression (compress similar files together) -- [ ] IPFS backend support -- [ ] Automatic tiering (hot → warm → cold → glacier) -- [ ] Deduplication statistics dashboard -- [ ] Export to WARC with CAS metadata diff --git a/archivebox/BACKGROUND_HOOKS_IMPLEMENTATION_PLAN.md b/archivebox/BACKGROUND_HOOKS_IMPLEMENTATION_PLAN.md index 16c9d467..b58192a9 100644 --- a/archivebox/BACKGROUND_HOOKS_IMPLEMENTATION_PLAN.md +++ b/archivebox/BACKGROUND_HOOKS_IMPLEMENTATION_PLAN.md @@ -6,13 +6,33 @@ This plan implements support for long-running background hooks that run concurre **Key Changes:** - Background hooks use `.bg.js`/`.bg.py`/`.bg.sh` suffix -- Runner hashes files and creates ArchiveFile records for tracking -- Filesystem-level deduplication (fdupes, ZFS, Btrfs) handles space savings -- Hooks emit single JSON output with optional structured data +- Hooks output **JSONL** (any line with `{type: 'ModelName', ...}`) +- `run_hook()` is **generic** - just parses JSONL, doesn't know about specific models +- Each `Model.run()` extends records of its own type with computed fields +- ArchiveResult.run() extends 
ArchiveResult records with `output_files`, `output_size`, etc. +- **No HookResult TypedDict** - just list of dicts with 'type' field - Binary FK is optional and only set when hook reports cmd -- Split `output` field into `output_str` (human-readable) and `output_data` (structured) -- Use ArchiveFile model (FK to ArchiveResult) instead of JSON fields for file tracking -- Output stats (size, mimetypes) derived via properties from ArchiveFile queries +- Split `output` field into `output_str` (human-readable) and `output_json` (structured) +- Add fields: `output_files` (dict), `output_size` (bytes), `output_mimetypes` (CSV) +- External tools (fdupes, ZFS, Btrfs) handle deduplication via filesystem + +**New ArchiveResult Fields:** +```python +# Output fields (replace old 'output' field) +output_str = TextField() # Human-readable summary: "Downloaded 5 files" +output_json = JSONField() # Structured metadata (headers, redirects, etc.) +output_files = JSONField() # Dict: {'index.html': {}, 'style.css': {}} +output_size = BigIntegerField() # Total bytes across all files +output_mimetypes = CharField() # CSV sorted by size: "text/html,text/css,image/png" +``` + +**output_files Structure:** +- **Dict keyed by relative path** (not a list!) +- Values are empty dicts `{}` for now, extensible for future metadata +- Preserves insertion order (Python 3.7+) +- Easy to query: `ArchiveResult.objects.filter(output_files__has_key='index.html')` +- Easy to extend: Add `size`, `hash`, `mime_type` to values later without migration +- **Why not derive size/mimetypes from output_files?** Performance. Total size and mimetype summary are accessed frequently (admin views, sorting, filtering). Aggregating on every access would be slow. We keep summary fields (output_size, output_mimetypes) as denormalized cache for fast reads. --- @@ -32,21 +52,51 @@ class Migration(migrations.Migration): ] operations = [ - # Rename output → output_str for clarity - migrations.RenameField( - model_name='archiveresult', - old_name='output', - new_name='output_str', - ), - - # Add structured metadata field + # Add new fields (keep old 'output' temporarily for migration) migrations.AddField( model_name='archiveresult', - name='output_data', + name='output_str', + field=models.TextField( + blank=True, + help_text='Human-readable output summary (e.g., "Downloaded 5 files")' + ), + ), + + migrations.AddField( + model_name='archiveresult', + name='output_json', field=models.JSONField( null=True, blank=True, - help_text='Structured metadata from hook (headers, redirects, etc.)' + help_text='Structured metadata (headers, redirects, etc.) - should NOT duplicate ArchiveResult fields' + ), + ), + + migrations.AddField( + model_name='archiveresult', + name='output_files', + field=models.JSONField( + default=dict, + help_text='Dict of {relative_path: {metadata}} - values are empty dicts for now, extensible for future metadata' + ), + ), + + migrations.AddField( + model_name='archiveresult', + name='output_size', + field=models.BigIntegerField( + default=0, + help_text='Total recursive size in bytes of all output files' + ), + ), + + migrations.AddField( + model_name='archiveresult', + name='output_mimetypes', + field=models.CharField( + max_length=512, + blank=True, + help_text='CSV of mimetypes sorted by size descending' ), ), @@ -65,69 +115,74 @@ class Migration(migrations.Migration): ] ``` -### ArchiveFile Model - -Instead of storing file lists and stats as JSON fields on ArchiveResult, we use a normalized model that tracks files with hashes. 
Deduplication is handled at the filesystem level (fdupes, ZFS, Btrfs, etc.): +### Data Migration for Existing `.output` Field ```python -# archivebox/core/models.py +# archivebox/core/migrations/00XX_migrate_output_field.py -class ArchiveFile(models.Model): +from django.db import migrations +import json + +def migrate_output_field(apps, schema_editor): """ - Track files produced by an ArchiveResult with hash for integrity checking. + Migrate existing 'output' field to new split fields. - Files remain in their natural filesystem hierarchy. Deduplication is handled - by the filesystem layer (hardlinks via fdupes, ZFS dedup, Btrfs dedup, etc.). + Logic: + - If output contains JSON {...}, move to output_json + - If output is a file path and exists in output_files, ensure it's first + - Otherwise, move to output_str """ - archiveresult = models.ForeignKey( - 'ArchiveResult', - on_delete=models.CASCADE, - related_name='files' - ) + ArchiveResult = apps.get_model('core', 'ArchiveResult') - # Path relative to ArchiveResult output directory - relative_path = models.CharField( - max_length=512, - help_text='Path relative to extractor output dir (e.g., "index.html", "responses/all/file.js")' - ) + for ar in ArchiveResult.objects.all(): + old_output = ar.output or '' - # Hash for integrity checking and duplicate detection - hash_algorithm = models.CharField(max_length=16, default='sha256') - hash = models.CharField( - max_length=128, - db_index=True, - help_text='SHA-256 hash for integrity and finding duplicates' - ) + # Case 1: JSON output + if old_output.strip().startswith('{'): + try: + parsed = json.loads(old_output) + ar.output_json = parsed + ar.output_str = '' + except json.JSONDecodeError: + # Not valid JSON, treat as string + ar.output_str = old_output - # Cached filesystem stats - size = models.BigIntegerField(help_text='File size in bytes') - mime_type = models.CharField(max_length=128, blank=True) + # Case 2: File path (check if it looks like a relative path) + elif '/' in old_output or '.' 
in old_output: + # Might be a file path - if it's in output_files, it's already there + # output_files is now a dict, so no reordering needed + ar.output_str = old_output # Keep as string for display - created_at = models.DateTimeField(auto_now_add=True) + # Case 3: Plain string summary + else: + ar.output_str = old_output - class Meta: - indexes = [ - models.Index(fields=['archiveresult']), - models.Index(fields=['hash']), # Find duplicates across archive - ] - unique_together = [['archiveresult', 'relative_path']] + ar.save(update_fields=['output_str', 'output_json', 'output_files']) - def __str__(self): - return f"{self.archiveresult.extractor}/{self.relative_path}" +def reverse_migrate(apps, schema_editor): + """Reverse migration - copy output_str back to output.""" + ArchiveResult = apps.get_model('core', 'ArchiveResult') - @property - def absolute_path(self) -> Path: - """Get absolute filesystem path.""" - return Path(self.archiveresult.pwd) / self.relative_path + for ar in ArchiveResult.objects.all(): + ar.output = ar.output_str or '' + ar.save(update_fields=['output']) + +class Migration(migrations.Migration): + dependencies = [ + ('core', '00XX_archiveresult_background_hooks'), + ] + + operations = [ + migrations.RunPython(migrate_output_field, reverse_migrate), + + # Now safe to remove old 'output' field + migrations.RemoveField( + model_name='archiveresult', + name='output', + ), + ] ``` -**Benefits:** -- **Simple**: Single model, no CAS abstraction needed -- **Natural hierarchy**: Files stay in `snapshot_dir/extractor/file.html` -- **Flexible deduplication**: User chooses filesystem-level strategy -- **Easy browsing**: Directory structure matches logical organization -- **Integrity checking**: Hashes verify file integrity over time -- **Duplicate detection**: Query by hash to find duplicates for manual review --- @@ -137,8 +192,10 @@ class ArchiveFile(models.Model): **Contract:** - Hook emits ONE JSON object with `type: 'ArchiveResult'` -- Hook only provides: `status`, `output` (human-readable), optional `output_data`, optional `cmd` -- Runner calculates: `output_size`, `output_mimetypes`, `start_ts`, `end_ts`, `binary` FK +- Hook can provide: `status`, `output_str`, `output_json`, `cmd` (optional) +- Hook should NOT set: `output_files`, `output_size`, `output_mimetypes` (runner calculates these) +- `output_json` should NOT duplicate ArchiveResult fields (no `status`, `start_ts`, etc. in output_json) +- Runner calculates: `output_files`, `output_size`, `output_mimetypes`, `start_ts`, `end_ts`, `binary` FK **Example outputs:** @@ -147,16 +204,15 @@ class ArchiveFile(models.Model): console.log(JSON.stringify({ type: 'ArchiveResult', status: 'succeeded', - output: 'Downloaded index.html (4.2 KB)' + output_str: 'Downloaded index.html (4.2 KB)' })); -// With structured metadata +// With structured metadata (headers, redirects, etc.) 
console.log(JSON.stringify({ type: 'ArchiveResult', status: 'succeeded', - output: 'Archived https://example.com', - output_data: { - files: ['index.html', 'style.css', 'script.js'], + output_str: 'Archived https://example.com', + output_json: { headers: {'content-type': 'text/html', 'server': 'nginx'}, redirects: [{from: 'http://example.com', to: 'https://example.com'}] } @@ -166,7 +222,7 @@ console.log(JSON.stringify({ console.log(JSON.stringify({ type: 'ArchiveResult', status: 'succeeded', - output: 'Archived with wget', + output_str: 'Archived with wget', cmd: ['wget', '-p', '-k', 'https://example.com'] })); @@ -174,34 +230,110 @@ console.log(JSON.stringify({ console.log(JSON.stringify({ type: 'ArchiveResult', status: 'succeeded', - output_data: { + output_json: { title: 'My Page Title', charset: 'UTF-8' } })); + +// BAD: Don't duplicate ArchiveResult fields in output_json +console.log(JSON.stringify({ + type: 'ArchiveResult', + status: 'succeeded', + output_json: { + status: 'succeeded', // ❌ BAD - duplicates ArchiveResult.status + output_files: ['index.html'], // ❌ BAD - runner calculates this + custom_data: 'ok' // ✅ GOOD - custom fields only + } +})); ``` --- -## Phase 3: Update HookResult TypedDict +## Phase 3: run_hook() is Generic (No HookResult TypedDict) + +`run_hook()` is a generic JSONL parser - it doesn't know about ArchiveResult, InstalledBinary, or any specific model. It just: +1. Executes the hook script +2. Parses JSONL output (any line starting with `{` that has a `type` field) +3. Adds metadata about plugin and hook path +4. Returns list of dicts ```python # archivebox/hooks.py -class HookResult(TypedDict): - """Result from executing a hook script.""" - returncode: int # Process exit code - stdout: str # Full stdout from hook - stderr: str # Full stderr from hook - output_json: Optional[dict] # Parsed JSON output from hook - start_ts: str # ISO timestamp (calculated by runner) - end_ts: str # ISO timestamp (calculated by runner) - cmd: List[str] # Command that ran (from hook or fallback) - binary_id: Optional[str] # FK to InstalledBinary (optional) - hook: str # Path to hook script +def run_hook( + script: Path, + output_dir: Path, + timeout: int = 300, + config_objects: Optional[List[Any]] = None, + **kwargs: Any +) -> Optional[List[dict]]: + """ + Execute a hook script and parse JSONL output. + + This function is generic and doesn't know about specific model types. + It just executes the script and parses any JSONL lines with 'type' field. + + Each Model.run() method handles its own record types differently: + - ArchiveResult.run() extends ArchiveResult records with computed fields + - Machine.run() creates InstalledBinary records from hook output + - etc. + + Returns: + List of dicts with 'type' field, each extended with metadata: + [ + { + 'type': 'ArchiveResult', + 'status': 'succeeded', + 'output_str': '...', + 'plugin': 'wget', + 'plugin_hook': 'archivebox/plugins/wget/on_Snapshot__21_wget.py', + # ... other hook-reported fields + }, + { + 'type': 'InstalledBinary', + 'name': 'wget', + 'plugin': 'wget', + 'plugin_hook': 'archivebox/plugins/wget/on_Snapshot__21_wget.py', + # ... other hook-reported fields + } + ] + + None if background hook (still running) + """ ``` -**Note:** `output_files`, `output_size`, and `output_mimetypes` are no longer in HookResult. Instead, the runner hashes files and creates ArchiveFile records. Stats are derived via properties on ArchiveResult. +**Key Insight:** Hooks output JSONL. 
Any line with `{type: 'ModelName', ...}` creates/updates that model. The `type` field determines what gets created. Each Model.run() method decides how to handle records of its own type. + +### Helper: create_model_record() + +```python +# archivebox/hooks.py + +def create_model_record(record: dict) -> Any: + """ + Generic helper to create/update model instances from hook output. + + Args: + record: Dict with 'type' field and model data + + Returns: + Created/updated model instance + """ + from machine.models import InstalledBinary, Dependency + + model_type = record.pop('type') + + if model_type == 'InstalledBinary': + obj, created = InstalledBinary.objects.get_or_create(**record) + return obj + elif model_type == 'Dependency': + obj, created = Dependency.objects.get_or_create(**record) + return obj + # Add more types as needed + else: + raise ValueError(f"Unknown record type: {model_type}") +``` --- @@ -248,44 +380,28 @@ def find_binary_for_cmd(cmd: List[str], machine_id: str) -> Optional[str]: return str(binary.id) if binary else None -def parse_hook_output_json(stdout: str) -> Optional[dict]: - """ - Parse single JSON output from hook stdout. - - Looks for first line with {type: 'ArchiveResult', ...} - """ - for line in stdout.splitlines(): - line = line.strip() - if not line: - continue - try: - data = json.loads(line) - if data.get('type') == 'ArchiveResult': - return data # Return first match - except json.JSONDecodeError: - continue - return None - - def run_hook( script: Path, output_dir: Path, timeout: int = 300, config_objects: Optional[List[Any]] = None, **kwargs: Any -) -> Optional[HookResult]: +) -> Optional[List[dict]]: """ - Execute a hook script and capture results. + Execute a hook script and parse JSONL output. + + This is a GENERIC function that doesn't know about specific model types. + It just executes and parses JSONL (any line with {type: 'ModelName', ...}). Runner responsibilities: - Detect background hooks (.bg. 
in filename) - Capture stdout/stderr to log files - - Return result (caller will hash files and create ArchiveFile records) - - Determine binary FK from cmd (optional) + - Parse JSONL output and add plugin metadata - Clean up log files and PID files Hook responsibilities: - - Emit {type: 'ArchiveResult', status, output_str, output_data (optional), cmd (optional)} + - Emit JSONL: {type: 'ArchiveResult', status, output_str, output_json, cmd} + - Can emit multiple types: {type: 'InstalledBinary', ...} - Write actual output files Args: @@ -296,7 +412,7 @@ def run_hook( **kwargs: CLI arguments passed to script Returns: - HookResult for foreground hooks + List of dicts with 'type' field for foreground hooks None for background hooks (still running) """ import time @@ -390,30 +506,24 @@ def run_hook( stdout = stdout_file.read_text() if stdout_file.exists() else '' stderr = stderr_file.read_text() if stderr_file.exists() else '' - # Parse single JSON output - output_json = parse_hook_output_json(stdout) + # Parse ALL JSONL output (any line with {type: 'ModelName', ...}) + records = [] + for line in stdout.splitlines(): + line = line.strip() + if not line or not line.startswith('{'): + continue + try: + data = json.loads(line) + if 'type' in data: + # Add plugin metadata to every record + plugin_name = script.parent.name # Directory name (e.g., 'wget') + data['plugin'] = plugin_name + data['plugin_hook'] = str(script.relative_to(Path.cwd())) + records.append(data) + except json.JSONDecodeError: + continue - # Get cmd - prefer hook's reported cmd, fallback to interpreter cmd - if output_json and output_json.get('cmd'): - result_cmd = output_json['cmd'] - else: - result_cmd = full_cmd - - # 7. DETERMINE BINARY FK (OPTIONAL) - # Only set if hook reports cmd AND we can find the binary - machine = Machine.current() - binary_id = None - if output_json and output_json.get('cmd'): - binary_id = find_binary_for_cmd(output_json['cmd'], machine.id) - # If not found or not reported, leave binary_id=None - - # 8. INGEST OUTPUT FILES VIA BLOBMANAGER - # BlobManager handles hashing, deduplication, and creating SnapshotFile records - # Note: This assumes snapshot and extractor name are available in kwargs - # In practice, ArchiveResult.run() will handle this after run_hook() returns - # For now, we just return the result and let the caller handle ingestion - - # 9. CLEANUP + # 7. CLEANUP # Delete empty logs (keep non-empty for debugging) if stdout_file.exists() and stdout_file.stat().st_size == 0: stdout_file.unlink() @@ -425,32 +535,13 @@ def run_hook( for pf in output_dir.glob('*.pid'): pf.unlink(missing_ok=True) - # 10. RETURN RESULT - return HookResult( - returncode=returncode, - stdout=stdout, - stderr=stderr, - output_json=output_json, - start_ts=start_ts.isoformat(), - end_ts=end_ts.isoformat(), - cmd=result_cmd, - binary_id=binary_id, - hook=str(script), - ) + # 8. 
RETURN RECORDS + # Returns list of dicts, each with 'type' field and plugin metadata + return records except Exception as e: - duration_ms = int((time.time() - start_time) * 1000) - return HookResult( - returncode=-1, - stdout='', - stderr=f'Failed to run hook: {type(e).__name__}: {e}', - output_json=None, - start_ts=start_ts.isoformat(), - end_ts=datetime.now(timezone.utc).isoformat(), - cmd=full_cmd, - binary_id=None, - hook=str(script), - ) + # On error, return empty list (hook failed, no records created) + return [] ``` --- @@ -466,10 +557,13 @@ def run(self): For foreground hooks: Waits for completion and updates immediately For background hooks: Returns immediately, leaves status='started' + + This method extends any ArchiveResult records from hook output with + computed fields (output_files, output_size, binary FK, etc.). """ from django.utils import timezone from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR, run_hook - import dateutil.parser + from machine.models import Machine config_objects = [self.snapshot.crawl, self.snapshot] if self.snapshot.crawl else [self.snapshot] @@ -494,8 +588,10 @@ def run(self): plugin_name = hook.parent.name extractor_dir = Path(self.snapshot.output_dir) / plugin_name - # Run the hook - result = run_hook( + start_ts = timezone.now() + + # Run the hook (returns list of JSONL records) + records = run_hook( hook, output_dir=extractor_dir, config_objects=config_objects, @@ -504,64 +600,66 @@ def run(self): ) # BACKGROUND HOOK - still running - if result is None: + if records is None: self.status = self.StatusChoices.STARTED - self.start_ts = timezone.now() + self.start_ts = start_ts self.pwd = str(extractor_dir) self.save() return - # FOREGROUND HOOK - process result - if result['output_json']: - # Hook emitted JSON output - output_json = result['output_json'] + # FOREGROUND HOOK - process records + end_ts = timezone.now() - # Determine status - status = output_json.get('status', 'failed') + # Find the ArchiveResult record (enforce single output) + ar_records = [r for r in records if r.get('type') == 'ArchiveResult'] + assert len(ar_records) <= 1, f"Hook {hook} output {len(ar_records)} ArchiveResults, expected 0-1" + + if ar_records: + hook_data = ar_records[0] + + # Apply hook's data + status_str = hook_data.get('status', 'failed') status_map = { 'succeeded': self.StatusChoices.SUCCEEDED, 'failed': self.StatusChoices.FAILED, 'skipped': self.StatusChoices.SKIPPED, } - self.status = status_map.get(status, self.StatusChoices.FAILED) + self.status = status_map.get(status_str, self.StatusChoices.FAILED) - # Set output fields - self.output_str = output_json.get('output', '') - if 'output_data' in output_json: - self.output_data = output_json['output_data'] + self.output_str = hook_data.get('output_str', '') + self.output_json = hook_data.get('output_json') + + # Set extractor from plugin metadata + self.extractor = hook_data['plugin'] + + # Determine binary FK from cmd (ArchiveResult-specific logic) + if 'cmd' in hook_data: + self.cmd = json.dumps(hook_data['cmd']) + machine = Machine.current() + binary_id = find_binary_for_cmd(hook_data['cmd'], machine.id) + if binary_id: + self.binary_id = binary_id else: - # No JSON output - determine status from exit code - self.status = (self.StatusChoices.SUCCEEDED if result['returncode'] == 0 - else self.StatusChoices.FAILED) - self.output_str = result['stdout'][:1024] or result['stderr'][:1024] + # No ArchiveResult output - hook didn't report, treat as failed + self.status = self.StatusChoices.FAILED 
+ self.output_str = 'Hook did not output ArchiveResult' - # Set timestamps (from runner) - self.start_ts = dateutil.parser.parse(result['start_ts']) - self.end_ts = dateutil.parser.parse(result['end_ts']) - - # Set command and binary (from runner) - self.cmd = json.dumps(result['cmd']) - if result['binary_id']: - self.binary_id = result['binary_id'] - - # Metadata + # Set timestamps and metadata + self.start_ts = start_ts + self.end_ts = end_ts self.pwd = str(extractor_dir) self.retry_at = None + # POPULATE OUTPUT FIELDS FROM FILESYSTEM (ArchiveResult-specific) + if extractor_dir.exists(): + self._populate_output_fields(extractor_dir) + self.save() - # INGEST OUTPUT FILES VIA BLOBMANAGER - # This creates SnapshotFile records with deduplication - if extractor_dir.exists(): - from archivebox.storage import BlobManager - - snapshot_files = BlobManager.ingest_directory( - dir_path=extractor_dir, - snapshot=self.snapshot, - extractor=plugin_name, - # Exclude infrastructure files - exclude_patterns=['stdout.log', 'stderr.log', '*.pid'] - ) + # Create any side-effect records (InstalledBinary, Dependency, etc.) + for record in records: + if record['type'] != 'ArchiveResult': + create_model_record(record) # Generic helper that dispatches by type # Clean up empty output directory (no real files after excluding logs/pids) if extractor_dir.exists(): @@ -594,8 +692,114 @@ def run(self): # Trigger search indexing self.trigger_search_indexing() + + +def _populate_output_fields(self, output_dir: Path) -> None: + """ + Walk output directory and populate output_files, output_size, output_mimetypes fields. + + Args: + output_dir: Directory containing output files + """ + import mimetypes + from collections import defaultdict + + exclude_names = {'stdout.log', 'stderr.log', 'hook.pid', 'listener.pid'} + + # Track mimetypes and sizes for aggregation + mime_sizes = defaultdict(int) + total_size = 0 + output_files = {} # Dict keyed by relative path + + for file_path in output_dir.rglob('*'): + # Skip non-files and infrastructure files + if not file_path.is_file(): + continue + if file_path.name in exclude_names: + continue + + # Get file stats + stat = file_path.stat() + mime_type, _ = mimetypes.guess_type(str(file_path)) + mime_type = mime_type or 'application/octet-stream' + + # Track for ArchiveResult fields + relative_path = str(file_path.relative_to(output_dir)) + output_files[relative_path] = {} # Empty dict, extensible for future metadata + mime_sizes[mime_type] += stat.st_size + total_size += stat.st_size + + # Populate ArchiveResult fields + self.output_files = output_files # Dict preserves insertion order (Python 3.7+) + self.output_size = total_size + + # Build output_mimetypes CSV (sorted by size descending) + sorted_mimes = sorted(mime_sizes.items(), key=lambda x: x[1], reverse=True) + self.output_mimetypes = ','.join(mime for mime, _ in sorted_mimes) ``` +### Querying output_files with Django + +Since `output_files` is a dict keyed by relative path, you can use Django's JSON field lookups: + +```python +# Check if a specific file exists +ArchiveResult.objects.filter(output_files__has_key='index.html') + +# Check if any of multiple files exist (OR) +from django.db.models import Q +ArchiveResult.objects.filter( + Q(output_files__has_key='index.html') | + Q(output_files__has_key='index.htm') +) + +# Get all results that have favicon +ArchiveResult.objects.filter(output_files__has_key='favicon.ico') + +# Check in Python (after fetching) +if 'index.html' in archiveresult.output_files: + print("Found 
index.html") + +# Get list of all paths +paths = list(archiveresult.output_files.keys()) + +# Count files +file_count = len(archiveresult.output_files) + +# Future: When we add metadata, query still works +# output_files = {'index.html': {'size': 4096, 'hash': 'abc...'}} +ArchiveResult.objects.filter(output_files__index_html__size__gt=1000) # size > 1KB +``` + +**Structure for Future Extension:** + +Current (empty metadata): +```python +{ + 'index.html': {}, + 'style.css': {}, + 'images/logo.png': {} +} +``` + +Future (with optional metadata): +```python +{ + 'index.html': { + 'size': 4096, + 'hash': 'abc123...', + 'mime_type': 'text/html' + }, + 'style.css': { + 'size': 2048, + 'hash': 'def456...', + 'mime_type': 'text/css' + } +} +``` + +All existing queries continue to work unchanged - the dict structure is backward compatible. + --- ## Phase 6: Background Hook Finalization @@ -648,14 +852,13 @@ def finalize_background_hook(archiveresult: 'ArchiveResult') -> None: """ Collect final results from completed background hook. - Runner calculates all stats - hook just emits status/output/output_data. + Same logic as ArchiveResult.run() but for background hooks that already started. Args: archiveresult: ArchiveResult instance to finalize """ from django.utils import timezone from machine.models import Machine - import dateutil.parser extractor_dir = Path(archiveresult.pwd) stdout_file = extractor_dir / 'stdout.log' @@ -663,65 +866,64 @@ def finalize_background_hook(archiveresult: 'ArchiveResult') -> None: # Read logs stdout = stdout_file.read_text() if stdout_file.exists() else '' - stderr = stderr_file.read_text() if stderr_file.exists() else '' - # Parse JSON output - output_json = parse_hook_output_json(stdout) + # Parse JSONL output (same as run_hook) + records = [] + for line in stdout.splitlines(): + line = line.strip() + if not line or not line.startswith('{'): + continue + try: + data = json.loads(line) + if 'type' in data: + records.append(data) + except json.JSONDecodeError: + continue - # Determine status - if output_json: - status_str = output_json.get('status', 'failed') + # Find the ArchiveResult record + ar_records = [r for r in records if r.get('type') == 'ArchiveResult'] + assert len(ar_records) <= 1, f"Background hook output {len(ar_records)} ArchiveResults, expected 0-1" + + if ar_records: + hook_data = ar_records[0] + + # Apply hook's data + status_str = hook_data.get('status', 'failed') status_map = { 'succeeded': ArchiveResult.StatusChoices.SUCCEEDED, 'failed': ArchiveResult.StatusChoices.FAILED, 'skipped': ArchiveResult.StatusChoices.SKIPPED, } - status = status_map.get(status_str, ArchiveResult.StatusChoices.FAILED) - output_str = output_json.get('output', '') - output_data = output_json.get('output_data') + archiveresult.status = status_map.get(status_str, ArchiveResult.StatusChoices.FAILED) - # Get cmd from hook (for binary FK) - cmd = output_json.get('cmd') + archiveresult.output_str = hook_data.get('output_str', '') + archiveresult.output_json = hook_data.get('output_json') + + # Determine binary FK from cmd + if 'cmd' in hook_data: + archiveresult.cmd = json.dumps(hook_data['cmd']) + machine = Machine.current() + binary_id = find_binary_for_cmd(hook_data['cmd'], machine.id) + if binary_id: + archiveresult.binary_id = binary_id else: - # No JSON output = failed - status = ArchiveResult.StatusChoices.FAILED - output_str = stderr[:1024] if stderr else 'No output' - output_data = None - cmd = None + # No output = failed + archiveresult.status = 
ArchiveResult.StatusChoices.FAILED + archiveresult.output_str = 'Background hook did not output ArchiveResult' - # Get binary FK from hook's reported cmd (if any) - binary_id = None - if cmd: - machine = Machine.current() - binary_id = find_binary_for_cmd(cmd, machine.id) - - # Update ArchiveResult - archiveresult.status = status archiveresult.end_ts = timezone.now() - archiveresult.output_str = output_str - if output_data: - archiveresult.output_data = output_data archiveresult.retry_at = None - if binary_id: - archiveresult.binary_id = binary_id + # POPULATE OUTPUT FIELDS FROM FILESYSTEM + if extractor_dir.exists(): + archiveresult._populate_output_fields(extractor_dir) archiveresult.save() - # INGEST OUTPUT FILES VIA BLOBMANAGER - # This creates SnapshotFile records with deduplication - if extractor_dir.exists(): - from archivebox.storage import BlobManager - - # Determine extractor name from path (plugin directory name) - plugin_name = extractor_dir.name - - snapshot_files = BlobManager.ingest_directory( - dir_path=extractor_dir, - snapshot=archiveresult.snapshot, - extractor=plugin_name, - exclude_patterns=['stdout.log', 'stderr.log', '*.pid'] - ) + # Create any side-effect records + for record in records: + if record['type'] != 'ArchiveResult': + create_model_record(record) # Cleanup for pf in extractor_dir.glob('*.pid'): @@ -767,132 +969,9 @@ class SnapshotMachine(StateMachine, strict_states=True): --- -## Phase 6b: ArchiveResult Properties for Output Stats +## Phase 6b: Deduplication -Since output stats are no longer stored as fields, we expose them via properties that query SnapshotFile records: - -```python -# archivebox/core/models.py - -class ArchiveResult(models.Model): - # ... existing fields ... - - @property - def output_files(self): - """ - Get all SnapshotFile records created by this extractor. - - Returns: - QuerySet of SnapshotFile objects - """ - plugin_name = self._get_plugin_name() - return self.snapshot.files.filter(extractor=plugin_name) - - @property - def output_file_count(self) -> int: - """Count of output files.""" - return self.output_files.count() - - @property - def total_output_size(self) -> int: - """ - Total size in bytes of all output files. - - Returns: - Sum of blob sizes for this extractor's files - """ - from django.db.models import Sum - - result = self.output_files.aggregate(total=Sum('blob__size')) - return result['total'] or 0 - - @property - def output_mimetypes(self) -> str: - """ - CSV of mimetypes ordered by size descending. - - Returns: - String like "text/html,image/png,application/json" - """ - from django.db.models import Sum - from collections import OrderedDict - - # Group by mimetype and sum sizes - files = self.output_files.values('blob__mime_type').annotate( - total_size=Sum('blob__size') - ).order_by('-total_size') - - # Build CSV - mimes = [f['blob__mime_type'] for f in files] - return ','.join(mimes) - - @property - def output_summary(self) -> dict: - """ - Summary statistics for output files. - - Returns: - Dict with file count, total size, and mimetype breakdown - """ - from django.db.models import Sum, Count - - files = self.output_files.values('blob__mime_type').annotate( - count=Count('id'), - total_size=Sum('blob__size') - ).order_by('-total_size') - - return { - 'file_count': self.output_file_count, - 'total_size': self.total_output_size, - 'by_mimetype': list(files), - } - - def _get_plugin_name(self) -> str: - """ - Get plugin directory name from extractor. 
- - Returns: - Plugin name (e.g., 'wget', 'singlefile') - """ - # This assumes pwd is set to extractor_dir during run() - if self.pwd: - return Path(self.pwd).name - # Fallback: use extractor number to find plugin - # (implementation depends on how extractor names map to plugins) - return self.extractor -``` - -**Query Examples:** - -```python -# Get all files for this extractor -files = archiveresult.output_files.all() - -# Get total size -size = archiveresult.total_output_size - -# Get mimetype breakdown -summary = archiveresult.output_summary -# { -# 'file_count': 42, -# 'total_size': 1048576, -# 'by_mimetype': [ -# {'blob__mime_type': 'text/html', 'count': 5, 'total_size': 524288}, -# {'blob__mime_type': 'image/png', 'count': 30, 'total_size': 409600}, -# ... -# ] -# } - -# Admin display -print(f"{archiveresult.output_mimetypes}") # "text/html,image/png,text/css" -``` - -**Performance Considerations:** - -- Properties execute queries on access - cache results if needed -- Indexes on `(snapshot, extractor)` make queries fast -- For admin list views, use `select_related()` and `prefetch_related()` -- Consider adding `cached_property` for expensive calculations +Deduplication is handled by external filesystem tools like `fdupes` (hardlinks), ZFS dedup, Btrfs duperemove, or rdfind. Users can run these tools periodically on the archive directory to identify and link duplicate files. ArchiveBox doesn't need to track hashes or manage deduplication itself - the filesystem layer handles it transparently. --- @@ -919,8 +998,8 @@ Each hook should emit: console.log(JSON.stringify({ type: 'ArchiveResult', status: 'succeeded', // or 'failed' or 'skipped' - output: 'Captured 15 console messages', // human-readable summary - output_data: { // optional structured metadata + output_str: 'Captured 15 console messages', // human-readable summary + output_json: { // optional structured metadata // ... specific to each hook } })); @@ -944,8 +1023,8 @@ print(f'STATUS=succeeded') result = { 'type': 'ArchiveResult', 'status': 'succeeded', - 'output': f'Favicon saved ({size} bytes)', - 'output_data': { + 'output_str': f'Favicon saved ({size} bytes)', + 'output_json': { 'size': size, 'format': 'ico' } @@ -958,7 +1037,7 @@ print(json.dumps(result)) ```bash # After wget completes cat < 50KB"). Summary fields (output_size, output_mimetypes) remain as denormalized cache for performance. 
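+
+**Worked example (illustrative):**
+
+A minimal sketch of a foreground hook following the convention above, written in Python. The file name, byte counts, `cmd`, and version string here are made up for illustration; only the record shapes (`type`, `status`, `output_str`, `output_json`, `cmd`, and the optional `InstalledBinary` side-effect record) come from this plan.
+
+```python
+#!/usr/bin/env python3
+# Illustrative hook sketch: writes one output file, then emits JSONL records on stdout.
+import json
+import sys
+from pathlib import Path
+
+def main() -> int:
+    output_file = Path('example.txt')                       # hypothetical output file
+    output_file.write_text('hello from a toy extractor\n')  # hooks write their files to $PWD
+
+    # 0-1 ArchiveResult records per hook run (the runner enforces this)
+    print(json.dumps({
+        'type': 'ArchiveResult',
+        'status': 'succeeded',                               # or 'failed' / 'skipped'
+        'output_str': f'Wrote {output_file.stat().st_size} bytes',
+        'output_json': {'files': [output_file.name]},        # optional structured metadata
+        'cmd': ['echo', 'hello'],                            # optional; used for binary FK lookup
+    }))
+
+    # Optional side-effect records, handled generically by create_model_record()
+    print(json.dumps({
+        'type': 'InstalledBinary',
+        'name': 'echo',
+        'abspath': '/bin/echo',
+        'version': '9.0',                                    # illustrative version string
+    }))
+    return 0
+
+if __name__ == '__main__':
+    sys.exit(main())
+```
+
+The runner (`run_hook`) parses each printed line and tags it with `plugin`/`plugin_hook`; the caller (`ArchiveResult.run()`) applies the `ArchiveResult` record and passes any other record types to `create_model_record()`.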
diff --git a/archivebox/api/migrations/0003_alter_apitoken_created_by_and_more.py b/archivebox/api/migrations/0003_alter_apitoken_created_by_and_more.py new file mode 100644 index 00000000..ed905a90 --- /dev/null +++ b/archivebox/api/migrations/0003_alter_apitoken_created_by_and_more.py @@ -0,0 +1,27 @@ +# Generated by Django 6.0 on 2025-12-27 01:40 + +import base_models.models +import django.db.models.deletion +from django.conf import settings +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('api', '0002_alter_outboundwebhook_options_and_more'), + migrations.swappable_dependency(settings.AUTH_USER_MODEL), + ] + + operations = [ + migrations.AlterField( + model_name='apitoken', + name='created_by', + field=models.ForeignKey(default=base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL), + ), + migrations.AlterField( + model_name='outboundwebhook', + name='created_by', + field=models.ForeignKey(default=base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL), + ), + ] diff --git a/archivebox/api/models.py b/archivebox/api/models.py index 41614074..85fc7e4d 100644 --- a/archivebox/api/models.py +++ b/archivebox/api/models.py @@ -10,6 +10,8 @@ from django.utils import timezone from django_stubs_ext.db.models import TypedModelMeta from signal_webhooks.models import WebhookBase +from base_models.models import get_or_create_system_user_pk + def generate_secret_token() -> str: return secrets.token_hex(16) @@ -17,7 +19,7 @@ def generate_secret_token() -> str: class APIToken(models.Model): id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True) - created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False) + created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False) created_at = models.DateTimeField(default=timezone.now, db_index=True) modified_at = models.DateTimeField(auto_now=True) token = models.CharField(max_length=32, default=generate_secret_token, unique=True) @@ -40,7 +42,7 @@ class APIToken(models.Model): class OutboundWebhook(WebhookBase): id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True) - created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False) + created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False) created_at = models.DateTimeField(default=timezone.now, db_index=True) modified_at = models.DateTimeField(auto_now=True) diff --git a/archivebox/base_models/models.py b/archivebox/base_models/models.py index dafa428f..bd426c43 100644 --- a/archivebox/base_models/models.py +++ b/archivebox/base_models/models.py @@ -50,7 +50,7 @@ class ModelWithUUID(models.Model): id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True) created_at = models.DateTimeField(default=timezone.now, db_index=True) modified_at = models.DateTimeField(auto_now=True) - created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False, db_index=True) + created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False, db_index=True) class Meta(TypedModelMeta): abstract = True diff --git 
a/archivebox/core/migrations/0027_alter_archiveresult_created_by_and_more.py b/archivebox/core/migrations/0027_alter_archiveresult_created_by_and_more.py new file mode 100644 index 00000000..f38d0f43 --- /dev/null +++ b/archivebox/core/migrations/0027_alter_archiveresult_created_by_and_more.py @@ -0,0 +1,32 @@ +# Generated by Django 6.0 on 2025-12-27 01:40 + +import archivebox.base_models.models +import django.db.models.deletion +from django.conf import settings +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0026_remove_archiveresult_output_dir_and_more'), + migrations.swappable_dependency(settings.AUTH_USER_MODEL), + ] + + operations = [ + migrations.AlterField( + model_name='archiveresult', + name='created_by', + field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, related_name='archiveresult_set', to=settings.AUTH_USER_MODEL), + ), + migrations.AlterField( + model_name='snapshot', + name='created_by', + field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to=settings.AUTH_USER_MODEL), + ), + migrations.AlterField( + model_name='snapshot', + name='tags', + field=models.ManyToManyField(blank=True, related_name='snapshot_set', through='core.SnapshotTag', through_fields=('snapshot', 'tag'), to='core.tag'), + ), + ] diff --git a/archivebox/crawls/migrations/0003_alter_crawl_output_dir.py b/archivebox/crawls/migrations/0003_alter_crawl_output_dir.py new file mode 100644 index 00000000..f4c26aa5 --- /dev/null +++ b/archivebox/crawls/migrations/0003_alter_crawl_output_dir.py @@ -0,0 +1,19 @@ +# Generated by Django 6.0 on 2025-12-27 01:40 + +import pathlib +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('crawls', '0002_drop_seed_model'), + ] + + operations = [ + migrations.AlterField( + model_name='crawl', + name='output_dir', + field=models.FilePathField(blank=True, default='', path=pathlib.PurePosixPath('/private/tmp/test_archivebox_migrations/archive')), + ), + ] diff --git a/archivebox/plugins/forumdl/config.json b/archivebox/plugins/forumdl/config.json new file mode 100644 index 00000000..a9dd9c6a --- /dev/null +++ b/archivebox/plugins/forumdl/config.json @@ -0,0 +1,46 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "additionalProperties": false, + "properties": { + "SAVE_FORUMDL": { + "type": "boolean", + "default": true, + "description": "Enable forum downloading with forum-dl" + }, + "FORUMDL_BINARY": { + "type": "string", + "default": "forum-dl", + "description": "Path to forum-dl binary" + }, + "FORUMDL_TIMEOUT": { + "type": "integer", + "default": 3600, + "minimum": 30, + "x-fallback": "TIMEOUT", + "description": "Timeout for forum downloads in seconds" + }, + "FORUMDL_OUTPUT_FORMAT": { + "type": "string", + "default": "jsonl", + "enum": ["jsonl", "warc", "mbox", "maildir", "mh", "mmdf", "babyl"], + "description": "Output format for forum downloads" + }, + "FORUMDL_TEXTIFY": { + "type": "boolean", + "default": false, + "description": "Convert HTML content to plaintext (keep false to preserve HTML)" + }, + "FORUMDL_CHECK_SSL_VALIDITY": { + "type": "boolean", + "default": true, + "x-fallback": "CHECK_SSL_VALIDITY", + "description": "Whether to verify SSL certificates" + }, + "FORUMDL_EXTRA_ARGS": { + "type": "string", + 
"default": "", + "description": "Extra arguments for forum-dl (space-separated)" + } + } +} diff --git a/archivebox/plugins/forumdl/on_Crawl__00_validate_forumdl.py b/archivebox/plugins/forumdl/on_Crawl__00_validate_forumdl.py new file mode 100755 index 00000000..49acc9d4 --- /dev/null +++ b/archivebox/plugins/forumdl/on_Crawl__00_validate_forumdl.py @@ -0,0 +1,129 @@ +#!/usr/bin/env python3 +""" +Validation hook for forum-dl. + +Runs at crawl start to verify forum-dl binary is available. +Outputs JSONL for InstalledBinary and Machine config updates. +""" + +import os +import sys +import json +import shutil +import hashlib +import subprocess +from pathlib import Path + + +def get_binary_version(abspath: str, version_flag: str = '--version') -> str | None: + """Get version string from binary.""" + try: + result = subprocess.run( + [abspath, version_flag], + capture_output=True, + text=True, + timeout=5, + ) + if result.returncode == 0 and result.stdout: + first_line = result.stdout.strip().split('\n')[0] + return first_line[:64] + except Exception: + pass + return None + + +def get_binary_hash(abspath: str) -> str | None: + """Get SHA256 hash of binary.""" + try: + with open(abspath, 'rb') as f: + return hashlib.sha256(f.read()).hexdigest() + except Exception: + return None + + +def find_forumdl() -> dict | None: + """Find forum-dl binary.""" + try: + from abx_pkg import Binary, PipProvider, EnvProvider + + class ForumdlBinary(Binary): + name: str = 'forum-dl' + binproviders_supported = [PipProvider(), EnvProvider()] + + binary = ForumdlBinary() + loaded = binary.load() + if loaded and loaded.abspath: + return { + 'name': 'forum-dl', + 'abspath': str(loaded.abspath), + 'version': str(loaded.version) if loaded.version else None, + 'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None, + 'binprovider': loaded.binprovider.name if loaded.binprovider else 'env', + } + except ImportError: + pass + except Exception: + pass + + # Fallback to shutil.which + abspath = shutil.which('forum-dl') or os.environ.get('FORUMDL_BINARY', '') + if abspath and Path(abspath).is_file(): + return { + 'name': 'forum-dl', + 'abspath': abspath, + 'version': get_binary_version(abspath), + 'sha256': get_binary_hash(abspath), + 'binprovider': 'env', + } + + return None + + +def main(): + # Check for forum-dl (required) + forumdl_result = find_forumdl() + + missing_deps = [] + + # Emit results for forum-dl + if forumdl_result and forumdl_result.get('abspath'): + print(json.dumps({ + 'type': 'InstalledBinary', + 'name': forumdl_result['name'], + 'abspath': forumdl_result['abspath'], + 'version': forumdl_result['version'], + 'sha256': forumdl_result['sha256'], + 'binprovider': forumdl_result['binprovider'], + })) + + print(json.dumps({ + 'type': 'Machine', + '_method': 'update', + 'key': 'config/FORUMDL_BINARY', + 'value': forumdl_result['abspath'], + })) + + if forumdl_result['version']: + print(json.dumps({ + 'type': 'Machine', + '_method': 'update', + 'key': 'config/FORUMDL_VERSION', + 'value': forumdl_result['version'], + })) + else: + print(json.dumps({ + 'type': 'Dependency', + 'bin_name': 'forum-dl', + 'bin_providers': 'pip,env', + })) + missing_deps.append('forum-dl') + + if missing_deps: + print(f"Missing dependencies: {', '.join(missing_deps)}", file=sys.stderr) + sys.exit(1) + else: + sys.exit(0) + + +if __name__ == '__main__': + main() diff --git a/archivebox/plugins/forumdl/on_Snapshot__53_forumdl.py b/archivebox/plugins/forumdl/on_Snapshot__53_forumdl.py new file mode 100755 index 00000000..89241da5 
--- /dev/null +++ b/archivebox/plugins/forumdl/on_Snapshot__53_forumdl.py @@ -0,0 +1,237 @@ +#!/usr/bin/env python3 +""" +Download forum content from a URL using forum-dl. + +Usage: on_Snapshot__forumdl.py --url= --snapshot-id= +Output: Downloads forum content to $PWD/ + +Environment variables: + FORUMDL_BINARY: Path to forum-dl binary + FORUMDL_TIMEOUT: Timeout in seconds (default: 3600 for large forums) + FORUMDL_OUTPUT_FORMAT: Output format (default: jsonl) + FORUMDL_TEXTIFY: Convert HTML to plaintext (default: False - keeps HTML) + FORUMDL_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: True) + FORUMDL_EXTRA_ARGS: Extra arguments for forum-dl (space-separated) + + # Forum-dl feature toggles + SAVE_FORUMDL: Enable forum-dl forum extraction (default: True) + + # Fallback to ARCHIVING_CONFIG values if FORUMDL_* not set: + TIMEOUT: Fallback timeout + CHECK_SSL_VALIDITY: Fallback SSL check +""" + +import json +import os +import shutil +import subprocess +import sys +from pathlib import Path + +import rich_click as click + + +# Extractor metadata +EXTRACTOR_NAME = 'forumdl' +BIN_NAME = 'forum-dl' +BIN_PROVIDERS = 'pip,env' +OUTPUT_DIR = '.' + + +def get_env(name: str, default: str = '') -> str: + return os.environ.get(name, default).strip() + + +def get_env_bool(name: str, default: bool = False) -> bool: + val = get_env(name, '').lower() + if val in ('true', '1', 'yes', 'on'): + return True + if val in ('false', '0', 'no', 'off'): + return False + return default + + +def get_env_int(name: str, default: int = 0) -> int: + try: + return int(get_env(name, str(default))) + except ValueError: + return default + + +def find_forumdl() -> str | None: + """Find forum-dl binary.""" + forumdl = get_env('FORUMDL_BINARY') + if forumdl and os.path.isfile(forumdl): + return forumdl + + binary = shutil.which('forum-dl') + if binary: + return binary + + return None + + +def get_version(binary: str) -> str: + """Get forum-dl version.""" + try: + result = subprocess.run([binary, '--version'], capture_output=True, text=True, timeout=10) + return result.stdout.strip()[:64] + except Exception: + return '' + + +def save_forum(url: str, binary: str) -> tuple[bool, str | None, str]: + """ + Download forum using forum-dl. 
+ + Returns: (success, output_path, error_message) + """ + # Get config from env + timeout = get_env_int('FORUMDL_TIMEOUT') or get_env_int('TIMEOUT', 3600) + check_ssl = get_env_bool('FORUMDL_CHECK_SSL_VALIDITY', get_env_bool('CHECK_SSL_VALIDITY', True)) + textify = get_env_bool('FORUMDL_TEXTIFY', False) + extra_args = get_env('FORUMDL_EXTRA_ARGS', '') + output_format = get_env('FORUMDL_OUTPUT_FORMAT', 'jsonl') + + # Output directory is current directory (hook already runs in output dir) + output_dir = Path(OUTPUT_DIR) + + # Build output filename based on format + if output_format == 'warc': + output_file = output_dir / 'forum.warc.gz' + elif output_format == 'jsonl': + output_file = output_dir / 'forum.jsonl' + elif output_format == 'maildir': + output_file = output_dir / 'forum' # maildir is a directory + elif output_format in ('mbox', 'mh', 'mmdf', 'babyl'): + output_file = output_dir / f'forum.{output_format}' + else: + output_file = output_dir / f'forum.{output_format}' + + # Build command + cmd = [binary, '-f', output_format, '-o', str(output_file)] + + if textify: + cmd.append('--textify') + + if not check_ssl: + cmd.append('--no-check-certificate') + + if extra_args: + cmd.extend(extra_args.split()) + + cmd.append(url) + + try: + result = subprocess.run(cmd, capture_output=True, timeout=timeout, text=True) + + # Check if output file was created + if output_file.exists() and output_file.stat().st_size > 0: + return True, str(output_file), '' + else: + stderr = result.stderr + + # These are NOT errors - page simply has no downloadable forum content + stderr_lower = stderr.lower() + if 'unsupported url' in stderr_lower: + return True, None, '' # Not a forum site - success, no output + if 'no content' in stderr_lower: + return True, None, '' # No forum found - success, no output + if result.returncode == 0: + return True, None, '' # forum-dl exited cleanly, just no forum - success + + # These ARE errors - something went wrong + if '404' in stderr: + return False, None, '404 Not Found' + if '403' in stderr: + return False, None, '403 Forbidden' + if 'unable to extract' in stderr_lower: + return False, None, 'Unable to extract forum info' + + return False, None, f'forum-dl error: {stderr[:200]}' + + except subprocess.TimeoutExpired: + return False, None, f'Timed out after {timeout} seconds' + except Exception as e: + return False, None, f'{type(e).__name__}: {e}' + + +@click.command() +@click.option('--url', required=True, help='URL to download forum from') +@click.option('--snapshot-id', required=True, help='Snapshot UUID') +def main(url: str, snapshot_id: str): + """Download forum content from a URL using forum-dl.""" + + version = '' + output = None + status = 'failed' + error = '' + binary = None + cmd_str = '' + + try: + # Check if forum-dl is enabled + if not get_env_bool('SAVE_FORUMDL', True): + print('Skipping forum-dl (SAVE_FORUMDL=False)') + status = 'skipped' + print(f'STATUS={status}') + print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": status, "url": url, "snapshot_id": snapshot_id})}') + sys.exit(0) + + # Find binary + binary = find_forumdl() + if not binary: + print(f'ERROR: {BIN_NAME} binary not found', file=sys.stderr) + print(f'DEPENDENCY_NEEDED={BIN_NAME}', file=sys.stderr) + print(f'BIN_PROVIDERS={BIN_PROVIDERS}', file=sys.stderr) + print(f'INSTALL_HINT=pip install forum-dl', file=sys.stderr) + sys.exit(1) + + version = get_version(binary) + cmd_str = f'{binary} {url}' + + # Run extraction + success, output, error = save_forum(url, binary) + 
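+        # save_forum() reports success with output=None when the page has no downloadable
+        # forum content (unsupported URL, no content, or a clean forum-dl exit), so a
+        # 'succeeded' status below does not guarantee that an output file was created.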
status = 'succeeded' if success else 'failed' + + if success: + if output: + output_path = Path(output) + file_size = output_path.stat().st_size + print(f'forum-dl completed: {output_path.name} ({file_size} bytes)') + else: + print(f'forum-dl completed: no forum content found on page (this is normal)') + + except Exception as e: + error = f'{type(e).__name__}: {e}' + status = 'failed' + + # Print results + if cmd_str: + print(f'CMD={cmd_str}') + if version: + print(f'VERSION={version}') + if output: + print(f'OUTPUT={output}') + print(f'STATUS={status}') + + if error: + print(f'ERROR={error}', file=sys.stderr) + + # Print JSON result + result_json = { + 'extractor': EXTRACTOR_NAME, + 'url': url, + 'snapshot_id': snapshot_id, + 'status': status, + 'cmd_version': version, + 'output': output, + 'error': error or None, + } + print(f'RESULT_JSON={json.dumps(result_json)}') + + sys.exit(0 if status == 'succeeded' else 1) + + +if __name__ == '__main__': + main() diff --git a/archivebox/plugins/forumdl/templates/embed.html b/archivebox/plugins/forumdl/templates/embed.html new file mode 100644 index 00000000..936b7562 --- /dev/null +++ b/archivebox/plugins/forumdl/templates/embed.html @@ -0,0 +1,40 @@ + +
+
+ 💬 +

Forum Thread

+
+
+ +
diff --git a/archivebox/plugins/forumdl/templates/fullscreen.html b/archivebox/plugins/forumdl/templates/fullscreen.html new file mode 100644 index 00000000..85413866 --- /dev/null +++ b/archivebox/plugins/forumdl/templates/fullscreen.html @@ -0,0 +1,147 @@ + + + + + + + Forum Thread + + + +
+
💬
+

Forum Thread

+
+
+
Loading posts...
+
+ + + diff --git a/archivebox/plugins/forumdl/templates/icon.html b/archivebox/plugins/forumdl/templates/icon.html new file mode 100644 index 00000000..4c000f72 --- /dev/null +++ b/archivebox/plugins/forumdl/templates/icon.html @@ -0,0 +1 @@ +💬 \ No newline at end of file diff --git a/archivebox/plugins/forumdl/templates/thumbnail.html b/archivebox/plugins/forumdl/templates/thumbnail.html new file mode 100644 index 00000000..24000949 --- /dev/null +++ b/archivebox/plugins/forumdl/templates/thumbnail.html @@ -0,0 +1,7 @@ + +
+
+ 💬 + Forum +
+
diff --git a/archivebox/plugins/forumdl/tests/test_forumdl.py b/archivebox/plugins/forumdl/tests/test_forumdl.py new file mode 100644 index 00000000..aeb0a3f2 --- /dev/null +++ b/archivebox/plugins/forumdl/tests/test_forumdl.py @@ -0,0 +1,157 @@ +""" +Integration tests for forumdl plugin + +Tests verify: +1. Hook script exists +2. Dependencies installed via validation hooks +3. Verify deps with abx-pkg +4. Forum extraction works on forum URLs +5. JSONL output is correct +6. Config options work +7. Handles non-forum URLs gracefully +""" + +import json +import subprocess +import sys +import tempfile +from pathlib import Path +import pytest + +PLUGIN_DIR = Path(__file__).parent.parent +PLUGINS_ROOT = PLUGIN_DIR.parent +FORUMDL_HOOK = PLUGIN_DIR / 'on_Snapshot__53_forumdl.py' +FORUMDL_VALIDATE_HOOK = PLUGIN_DIR / 'on_Crawl__00_validate_forumdl.py' +TEST_URL = 'https://example.com' + +def test_hook_script_exists(): + """Verify on_Snapshot hook exists.""" + assert FORUMDL_HOOK.exists(), f"Hook not found: {FORUMDL_HOOK}" + + +def test_forumdl_validate_hook(): + """Test forum-dl validate hook checks for forum-dl.""" + # Run forum-dl validate hook + result = subprocess.run( + [sys.executable, str(FORUMDL_VALIDATE_HOOK)], + capture_output=True, + text=True, + timeout=30 + ) + + # Hook exits 0 if all binaries found, 1 if any not found + # Parse output for InstalledBinary and Dependency records + found_binary = False + found_dependency = False + + for line in result.stdout.strip().split('\n'): + if line.strip(): + try: + record = json.loads(line) + if record.get('type') == 'InstalledBinary': + if record['name'] == 'forum-dl': + assert record['abspath'], "forum-dl should have abspath" + found_binary = True + elif record.get('type') == 'Dependency': + if record['bin_name'] == 'forum-dl': + found_dependency = True + except json.JSONDecodeError: + pass + + # forum-dl should either be found (InstalledBinary) or missing (Dependency) + assert found_binary or found_dependency, \ + "forum-dl should have either InstalledBinary or Dependency record" + + +def test_verify_deps_with_abx_pkg(): + """Verify forum-dl is available via abx-pkg.""" + from abx_pkg import Binary, PipProvider, EnvProvider, BinProviderOverrides + + missing_binaries = [] + + # Verify forum-dl is available + forumdl_binary = Binary(name='forum-dl', binproviders=[PipProvider(), EnvProvider()]) + forumdl_loaded = forumdl_binary.load() + if not (forumdl_loaded and forumdl_loaded.abspath): + missing_binaries.append('forum-dl') + + if missing_binaries: + pytest.skip(f"Binaries not available: {', '.join(missing_binaries)} - Dependency records should have been emitted") + + +def test_handles_non_forum_url(): + """Test that forum-dl extractor handles non-forum URLs gracefully via hook.""" + # Prerequisites checked by earlier test + + with tempfile.TemporaryDirectory() as tmpdir: + tmpdir = Path(tmpdir) + + # Run forum-dl extraction hook on non-forum URL + result = subprocess.run( + [sys.executable, str(FORUMDL_HOOK), '--url', 'https://example.com', '--snapshot-id', 'test789'], + cwd=tmpdir, + capture_output=True, + text=True, + timeout=60 + ) + + # Should exit 0 even for non-forum URL + assert result.returncode == 0, f"Should handle non-forum URL gracefully: {result.stderr}" + + # Verify JSONL output + assert 'STATUS=' in result.stdout, "Should report status" + assert 'RESULT_JSON=' in result.stdout, "Should output RESULT_JSON" + + # Parse JSONL result + result_json = None + for line in result.stdout.split('\n'): + if line.startswith('RESULT_JSON='): 
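+                # The hook prints a single line of the form RESULT_JSON=<json object>,
+                # so everything after the first '=' is the JSON payload.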
+ result_json = json.loads(line.split('=', 1)[1]) + break + + assert result_json, "Should have RESULT_JSON" + assert result_json['extractor'] == 'forumdl' + + +def test_config_save_forumdl_false_skips(): + """Test that SAVE_FORUMDL=False causes skip.""" + import os + + with tempfile.TemporaryDirectory() as tmpdir: + env = os.environ.copy() + env['SAVE_FORUMDL'] = 'False' + + result = subprocess.run( + [sys.executable, str(FORUMDL_HOOK), '--url', TEST_URL, '--snapshot-id', 'test999'], + cwd=tmpdir, + capture_output=True, + text=True, + env=env, + timeout=30 + ) + + assert result.returncode == 0, f"Should exit 0 when skipping: {result.stderr}" + assert 'STATUS=' in result.stdout + + +def test_config_timeout(): + """Test that FORUMDL_TIMEOUT config is respected.""" + import os + + with tempfile.TemporaryDirectory() as tmpdir: + env = os.environ.copy() + env['FORUMDL_TIMEOUT'] = '5' + + result = subprocess.run( + [sys.executable, str(FORUMDL_HOOK), '--url', 'https://example.com', '--snapshot-id', 'testtimeout'], + cwd=tmpdir, + capture_output=True, + text=True, + env=env, + timeout=30 + ) + + assert result.returncode == 0, "Should complete without hanging" + +if __name__ == '__main__': + pytest.main([__file__, '-v']) diff --git a/archivebox/plugins/gallerydl/config.json b/archivebox/plugins/gallerydl/config.json index a3e08a44..e5f9f018 100644 --- a/archivebox/plugins/gallerydl/config.json +++ b/archivebox/plugins/gallerydl/config.json @@ -3,31 +3,30 @@ "type": "object", "additionalProperties": false, "properties": { - "SAVE_GALLERY_DL": { + "SAVE_GALLERYDL": { "type": "boolean", "default": true, - "x-aliases": ["USE_GALLERY_DL", "FETCH_GALLERY"], "description": "Enable gallery downloading with gallery-dl" }, - "GALLERY_DL_BINARY": { + "GALLERYDL_BINARY": { "type": "string", "default": "gallery-dl", "description": "Path to gallery-dl binary" }, - "GALLERY_DL_TIMEOUT": { + "GALLERYDL_TIMEOUT": { "type": "integer", "default": 3600, "minimum": 30, "x-fallback": "TIMEOUT", "description": "Timeout for gallery downloads in seconds" }, - "GALLERY_DL_CHECK_SSL_VALIDITY": { + "GALLERYDL_CHECK_SSL_VALIDITY": { "type": "boolean", "default": true, "x-fallback": "CHECK_SSL_VALIDITY", "description": "Whether to verify SSL certificates" }, - "GALLERY_DL_ARGS": { + "GALLERYDL_ARGS": { "type": "array", "items": {"type": "string"}, "default": [ @@ -36,7 +35,7 @@ ], "description": "Default gallery-dl arguments" }, - "GALLERY_DL_EXTRA_ARGS": { + "GALLERYDL_EXTRA_ARGS": { "type": "string", "default": "", "description": "Extra arguments for gallery-dl (space-separated)" diff --git a/archivebox/plugins/gallerydl/on_Crawl__00_validate_gallerydl.py b/archivebox/plugins/gallerydl/on_Crawl__00_validate_gallerydl.py index 17d5efee..b7a5309d 100755 --- a/archivebox/plugins/gallerydl/on_Crawl__00_validate_gallerydl.py +++ b/archivebox/plugins/gallerydl/on_Crawl__00_validate_gallerydl.py @@ -66,7 +66,7 @@ def find_gallerydl() -> dict | None: pass # Fallback to shutil.which - abspath = shutil.which('gallery-dl') or os.environ.get('GALLERY_DL_BINARY', '') + abspath = shutil.which('gallery-dl') or os.environ.get('GALLERYDL_BINARY', '') if abspath and Path(abspath).is_file(): return { 'name': 'gallery-dl', @@ -99,7 +99,7 @@ def main(): print(json.dumps({ 'type': 'Machine', '_method': 'update', - 'key': 'config/GALLERY_DL_BINARY', + 'key': 'config/GALLERYDL_BINARY', 'value': gallerydl_result['abspath'], })) @@ -107,7 +107,7 @@ def main(): print(json.dumps({ 'type': 'Machine', '_method': 'update', - 'key': 
'config/GALLERY_DL_VERSION', + 'key': 'config/GALLERYDL_VERSION', 'value': gallerydl_result['version'], })) else: diff --git a/archivebox/plugins/gallerydl/on_Snapshot__52_gallerydl.py b/archivebox/plugins/gallerydl/on_Snapshot__52_gallerydl.py index 5194fa8b..e68cf493 100755 --- a/archivebox/plugins/gallerydl/on_Snapshot__52_gallerydl.py +++ b/archivebox/plugins/gallerydl/on_Snapshot__52_gallerydl.py @@ -6,17 +6,18 @@ Usage: on_Snapshot__gallerydl.py --url= --snapshot-id= Output: Downloads gallery images to $PWD/gallerydl/ Environment variables: - GALLERY_DL_BINARY: Path to gallery-dl binary - GALLERY_DL_TIMEOUT: Timeout in seconds (default: 3600 for large galleries) - GALLERY_DL_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: True) - GALLERY_DL_EXTRA_ARGS: Extra arguments for gallery-dl (space-separated) + GALLERYDL_BINARY: Path to gallery-dl binary + GALLERYDL_TIMEOUT: Timeout in seconds (default: 3600 for large galleries) + GALLERYDL_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: True) + GALLERYDL_EXTRA_ARGS: Extra arguments for gallery-dl (space-separated) + COOKIES_FILE: Path to cookies file for authentication # Gallery-dl feature toggles - USE_GALLERY_DL: Enable gallery-dl gallery extraction (default: True) - SAVE_GALLERY_DL: Alias for USE_GALLERY_DL + USE_GALLERYDL: Enable gallery-dl gallery extraction (default: True) + SAVE_GALLERYDL: Alias for USE_GALLERYDL - # Fallback to ARCHIVING_CONFIG values if GALLERY_DL_* not set: - GALLERY_DL_TIMEOUT: Fallback timeout for gallery downloads + # Fallback to ARCHIVING_CONFIG values if GALLERYDL_* not set: + GALLERYDL_TIMEOUT: Fallback timeout for gallery downloads TIMEOUT: Fallback timeout CHECK_SSL_VALIDITY: Fallback SSL check """ @@ -26,7 +27,6 @@ import os import shutil import subprocess import sys -from datetime import datetime, timezone from pathlib import Path import rich_click as click @@ -76,7 +76,7 @@ def has_media_output() -> bool: def find_gallerydl() -> str | None: """Find gallery-dl binary.""" - gallerydl = get_env('GALLERY_DL_BINARY') + gallerydl = get_env('GALLERYDL_BINARY') if gallerydl and os.path.isfile(gallerydl): return gallerydl @@ -111,24 +111,29 @@ def save_gallery(url: str, binary: str) -> tuple[bool, str | None, str]: Returns: (success, output_path, error_message) """ - # Get config from env (with GALLERY_DL_ prefix or fallback to ARCHIVING_CONFIG style) - timeout = get_env_int('GALLERY_DL_TIMEOUT') or get_env_int('TIMEOUT', 3600) - check_ssl = get_env_bool('GALLERY_DL_CHECK_SSL_VALIDITY', get_env_bool('CHECK_SSL_VALIDITY', True)) - extra_args = get_env('GALLERY_DL_EXTRA_ARGS', '') + # Get config from env (with GALLERYDL_ prefix or fallback to ARCHIVING_CONFIG style) + timeout = get_env_int('GALLERYDL_TIMEOUT') or get_env_int('TIMEOUT', 3600) + check_ssl = get_env_bool('GALLERYDL_CHECK_SSL_VALIDITY', get_env_bool('CHECK_SSL_VALIDITY', True)) + extra_args = get_env('GALLERYDL_EXTRA_ARGS', '') + cookies_file = get_env('COOKIES_FILE', '') # Output directory is current directory (hook already runs in output dir) output_dir = Path(OUTPUT_DIR) # Build command (later options take precedence) + # Use -D for exact directory (flat structure) instead of -d (nested structure) cmd = [ binary, *get_gallerydl_default_args(), - '-d', str(output_dir), + '-D', str(output_dir), ] if not check_ssl: cmd.append('--no-check-certificate') + if cookies_file and Path(cookies_file).exists(): + cmd.extend(['-C', cookies_file]) + if extra_args: cmd.extend(extra_args.split()) @@ -137,7 +142,7 @@ def 
save_gallery(url: str, binary: str) -> tuple[bool, str | None, str]: try: result = subprocess.run(cmd, capture_output=True, timeout=timeout, text=True) - # Check if any gallery files were downloaded + # Check if any gallery files were downloaded (search recursively) gallery_extensions = ( '.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp', '.svg', '.mp4', '.webm', '.mkv', '.avi', '.mov', '.flv', @@ -145,7 +150,7 @@ def save_gallery(url: str, binary: str) -> tuple[bool, str | None, str]: ) downloaded_files = [ - f for f in output_dir.glob('*') + f for f in output_dir.rglob('*') if f.is_file() and f.suffix.lower() in gallery_extensions ] @@ -162,9 +167,10 @@ def save_gallery(url: str, binary: str) -> tuple[bool, str | None, str]: # These are NOT errors - page simply has no downloadable gallery # Return success with no output (legitimate "nothing to download") - if 'unsupported URL' in stderr.lower(): + stderr_lower = stderr.lower() + if 'unsupported url' in stderr_lower: return True, None, '' # Not a gallery site - success, no output - if 'no results' in stderr.lower(): + if 'no results' in stderr_lower: return True, None, '' # No gallery found - success, no output if result.returncode == 0: return True, None, '' # gallery-dl exited cleanly, just no gallery - success @@ -174,7 +180,7 @@ def save_gallery(url: str, binary: str) -> tuple[bool, str | None, str]: return False, None, '404 Not Found' if '403' in stderr: return False, None, '403 Forbidden' - if 'Unable to extract' in stderr: + if 'unable to extract' in stderr_lower: return False, None, 'Unable to extract gallery info' return False, None, f'gallery-dl error: {stderr[:200]}' @@ -191,7 +197,6 @@ def save_gallery(url: str, binary: str) -> tuple[bool, str | None, str]: def main(url: str, snapshot_id: str): """Download image gallery from a URL using gallery-dl.""" - start_ts = datetime.now(timezone.utc) version = '' output = None status = 'failed' @@ -201,12 +206,9 @@ def main(url: str, snapshot_id: str): try: # Check if gallery-dl is enabled - if not (get_env_bool('USE_GALLERY_DL', True) and get_env_bool('SAVE_GALLERY_DL', True)): - print('Skipping gallery-dl (USE_GALLERY_DL=False or SAVE_GALLERY_DL=False)') + if not (get_env_bool('USE_GALLERYDL', True) and get_env_bool('SAVE_GALLERYDL', True)): + print('Skipping gallery-dl (USE_GALLERYDL=False or SAVE_GALLERYDL=False)') status = 'skipped' - end_ts = datetime.now(timezone.utc) - print(f'START_TS={start_ts.isoformat()}') - print(f'END_TS={end_ts.isoformat()}') print(f'STATUS={status}') print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": status, "url": url, "snapshot_id": snapshot_id})}') sys.exit(0) @@ -215,8 +217,6 @@ def main(url: str, snapshot_id: str): if has_staticfile_output(): print(f'Skipping gallery-dl - staticfile extractor already downloaded this') status = 'skipped' - print(f'START_TS={start_ts.isoformat()}') - print(f'END_TS={datetime.now(timezone.utc).isoformat()}') print(f'STATUS={status}') print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": status, "url": url, "snapshot_id": snapshot_id})}') sys.exit(0) @@ -224,8 +224,6 @@ def main(url: str, snapshot_id: str): if has_media_output(): print(f'Skipping gallery-dl - media extractor already downloaded this') status = 'skipped' - print(f'START_TS={start_ts.isoformat()}') - print(f'END_TS={datetime.now(timezone.utc).isoformat()}') print(f'STATUS={status}') print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": status, "url": url, "snapshot_id": snapshot_id})}') sys.exit(0) @@ 
-260,12 +258,6 @@ def main(url: str, snapshot_id: str): status = 'failed' # Print results - end_ts = datetime.now(timezone.utc) - duration = (end_ts - start_ts).total_seconds() - - print(f'START_TS={start_ts.isoformat()}') - print(f'END_TS={end_ts.isoformat()}') - print(f'DURATION={duration:.2f}') if cmd_str: print(f'CMD={cmd_str}') if version: @@ -283,9 +275,6 @@ def main(url: str, snapshot_id: str): 'url': url, 'snapshot_id': snapshot_id, 'status': status, - 'start_ts': start_ts.isoformat(), - 'end_ts': end_ts.isoformat(), - 'duration': round(duration, 2), 'cmd_version': version, 'output': output, 'error': error or None, diff --git a/archivebox/plugins/gallerydl/tests/test_gallerydl.py b/archivebox/plugins/gallerydl/tests/test_gallerydl.py new file mode 100644 index 00000000..00404041 --- /dev/null +++ b/archivebox/plugins/gallerydl/tests/test_gallerydl.py @@ -0,0 +1,157 @@ +""" +Integration tests for gallerydl plugin + +Tests verify: +1. Hook script exists +2. Dependencies installed via validation hooks +3. Verify deps with abx-pkg +4. Gallery extraction works on gallery URLs +5. JSONL output is correct +6. Config options work +7. Handles non-gallery URLs gracefully +""" + +import json +import subprocess +import sys +import tempfile +from pathlib import Path +import pytest + +PLUGIN_DIR = Path(__file__).parent.parent +PLUGINS_ROOT = PLUGIN_DIR.parent +GALLERYDL_HOOK = PLUGIN_DIR / 'on_Snapshot__52_gallerydl.py' +GALLERYDL_VALIDATE_HOOK = PLUGIN_DIR / 'on_Crawl__00_validate_gallerydl.py' +TEST_URL = 'https://example.com' + +def test_hook_script_exists(): + """Verify on_Snapshot hook exists.""" + assert GALLERYDL_HOOK.exists(), f"Hook not found: {GALLERYDL_HOOK}" + + +def test_gallerydl_validate_hook(): + """Test gallery-dl validate hook checks for gallery-dl.""" + # Run gallery-dl validate hook + result = subprocess.run( + [sys.executable, str(GALLERYDL_VALIDATE_HOOK)], + capture_output=True, + text=True, + timeout=30 + ) + + # Hook exits 0 if all binaries found, 1 if any not found + # Parse output for InstalledBinary and Dependency records + found_binary = False + found_dependency = False + + for line in result.stdout.strip().split('\n'): + if line.strip(): + try: + record = json.loads(line) + if record.get('type') == 'InstalledBinary': + if record['name'] == 'gallery-dl': + assert record['abspath'], "gallery-dl should have abspath" + found_binary = True + elif record.get('type') == 'Dependency': + if record['bin_name'] == 'gallery-dl': + found_dependency = True + except json.JSONDecodeError: + pass + + # gallery-dl should either be found (InstalledBinary) or missing (Dependency) + assert found_binary or found_dependency, \ + "gallery-dl should have either InstalledBinary or Dependency record" + + +def test_verify_deps_with_abx_pkg(): + """Verify gallery-dl is available via abx-pkg.""" + from abx_pkg import Binary, PipProvider, EnvProvider, BinProviderOverrides + + missing_binaries = [] + + # Verify gallery-dl is available + gallerydl_binary = Binary(name='gallery-dl', binproviders=[PipProvider(), EnvProvider()]) + gallerydl_loaded = gallerydl_binary.load() + if not (gallerydl_loaded and gallerydl_loaded.abspath): + missing_binaries.append('gallery-dl') + + if missing_binaries: + pytest.skip(f"Binaries not available: {', '.join(missing_binaries)} - Dependency records should have been emitted") + + +def test_handles_non_gallery_url(): + """Test that gallery-dl extractor handles non-gallery URLs gracefully via hook.""" + # Prerequisites checked by earlier test + + with 
tempfile.TemporaryDirectory() as tmpdir: + tmpdir = Path(tmpdir) + + # Run gallery-dl extraction hook on non-gallery URL + result = subprocess.run( + [sys.executable, str(GALLERYDL_HOOK), '--url', 'https://example.com', '--snapshot-id', 'test789'], + cwd=tmpdir, + capture_output=True, + text=True, + timeout=60 + ) + + # Should exit 0 even for non-gallery URL + assert result.returncode == 0, f"Should handle non-gallery URL gracefully: {result.stderr}" + + # Verify JSONL output + assert 'STATUS=' in result.stdout, "Should report status" + assert 'RESULT_JSON=' in result.stdout, "Should output RESULT_JSON" + + # Parse JSONL result + result_json = None + for line in result.stdout.split('\n'): + if line.startswith('RESULT_JSON='): + result_json = json.loads(line.split('=', 1)[1]) + break + + assert result_json, "Should have RESULT_JSON" + assert result_json['extractor'] == 'gallerydl' + + +def test_config_save_gallery_dl_false_skips(): + """Test that SAVE_GALLERYDL=False causes skip.""" + import os + + with tempfile.TemporaryDirectory() as tmpdir: + env = os.environ.copy() + env['SAVE_GALLERYDL'] = 'False' + + result = subprocess.run( + [sys.executable, str(GALLERYDL_HOOK), '--url', TEST_URL, '--snapshot-id', 'test999'], + cwd=tmpdir, + capture_output=True, + text=True, + env=env, + timeout=30 + ) + + assert result.returncode == 0, f"Should exit 0 when skipping: {result.stderr}" + assert 'STATUS=' in result.stdout + + +def test_config_timeout(): + """Test that GALLERY_DL_TIMEOUT config is respected.""" + import os + + with tempfile.TemporaryDirectory() as tmpdir: + env = os.environ.copy() + env['GALLERY_DL_TIMEOUT'] = '5' + + result = subprocess.run( + [sys.executable, str(GALLERYDL_HOOK), '--url', 'https://example.com', '--snapshot-id', 'testtimeout'], + cwd=tmpdir, + capture_output=True, + text=True, + env=env, + timeout=30 + ) + + assert result.returncode == 0, "Should complete without hanging" + +if __name__ == '__main__': + pytest.main([__file__, '-v']) diff --git a/archivebox/plugins/papersdl/config.json b/archivebox/plugins/papersdl/config.json new file mode 100644 index 00000000..e039f184 --- /dev/null +++ b/archivebox/plugins/papersdl/config.json @@ -0,0 +1,29 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "additionalProperties": false, + "properties": { + "SAVE_PAPERSDL": { + "type": "boolean", + "default": true, + "description": "Enable paper downloading with papers-dl" + }, + "PAPERSDL_BINARY": { + "type": "string", + "default": "papers-dl", + "description": "Path to papers-dl binary" + }, + "PAPERSDL_TIMEOUT": { + "type": "integer", + "default": 300, + "minimum": 30, + "x-fallback": "TIMEOUT", + "description": "Timeout for paper downloads in seconds" + }, + "PAPERSDL_EXTRA_ARGS": { + "type": "string", + "default": "", + "description": "Extra arguments for papers-dl (space-separated)" + } + } +} diff --git a/archivebox/plugins/parse_netscape_urls/on_Snapshot__63_parse_netscape_urls.py b/archivebox/plugins/parse_netscape_urls/on_Snapshot__63_parse_netscape_urls.py index a11e9bc9..84a8a51d 100755 --- a/archivebox/plugins/parse_netscape_urls/on_Snapshot__63_parse_netscape_urls.py +++ b/archivebox/plugins/parse_netscape_urls/on_Snapshot__63_parse_netscape_urls.py @@ -24,14 +24,125 @@ import rich_click as click EXTRACTOR_NAME = 'parse_netscape_urls' +# Constants for timestamp epoch detection +UNIX_EPOCH = 0 # 1970-01-01 00:00:00 UTC +MAC_COCOA_EPOCH = 978307200 # 2001-01-01 00:00:00 UTC (Mac/Cocoa/NSDate epoch) + +# Reasonable date range for bookmarks 
(to detect correct epoch/unit)
+MIN_REASONABLE_YEAR = 1995   # Netscape Navigator era
+MAX_REASONABLE_YEAR = 2035   # Far enough in future
+
 # Regex pattern for Netscape bookmark format
 # Example: example title
+# Make ADD_DATE optional and allow negative numbers
 NETSCAPE_PATTERN = re.compile(
-    r']*?tags="([^"]*)")?[^>]*>([^<]+)',
+    r']*?tags="([^"]*)")?[^>]*>([^<]+)',
     re.UNICODE | re.IGNORECASE
 )
+
+def parse_timestamp(timestamp_str: str) -> datetime | None:
+    """
+    Intelligently parse bookmark timestamp with auto-detection of format and epoch.
+
+    Browsers use different timestamp formats:
+    - Firefox: Unix epoch (1970) in seconds (10 digits): 1609459200
+    - Safari: Mac/Cocoa epoch (2001) in seconds (9-10 digits): 631152000
+    - Chrome: Unix epoch in microseconds (16 digits): 1609459200000000
+    - Others: Unix epoch in milliseconds (13 digits): 1609459200000
+
+    Strategy:
+    1. Try parsing with different epoch + unit combinations
+    2. Pick the one that yields a reasonable date (1995-2035)
+    3. Prioritize more common formats (Unix seconds, then Mac seconds, etc.)
+    """
+    if not timestamp_str or timestamp_str == '':
+        return None
+
+    try:
+        timestamp_num = float(timestamp_str)
+    except (ValueError, TypeError):
+        return None
+
+    # Detect sign and work with absolute value
+    is_negative = timestamp_num < 0
+    abs_timestamp = abs(timestamp_num)
+
+    # Determine number of digits to guess the unit
+    if abs_timestamp == 0:
+        num_digits = 1
+    else:
+        num_digits = len(str(int(abs_timestamp)))
+
+    # Try different interpretations in order of likelihood
+    candidates = []
+
+    # Unix epoch seconds (10-11 digits) - Most common: Firefox, Chrome HTML export
+    if 9 <= num_digits <= 11:
+        try:
+            dt = datetime.fromtimestamp(timestamp_num, tz=timezone.utc)
+            if MIN_REASONABLE_YEAR <= dt.year <= MAX_REASONABLE_YEAR:
+                candidates.append((dt, 'unix_seconds', 100))  # Highest priority
+        except (ValueError, OSError, OverflowError):
+            pass
+
+    # Mac/Cocoa epoch seconds (9-10 digits) - Safari
+    # Only consider if Unix seconds didn't work or gave unreasonable date
+    if 8 <= num_digits <= 11:
+        try:
+            dt = datetime.fromtimestamp(timestamp_num + MAC_COCOA_EPOCH, tz=timezone.utc)
+            if MIN_REASONABLE_YEAR <= dt.year <= MAX_REASONABLE_YEAR:
+                candidates.append((dt, 'mac_seconds', 90))
+        except (ValueError, OSError, OverflowError):
+            pass
+
+    # Unix epoch milliseconds (13 digits) - JavaScript exports
+    if 12 <= num_digits <= 14:
+        try:
+            dt = datetime.fromtimestamp(timestamp_num / 1000, tz=timezone.utc)
+            if MIN_REASONABLE_YEAR <= dt.year <= MAX_REASONABLE_YEAR:
+                candidates.append((dt, 'unix_milliseconds', 95))
+        except (ValueError, OSError, OverflowError):
+            pass
+
+    # Mac/Cocoa epoch milliseconds (12-13 digits) - Rare
+    if 11 <= num_digits <= 14:
+        try:
+            dt = datetime.fromtimestamp((timestamp_num / 1000) + MAC_COCOA_EPOCH, tz=timezone.utc)
+            if MIN_REASONABLE_YEAR <= dt.year <= MAX_REASONABLE_YEAR:
+                candidates.append((dt, 'mac_milliseconds', 85))
+        except (ValueError, OSError, OverflowError):
+            pass
+
+    # Unix epoch microseconds (16-17 digits) - Chrome WebKit timestamps
+    if 15 <= num_digits <= 18:
+        try:
+            dt = datetime.fromtimestamp(timestamp_num / 1_000_000, tz=timezone.utc)
+            if MIN_REASONABLE_YEAR <= dt.year <= MAX_REASONABLE_YEAR:
+                candidates.append((dt, 'unix_microseconds', 98))
+        except (ValueError, OSError, OverflowError):
+            pass
+
+    # Mac/Cocoa epoch microseconds (15-16 digits) - Very rare
+    if 14 <= num_digits <= 18:
+        try:
+            dt = datetime.fromtimestamp((timestamp_num / 1_000_000) + MAC_COCOA_EPOCH, tz=timezone.utc)
+            if MIN_REASONABLE_YEAR <= dt.year <= MAX_REASONABLE_YEAR:
+                candidates.append((dt, 'mac_microseconds', 80))
+        except (ValueError, OSError, OverflowError):
+            pass
+
+    # If no candidates found, return None
+    if not candidates:
+        return None
+
+    # Sort by priority (highest first) and return best match
+    candidates.sort(key=lambda x: x[2], reverse=True)
+    best_dt, best_format, _ = candidates[0]
+
+    return best_dt
+
+
 def fetch_content(url: str) -> str:
     """Fetch content from a URL (supports file:// and https://)."""
     parsed = urlparse(url)
@@ -69,6 +180,7 @@ def main(url: str, snapshot_id: str = None):
             match = NETSCAPE_PATTERN.search(line)
             if match:
                 bookmark_url = match.group(1)
+                timestamp_str = match.group(2)
                 tags_str = match.group(3) or ''
                 title = match.group(4).strip()
@@ -86,11 +198,13 @@ def main(url: str, snapshot_id: str = None):
                         tag = tag.strip()
                         if tag:
                             all_tags.add(tag)
-                try:
-                    # Convert unix timestamp to ISO 8601
-                    entry['bookmarked_at'] = datetime.fromtimestamp(float(match.group(2)), tz=timezone.utc).isoformat()
-                except (ValueError, OSError):
-                    pass
+
+                # Parse timestamp with intelligent format detection
+                if timestamp_str:
+                    dt = parse_timestamp(timestamp_str)
+                    if dt:
+                        entry['bookmarked_at'] = dt.isoformat()
+
                 urls_found.append(entry)
 
     if not urls_found:
diff --git a/archivebox/plugins/parse_netscape_urls/tests/test_parse_netscape_urls_comprehensive.py b/archivebox/plugins/parse_netscape_urls/tests/test_parse_netscape_urls_comprehensive.py
new file mode 100644
index 00000000..e481bcae
--- /dev/null
+++ b/archivebox/plugins/parse_netscape_urls/tests/test_parse_netscape_urls_comprehensive.py
@@ -0,0 +1,930 @@
+#!/usr/bin/env python3
+"""Comprehensive tests for parse_netscape_urls extractor covering various browser formats."""
+
+import json
+import subprocess
+import sys
+from datetime import datetime
+from pathlib import Path
+
+import pytest
+
+PLUGIN_DIR = Path(__file__).parent.parent
+SCRIPT_PATH = next(PLUGIN_DIR.glob('on_Snapshot__*_parse_netscape_urls.py'), None)
+
+
+class TestFirefoxFormat:
+    """Test Firefox Netscape bookmark export format."""
+
+    def test_firefox_basic_format(self, tmp_path):
+        """Test standard Firefox export format with Unix timestamps in seconds."""
+        input_file = tmp_path / 'bookmarks.html'
+        input_file.write_text('''
+
+
+Bookmarks
+
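For reference, the candidate ranking above is expected to resolve the sample values from its docstring as follows. This is a standalone sanity-check sketch, not part of the extractor; it recomputes the dates directly, assuming the standard 978307200-second offset between the Unix and Cocoa epochs:

```python
from datetime import datetime, timezone

MAC_COCOA_EPOCH = 978307200  # seconds from 1970-01-01 to 2001-01-01 UTC

samples = [
    ('1609459200',       'Firefox, Unix seconds',     1609459200),
    ('631152000',        'Safari, Mac/Cocoa seconds', 631152000 + MAC_COCOA_EPOCH),
    ('1609459200000',    'JS export, Unix millis',    1609459200000 / 1000),
    ('1609459200000000', 'Chrome, Unix micros',       1609459200000000 / 1_000_000),
]

for raw, source, unix_seconds in samples:
    dt = datetime.fromtimestamp(unix_seconds, tz=timezone.utc)
    print(f'{raw:>18}  ({source:<25}) -> {dt.date()}')  # all four resolve to 2021-01-01
```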

Bookmarks Menu

+

+

Example Site +
Mozilla +

+ ''') + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'], + cwd=tmp_path, + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + output_file = tmp_path / 'urls.jsonl' + lines = output_file.read_text().strip().split('\n') + entries = [json.loads(line) for line in lines] + + assert len(entries) == 2 + assert entries[0]['url'] == 'https://example.com' + assert entries[0]['title'] == 'Example Site' + # Timestamp should be parsed as seconds (Jan 1, 2021) + assert '2021-01-01' in entries[0]['bookmarked_at'] + # Second bookmark (Jan 1, 2022) + assert '2022-01-01' in entries[1]['bookmarked_at'] + + def test_firefox_with_tags(self, tmp_path): + """Test Firefox bookmarks with tags.""" + input_file = tmp_path / 'bookmarks.html' + input_file.write_text(''' +

+

Python Tutorial +
Rust Lang +

+ ''') + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'], + cwd=tmp_path, + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + output_file = tmp_path / 'urls.jsonl' + lines = output_file.read_text().strip().split('\n') + + # Should have Tag records + Snapshot records + tags = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Tag'] + snapshots = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Snapshot'] + + tag_names = {t['name'] for t in tags} + assert 'coding' in tag_names + assert 'tutorial' in tag_names + assert 'python' in tag_names + assert 'rust' in tag_names + + assert snapshots[0]['tags'] == 'coding,tutorial,python' + assert snapshots[1]['tags'] == 'coding,rust' + + def test_firefox_nested_folders(self, tmp_path): + """Test Firefox bookmark folders and nested structure.""" + input_file = tmp_path / 'bookmarks.html' + input_file.write_text(''' +

+

Toolbar

+

+

GitHub +

Development

+

+

Stack Overflow +
MDN +

+

+

Hacker News +

+ ''') + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'], + cwd=tmp_path, + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + output_file = tmp_path / 'urls.jsonl' + lines = output_file.read_text().strip().split('\n') + entries = [json.loads(line) for line in lines] + urls = {e['url'] for e in entries} + + assert 'https://github.com' in urls + assert 'https://stackoverflow.com' in urls + assert 'https://developer.mozilla.org' in urls + assert 'https://news.ycombinator.com' in urls + assert len(entries) == 4 + + def test_firefox_icon_and_icon_uri(self, tmp_path): + """Test Firefox bookmarks with ICON and ICON_URI attributes.""" + input_file = tmp_path / 'bookmarks.html' + input_file.write_text(''' +

+

Example +
GitHub +

+ ''') + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'], + cwd=tmp_path, + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + output_file = tmp_path / 'urls.jsonl' + lines = output_file.read_text().strip().split('\n') + entries = [json.loads(line) for line in lines] + + assert entries[0]['url'] == 'https://example.com' + assert entries[1]['url'] == 'https://github.com' + + +class TestChromeFormat: + """Test Chrome/Chromium Netscape bookmark export format.""" + + def test_chrome_microsecond_timestamps(self, tmp_path): + """Test Chrome format with microsecond timestamps (16-17 digits).""" + input_file = tmp_path / 'bookmarks.html' + # Chrome uses WebKit/Chrome timestamps which are microseconds + # 1609459200000000 = Jan 1, 2021 00:00:00 in microseconds + input_file.write_text(''' + +Bookmarks +

Bookmarks

+

+

Google +
Chrome +

+ ''') + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'], + cwd=tmp_path, + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + output_file = tmp_path / 'urls.jsonl' + lines = output_file.read_text().strip().split('\n') + entries = [json.loads(line) for line in lines] + + # Should correctly parse microsecond timestamps + # Currently will fail - we'll fix the parser after writing tests + assert entries[0]['url'] == 'https://google.com' + # Timestamp should be around Jan 1, 2021, not year 52970! + if 'bookmarked_at' in entries[0]: + year = datetime.fromisoformat(entries[0]['bookmarked_at']).year + # Should be 2021, not some far future date + assert 2020 <= year <= 2025, f"Year should be ~2021, got {year}" + + def test_chrome_with_folders(self, tmp_path): + """Test Chrome bookmark folder structure.""" + input_file = tmp_path / 'bookmarks.html' + input_file.write_text(''' +

+

Bookmarks bar

+

+

Google +

+

Other bookmarks

+

+

Example +

+

+ ''') + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'], + cwd=tmp_path, + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + output_file = tmp_path / 'urls.jsonl' + lines = output_file.read_text().strip().split('\n') + entries = [json.loads(line) for line in lines] + urls = {e['url'] for e in entries} + + assert 'https://google.com' in urls + assert 'https://example.com' in urls + + +class TestSafariFormat: + """Test Safari Netscape bookmark export format.""" + + def test_safari_basic_format(self, tmp_path): + """Test Safari export format.""" + input_file = tmp_path / 'bookmarks.html' + input_file.write_text(''' + +Bookmarks +

Bookmarks

+

+

BookmarksBar

+

+

Apple +
WebKit +

+

+ ''') + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'], + cwd=tmp_path, + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + output_file = tmp_path / 'urls.jsonl' + lines = output_file.read_text().strip().split('\n') + entries = [json.loads(line) for line in lines] + urls = {e['url'] for e in entries} + + assert 'https://apple.com' in urls + assert 'https://webkit.org' in urls + + def test_safari_reading_list(self, tmp_path): + """Test Safari Reading List entries.""" + input_file = tmp_path / 'bookmarks.html' + input_file.write_text(''' +

+

com.apple.ReadingList

+

+

Article 1 +
Long article to read later +
Article 2 +
Another saved article +

+

+ ''') + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'], + cwd=tmp_path, + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + output_file = tmp_path / 'urls.jsonl' + lines = output_file.read_text().strip().split('\n') + entries = [json.loads(line) for line in lines] + urls = {e['url'] for e in entries} + + assert 'https://article1.com' in urls + assert 'https://article2.com' in urls + + +class TestEdgeFormat: + """Test Edge/IE bookmark export formats.""" + + def test_edge_chromium_format(self, tmp_path): + """Test Edge (Chromium-based) format.""" + input_file = tmp_path / 'bookmarks.html' + input_file.write_text(''' + +Bookmarks +

Bookmarks

+

+

Microsoft +
Bing +

+ ''') + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'], + cwd=tmp_path, + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + output_file = tmp_path / 'urls.jsonl' + lines = output_file.read_text().strip().split('\n') + entries = [json.loads(line) for line in lines] + urls = {e['url'] for e in entries} + + assert 'https://microsoft.com' in urls + assert 'https://bing.com' in urls + + +class TestTimestampFormats: + """Test various timestamp format handling and edge cases.""" + + def test_unix_seconds_timestamp(self, tmp_path): + """Test Unix epoch timestamp in seconds (10-11 digits) - Firefox, Chrome HTML export.""" + input_file = tmp_path / 'bookmarks.html' + # 1609459200 = Jan 1, 2021 00:00:00 UTC (Unix epoch) + input_file.write_text(''' +

Test + ''') + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'], + cwd=tmp_path, + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + output_file = tmp_path / 'urls.jsonl' + entry = json.loads(output_file.read_text().strip()) + + dt = datetime.fromisoformat(entry['bookmarked_at']) + assert dt.year == 2021 + assert dt.month == 1 + assert dt.day == 1 + + def test_mac_cocoa_seconds_timestamp(self, tmp_path): + """Test Mac/Cocoa epoch timestamp in seconds - Safari uses epoch of 2001-01-01.""" + input_file = tmp_path / 'bookmarks.html' + # Safari uses Mac absolute time: seconds since 2001-01-01 00:00:00 UTC + # 631152000 seconds after 2001-01-01 = Jan 1, 2021 + # 631152000 as Unix would be Feb 1990 (too old for a recent bookmark) + input_file.write_text(''' +
Safari Bookmark + ''') + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'], + cwd=tmp_path, + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + output_file = tmp_path / 'urls.jsonl' + entry = json.loads(output_file.read_text().strip()) + + dt = datetime.fromisoformat(entry['bookmarked_at']) + # Should detect Mac epoch and convert correctly to 2021 + assert 2020 <= dt.year <= 2022, f"Expected ~2021, got {dt.year}" + + def test_safari_recent_timestamp(self, tmp_path): + """Test recent Safari timestamp (Mac epoch).""" + input_file = tmp_path / 'bookmarks.html' + # 725846400 seconds after 2001-01-01 = Jan 1, 2024 + input_file.write_text(''' +
Recent Safari + ''') + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'], + cwd=tmp_path, + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + output_file = tmp_path / 'urls.jsonl' + entry = json.loads(output_file.read_text().strip()) + + dt = datetime.fromisoformat(entry['bookmarked_at']) + # Should detect Mac epoch and convert to 2024 + assert 2023 <= dt.year <= 2025, f"Expected ~2024, got {dt.year}" + + def test_unix_milliseconds_timestamp(self, tmp_path): + """Test Unix epoch timestamp in milliseconds (13 digits) - Some JavaScript exports.""" + input_file = tmp_path / 'bookmarks.html' + # 1609459200000 = Jan 1, 2021 00:00:00 UTC in milliseconds + input_file.write_text(''' +
Test + ''') + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'], + cwd=tmp_path, + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + output_file = tmp_path / 'urls.jsonl' + entry = json.loads(output_file.read_text().strip()) + + dt = datetime.fromisoformat(entry['bookmarked_at']) + assert dt.year == 2021 + assert dt.month == 1 + assert dt.day == 1 + + def test_chrome_webkit_microseconds_timestamp(self, tmp_path): + """Test Chrome WebKit timestamp in microseconds (16-17 digits) - Chrome internal format.""" + input_file = tmp_path / 'bookmarks.html' + # 1609459200000000 = Jan 1, 2021 00:00:00 UTC in microseconds (Unix epoch) + # Chrome sometimes exports with microsecond precision + input_file.write_text(''' +
Test + ''') + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'], + cwd=tmp_path, + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + output_file = tmp_path / 'urls.jsonl' + entry = json.loads(output_file.read_text().strip()) + + dt = datetime.fromisoformat(entry['bookmarked_at']) + assert dt.year == 2021 + assert dt.month == 1 + assert dt.day == 1 + + def test_mac_cocoa_milliseconds_timestamp(self, tmp_path): + """Test Mac/Cocoa epoch in milliseconds (rare but possible).""" + input_file = tmp_path / 'bookmarks.html' + # 631152000000 milliseconds after 2001-01-01 = Jan 1, 2021 + input_file.write_text(''' +
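The "year 52970" mentioned in the Chrome test comment above is what falls out if a 16-digit Chrome value is misread as milliseconds instead of microseconds. A rough illustration (hypothetical snippet, not used by the plugin):

```python
from datetime import datetime, timezone

raw = 1609459200000000  # 16 digits: Chrome-style microseconds for 2021-01-01

try:
    datetime.fromtimestamp(raw / 1000, tz=timezone.utc)        # misread as milliseconds
except (ValueError, OverflowError, OSError) as exc:
    print('as milliseconds:', exc)                             # lands around year 52970, rejected

print('as microseconds:', datetime.fromtimestamp(raw / 1_000_000, tz=timezone.utc).date())  # 2021-01-01
```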
Safari Milliseconds + ''') + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'], + cwd=tmp_path, + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + output_file = tmp_path / 'urls.jsonl' + entry = json.loads(output_file.read_text().strip()) + + dt = datetime.fromisoformat(entry['bookmarked_at']) + # Should detect Mac epoch with milliseconds and convert to 2021 + assert 2020 <= dt.year <= 2022, f"Expected ~2021, got {dt.year}" + + def test_ambiguous_timestamp_detection(self, tmp_path): + """Test that ambiguous timestamps are resolved to reasonable dates.""" + input_file = tmp_path / 'bookmarks.html' + # Test multiple bookmarks with different timestamp formats mixed together + # Parser should handle each correctly + input_file.write_text(''' +
Unix Seconds 2021 +
Mac Seconds 2021 +
Unix MS 2024 + ''') + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'], + cwd=tmp_path, + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + output_file = tmp_path / 'urls.jsonl' + lines = output_file.read_text().strip().split('\n') + entries = [json.loads(line) for line in lines] + + # All should be parsed to reasonable dates (2020-2025) + for entry in entries: + dt = datetime.fromisoformat(entry['bookmarked_at']) + assert 2020 <= dt.year <= 2025, f"Date {dt.year} out of reasonable range for {entry['url']}" + + def test_very_old_timestamp(self, tmp_path): + """Test very old timestamp (1990s).""" + input_file = tmp_path / 'bookmarks.html' + # 820454400 = Jan 1, 1996 + input_file.write_text(''' +
Old Bookmark + ''') + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'], + cwd=tmp_path, + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + output_file = tmp_path / 'urls.jsonl' + entry = json.loads(output_file.read_text().strip()) + + dt = datetime.fromisoformat(entry['bookmarked_at']) + assert dt.year == 1996 + + def test_recent_timestamp(self, tmp_path): + """Test recent timestamp (2024).""" + input_file = tmp_path / 'bookmarks.html' + # 1704067200 = Jan 1, 2024 + input_file.write_text(''' +
Recent + ''') + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'], + cwd=tmp_path, + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + output_file = tmp_path / 'urls.jsonl' + entry = json.loads(output_file.read_text().strip()) + + dt = datetime.fromisoformat(entry['bookmarked_at']) + assert dt.year == 2024 + + def test_invalid_timestamp(self, tmp_path): + """Test invalid/malformed timestamp - should extract URL but skip timestamp.""" + input_file = tmp_path / 'bookmarks.html' + input_file.write_text(''' +
Test + ''') + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'], + cwd=tmp_path, + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + output_file = tmp_path / 'urls.jsonl' + entry = json.loads(output_file.read_text().strip()) + + # Should still extract URL but skip timestamp + assert entry['url'] == 'https://example.com' + assert 'bookmarked_at' not in entry + + def test_zero_timestamp(self, tmp_path): + """Test timestamp of 0 (Unix epoch) - too old, should be skipped.""" + input_file = tmp_path / 'bookmarks.html' + input_file.write_text(''' +
Test + ''') + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'], + cwd=tmp_path, + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + output_file = tmp_path / 'urls.jsonl' + entry = json.loads(output_file.read_text().strip()) + + # Timestamp 0 = 1970, which is before MIN_REASONABLE_YEAR (1995) + # Parser should skip it as unreasonable + assert entry['url'] == 'https://example.com' + # Timestamp should be omitted (outside reasonable range) + assert 'bookmarked_at' not in entry + + def test_negative_timestamp(self, tmp_path): + """Test negative timestamp (before Unix epoch) - should handle gracefully.""" + input_file = tmp_path / 'bookmarks.html' + # -86400 = 1 day before Unix epoch = Dec 31, 1969 + input_file.write_text(''' +
Before Unix Epoch + ''') + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'], + cwd=tmp_path, + capture_output=True, + text=True, + ) + + # Should handle gracefully (extracts URL, may or may not include timestamp) + assert result.returncode == 0 + output_file = tmp_path / 'urls.jsonl' + entry = json.loads(output_file.read_text().strip()) + assert entry['url'] == 'https://example.com' + # If timestamp is included, should be reasonable (1969) + if 'bookmarked_at' in entry: + dt = datetime.fromisoformat(entry['bookmarked_at']) + # Should be near Unix epoch (late 1969) + assert 1969 <= dt.year <= 1970 + + +class TestBookmarkAttributes: + """Test various bookmark attributes and metadata.""" + + def test_private_attribute(self, tmp_path): + """Test bookmarks with PRIVATE attribute.""" + input_file = tmp_path / 'bookmarks.html' + input_file.write_text(''' +
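The zero- and negative-timestamp cases above hinge on the MIN_REASONABLE_YEAR/MAX_REASONABLE_YEAR guard in parse_timestamp. A minimal sketch of that check in isolation (negative timestamps can raise OSError on some platforms, hence the broad except):

```python
from datetime import datetime, timezone

MIN_REASONABLE_YEAR, MAX_REASONABLE_YEAR = 1995, 2035

for raw in ('0', '-86400', '820454400', '1704067200'):
    try:
        dt = datetime.fromtimestamp(float(raw), tz=timezone.utc)
    except (ValueError, OSError, OverflowError):
        print(f'{raw:>12} -> unparseable, skipped')
        continue
    accepted = MIN_REASONABLE_YEAR <= dt.year <= MAX_REASONABLE_YEAR
    print(f'{raw:>12} -> {dt.date()}  accepted={accepted}')

# 0 (1970) and -86400 (1969) fall outside 1995-2035 and are dropped;
# 820454400 (Jan 1996) and 1704067200 (Jan 2024) are kept.
```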
Private +
Public + ''') + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'], + cwd=tmp_path, + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + output_file = tmp_path / 'urls.jsonl' + lines = output_file.read_text().strip().split('\n') + entries = [json.loads(line) for line in lines] + + # Both should be extracted + assert len(entries) == 2 + + def test_shortcuturl_attribute(self, tmp_path): + """Test bookmarks with SHORTCUTURL keyword attribute.""" + input_file = tmp_path / 'bookmarks.html' + input_file.write_text(''' +
Google Search + ''') + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'], + cwd=tmp_path, + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + output_file = tmp_path / 'urls.jsonl' + entry = json.loads(output_file.read_text().strip()) + + assert 'google.com' in entry['url'] + + def test_post_data_attribute(self, tmp_path): + """Test bookmarks with POST_DATA attribute.""" + input_file = tmp_path / 'bookmarks.html' + input_file.write_text(''' +
Login + ''') + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'], + cwd=tmp_path, + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + output_file = tmp_path / 'urls.jsonl' + entry = json.loads(output_file.read_text().strip()) + + assert entry['url'] == 'https://example.com/login' + + +class TestEdgeCases: + """Test edge cases and malformed data.""" + + def test_multiline_bookmark(self, tmp_path): + """Test bookmark spanning multiple lines.""" + input_file = tmp_path / 'bookmarks.html' + input_file.write_text(''' +
+ Multi-line Bookmark + + ''') + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'], + cwd=tmp_path, + capture_output=True, + text=True, + ) + + # Current regex works line-by-line, so this might not match + # Document current behavior + if result.returncode == 0: + output_file = tmp_path / 'urls.jsonl' + if output_file.exists(): + content = output_file.read_text().strip() + if content: + entry = json.loads(content) + assert 'example.com' in entry['url'] + + def test_missing_add_date(self, tmp_path): + """Test bookmark without ADD_DATE attribute - should still extract URL.""" + input_file = tmp_path / 'bookmarks.html' + input_file.write_text(''' +
No Date + ''') + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'], + cwd=tmp_path, + capture_output=True, + text=True, + ) + + # Should succeed and extract URL without timestamp + assert result.returncode == 0 + output_file = tmp_path / 'urls.jsonl' + entry = json.loads(output_file.read_text().strip()) + assert entry['url'] == 'https://example.com' + assert entry['title'] == 'No Date' + assert 'bookmarked_at' not in entry + + def test_empty_title(self, tmp_path): + """Test bookmark with empty title.""" + input_file = tmp_path / 'bookmarks.html' + input_file.write_text(''' +
+ ''') + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'], + cwd=tmp_path, + capture_output=True, + text=True, + ) + + # Current regex requires non-empty title [^<]+ + # Document current behavior + assert result.returncode == 1 + + def test_special_chars_in_url(self, tmp_path): + """Test URLs with special characters.""" + input_file = tmp_path / 'bookmarks.html' + input_file.write_text(''' +
Special URL +
Encoded Spaces +
Unicode Path + ''') + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'], + cwd=tmp_path, + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + output_file = tmp_path / 'urls.jsonl' + lines = output_file.read_text().strip().split('\n') + entries = [json.loads(line) for line in lines] + + assert len(entries) == 3 + assert 'q=test&foo=bar' in entries[0]['url'] + assert '%20' in entries[1]['url'] + + def test_javascript_url(self, tmp_path): + """Test javascript: URLs (should still be extracted).""" + input_file = tmp_path / 'bookmarks.html' + input_file.write_text(''' +
JS Bookmarklet +
Normal + ''') + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'], + cwd=tmp_path, + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + output_file = tmp_path / 'urls.jsonl' + lines = output_file.read_text().strip().split('\n') + entries = [json.loads(line) for line in lines] + + # Both should be extracted + assert len(entries) == 2 + assert entries[0]['url'].startswith('javascript:') + + def test_data_url(self, tmp_path): + """Test data: URLs.""" + input_file = tmp_path / 'bookmarks.html' + input_file.write_text(''' +
Data URL + ''') + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'], + cwd=tmp_path, + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + output_file = tmp_path / 'urls.jsonl' + entry = json.loads(output_file.read_text().strip()) + + assert entry['url'].startswith('data:') + + def test_file_url(self, tmp_path): + """Test file:// URLs.""" + input_file = tmp_path / 'bookmarks.html' + input_file.write_text(''' +
Local File + ''') + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'], + cwd=tmp_path, + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + output_file = tmp_path / 'urls.jsonl' + entry = json.loads(output_file.read_text().strip()) + + assert entry['url'].startswith('file://') + + def test_very_long_url(self, tmp_path): + """Test very long URLs (2000+ characters).""" + long_url = 'https://example.com/path?' + '&'.join([f'param{i}=value{i}' for i in range(100)]) + input_file = tmp_path / 'bookmarks.html' + input_file.write_text(f''' +
Long URL + ''') + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'], + cwd=tmp_path, + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + output_file = tmp_path / 'urls.jsonl' + entry = json.loads(output_file.read_text().strip()) + + assert len(entry['url']) > 1000 + assert entry['url'].startswith('https://example.com') + + def test_unicode_in_title(self, tmp_path): + """Test Unicode characters in titles.""" + input_file = tmp_path / 'bookmarks.html' + input_file.write_text(''' +
日本語のタイトル +
Título en Español +
Заголовок на русском +
عنوان بالعربية +
Emoji 🚀 📚 🎉 + ''', encoding='utf-8') + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'], + cwd=tmp_path, + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + output_file = tmp_path / 'urls.jsonl' + lines = output_file.read_text(encoding='utf-8').strip().split('\n') + entries = [json.loads(line) for line in lines] + + assert len(entries) == 5 + assert any('日本語' in e.get('title', '') for e in entries) + assert any('Español' in e.get('title', '') for e in entries) + + def test_large_file_many_bookmarks(self, tmp_path): + """Test parsing large file with many bookmarks (1000+).""" + bookmarks = [] + for i in range(1000): + bookmarks.append( + f'
Bookmark {i}' + ) + + input_file = tmp_path / 'bookmarks.html' + input_file.write_text( + '\n

\n' + + '\n'.join(bookmarks) + + '\n

' + ) + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'], + cwd=tmp_path, + capture_output=True, + text=True, + timeout=30, + ) + + assert result.returncode == 0 + assert 'Found 1000 URLs' in result.stdout + + output_file = tmp_path / 'urls.jsonl' + lines = output_file.read_text().strip().split('\n') + + # Should have 10 unique tags + 1000 snapshots + tags = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Tag'] + snapshots = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Snapshot'] + + assert len(tags) == 10 + assert len(snapshots) == 1000 + + +if __name__ == '__main__': + pytest.main([__file__, '-v']) diff --git a/archivebox/plugins/parse_rss_urls/tests/test_parse_rss_urls_comprehensive.py b/archivebox/plugins/parse_rss_urls/tests/test_parse_rss_urls_comprehensive.py new file mode 100644 index 00000000..7d4f181d --- /dev/null +++ b/archivebox/plugins/parse_rss_urls/tests/test_parse_rss_urls_comprehensive.py @@ -0,0 +1,987 @@ +#!/usr/bin/env python3 +"""Comprehensive tests for parse_rss_urls extractor covering various RSS/Atom variants.""" + +import json +import subprocess +import sys +from pathlib import Path + +import pytest + +PLUGIN_DIR = Path(__file__).parent.parent +SCRIPT_PATH = next(PLUGIN_DIR.glob('on_Snapshot__*_parse_rss_urls.py'), None) + + +class TestRssVariants: + """Test various RSS format variants.""" + + def test_rss_091(self, tmp_path): + """Test RSS 0.91 format (oldest RSS version).""" + input_file = tmp_path / 'feed.rss' + input_file.write_text(''' + + + RSS 0.91 Feed + https://example.com + Test RSS 0.91 + + RSS 0.91 Article + https://example.com/article1 + An article in RSS 0.91 format + + + + ''') + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'], + cwd=tmp_path, + capture_output=True, + text=True, + ) + + assert result.returncode == 0, f"Failed: {result.stderr}" + output_file = tmp_path / 'urls.jsonl' + lines = output_file.read_text().strip().split('\n') + entry = json.loads(lines[0]) + + assert entry['url'] == 'https://example.com/article1' + assert entry['title'] == 'RSS 0.91 Article' + assert entry['via_extractor'] == 'parse_rss_urls' + + def test_rss_10_rdf(self, tmp_path): + """Test RSS 1.0 (RDF) format.""" + input_file = tmp_path / 'feed.rdf' + input_file.write_text(''' + + + RSS 1.0 Feed + https://example.com + + + RDF Item 1 + https://example.com/rdf1 + 2024-01-15T10:30:00Z + Technology + + + RDF Item 2 + https://example.com/rdf2 + 2024-01-16T14:20:00Z + + + ''') + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'], + cwd=tmp_path, + capture_output=True, + text=True, + ) + + assert result.returncode == 0, f"Failed: {result.stderr}" + output_file = tmp_path / 'urls.jsonl' + lines = output_file.read_text().strip().split('\n') + entries = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Snapshot'] + + urls = {e['url'] for e in entries} + assert 'https://example.com/rdf1' in urls + assert 'https://example.com/rdf2' in urls + assert any(e.get('bookmarked_at') for e in entries) + + def test_rss_20_with_full_metadata(self, tmp_path): + """Test RSS 2.0 with all standard metadata fields.""" + input_file = tmp_path / 'feed.rss' + input_file.write_text(''' + + + Full RSS 2.0 + https://example.com + Complete RSS 2.0 feed + + Complete Article + https://example.com/complete + Full description here + author@example.com + Technology + Programming + 
https://example.com/complete + Mon, 15 Jan 2024 10:30:00 GMT + + + + ''') + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'], + cwd=tmp_path, + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + output_file = tmp_path / 'urls.jsonl' + content = output_file.read_text().strip() + lines = content.split('\n') + + # Check for Tag records + tags = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Tag'] + tag_names = {t['name'] for t in tags} + assert 'Technology' in tag_names + assert 'Programming' in tag_names + + # Check Snapshot record + snapshots = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Snapshot'] + entry = snapshots[0] + assert entry['url'] == 'https://example.com/complete' + assert entry['title'] == 'Complete Article' + assert 'bookmarked_at' in entry + assert entry['tags'] == 'Technology,Programming' or entry['tags'] == 'Programming,Technology' + + +class TestAtomVariants: + """Test various Atom format variants.""" + + def test_atom_10_full(self, tmp_path): + """Test Atom 1.0 with full metadata.""" + input_file = tmp_path / 'feed.atom' + input_file.write_text(''' + + Atom 1.0 Feed + 2024-01-15T00:00:00Z + + Atom Entry 1 + + urn:uuid:1234-5678 + 2024-01-15T10:30:00Z + 2024-01-14T08:00:00Z + + + + + ''') + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'], + cwd=tmp_path, + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + output_file = tmp_path / 'urls.jsonl' + lines = output_file.read_text().strip().split('\n') + + tags = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Tag'] + tag_names = {t['name'] for t in tags} + assert 'science' in tag_names + assert 'research' in tag_names + + snapshots = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Snapshot'] + entry = snapshots[0] + assert entry['url'] == 'https://atom.example.com/1' + assert 'bookmarked_at' in entry + + def test_atom_with_alternate_link(self, tmp_path): + """Test Atom feed with alternate link types.""" + input_file = tmp_path / 'feed.atom' + input_file.write_text(''' + + Atom Alternate Links + + Entry with alternate + + + 2024-01-15T10:30:00Z + + + ''') + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'], + cwd=tmp_path, + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + output_file = tmp_path / 'urls.jsonl' + entry = json.loads(output_file.read_text().strip()) + # feedparser should pick the alternate link + assert 'atom.example.com/article' in entry['url'] + + +class TestDateFormats: + """Test various date format handling.""" + + def test_rfc822_date(self, tmp_path): + """Test RFC 822 date format (RSS 2.0 standard).""" + input_file = tmp_path / 'feed.rss' + input_file.write_text(''' + + + + RFC 822 Date + https://example.com/rfc822 + Wed, 15 Jan 2020 10:30:45 GMT + + + + ''') + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'], + cwd=tmp_path, + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + output_file = tmp_path / 'urls.jsonl' + entry = json.loads(output_file.read_text().strip()) + assert 'bookmarked_at' in entry + assert '2020-01-15' in entry['bookmarked_at'] + + def test_iso8601_date(self, tmp_path): + """Test ISO 8601 date format (Atom standard).""" + input_file = tmp_path / 'feed.atom' + input_file.write_text(''' + + + ISO 8601 Date + + 
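The urls.jsonl that the RSS full-metadata assertions above (and the earlier Netscape tag tests) read back looks roughly like the sketch below, limited to the fields the tests actually check; the real extractors may emit additional fields:

```python
import json

records = [
    {'type': 'Tag', 'name': 'Technology'},
    {'type': 'Tag', 'name': 'Programming'},
    {
        'type': 'Snapshot',
        'url': 'https://example.com/complete',
        'title': 'Complete Article',
        'tags': 'Technology,Programming',
        'bookmarked_at': '2024-01-15T10:30:00+00:00',
        'via_extractor': 'parse_rss_urls',
    },
]

print('\n'.join(json.dumps(record) for record in records))  # one JSON object per line -> urls.jsonl
```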
2024-01-15T10:30:45.123Z + + + ''') + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'], + cwd=tmp_path, + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + output_file = tmp_path / 'urls.jsonl' + entry = json.loads(output_file.read_text().strip()) + assert 'bookmarked_at' in entry + assert '2024-01-15' in entry['bookmarked_at'] + + def test_updated_vs_published_date(self, tmp_path): + """Test that published date is preferred over updated date.""" + input_file = tmp_path / 'feed.atom' + input_file.write_text(''' + + + Date Priority Test + + 2024-01-10T10:00:00Z + 2024-01-15T10:00:00Z + + + ''') + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'], + cwd=tmp_path, + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + output_file = tmp_path / 'urls.jsonl' + entry = json.loads(output_file.read_text().strip()) + # Should use published date (Jan 10) not updated date (Jan 15) + assert '2024-01-10' in entry['bookmarked_at'] + + def test_only_updated_date(self, tmp_path): + """Test fallback to updated date when published is missing.""" + input_file = tmp_path / 'feed.atom' + input_file.write_text(''' + + + Only Updated + + 2024-01-20T10:00:00Z + + + ''') + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'], + cwd=tmp_path, + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + output_file = tmp_path / 'urls.jsonl' + entry = json.loads(output_file.read_text().strip()) + assert '2024-01-20' in entry['bookmarked_at'] + + def test_no_date(self, tmp_path): + """Test entries without any date.""" + input_file = tmp_path / 'feed.rss' + input_file.write_text(''' + + + + No Date + https://example.com/nodate + + + + ''') + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'], + cwd=tmp_path, + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + output_file = tmp_path / 'urls.jsonl' + entry = json.loads(output_file.read_text().strip()) + assert 'bookmarked_at' not in entry + + +class TestTagsAndCategories: + """Test various tag and category formats.""" + + def test_rss_categories(self, tmp_path): + """Test RSS 2.0 category elements.""" + input_file = tmp_path / 'feed.rss' + input_file.write_text(''' + + + + Multi Category + https://example.com/cats + Tech + Web + Programming + + + + ''') + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'], + cwd=tmp_path, + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + output_file = tmp_path / 'urls.jsonl' + lines = output_file.read_text().strip().split('\n') + + tags = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Tag'] + tag_names = {t['name'] for t in tags} + assert 'Tech' in tag_names + assert 'Web' in tag_names + assert 'Programming' in tag_names + + snapshots = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Snapshot'] + entry = snapshots[0] + tags_list = entry['tags'].split(',') + assert len(tags_list) == 3 + + def test_atom_categories(self, tmp_path): + """Test Atom category elements with various attributes.""" + input_file = tmp_path / 'feed.atom' + input_file.write_text(''' + + + Atom Categories + + + + 2024-01-15T10:00:00Z + + + ''') + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'], + cwd=tmp_path, + capture_output=True, + text=True, 
+ ) + + assert result.returncode == 0 + output_file = tmp_path / 'urls.jsonl' + lines = output_file.read_text().strip().split('\n') + + tags = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Tag'] + tag_names = {t['name'] for t in tags} + # feedparser extracts the 'term' attribute + assert 'python' in tag_names + assert 'django' in tag_names + + def test_no_tags(self, tmp_path): + """Test entries without tags.""" + input_file = tmp_path / 'feed.rss' + input_file.write_text(''' + + + + No Tags + https://example.com/notags + + + + ''') + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'], + cwd=tmp_path, + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + output_file = tmp_path / 'urls.jsonl' + entry = json.loads(output_file.read_text().strip()) + assert 'tags' not in entry or entry['tags'] == '' + + def test_duplicate_tags(self, tmp_path): + """Test that duplicate tags are handled properly.""" + input_file = tmp_path / 'feed.rss' + input_file.write_text(''' + + + + Duplicate Tags + https://example.com/dups + Python + Python + Web + + + + ''') + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'], + cwd=tmp_path, + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + output_file = tmp_path / 'urls.jsonl' + lines = output_file.read_text().strip().split('\n') + tags = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Tag'] + # Tag records should be unique + tag_names = [t['name'] for t in tags] + assert tag_names.count('Python') == 1 + + +class TestCustomNamespaces: + """Test custom namespace handling (Dublin Core, Media RSS, etc.).""" + + def test_dublin_core_metadata(self, tmp_path): + """Test Dublin Core namespace fields.""" + input_file = tmp_path / 'feed.rdf' + input_file.write_text(''' + + + Dublin Core Feed + + + Dublin Core Article + https://example.com/dc1 + John Doe + Technology + 2024-01-15T10:30:00Z + Copyright 2024 + + + ''') + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'], + cwd=tmp_path, + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + output_file = tmp_path / 'urls.jsonl' + lines = output_file.read_text().strip().split('\n') + snapshots = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Snapshot'] + entry = snapshots[0] + + assert entry['url'] == 'https://example.com/dc1' + assert entry['title'] == 'Dublin Core Article' + # feedparser should parse dc:date as bookmarked_at + assert 'bookmarked_at' in entry + + def test_media_rss_namespace(self, tmp_path): + """Test Media RSS namespace (common in podcast feeds).""" + input_file = tmp_path / 'feed.rss' + input_file.write_text(''' + + + Media RSS Feed + + Podcast Episode 1 + https://example.com/podcast/1 + + + Mon, 15 Jan 2024 10:00:00 GMT + + + + ''') + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'], + cwd=tmp_path, + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + output_file = tmp_path / 'urls.jsonl' + entry = json.loads(output_file.read_text().strip()) + + assert entry['url'] == 'https://example.com/podcast/1' + assert entry['title'] == 'Podcast Episode 1' + + def test_itunes_namespace(self, tmp_path): + """Test iTunes namespace (common in podcast feeds).""" + input_file = tmp_path / 'feed.rss' + input_file.write_text(''' + + + iTunes Podcast + + Episode 1: Getting Started + 
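The category and date assertions above lean on feedparser's normalized fields. A minimal standalone sketch of the behavior being relied on, assuming the extractor reads each entry's tags[*].term and prefers published_parsed over updated_parsed, as these tests imply:

```python
import calendar
from datetime import datetime, timezone

import feedparser

feed_xml = '''<feed xmlns="http://www.w3.org/2005/Atom">
  <entry>
    <link href="https://atom.example.com/cat"/>
    <category term="python"/><category term="django"/>
    <published>2024-01-10T10:00:00Z</published>
    <updated>2024-01-15T10:00:00Z</updated>
  </entry>
</feed>'''

entry = feedparser.parse(feed_xml).entries[0]
tags = [t['term'] for t in entry.get('tags', [])]                      # ['python', 'django']
parsed = entry.get('published_parsed') or entry.get('updated_parsed')  # published wins when both exist
bookmarked_at = datetime.fromtimestamp(calendar.timegm(parsed), tz=timezone.utc).isoformat()
print(tags, bookmarked_at)                                             # ... 2024-01-10T10:00:00+00:00
```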
https://example.com/ep1 + Jane Smith + 45:30 + programming, tutorial, beginner + Tue, 16 Jan 2024 08:00:00 GMT + + + + ''') + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'], + cwd=tmp_path, + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + output_file = tmp_path / 'urls.jsonl' + lines = output_file.read_text().strip().split('\n') + snapshots = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Snapshot'] + entry = snapshots[0] + + assert entry['url'] == 'https://example.com/ep1' + assert entry['title'] == 'Episode 1: Getting Started' + + +class TestEdgeCases: + """Test edge cases and malformed data.""" + + def test_missing_title(self, tmp_path): + """Test entries without title.""" + input_file = tmp_path / 'feed.rss' + input_file.write_text(''' + + + + https://example.com/notitle + Mon, 15 Jan 2024 10:00:00 GMT + + + + ''') + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'], + cwd=tmp_path, + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + output_file = tmp_path / 'urls.jsonl' + entry = json.loads(output_file.read_text().strip()) + + assert entry['url'] == 'https://example.com/notitle' + assert 'title' not in entry + + def test_missing_link(self, tmp_path): + """Test entries without link (should be skipped).""" + input_file = tmp_path / 'feed.rss' + input_file.write_text(''' + + + + No Link + This entry has no link + + + Has Link + https://example.com/haslink + + + + ''') + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'], + cwd=tmp_path, + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + output_file = tmp_path / 'urls.jsonl' + entry = json.loads(output_file.read_text().strip()) + + # Should only have the entry with a link + assert entry['url'] == 'https://example.com/haslink' + assert '1 URL' in result.stdout + + def test_html_entities_in_title(self, tmp_path): + """Test HTML entities in titles are properly decoded.""" + input_file = tmp_path / 'feed.rss' + input_file.write_text(''' + + + + Using <div> & <span> tags + https://example.com/html + + + + ''') + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'], + cwd=tmp_path, + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + output_file = tmp_path / 'urls.jsonl' + entry = json.loads(output_file.read_text().strip()) + + assert entry['title'] == 'Using

& tags' + + def test_special_characters_in_tags(self, tmp_path): + """Test special characters in tags.""" + input_file = tmp_path / 'feed.rss' + input_file.write_text(''' + + + + Special Tags + https://example.com/special + C++ + Node.js + Web/Mobile + + + + ''') + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'], + cwd=tmp_path, + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + output_file = tmp_path / 'urls.jsonl' + lines = output_file.read_text().strip().split('\n') + + tags = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Tag'] + tag_names = {t['name'] for t in tags} + assert 'C++' in tag_names + assert 'Node.js' in tag_names + assert 'Web/Mobile' in tag_names + + def test_cdata_sections(self, tmp_path): + """Test CDATA sections in titles and descriptions.""" + input_file = tmp_path / 'feed.rss' + input_file.write_text(''' + + + + <![CDATA[Using <strong>HTML</strong> in titles]]> + https://example.com/cdata + markup]]> + + + + ''') + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'], + cwd=tmp_path, + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + output_file = tmp_path / 'urls.jsonl' + entry = json.loads(output_file.read_text().strip()) + + # feedparser should strip HTML tags + assert 'HTML' in entry['title'] + assert entry['url'] == 'https://example.com/cdata' + + def test_relative_urls(self, tmp_path): + """Test that relative URLs are preserved (feedparser handles them).""" + input_file = tmp_path / 'feed.rss' + input_file.write_text(''' + + + https://example.com + + Relative URL + /article/relative + + + + ''') + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'], + cwd=tmp_path, + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + output_file = tmp_path / 'urls.jsonl' + entry = json.loads(output_file.read_text().strip()) + + # feedparser may convert relative to absolute, or leave as-is + assert 'article/relative' in entry['url'] + + def test_unicode_characters(self, tmp_path): + """Test Unicode characters in feed content.""" + input_file = tmp_path / 'feed.rss' + input_file.write_text(''' + + + + Unicode: 日本語 Français 中文 العربية + https://example.com/unicode + 日本語 + Français + + + + ''', encoding='utf-8') + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'], + cwd=tmp_path, + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + output_file = tmp_path / 'urls.jsonl' + lines = output_file.read_text(encoding='utf-8').strip().split('\n') + + snapshots = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Snapshot'] + entry = snapshots[0] + assert '日本語' in entry['title'] + assert 'Français' in entry['title'] + + def test_very_long_title(self, tmp_path): + """Test handling of very long titles.""" + long_title = 'A' * 1000 + input_file = tmp_path / 'feed.rss' + input_file.write_text(f''' + + + + {long_title} + https://example.com/long + + + + ''') + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'], + cwd=tmp_path, + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + output_file = tmp_path / 'urls.jsonl' + entry = json.loads(output_file.read_text().strip()) + + assert len(entry['title']) == 1000 + assert entry['title'] == long_title + + def test_multiple_entries_batch(self, tmp_path): + """Test 
processing a large batch of entries.""" + items = [] + for i in range(100): + items.append(f''' + + Article {i} + https://example.com/article/{i} + Tag{i % 10} + Mon, {15 + (i % 15)} Jan 2024 10:00:00 GMT + + ''') + + input_file = tmp_path / 'feed.rss' + input_file.write_text(f''' + + + Large Feed + {''.join(items)} + + + ''') + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'], + cwd=tmp_path, + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + assert 'Found 100 URLs' in result.stdout + + output_file = tmp_path / 'urls.jsonl' + lines = output_file.read_text().strip().split('\n') + + # Should have 10 unique tags (Tag0-Tag9) + 100 snapshots + tags = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Tag'] + snapshots = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Snapshot'] + + assert len(tags) == 10 + assert len(snapshots) == 100 + + +class TestRealWorldFeeds: + """Test patterns from real-world RSS feeds.""" + + def test_medium_style_feed(self, tmp_path): + """Test Medium-style feed structure.""" + input_file = tmp_path / 'feed.rss' + input_file.write_text(''' + + + Medium Feed + + Article Title + https://medium.com/@user/article-slug-123abc + https://medium.com/p/123abc + Wed, 15 Jan 2024 10:30:00 GMT + Programming + JavaScript + Author Name + + + + ''') + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'], + cwd=tmp_path, + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + output_file = tmp_path / 'urls.jsonl' + lines = output_file.read_text().strip().split('\n') + + snapshots = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Snapshot'] + entry = snapshots[0] + assert 'medium.com' in entry['url'] + assert entry['title'] == 'Article Title' + + def test_reddit_style_feed(self, tmp_path): + """Test Reddit-style feed structure.""" + input_file = tmp_path / 'feed.rss' + input_file.write_text(''' + + Reddit Feed + + Post Title + + 2024-01-15T10:30:00+00:00 + + t3_abc123 + + + ''') + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'], + cwd=tmp_path, + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + output_file = tmp_path / 'urls.jsonl' + lines = output_file.read_text().strip().split('\n') + + snapshots = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Snapshot'] + entry = snapshots[0] + assert 'reddit.com' in entry['url'] + + def test_youtube_style_feed(self, tmp_path): + """Test YouTube-style feed structure.""" + input_file = tmp_path / 'feed.atom' + input_file.write_text(''' + + YouTube Channel + + Video Title + + 2024-01-15T10:30:00+00:00 + dQw4w9WgXcQ + UCxxxxxxxx + + + ''') + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'], + cwd=tmp_path, + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + output_file = tmp_path / 'urls.jsonl' + entry = json.loads(output_file.read_text().strip()) + + assert 'youtube.com' in entry['url'] + assert 'dQw4w9WgXcQ' in entry['url'] + + +if __name__ == '__main__': + pytest.main([__file__, '-v']) diff --git a/pyproject.toml b/pyproject.toml index deb3f7df..8ab4ff7a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,16 +47,13 @@ dependencies = [ "django-admin-data-views>=0.4.1", "django-object-actions>=4.3.0", "django-taggit==6.1.0", # TODO: remove this in favor of KVTags only - ### State Management 
"python-statemachine>=2.3.6", - ### CLI / Logging "click>=8.1.7", # for: nicer CLI command + argument definintions "rich>=13.8.0", # for: pretty CLI output "rich-click>=1.8.4", # for: pretty CLI command help text & output "ipython>=8.27.0", # for: archivebox shell (TODO: replace with bpython?) - ### Host OS / System "supervisor>=4.2.5", # for: archivebox server starting daphne and workers "psutil>=6.0.0", # for: monitoring orchestractor,actors,workers,etc. and machine.models.Process @@ -65,33 +62,28 @@ dependencies = [ "atomicwrites==1.4.1", # for: config file writes, index.json file writes, etc. (TODO: remove this deprecated lib in favor of archivebox.filestore.util/os.rename/os.replace) "python-crontab>=3.2.0", # for: archivebox schedule (TODO: remove this in favor of our own custom archivebox scheduler) "croniter>=3.0.3", # for: archivebox schedule (TODO: remove this in favor of our own custom archivebox scheduler) - ### Base Types "pydantic>=2.8.0", # for: archivebox.api (django-ninja), archivebox.config (pydantic-settings), and archivebox.index.schema (pydantic) "pydantic-settings>=2.5.2", # for: archivebox.config "python-benedict[io,parse]>=0.33.2", # for: dict replacement all over the codebase to allow .attr-style access "base32-crockford>=0.3.0", # for: encoding UUIDs in base32 - ### Static Typing "mypy-extensions>=1.0.0", # for: django-stubs type hints (TODO: remove in favor of pylance/pyright?) "django-stubs>=5.0.4", # for: vscode type hints on models and common django APIs - ### API clients "requests>=2.32.3", # for: fetching title, static files, headers (TODO: replace with httpx?) "sonic-client>=1.0.0", "pocket>=0.3.6", # for: importing URLs from Pocket API - ### Parsers "feedparser>=6.0.11", # for: parsing pocket/pinboard/etc. RSS/bookmarks imports "dateparser>=1.2.0", # for: parsing pocket/pinboard/etc. 
RSS/bookmark import dates "tzdata>=2024.2", # needed for dateparser {TZ: UTC} on some systems: https://github.com/ArchiveBox/ArchiveBox/issues/1553 "w3lib>=2.2.1", # used for parsing content-type encoding from http response headers & html tags - ### Extractor dependencies (optional binaries detected at runtime via shutil.which) "yt-dlp>=2024.1.0", # for: media extractor - ### Binary/Package Management "abx-pkg>=0.1.0", # for: detecting, versioning, and installing binaries via apt/brew/pip/npm + "gallery-dl>=1.31.1", ] [project.optional-dependencies] diff --git a/uv.lock b/uv.lock index e0471366..f09bb71b 100644 --- a/uv.lock +++ b/uv.lock @@ -77,6 +77,7 @@ dependencies = [ { name = "django-stubs", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "django-taggit", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "feedparser", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, + { name = "gallery-dl", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "ipython", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "mypy-extensions", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "platformdirs", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, @@ -172,6 +173,7 @@ requires-dist = [ { name = "django-taggit", specifier = "==6.1.0" }, { name = "djdt-flamegraph", marker = "extra == 'debug'", specifier = ">=0.2.13" }, { name = "feedparser", specifier = ">=6.0.11" }, + { name = "gallery-dl", specifier = ">=1.31.1" }, { name = "ipdb", marker = "extra == 'debug'", specifier = ">=0.13.13" }, { name = "ipython", specifier = ">=8.27.0" }, { name = "mypy-extensions", specifier = ">=1.0.0" }, @@ -819,6 +821,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ab/6e/81d47999aebc1b155f81eca4477a616a70f238a2549848c38983f3c22a82/ftfy-6.3.1-py3-none-any.whl", hash = "sha256:7c70eb532015cd2f9adb53f101fb6c7945988d023a085d127d1573dc49dd0083", size = 44821, upload-time = "2024-10-26T00:50:33.425Z" }, ] +[[package]] +name = "gallery-dl" +version = "1.31.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "requests", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/71/3a/3fd9e453ff2a24e6d51d5f7f9d1d9b4dc62ad22ec6a7e1cf1453e3551370/gallery_dl-1.31.1.tar.gz", hash = "sha256:5255279a06dcb7e6d0594f80cf693f7f8f07ae94deb8a797358c372a900959d4", size = 633786, upload-time = "2025-12-20T09:56:39.023Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7f/95/3ce479e13ab6be2e50de22e45e843c17354913bb6d7d393aed4d00915eaf/gallery_dl-1.31.1-py3-none-any.whl", hash = "sha256:b9bdd63f2d14affbac3df35ebe6462ae75a4032787913112035eb5c42a054467", size = 788352, upload-time = "2025-12-20T09:56:35.7Z" }, +] + [[package]] name = "googleapis-common-protos" version = "1.72.0" From 6fdc52cc578e947b87cbcb2239068d05d468334e Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 26 Dec 2025 18:25:52 -0800 Subject: [PATCH 2/2] add papersdl plugin --- .../on_Crawl__00_validate_papersdl.py | 129 ++++++++++ .../papersdl/on_Snapshot__54_papersdl.py | 232 ++++++++++++++++++ .../plugins/papersdl/templates/embed.html | 15 ++ .../papersdl/templates/fullscreen.html | 71 ++++++ .../plugins/papersdl/templates/icon.html | 1 + .../plugins/papersdl/templates/thumbnail.html | 7 + .../plugins/papersdl/tests/test_papersdl.py | 157 ++++++++++++ 7 files changed, 612 
insertions(+) create mode 100755 archivebox/plugins/papersdl/on_Crawl__00_validate_papersdl.py create mode 100755 archivebox/plugins/papersdl/on_Snapshot__54_papersdl.py create mode 100644 archivebox/plugins/papersdl/templates/embed.html create mode 100644 archivebox/plugins/papersdl/templates/fullscreen.html create mode 100644 archivebox/plugins/papersdl/templates/icon.html create mode 100644 archivebox/plugins/papersdl/templates/thumbnail.html create mode 100644 archivebox/plugins/papersdl/tests/test_papersdl.py diff --git a/archivebox/plugins/papersdl/on_Crawl__00_validate_papersdl.py b/archivebox/plugins/papersdl/on_Crawl__00_validate_papersdl.py new file mode 100755 index 00000000..5dda5650 --- /dev/null +++ b/archivebox/plugins/papersdl/on_Crawl__00_validate_papersdl.py @@ -0,0 +1,129 @@ +#!/usr/bin/env python3 +""" +Validation hook for papers-dl. + +Runs at crawl start to verify papers-dl binary is available. +Outputs JSONL for InstalledBinary and Machine config updates. +""" + +import os +import sys +import json +import shutil +import hashlib +import subprocess +from pathlib import Path + + +def get_binary_version(abspath: str, version_flag: str = '--version') -> str | None: + """Get version string from binary.""" + try: + result = subprocess.run( + [abspath, version_flag], + capture_output=True, + text=True, + timeout=5, + ) + if result.returncode == 0 and result.stdout: + first_line = result.stdout.strip().split('\n')[0] + return first_line[:64] + except Exception: + pass + return None + + +def get_binary_hash(abspath: str) -> str | None: + """Get SHA256 hash of binary.""" + try: + with open(abspath, 'rb') as f: + return hashlib.sha256(f.read()).hexdigest() + except Exception: + return None + + +def find_papersdl() -> dict | None: + """Find papers-dl binary.""" + try: + from abx_pkg import Binary, PipProvider, EnvProvider + + class PapersdlBinary(Binary): + name: str = 'papers-dl' + binproviders_supported = [PipProvider(), EnvProvider()] + + binary = PapersdlBinary() + loaded = binary.load() + if loaded and loaded.abspath: + return { + 'name': 'papers-dl', + 'abspath': str(loaded.abspath), + 'version': str(loaded.version) if loaded.version else None, + 'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None, + 'binprovider': loaded.binprovider.name if loaded.binprovider else 'env', + } + except ImportError: + pass + except Exception: + pass + + # Fallback to shutil.which + abspath = shutil.which('papers-dl') or os.environ.get('PAPERSDL_BINARY', '') + if abspath and Path(abspath).is_file(): + return { + 'name': 'papers-dl', + 'abspath': abspath, + 'version': get_binary_version(abspath), + 'sha256': get_binary_hash(abspath), + 'binprovider': 'env', + } + + return None + + +def main(): + # Check for papers-dl (required) + papersdl_result = find_papersdl() + + missing_deps = [] + + # Emit results for papers-dl + if papersdl_result and papersdl_result.get('abspath'): + print(json.dumps({ + 'type': 'InstalledBinary', + 'name': papersdl_result['name'], + 'abspath': papersdl_result['abspath'], + 'version': papersdl_result['version'], + 'sha256': papersdl_result['sha256'], + 'binprovider': papersdl_result['binprovider'], + })) + + print(json.dumps({ + 'type': 'Machine', + '_method': 'update', + 'key': 'config/PAPERSDL_BINARY', + 'value': papersdl_result['abspath'], + })) + + if papersdl_result['version']: + print(json.dumps({ + 'type': 'Machine', + '_method': 'update', + 'key': 'config/PAPERSDL_VERSION', + 'value': papersdl_result['version'], + })) + else: + print(json.dumps({ + 
'type': 'Dependency',
+            'bin_name': 'papers-dl',
+            'bin_providers': 'pip,env',
+        }))
+        missing_deps.append('papers-dl')
+
+    if missing_deps:
+        print(f"Missing dependencies: {', '.join(missing_deps)}", file=sys.stderr)
+        sys.exit(1)
+    else:
+        sys.exit(0)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/archivebox/plugins/papersdl/on_Snapshot__54_papersdl.py b/archivebox/plugins/papersdl/on_Snapshot__54_papersdl.py
new file mode 100755
index 00000000..b133194b
--- /dev/null
+++ b/archivebox/plugins/papersdl/on_Snapshot__54_papersdl.py
@@ -0,0 +1,232 @@
+#!/usr/bin/env python3
+"""
+Download scientific papers from a URL using papers-dl.
+
+Usage: on_Snapshot__54_papersdl.py --url=<url> --snapshot-id=<snapshot-id>
+Output: Downloads paper PDFs to $PWD/
+
+Environment variables:
+    PAPERSDL_BINARY: Path to papers-dl binary
+    PAPERSDL_TIMEOUT: Timeout in seconds (default: 300 for paper downloads)
+    PAPERSDL_EXTRA_ARGS: Extra arguments for papers-dl (space-separated)
+
+    # papers-dl feature toggles
+    SAVE_PAPERSDL: Enable papers-dl paper extraction (default: True)
+
+    # Fallback to ARCHIVING_CONFIG values if PAPERSDL_* not set:
+    TIMEOUT: Fallback timeout
+"""
+
+import json
+import os
+import re
+import shutil
+import subprocess
+import sys
+from pathlib import Path
+
+import rich_click as click
+
+
+# Extractor metadata
+EXTRACTOR_NAME = 'papersdl'
+BIN_NAME = 'papers-dl'
+BIN_PROVIDERS = 'pip,env'
+OUTPUT_DIR = '.'
+
+
+def get_env(name: str, default: str = '') -> str:
+    return os.environ.get(name, default).strip()
+
+
+def get_env_bool(name: str, default: bool = False) -> bool:
+    val = get_env(name, '').lower()
+    if val in ('true', '1', 'yes', 'on'):
+        return True
+    if val in ('false', '0', 'no', 'off'):
+        return False
+    return default
+
+
+def get_env_int(name: str, default: int = 0) -> int:
+    try:
+        return int(get_env(name, str(default)))
+    except ValueError:
+        return default
+
+
+def find_papersdl() -> str | None:
+    """Find papers-dl binary."""
+    papersdl = get_env('PAPERSDL_BINARY')
+    if papersdl and os.path.isfile(papersdl):
+        return papersdl
+
+    binary = shutil.which('papers-dl')
+    if binary:
+        return binary
+
+    return None
+
+
+def get_version(binary: str) -> str:
+    """Get papers-dl version."""
+    try:
+        result = subprocess.run([binary, '--version'], capture_output=True, text=True, timeout=10)
+        return result.stdout.strip()[:64]
+    except Exception:
+        return ''
+
+
+def extract_doi_from_url(url: str) -> str | None:
+    """Extract DOI from common paper URLs."""
+    # Match DOI pattern in URL
+    doi_pattern = r'10\.\d{4,}/[^\s]+'
+    match = re.search(doi_pattern, url)
+    if match:
+        return match.group(0)
+    return None
+
+
+def save_paper(url: str, binary: str) -> tuple[bool, str | None, str]:
+    """
+    Download paper using papers-dl.
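+
+    Example (hypothetical DOI and binary path, for illustration only):
+        ok, pdf_path, err = save_paper('https://doi.org/10.1000/xyz123', '/usr/local/bin/papers-dl')
+        # -> (True, path_to_pdf, '') when a PDF was fetched,
+        #    (True, None, '') when the page has no retrievable paper,
+        #    (False, None, error_message) when papers-dl itself failed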
+
+    Returns: (success, output_path, error_message)
+    """
+    # Get config from env
+    timeout = get_env_int('PAPERSDL_TIMEOUT') or get_env_int('TIMEOUT', 300)
+    extra_args = get_env('PAPERSDL_EXTRA_ARGS', '')
+
+    # Output directory is current directory (hook already runs in output dir)
+    output_dir = Path(OUTPUT_DIR)
+
+    # Try to extract DOI from URL
+    doi = extract_doi_from_url(url)
+    if not doi:
+        # If no DOI found, papers-dl might handle the URL directly
+        identifier = url
+    else:
+        identifier = doi
+
+    # Build command: papers-dl fetch <identifier> -o <output_dir>
+    cmd = [binary, 'fetch', identifier, '-o', str(output_dir)]
+
+    if extra_args:
+        cmd.extend(extra_args.split())
+
+    try:
+        result = subprocess.run(cmd, capture_output=True, timeout=timeout, text=True)
+
+        # Check if any PDF files were downloaded
+        pdf_files = list(output_dir.glob('*.pdf'))
+
+        if pdf_files:
+            # Return first PDF file
+            return True, str(pdf_files[0]), ''
+        else:
+            stderr = result.stderr
+            stdout = result.stdout
+
+            # These are NOT errors - page simply has no downloadable paper
+            stderr_lower = stderr.lower()
+            stdout_lower = stdout.lower()
+            if 'not found' in stderr_lower or 'not found' in stdout_lower:
+                return True, None, ''  # Paper not available - success, no output
+            if 'no results' in stderr_lower or 'no results' in stdout_lower:
+                return True, None, ''  # No paper found - success, no output
+            if result.returncode == 0:
+                return True, None, ''  # papers-dl exited cleanly, just no paper - success
+
+            # These ARE errors - something went wrong
+            if '404' in stderr or '404' in stdout:
+                return False, None, '404 Not Found'
+            if '403' in stderr or '403' in stdout:
+                return False, None, '403 Forbidden'
+
+            return False, None, f'papers-dl error: {stderr[:200] or stdout[:200]}'
+
+    except subprocess.TimeoutExpired:
+        return False, None, f'Timed out after {timeout} seconds'
+    except Exception as e:
+        return False, None, f'{type(e).__name__}: {e}'
+
+
+@click.command()
+@click.option('--url', required=True, help='URL to download paper from')
+@click.option('--snapshot-id', required=True, help='Snapshot UUID')
+def main(url: str, snapshot_id: str):
+    """Download scientific paper from a URL using papers-dl."""
+
+    version = ''
+    output = None
+    status = 'failed'
+    error = ''
+    binary = None
+    cmd_str = ''
+
+    try:
+        # Check if papers-dl is enabled
+        if not get_env_bool('SAVE_PAPERSDL', True):
+            print('Skipping papers-dl (SAVE_PAPERSDL=False)')
+            status = 'skipped'
+            print(f'STATUS={status}')
+            print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": status, "url": url, "snapshot_id": snapshot_id})}')
+            sys.exit(0)
+
+        # Find binary
+        binary = find_papersdl()
+        if not binary:
+            print(f'ERROR: {BIN_NAME} binary not found', file=sys.stderr)
+            print(f'DEPENDENCY_NEEDED={BIN_NAME}', file=sys.stderr)
+            print(f'BIN_PROVIDERS={BIN_PROVIDERS}', file=sys.stderr)
+            print('INSTALL_HINT=pip install papers-dl', file=sys.stderr)
+            sys.exit(1)
+
+        version = get_version(binary)
+        cmd_str = f'{binary} fetch {url}'
+
+        # Run extraction
+        success, output, error = save_paper(url, binary)
+        status = 'succeeded' if success else 'failed'
+
+        if success:
+            if output:
+                output_path = Path(output)
+                file_size = output_path.stat().st_size
+                print(f'papers-dl completed: {output_path.name} ({file_size} bytes)')
+            else:
+                print('papers-dl completed: no paper found for this URL (this is normal)')
+
+    except Exception as e:
+        error = f'{type(e).__name__}: {e}'
+        status = 'failed'
+
+    # Print results
+    if cmd_str:
+        print(f'CMD={cmd_str}')
+    if version:
+        print(f'VERSION={version}')
+    if output:
+        print(f'OUTPUT={output}')
+    print(f'STATUS={status}')
+
+    if error:
+        print(f'ERROR={error}', file=sys.stderr)
+
+    # Print JSON result
+    result_json = {
+        'extractor': EXTRACTOR_NAME,
+        'url': url,
+        'snapshot_id': snapshot_id,
+        'status': status,
+        'cmd_version': version,
+        'output': output,
+        'error': error or None,
+    }
+    print(f'RESULT_JSON={json.dumps(result_json)}')
+
+    sys.exit(0 if status == 'succeeded' else 1)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/archivebox/plugins/papersdl/templates/embed.html b/archivebox/plugins/papersdl/templates/embed.html
new file mode 100644
index 00000000..45ef7d71
--- /dev/null
+++ b/archivebox/plugins/papersdl/templates/embed.html
@@ -0,0 +1,15 @@
+
+
+ 📄 +

Scientific Paper

+
+
+ +
+ +
diff --git a/archivebox/plugins/papersdl/templates/fullscreen.html b/archivebox/plugins/papersdl/templates/fullscreen.html new file mode 100644 index 00000000..f2cee0c8 --- /dev/null +++ b/archivebox/plugins/papersdl/templates/fullscreen.html @@ -0,0 +1,71 @@ + + + + + + + Scientific Paper + + + +
+
📄
+

Scientific Paper

+
+
+ +
+ Download PDF + + diff --git a/archivebox/plugins/papersdl/templates/icon.html b/archivebox/plugins/papersdl/templates/icon.html new file mode 100644 index 00000000..063530f3 --- /dev/null +++ b/archivebox/plugins/papersdl/templates/icon.html @@ -0,0 +1 @@ +📄 \ No newline at end of file diff --git a/archivebox/plugins/papersdl/templates/thumbnail.html b/archivebox/plugins/papersdl/templates/thumbnail.html new file mode 100644 index 00000000..abe6f09a --- /dev/null +++ b/archivebox/plugins/papersdl/templates/thumbnail.html @@ -0,0 +1,7 @@ + +
+
+ 📄 + Paper +
+
diff --git a/archivebox/plugins/papersdl/tests/test_papersdl.py b/archivebox/plugins/papersdl/tests/test_papersdl.py new file mode 100644 index 00000000..bb2f3ec3 --- /dev/null +++ b/archivebox/plugins/papersdl/tests/test_papersdl.py @@ -0,0 +1,157 @@ +""" +Integration tests for papersdl plugin + +Tests verify: +1. Hook script exists +2. Dependencies installed via validation hooks +3. Verify deps with abx-pkg +4. Paper extraction works on paper URLs +5. JSONL output is correct +6. Config options work +7. Handles non-paper URLs gracefully +""" + +import json +import subprocess +import sys +import tempfile +from pathlib import Path +import pytest + +PLUGIN_DIR = Path(__file__).parent.parent +PLUGINS_ROOT = PLUGIN_DIR.parent +PAPERSDL_HOOK = PLUGIN_DIR / 'on_Snapshot__54_papersdl.py' +PAPERSDL_VALIDATE_HOOK = PLUGIN_DIR / 'on_Crawl__00_validate_papersdl.py' +TEST_URL = 'https://example.com' + +def test_hook_script_exists(): + """Verify on_Snapshot hook exists.""" + assert PAPERSDL_HOOK.exists(), f"Hook not found: {PAPERSDL_HOOK}" + + +def test_papersdl_validate_hook(): + """Test papers-dl validate hook checks for papers-dl.""" + # Run papers-dl validate hook + result = subprocess.run( + [sys.executable, str(PAPERSDL_VALIDATE_HOOK)], + capture_output=True, + text=True, + timeout=30 + ) + + # Hook exits 0 if all binaries found, 1 if any not found + # Parse output for InstalledBinary and Dependency records + found_binary = False + found_dependency = False + + for line in result.stdout.strip().split('\n'): + if line.strip(): + try: + record = json.loads(line) + if record.get('type') == 'InstalledBinary': + if record['name'] == 'papers-dl': + assert record['abspath'], "papers-dl should have abspath" + found_binary = True + elif record.get('type') == 'Dependency': + if record['bin_name'] == 'papers-dl': + found_dependency = True + except json.JSONDecodeError: + pass + + # papers-dl should either be found (InstalledBinary) or missing (Dependency) + assert found_binary or found_dependency, \ + "papers-dl should have either InstalledBinary or Dependency record" + + +def test_verify_deps_with_abx_pkg(): + """Verify papers-dl is available via abx-pkg.""" + from abx_pkg import Binary, PipProvider, EnvProvider, BinProviderOverrides + + missing_binaries = [] + + # Verify papers-dl is available + papersdl_binary = Binary(name='papers-dl', binproviders=[PipProvider(), EnvProvider()]) + papersdl_loaded = papersdl_binary.load() + if not (papersdl_loaded and papersdl_loaded.abspath): + missing_binaries.append('papers-dl') + + if missing_binaries: + pytest.skip(f"Binaries not available: {', '.join(missing_binaries)} - Dependency records should have been emitted") + + +def test_handles_non_paper_url(): + """Test that papers-dl extractor handles non-paper URLs gracefully via hook.""" + # Prerequisites checked by earlier test + + with tempfile.TemporaryDirectory() as tmpdir: + tmpdir = Path(tmpdir) + + # Run papers-dl extraction hook on non-paper URL + result = subprocess.run( + [sys.executable, str(PAPERSDL_HOOK), '--url', 'https://example.com', '--snapshot-id', 'test789'], + cwd=tmpdir, + capture_output=True, + text=True, + timeout=60 + ) + + # Should exit 0 even for non-paper URL + assert result.returncode == 0, f"Should handle non-paper URL gracefully: {result.stderr}" + + # Verify JSONL output + assert 'STATUS=' in result.stdout, "Should report status" + assert 'RESULT_JSON=' in result.stdout, "Should output RESULT_JSON" + + # Parse JSONL result + result_json = None + for line in result.stdout.split('\n'): + 
if line.startswith('RESULT_JSON='): + result_json = json.loads(line.split('=', 1)[1]) + break + + assert result_json, "Should have RESULT_JSON" + assert result_json['extractor'] == 'papersdl' + + +def test_config_save_papersdl_false_skips(): + """Test that SAVE_PAPERSDL=False causes skip.""" + import os + + with tempfile.TemporaryDirectory() as tmpdir: + env = os.environ.copy() + env['SAVE_PAPERSDL'] = 'False' + + result = subprocess.run( + [sys.executable, str(PAPERSDL_HOOK), '--url', TEST_URL, '--snapshot-id', 'test999'], + cwd=tmpdir, + capture_output=True, + text=True, + env=env, + timeout=30 + ) + + assert result.returncode == 0, f"Should exit 0 when skipping: {result.stderr}" + assert 'STATUS=' in result.stdout + + +def test_config_timeout(): + """Test that PAPERSDL_TIMEOUT config is respected.""" + import os + + with tempfile.TemporaryDirectory() as tmpdir: + env = os.environ.copy() + env['PAPERSDL_TIMEOUT'] = '5' + + result = subprocess.run( + [sys.executable, str(PAPERSDL_HOOK), '--url', 'https://example.com', '--snapshot-id', 'testtimeout'], + cwd=tmpdir, + capture_output=True, + text=True, + env=env, + timeout=30 + ) + + assert result.returncode == 0, "Should complete without hanging" + +if __name__ == '__main__': + pytest.main([__file__, '-v'])
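
For a quick manual smoke test outside pytest, the two hooks can be driven the same way the tests above drive them. This is a standalone sketch, not part of the patch: the DOI URL and snapshot id are hypothetical, and it assumes it is run from the repo root (if papers-dl is not installed, the validation hook emits a Dependency record instead of an InstalledBinary one and the snapshot run prints no RESULT_JSON line).

import json
import subprocess
import sys
import tempfile
from pathlib import Path

PLUGIN = Path('archivebox/plugins/papersdl')

# Validation hook: prints one JSON record per line (InstalledBinary/Machine, or Dependency if missing)
check = subprocess.run(
    [sys.executable, str(PLUGIN / 'on_Crawl__00_validate_papersdl.py')],
    capture_output=True, text=True, timeout=30,
)
for line in check.stdout.splitlines():
    if line.strip():
        print(json.loads(line).get('type'), '->', line[:80])

# Snapshot hook: writes any fetched PDF into its cwd and prints key=value lines plus a RESULT_JSON= summary
with tempfile.TemporaryDirectory() as outdir:
    proc = subprocess.run(
        [sys.executable, str(PLUGIN / 'on_Snapshot__54_papersdl.py'),
         '--url', 'https://doi.org/10.1000/xyz123', '--snapshot-id', 'manual-smoke-test'],
        cwd=outdir, capture_output=True, text=True, timeout=360,
    )
    for line in proc.stdout.splitlines():
        if line.startswith('RESULT_JSON='):
            print(json.dumps(json.loads(line.split('=', 1)[1]), indent=2))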