ArchiveBox/archivebox/core/models.py

__package__ = 'archivebox.core'

from typing import Optional, Dict, Iterable, Any, List, TYPE_CHECKING
from archivebox.uuid_compat import uuid7
from datetime import datetime, timedelta
from django_stubs_ext.db.models import TypedModelMeta

import os
import json
from pathlib import Path

from django.db import models
from django.db.models import QuerySet, Value, Case, When, IntegerField
from django.utils.functional import cached_property
from django.utils.text import slugify
from django.utils import timezone
from django.core.cache import cache
from django.urls import reverse, reverse_lazy
from django.contrib import admin
from django.conf import settings

from archivebox.config import CONSTANTS
from archivebox.misc.system import get_dir_size, atomic_write
from archivebox.misc.util import parse_date, base_url, domain as url_domain, to_json, ts_to_date_str, urlencode, htmlencode, urldecode
from archivebox.misc.hashing import get_dir_info
from archivebox.hooks import (
    EXTRACTOR_INDEXING_PRECEDENCE,
    get_plugins, get_plugin_name, get_plugin_icon,
    DEFAULT_PLUGIN_ICONS,
)
from archivebox.base_models.models import (
    ModelWithUUID, ModelWithSerializers, ModelWithOutputDir,
    ModelWithConfig, ModelWithNotes, ModelWithHealthStats,
    get_or_create_system_user_pk,
)
from workers.models import ModelWithStateMachine
from workers.tasks import bg_archive_snapshot
from crawls.models import Crawl
from machine.models import NetworkInterface, Binary


class Tag(ModelWithSerializers):
    # Keep AutoField for compatibility with main branch migrations
    # Don't use UUIDField here - requires complex FK transformation
    id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID')
    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False, related_name='tag_set')
    created_at = models.DateTimeField(default=timezone.now, db_index=True, null=True)
    modified_at = models.DateTimeField(auto_now=True)
    name = models.CharField(unique=True, blank=False, max_length=100)
    slug = models.SlugField(unique=True, blank=False, max_length=100, editable=False)

    snapshot_set: models.Manager['Snapshot']

    class Meta(TypedModelMeta):
        verbose_name = "Tag"
        verbose_name_plural = "Tags"

    def __str__(self):
        return self.name

    def save(self, *args, **kwargs):
        is_new = self._state.adding
        if is_new:
            self.slug = slugify(self.name)
            existing = set(Tag.objects.filter(slug__startswith=self.slug).values_list("slug", flat=True))
            i = None
            while True:
                slug = f"{slugify(self.name)}_{i}" if i else slugify(self.name)
                if slug not in existing:
                    self.slug = slug
                    break
                i = (i or 0) + 1
        super().save(*args, **kwargs)

        if is_new:
            from archivebox.misc.logging_util import log_worker_event
            log_worker_event(
                worker_type='DB',
                event='Created Tag',
                indent_level=0,
                metadata={
                    'id': self.id,
                    'name': self.name,
                    'slug': self.slug,
                },
            )

    @property
    def api_url(self) -> str:
        return reverse_lazy('api-1:get_tag', args=[self.id])

    @staticmethod
    def from_jsonl(record: Dict[str, Any], overrides: Dict[str, Any] = None):
        """
        Create/update Tag from JSONL record.

        Args:
            record: JSONL record with 'name' field
            overrides: Optional dict with 'snapshot' to auto-attach tag

        Returns:
            Tag instance or None
        """
        from archivebox.misc.jsonl import get_or_create_tag

        try:
            tag = get_or_create_tag(record)

            # Auto-attach to snapshot if in overrides
            if overrides and 'snapshot' in overrides and tag:
                overrides['snapshot'].tags.add(tag)

            return tag
        except ValueError:
            return None


class SnapshotTag(models.Model):
    id = models.AutoField(primary_key=True)
    snapshot = models.ForeignKey('Snapshot', db_column='snapshot_id', on_delete=models.CASCADE, to_field='id')
    tag = models.ForeignKey(Tag, db_column='tag_id', on_delete=models.CASCADE, to_field='id')

    class Meta:
        db_table = 'core_snapshot_tags'
        unique_together = [('snapshot', 'tag')]


class SnapshotQuerySet(models.QuerySet):
    """Custom QuerySet for Snapshot model with export methods that persist through .filter() etc."""

    # =========================================================================
    # Filtering Methods
    # =========================================================================

    FILTER_TYPES = {
        'exact': lambda pattern: models.Q(url=pattern),
        'substring': lambda pattern: models.Q(url__icontains=pattern),
        'regex': lambda pattern: models.Q(url__iregex=pattern),
        'domain': lambda pattern: models.Q(url__istartswith=f"http://{pattern}") | models.Q(url__istartswith=f"https://{pattern}") | models.Q(url__istartswith=f"ftp://{pattern}"),
        'tag': lambda pattern: models.Q(tags__name=pattern),
        'timestamp': lambda pattern: models.Q(timestamp=pattern),
    }

    def filter_by_patterns(self, patterns: List[str], filter_type: str = 'exact') -> 'SnapshotQuerySet':
        """Filter snapshots by URL patterns using specified filter type"""
        from archivebox.misc.logging import stderr

        q_filter = models.Q()
        for pattern in patterns:
            try:
                q_filter = q_filter | self.FILTER_TYPES[filter_type](pattern)
            except KeyError:
                stderr()
                stderr(f'[X] Got invalid pattern for --filter-type={filter_type}:', color='red')
                stderr(f'    {pattern}')
                raise SystemExit(2)
        return self.filter(q_filter)

    def search(self, patterns: List[str]) -> 'SnapshotQuerySet':
        """Search snapshots using the configured search backend"""
        from archivebox.config.common import SEARCH_BACKEND_CONFIG
        from archivebox.search import query_search_index
        from archivebox.misc.logging import stderr

        if not SEARCH_BACKEND_CONFIG.USE_SEARCHING_BACKEND:
            stderr()
            stderr('[X] The search backend is not enabled, set config.USE_SEARCHING_BACKEND = True', color='red')
            raise SystemExit(2)

        qsearch = self.none()
        for pattern in patterns:
            try:
                qsearch |= query_search_index(pattern)
            except:
                raise SystemExit(2)
        return self.all() & qsearch

    # =========================================================================
    # Export Methods
    # =========================================================================

    def to_json(self, with_headers: bool = False) -> str:
        """Generate JSON index from snapshots"""
        import sys
        from datetime import datetime, timezone as tz
        from archivebox.config import VERSION
        from archivebox.config.common import SERVER_CONFIG

        MAIN_INDEX_HEADER = {
            'info': 'This is an index of site data archived by ArchiveBox: The self-hosted web archive.',
            'schema': 'archivebox.index.json',
            'copyright_info': SERVER_CONFIG.FOOTER_INFO,
            'meta': {
                'project': 'ArchiveBox',
                'version': VERSION,
                'git_sha': VERSION,
                'website': 'https://ArchiveBox.io',
                'docs': 'https://github.com/ArchiveBox/ArchiveBox/wiki',
                'source': 'https://github.com/ArchiveBox/ArchiveBox',
                'issues': 'https://github.com/ArchiveBox/ArchiveBox/issues',
                'dependencies': {},
            },
        } if with_headers else {}

        snapshot_dicts = [s.to_dict(extended=True) for s in self.iterator(chunk_size=500)]

        if with_headers:
            output = {
                **MAIN_INDEX_HEADER,
                'num_links': len(snapshot_dicts),
                'updated': datetime.now(tz.utc),
                'last_run_cmd': sys.argv,
                'links': snapshot_dicts,
            }
        else:
            output = snapshot_dicts
        return to_json(output, indent=4, sort_keys=True)

    def to_csv(self, cols: Optional[List[str]] = None, header: bool = True, separator: str = ',', ljust: int = 0) -> str:
        """Generate CSV output from snapshots"""
        cols = cols or ['timestamp', 'is_archived', 'url']
        header_str = separator.join(col.ljust(ljust) for col in cols) if header else ''
        row_strs = (s.to_csv(cols=cols, ljust=ljust, separator=separator) for s in self.iterator(chunk_size=500))
        return '\n'.join((header_str, *row_strs))

    def to_html(self, with_headers: bool = True) -> str:
        """Generate main index HTML from snapshots"""
        from datetime import datetime, timezone as tz
        from django.template.loader import render_to_string
        from archivebox.config import VERSION
        from archivebox.config.common import SERVER_CONFIG
        from archivebox.config.version import get_COMMIT_HASH

        template = 'static_index.html' if with_headers else 'minimal_index.html'
        snapshot_list = list(self.iterator(chunk_size=500))

        return render_to_string(template, {
            'version': VERSION,
            'git_sha': get_COMMIT_HASH() or VERSION,
            'num_links': str(len(snapshot_list)),
            'date_updated': datetime.now(tz.utc).strftime('%Y-%m-%d'),
            'time_updated': datetime.now(tz.utc).strftime('%Y-%m-%d %H:%M'),
            'links': snapshot_list,
            'FOOTER_INFO': SERVER_CONFIG.FOOTER_INFO,
        })


class SnapshotManager(models.Manager.from_queryset(SnapshotQuerySet)):
    """Manager for Snapshot model - uses SnapshotQuerySet for chainable methods"""

    def filter(self, *args, **kwargs):
        domain = kwargs.pop('domain', None)
        qs = super().filter(*args, **kwargs)
        if domain:
            qs = qs.filter(url__icontains=f'://{domain}')
        return qs

    def get_queryset(self):
        return super().get_queryset().prefetch_related('tags', 'archiveresult_set')

    # =========================================================================
    # Import Methods
    # =========================================================================

    def create_or_update_from_dict(self, link_dict: Dict[str, Any], created_by_id: Optional[int] = None) -> 'Snapshot':
        """Create or update a Snapshot from a SnapshotDict (parser output)"""
        import re
        from archivebox.config.common import GENERAL_CONFIG

        url = link_dict['url']
        timestamp = link_dict.get('timestamp')
        title = link_dict.get('title')
        tags_str = link_dict.get('tags')

        tag_list = []
        if tags_str:
            tag_list = list(dict.fromkeys(
                tag.strip() for tag in re.split(GENERAL_CONFIG.TAG_SEPARATOR_PATTERN, tags_str)
                if tag.strip()
            ))

        # Get most recent snapshot with this URL (URLs can exist in multiple crawls)
        snapshot = self.filter(url=url).order_by('-created_at').first()
        if snapshot:
            if title and (not snapshot.title or len(title) > len(snapshot.title or '')):
                snapshot.title = title
                snapshot.save(update_fields=['title', 'modified_at'])
        else:
            if timestamp:
                while self.filter(timestamp=timestamp).exists():
                    timestamp = str(float(timestamp) + 1.0)

            snapshot = self.create(
                url=url,
                timestamp=timestamp,
                title=title,
                created_by_id=created_by_id or get_or_create_system_user_pk(),
            )

        if tag_list:
            existing_tags = set(snapshot.tags.values_list('name', flat=True))
            new_tags = set(tag_list) | existing_tags
            snapshot.save_tags(new_tags)

        return snapshot

    def create_from_dicts(self, link_dicts: List[Dict[str, Any]], created_by_id: Optional[int] = None) -> List['Snapshot']:
        """Create or update multiple Snapshots from a list of SnapshotDicts"""
        return [self.create_or_update_from_dict(d, created_by_id=created_by_id) for d in link_dicts]

    def remove(self, atomic: bool = False) -> tuple:
        """Remove snapshots from the database"""
        from django.db import transaction
        if atomic:
            with transaction.atomic():
                return self.delete()
        return self.delete()


class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine):
    id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False, related_name='snapshot_set', db_index=True)
    created_at = models.DateTimeField(default=timezone.now, db_index=True)
    modified_at = models.DateTimeField(auto_now=True)

    url = models.URLField(unique=False, db_index=True)  # URLs can appear in multiple crawls
    timestamp = models.CharField(max_length=32, unique=True, db_index=True, editable=False)
    bookmarked_at = models.DateTimeField(default=timezone.now, db_index=True)
    crawl: Crawl = models.ForeignKey(Crawl, on_delete=models.CASCADE, default=None, null=True, blank=True, related_name='snapshot_set', db_index=True)  # type: ignore
    parent_snapshot = models.ForeignKey('self', on_delete=models.SET_NULL, null=True, blank=True, related_name='child_snapshots', db_index=True, help_text='Parent snapshot that discovered this URL (for recursive crawling)')

    title = models.CharField(max_length=512, null=True, blank=True, db_index=True)
    downloaded_at = models.DateTimeField(default=None, null=True, editable=False, db_index=True, blank=True)
    depth = models.PositiveSmallIntegerField(default=0, db_index=True)  # 0 for root snapshot, 1+ for discovered URLs
    fs_version = models.CharField(max_length=10, default='0.9.0', help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().')
    current_step = models.PositiveSmallIntegerField(default=0, db_index=True, help_text='Current hook step being executed (0-9). Used for sequential hook execution.')

    retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
    status = ModelWithStateMachine.StatusField(choices=ModelWithStateMachine.StatusChoices, default=ModelWithStateMachine.StatusChoices.QUEUED)
    config = models.JSONField(default=dict, null=False, blank=False, editable=True)
    notes = models.TextField(blank=True, null=False, default='')
    output_dir = models.FilePathField(path=CONSTANTS.ARCHIVE_DIR, recursive=True, match='.*', default=None, null=True, blank=True, editable=True)

    tags = models.ManyToManyField(Tag, blank=True, through=SnapshotTag, related_name='snapshot_set', through_fields=('snapshot', 'tag'))

    state_machine_name = 'core.statemachines.SnapshotMachine'
    state_field_name = 'status'
    retry_at_field_name = 'retry_at'
    StatusChoices = ModelWithStateMachine.StatusChoices
    active_state = StatusChoices.STARTED

    objects = SnapshotManager()
    archiveresult_set: models.Manager['ArchiveResult']

    class Meta(TypedModelMeta):
        verbose_name = "Snapshot"
        verbose_name_plural = "Snapshots"
        constraints = [
            # Allow same URL in different crawls, but not duplicates within same crawl
            models.UniqueConstraint(fields=['url', 'crawl'], name='unique_url_per_crawl'),
            # Global timestamp uniqueness for 1:1 symlink mapping
            models.UniqueConstraint(fields=['timestamp'], name='unique_timestamp'),
        ]

    def __str__(self):
        return f'[{self.id}] {self.url[:64]}'

    def save(self, *args, **kwargs):
        is_new = self._state.adding
        if not self.bookmarked_at:
            self.bookmarked_at = self.created_at or timezone.now()
        if not self.timestamp:
            self.timestamp = str(self.bookmarked_at.timestamp())

        # Migrate filesystem if needed (happens automatically on save)
        if self.pk and self.fs_migration_needed:
            from django.db import transaction
            with transaction.atomic():
                # Walk through migration chain automatically
                current = self.fs_version
                target = self._fs_current_version()

                while current != target:
                    next_ver = self._fs_next_version(current)
                    method = f'_fs_migrate_from_{current.replace(".", "_")}_to_{next_ver.replace(".", "_")}'

                    # Only run if method exists (most are no-ops)
                    if hasattr(self, method):
                        getattr(self, method)()

                    current = next_ver

                # Update version (still in transaction)
                self.fs_version = target

        super().save(*args, **kwargs)
        if self.crawl and self.url not in self.crawl.urls:
            self.crawl.urls += f'\n{self.url}'
            self.crawl.save()

        if is_new:
            from archivebox.misc.logging_util import log_worker_event
            log_worker_event(
                worker_type='DB',
                event='Created Snapshot',
                indent_level=2,
                url=self.url,
                metadata={
                    'id': str(self.id),
                    'crawl_id': str(self.crawl_id) if self.crawl_id else None,
                    'depth': self.depth,
                    'status': self.status,
                },
            )

    # =========================================================================
    # Filesystem Migration Methods
    # =========================================================================

    @staticmethod
    def _fs_current_version() -> str:
        """Get current ArchiveBox filesystem version (normalized to x.x.0 format)"""
        from archivebox.config import VERSION
        # Normalize version to x.x.0 format (e.g., "0.9.0rc1" -> "0.9.0")
        parts = VERSION.split('.')
        if len(parts) >= 2:
            major, minor = parts[0], parts[1]
            # Strip any non-numeric suffix from minor version
            minor = ''.join(c for c in minor if c.isdigit())
            return f'{major}.{minor}.0'
        return '0.9.0'  # Fallback if version parsing fails

    @property
    def fs_migration_needed(self) -> bool:
        """Check if snapshot needs filesystem migration"""
        return self.fs_version != self._fs_current_version()

    def _fs_next_version(self, version: str) -> str:
        """Get next version in migration chain"""
        chain = ['0.7.0', '0.8.0', '0.9.0']
        try:
            idx = chain.index(version)
            return chain[idx + 1] if idx + 1 < len(chain) else self._fs_current_version()
        except ValueError:
            # Unknown version - skip to current
            return self._fs_current_version()

    def _fs_migrate_from_0_7_0_to_0_8_0(self):
        """Migration from 0.7.0 to 0.8.0 layout (no-op)"""
        # 0.7 and 0.8 both used archive/<timestamp>
        # Nothing to do!
        pass

    def _fs_migrate_from_0_8_0_to_0_9_0(self):
        """
        Migrate from flat to nested structure.

        0.8.x: archive/{timestamp}/
        0.9.x: users/{user}/snapshots/YYYYMMDD/{domain}/{uuid}/

        Transaction handling:
        1. Copy files INSIDE transaction
        2. Create symlink INSIDE transaction
        3. Update fs_version INSIDE transaction (done by save())
        4. Exit transaction (DB commit)
        5. Delete old files OUTSIDE transaction (after commit)
        """
        import shutil
        from django.db import transaction

        old_dir = self.get_storage_path_for_version('0.8.0')
        new_dir = self.get_storage_path_for_version('0.9.0')

        if not old_dir.exists() or old_dir == new_dir or new_dir.exists():
            return

        new_dir.mkdir(parents=True, exist_ok=True)

        # Copy all files (idempotent)
        for old_file in old_dir.rglob('*'):
            if not old_file.is_file():
                continue

            rel_path = old_file.relative_to(old_dir)
            new_file = new_dir / rel_path

            # Skip if already copied
            if new_file.exists() and new_file.stat().st_size == old_file.stat().st_size:
                continue

            new_file.parent.mkdir(parents=True, exist_ok=True)
            shutil.copy2(old_file, new_file)

        # Verify all copied
        old_files = {f.relative_to(old_dir): f.stat().st_size
                     for f in old_dir.rglob('*') if f.is_file()}
        new_files = {f.relative_to(new_dir): f.stat().st_size
                     for f in new_dir.rglob('*') if f.is_file()}

        if old_files.keys() != new_files.keys():
            missing = old_files.keys() - new_files.keys()
            raise Exception(f"Migration incomplete: missing {missing}")

        # Create backwards-compat symlink (INSIDE transaction)
        symlink_path = CONSTANTS.ARCHIVE_DIR / self.timestamp
        if symlink_path.is_symlink():
            symlink_path.unlink()

        if not symlink_path.exists() or symlink_path == old_dir:
            symlink_path.symlink_to(new_dir, target_is_directory=True)

        # Schedule old directory deletion AFTER transaction commits
        transaction.on_commit(lambda: self._cleanup_old_migration_dir(old_dir))

    def _cleanup_old_migration_dir(self, old_dir: Path):
        """
        Delete old directory after successful migration.
        Called via transaction.on_commit() after DB commit succeeds.
        """
        import shutil
        import logging

        if old_dir.exists() and not old_dir.is_symlink():
            try:
                shutil.rmtree(old_dir)
            except Exception as e:
                # Log but don't raise - migration succeeded, this is just cleanup
                logging.getLogger('archivebox.migration').warning(
                    f"Could not remove old migration directory {old_dir}: {e}"
                )

    # =========================================================================
    # Path Calculation and Migration Helpers
    # =========================================================================

    @staticmethod
    def extract_domain_from_url(url: str) -> str:
        """
        Extract domain from URL for 0.9.x path structure.
        Uses full hostname with sanitized special chars.

        Examples:
            https://example.com:8080 → example.com_8080
            https://sub.example.com → sub.example.com
            file:///path → localhost
            data:text/html → data
        """
        from urllib.parse import urlparse

        try:
            parsed = urlparse(url)

            if parsed.scheme in ('http', 'https'):
                if parsed.port:
                    return f"{parsed.hostname}_{parsed.port}".replace(':', '_')
                return parsed.hostname or 'unknown'
            elif parsed.scheme == 'file':
                return 'localhost'
            elif parsed.scheme:
                return parsed.scheme
            else:
                return 'unknown'
        except Exception:
            return 'unknown'

    def get_storage_path_for_version(self, version: str) -> Path:
        """
        Calculate storage path for specific filesystem version.
        Centralizes path logic so it's reusable.

        0.7.x/0.8.x: archive/{timestamp}
        0.9.x: users/{username}/snapshots/YYYYMMDD/{domain}/{uuid}/
        """
        from datetime import datetime

        if version in ('0.7.0', '0.8.0'):
            return CONSTANTS.ARCHIVE_DIR / self.timestamp

        elif version in ('0.9.0', '1.0.0'):
            username = self.created_by.username if self.created_by else 'unknown'

            # Use created_at for date grouping (fallback to timestamp)
            if self.created_at:
                date_str = self.created_at.strftime('%Y%m%d')
            else:
                date_str = datetime.fromtimestamp(float(self.timestamp)).strftime('%Y%m%d')

            domain = self.extract_domain_from_url(self.url)

            return (
                CONSTANTS.DATA_DIR / 'users' / username / 'snapshots' /
                date_str / domain / str(self.id)
            )
        else:
            # Unknown version - use current
            return self.get_storage_path_for_version(self._fs_current_version())

    # =========================================================================
    # Loading and Creation from Filesystem (Used by archivebox update ONLY)
    # =========================================================================

    @classmethod
    def load_from_directory(cls, snapshot_dir: Path) -> Optional['Snapshot']:
        """
        Load existing Snapshot from DB by reading index.json.

        Reads index.json, extracts url+timestamp, queries DB.
        Returns existing Snapshot or None if not found/invalid.
        Does NOT create new snapshots.

        ONLY used by: archivebox update (for orphan detection)
        """
        import json

        index_path = snapshot_dir / 'index.json'
        if not index_path.exists():
            return None

        try:
            with open(index_path) as f:
                data = json.load(f)
        except:
            return None

        url = data.get('url')
        if not url:
            return None

        # Get timestamp - prefer index.json, fallback to folder name
        timestamp = cls._select_best_timestamp(
            index_timestamp=data.get('timestamp'),
            folder_name=snapshot_dir.name
        )

        if not timestamp:
            return None

        # Look up existing
        try:
            return cls.objects.get(url=url, timestamp=timestamp)
        except cls.DoesNotExist:
            return None
        except cls.MultipleObjectsReturned:
            # Should not happen with unique constraint
            return cls.objects.filter(url=url, timestamp=timestamp).first()

    @classmethod
    def create_from_directory(cls, snapshot_dir: Path) -> Optional['Snapshot']:
        """
        Create new Snapshot from orphaned directory.

        Validates timestamp, ensures uniqueness.
        Returns new UNSAVED Snapshot or None if invalid.

        ONLY used by: archivebox update (for orphan import)
        """
        import json

        index_path = snapshot_dir / 'index.json'
        if not index_path.exists():
            return None

        try:
            with open(index_path) as f:
                data = json.load(f)
        except:
            return None

        url = data.get('url')
        if not url:
            return None

        # Get and validate timestamp
        timestamp = cls._select_best_timestamp(
            index_timestamp=data.get('timestamp'),
            folder_name=snapshot_dir.name
        )

        if not timestamp:
            return None

        # Ensure uniqueness (reuses existing logic from create_or_update_from_dict)
        timestamp = cls._ensure_unique_timestamp(url, timestamp)

        # Detect version
        fs_version = cls._detect_fs_version_from_index(data)

        return cls(
            url=url,
            timestamp=timestamp,
            title=data.get('title', ''),
            fs_version=fs_version,
            created_by_id=get_or_create_system_user_pk(),
        )

    @staticmethod
    def _select_best_timestamp(index_timestamp: str, folder_name: str) -> Optional[str]:
        """
        Select best timestamp from index.json vs folder name.

        Validates range (1995-2035).
        Prefers index.json if valid.
        """
        def is_valid_timestamp(ts):
            try:
                ts_int = int(float(ts))
                # 1995-01-01 to 2035-12-31
                return 788918400 <= ts_int <= 2082758400
            except:
                return False

        index_valid = is_valid_timestamp(index_timestamp) if index_timestamp else False
        folder_valid = is_valid_timestamp(folder_name)

        if index_valid:
            return str(int(float(index_timestamp)))
        elif folder_valid:
            return str(int(float(folder_name)))
        else:
            return None

    @classmethod
    def _ensure_unique_timestamp(cls, url: str, timestamp: str) -> str:
        """
        Ensure timestamp is globally unique.
        If collision with different URL, increment by 1 until unique.

        NOTE: Logic already exists in create_or_update_from_dict (line 266-267)
        This is just an extracted, reusable version.
        """
        while cls.objects.filter(timestamp=timestamp).exclude(url=url).exists():
            timestamp = str(int(float(timestamp)) + 1)
        return timestamp

    @staticmethod
    def _detect_fs_version_from_index(data: dict) -> str:
        """
        Detect fs_version from index.json structure.

        - Has fs_version field: use it
        - Has history dict: 0.7.0
        - Has archive_results list: 0.8.0
        - Default: 0.7.0
        """
        if 'fs_version' in data:
            return data['fs_version']
        if 'history' in data and 'archive_results' not in data:
            return '0.7.0'
        if 'archive_results' in data:
            return '0.8.0'
        return '0.7.0'

    # =========================================================================
    # Index.json Reconciliation
    # =========================================================================

    def reconcile_with_index_json(self):
        """
        Merge index.json with DB. DB is source of truth.

        - Title: longest non-URL
        - Tags: union
        - ArchiveResults: keep both (by plugin+start_ts)

        Writes back in 0.9.x format.

        Used by: archivebox update (to sync index.json with DB)
        """
        import json

        index_path = Path(self.output_dir) / 'index.json'

        index_data = {}
        if index_path.exists():
            try:
                with open(index_path) as f:
                    index_data = json.load(f)
            except:
                pass

        # Merge title
        self._merge_title_from_index(index_data)

        # Merge tags
        self._merge_tags_from_index(index_data)

        # Merge ArchiveResults
        self._merge_archive_results_from_index(index_data)

        # Write back
        self.write_index_json()

    def _merge_title_from_index(self, index_data: dict):
        """Merge title - prefer longest non-URL title."""
        index_title = index_data.get('title', '').strip()
        db_title = self.title or ''

        candidates = [t for t in [index_title, db_title] if t and t != self.url]
        if candidates:
            best_title = max(candidates, key=len)
            if self.title != best_title:
                self.title = best_title

    def _merge_tags_from_index(self, index_data: dict):
        """Merge tags - union of both sources."""
        from django.db import transaction

        index_tags = set(index_data.get('tags', '').split(',')) if index_data.get('tags') else set()
        index_tags = {t.strip() for t in index_tags if t.strip()}

        db_tags = set(self.tags.values_list('name', flat=True))

        new_tags = index_tags - db_tags
        if new_tags:
            with transaction.atomic():
                for tag_name in new_tags:
                    tag, _ = Tag.objects.get_or_create(name=tag_name)
                    self.tags.add(tag)

    def _merge_archive_results_from_index(self, index_data: dict):
        """Merge ArchiveResults - keep both (by plugin+start_ts)."""
        existing = {
            (ar.plugin, ar.start_ts): ar
            for ar in ArchiveResult.objects.filter(snapshot=self)
        }

        # Handle 0.8.x format (archive_results list)
        for result_data in index_data.get('archive_results', []):
            self._create_archive_result_if_missing(result_data, existing)

        # Handle 0.7.x format (history dict)
        if 'history' in index_data and isinstance(index_data['history'], dict):
            for plugin, result_list in index_data['history'].items():
                if isinstance(result_list, list):
                    for result_data in result_list:
                        # Support both old 'extractor' and new 'plugin' keys for backwards compat
                        result_data['plugin'] = result_data.get('plugin') or result_data.get('extractor') or plugin
                        self._create_archive_result_if_missing(result_data, existing)

    def _create_archive_result_if_missing(self, result_data: dict, existing: dict):
        """Create ArchiveResult if not already in DB."""
        from dateutil import parser

        # Support both old 'extractor' and new 'plugin' keys for backwards compat
        plugin = result_data.get('plugin') or result_data.get('extractor', '')
        if not plugin:
            return

        start_ts = None
        if result_data.get('start_ts'):
            try:
                start_ts = parser.parse(result_data['start_ts'])
            except:
                pass

        if (plugin, start_ts) in existing:
            return

        try:
            end_ts = None
            if result_data.get('end_ts'):
                try:
                    end_ts = parser.parse(result_data['end_ts'])
                except:
                    pass

            ArchiveResult.objects.create(
                snapshot=self,
                plugin=plugin,
                hook_name=result_data.get('hook_name', ''),
                status=result_data.get('status', 'failed'),
                output_str=result_data.get('output', ''),
                cmd=result_data.get('cmd', []),
                pwd=result_data.get('pwd', str(self.output_dir)),
                start_ts=start_ts,
                end_ts=end_ts,
                created_by=self.created_by,
            )
        except:
            pass

    def write_index_json(self):
        """Write index.json in 0.9.x format."""
        import json

        index_path = Path(self.output_dir) / 'index.json'

        data = {
            'url': self.url,
            'timestamp': self.timestamp,
            'title': self.title or '',
            'tags': ','.join(sorted(self.tags.values_list('name', flat=True))),
            'fs_version': self.fs_version,
            'bookmarked_at': self.bookmarked_at.isoformat() if self.bookmarked_at else None,
            'created_at': self.created_at.isoformat() if self.created_at else None,
            'archive_results': [
                {
                    'plugin': ar.plugin,
                    'status': ar.status,
                    'start_ts': ar.start_ts.isoformat() if ar.start_ts else None,
                    'end_ts': ar.end_ts.isoformat() if ar.end_ts else None,
                    'output': ar.output_str or '',
                    'cmd': ar.cmd if isinstance(ar.cmd, list) else [],
                    'pwd': ar.pwd,
                }
                for ar in ArchiveResult.objects.filter(snapshot=self).order_by('start_ts')
            ],
        }

        index_path.parent.mkdir(parents=True, exist_ok=True)
        with open(index_path, 'w') as f:
            json.dump(data, f, indent=2, sort_keys=True)

    # =========================================================================
    # Snapshot Utilities
    # =========================================================================

    @staticmethod
    def move_directory_to_invalid(snapshot_dir: Path):
        """
        Move invalid directory to data/invalid/YYYYMMDD/.

        Used by: archivebox update (when encountering invalid directories)
        """
        from datetime import datetime
        import shutil

        invalid_dir = CONSTANTS.DATA_DIR / 'invalid' / datetime.now().strftime('%Y%m%d')
        invalid_dir.mkdir(parents=True, exist_ok=True)

        dest = invalid_dir / snapshot_dir.name
        counter = 1
        while dest.exists():
            dest = invalid_dir / f"{snapshot_dir.name}_{counter}"
            counter += 1

        try:
            shutil.move(str(snapshot_dir), str(dest))
        except:
            pass

    @classmethod
    def find_and_merge_duplicates(cls) -> int:
        """
        Find and merge snapshots with same url:timestamp.
        Returns count of duplicate sets merged.

        Used by: archivebox update (Phase 3: deduplication)
        """
        from django.db.models import Count

        duplicates = (
            cls.objects
            .values('url', 'timestamp')
            .annotate(count=Count('id'))
            .filter(count__gt=1)
        )

        merged = 0
        for dup in duplicates.iterator():
            snapshots = list(
                cls.objects
                .filter(url=dup['url'], timestamp=dup['timestamp'])
                .order_by('created_at')  # Keep oldest
            )

            if len(snapshots) > 1:
                try:
                    cls._merge_snapshots(snapshots)
                    merged += 1
                except:
                    pass

        return merged

    @classmethod
    def _merge_snapshots(cls, snapshots: list['Snapshot']):
        """
        Merge exact duplicates.
        Keep oldest, union files + ArchiveResults.
        """
        import shutil

        keeper = snapshots[0]
        duplicates = snapshots[1:]

        keeper_dir = Path(keeper.output_dir)

        for dup in duplicates:
            dup_dir = Path(dup.output_dir)

            # Merge files
            if dup_dir.exists() and dup_dir != keeper_dir:
                for dup_file in dup_dir.rglob('*'):
                    if not dup_file.is_file():
                        continue

                    rel = dup_file.relative_to(dup_dir)
                    keeper_file = keeper_dir / rel

                    if not keeper_file.exists():
                        keeper_file.parent.mkdir(parents=True, exist_ok=True)
                        shutil.copy2(dup_file, keeper_file)

                try:
                    shutil.rmtree(dup_dir)
                except:
                    pass

            # Merge tags
            for tag in dup.tags.all():
                keeper.tags.add(tag)

            # Move ArchiveResults
            ArchiveResult.objects.filter(snapshot=dup).update(snapshot=keeper)

            # Delete
            dup.delete()

    # =========================================================================
    # Output Directory Properties
    # =========================================================================

    @property
    def output_dir_parent(self) -> str:
        return 'archive'

    @property
    def output_dir_name(self) -> str:
        return str(self.timestamp)

    def archive(self, overwrite=False, methods=None):
        return bg_archive_snapshot(self, overwrite=overwrite, methods=methods)

    @admin.display(description='Tags')
    def tags_str(self, nocache=True) -> str | None:
        calc_tags_str = lambda: ','.join(sorted(tag.name for tag in self.tags.all()))
        if hasattr(self, '_prefetched_objects_cache') and 'tags' in self._prefetched_objects_cache:
            return calc_tags_str()
        cache_key = f'{self.pk}-tags'
        return cache.get_or_set(cache_key, calc_tags_str) if not nocache else calc_tags_str()

    def icons(self) -> str:
        """Generate HTML icons showing which extractor plugins have succeeded for this snapshot"""
        from django.utils.html import format_html, mark_safe

        cache_key = f'result_icons:{self.pk}:{(self.downloaded_at or self.modified_at or self.created_at or self.bookmarked_at).timestamp()}'

        def calc_icons():
            if hasattr(self, '_prefetched_objects_cache') and 'archiveresult_set' in self._prefetched_objects_cache:
                archive_results = {r.plugin: r for r in self.archiveresult_set.all() if r.status == "succeeded" and (r.output_files or r.output_str)}
            else:
                # Filter for results that have either output_files or output_str
                from django.db.models import Q
                archive_results = {r.plugin: r for r in self.archiveresult_set.filter(
                    Q(status="succeeded") & (Q(output_files__isnull=False) | ~Q(output_str=''))
                )}

            path = self.archive_path
            canon = self.canonical_outputs()
            output = ""
            output_template = '<a href="/{}/{}" class="exists-{}" title="{}">{}</a> &nbsp;'

            # Get all plugins from hooks system (sorted by numeric prefix)
            all_plugins = [get_plugin_name(e) for e in get_plugins()]

            for plugin in all_plugins:
                result = archive_results.get(plugin)
                existing = result and result.status == 'succeeded' and (result.output_files or result.output_str)
                icon = get_plugin_icon(plugin)
                output += format_html(
                    output_template,
                    path,
                    canon.get(plugin, plugin + '/'),
                    str(bool(existing)),
                    plugin,
                    icon
                )

            return format_html('<span class="files-icons" style="font-size: 1.1em; opacity: 0.8; min-width: 240px; display: inline-block">{}</span>', mark_safe(output))

        cache_result = cache.get(cache_key)
        if cache_result:
            return cache_result

        fresh_result = calc_icons()
        cache.set(cache_key, fresh_result, timeout=60 * 60 * 24)
        return fresh_result

    @property
    def api_url(self) -> str:
        return reverse_lazy('api-1:get_snapshot', args=[self.id])

    def get_absolute_url(self):
        return f'/{self.archive_path}'

    @cached_property
    def domain(self) -> str:
        return url_domain(self.url)

    @cached_property
    def output_dir(self):
        """The filesystem path to the snapshot's output directory."""
        import os

        current_path = self.get_storage_path_for_version(self.fs_version)

        if current_path.exists():
            return str(current_path)

        # Check for backwards-compat symlink
        old_path = CONSTANTS.ARCHIVE_DIR / self.timestamp
        if old_path.is_symlink():
            return str(Path(os.readlink(old_path)).resolve())
        elif old_path.exists():
            return str(old_path)

        return str(current_path)

    @cached_property
    def archive_path(self):
        return f'{CONSTANTS.ARCHIVE_DIR_NAME}/{self.timestamp}'

    @cached_property
    def archive_size(self):
        try:
            return get_dir_size(self.output_dir)[0]
        except Exception:
            return 0

    def save_tags(self, tags: Iterable[str] = ()) -> None:
        tags_id = [Tag.objects.get_or_create(name=tag)[0].pk for tag in tags if tag.strip()]
        self.tags.clear()
        self.tags.add(*tags_id)

    def pending_archiveresults(self) -> QuerySet['ArchiveResult']:
        return self.archiveresult_set.exclude(status__in=ArchiveResult.FINAL_OR_ACTIVE_STATES)

    def run(self) -> list['ArchiveResult']:
        """
        Execute this Snapshot by creating ArchiveResults for all enabled extractors.

        Called by the state machine when entering the 'started' state.
        """
        return self.create_pending_archiveresults()

    def cleanup(self):
        """
        Clean up background ArchiveResult hooks.

        Called by the state machine when entering the 'sealed' state.
        Kills any background hooks and finalizes their ArchiveResults.
        """
        from pathlib import Path
        from archivebox.hooks import kill_process

        # Kill any background ArchiveResult hooks
        if not self.OUTPUT_DIR.exists():
            return

        for plugin_dir in self.OUTPUT_DIR.iterdir():
            if not plugin_dir.is_dir():
                continue
            pid_file = plugin_dir / 'hook.pid'
            if pid_file.exists():
                kill_process(pid_file, validate=True)  # Use validation

                # Update the ArchiveResult from filesystem
                plugin_name = plugin_dir.name
                results = self.archiveresult_set.filter(
                    status=ArchiveResult.StatusChoices.STARTED,
                    pwd__contains=plugin_name
                )
                for ar in results:
                    ar.update_from_output()

    def has_running_background_hooks(self) -> bool:
        """
        Check if any ArchiveResult background hooks are still running.

        Used by state machine to determine if snapshot is finished.
        """
        from archivebox.hooks import process_is_alive

        if not self.OUTPUT_DIR.exists():
            return False

        for plugin_dir in self.OUTPUT_DIR.iterdir():
            if not plugin_dir.is_dir():
                continue
            pid_file = plugin_dir / 'hook.pid'
            if process_is_alive(pid_file):
                return True

        return False

    @staticmethod
    def from_jsonl(record: Dict[str, Any], overrides: Dict[str, Any] = None):
        """
        Create/update Snapshot from JSONL record.

        Args:
            record: JSONL record with 'url' field and optional metadata
            overrides: Dict with 'crawl', 'snapshot' (parent), 'created_by_id'

        Returns:
            Snapshot instance or None

        Note:
            Filtering (depth, URL allowlist/denylist) should be done by caller
            BEFORE calling this method. This method just creates the snapshot.
        """
        from archivebox.misc.jsonl import get_or_create_snapshot
        from django.utils import timezone

        overrides = overrides or {}
        url = record.get('url')
        if not url:
            return None

        # Apply crawl context metadata
        crawl = overrides.get('crawl')
        snapshot = overrides.get('snapshot')  # Parent snapshot

        if crawl:
            record.setdefault('crawl_id', str(crawl.id))
            record.setdefault('depth', (snapshot.depth + 1 if snapshot else 1))
            if snapshot:
                record.setdefault('parent_snapshot_id', str(snapshot.id))

        try:
            created_by_id = overrides.get('created_by_id') or (snapshot.created_by_id if snapshot else None)
            new_snapshot = get_or_create_snapshot(record, created_by_id=created_by_id)

            # Queue for extraction
            new_snapshot.status = Snapshot.StatusChoices.QUEUED
            new_snapshot.retry_at = timezone.now()
            new_snapshot.save()

            return new_snapshot
        except ValueError:
            return None

    def create_pending_archiveresults(self) -> list['ArchiveResult']:
        """
        Create ArchiveResult records for all enabled hooks.

        Uses the hooks system to discover available hooks from:
        - archivebox/plugins/*/on_Snapshot__*.{py,sh,js}
        - data/plugins/*/on_Snapshot__*.{py,sh,js}

        Creates one ArchiveResult per hook (not per plugin), with hook_name set.
        This enables step-based execution where all hooks in a step can run in parallel.
        """
        from archivebox.hooks import discover_hooks

        hooks = discover_hooks('Snapshot')
        archiveresults = []

        for hook_path in hooks:
            hook_name = hook_path.name  # e.g., 'on_Snapshot__50_wget.py'
            plugin = hook_path.parent.name  # e.g., 'wget'

            # Check if AR already exists for this specific hook
            if ArchiveResult.objects.filter(snapshot=self, hook_name=hook_name).exists():
                continue

            archiveresult, created = ArchiveResult.objects.get_or_create(
                snapshot=self,
                hook_name=hook_name,
                defaults={
                    'plugin': plugin,
                    'status': ArchiveResult.INITIAL_STATE,
                    'retry_at': timezone.now(),
                    'created_by_id': self.created_by_id,
                },
            )
            if archiveresult.status == ArchiveResult.INITIAL_STATE:
                archiveresults.append(archiveresult)

        return archiveresults

    def advance_step_if_ready(self) -> bool:
        """
        Advance current_step if all foreground hooks in current step are finished.

        Called by the state machine to check if step can advance.
        Background hooks (.bg) don't block step advancement.

        Step advancement rules:
        - All foreground ARs in current step must be finished (SUCCEEDED/FAILED/SKIPPED)
        - Background ARs (hook_name contains '.bg.') are ignored for advancement
        - When ready, increments current_step by 1 (up to 9)

        Returns:
            True if step was advanced, False if not ready or already at step 9.
        """
        from archivebox.hooks import extract_step, is_background_hook

        if self.current_step >= 9:
            return False  # Already at final step

        # Get all ARs for current step that are foreground
        current_step_ars = self.archiveresult_set.filter(
            hook_name__isnull=False
        ).exclude(hook_name='')

        # Check each AR in current step
        for ar in current_step_ars:
            ar_step = extract_step(ar.hook_name)
            if ar_step != self.current_step:
                continue  # Not in current step

            if is_background_hook(ar.hook_name):
                continue  # Background hooks don't block

            # Foreground hook in current step - check if finished
            if ar.status not in ArchiveResult.FINAL_OR_ACTIVE_STATES:
                # Still pending/queued - can't advance
                return False

            if ar.status == ArchiveResult.StatusChoices.STARTED:
                # Still running - can't advance
                return False

        # All foreground hooks in current step are finished - advance!
        self.current_step += 1
        self.save(update_fields=['current_step', 'modified_at'])
        return True

    def retry_failed_archiveresults(self, retry_at: Optional['timezone.datetime'] = None) -> int:
        """
        Reset failed/skipped ArchiveResults to queued for retry.

        This enables seamless retry of the entire extraction pipeline:
        - Resets FAILED and SKIPPED results to QUEUED
        - Sets retry_at so workers pick them up
        - Plugins run in order (numeric prefix)
        - Each plugin checks its dependencies at runtime

        Dependency handling (e.g., chrome_session → screenshot):
        - Plugins check if required outputs exist before running
        - If dependency output missing → plugin returns 'skipped'
        - On retry, if dependency now succeeds → dependent can run

        Returns count of ArchiveResults reset.
        """
        retry_at = retry_at or timezone.now()

        count = self.archiveresult_set.filter(
            status__in=[
                ArchiveResult.StatusChoices.FAILED,
                ArchiveResult.StatusChoices.SKIPPED,
            ]
        ).update(
            status=ArchiveResult.StatusChoices.QUEUED,
            retry_at=retry_at,
            output=None,
            start_ts=None,
            end_ts=None,
        )

        # Also reset the snapshot and current_step so it gets re-checked from the beginning
        if count > 0:
            self.status = self.StatusChoices.STARTED
            self.retry_at = retry_at
            self.current_step = 0  # Reset to step 0 for retry
            self.save(update_fields=['status', 'retry_at', 'current_step', 'modified_at'])

        return count

    # =========================================================================
    # URL Helper Properties (migrated from Link schema)
    # =========================================================================

    @cached_property
    def url_hash(self) -> str:
        from hashlib import sha256
        return sha256(self.url.encode()).hexdigest()[:8]

    @cached_property
    def scheme(self) -> str:
        return self.url.split('://')[0]

    @cached_property
    def path(self) -> str:
        parts = self.url.split('://', 1)
        return '/' + parts[1].split('/', 1)[1] if len(parts) > 1 and '/' in parts[1] else '/'

    @cached_property
    def basename(self) -> str:
        return self.path.split('/')[-1]

    @cached_property
    def extension(self) -> str:
        basename = self.basename
        return basename.split('.')[-1] if '.' in basename else ''

    @cached_property
    def base_url(self) -> str:
        return f'{self.scheme}://{self.domain}'

    @cached_property
    def is_static(self) -> bool:
        static_extensions = {'.pdf', '.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg', '.mp4', '.mp3', '.wav', '.webm'}
        return any(self.url.lower().endswith(ext) for ext in static_extensions)

    @cached_property
    def is_archived(self) -> bool:
        output_paths = (
            self.domain,
            'output.html',
            'output.pdf',
            'screenshot.png',
            'singlefile.html',
            'readability/content.html',
            'mercury/content.html',
            'htmltotext.txt',
            'media',
            'git',
        )
        return any((Path(self.output_dir) / path).exists() for path in output_paths)

    # =========================================================================
    # Date/Time Properties (migrated from Link schema)
    # =========================================================================

    @cached_property
    def bookmarked_date(self) -> Optional[str]:
        max_ts = (timezone.now() + timedelta(days=30)).timestamp()
        if self.timestamp and self.timestamp.replace('.', '').isdigit():
            if 0 < float(self.timestamp) < max_ts:
                return self._ts_to_date_str(datetime.fromtimestamp(float(self.timestamp)))
            return str(self.timestamp)
        return None

    @cached_property
    def downloaded_datestr(self) -> Optional[str]:
        return self._ts_to_date_str(self.downloaded_at) if self.downloaded_at else None

    @cached_property
    def archive_dates(self) -> List[datetime]:
        return [
            result.start_ts
            for result in self.archiveresult_set.all()
            if result.start_ts
        ]

    @cached_property
    def oldest_archive_date(self) -> Optional[datetime]:
        dates = self.archive_dates
        return min(dates) if dates else None

    @cached_property
    def newest_archive_date(self) -> Optional[datetime]:
        dates = self.archive_dates
        return max(dates) if dates else None

    @cached_property
    def num_outputs(self) -> int:
        return self.archiveresult_set.filter(status='succeeded').count()

    @cached_property
    def num_failures(self) -> int:
        return self.archiveresult_set.filter(status='failed').count()

    # =========================================================================
    # Output Path Methods (migrated from Link schema)
    # =========================================================================

    def canonical_outputs(self) -> Dict[str, Optional[str]]:
        """
        Intelligently discover the best output file for each plugin.
        Uses actual ArchiveResult data and filesystem scanning with smart heuristics.
        """
        FAVICON_PROVIDER = 'https://www.google.com/s2/favicons?domain={}'

        # Mimetypes that can be embedded/previewed in an iframe
        IFRAME_EMBEDDABLE_EXTENSIONS = {
            'html', 'htm', 'pdf', 'txt', 'md', 'json', 'jsonl',
            'png', 'jpg', 'jpeg', 'gif', 'webp', 'svg', 'ico',
            'mp4', 'webm', 'mp3', 'opus', 'ogg', 'wav',
        }

        MIN_DISPLAY_SIZE = 15_000  # 15KB - filter out tiny files
        MAX_SCAN_FILES = 50  # Don't scan massive directories

        def find_best_output_in_dir(dir_path: Path, plugin_name: str) -> Optional[str]:
            """Find the best representative file in a plugin's output directory"""
            if not dir_path.exists() or not dir_path.is_dir():
                return None

            candidates = []
            file_count = 0

            # Special handling for media plugin - look for thumbnails
            is_media_dir = plugin_name == 'media'

            # Scan for suitable files
            for file_path in dir_path.rglob('*'):
                file_count += 1
                if file_count > MAX_SCAN_FILES:
                    break

                if file_path.is_dir() or file_path.name.startswith('.'):
                    continue

                ext = file_path.suffix.lstrip('.').lower()
                if ext not in IFRAME_EMBEDDABLE_EXTENSIONS:
                    continue

                try:
                    size = file_path.stat().st_size
                except OSError:
                    continue

                # For media dir, allow smaller image files (thumbnails are often < 15KB)
                min_size = 5_000 if (is_media_dir and ext in ('png', 'jpg', 'jpeg', 'webp', 'gif')) else MIN_DISPLAY_SIZE
                if size < min_size:
                    continue

                # Prefer main files: index.html, output.*, content.*, etc.
                priority = 0
                name_lower = file_path.name.lower()

                if is_media_dir:
                    # Special prioritization for media directories
                    if any(keyword in name_lower for keyword in ('thumb', 'thumbnail', 'cover', 'poster')):
                        priority = 200  # Highest priority for thumbnails
                    elif ext in ('png', 'jpg', 'jpeg', 'webp', 'gif'):
                        priority = 150  # High priority for any image
                    elif ext in ('mp4', 'webm', 'mp3', 'opus', 'ogg'):
                        priority = 100  # Lower priority for actual media files
                    else:
                        priority = 50
                elif 'index' in name_lower:
                    priority = 100
                elif name_lower.startswith(('output', 'content', plugin_name)):
                    priority = 50
                elif ext in ('html', 'htm', 'pdf'):
                    priority = 30
                elif ext in ('png', 'jpg', 'jpeg', 'webp'):
                    priority = 20
                else:
                    priority = 10

                candidates.append((priority, size, file_path))

            if not candidates:
                return None

            # Sort by priority (desc), then size (desc)
            candidates.sort(key=lambda x: (x[0], x[1]), reverse=True)
            best_file = candidates[0][2]
            return str(best_file.relative_to(Path(self.output_dir)))

        canonical = {
            'index_path': 'index.html',
            'google_favicon_path': FAVICON_PROVIDER.format(self.domain),
            'archive_org_path': f'https://web.archive.org/web/{self.base_url}',
        }

        # Scan each ArchiveResult's output directory for the best file
        snap_dir = Path(self.output_dir)
        for result in self.archiveresult_set.filter(status='succeeded'):
            if not result.output_files and not result.output_str:
                continue

            # Try to find the best output file for this plugin
            plugin_dir = snap_dir / result.plugin
            best_output = None

            # Check output_files first (new field)
            if result.output_files:
                first_file = next(iter(result.output_files.keys()), None)
                if first_file and (plugin_dir / first_file).exists():
                    best_output = f'{result.plugin}/{first_file}'

            # Fallback to output_str if it looks like a path
            if not best_output and result.output_str and (snap_dir / result.output_str).exists():
                best_output = result.output_str

            if not best_output and plugin_dir.exists():
                # Intelligently find the best file in the plugin's directory
                best_output = find_best_output_in_dir(plugin_dir, result.plugin)

            if best_output:
                canonical[f'{result.plugin}_path'] = best_output

        # Also scan top-level for legacy outputs (backwards compatibility)
        for file_path in snap_dir.glob('*'):
            if file_path.is_dir() or file_path.name in ('index.html', 'index.json'):
                continue

            ext = file_path.suffix.lstrip('.').lower()
            if ext not in IFRAME_EMBEDDABLE_EXTENSIONS:
                continue

            try:
                size = file_path.stat().st_size
                if size >= MIN_DISPLAY_SIZE:
                    # Add as generic output with stem as key
                    key = f'{file_path.stem}_path'
                    if key not in canonical:
                        canonical[key] = file_path.name
            except OSError:
                continue

        if self.is_static:
            static_path = f'warc/{self.timestamp}'
            canonical.update({
                'title': self.basename,
                'wget_path': static_path,
            })

        return canonical

    def latest_outputs(self, status: Optional[str] = None) -> Dict[str, Any]:
        """Get the latest output that each plugin produced"""
        from archivebox.hooks import get_plugins
        from django.db.models import Q

        latest: Dict[str, Any] = {}
        for plugin in get_plugins():
            results = self.archiveresult_set.filter(plugin=plugin)
            if status is not None:
                results = results.filter(status=status)
            # Filter for results with output_files or output_str
            results = results.filter(Q(output_files__isnull=False) | ~Q(output_str='')).order_by('-start_ts')
            result = results.first()
            # Return embed_path() for backwards compatibility
            latest[plugin] = result.embed_path() if result else None
        return latest

    # =========================================================================
    # Serialization Methods
    # =========================================================================

    def to_dict(self, extended: bool = False) -> Dict[str, Any]:
        """Convert Snapshot to a dictionary (replacement for Link._asdict())"""
        from archivebox.misc.util import ts_to_date_str

        result = {
            'TYPE': 'core.models.Snapshot',
            'id': str(self.id),
            'url': self.url,
            'timestamp': self.timestamp,
            'title': self.title,
            'tags': self.tags_str(),
            'downloaded_at': self.downloaded_at.isoformat() if self.downloaded_at else None,
            'bookmarked_at': self.bookmarked_at.isoformat() if self.bookmarked_at else None,
            'created_at': self.created_at.isoformat() if self.created_at else None,
            # Computed properties
            'domain': self.domain,
            'scheme': self.scheme,
            'base_url': self.base_url,
            'path': self.path,
            'basename': self.basename,
            'extension': self.extension,
            'is_static': self.is_static,
            'is_archived': self.is_archived,
            'archive_path': self.archive_path,
            'output_dir': self.output_dir,
            'link_dir': self.output_dir,  # backwards compatibility alias
            'archive_size': self.archive_size,
            'bookmarked_date': self.bookmarked_date,
            'downloaded_datestr': self.downloaded_datestr,
            'num_outputs': self.num_outputs,
            'num_failures': self.num_failures,
        }
        if extended:
            result['canonical'] = self.canonical_outputs()
        return result

    def to_json(self, indent: int = 4) -> str:
        """Convert to JSON string"""
        return to_json(self.to_dict(extended=True), indent=indent)

    def to_csv(self, cols: Optional[List[str]] = None, separator: str = ',', ljust: int = 0) -> str:
        """Convert to CSV string"""
        data = self.to_dict()
        cols = cols or ['timestamp', 'is_archived', 'url']
        return separator.join(to_json(data.get(col, ''), indent=None).ljust(ljust) for col in cols)

    def write_json_details(self, out_dir: Optional[str] = None) -> None:
        """Write JSON index file for this snapshot to its output directory"""
        out_dir = out_dir or self.output_dir
        path = Path(out_dir) / CONSTANTS.JSON_INDEX_FILENAME
        atomic_write(str(path), self.to_dict(extended=True))

    def write_html_details(self, out_dir: Optional[str] = None) -> None:
        """Write HTML detail page for this snapshot to its output directory"""
        from django.template.loader import render_to_string
        from archivebox.config.common import SERVER_CONFIG
        from archivebox.config.configset import get_config
        from archivebox.misc.logging_util import printable_filesize

        out_dir = out_dir or self.output_dir
        config = get_config()
        SAVE_ARCHIVE_DOT_ORG = config.get('SAVE_ARCHIVE_DOT_ORG', True)
        TITLE_LOADING_MSG = 'Not yet archived...'

        canonical = self.canonical_outputs()
        context = {
            **self.to_dict(extended=True),
            **{f'{k}_path': v for k, v in canonical.items()},
            'canonical': {f'{k}_path': v for k, v in canonical.items()},
            'title': htmlencode(self.title or (self.base_url if self.is_archived else TITLE_LOADING_MSG)),
            'url_str': htmlencode(urldecode(self.base_url)),
            'archive_url': urlencode(f'warc/{self.timestamp}' or (self.domain if self.is_archived else '')) or 'about:blank',
            'extension': self.extension or 'html',
            'tags': self.tags_str() or 'untagged',
            'size': printable_filesize(self.archive_size) if self.archive_size else 'pending',
            'status': 'archived' if self.is_archived else 'not yet archived',
            'status_color': 'success' if self.is_archived else 'danger',
            'oldest_archive_date': ts_to_date_str(self.oldest_archive_date),
            'SAVE_ARCHIVE_DOT_ORG': SAVE_ARCHIVE_DOT_ORG,
            'PREVIEW_ORIGINALS': SERVER_CONFIG.PREVIEW_ORIGINALS,
        }
        rendered_html = render_to_string('snapshot.html', context)
        atomic_write(str(Path(out_dir) / CONSTANTS.HTML_INDEX_FILENAME), rendered_html)

    # =========================================================================
    # Helper Methods
    # =========================================================================

    @staticmethod
    def _ts_to_date_str(dt: Optional[datetime]) -> Optional[str]:
        return dt.strftime('%Y-%m-%d %H:%M:%S') if dt else None


class ArchiveResultManager(models.Manager):
    def indexable(self, sorted: bool = True):
        INDEXABLE_METHODS = [r[0] for r in EXTRACTOR_INDEXING_PRECEDENCE]
        qs = self.get_queryset().filter(plugin__in=INDEXABLE_METHODS, status='succeeded')
        if sorted:
            precedence = [When(plugin=method, then=Value(p)) for method, p in EXTRACTOR_INDEXING_PRECEDENCE]
            qs = qs.annotate(indexing_precedence=Case(*precedence, default=Value(1000), output_field=IntegerField())).order_by('indexing_precedence')
        return qs


class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine):
    class StatusChoices(models.TextChoices):
        QUEUED = 'queued', 'Queued'
        STARTED = 'started', 'Started'
        BACKOFF = 'backoff', 'Waiting to retry'
        SUCCEEDED = 'succeeded', 'Succeeded'
        FAILED = 'failed', 'Failed'
        SKIPPED = 'skipped', 'Skipped'

    @classmethod
    def get_plugin_choices(cls):
        """Get plugin choices from discovered hooks (for forms/admin)."""
        plugins = [get_plugin_name(e) for e in get_plugins()]
        return tuple((e, e) for e in plugins)

    # Keep AutoField for backward compatibility with 0.7.x databases
    # UUID field is added separately by migration for new records
    id = models.AutoField(primary_key=True, editable=False)
    # Note: unique constraint is added by migration 0027 - don't set unique=True here
    # or SQLite table recreation in earlier migrations will fail
    uuid = models.UUIDField(default=uuid7, null=True, blank=True, db_index=True)
    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False, related_name='archiveresult_set', db_index=True)
    created_at = models.DateTimeField(default=timezone.now, db_index=True)
    modified_at = models.DateTimeField(auto_now=True)

    snapshot: Snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE)  # type: ignore
    # No choices= constraint - plugin names come from plugin system and can be any string
    plugin = models.CharField(max_length=32, blank=False, null=False, db_index=True)
    hook_name = models.CharField(max_length=255, blank=True, default='', db_index=True, help_text='Full filename of the hook that executed (e.g., on_Snapshot__50_wget.py)')
    pwd = models.CharField(max_length=256, default=None, null=True, blank=True)
    cmd = models.JSONField(default=None, null=True, blank=True)
    cmd_version = models.CharField(max_length=128, default=None, null=True, blank=True)

    # New output fields (replacing old 'output' field)
    output_str = models.TextField(blank=True, default='', help_text='Human-readable output summary')
    output_json = models.JSONField(null=True, blank=True, default=None, help_text='Structured metadata (headers, redirects, etc.)')
    output_files = models.JSONField(default=dict, help_text='Dict of {relative_path: {metadata}}')
    output_size = models.BigIntegerField(default=0, help_text='Total bytes of all output files')
    output_mimetypes = models.CharField(max_length=512, blank=True, default='', help_text='CSV of mimetypes sorted by size')

    # Binary FK (optional - set when hook reports cmd)
    binary = models.ForeignKey(
        'machine.Binary',
        on_delete=models.SET_NULL,
        null=True, blank=True,
        related_name='archiveresults',
        help_text='Primary binary used by this hook'
    )

    start_ts = models.DateTimeField(default=None, null=True, blank=True)
    end_ts = models.DateTimeField(default=None, null=True, blank=True)

    status = ModelWithStateMachine.StatusField(choices=StatusChoices.choices, default=StatusChoices.QUEUED)
    retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
    notes = models.TextField(blank=True, null=False, default='')
    output_dir = models.CharField(max_length=256, default=None, null=True, blank=True)
    iface = models.ForeignKey(NetworkInterface, on_delete=models.SET_NULL, null=True, blank=True)

    state_machine_name = 'core.statemachines.ArchiveResultMachine'
    retry_at_field_name = 'retry_at'
    state_field_name = 'status'
    active_state = StatusChoices.STARTED

    objects = ArchiveResultManager()

    class Meta(TypedModelMeta):
        verbose_name = 'Archive Result'
        verbose_name_plural = 'Archive Results Log'

    def __str__(self):
        return f'[{self.id}] {self.snapshot.url[:64]} -> {self.plugin}'

    def save(self, *args, **kwargs):
        is_new = self._state.adding
        # Skip ModelWithOutputDir.save() to avoid creating index.json in plugin directories
        # Call the Django Model.save() directly instead
        models.Model.save(self, *args, **kwargs)

        if is_new:
            from archivebox.misc.logging_util import log_worker_event
            log_worker_event(
                worker_type='DB',
                event='Created ArchiveResult',
                indent_level=3,
                plugin=self.plugin,
                metadata={
                    'id': str(self.id),
                    'snapshot_id': str(self.snapshot_id),
                    'snapshot_url': str(self.snapshot.url)[:64],
                    'status': self.status,
                },
            )

    @cached_property
    def snapshot_dir(self):
        return Path(self.snapshot.output_dir)

    @cached_property
    def url(self):
        return self.snapshot.url

    @property
    def api_url(self) -> str:
        return reverse_lazy('api-1:get_archiveresult', args=[self.id])

    def get_absolute_url(self):
        return f'/{self.snapshot.archive_path}/{self.plugin}'

    @property
    def plugin_module(self) -> Any | None:
        # Hook scripts are now used instead of Python plugin modules
        # The plugin name maps to hooks in archivebox/plugins/{plugin}/
        return None

    def output_exists(self) -> bool:
        return os.path.exists(Path(self.snapshot_dir) / self.plugin)

    def embed_path(self) -> Optional[str]:
        """
        Get the relative path to the embeddable output file for this result.

        Returns the first file from output_files if set, otherwise tries to
        find a reasonable default based on the plugin type.
        """
        # Check output_files dict for primary output
        if self.output_files:
            # Return first file from output_files (dict preserves insertion order)
            first_file = next(iter(self.output_files.keys()), None)
            if first_file:
                return f'{self.plugin}/{first_file}'

        # Fallback: check output_str if it looks like a file path
        if self.output_str and ('/' in self.output_str or '.' in self.output_str):
            return self.output_str

        # Try to find output file based on plugin's canonical output path
        canonical = self.snapshot.canonical_outputs()
        plugin_key = f'{self.plugin}_path'
        if plugin_key in canonical:
            return canonical[plugin_key]

        # Fallback to plugin directory
        return f'{self.plugin}/'

    def create_output_dir(self):
        output_dir = Path(self.snapshot_dir) / self.plugin
        output_dir.mkdir(parents=True, exist_ok=True)
        return output_dir

    @property
    def output_dir_name(self) -> str:
        return self.plugin

    @property
    def output_dir_parent(self) -> str:
        return str(self.snapshot.OUTPUT_DIR.relative_to(CONSTANTS.DATA_DIR))

    def save_search_index(self):
        pass

    def run(self):
        """
        Execute this ArchiveResult's hook and update status.

        If self.hook_name is set, runs only that specific hook.
        If self.hook_name is empty, discovers and runs all hooks for self.plugin (backwards compat).

        Updates status/output fields, queues discovered URLs, and triggers indexing.
        """
        from django.utils import timezone
        from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR, run_hook, is_background_hook

        config_objects = [self.snapshot.crawl, self.snapshot] if self.snapshot.crawl else [self.snapshot]

        # Determine which hook(s) to run
        hooks = []

        if self.hook_name:
            # SPECIFIC HOOK MODE: Find the specific hook by name
            for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
                if not base_dir.exists():
                    continue
                plugin_dir = base_dir / self.plugin
                if plugin_dir.exists():
                    hook_path = plugin_dir / self.hook_name
                    if hook_path.exists():
                        hooks.append(hook_path)
                        break
        else:
            # LEGACY MODE: Discover all hooks for this plugin (backwards compatibility)
            for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
                if not base_dir.exists():
                    continue
                plugin_dir = base_dir / self.plugin
                if plugin_dir.exists():
                    matches = list(plugin_dir.glob('on_Snapshot__*.*'))
                    if matches:
                        hooks.extend(sorted(matches))

        if not hooks:
            self.status = self.StatusChoices.FAILED
            if self.hook_name:
                self.output_str = f'Hook not found: {self.plugin}/{self.hook_name}'
            else:
                self.output_str = f'No hooks found for plugin: {self.plugin}'
            self.retry_at = None
            self.save()
            return

        # Output directory is plugin_dir for the hook output
        plugin_dir = Path(self.snapshot.output_dir) / self.plugin

        start_ts = timezone.now()
        is_bg_hook = False

        for hook in hooks:
            # Check if this is a background hook
            is_bg_hook = is_background_hook(hook.name)

            result = run_hook(
                hook,
                output_dir=plugin_dir,
                config_objects=config_objects,
                url=self.snapshot.url,
                snapshot_id=str(self.snapshot.id),
                crawl_id=str(self.snapshot.crawl.id) if self.snapshot.crawl else None,
                depth=self.snapshot.depth,
            )

            # Background hooks return None
            if result is None:
                is_bg_hook = True

        # Update status based on hook execution
        if is_bg_hook:
            # BACKGROUND HOOK - still running, return immediately
            # Status stays STARTED, will be finalized by Snapshot.cleanup()
            self.status = self.StatusChoices.STARTED
            self.start_ts = start_ts
            self.pwd = str(plugin_dir)
            self.save()
            return

        # FOREGROUND HOOK - completed, update from filesystem
        self.start_ts = start_ts
        self.pwd = str(plugin_dir)
        self.update_from_output()

        # Clean up empty output directory if no files were created
        if plugin_dir.exists() and not self.output_files:
            try:
                if not any(plugin_dir.iterdir()):
                    plugin_dir.rmdir()
            except (OSError, RuntimeError):
                pass

    def update_from_output(self):
        """
        Update this ArchiveResult from filesystem logs and output files.

        Used for:
        - Foreground hooks that completed (called from ArchiveResult.run())
        - Background hooks that completed (called from Snapshot.cleanup())

        Updates:
        - status, output_str, output_json from ArchiveResult JSONL record
        - output_files, output_size, output_mimetypes by walking filesystem
        - end_ts, retry_at, cmd, cmd_version, binary FK
        - Processes side-effect records (Snapshot, Tag, etc.) via process_hook_records()
        """
        import json
        import mimetypes
        from collections import defaultdict
        from pathlib import Path
        from django.utils import timezone
        from archivebox.hooks import process_hook_records

        plugin_dir = Path(self.pwd) if self.pwd else None
        if not plugin_dir or not plugin_dir.exists():
            self.status = self.StatusChoices.FAILED
            self.output_str = 'Output directory not found'
            self.end_ts = timezone.now()
            self.retry_at = None
            self.save()
            return

        # Read and parse JSONL output from stdout.log
        stdout_file = plugin_dir / 'stdout.log'
        stdout = stdout_file.read_text() if stdout_file.exists() else ''

        records = []
        for line in stdout.splitlines():
            if line.strip() and line.strip().startswith('{'):
                try:
                    records.append(json.loads(line))
                except json.JSONDecodeError:
                    continue

        # Find ArchiveResult record and update status/output from it
        ar_records = [r for r in records if r.get('type') == 'ArchiveResult']
        if ar_records:
            hook_data = ar_records[0]

            # Update status
            status_map = {
                'succeeded': self.StatusChoices.SUCCEEDED,
                'failed': self.StatusChoices.FAILED,
                'skipped': self.StatusChoices.SKIPPED,
            }
            self.status = status_map.get(hook_data.get('status', 'failed'), self.StatusChoices.FAILED)

            # Update output fields
            self.output_str = hook_data.get('output_str') or hook_data.get('output') or ''
            self.output_json = hook_data.get('output_json')

            # Update cmd fields
            if hook_data.get('cmd'):
                self.cmd = hook_data['cmd']
                self._set_binary_from_cmd(hook_data['cmd'])
            if hook_data.get('cmd_version'):
                self.cmd_version = hook_data['cmd_version'][:128]
        else:
            # No ArchiveResult record = failed
            self.status = self.StatusChoices.FAILED
            self.output_str = 'Hook did not output ArchiveResult record'

        # Walk filesystem and populate output_files, output_size, output_mimetypes
        exclude_names = {'stdout.log', 'stderr.log', 'hook.pid', 'listener.pid'}
        mime_sizes = defaultdict(int)
        total_size = 0
        output_files = {}

        for file_path in plugin_dir.rglob('*'):
            if not file_path.is_file():
                continue
            if file_path.name in exclude_names:
                continue

            try:
                stat = file_path.stat()
                mime_type, _ = mimetypes.guess_type(str(file_path))
                mime_type = mime_type or 'application/octet-stream'

                relative_path = str(file_path.relative_to(plugin_dir))
                output_files[relative_path] = {}
                mime_sizes[mime_type] += stat.st_size
                total_size += stat.st_size
            except (OSError, IOError):
                continue

        self.output_files = output_files
        self.output_size = total_size
        sorted_mimes = sorted(mime_sizes.items(), key=lambda x: x[1], reverse=True)
        self.output_mimetypes = ','.join(mime for mime, _ in sorted_mimes)

        # Update timestamps
        self.end_ts = timezone.now()
        self.retry_at = None

        self.save()

        # Process side-effect records (filter Snapshots for depth/URL)
        filtered_records = []
        for record in records:
            record_type = record.get('type')

            # Skip ArchiveResult records (already processed above)
            if record_type == 'ArchiveResult':
                continue

            # Filter Snapshot records for depth/URL constraints
            if record_type == 'Snapshot':
                if not self.snapshot.crawl:
                    continue

                url = record.get('url')
                if not url:
                    continue

                depth = record.get('depth', self.snapshot.depth + 1)
                if depth > self.snapshot.crawl.max_depth:
                    continue

                if not self._url_passes_filters(url):
                    continue

            filtered_records.append(record)

        # Process filtered records with unified dispatcher
        overrides = {
            'snapshot': self.snapshot,
            'crawl': self.snapshot.crawl,
            'created_by_id': self.snapshot.created_by_id,
        }
        process_hook_records(filtered_records, overrides=overrides)

        # Update snapshot title if this is the title plugin
        plugin_name = get_plugin_name(self.plugin)
        if self.status == self.StatusChoices.SUCCEEDED and plugin_name == 'title':
            self._update_snapshot_title(plugin_dir)

        # Trigger search indexing if succeeded
        if self.status == self.StatusChoices.SUCCEEDED:
            self.trigger_search_indexing()

        # Cleanup PID files and empty logs
        pid_file = plugin_dir / 'hook.pid'
        pid_file.unlink(missing_ok=True)
        stderr_file = plugin_dir / 'stderr.log'
        if stdout_file.exists() and stdout_file.stat().st_size == 0:
            stdout_file.unlink()
        if stderr_file.exists() and stderr_file.stat().st_size == 0:
            stderr_file.unlink()

    def _set_binary_from_cmd(self, cmd: list) -> None:
        """
        Find Binary for command and set binary FK.

        Tries matching by absolute path first, then by binary name.
        Only matches binaries on the current machine.
        """
        if not cmd:
            return

        from machine.models import Machine

        bin_path_or_name = cmd[0] if isinstance(cmd, list) else cmd
        machine = Machine.current()

        # Try matching by absolute path first
        binary = Binary.objects.filter(
            abspath=bin_path_or_name,
            machine=machine
        ).first()

        if binary:
            self.binary = binary
            return

        # Fallback: match by binary name
        bin_name = Path(bin_path_or_name).name
        binary = Binary.objects.filter(
            name=bin_name,
            machine=machine
        ).first()

        if binary:
            self.binary = binary

    def _update_snapshot_title(self, plugin_dir: Path):
        """
        Update snapshot title from title plugin output.

        The title plugin writes title.txt with the extracted page title.
        This updates the Snapshot.title field if the file exists and has content.
        """
        title_file = plugin_dir / 'title.txt'
        if title_file.exists():
            try:
                title = title_file.read_text(encoding='utf-8').strip()
                if title and (not self.snapshot.title or len(title) > len(self.snapshot.title)):
                    self.snapshot.title = title[:512]  # Max length from model
                    self.snapshot.save(update_fields=['title', 'modified_at'])
            except Exception:
                pass  # Failed to read title, that's okay

    def _url_passes_filters(self, url: str) -> bool:
        """Check if URL passes URL_ALLOWLIST and URL_DENYLIST config filters.

        Uses proper config hierarchy: defaults -> file -> env -> machine -> user -> crawl -> snapshot
        """
        import re
        from archivebox.config.configset import get_config

        # Get merged config with proper hierarchy
        config = get_config(
            user=self.snapshot.created_by if self.snapshot else None,
            crawl=self.snapshot.crawl if self.snapshot else None,
            snapshot=self.snapshot,
        )

        # Get allowlist/denylist (can be string or list)
        allowlist_raw = config.get('URL_ALLOWLIST', '')
        denylist_raw = config.get('URL_DENYLIST', '')

        # Normalize to list of patterns
        def to_pattern_list(value):
            if isinstance(value, list):
                return value
            if isinstance(value, str):
                return [p.strip() for p in value.split(',') if p.strip()]
            return []

        allowlist = to_pattern_list(allowlist_raw)
        denylist = to_pattern_list(denylist_raw)

        # Denylist takes precedence
        if denylist:
            for pattern in denylist:
                try:
                    if re.search(pattern, url):
                        return False
                except re.error:
                    continue  # Skip invalid regex patterns

        # If allowlist exists, URL must match at least one pattern
        if allowlist:
            for pattern in allowlist:
                try:
                    if re.search(pattern, url):
                        return True
                except re.error:
                    continue  # Skip invalid regex patterns
            return False  # No allowlist patterns matched

        return True  # No filters or passed filters

    def trigger_search_indexing(self):
        """Run any ArchiveResult__index hooks to update search indexes."""
        from archivebox.hooks import discover_hooks, run_hook

        # Pass config objects in priority order (later overrides earlier)
        config_objects = [self.snapshot.crawl, self.snapshot] if self.snapshot.crawl else [self.snapshot]

        for hook in discover_hooks('ArchiveResult__index'):
            run_hook(
                hook,
                output_dir=self.output_dir,
                config_objects=config_objects,
                url=self.snapshot.url,
                snapshot_id=str(self.snapshot.id),
                plugin=self.plugin,
            )

    @property
    def output_dir(self) -> Path:
        """Get the output directory for this plugin's results."""
        return Path(self.snapshot.output_dir) / self.plugin

    def is_background_hook(self) -> bool:
        """Check if this ArchiveResult is for a background hook."""
        plugin_dir = Path(self.pwd) if self.pwd else None
        if not plugin_dir:
            return False
        pid_file = plugin_dir / 'hook.pid'
        return pid_file.exists()