wip major changes
@@ -1,7 +1,8 @@
__package__ = 'archivebox.core'

from typing import Optional, Dict, Iterable, Any
from typing import Optional, Dict, Iterable, Any, List, TYPE_CHECKING
from uuid import uuid7
from datetime import datetime, timedelta
from django_stubs_ext.db.models import TypedModelMeta

import os
@@ -18,15 +19,11 @@ from django.urls import reverse, reverse_lazy
from django.contrib import admin
from django.conf import settings

import abx

from archivebox.config import CONSTANTS
from archivebox.misc.system import get_dir_size
from archivebox.misc.util import parse_date, base_url, domain as url_domain
from archivebox.misc.system import get_dir_size, atomic_write
from archivebox.misc.util import parse_date, base_url, domain as url_domain, to_json, ts_to_date_str, urlencode, htmlencode, urldecode
from archivebox.misc.hashing import get_dir_info
from archivebox.index.schema import Link
from archivebox.index.html import snapshot_icons
from archivebox.extractors import ARCHIVE_METHODS_INDEXING_PRECEDENCE
from archivebox.hooks import ARCHIVE_METHODS_INDEXING_PRECEDENCE
from archivebox.base_models.models import (
    ModelWithUUID, ModelWithSerializers, ModelWithOutputDir,
    ModelWithConfig, ModelWithNotes, ModelWithHealthStats,
@@ -38,6 +35,7 @@ from crawls.models import Crawl
from machine.models import NetworkInterface


class Tag(ModelWithSerializers):
    id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False, related_name='tag_set')
@@ -94,8 +92,181 @@ class SnapshotManager(models.Manager):
    def get_queryset(self):
        return super().get_queryset().prefetch_related('tags', 'archiveresult_set')

    # =========================================================================
    # Filtering Methods
    # =========================================================================

class Snapshot(ModelWithSerializers, ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine):
    FILTER_TYPES = {
        'exact': lambda pattern: models.Q(url=pattern),
        'substring': lambda pattern: models.Q(url__icontains=pattern),
        'regex': lambda pattern: models.Q(url__iregex=pattern),
        'domain': lambda pattern: models.Q(url__istartswith=f"http://{pattern}") | models.Q(url__istartswith=f"https://{pattern}") | models.Q(url__istartswith=f"ftp://{pattern}"),
        'tag': lambda pattern: models.Q(tags__name=pattern),
        'timestamp': lambda pattern: models.Q(timestamp=pattern),
    }

    def filter_by_patterns(self, patterns: List[str], filter_type: str = 'exact') -> QuerySet:
        """Filter snapshots by URL patterns using specified filter type"""
        from archivebox.misc.logging import stderr

        q_filter = models.Q()
        for pattern in patterns:
            try:
                q_filter = q_filter | self.FILTER_TYPES[filter_type](pattern)
            except KeyError:
                stderr()
                stderr(f'[X] Got invalid pattern for --filter-type={filter_type}:', color='red')
                stderr(f' {pattern}')
                raise SystemExit(2)
        return self.filter(q_filter)
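
    # Illustrative usage sketch (added for clarity; not part of this commit). Assumes
    # Snapshot.objects is this SnapshotManager and some Snapshot rows already exist:
    #
    #   Snapshot.objects.filter_by_patterns(['https://example.com/page'], filter_type='exact')
    #   Snapshot.objects.filter_by_patterns(['example.com'], filter_type='domain')   # matches http/https/ftp prefixes
    #   Snapshot.objects.filter_by_patterns(['news'], filter_type='tag')             # matches tags__name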

    def search(self, patterns: List[str]) -> QuerySet:
        """Search snapshots using the configured search backend"""
        from archivebox.config.common import SEARCH_BACKEND_CONFIG
        from archivebox.search import query_search_index
        from archivebox.misc.logging import stderr

        if not SEARCH_BACKEND_CONFIG.USE_SEARCHING_BACKEND:
            stderr()
            stderr('[X] The search backend is not enabled, set config.USE_SEARCHING_BACKEND = True', color='red')
            raise SystemExit(2)

        qsearch = self.none()
        for pattern in patterns:
            try:
                qsearch |= query_search_index(pattern)
            except Exception:
                raise SystemExit(2)
        return self.all() & qsearch

    # =========================================================================
    # Export Methods
    # =========================================================================

    def to_json(self, with_headers: bool = False) -> str:
        """Generate JSON index from snapshots"""
        import sys
        from datetime import datetime, timezone as tz
        from archivebox.config import VERSION
        from archivebox.config.common import SERVER_CONFIG

        MAIN_INDEX_HEADER = {
            'info': 'This is an index of site data archived by ArchiveBox: The self-hosted web archive.',
            'schema': 'archivebox.index.json',
            'copyright_info': SERVER_CONFIG.FOOTER_INFO,
            'meta': {
                'project': 'ArchiveBox',
                'version': VERSION,
                'git_sha': VERSION,
                'website': 'https://ArchiveBox.io',
                'docs': 'https://github.com/ArchiveBox/ArchiveBox/wiki',
                'source': 'https://github.com/ArchiveBox/ArchiveBox',
                'issues': 'https://github.com/ArchiveBox/ArchiveBox/issues',
                'dependencies': {},
            },
        } if with_headers else {}

        snapshot_dicts = [s.to_dict(extended=True) for s in self.iterator(chunk_size=500)]

        if with_headers:
            output = {
                **MAIN_INDEX_HEADER,
                'num_links': len(snapshot_dicts),
                'updated': datetime.now(tz.utc),
                'last_run_cmd': sys.argv,
                'links': snapshot_dicts,
            }
        else:
            output = snapshot_dicts
        return to_json(output, indent=4, sort_keys=True)

    def to_csv(self, cols: Optional[List[str]] = None, header: bool = True, separator: str = ',', ljust: int = 0) -> str:
        """Generate CSV output from snapshots"""
        cols = cols or ['timestamp', 'is_archived', 'url']
        header_str = separator.join(col.ljust(ljust) for col in cols) if header else ''
        row_strs = (s.to_csv(cols=cols, ljust=ljust, separator=separator) for s in self.iterator(chunk_size=500))
        return '\n'.join((header_str, *row_strs))

    def to_html(self, with_headers: bool = True) -> str:
        """Generate main index HTML from snapshots"""
        from datetime import datetime, timezone as tz
        from django.template.loader import render_to_string
        from archivebox.config import VERSION
        from archivebox.config.common import SERVER_CONFIG
        from archivebox.config.version import get_COMMIT_HASH

        template = 'static_index.html' if with_headers else 'minimal_index.html'
        snapshot_list = list(self.iterator(chunk_size=500))

        return render_to_string(template, {
            'version': VERSION,
            'git_sha': get_COMMIT_HASH() or VERSION,
            'num_links': str(len(snapshot_list)),
            'date_updated': datetime.now(tz.utc).strftime('%Y-%m-%d'),
            'time_updated': datetime.now(tz.utc).strftime('%Y-%m-%d %H:%M'),
            'links': snapshot_list,
            'FOOTER_INFO': SERVER_CONFIG.FOOTER_INFO,
        })
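
    # Illustrative sketch (added for clarity; not part of this commit): writing top-level
    # index files from these export helpers. Assumes Snapshot.objects is this manager;
    # the index.* filenames are example names, not constants defined in this module.
    #
    #   atomic_write('./index.json', Snapshot.objects.to_json(with_headers=True))
    #   atomic_write('./index.html', Snapshot.objects.to_html(with_headers=True))
    #   atomic_write('./index.csv',  Snapshot.objects.to_csv(cols=['timestamp', 'is_archived', 'url']))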

    # =========================================================================
    # Import Methods
    # =========================================================================

    def create_or_update_from_dict(self, link_dict: Dict[str, Any], created_by_id: Optional[int] = None) -> 'Snapshot':
        """Create or update a Snapshot from a SnapshotDict (parser output)"""
        import re
        from archivebox.config.common import GENERAL_CONFIG

        url = link_dict['url']
        timestamp = link_dict.get('timestamp')
        title = link_dict.get('title')
        tags_str = link_dict.get('tags')

        tag_list = []
        if tags_str:
            tag_list = list(dict.fromkeys(
                tag.strip() for tag in re.split(GENERAL_CONFIG.TAG_SEPARATOR_PATTERN, tags_str)
                if tag.strip()
            ))

        try:
            snapshot = self.get(url=url)
            if title and (not snapshot.title or len(title) > len(snapshot.title or '')):
                snapshot.title = title
                snapshot.save(update_fields=['title', 'modified_at'])
        except self.model.DoesNotExist:
            if timestamp:
                while self.filter(timestamp=timestamp).exists():
                    timestamp = str(float(timestamp) + 1.0)

            snapshot = self.create(
                url=url,
                timestamp=timestamp,
                title=title,
                created_by_id=created_by_id or get_or_create_system_user_pk(),
            )

        if tag_list:
            existing_tags = set(snapshot.tags.values_list('name', flat=True))
            new_tags = set(tag_list) | existing_tags
            snapshot.save_tags(new_tags)

        return snapshot

    def create_from_dicts(self, link_dicts: List[Dict[str, Any]], created_by_id: Optional[int] = None) -> List['Snapshot']:
        """Create or update multiple Snapshots from a list of SnapshotDicts"""
        return [self.create_or_update_from_dict(d, created_by_id=created_by_id) for d in link_dicts]
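
    # Illustrative sketch (added for clarity; not part of this commit): the dict shape these
    # import helpers expect from parser output. Only 'url' is required; 'timestamp', 'title',
    # and 'tags' are optional, and 'tags' is split on GENERAL_CONFIG.TAG_SEPARATOR_PATTERN.
    # All values below are hypothetical:
    #
    #   example_link_dict = {
    #       'url': 'https://example.com/article',
    #       'timestamp': '1712000000.0',
    #       'title': 'Example Article',
    #       'tags': 'news,example',
    #   }
    #   snapshot = Snapshot.objects.create_or_update_from_dict(example_link_dict)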

    def remove(self, atomic: bool = False) -> tuple:
        """Remove snapshots from the database"""
        from django.db import transaction
        if atomic:
            with transaction.atomic():
                return self.delete()
        return self.delete()


class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine):
    id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False, related_name='snapshot_set', db_index=True)
    created_at = models.DateTimeField(default=timezone.now, db_index=True)
@@ -108,6 +279,7 @@ class Snapshot(ModelWithSerializers, ModelWithOutputDir, ModelWithConfig, ModelW

    title = models.CharField(max_length=512, null=True, blank=True, db_index=True)
    downloaded_at = models.DateTimeField(default=None, null=True, editable=False, db_index=True, blank=True)
    depth = models.PositiveSmallIntegerField(default=0, db_index=True)  # 0 for root snapshot, 1+ for discovered URLs

    retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
    status = ModelWithStateMachine.StatusField(choices=ModelWithStateMachine.StatusChoices, default=ModelWithStateMachine.StatusChoices.QUEUED)
@@ -152,9 +324,6 @@ class Snapshot(ModelWithSerializers, ModelWithOutputDir, ModelWithConfig, ModelW
    def archive(self, overwrite=False, methods=None):
        return bg_archive_snapshot(self, overwrite=overwrite, methods=methods)

    def as_link(self) -> Link:
        return Link.from_json(self.as_json())

    @admin.display(description='Tags')
    def tags_str(self, nocache=True) -> str | None:
        calc_tags_str = lambda: ','.join(sorted(tag.name for tag in self.tags.all()))
@@ -164,7 +333,55 @@ class Snapshot(ModelWithSerializers, ModelWithOutputDir, ModelWithConfig, ModelW
        return cache.get_or_set(cache_key, calc_tags_str) if not nocache else calc_tags_str()

    def icons(self) -> str:
        return snapshot_icons(self)
        """Generate HTML icons showing which extractors have succeeded for this snapshot"""
        from django.utils.html import format_html, mark_safe
        from collections import defaultdict

        cache_key = f'result_icons:{self.pk}:{(self.downloaded_at or self.modified_at or self.created_at or self.bookmarked_at).timestamp()}'

        def calc_icons():
            if hasattr(self, '_prefetched_objects_cache') and 'archiveresult_set' in self._prefetched_objects_cache:
                archive_results = [r for r in self.archiveresult_set.all() if r.status == "succeeded" and r.output]
            else:
                archive_results = self.archiveresult_set.filter(status="succeeded", output__isnull=False)

            path = self.archive_path
            canon = self.canonical_outputs()
            output = ""
            output_template = '<a href="/{}/{}" class="exists-{}" title="{}">{}</a> '
            icons = {
                "singlefile": "❶", "wget": "🆆", "dom": "🅷", "pdf": "📄",
                "screenshot": "💻", "media": "📼", "git": "🅶", "archive_org": "🏛",
                "readability": "🆁", "mercury": "🅼", "warc": "📦"
            }
            exclude = ["favicon", "title", "headers", "htmltotext", "archive_org"]

            extractor_outputs = defaultdict(lambda: None)
            for extractor, _ in ArchiveResult.EXTRACTOR_CHOICES:
                for result in archive_results:
                    if result.extractor == extractor:
                        extractor_outputs[extractor] = result

            for extractor, _ in ArchiveResult.EXTRACTOR_CHOICES:
                if extractor not in exclude:
                    existing = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
                    output += format_html(output_template, path, canon.get(extractor, ''), str(bool(existing)), extractor, icons.get(extractor, "?"))
                if extractor == "wget":
                    exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
                    output += format_html(output_template, path, canon.get("warc", "warc/"), str(bool(exists)), "warc", icons.get("warc", "?"))
                if extractor == "archive_org":
                    exists = extractor in extractor_outputs and extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
                    output += '<a href="{}" class="exists-{}" title="{}">{}</a> '.format(canon.get("archive_org", ""), str(exists), "archive_org", icons.get("archive_org", "?"))

            return format_html('<span class="files-icons" style="font-size: 1.1em; opacity: 0.8; min-width: 240px; display: inline-block">{}</span>', mark_safe(output))

        cache_result = cache.get(cache_key)
        if cache_result:
            return cache_result

        fresh_result = calc_icons()
        cache.set(cache_key, fresh_result, timeout=60 * 60 * 24)
        return fresh_result

    @property
    def api_url(self) -> str:
@@ -178,7 +395,8 @@ class Snapshot(ModelWithSerializers, ModelWithOutputDir, ModelWithConfig, ModelW
        return url_domain(self.url)

    @cached_property
    def link_dir(self):
    def output_dir(self):
        """The filesystem path to the snapshot's output directory."""
        return str(CONSTANTS.ARCHIVE_DIR / self.timestamp)

    @cached_property
@@ -188,7 +406,7 @@ class Snapshot(ModelWithSerializers, ModelWithOutputDir, ModelWithConfig, ModelW
    @cached_property
    def archive_size(self):
        try:
            return get_dir_size(self.link_dir)[0]
            return get_dir_size(self.output_dir)[0]
        except Exception:
            return 0

@@ -200,20 +418,327 @@ class Snapshot(ModelWithSerializers, ModelWithOutputDir, ModelWithConfig, ModelW
    def pending_archiveresults(self) -> QuerySet['ArchiveResult']:
        return self.archiveresult_set.exclude(status__in=ArchiveResult.FINAL_OR_ACTIVE_STATES)

    def run(self) -> list['ArchiveResult']:
        """
        Execute this Snapshot by creating ArchiveResults for all enabled extractors.

        Called by the state machine when entering the 'started' state.
        """
        return self.create_pending_archiveresults()

    def create_pending_archiveresults(self) -> list['ArchiveResult']:
        ALL_EXTRACTORS = ['favicon', 'title', 'screenshot', 'headers', 'singlefile', 'dom', 'git', 'archive_org', 'readability', 'mercury', 'pdf', 'wget']
        """
        Create ArchiveResult records for all enabled extractors.

        Uses the hooks system to discover available extractors from:
        - archivebox/plugins/*/on_Snapshot__*.{py,sh,js}
        - data/plugins/*/on_Snapshot__*.{py,sh,js}
        """
        from archivebox.hooks import get_enabled_extractors

        extractors = get_enabled_extractors()
        archiveresults = []
        for extractor in ALL_EXTRACTORS:

        for extractor in extractors:
            if ArchiveResult.objects.filter(snapshot=self, extractor=extractor).exists():
                continue
            archiveresult, _ = ArchiveResult.objects.get_or_create(
                snapshot=self, extractor=extractor,
                defaults={'status': ArchiveResult.INITIAL_STATE, 'retry_at': timezone.now()},
                defaults={
                    'status': ArchiveResult.INITIAL_STATE,
                    'retry_at': timezone.now(),
                    'created_by_id': self.created_by_id,
                },
            )
            if archiveresult.status == ArchiveResult.INITIAL_STATE:
                archiveresults.append(archiveresult)
        return archiveresults
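
    # Illustrative layout sketch (added for clarity; not part of this commit), based on the
    # docstring above. The filename-to-extractor mapping shown here is an assumption about
    # how get_enabled_extractors() derives extractor names from hook scripts:
    #
    #   archivebox/plugins/wget/on_Snapshot__wget.sh              ->  extractor 'wget'
    #   archivebox/plugins/screenshot/on_Snapshot__screenshot.py  ->  extractor 'screenshot'
    #   data/plugins/myplugin/on_Snapshot__myextractor.js         ->  extractor 'myextractor'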

    def retry_failed_archiveresults(self, retry_at: Optional['timezone.datetime'] = None) -> int:
        """
        Reset failed/skipped ArchiveResults to queued for retry.

        This enables seamless retry of the entire extraction pipeline:
        - Resets FAILED and SKIPPED results to QUEUED
        - Sets retry_at so workers pick them up
        - Extractors run in order (numeric prefix)
        - Each extractor checks its dependencies at runtime

        Dependency handling (e.g., chrome_session → screenshot):
        - Extractors check if required outputs exist before running
        - If dependency output missing → extractor returns 'skipped'
        - On retry, if dependency now succeeds → dependent can run

        Returns count of ArchiveResults reset.
        """
        retry_at = retry_at or timezone.now()

        count = self.archiveresult_set.filter(
            status__in=[
                ArchiveResult.StatusChoices.FAILED,
                ArchiveResult.StatusChoices.SKIPPED,
            ]
        ).update(
            status=ArchiveResult.StatusChoices.QUEUED,
            retry_at=retry_at,
            output=None,
            start_ts=None,
            end_ts=None,
        )

        # Also reset the snapshot so it gets re-checked
        if count > 0:
            self.status = self.StatusChoices.STARTED
            self.retry_at = retry_at
            self.save(update_fields=['status', 'retry_at', 'modified_at'])

        return count
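
    # Illustrative usage sketch (added for clarity; not part of this commit): re-queue
    # everything that failed or was skipped, optionally delaying worker pickup.
    #
    #   num_requeued = snapshot.retry_failed_archiveresults()
    #   num_requeued = snapshot.retry_failed_archiveresults(retry_at=timezone.now() + timedelta(minutes=5))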

    # =========================================================================
    # URL Helper Properties (migrated from Link schema)
    # =========================================================================

    @cached_property
    def url_hash(self) -> str:
        from hashlib import sha256
        return sha256(self.url.encode()).hexdigest()[:8]

    @cached_property
    def scheme(self) -> str:
        return self.url.split('://')[0]

    @cached_property
    def path(self) -> str:
        parts = self.url.split('://', 1)
        return '/' + parts[1].split('/', 1)[1] if len(parts) > 1 and '/' in parts[1] else '/'

    @cached_property
    def basename(self) -> str:
        return self.path.split('/')[-1]

    @cached_property
    def extension(self) -> str:
        basename = self.basename
        return basename.split('.')[-1] if '.' in basename else ''

    @cached_property
    def base_url(self) -> str:
        return f'{self.scheme}://{self.domain}'

    @cached_property
    def is_static(self) -> bool:
        static_extensions = {'.pdf', '.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg', '.mp4', '.mp3', '.wav', '.webm'}
        return any(self.url.lower().endswith(ext) for ext in static_extensions)

    @cached_property
    def is_archived(self) -> bool:
        output_paths = (
            self.domain,
            'output.html',
            'output.pdf',
            'screenshot.png',
            'singlefile.html',
            'readability/content.html',
            'mercury/content.html',
            'htmltotext.txt',
            'media',
            'git',
        )
        return any((Path(self.output_dir) / path).exists() for path in output_paths)
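
    # Worked example (added for clarity; not part of this commit) of the URL helpers above
    # for a hypothetical snapshot with url='https://example.com/docs/guide.pdf':
    #
    #   scheme    -> 'https'
    #   path      -> '/docs/guide.pdf'
    #   basename  -> 'guide.pdf'
    #   extension -> 'pdf'
    #   base_url  -> 'https://example.com'   (assuming self.domain == 'example.com')
    #   is_static -> True                    ('.pdf' is in static_extensions)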

    # =========================================================================
    # Date/Time Properties (migrated from Link schema)
    # =========================================================================

    @cached_property
    def bookmarked_date(self) -> Optional[str]:
        max_ts = (timezone.now() + timedelta(days=30)).timestamp()
        if self.timestamp and self.timestamp.replace('.', '').isdigit():
            if 0 < float(self.timestamp) < max_ts:
                return self._ts_to_date_str(datetime.fromtimestamp(float(self.timestamp)))
            return str(self.timestamp)
        return None

    @cached_property
    def downloaded_datestr(self) -> Optional[str]:
        return self._ts_to_date_str(self.downloaded_at) if self.downloaded_at else None

    @cached_property
    def archive_dates(self) -> List[datetime]:
        return [
            result.start_ts
            for result in self.archiveresult_set.all()
            if result.start_ts
        ]

    @cached_property
    def oldest_archive_date(self) -> Optional[datetime]:
        dates = self.archive_dates
        return min(dates) if dates else None

    @cached_property
    def newest_archive_date(self) -> Optional[datetime]:
        dates = self.archive_dates
        return max(dates) if dates else None

    @cached_property
    def num_outputs(self) -> int:
        return self.archiveresult_set.filter(status='succeeded').count()

    @cached_property
    def num_failures(self) -> int:
        return self.archiveresult_set.filter(status='failed').count()

    # =========================================================================
    # Output Path Methods (migrated from Link schema)
    # =========================================================================

    def canonical_outputs(self) -> Dict[str, Optional[str]]:
        """Predict the expected output paths that should be present after archiving"""
        FAVICON_PROVIDER = 'https://www.google.com/s2/favicons?domain={}'
        canonical = {
            'index_path': 'index.html',
            'favicon_path': 'favicon.ico',
            'google_favicon_path': FAVICON_PROVIDER.format(self.domain),
            'wget_path': f'warc/{self.timestamp}',
            'warc_path': 'warc/',
            'singlefile_path': 'singlefile.html',
            'readability_path': 'readability/content.html',
            'mercury_path': 'mercury/content.html',
            'htmltotext_path': 'htmltotext.txt',
            'pdf_path': 'output.pdf',
            'screenshot_path': 'screenshot.png',
            'dom_path': 'output.html',
            'archive_org_path': f'https://web.archive.org/web/{self.base_url}',
            'git_path': 'git/',
            'media_path': 'media/',
            'headers_path': 'headers.json',
        }

        if self.is_static:
            static_path = f'warc/{self.timestamp}'
            canonical.update({
                'title': self.basename,
                'wget_path': static_path,
                'pdf_path': static_path,
                'screenshot_path': static_path,
                'dom_path': static_path,
                'singlefile_path': static_path,
                'readability_path': static_path,
                'mercury_path': static_path,
                'htmltotext_path': static_path,
            })
        return canonical

    def latest_outputs(self, status: Optional[str] = None) -> Dict[str, Any]:
        """Get the latest output that each archive method produced"""
        from archivebox.hooks import get_extractors

        latest: Dict[str, Any] = {}
        for archive_method in get_extractors():
            results = self.archiveresult_set.filter(extractor=archive_method)
            if status is not None:
                results = results.filter(status=status)
            results = results.filter(output__isnull=False).order_by('-start_ts')
            latest[archive_method] = results.first().output if results.exists() else None
        return latest

    # =========================================================================
    # Serialization Methods
    # =========================================================================

    def to_dict(self, extended: bool = False) -> Dict[str, Any]:
        """Convert Snapshot to a dictionary (replacement for Link._asdict())"""
        from archivebox.misc.util import ts_to_date_str

        result = {
            'TYPE': 'core.models.Snapshot',
            'id': str(self.id),
            'url': self.url,
            'timestamp': self.timestamp,
            'title': self.title,
            'tags': self.tags_str(),
            'downloaded_at': self.downloaded_at.isoformat() if self.downloaded_at else None,
            'bookmarked_at': self.bookmarked_at.isoformat() if self.bookmarked_at else None,
            'created_at': self.created_at.isoformat() if self.created_at else None,
            # Computed properties
            'domain': self.domain,
            'scheme': self.scheme,
            'base_url': self.base_url,
            'path': self.path,
            'basename': self.basename,
            'extension': self.extension,
            'is_static': self.is_static,
            'is_archived': self.is_archived,
            'archive_path': self.archive_path,
            'output_dir': self.output_dir,
            'link_dir': self.output_dir,  # backwards compatibility alias
            'archive_size': self.archive_size,
            'bookmarked_date': self.bookmarked_date,
            'downloaded_datestr': self.downloaded_datestr,
            'num_outputs': self.num_outputs,
            'num_failures': self.num_failures,
        }
        if extended:
            result['canonical'] = self.canonical_outputs()
        return result

    def to_json(self, indent: int = 4) -> str:
        """Convert to JSON string"""
        return to_json(self.to_dict(extended=True), indent=indent)

    def to_csv(self, cols: Optional[List[str]] = None, separator: str = ',', ljust: int = 0) -> str:
        """Convert to CSV string"""
        data = self.to_dict()
        cols = cols or ['timestamp', 'is_archived', 'url']
        return separator.join(to_json(data.get(col, ''), indent=None).ljust(ljust) for col in cols)

    def write_json_details(self, out_dir: Optional[str] = None) -> None:
        """Write JSON index file for this snapshot to its output directory"""
        out_dir = out_dir or self.output_dir
        path = Path(out_dir) / CONSTANTS.JSON_INDEX_FILENAME
        atomic_write(str(path), self.to_dict(extended=True))

    def write_html_details(self, out_dir: Optional[str] = None) -> None:
        """Write HTML detail page for this snapshot to its output directory"""
        from django.template.loader import render_to_string
        from archivebox.config.common import SERVER_CONFIG
        from archivebox.config.configset import get_config
        from archivebox.misc.logging_util import printable_filesize

        out_dir = out_dir or self.output_dir
        config = get_config()
        SAVE_ARCHIVE_DOT_ORG = config.get('SAVE_ARCHIVE_DOT_ORG', True)
        TITLE_LOADING_MSG = 'Not yet archived...'

        canonical = self.canonical_outputs()
        context = {
            **self.to_dict(extended=True),
            **{f'{k}_path': v for k, v in canonical.items()},
            'canonical': {f'{k}_path': v for k, v in canonical.items()},
            'title': htmlencode(self.title or (self.base_url if self.is_archived else TITLE_LOADING_MSG)),
            'url_str': htmlencode(urldecode(self.base_url)),
            'archive_url': urlencode(f'warc/{self.timestamp}' or (self.domain if self.is_archived else '')) or 'about:blank',
            'extension': self.extension or 'html',
            'tags': self.tags_str() or 'untagged',
            'size': printable_filesize(self.archive_size) if self.archive_size else 'pending',
            'status': 'archived' if self.is_archived else 'not yet archived',
            'status_color': 'success' if self.is_archived else 'danger',
            'oldest_archive_date': ts_to_date_str(self.oldest_archive_date),
            'SAVE_ARCHIVE_DOT_ORG': SAVE_ARCHIVE_DOT_ORG,
            'PREVIEW_ORIGINALS': SERVER_CONFIG.PREVIEW_ORIGINALS,
        }
        rendered_html = render_to_string('snapshot.html', context)
        atomic_write(str(Path(out_dir) / CONSTANTS.HTML_INDEX_FILENAME), rendered_html)

    # =========================================================================
    # Helper Methods
    # =========================================================================

    @staticmethod
    def _ts_to_date_str(dt: Optional[datetime]) -> Optional[str]:
        return dt.strftime('%Y-%m-%d %H:%M:%S') if dt else None


class ArchiveResultManager(models.Manager):
    def indexable(self, sorted: bool = True):
@@ -225,7 +750,7 @@ class ArchiveResultManager(models.Manager):
        return qs


class ArchiveResult(ModelWithSerializers, ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine):
class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine):
    class StatusChoices(models.TextChoices):
        QUEUED = 'queued', 'Queued'
        STARTED = 'started', 'Started'
@@ -277,7 +802,7 @@ class ArchiveResult(ModelWithSerializers, ModelWithOutputDir, ModelWithConfig, M

    @cached_property
    def snapshot_dir(self):
        return Path(self.snapshot.link_dir)
        return Path(self.snapshot.output_dir)

    @cached_property
    def url(self):
@@ -292,7 +817,9 @@ class ArchiveResult(ModelWithSerializers, ModelWithOutputDir, ModelWithConfig, M

    @property
    def extractor_module(self) -> Any | None:
        return abx.as_dict(abx.pm.hook.get_EXTRACTORS()).get(self.extractor, None)
        # Hook scripts are now used instead of Python extractor modules
        # The extractor name maps to hooks in archivebox/plugins/{extractor}/
        return None

    def output_exists(self) -> bool:
        return os.path.exists(Path(self.snapshot_dir) / self.extractor)
@@ -315,3 +842,150 @@ class ArchiveResult(ModelWithSerializers, ModelWithOutputDir, ModelWithConfig, M

    def save_search_index(self):
        pass

    def run(self):
        """
        Execute this ArchiveResult's extractor and update status.

        Discovers and runs the hook script for self.extractor,
        updates status/output fields, queues discovered URLs, and triggers indexing.
        """
        from django.utils import timezone
        from archivebox.hooks import discover_hooks, run_hook

        extractor_dir = Path(self.snapshot.output_dir) / self.extractor
        config_objects = [self.snapshot.crawl, self.snapshot] if self.snapshot.crawl else [self.snapshot]

        # Discover hook for this extractor
        hooks = discover_hooks(f'Snapshot__{self.extractor}')
        if not hooks:
            self.status = self.StatusChoices.FAILED
            self.output = f'No hook found for: {self.extractor}'
            self.retry_at = None
            self.save()
            return

        # Run the hook
        start_ts = timezone.now()
        result = run_hook(
            hooks[0],
            output_dir=extractor_dir,
            config_objects=config_objects,
            url=self.snapshot.url,
        )
        end_ts = timezone.now()

        # Determine status from return code and JSON output
        output_json = result.get('output_json') or {}
        json_status = output_json.get('status')

        if json_status == 'skipped':
            status = 'skipped'
        elif json_status == 'failed':
            status = 'failed'
        elif result['returncode'] == 0:
            status = 'succeeded'
        else:
            status = 'failed'

        # Update self from result
        status_map = {
            'succeeded': self.StatusChoices.SUCCEEDED,
            'failed': self.StatusChoices.FAILED,
            'skipped': self.StatusChoices.SKIPPED,
        }
        self.status = status_map.get(status, self.StatusChoices.FAILED)
        self.output = output_json.get('output') or result['stdout'][:1024] or result['stderr'][:1024] or None
        self.start_ts = start_ts
        self.end_ts = end_ts
        self.retry_at = None
        self.save()

        # Queue any discovered URLs for crawling (parser extractors write urls.jsonl)
        self._queue_urls_for_crawl(extractor_dir)

        # Trigger search indexing if succeeded
        if self.status == self.StatusChoices.SUCCEEDED:
            self.trigger_search_indexing()
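
    # Illustrative sketch (added for clarity; not part of this commit) of the result shape
    # this method assumes run_hook() returns, based only on the keys accessed above:
    #
    #   result = {
    #       'returncode': 0,
    #       'stdout': '...',
    #       'stderr': '',
    #       'output_json': {'status': 'succeeded', 'output': 'screenshot.png'},  # optional
    #   }
    #
    # A hook-reported 'skipped'/'failed' status takes precedence over the return code, so an
    # extractor can decline work (e.g., a missing dependency output) without being marked failed.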

    def _queue_urls_for_crawl(self, extractor_dir: Path):
        """
        Read urls.jsonl and queue discovered URLs for crawling.

        Parser extractors output urls.jsonl with discovered URLs and Tags.
        - Tag records: {"type": "Tag", "name": "..."}
        - Snapshot records: {"type": "Snapshot", "url": "...", ...}

        Tags are created in the database.
        URLs get added to the parent Crawl's queue with metadata
        (depth, via_snapshot, via_extractor) for recursive crawling.

        Used at all depths:
        - depth=0: Initial source file (e.g., bookmarks.html) parsed for URLs
        - depth>0: Crawled pages parsed for outbound links
        """
        import json

        if not self.snapshot.crawl:
            return

        urls_file = extractor_dir / 'urls.jsonl'
        if not urls_file.exists():
            return

        urls_added = 0
        tags_created = 0
        with open(urls_file, 'r') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    entry = json.loads(line)
                    record_type = entry.get('type', 'Snapshot')

                    # Handle Tag records
                    if record_type == 'Tag':
                        tag_name = entry.get('name')
                        if tag_name:
                            Tag.objects.get_or_create(name=tag_name)
                            tags_created += 1
                        continue

                    # Handle Snapshot records (or records without type)
                    if not entry.get('url'):
                        continue

                    # Add crawl metadata
                    entry['depth'] = self.snapshot.depth + 1
                    entry['via_snapshot'] = str(self.snapshot.id)
                    entry['via_extractor'] = self.extractor

                    if self.snapshot.crawl.add_url(entry):
                        urls_added += 1
                except json.JSONDecodeError:
                    continue

        if urls_added > 0:
            self.snapshot.crawl.create_snapshots_from_urls()
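
    # Illustrative urls.jsonl content (added for clarity; not part of this commit), following
    # the record shapes described in the docstring above; all values are hypothetical:
    #
    #   {"type": "Tag", "name": "bookmarks"}
    #   {"type": "Snapshot", "url": "https://example.com/linked-page", "title": "Linked Page"}
    #   {"url": "https://example.com/untyped-records-default-to-snapshot"}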

    def trigger_search_indexing(self):
        """Run any ArchiveResult__index hooks to update search indexes."""
        from archivebox.hooks import discover_hooks, run_hook

        # Pass config objects in priority order (later overrides earlier)
        config_objects = [self.snapshot.crawl, self.snapshot] if self.snapshot.crawl else [self.snapshot]

        for hook in discover_hooks('ArchiveResult__index'):
            run_hook(
                hook,
                output_dir=self.output_dir,
                config_objects=config_objects,
                snapshot_id=str(self.snapshot.id),
                extractor=self.extractor,
            )

    @property
    def output_dir(self) -> Path:
        """Get the output directory for this extractor's results."""
        return Path(self.snapshot.output_dir) / self.extractor