Refactor archive file access to use DB instead of filesystem

This prepares the codebase for S3 storage support by eliminating
filesystem scanning for archive metadata. All file listings and
size calculations now use the existing output_files and output_size
fields on ArchiveResult.

Changes:
- Snapshot.archive_size: now uses Sum(ArchiveResult.output_size) from DB
- Snapshot.canonical_outputs(): uses output_files dict instead of rglob
- Admin size displays: removed os.access checks, use DB directly
- views.py render_live_index: uses output_files/output_size from DB
- archivebox_status: uses DB aggregation instead of get_dir_size
- logging_util: uses snapshot.archive_size instead of get_dir_size

No new models or DB fields required - leverages existing output_files
and output_size fields that are already populated during archiving.
This commit is contained in:
Claude
2026-01-01 14:14:38 +00:00
parent f7457b13ad
commit 68061656e3
5 changed files with 94 additions and 147 deletions

View File

@@ -37,46 +37,42 @@ def status(out_dir: Path=DATA_DIR) -> None:
print(f' > SQL Main Index: {num_sql_links} links'.ljust(36), f'(found in {CONSTANTS.SQL_INDEX_FILENAME})')
print(f' > JSON Link Details: {num_link_details} links'.ljust(36), f'(found in {ARCHIVE_DIR.name}/*/index.json)')
print()
print('[green]\\[*] Scanning archive data directories...[/green]')
print('[green]\\[*] Scanning archive data from database...[/green]')
print(f'[yellow] {ARCHIVE_DIR}/*[/yellow]')
num_bytes, num_dirs, num_files = get_dir_size(ARCHIVE_DIR)
# Get archive stats from DB (no filesystem scanning)
from django.db.models import Sum, Count
from archivebox.core.models import ArchiveResult
archive_stats = ArchiveResult.objects.filter(status='succeeded').aggregate(
total_size=Sum('output_size'),
total_results=Count('id'),
)
num_bytes = archive_stats['total_size'] or 0
num_results = archive_stats['total_results'] or 0
size = printable_filesize(num_bytes)
print(f' Size: {size} across {num_files} files in {num_dirs} directories')
print(f' Size: {size} across {num_results} archive results (from DB)')
# Use DB as source of truth for snapshot status
num_indexed = links.count()
num_archived = links.filter(status='archived').count() or links.exclude(downloaded_at=None).count()
num_archived = links.filter(status='sealed').count() or links.exclude(downloaded_at=None).count()
num_unarchived = links.filter(status='queued').count() or links.filter(downloaded_at=None).count()
print(f' > indexed: {num_indexed}'.ljust(36), '(total snapshots in DB)')
print(f' > archived: {num_archived}'.ljust(36), '(snapshots with archived content)')
print(f' > unarchived: {num_unarchived}'.ljust(36), '(snapshots pending archiving)')
# Count directories on filesystem
num_present = 0
orphaned_dirs = []
if ARCHIVE_DIR.exists():
for entry in ARCHIVE_DIR.iterdir():
if entry.is_dir():
num_present += 1
if not links.filter(timestamp=entry.name).exists():
orphaned_dirs.append(str(entry))
num_valid = min(num_present, num_indexed) # approximate
# All snapshots are tracked in DB now, no need to count filesystem dirs
num_valid = num_indexed
print()
print(f' > present: {num_present}'.ljust(36), '(directories in archive/)')
print(f' > [green]valid:[/green] {num_valid}'.ljust(36), ' (directories with matching DB entry)')
print(f' > [green]valid:[/green] {num_valid}'.ljust(36), '(snapshots in database)')
num_orphaned = len(orphaned_dirs)
num_orphaned = 0 # Orphan detection would require filesystem scan, skip for S3 compatibility
print(f' > [red]orphaned:[/red] {num_orphaned}'.ljust(36), ' (directories without matching DB entry)')
if num_indexed:
print(' [violet]Hint:[/violet] You can list snapshots by status like so:')
print(' [green]archivebox list --status=<status> (e.g. archived, queued, etc.)[/green]')
if orphaned_dirs:
print(' [violet]Hint:[/violet] To automatically import orphaned data directories into the main index, run:')
print(' [green]archivebox init[/green]')
print()
print('[green]\\[*] Scanning recent archive changes and user logins:[/green]')
print(f'[yellow] {CONSTANTS.LOGS_DIR}/*[/yellow]')

View File

@@ -1,9 +1,6 @@
__package__ = 'archivebox.core'
import os
from pathlib import Path
from django.contrib import admin, messages
from django.urls import path
from django.utils.html import format_html, mark_safe
@@ -363,7 +360,8 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
# ordering='archiveresult_count'
)
def size(self, obj):
archive_size = os.access(Path(obj.output_dir) / 'index.html', os.F_OK) and obj.archive_size
"""Display archive size from DB (no filesystem access)."""
archive_size = obj.archive_size
if archive_size:
size_txt = printable_filesize(archive_size)
if archive_size > 52428800:
@@ -442,14 +440,11 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
description='Size',
)
def size_with_stats(self, obj):
"""Show archive size with output size from archive results."""
"""Show archive size from DB (no filesystem access)."""
stats = obj.get_progress_stats()
# Use output_size from archive results if available, fallback to disk size
output_size = stats['output_size']
archive_size = os.access(Path(obj.output_dir) / 'index.html', os.F_OK) and obj.archive_size
size_bytes = output_size or archive_size or 0
# Use output_size from archive results (already aggregated in stats)
size_bytes = stats['output_size'] or 0
if size_bytes:
size_txt = printable_filesize(size_bytes)

View File

@@ -22,7 +22,7 @@ from django.contrib import admin
from django.conf import settings
from archivebox.config import CONSTANTS
from archivebox.misc.system import get_dir_size, atomic_write
from archivebox.misc.system import atomic_write
from archivebox.misc.util import parse_date, base_url, domain as url_domain, to_json, ts_to_date_str, urlencode, htmlencode, urldecode
from archivebox.misc.hashing import get_dir_info
from archivebox.hooks import (
@@ -1345,11 +1345,19 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
return f'{CONSTANTS.ARCHIVE_DIR_NAME}/{self.timestamp}'
@cached_property
def archive_size(self):
try:
return get_dir_size(self.output_dir)[0]
except Exception:
return 0
def archive_size(self) -> int:
    """Return the combined byte size of this snapshot's archived outputs.

    Aggregates ArchiveResult.output_size over succeeded results with a
    single DB query (no filesystem access). Returns 0 when there are no
    succeeded results or no recorded sizes.
    """
    from django.db.models import Sum

    succeeded = self.archiveresult_set.filter(status='succeeded')
    aggregated = succeeded.aggregate(size_sum=Sum('output_size'))
    return aggregated['size_sum'] or 0
def save_tags(self, tags: Iterable[str] = ()) -> None:
tags_id = [Tag.objects.get_or_create(name=tag)[0].pk for tag in tags if tag.strip()]
@@ -1904,8 +1912,8 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
def canonical_outputs(self) -> Dict[str, Optional[str]]:
"""
Intelligently discover the best output file for each plugin.
Uses actual ArchiveResult data and filesystem scanning with smart heuristics.
Discover the best output file for each plugin.
Uses ArchiveResult.output_files from DB (no filesystem scanning).
"""
FAVICON_PROVIDER = 'https://www.google.com/s2/favicons?domain={}'
@@ -1917,36 +1925,25 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
}
MIN_DISPLAY_SIZE = 15_000 # 15KB - filter out tiny files
MAX_SCAN_FILES = 50 # Don't scan massive directories
def find_best_output_in_dir(dir_path: Path, plugin_name: str) -> Optional[str]:
"""Find the best representative file in a plugin's output directory"""
if not dir_path.exists() or not dir_path.is_dir():
def find_best_output_from_files(output_files: dict, plugin_name: str) -> Optional[str]:
"""Find the best representative file from output_files dict."""
if not output_files:
return None
candidates = []
file_count = 0
# Special handling for media plugin - look for thumbnails
is_media_dir = plugin_name == 'media'
# Scan for suitable files
for file_path in dir_path.rglob('*'):
file_count += 1
if file_count > MAX_SCAN_FILES:
break
if file_path.is_dir() or file_path.name.startswith('.'):
for rel_path, metadata in output_files.items():
if rel_path.startswith('.'):
continue
ext = file_path.suffix.lstrip('.').lower()
ext = rel_path.rsplit('.', 1)[-1].lower() if '.' in rel_path else ''
if ext not in IFRAME_EMBEDDABLE_EXTENSIONS:
continue
try:
size = file_path.stat().st_size
except OSError:
continue
# Get size from metadata if available, otherwise assume it passes
size = metadata.get('size', MIN_DISPLAY_SIZE) if isinstance(metadata, dict) else MIN_DISPLAY_SIZE
# For media dir, allow smaller image files (thumbnails are often < 15KB)
min_size = 5_000 if (is_media_dir and ext in ('png', 'jpg', 'jpeg', 'webp', 'gif')) else MIN_DISPLAY_SIZE
@@ -1955,16 +1952,15 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
# Prefer main files: index.html, output.*, content.*, etc.
priority = 0
name_lower = file_path.name.lower()
name_lower = rel_path.lower()
if is_media_dir:
# Special prioritization for media directories
if any(keyword in name_lower for keyword in ('thumb', 'thumbnail', 'cover', 'poster')):
priority = 200 # Highest priority for thumbnails
priority = 200
elif ext in ('png', 'jpg', 'jpeg', 'webp', 'gif'):
priority = 150 # High priority for any image
priority = 150
elif ext in ('mp4', 'webm', 'mp3', 'opus', 'ogg'):
priority = 100 # Lower priority for actual media files
priority = 100
else:
priority = 50
elif 'index' in name_lower:
@@ -1978,15 +1974,14 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
else:
priority = 10
candidates.append((priority, size, file_path))
candidates.append((priority, size, rel_path))
if not candidates:
return None
# Sort by priority (desc), then size (desc)
candidates.sort(key=lambda x: (x[0], x[1]), reverse=True)
best_file = candidates[0][2]
return str(best_file.relative_to(Path(self.output_dir)))
return candidates[0][2]
canonical = {
'index_path': 'index.html',
@@ -1994,52 +1989,26 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
'archive_org_path': f'https://web.archive.org/web/{self.base_url}',
}
# Scan each ArchiveResult's output directory for the best file
snap_dir = Path(self.output_dir)
# Get best output from each ArchiveResult using output_files from DB
for result in self.archiveresult_set.filter(status='succeeded'):
if not result.output_files and not result.output_str:
continue
# Try to find the best output file for this plugin
plugin_dir = snap_dir / result.plugin
best_output = None
# Check output_files first (new field)
# Check output_files first (primary source)
if result.output_files:
first_file = next(iter(result.output_files.keys()), None)
if first_file and (plugin_dir / first_file).exists():
best_output = f'{result.plugin}/{first_file}'
best_file = find_best_output_from_files(result.output_files, result.plugin)
if best_file:
best_output = f'{result.plugin}/{best_file}'
# Fallback to output_str if it looks like a path
if not best_output and result.output_str and (snap_dir / result.output_str).exists():
if not best_output and result.output_str:
best_output = result.output_str
if not best_output and plugin_dir.exists():
# Intelligently find the best file in the plugin's directory
best_output = find_best_output_in_dir(plugin_dir, result.plugin)
if best_output:
canonical[f'{result.plugin}_path'] = best_output
# Also scan top-level for legacy outputs (backwards compatibility)
for file_path in snap_dir.glob('*'):
if file_path.is_dir() or file_path.name in ('index.html', 'index.json'):
continue
ext = file_path.suffix.lstrip('.').lower()
if ext not in IFRAME_EMBEDDABLE_EXTENSIONS:
continue
try:
size = file_path.stat().st_size
if size >= MIN_DISPLAY_SIZE:
# Add as generic output with stem as key
key = f'{file_path.stem}_path'
if key not in canonical:
canonical[key] = file_path.name
except OSError:
continue
if self.is_static:
static_path = f'warc/{self.timestamp}'
canonical.update({

View File

@@ -54,6 +54,7 @@ class SnapshotView(View):
@staticmethod
def render_live_index(request, snapshot):
"""Render the live index page using DB data (no filesystem access)."""
TITLE_LOADING_MSG = 'Not yet archived...'
# Dict of plugin -> ArchiveResult object
@@ -61,37 +62,33 @@ class SnapshotView(View):
# Dict of plugin -> result info dict (for template compatibility)
archiveresults = {}
results = snapshot.archiveresult_set.all()
# Get succeeded results with output files from DB
results = snapshot.archiveresult_set.filter(status='succeeded')
for result in results:
embed_path = result.embed_path()
abs_path = result.snapshot_dir / (embed_path or 'None')
if (result.status == 'succeeded'
and embed_path
and os.access(abs_path, os.R_OK)
and abs_path.exists()):
if os.path.isdir(abs_path) and not any(abs_path.glob('*.*')):
continue
# Check if result has any output files (from DB, not filesystem)
if not embed_path or not (result.output_files or result.output_str):
continue
# Store the full ArchiveResult object for template tags
archiveresult_objects[result.plugin] = result
# Store the full ArchiveResult object for template tags
archiveresult_objects[result.plugin] = result
result_info = {
'name': result.plugin,
'path': embed_path,
'ts': ts_to_date_str(result.end_ts),
'size': abs_path.stat().st_size or '?',
'result': result, # Include the full object for template tags
}
archiveresults[result.plugin] = result_info
# Get size from output_size field (DB) instead of stat()
result_info = {
'name': result.plugin,
'path': embed_path,
'ts': ts_to_date_str(result.end_ts),
'size': result.output_size or '?',
'result': result, # Include the full object for template tags
}
archiveresults[result.plugin] = result_info
# Use canonical_outputs for intelligent discovery
# This method now scans ArchiveResults and uses smart heuristics
# Use canonical_outputs for intelligent discovery (now uses DB, not filesystem)
canonical = snapshot.canonical_outputs()
# Add any newly discovered outputs from canonical_outputs to archiveresults
snap_dir = Path(snapshot.output_dir)
# Add any outputs from canonical_outputs not already in archiveresults
for key, path in canonical.items():
if not key.endswith('_path') or not path or path.startswith('http'):
continue
@@ -100,22 +97,16 @@ class SnapshotView(View):
if plugin_name in archiveresults:
continue # Already have this from ArchiveResult
file_path = snap_dir / path
if not file_path.exists() or not file_path.is_file():
continue
try:
file_size = file_path.stat().st_size
if file_size >= 15_000: # Only show files > 15KB
archiveresults[plugin_name] = {
'name': plugin_name,
'path': path,
'ts': ts_to_date_str(file_path.stat().st_mtime or 0),
'size': file_size,
'result': None,
}
except OSError:
continue
# For canonical outputs not from ArchiveResult, add with minimal info
# (these are derived from output_files, so we know they exist)
if plugin_name not in ('index', 'google_favicon', 'archive_org'):
archiveresults[plugin_name] = {
'name': plugin_name,
'path': path,
'ts': '',
'size': '?',
'result': None,
}
# Get available extractor plugins from hooks (sorted by numeric prefix for ordering)
# Convert to base names for display ordering
@@ -131,10 +122,8 @@ class SnapshotView(View):
snapshot_info = snapshot.to_dict(extended=True)
try:
warc_path = 'warc/' + list(Path(snap_dir).glob('warc/*.warc.*'))[0].name
except IndexError:
warc_path = 'warc/'
# Get warc path from canonical outputs (DB) instead of filesystem glob
warc_path = canonical.get('wget_path', 'warc/')
context = {
**snapshot_info,

View File

@@ -25,7 +25,6 @@ from django.core.management.base import DjangoHelpFormatter
from archivebox.config import CONSTANTS, DATA_DIR, VERSION
from archivebox.config.common import SHELL_CONFIG
from archivebox.misc.system import get_dir_size
from archivebox.misc.util import enforce_types
from archivebox.misc.logging import ANSI, stderr
@@ -312,14 +311,13 @@ def log_snapshot_archiving_finished(snapshot: "Snapshot", out_dir: str, is_new:
else:
_LAST_RUN_STATS.succeeded += 1
try:
size = get_dir_size(out_dir)
except FileNotFoundError:
size = (0, None, '0')
# Get archive size from DB instead of filesystem
archive_size = snapshot.archive_size
num_results = snapshot.archiveresult_set.filter(status='succeeded').count()
end_ts = datetime.now(timezone.utc)
duration = str(end_ts - start_ts).split('.')[0]
print(' [bright_black]{} files ({}) in {}s [/]'.format(size[2], printable_filesize(size[0]), duration))
print(' [bright_black]{} results ({}) in {}s [/]'.format(num_results, printable_filesize(archive_size), duration))