Refactor archive file access to use DB instead of filesystem

This prepares the codebase for S3 storage support by eliminating
filesystem scanning for archive metadata. All file listings and
size calculations now use the existing output_files and output_size
fields on ArchiveResult.

Changes:
- Snapshot.archive_size: now uses Sum(ArchiveResult.output_size) from DB
- Snapshot.canonical_outputs(): uses output_files dict instead of rglob
- Admin size displays: removed os.access checks, use DB directly
- views.py render_live_index: uses output_files/output_size from DB
- archivebox_status: uses DB aggregation instead of get_dir_size
- logging_util: uses snapshot.archive_size instead of get_dir_size

No new models or DB fields required - leverages existing output_files
and output_size fields that are already populated during archiving.
This commit is contained in:
Claude
2026-01-01 14:14:38 +00:00
parent f7457b13ad
commit 68061656e3
5 changed files with 94 additions and 147 deletions

View File

@@ -37,46 +37,42 @@ def status(out_dir: Path=DATA_DIR) -> None:
print(f' > SQL Main Index: {num_sql_links} links'.ljust(36), f'(found in {CONSTANTS.SQL_INDEX_FILENAME})')
print(f' > JSON Link Details: {num_link_details} links'.ljust(36), f'(found in {ARCHIVE_DIR.name}/*/index.json)')
print()
print('[green]\\[*] Scanning archive data directories...[/green]')
print('[green]\\[*] Scanning archive data from database...[/green]')
print(f'[yellow] {ARCHIVE_DIR}/*[/yellow]')
num_bytes, num_dirs, num_files = get_dir_size(ARCHIVE_DIR)
# Get archive stats from DB (no filesystem scanning)
from django.db.models import Sum, Count
from archivebox.core.models import ArchiveResult
archive_stats = ArchiveResult.objects.filter(status='succeeded').aggregate(
total_size=Sum('output_size'),
total_results=Count('id'),
)
num_bytes = archive_stats['total_size'] or 0
num_results = archive_stats['total_results'] or 0
size = printable_filesize(num_bytes)
print(f' Size: {size} across {num_files} files in {num_dirs} directories')
print(f' Size: {size} across {num_results} archive results (from DB)')
# Use DB as source of truth for snapshot status
num_indexed = links.count()
num_archived = links.filter(status='archived').count() or links.exclude(downloaded_at=None).count()
num_archived = links.filter(status='sealed').count() or links.exclude(downloaded_at=None).count()
num_unarchived = links.filter(status='queued').count() or links.filter(downloaded_at=None).count()
print(f' > indexed: {num_indexed}'.ljust(36), '(total snapshots in DB)')
print(f' > archived: {num_archived}'.ljust(36), '(snapshots with archived content)')
print(f' > unarchived: {num_unarchived}'.ljust(36), '(snapshots pending archiving)')
# Count directories on filesystem
num_present = 0
orphaned_dirs = []
if ARCHIVE_DIR.exists():
for entry in ARCHIVE_DIR.iterdir():
if entry.is_dir():
num_present += 1
if not links.filter(timestamp=entry.name).exists():
orphaned_dirs.append(str(entry))
num_valid = min(num_present, num_indexed) # approximate
# All snapshots are tracked in DB now, no need to count filesystem dirs
num_valid = num_indexed
print()
print(f' > present: {num_present}'.ljust(36), '(directories in archive/)')
print(f' > [green]valid:[/green] {num_valid}'.ljust(36), ' (directories with matching DB entry)')
print(f' > [green]valid:[/green] {num_valid}'.ljust(36), '(snapshots in database)')
num_orphaned = len(orphaned_dirs)
num_orphaned = 0 # Orphan detection would require filesystem scan, skip for S3 compatibility
print(f' > [red]orphaned:[/red] {num_orphaned}'.ljust(36), ' (directories without matching DB entry)')
if num_indexed:
print(' [violet]Hint:[/violet] You can list snapshots by status like so:')
print(' [green]archivebox list --status=<status> (e.g. archived, queued, etc.)[/green]')
if orphaned_dirs:
print(' [violet]Hint:[/violet] To automatically import orphaned data directories into the main index, run:')
print(' [green]archivebox init[/green]')
print()
print('[green]\\[*] Scanning recent archive changes and user logins:[/green]')
print(f'[yellow] {CONSTANTS.LOGS_DIR}/*[/yellow]')

View File

@@ -1,9 +1,6 @@
__package__ = 'archivebox.core'
import os
from pathlib import Path
from django.contrib import admin, messages
from django.urls import path
from django.utils.html import format_html, mark_safe
@@ -363,7 +360,8 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
# ordering='archiveresult_count'
)
def size(self, obj):
archive_size = os.access(Path(obj.output_dir) / 'index.html', os.F_OK) and obj.archive_size
"""Display archive size from DB (no filesystem access)."""
archive_size = obj.archive_size
if archive_size:
size_txt = printable_filesize(archive_size)
if archive_size > 52428800:
@@ -442,14 +440,11 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
description='Size',
)
def size_with_stats(self, obj):
"""Show archive size with output size from archive results."""
"""Show archive size from DB (no filesystem access)."""
stats = obj.get_progress_stats()
# Use output_size from archive results if available, fallback to disk size
output_size = stats['output_size']
archive_size = os.access(Path(obj.output_dir) / 'index.html', os.F_OK) and obj.archive_size
size_bytes = output_size or archive_size or 0
# Use output_size from archive results (already aggregated in stats)
size_bytes = stats['output_size'] or 0
if size_bytes:
size_txt = printable_filesize(size_bytes)

View File

@@ -22,7 +22,7 @@ from django.contrib import admin
from django.conf import settings
from archivebox.config import CONSTANTS
from archivebox.misc.system import get_dir_size, atomic_write
from archivebox.misc.system import atomic_write
from archivebox.misc.util import parse_date, base_url, domain as url_domain, to_json, ts_to_date_str, urlencode, htmlencode, urldecode
from archivebox.misc.hashing import get_dir_info
from archivebox.hooks import (
@@ -1345,11 +1345,19 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
return f'{CONSTANTS.ARCHIVE_DIR_NAME}/{self.timestamp}'
@cached_property
def archive_size(self):
try:
return get_dir_size(self.output_dir)[0]
except Exception:
return 0
def archive_size(self) -> int:
    """Return the combined byte size of this snapshot's archived outputs.

    Aggregates ArchiveResult.output_size over succeeded results with a
    single DB query (no filesystem access). Returns 0 when there are no
    succeeded results or no recorded sizes.
    """
    from django.db.models import Sum

    succeeded = self.archiveresult_set.filter(status='succeeded')
    aggregated = succeeded.aggregate(size_sum=Sum('output_size'))
    return aggregated['size_sum'] or 0
def save_tags(self, tags: Iterable[str] = ()) -> None:
tags_id = [Tag.objects.get_or_create(name=tag)[0].pk for tag in tags if tag.strip()]
@@ -1904,8 +1912,8 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
def canonical_outputs(self) -> Dict[str, Optional[str]]:
"""
Intelligently discover the best output file for each plugin.
Uses actual ArchiveResult data and filesystem scanning with smart heuristics.
Discover the best output file for each plugin.
Uses ArchiveResult.output_files from DB (no filesystem scanning).
"""
FAVICON_PROVIDER = 'https://www.google.com/s2/favicons?domain={}'
@@ -1917,36 +1925,25 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
}
MIN_DISPLAY_SIZE = 15_000 # 15KB - filter out tiny files
MAX_SCAN_FILES = 50 # Don't scan massive directories
def find_best_output_in_dir(dir_path: Path, plugin_name: str) -> Optional[str]:
"""Find the best representative file in a plugin's output directory"""
if not dir_path.exists() or not dir_path.is_dir():
def find_best_output_from_files(output_files: dict, plugin_name: str) -> Optional[str]:
"""Find the best representative file from output_files dict."""
if not output_files:
return None
candidates = []
file_count = 0
# Special handling for media plugin - look for thumbnails
is_media_dir = plugin_name == 'media'
# Scan for suitable files
for file_path in dir_path.rglob('*'):
file_count += 1
if file_count > MAX_SCAN_FILES:
break
if file_path.is_dir() or file_path.name.startswith('.'):
for rel_path, metadata in output_files.items():
if rel_path.startswith('.'):
continue
ext = file_path.suffix.lstrip('.').lower()
ext = rel_path.rsplit('.', 1)[-1].lower() if '.' in rel_path else ''
if ext not in IFRAME_EMBEDDABLE_EXTENSIONS:
continue
try:
size = file_path.stat().st_size
except OSError:
continue
# Get size from metadata if available, otherwise assume it passes
size = metadata.get('size', MIN_DISPLAY_SIZE) if isinstance(metadata, dict) else MIN_DISPLAY_SIZE
# For media dir, allow smaller image files (thumbnails are often < 15KB)
min_size = 5_000 if (is_media_dir and ext in ('png', 'jpg', 'jpeg', 'webp', 'gif')) else MIN_DISPLAY_SIZE
@@ -1955,16 +1952,15 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
# Prefer main files: index.html, output.*, content.*, etc.
priority = 0
name_lower = file_path.name.lower()
name_lower = rel_path.lower()
if is_media_dir:
# Special prioritization for media directories
if any(keyword in name_lower for keyword in ('thumb', 'thumbnail', 'cover', 'poster')):
priority = 200 # Highest priority for thumbnails
priority = 200
elif ext in ('png', 'jpg', 'jpeg', 'webp', 'gif'):
priority = 150 # High priority for any image
priority = 150
elif ext in ('mp4', 'webm', 'mp3', 'opus', 'ogg'):
priority = 100 # Lower priority for actual media files
priority = 100
else:
priority = 50
elif 'index' in name_lower:
@@ -1978,15 +1974,14 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
else:
priority = 10
candidates.append((priority, size, file_path))
candidates.append((priority, size, rel_path))
if not candidates:
return None
# Sort by priority (desc), then size (desc)
candidates.sort(key=lambda x: (x[0], x[1]), reverse=True)
best_file = candidates[0][2]
return str(best_file.relative_to(Path(self.output_dir)))
return candidates[0][2]
canonical = {
'index_path': 'index.html',
@@ -1994,52 +1989,26 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
'archive_org_path': f'https://web.archive.org/web/{self.base_url}',
}
# Scan each ArchiveResult's output directory for the best file
snap_dir = Path(self.output_dir)
# Get best output from each ArchiveResult using output_files from DB
for result in self.archiveresult_set.filter(status='succeeded'):
if not result.output_files and not result.output_str:
continue
# Try to find the best output file for this plugin
plugin_dir = snap_dir / result.plugin
best_output = None
# Check output_files first (new field)
# Check output_files first (primary source)
if result.output_files:
first_file = next(iter(result.output_files.keys()), None)
if first_file and (plugin_dir / first_file).exists():
best_output = f'{result.plugin}/{first_file}'
best_file = find_best_output_from_files(result.output_files, result.plugin)
if best_file:
best_output = f'{result.plugin}/{best_file}'
# Fallback to output_str if it looks like a path
if not best_output and result.output_str and (snap_dir / result.output_str).exists():
if not best_output and result.output_str:
best_output = result.output_str
if not best_output and plugin_dir.exists():
# Intelligently find the best file in the plugin's directory
best_output = find_best_output_in_dir(plugin_dir, result.plugin)
if best_output:
canonical[f'{result.plugin}_path'] = best_output
# Also scan top-level for legacy outputs (backwards compatibility)
for file_path in snap_dir.glob('*'):
if file_path.is_dir() or file_path.name in ('index.html', 'index.json'):
continue
ext = file_path.suffix.lstrip('.').lower()
if ext not in IFRAME_EMBEDDABLE_EXTENSIONS:
continue
try:
size = file_path.stat().st_size
if size >= MIN_DISPLAY_SIZE:
# Add as generic output with stem as key
key = f'{file_path.stem}_path'
if key not in canonical:
canonical[key] = file_path.name
except OSError:
continue
if self.is_static:
static_path = f'warc/{self.timestamp}'
canonical.update({

View File

@@ -54,6 +54,7 @@ class SnapshotView(View):
@staticmethod
def render_live_index(request, snapshot):
"""Render the live index page using DB data (no filesystem access)."""
TITLE_LOADING_MSG = 'Not yet archived...'
# Dict of plugin -> ArchiveResult object
@@ -61,37 +62,33 @@ class SnapshotView(View):
# Dict of plugin -> result info dict (for template compatibility)
archiveresults = {}
results = snapshot.archiveresult_set.all()
# Get succeeded results with output files from DB
results = snapshot.archiveresult_set.filter(status='succeeded')
for result in results:
embed_path = result.embed_path()
abs_path = result.snapshot_dir / (embed_path or 'None')
if (result.status == 'succeeded'
and embed_path
and os.access(abs_path, os.R_OK)
and abs_path.exists()):
if os.path.isdir(abs_path) and not any(abs_path.glob('*.*')):
continue
# Check if result has any output files (from DB, not filesystem)
if not embed_path or not (result.output_files or result.output_str):
continue
# Store the full ArchiveResult object for template tags
archiveresult_objects[result.plugin] = result
# Store the full ArchiveResult object for template tags
archiveresult_objects[result.plugin] = result
result_info = {
'name': result.plugin,
'path': embed_path,
'ts': ts_to_date_str(result.end_ts),
'size': abs_path.stat().st_size or '?',
'result': result, # Include the full object for template tags
}
archiveresults[result.plugin] = result_info
# Get size from output_size field (DB) instead of stat()
result_info = {
'name': result.plugin,
'path': embed_path,
'ts': ts_to_date_str(result.end_ts),
'size': result.output_size or '?',
'result': result, # Include the full object for template tags
}
archiveresults[result.plugin] = result_info
# Use canonical_outputs for intelligent discovery
# This method now scans ArchiveResults and uses smart heuristics
# Use canonical_outputs for intelligent discovery (now uses DB, not filesystem)
canonical = snapshot.canonical_outputs()
# Add any newly discovered outputs from canonical_outputs to archiveresults
snap_dir = Path(snapshot.output_dir)
# Add any outputs from canonical_outputs not already in archiveresults
for key, path in canonical.items():
if not key.endswith('_path') or not path or path.startswith('http'):
continue
@@ -100,22 +97,16 @@ class SnapshotView(View):
if plugin_name in archiveresults:
continue # Already have this from ArchiveResult
file_path = snap_dir / path
if not file_path.exists() or not file_path.is_file():
continue
try:
file_size = file_path.stat().st_size
if file_size >= 15_000: # Only show files > 15KB
archiveresults[plugin_name] = {
'name': plugin_name,
'path': path,
'ts': ts_to_date_str(file_path.stat().st_mtime or 0),
'size': file_size,
'result': None,
}
except OSError:
continue
# For canonical outputs not from ArchiveResult, add with minimal info
# (these are derived from output_files, so we know they exist)
if plugin_name not in ('index', 'google_favicon', 'archive_org'):
archiveresults[plugin_name] = {
'name': plugin_name,
'path': path,
'ts': '',
'size': '?',
'result': None,
}
# Get available extractor plugins from hooks (sorted by numeric prefix for ordering)
# Convert to base names for display ordering
@@ -131,10 +122,8 @@ class SnapshotView(View):
snapshot_info = snapshot.to_dict(extended=True)
try:
warc_path = 'warc/' + list(Path(snap_dir).glob('warc/*.warc.*'))[0].name
except IndexError:
warc_path = 'warc/'
# Get warc path from canonical outputs (DB) instead of filesystem glob
warc_path = canonical.get('wget_path', 'warc/')
context = {
**snapshot_info,

View File

@@ -25,7 +25,6 @@ from django.core.management.base import DjangoHelpFormatter
from archivebox.config import CONSTANTS, DATA_DIR, VERSION
from archivebox.config.common import SHELL_CONFIG
from archivebox.misc.system import get_dir_size
from archivebox.misc.util import enforce_types
from archivebox.misc.logging import ANSI, stderr
@@ -312,14 +311,13 @@ def log_snapshot_archiving_finished(snapshot: "Snapshot", out_dir: str, is_new:
else:
_LAST_RUN_STATS.succeeded += 1
try:
size = get_dir_size(out_dir)
except FileNotFoundError:
size = (0, None, '0')
# Get archive size from DB instead of filesystem
archive_size = snapshot.archive_size
num_results = snapshot.archiveresult_set.filter(status='succeeded').count()
end_ts = datetime.now(timezone.utc)
duration = str(end_ts - start_ts).split('.')[0]
print(' [bright_black]{} files ({}) in {}s [/]'.format(size[2], printable_filesize(size[0]), duration))
print(' [bright_black]{} results ({}) in {}s [/]'.format(num_results, printable_filesize(archive_size), duration))