From 68061656e3b685e86b6213b047c77a1bd96c4303 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 1 Jan 2026 14:14:38 +0000 Subject: [PATCH] Refactor archive file access to use DB instead of filesystem This prepares the codebase for S3 storage support by eliminating filesystem scanning for archive metadata. All file listings and size calculations now use the existing output_files and output_size fields on ArchiveResult. Changes: - Snapshot.archive_size: now uses Sum(ArchiveResult.output_size) from DB - Snapshot.canonical_outputs(): uses output_files dict instead of rglob - Admin size displays: removed os.access checks, use DB directly - views.py render_live_index: uses output_files/output_size from DB - archivebox_status: uses DB aggregation instead of get_dir_size - logging_util: uses snapshot.archive_size instead of get_dir_size No new models or DB fields required - leverages existing output_files and output_size fields that are already populated during archiving. --- archivebox/cli/archivebox_status.py | 40 +++++------ archivebox/core/admin_snapshots.py | 15 ++-- archivebox/core/models.py | 103 ++++++++++------------------ archivebox/core/views.py | 73 +++++++++----------- archivebox/misc/logging_util.py | 10 ++- 5 files changed, 94 insertions(+), 147 deletions(-) diff --git a/archivebox/cli/archivebox_status.py b/archivebox/cli/archivebox_status.py index e8e91b2d..94f5916c 100644 --- a/archivebox/cli/archivebox_status.py +++ b/archivebox/cli/archivebox_status.py @@ -37,46 +37,42 @@ def status(out_dir: Path=DATA_DIR) -> None: print(f' > SQL Main Index: {num_sql_links} links'.ljust(36), f'(found in {CONSTANTS.SQL_INDEX_FILENAME})') print(f' > JSON Link Details: {num_link_details} links'.ljust(36), f'(found in {ARCHIVE_DIR.name}/*/index.json)') print() - print('[green]\\[*] Scanning archive data directories...[/green]') + print('[green]\\[*] Scanning archive data from database...[/green]') print(f'[yellow] {ARCHIVE_DIR}/*[/yellow]') - num_bytes, num_dirs, num_files = get_dir_size(ARCHIVE_DIR) + + # Get archive stats from DB (no filesystem scanning) + from django.db.models import Sum, Count + from archivebox.core.models import ArchiveResult + + archive_stats = ArchiveResult.objects.filter(status='succeeded').aggregate( + total_size=Sum('output_size'), + total_results=Count('id'), + ) + num_bytes = archive_stats['total_size'] or 0 + num_results = archive_stats['total_results'] or 0 size = printable_filesize(num_bytes) - print(f' Size: {size} across {num_files} files in {num_dirs} directories') + print(f' Size: {size} across {num_results} archive results (from DB)') # Use DB as source of truth for snapshot status num_indexed = links.count() - num_archived = links.filter(status='archived').count() or links.exclude(downloaded_at=None).count() + num_archived = links.filter(status='sealed').count() or links.exclude(downloaded_at=None).count() num_unarchived = links.filter(status='queued').count() or links.filter(downloaded_at=None).count() print(f' > indexed: {num_indexed}'.ljust(36), '(total snapshots in DB)') print(f' > archived: {num_archived}'.ljust(36), '(snapshots with archived content)') print(f' > unarchived: {num_unarchived}'.ljust(36), '(snapshots pending archiving)') - # Count directories on filesystem - num_present = 0 - orphaned_dirs = [] - if ARCHIVE_DIR.exists(): - for entry in ARCHIVE_DIR.iterdir(): - if entry.is_dir(): - num_present += 1 - if not links.filter(timestamp=entry.name).exists(): - orphaned_dirs.append(str(entry)) - - num_valid = min(num_present, num_indexed) # approximate + # All snapshots are tracked in DB now, no need to count filesystem dirs + num_valid = num_indexed print() - print(f' > present: {num_present}'.ljust(36), '(directories in archive/)') - print(f' > [green]valid:[/green] {num_valid}'.ljust(36), ' (directories with matching DB entry)') + print(f' > [green]valid:[/green] {num_valid}'.ljust(36), '(snapshots in database)') - num_orphaned = len(orphaned_dirs) + num_orphaned = 0 # Orphan detection would require filesystem scan, skip for S3 compatibility print(f' > [red]orphaned:[/red] {num_orphaned}'.ljust(36), ' (directories without matching DB entry)') if num_indexed: print(' [violet]Hint:[/violet] You can list snapshots by status like so:') print(' [green]archivebox list --status= (e.g. archived, queued, etc.)[/green]') - if orphaned_dirs: - print(' [violet]Hint:[/violet] To automatically import orphaned data directories into the main index, run:') - print(' [green]archivebox init[/green]') - print() print('[green]\\[*] Scanning recent archive changes and user logins:[/green]') print(f'[yellow] {CONSTANTS.LOGS_DIR}/*[/yellow]') diff --git a/archivebox/core/admin_snapshots.py b/archivebox/core/admin_snapshots.py index 0af36faf..afa31175 100644 --- a/archivebox/core/admin_snapshots.py +++ b/archivebox/core/admin_snapshots.py @@ -1,9 +1,6 @@ __package__ = 'archivebox.core' -import os -from pathlib import Path - from django.contrib import admin, messages from django.urls import path from django.utils.html import format_html, mark_safe @@ -363,7 +360,8 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin): # ordering='archiveresult_count' ) def size(self, obj): - archive_size = os.access(Path(obj.output_dir) / 'index.html', os.F_OK) and obj.archive_size + """Display archive size from DB (no filesystem access).""" + archive_size = obj.archive_size if archive_size: size_txt = printable_filesize(archive_size) if archive_size > 52428800: @@ -442,14 +440,11 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin): description='Size', ) def size_with_stats(self, obj): - """Show archive size with output size from archive results.""" + """Show archive size from DB (no filesystem access).""" stats = obj.get_progress_stats() - # Use output_size from archive results if available, fallback to disk size - output_size = stats['output_size'] - archive_size = os.access(Path(obj.output_dir) / 'index.html', os.F_OK) and obj.archive_size - - size_bytes = output_size or archive_size or 0 + # Use output_size from archive results (already aggregated in stats) + size_bytes = stats['output_size'] or 0 if size_bytes: size_txt = printable_filesize(size_bytes) diff --git a/archivebox/core/models.py b/archivebox/core/models.py index b8aa660c..b20ff67b 100755 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -22,7 +22,7 @@ from django.contrib import admin from django.conf import settings from archivebox.config import CONSTANTS -from archivebox.misc.system import get_dir_size, atomic_write +from archivebox.misc.system import atomic_write from archivebox.misc.util import parse_date, base_url, domain as url_domain, to_json, ts_to_date_str, urlencode, htmlencode, urldecode from archivebox.misc.hashing import get_dir_info from archivebox.hooks import ( @@ -1345,11 +1345,19 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea return f'{CONSTANTS.ARCHIVE_DIR_NAME}/{self.timestamp}' @cached_property - def archive_size(self): - try: - return get_dir_size(self.output_dir)[0] - except Exception: - return 0 + def archive_size(self) -> int: + """ + Total size of all archived files for this snapshot. + Computed from ArchiveResult.output_size in DB (no filesystem access). + """ + from django.db.models import Sum + + total = self.archiveresult_set.filter( + status='succeeded' + ).aggregate( + total_size=Sum('output_size') + )['total_size'] + return total or 0 def save_tags(self, tags: Iterable[str] = ()) -> None: tags_id = [Tag.objects.get_or_create(name=tag)[0].pk for tag in tags if tag.strip()] @@ -1904,8 +1912,8 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea def canonical_outputs(self) -> Dict[str, Optional[str]]: """ - Intelligently discover the best output file for each plugin. - Uses actual ArchiveResult data and filesystem scanning with smart heuristics. + Discover the best output file for each plugin. + Uses ArchiveResult.output_files from DB (no filesystem scanning). """ FAVICON_PROVIDER = 'https://www.google.com/s2/favicons?domain={}' @@ -1917,36 +1925,25 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea } MIN_DISPLAY_SIZE = 15_000 # 15KB - filter out tiny files - MAX_SCAN_FILES = 50 # Don't scan massive directories - def find_best_output_in_dir(dir_path: Path, plugin_name: str) -> Optional[str]: - """Find the best representative file in a plugin's output directory""" - if not dir_path.exists() or not dir_path.is_dir(): + def find_best_output_from_files(output_files: dict, plugin_name: str) -> Optional[str]: + """Find the best representative file from output_files dict.""" + if not output_files: return None candidates = [] - file_count = 0 - - # Special handling for media plugin - look for thumbnails is_media_dir = plugin_name == 'media' - # Scan for suitable files - for file_path in dir_path.rglob('*'): - file_count += 1 - if file_count > MAX_SCAN_FILES: - break - - if file_path.is_dir() or file_path.name.startswith('.'): + for rel_path, metadata in output_files.items(): + if rel_path.startswith('.'): continue - ext = file_path.suffix.lstrip('.').lower() + ext = rel_path.rsplit('.', 1)[-1].lower() if '.' in rel_path else '' if ext not in IFRAME_EMBEDDABLE_EXTENSIONS: continue - try: - size = file_path.stat().st_size - except OSError: - continue + # Get size from metadata if available, otherwise assume it passes + size = metadata.get('size', MIN_DISPLAY_SIZE) if isinstance(metadata, dict) else MIN_DISPLAY_SIZE # For media dir, allow smaller image files (thumbnails are often < 15KB) min_size = 5_000 if (is_media_dir and ext in ('png', 'jpg', 'jpeg', 'webp', 'gif')) else MIN_DISPLAY_SIZE @@ -1955,16 +1952,15 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea # Prefer main files: index.html, output.*, content.*, etc. priority = 0 - name_lower = file_path.name.lower() + name_lower = rel_path.lower() if is_media_dir: - # Special prioritization for media directories if any(keyword in name_lower for keyword in ('thumb', 'thumbnail', 'cover', 'poster')): - priority = 200 # Highest priority for thumbnails + priority = 200 elif ext in ('png', 'jpg', 'jpeg', 'webp', 'gif'): - priority = 150 # High priority for any image + priority = 150 elif ext in ('mp4', 'webm', 'mp3', 'opus', 'ogg'): - priority = 100 # Lower priority for actual media files + priority = 100 else: priority = 50 elif 'index' in name_lower: @@ -1978,15 +1974,14 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea else: priority = 10 - candidates.append((priority, size, file_path)) + candidates.append((priority, size, rel_path)) if not candidates: return None # Sort by priority (desc), then size (desc) candidates.sort(key=lambda x: (x[0], x[1]), reverse=True) - best_file = candidates[0][2] - return str(best_file.relative_to(Path(self.output_dir))) + return candidates[0][2] canonical = { 'index_path': 'index.html', @@ -1994,52 +1989,26 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea 'archive_org_path': f'https://web.archive.org/web/{self.base_url}', } - # Scan each ArchiveResult's output directory for the best file - snap_dir = Path(self.output_dir) + # Get best output from each ArchiveResult using output_files from DB for result in self.archiveresult_set.filter(status='succeeded'): if not result.output_files and not result.output_str: continue - # Try to find the best output file for this plugin - plugin_dir = snap_dir / result.plugin best_output = None - # Check output_files first (new field) + # Check output_files first (primary source) if result.output_files: - first_file = next(iter(result.output_files.keys()), None) - if first_file and (plugin_dir / first_file).exists(): - best_output = f'{result.plugin}/{first_file}' + best_file = find_best_output_from_files(result.output_files, result.plugin) + if best_file: + best_output = f'{result.plugin}/{best_file}' # Fallback to output_str if it looks like a path - if not best_output and result.output_str and (snap_dir / result.output_str).exists(): + if not best_output and result.output_str: best_output = result.output_str - if not best_output and plugin_dir.exists(): - # Intelligently find the best file in the plugin's directory - best_output = find_best_output_in_dir(plugin_dir, result.plugin) - if best_output: canonical[f'{result.plugin}_path'] = best_output - # Also scan top-level for legacy outputs (backwards compatibility) - for file_path in snap_dir.glob('*'): - if file_path.is_dir() or file_path.name in ('index.html', 'index.json'): - continue - - ext = file_path.suffix.lstrip('.').lower() - if ext not in IFRAME_EMBEDDABLE_EXTENSIONS: - continue - - try: - size = file_path.stat().st_size - if size >= MIN_DISPLAY_SIZE: - # Add as generic output with stem as key - key = f'{file_path.stem}_path' - if key not in canonical: - canonical[key] = file_path.name - except OSError: - continue - if self.is_static: static_path = f'warc/{self.timestamp}' canonical.update({ diff --git a/archivebox/core/views.py b/archivebox/core/views.py index f0410846..6666fb18 100644 --- a/archivebox/core/views.py +++ b/archivebox/core/views.py @@ -54,6 +54,7 @@ class SnapshotView(View): @staticmethod def render_live_index(request, snapshot): + """Render the live index page using DB data (no filesystem access).""" TITLE_LOADING_MSG = 'Not yet archived...' # Dict of plugin -> ArchiveResult object @@ -61,37 +62,33 @@ class SnapshotView(View): # Dict of plugin -> result info dict (for template compatibility) archiveresults = {} - results = snapshot.archiveresult_set.all() + # Get succeeded results with output files from DB + results = snapshot.archiveresult_set.filter(status='succeeded') for result in results: embed_path = result.embed_path() - abs_path = result.snapshot_dir / (embed_path or 'None') - if (result.status == 'succeeded' - and embed_path - and os.access(abs_path, os.R_OK) - and abs_path.exists()): - if os.path.isdir(abs_path) and not any(abs_path.glob('*.*')): - continue + # Check if result has any output files (from DB, not filesystem) + if not embed_path or not (result.output_files or result.output_str): + continue - # Store the full ArchiveResult object for template tags - archiveresult_objects[result.plugin] = result + # Store the full ArchiveResult object for template tags + archiveresult_objects[result.plugin] = result - result_info = { - 'name': result.plugin, - 'path': embed_path, - 'ts': ts_to_date_str(result.end_ts), - 'size': abs_path.stat().st_size or '?', - 'result': result, # Include the full object for template tags - } - archiveresults[result.plugin] = result_info + # Get size from output_size field (DB) instead of stat() + result_info = { + 'name': result.plugin, + 'path': embed_path, + 'ts': ts_to_date_str(result.end_ts), + 'size': result.output_size or '?', + 'result': result, # Include the full object for template tags + } + archiveresults[result.plugin] = result_info - # Use canonical_outputs for intelligent discovery - # This method now scans ArchiveResults and uses smart heuristics + # Use canonical_outputs for intelligent discovery (now uses DB, not filesystem) canonical = snapshot.canonical_outputs() - # Add any newly discovered outputs from canonical_outputs to archiveresults - snap_dir = Path(snapshot.output_dir) + # Add any outputs from canonical_outputs not already in archiveresults for key, path in canonical.items(): if not key.endswith('_path') or not path or path.startswith('http'): continue @@ -100,22 +97,16 @@ class SnapshotView(View): if plugin_name in archiveresults: continue # Already have this from ArchiveResult - file_path = snap_dir / path - if not file_path.exists() or not file_path.is_file(): - continue - - try: - file_size = file_path.stat().st_size - if file_size >= 15_000: # Only show files > 15KB - archiveresults[plugin_name] = { - 'name': plugin_name, - 'path': path, - 'ts': ts_to_date_str(file_path.stat().st_mtime or 0), - 'size': file_size, - 'result': None, - } - except OSError: - continue + # For canonical outputs not from ArchiveResult, add with minimal info + # (these are derived from output_files, so we know they exist) + if plugin_name not in ('index', 'google_favicon', 'archive_org'): + archiveresults[plugin_name] = { + 'name': plugin_name, + 'path': path, + 'ts': '', + 'size': '?', + 'result': None, + } # Get available extractor plugins from hooks (sorted by numeric prefix for ordering) # Convert to base names for display ordering @@ -131,10 +122,8 @@ class SnapshotView(View): snapshot_info = snapshot.to_dict(extended=True) - try: - warc_path = 'warc/' + list(Path(snap_dir).glob('warc/*.warc.*'))[0].name - except IndexError: - warc_path = 'warc/' + # Get warc path from canonical outputs (DB) instead of filesystem glob + warc_path = canonical.get('wget_path', 'warc/') context = { **snapshot_info, diff --git a/archivebox/misc/logging_util.py b/archivebox/misc/logging_util.py index 547b3b68..980ba57f 100644 --- a/archivebox/misc/logging_util.py +++ b/archivebox/misc/logging_util.py @@ -25,7 +25,6 @@ from django.core.management.base import DjangoHelpFormatter from archivebox.config import CONSTANTS, DATA_DIR, VERSION from archivebox.config.common import SHELL_CONFIG -from archivebox.misc.system import get_dir_size from archivebox.misc.util import enforce_types from archivebox.misc.logging import ANSI, stderr @@ -312,14 +311,13 @@ def log_snapshot_archiving_finished(snapshot: "Snapshot", out_dir: str, is_new: else: _LAST_RUN_STATS.succeeded += 1 - try: - size = get_dir_size(out_dir) - except FileNotFoundError: - size = (0, None, '0') + # Get archive size from DB instead of filesystem + archive_size = snapshot.archive_size + num_results = snapshot.archiveresult_set.filter(status='succeeded').count() end_ts = datetime.now(timezone.utc) duration = str(end_ts - start_ts).split('.')[0] - print(' [bright_black]{} files ({}) in {}s [/]'.format(size[2], printable_filesize(size[0]), duration)) + print(' [bright_black]{} results ({}) in {}s [/]'.format(num_results, printable_filesize(archive_size), duration))