diff --git a/archivebox/cli/archivebox_status.py b/archivebox/cli/archivebox_status.py
index e8e91b2d..94f5916c 100644
--- a/archivebox/cli/archivebox_status.py
+++ b/archivebox/cli/archivebox_status.py
@@ -37,46 +37,42 @@ def status(out_dir: Path=DATA_DIR) -> None:
     print(f'    > SQL Main Index: {num_sql_links} links'.ljust(36), f'(found in {CONSTANTS.SQL_INDEX_FILENAME})')
     print(f'    > JSON Link Details: {num_link_details} links'.ljust(36), f'(found in {ARCHIVE_DIR.name}/*/index.json)')
     print()
-    print('[green]\\[*] Scanning archive data directories...[/green]')
+    print('[green]\\[*] Scanning archive data from database...[/green]')
     print(f'[yellow]    {ARCHIVE_DIR}/*[/yellow]')
-    num_bytes, num_dirs, num_files = get_dir_size(ARCHIVE_DIR)
+
+    # Get archive stats from the DB (no filesystem scanning)
+    from django.db.models import Sum, Count
+    from archivebox.core.models import ArchiveResult
+
+    archive_stats = ArchiveResult.objects.filter(status='succeeded').aggregate(
+        total_size=Sum('output_size'),
+        total_results=Count('id'),
+    )
+    num_bytes = archive_stats['total_size'] or 0
+    num_results = archive_stats['total_results'] or 0
     size = printable_filesize(num_bytes)
-    print(f'    Size: {size} across {num_files} files in {num_dirs} directories')
+    print(f'    Size: {size} across {num_results} archive results (from DB)')
 
     # Use DB as source of truth for snapshot status
     num_indexed = links.count()
-    num_archived = links.filter(status='archived').count() or links.exclude(downloaded_at=None).count()
+    num_archived = links.filter(status='sealed').count() or links.exclude(downloaded_at=None).count()
     num_unarchived = links.filter(status='queued').count() or links.filter(downloaded_at=None).count()
     print(f'    > indexed: {num_indexed}'.ljust(36), '(total snapshots in DB)')
     print(f'    > archived: {num_archived}'.ljust(36), '(snapshots with archived content)')
     print(f'    > unarchived: {num_unarchived}'.ljust(36), '(snapshots pending archiving)')
 
-    # Count directories on filesystem
-    num_present = 0
-    orphaned_dirs = []
-    if ARCHIVE_DIR.exists():
-        for entry in ARCHIVE_DIR.iterdir():
-            if entry.is_dir():
-                num_present += 1
-                if not links.filter(timestamp=entry.name).exists():
-                    orphaned_dirs.append(str(entry))
-
-    num_valid = min(num_present, num_indexed)  # approximate
+    # All snapshots are tracked in the DB now, no need to count filesystem dirs
+    num_valid = num_indexed
     print()
-    print(f'    > present: {num_present}'.ljust(36), '(directories in archive/)')
-    print(f'    > [green]valid:[/green] {num_valid}'.ljust(36), ' (directories with matching DB entry)')
+    print(f'    > [green]valid:[/green] {num_valid}'.ljust(36), '(snapshots in database)')
 
-    num_orphaned = len(orphaned_dirs)
-    print(f'    > [red]orphaned:[/red] {num_orphaned}'.ljust(36), ' (directories without matching DB entry)')
+    num_orphaned = 0  # orphan detection would require a filesystem scan; skipped for S3 compatibility
+    print(f'    > [red]orphaned:[/red] {num_orphaned}'.ljust(36), '(orphan detection skipped, requires filesystem scan)')
 
     if num_indexed:
         print('    [violet]Hint:[/violet] You can list snapshots by status like so:')
         print('        [green]archivebox list --status=  (e.g. archived, queued, etc.)[/green]')
 
-    if orphaned_dirs:
-        print('    [violet]Hint:[/violet] To automatically import orphaned data directories into the main index, run:')
-        print('        [green]archivebox init[/green]')
-
     print()
     print('[green]\\[*] Scanning recent archive changes and user logins:[/green]')
     print(f'[yellow]    {CONSTANTS.LOGS_DIR}/*[/yellow]')
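
Note on the aggregate above: Django's Sum() yields None (not 0) when no rows match, which is why both values are guarded with `or 0`. A minimal standalone sketch of the same query pattern, using only the model, status, and field names visible in this diff (the helper name is illustrative):

    from django.db.models import Count, Sum

    from archivebox.core.models import ArchiveResult

    def db_archive_stats() -> tuple[int, int]:
        """Return (total_bytes, num_results) for succeeded results, with no filesystem access."""
        stats = ArchiveResult.objects.filter(status='succeeded').aggregate(
            total_size=Sum('output_size'),      # None when the queryset is empty
            total_results=Count('id'),
        )
        return (stats['total_size'] or 0, stats['total_results'] or 0)
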
diff --git a/archivebox/core/admin_snapshots.py b/archivebox/core/admin_snapshots.py
index 0af36faf..afa31175 100644
--- a/archivebox/core/admin_snapshots.py
+++ b/archivebox/core/admin_snapshots.py
@@ -1,9 +1,6 @@
 __package__ = 'archivebox.core'
 
-import os
-from pathlib import Path
-
 from django.contrib import admin, messages
 from django.urls import path
 from django.utils.html import format_html, mark_safe
 
@@ -363,7 +360,8 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
         # ordering='archiveresult_count'
     )
     def size(self, obj):
-        archive_size = os.access(Path(obj.output_dir) / 'index.html', os.F_OK) and obj.archive_size
+        """Display archive size from DB (no filesystem access)."""
+        archive_size = obj.archive_size
         if archive_size:
             size_txt = printable_filesize(archive_size)
             if archive_size > 52428800:
@@ -442,14 +440,11 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
         description='Size',
     )
     def size_with_stats(self, obj):
-        """Show archive size with output size from archive results."""
+        """Show archive size from DB (no filesystem access)."""
         stats = obj.get_progress_stats()
-        # Use output_size from archive results if available, fallback to disk size
-        output_size = stats['output_size']
-        archive_size = os.access(Path(obj.output_dir) / 'index.html', os.F_OK) and obj.archive_size
-
-        size_bytes = output_size or archive_size or 0
+        # Use output_size from archive results (already aggregated in stats)
+        size_bytes = stats['output_size'] or 0
 
         if size_bytes:
             size_txt = printable_filesize(size_bytes)
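
Note: both admin columns above now derive their size from the DB; obj.archive_size (defined in models.py below) runs one aggregate query per row rendered in the changelist. If that ever shows up as an N+1 hotspot, one possible alternative (not part of this diff; archive_size_db is an illustrative name) is to annotate the whole queryset in a single query:

    from django.db.models import Q, Sum

    from archivebox.core.models import Snapshot

    # One aggregate for the whole page instead of one query per Snapshot row.
    # Assumes the default reverse accessor for ArchiveResult, matching the
    # self.archiveresult_set usage elsewhere in this diff.
    snapshots = Snapshot.objects.annotate(
        archive_size_db=Sum(
            'archiveresult__output_size',
            filter=Q(archiveresult__status='succeeded'),
        ),
    )
    for snapshot in snapshots:
        print(snapshot.pk, snapshot.archive_size_db or 0)  # Sum() is None with no matches
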
diff --git a/archivebox/core/models.py b/archivebox/core/models.py
index b8aa660c..b20ff67b 100755
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -22,7 +22,7 @@ from django.contrib import admin
 from django.conf import settings
 
 from archivebox.config import CONSTANTS
-from archivebox.misc.system import get_dir_size, atomic_write
+from archivebox.misc.system import atomic_write
 from archivebox.misc.util import parse_date, base_url, domain as url_domain, to_json, ts_to_date_str, urlencode, htmlencode, urldecode
 from archivebox.misc.hashing import get_dir_info
 from archivebox.hooks import (
@@ -1345,11 +1345,19 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
         return f'{CONSTANTS.ARCHIVE_DIR_NAME}/{self.timestamp}'
 
     @cached_property
-    def archive_size(self):
-        try:
-            return get_dir_size(self.output_dir)[0]
-        except Exception:
-            return 0
+    def archive_size(self) -> int:
+        """
+        Total size of all archived files for this snapshot.
+        Computed from ArchiveResult.output_size in the DB (no filesystem access).
+        """
+        from django.db.models import Sum
+
+        total = self.archiveresult_set.filter(
+            status='succeeded'
+        ).aggregate(
+            total_size=Sum('output_size')
+        )['total_size']
+        return total or 0
 
     def save_tags(self, tags: Iterable[str] = ()) -> None:
         tags_id = [Tag.objects.get_or_create(name=tag)[0].pk for tag in tags if tag.strip()]
@@ -1904,8 +1912,8 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
 
     def canonical_outputs(self) -> Dict[str, Optional[str]]:
         """
-        Intelligently discover the best output file for each plugin.
-        Uses actual ArchiveResult data and filesystem scanning with smart heuristics.
+        Discover the best output file for each plugin.
+        Uses ArchiveResult.output_files from the DB (no filesystem scanning).
         """
         FAVICON_PROVIDER = 'https://www.google.com/s2/favicons?domain={}'
@@ -1917,36 +1925,25 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
         }
 
         MIN_DISPLAY_SIZE = 15_000  # 15KB - filter out tiny files
-        MAX_SCAN_FILES = 50  # Don't scan massive directories
 
-        def find_best_output_in_dir(dir_path: Path, plugin_name: str) -> Optional[str]:
-            """Find the best representative file in a plugin's output directory"""
-            if not dir_path.exists() or not dir_path.is_dir():
+        def find_best_output_from_files(output_files: dict, plugin_name: str) -> Optional[str]:
+            """Find the best representative file from the output_files dict."""
+            if not output_files:
                 return None
 
             candidates = []
-            file_count = 0
-
-            # Special handling for media plugin - look for thumbnails
             is_media_dir = plugin_name == 'media'
 
-            # Scan for suitable files
-            for file_path in dir_path.rglob('*'):
-                file_count += 1
-                if file_count > MAX_SCAN_FILES:
-                    break
-
-                if file_path.is_dir() or file_path.name.startswith('.'):
+            for rel_path, metadata in output_files.items():
+                if rel_path.startswith('.'):
                     continue
 
-                ext = file_path.suffix.lstrip('.').lower()
+                ext = rel_path.rsplit('.', 1)[-1].lower() if '.' in rel_path else ''
                 if ext not in IFRAME_EMBEDDABLE_EXTENSIONS:
                     continue
 
-                try:
-                    size = file_path.stat().st_size
-                except OSError:
-                    continue
+                # Get size from metadata if available; otherwise assume it is large enough to pass
+                size = metadata.get('size', MIN_DISPLAY_SIZE) if isinstance(metadata, dict) else MIN_DISPLAY_SIZE
 
                 # For media dir, allow smaller image files (thumbnails are often < 15KB)
                 min_size = 5_000 if (is_media_dir and ext in ('png', 'jpg', 'jpeg', 'webp', 'gif')) else MIN_DISPLAY_SIZE
@@ -1955,16 +1952,15 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
                 # Prefer main files: index.html, output.*, content.*, etc.
                 priority = 0
-                name_lower = file_path.name.lower()
+                name_lower = rel_path.lower()
 
                 if is_media_dir:
-                    # Special prioritization for media directories
                     if any(keyword in name_lower for keyword in ('thumb', 'thumbnail', 'cover', 'poster')):
-                        priority = 200  # Highest priority for thumbnails
+                        priority = 200
                     elif ext in ('png', 'jpg', 'jpeg', 'webp', 'gif'):
-                        priority = 150  # High priority for any image
+                        priority = 150
                     elif ext in ('mp4', 'webm', 'mp3', 'opus', 'ogg'):
-                        priority = 100  # Lower priority for actual media files
+                        priority = 100
                     else:
                         priority = 50
                 elif 'index' in name_lower:
@@ -1978,15 +1974,14 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
                 else:
                     priority = 10
 
-                candidates.append((priority, size, file_path))
+                candidates.append((priority, size, rel_path))
 
             if not candidates:
                 return None
 
             # Sort by priority (desc), then size (desc)
             candidates.sort(key=lambda x: (x[0], x[1]), reverse=True)
-            best_file = candidates[0][2]
-            return str(best_file.relative_to(Path(self.output_dir)))
+            return candidates[0][2]
 
         canonical = {
             'index_path': 'index.html',
@@ -1994,52 +1989,26 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
             'archive_org_path': f'https://web.archive.org/web/{self.base_url}',
         }
 
-        # Scan each ArchiveResult's output directory for the best file
-        snap_dir = Path(self.output_dir)
+        # Get the best output from each ArchiveResult using output_files from the DB
         for result in self.archiveresult_set.filter(status='succeeded'):
             if not result.output_files and not result.output_str:
                 continue
 
-            # Try to find the best output file for this plugin
-            plugin_dir = snap_dir / result.plugin
             best_output = None
 
-            # Check output_files first (new field)
+            # Check output_files first (primary source)
             if result.output_files:
-                first_file = next(iter(result.output_files.keys()), None)
-                if first_file and (plugin_dir / first_file).exists():
-                    best_output = f'{result.plugin}/{first_file}'
+                best_file = find_best_output_from_files(result.output_files, result.plugin)
+                if best_file:
+                    best_output = f'{result.plugin}/{best_file}'
 
             # Fallback to output_str if it looks like a path
-            if not best_output and result.output_str and (snap_dir / result.output_str).exists():
+            if not best_output and result.output_str:
                 best_output = result.output_str
 
-            if not best_output and plugin_dir.exists():
-                # Intelligently find the best file in the plugin's directory
-                best_output = find_best_output_in_dir(plugin_dir, result.plugin)
-
             if best_output:
                 canonical[f'{result.plugin}_path'] = best_output
 
-        # Also scan top-level for legacy outputs (backwards compatibility)
-        for file_path in snap_dir.glob('*'):
-            if file_path.is_dir() or file_path.name in ('index.html', 'index.json'):
-                continue
-
-            ext = file_path.suffix.lstrip('.').lower()
-            if ext not in IFRAME_EMBEDDABLE_EXTENSIONS:
-                continue
-
-            try:
-                size = file_path.stat().st_size
-                if size >= MIN_DISPLAY_SIZE:
-                    # Add as generic output with stem as key
-                    key = f'{file_path.stem}_path'
-                    if key not in canonical:
-                        canonical[key] = file_path.name
-            except OSError:
-                continue
-
         if self.is_static:
             static_path = f'warc/{self.timestamp}'
             canonical.update({
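
Note: find_best_output_from_files() is now pure dict processing, so the ranking can be sanity-checked without a database or an archive on disk. A condensed sketch of the same pick-the-best logic, under the same assumptions the method makes (output_files maps relative paths to optional metadata dicts with a 'size' key; the extension allowlist and priority tiers here are toy stand-ins for IFRAME_EMBEDDABLE_EXTENSIONS and the full tier table):

    from typing import Optional

    EMBEDDABLE_EXTS = {'html', 'pdf', 'png', 'jpg'}  # toy stand-in for the real allowlist
    MIN_DISPLAY_SIZE = 15_000

    def pick_best(output_files: dict) -> Optional[str]:
        candidates = []
        for rel_path, metadata in output_files.items():
            ext = rel_path.rsplit('.', 1)[-1].lower() if '.' in rel_path else ''
            if rel_path.startswith('.') or ext not in EMBEDDABLE_EXTS:
                continue
            # Unknown sizes default to MIN_DISPLAY_SIZE so they pass the filter, as above
            size = metadata.get('size', MIN_DISPLAY_SIZE) if isinstance(metadata, dict) else MIN_DISPLAY_SIZE
            if size < MIN_DISPLAY_SIZE:
                continue
            priority = 100 if 'index' in rel_path.lower() else 10  # simplified tiers
            candidates.append((priority, size, rel_path))
        # Highest (priority, size) wins, mirroring the sort in the real method
        return max(candidates, default=(0, 0, None))[2]

    assert pick_best({'index.html': {'size': 64_000}, 'page.pdf': {'size': 20_000}}) == 'index.html'
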
diff --git a/archivebox/core/views.py b/archivebox/core/views.py
index f0410846..6666fb18 100644
--- a/archivebox/core/views.py
+++ b/archivebox/core/views.py
@@ -54,6 +54,7 @@ class SnapshotView(View):
 
     @staticmethod
     def render_live_index(request, snapshot):
+        """Render the live index page using DB data (no filesystem access)."""
        TITLE_LOADING_MSG = 'Not yet archived...'
 
         # Dict of plugin -> ArchiveResult object
@@ -61,37 +62,33 @@ class SnapshotView(View):
         # Dict of plugin -> result info dict (for template compatibility)
         archiveresults = {}
 
-        results = snapshot.archiveresult_set.all()
+        # Get succeeded results with output files from the DB
+        results = snapshot.archiveresult_set.filter(status='succeeded')
 
         for result in results:
             embed_path = result.embed_path()
-            abs_path = result.snapshot_dir / (embed_path or 'None')
 
-            if (result.status == 'succeeded'
-                and embed_path
-                and os.access(abs_path, os.R_OK)
-                and abs_path.exists()):
-                if os.path.isdir(abs_path) and not any(abs_path.glob('*.*')):
-                    continue
+            # Check if the result has any output files (from the DB, not the filesystem)
+            if not embed_path or not (result.output_files or result.output_str):
+                continue
 
-                # Store the full ArchiveResult object for template tags
-                archiveresult_objects[result.plugin] = result
+            # Store the full ArchiveResult object for template tags
+            archiveresult_objects[result.plugin] = result
 
-                result_info = {
-                    'name': result.plugin,
-                    'path': embed_path,
-                    'ts': ts_to_date_str(result.end_ts),
-                    'size': abs_path.stat().st_size or '?',
-                    'result': result,  # Include the full object for template tags
-                }
-                archiveresults[result.plugin] = result_info
+            # Get size from the output_size field (DB) instead of stat()
+            result_info = {
+                'name': result.plugin,
+                'path': embed_path,
+                'ts': ts_to_date_str(result.end_ts),
+                'size': result.output_size or '?',
+                'result': result,  # Include the full object for template tags
+            }
+            archiveresults[result.plugin] = result_info
 
-        # Use canonical_outputs for intelligent discovery
-        # This method now scans ArchiveResults and uses smart heuristics
+        # Use canonical_outputs for intelligent discovery (now backed by the DB, not the filesystem)
         canonical = snapshot.canonical_outputs()
 
-        # Add any newly discovered outputs from canonical_outputs to archiveresults
-        snap_dir = Path(snapshot.output_dir)
+        # Add any outputs from canonical_outputs not already in archiveresults
         for key, path in canonical.items():
             if not key.endswith('_path') or not path or path.startswith('http'):
                 continue
@@ -100,22 +97,16 @@ class SnapshotView(View):
             if plugin_name in archiveresults:
                 continue  # Already have this from ArchiveResult
 
-            file_path = snap_dir / path
-            if not file_path.exists() or not file_path.is_file():
-                continue
-
-            try:
-                file_size = file_path.stat().st_size
-                if file_size >= 15_000:  # Only show files > 15KB
-                    archiveresults[plugin_name] = {
-                        'name': plugin_name,
-                        'path': path,
-                        'ts': ts_to_date_str(file_path.stat().st_mtime or 0),
-                        'size': file_size,
-                        'result': None,
-                    }
-            except OSError:
-                continue
+            # For canonical outputs not from an ArchiveResult, add with minimal info
+            # (derived from output_files/output_str in the DB; existence is not re-verified)
+            if plugin_name not in ('index', 'google_favicon', 'archive_org'):
+                archiveresults[plugin_name] = {
+                    'name': plugin_name,
+                    'path': path,
+                    'ts': '',
+                    'size': '?',
+                    'result': None,
+                }
 
         # Get available extractor plugins from hooks (sorted by numeric prefix for ordering)
         # Convert to base names for display ordering
@@ -131,10 +122,8 @@ class SnapshotView(View):
 
         snapshot_info = snapshot.to_dict(extended=True)
 
-        try:
-            warc_path = 'warc/' + list(Path(snap_dir).glob('warc/*.warc.*'))[0].name
-        except IndexError:
-            warc_path = 'warc/'
+        # Get the warc path from canonical outputs (DB) instead of a filesystem glob
+        warc_path = canonical.get('wget_path', 'warc/')
 
         context = {
             **snapshot_info,
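
Note: the template-facing shape is unchanged by this diff; only the data source moved from stat() calls to DB fields. Each archiveresults entry still carries the same keys, roughly as follows (values shown are illustrative):

    # Illustrative entry, mirroring the dicts built in render_live_index() above:
    result_info = {
        'name': 'wget',                   # plugin name
        'path': 'wget/example.com.html',  # embed path relative to the snapshot dir
        'ts': '2024-01-01 12:00',         # formatted end_ts; '' for canonical-only outputs
        'size': 123_456,                  # ArchiveResult.output_size, or '?' when unknown
        'result': None,                   # the full ArchiveResult object when available
    }
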
diff --git a/archivebox/misc/logging_util.py b/archivebox/misc/logging_util.py
index 547b3b68..980ba57f 100644
--- a/archivebox/misc/logging_util.py
+++ b/archivebox/misc/logging_util.py
@@ -25,7 +25,6 @@ from django.core.management.base import DjangoHelpFormatter
 from archivebox.config import CONSTANTS, DATA_DIR, VERSION
 from archivebox.config.common import SHELL_CONFIG
 
-from archivebox.misc.system import get_dir_size
 from archivebox.misc.util import enforce_types
 from archivebox.misc.logging import ANSI, stderr
 
@@ -312,14 +311,13 @@ def log_snapshot_archiving_finished(snapshot: "Snapshot", out_dir: str, is_new:
     else:
         _LAST_RUN_STATS.succeeded += 1
 
-    try:
-        size = get_dir_size(out_dir)
-    except FileNotFoundError:
-        size = (0, None, '0')
+    # Get archive size from DB instead of filesystem
+    archive_size = snapshot.archive_size
+    num_results = snapshot.archiveresult_set.filter(status='succeeded').count()
 
     end_ts = datetime.now(timezone.utc)
     duration = str(end_ts - start_ts).split('.')[0]
-    print('        [bright_black]{} files ({}) in {}s [/]'.format(size[2], printable_filesize(size[0]), duration))
+    print('        [bright_black]{} results ({}) in {}s [/]'.format(num_results, printable_filesize(archive_size), duration))
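
Note: every statistic in this diff silently excludes ArchiveResult rows whose output_size was never populated, since they contribute NULL to Sum(). For legacy rows created before output_size was recorded, a hypothetical one-off backfill (not part of this diff, and assuming output_size is nullable and each result has a snapshot FK) could reuse the get_dir_size() helper that the hot paths above no longer import; it assumes each result's files live under <snapshot.output_dir>/<plugin>/, the same layout canonical_outputs() relied on before this change:

    from pathlib import Path

    from archivebox.core.models import ArchiveResult
    from archivebox.misc.system import get_dir_size  # acceptable for a one-off local script

    # Hypothetical backfill so old rows show up in the DB-based stats above.
    for result in ArchiveResult.objects.filter(status='succeeded', output_size__isnull=True):
        plugin_dir = Path(result.snapshot.output_dir) / result.plugin
        if plugin_dir.is_dir():
            # get_dir_size() returns (num_bytes, num_dirs, num_files)
            result.output_size = get_dir_size(plugin_dir)[0]
            result.save(update_fields=['output_size'])
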