"""
|
|
Folder status and integrity checking utilities for ArchiveBox.
|
|
"""
|
|
|
|
__package__ = 'archivebox.misc'

import os
import json
import shutil

from pathlib import Path
from itertools import chain
from typing import Dict, Optional, List, Tuple, TYPE_CHECKING

from django.db.models import QuerySet

from archivebox.config import DATA_DIR, CONSTANTS
from archivebox.misc.util import enforce_types

if TYPE_CHECKING:
    from core.models import Snapshot


def _is_valid_snapshot(snapshot: 'Snapshot') -> bool:
    """Check if a snapshot's data directory is valid"""
    output_dir = Path(snapshot.output_dir)
    index_path = output_dir / 'index.json'

    if not output_dir.exists() or not index_path.exists():
        return False

    try:
        with open(index_path, 'r') as f:
            data = json.load(f)
        # the on-disk index must point at the same URL as the DB record
        return snapshot.url == data.get('url')
    except Exception:
        # an unreadable or unparseable index.json means the dir is not valid
        return False
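
# For reference, a valid snapshot data directory is laid out roughly like this
# (a sketch; the exact extractor output files vary by configuration):
#
#     archive/<timestamp>/index.json    # JSON index with at least {"url": ...}
#     archive/<timestamp>/...           # extractor outputs (wget, screenshot, etc.)
#
# _is_valid_snapshot() only verifies that the dir and index.json exist and that
# the index's "url" matches the Snapshot row; it does not inspect the outputs.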


def _is_corrupt_snapshot(snapshot: 'Snapshot') -> bool:
    """Check if a snapshot's data directory is corrupted"""
    if not Path(snapshot.output_dir).exists():
        # a missing data dir is merely unarchived, not corrupted
        return False
    return not _is_valid_snapshot(snapshot)


def get_indexed_folders(snapshots: QuerySet, out_dir: Path = DATA_DIR) -> Dict[str, 'Snapshot']:
    """indexed snapshots without checking archive status or data directory validity"""
    return {
        snapshot.output_dir: snapshot
        for snapshot in snapshots.iterator(chunk_size=500)
    }


def get_archived_folders(snapshots: QuerySet, out_dir: Path = DATA_DIR) -> Dict[str, 'Snapshot']:
    """indexed snapshots that are archived with a valid data directory"""
    return {
        snapshot.output_dir: snapshot
        for snapshot in snapshots.iterator(chunk_size=500)
        if snapshot.is_archived
    }


def get_unarchived_folders(snapshots: QuerySet, out_dir: Path = DATA_DIR) -> Dict[str, 'Snapshot']:
    """indexed snapshots that are unarchived with no data directory or an empty data directory"""
    return {
        snapshot.output_dir: snapshot
        for snapshot in snapshots.iterator(chunk_size=500)
        if not snapshot.is_archived
    }
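
# A minimal usage sketch for the three getters above (assumes Django is set up
# and this runs inside an initialized ArchiveBox data directory):
#
#     from core.models import Snapshot
#     snapshots = Snapshot.objects.all()
#     indexed = get_indexed_folders(snapshots)        # every indexed snapshot
#     archived = get_archived_folders(snapshots)      # subset with is_archived
#     unarchived = get_unarchived_folders(snapshots)  # subset without archive data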


def get_present_folders(snapshots: QuerySet, out_dir: Path = DATA_DIR) -> Dict[str, Optional['Snapshot']]:
    """dirs that actually exist in the archive/ folder"""
    from core.models import Snapshot

    all_folders = {}
    for entry in (out_dir / CONSTANTS.ARCHIVE_DIR_NAME).iterdir():
        if entry.is_dir():
            try:
                snapshot = Snapshot.objects.get(timestamp=entry.name)
            except Snapshot.DoesNotExist:
                snapshot = None
            all_folders[entry.name] = snapshot
    return all_folders
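
# The mapping returned above is keyed by directory name (i.e. the snapshot
# timestamp), with None for dirs that have no matching Snapshot row, e.g.
# (hypothetical values):
#
#     {'1700000000.0': <Snapshot: https://example.com>, '1700000001.0': None}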


def get_valid_folders(snapshots: QuerySet, out_dir: Path = DATA_DIR) -> Dict[str, 'Snapshot']:
    """dirs with a valid index matched to the main index and archived content"""
    return {
        snapshot.output_dir: snapshot
        for snapshot in snapshots.iterator(chunk_size=500)
        if _is_valid_snapshot(snapshot)
    }


def get_invalid_folders(snapshots: QuerySet, out_dir: Path = DATA_DIR) -> Dict[str, Optional['Snapshot']]:
    """dirs that are invalid for any reason: corrupted/duplicate/orphaned/unrecognized"""
    duplicate = get_duplicate_folders(snapshots, out_dir=out_dir)
    orphaned = get_orphaned_folders(snapshots, out_dir=out_dir)
    corrupted = get_corrupted_folders(snapshots, out_dir=out_dir)
    unrecognized = get_unrecognized_folders(snapshots, out_dir=out_dir)
    return {**duplicate, **orphaned, **corrupted, **unrecognized}
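
# Sketch of a simple integrity report built on get_invalid_folders(); the
# helper below is hypothetical, not part of this module:
#
#     def print_integrity_report(snapshots):
#         for path, snapshot in get_invalid_folders(snapshots).items():
#             url = snapshot.url if snapshot else '(no matching Snapshot)'
#             print(f'INVALID: {path} {url}')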


def get_duplicate_folders(snapshots: QuerySet, out_dir: Path = DATA_DIR) -> Dict[str, Optional['Snapshot']]:
    """dirs that conflict with other directories that have the same URL or timestamp"""
    from core.models import Snapshot as SnapshotModel

    by_url: Dict[str, int] = {}
    by_timestamp: Dict[str, int] = {}
    duplicate_folders: Dict[str, Optional['Snapshot']] = {}

    # loose data dirs whose timestamp doesn't match any snapshot in the queryset
    data_folders = (
        str(entry)
        for entry in CONSTANTS.ARCHIVE_DIR.iterdir()
        if entry.is_dir() and not snapshots.filter(timestamp=entry.name).exists()
    )

    for item in chain(snapshots.iterator(chunk_size=500), data_folders):
        if isinstance(item, str):
            # a loose folder: try to resolve it to a Snapshot by its timestamp name
            path = item
            timestamp = Path(path).name
            try:
                snapshot = SnapshotModel.objects.get(timestamp=timestamp)
            except SnapshotModel.DoesNotExist:
                snapshot = None
        else:
            snapshot = item
            path = snapshot.output_dir

        if snapshot:
            # count occurrences; every occurrence after the first is a duplicate
            by_timestamp[snapshot.timestamp] = by_timestamp.get(snapshot.timestamp, 0) + 1
            if by_timestamp[snapshot.timestamp] > 1:
                duplicate_folders[path] = snapshot

            by_url[snapshot.url] = by_url.get(snapshot.url, 0) + 1
            if by_url[snapshot.url] > 1:
                duplicate_folders[path] = snapshot

    return duplicate_folders
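
# Note on the counting above: the first folder seen for a given URL/timestamp is
# never flagged, so N conflicting folders yield N-1 entries here; which folder
# "wins" depends on iteration order (DB rows are visited before loose dirs).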


def get_orphaned_folders(snapshots: QuerySet, out_dir: Path = DATA_DIR) -> Dict[str, Optional['Snapshot']]:
    """dirs that contain a valid index but aren't listed in the main index"""
    orphaned_folders: Dict[str, Optional['Snapshot']] = {}

    for entry in CONSTANTS.ARCHIVE_DIR.iterdir():
        if entry.is_dir():
            index_path = entry / "index.json"
            if index_path.exists() and not snapshots.filter(timestamp=entry.name).exists():
                orphaned_folders[str(entry)] = None
    return orphaned_folders


def get_corrupted_folders(snapshots: QuerySet, out_dir: Path = DATA_DIR) -> Dict[str, 'Snapshot']:
    """dirs that exist but have corrupted/invalid index files"""
    corrupted: Dict[str, 'Snapshot'] = {}
    for snapshot in snapshots.iterator(chunk_size=500):
        if _is_corrupt_snapshot(snapshot):
            corrupted[snapshot.output_dir] = snapshot
    return corrupted


def get_unrecognized_folders(snapshots: QuerySet, out_dir: Path = DATA_DIR) -> Dict[str, None]:
    """dirs that don't contain recognizable archive data and aren't listed in the main index"""
    unrecognized_folders: Dict[str, None] = {}

    for entry in (Path(out_dir) / CONSTANTS.ARCHIVE_DIR_NAME).iterdir():
        if entry.is_dir():
            index_path = entry / "index.json"
            if index_path.exists():
                # an index.json that can't be parsed makes the dir unrecognizable
                try:
                    with open(index_path, 'r') as f:
                        json.load(f)
                except Exception:
                    unrecognized_folders[str(entry)] = None
            else:
                # no index.json at all: unrecognized unless the main index knows it
                if not snapshots.filter(timestamp=entry.name).exists():
                    unrecognized_folders[str(entry)] = None
    return unrecognized_folders
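
# How the two "not in the main index" checks differ: get_orphaned_folders()
# flags any dir that has an index.json but no matching row in the main index,
# while get_unrecognized_folders() flags dirs whose index.json is unparseable,
# or missing entirely on a dir the main index doesn't know about.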


@enforce_types
def fix_invalid_folder_locations(out_dir: Path = DATA_DIR) -> Tuple[List[str], List[str]]:
    """Move folders to their correct timestamp-named locations based on index.json"""
    fixed: List[str] = []
    cant_fix: List[str] = []
    for entry in os.scandir(out_dir / CONSTANTS.ARCHIVE_DIR_NAME):
        if not entry.is_dir(follow_symlinks=True):
            continue

        index_path = Path(entry.path) / 'index.json'
        if not index_path.exists():
            # without an index.json there is no way to tell where the dir belongs
            continue

        try:
            with open(index_path, 'r') as f:
                data = json.load(f)
            timestamp = data.get('timestamp')
        except Exception:
            continue

        if not timestamp:
            continue

        if not entry.path.endswith(f'/{timestamp}'):
            dest = out_dir / CONSTANTS.ARCHIVE_DIR_NAME / timestamp
            if dest.exists():
                # destination already taken by another snapshot: needs manual attention
                cant_fix.append(entry.path)
            else:
                shutil.move(entry.path, str(dest))
                fixed.append(str(dest))
    return fixed, cant_fix
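
# A minimal usage sketch (assumes an initialized ArchiveBox data directory;
# the counts printed are illustrative):
#
#     fixed, cant_fix = fix_invalid_folder_locations()
#     print(f'moved {len(fixed)} dirs, {len(cant_fix)} conflicts need attention')
#
# Dirs whose correct timestamp-named destination already exists are left in
# place (reported in cant_fix) rather than merged automatically.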