wip major changes

This commit is contained in:
Nick Sweeting
2025-12-24 20:09:51 -08:00
parent c1335fed37
commit 1915333b81
450 changed files with 35814 additions and 19015 deletions

View File

@@ -54,7 +54,7 @@ def check_data_folder() -> None:
def check_migrations():
from archivebox import DATA_DIR
from ..index.sql import list_migrations
from archivebox.misc.db import list_migrations
pending_migrations = [name for status, name in list_migrations() if not status]
is_migrating = any(arg in sys.argv for arg in ['makemigrations', 'migrate', 'init'])
@@ -210,7 +210,7 @@ def check_lib_dir(lib_dir: Path | None = None, throw=False, quiet=False, must_ex
lib_dir = lib_dir or STORAGE_CONFIG.LIB_DIR
assert lib_dir == archivebox.pm.hook.get_LIB_DIR(), "lib_dir is not the same as the one in the flat config"
# assert lib_dir == STORAGE_CONFIG.LIB_DIR, "lib_dir is not the same as the one in the flat config"
if not must_exist and not os.path.isdir(lib_dir):
return True

57
archivebox/misc/db.py Normal file
View File

@@ -0,0 +1,57 @@
"""
Database utility functions for ArchiveBox.
"""
__package__ = 'archivebox.misc'
from io import StringIO
from pathlib import Path
from typing import List, Tuple
from archivebox.config import DATA_DIR
from archivebox.misc.util import enforce_types
@enforce_types
def list_migrations(out_dir: Path = DATA_DIR) -> List[Tuple[bool, str]]:
    """Return every Django migration as an (is_applied, name) tuple.

    Runs ``manage.py showmigrations --list`` and parses its text output,
    where applied migrations render as ``[X] name`` and pending as ``[ ] name``.
    """
    from django.core.management import call_command

    buffer = StringIO()
    call_command("showmigrations", list=True, stdout=buffer)

    results: List[Tuple[bool, str]] = []
    for raw in buffer.getvalue().splitlines():
        raw = raw.strip()
        # only lines containing a ']' are migration entries; skip app headers
        if not raw or ']' not in raw:
            continue
        status, _, name = raw.partition(']')
        results.append(('X' in status, name.strip()))
    return results
@enforce_types
def apply_migrations(out_dir: Path = DATA_DIR) -> List[str]:
    """Apply pending Django migrations on both databases.

    Migrates the 'default' DB fully and the huey_monitor app on the 'queue'
    DB, returning all non-empty output lines from both runs.
    """
    from django.core.management import call_command

    default_out, queue_out = StringIO(), StringIO()
    call_command("migrate", interactive=False, database='default', stdout=default_out)
    call_command("migrate", "huey_monitor", interactive=False, database='queue', stdout=queue_out)

    combined = default_out.getvalue().splitlines() + queue_out.getvalue().splitlines()
    return [line.strip() for line in combined if line.strip()]
@enforce_types
def get_admins(out_dir: Path = DATA_DIR) -> List:
    """Get list of superuser accounts"""
    # NOTE(review): actually returns a lazy QuerySet, not a list, despite the
    # annotation.  The reserved internal 'system' account is excluded.
    from django.contrib.auth.models import User
    return User.objects.filter(is_superuser=True).exclude(username='system')

215
archivebox/misc/folders.py Normal file
View File

@@ -0,0 +1,215 @@
"""
Folder status and integrity checking utilities for ArchiveBox.
"""
__package__ = 'archivebox.misc'
import os
import json
import shutil
from pathlib import Path
from itertools import chain
from typing import Dict, Optional, List, Tuple, TYPE_CHECKING
from django.db.models import QuerySet
from archivebox.config import DATA_DIR, CONSTANTS
from archivebox.misc.util import enforce_types
if TYPE_CHECKING:
from core.models import Snapshot
def _is_valid_snapshot(snapshot: 'Snapshot') -> bool:
    """A snapshot dir is valid iff it exists, contains an index.json, and
    that index's recorded url matches the snapshot's url."""
    output_dir = Path(snapshot.output_dir)
    index_file = output_dir / "index.json"
    if output_dir.exists() and index_file.exists():
        try:
            with open(index_file, 'r') as f:
                index_data = json.load(f)
            return snapshot.url == index_data.get('url')
        except Exception:
            # unreadable/corrupt index.json counts as invalid
            pass
    return False
def _is_corrupt_snapshot(snapshot: 'Snapshot') -> bool:
    """A snapshot dir is corrupt when it is present on disk but fails the
    validity checks (missing/broken index.json or mismatched url)."""
    return Path(snapshot.output_dir).exists() and not _is_valid_snapshot(snapshot)
def get_indexed_folders(snapshots: QuerySet, out_dir: Path = DATA_DIR) -> Dict[str, 'Snapshot']:
    """indexed snapshots without checking archive status or data directory validity"""
    folders: Dict[str, 'Snapshot'] = {}
    # chunked iteration keeps memory bounded for large archives
    for snapshot in snapshots.iterator(chunk_size=500):
        folders[snapshot.output_dir] = snapshot
    return folders
def get_archived_folders(snapshots: QuerySet, out_dir: Path = DATA_DIR) -> Dict[str, 'Snapshot']:
    """indexed snapshots that are archived with a valid data directory"""
    folders: Dict[str, 'Snapshot'] = {}
    for snapshot in snapshots.iterator(chunk_size=500):
        if snapshot.is_archived:
            folders[snapshot.output_dir] = snapshot
    return folders
def get_unarchived_folders(snapshots: QuerySet, out_dir: Path = DATA_DIR) -> Dict[str, 'Snapshot']:
    """indexed snapshots that are unarchived with no data directory or an empty data directory"""
    folders: Dict[str, 'Snapshot'] = {}
    for snapshot in snapshots.iterator(chunk_size=500):
        if not snapshot.is_archived:
            folders[snapshot.output_dir] = snapshot
    return folders
def get_present_folders(snapshots: QuerySet, out_dir: Path = DATA_DIR) -> Dict[str, Optional['Snapshot']]:
    """dirs that actually exist in the archive/ folder

    Maps each on-disk dir name to its matching Snapshot (looked up by
    timestamp), or None when no DB row matches.
    """
    from core.models import Snapshot

    present: Dict[str, Optional['Snapshot']] = {}
    for entry in (out_dir / CONSTANTS.ARCHIVE_DIR_NAME).iterdir():
        if not entry.is_dir():
            continue
        try:
            matched = Snapshot.objects.get(timestamp=entry.name)
        except Snapshot.DoesNotExist:
            matched = None
        present[entry.name] = matched
    return present
def get_valid_folders(snapshots: QuerySet, out_dir: Path = DATA_DIR) -> Dict[str, 'Snapshot']:
    """dirs with a valid index matched to the main index and archived content"""
    valid: Dict[str, 'Snapshot'] = {}
    for snapshot in snapshots.iterator(chunk_size=500):
        if _is_valid_snapshot(snapshot):
            valid[snapshot.output_dir] = snapshot
    return valid
def get_invalid_folders(snapshots: QuerySet, out_dir: Path = DATA_DIR) -> Dict[str, Optional['Snapshot']]:
    """dirs that are invalid for any reason: corrupted/duplicate/orphaned/unrecognized"""
    invalid: Dict[str, Optional['Snapshot']] = {}
    # later categories overwrite earlier ones on key collisions, matching the
    # original merge order: duplicate < orphaned < corrupted < unrecognized
    for finder in (get_duplicate_folders, get_orphaned_folders,
                   get_corrupted_folders, get_unrecognized_folders):
        invalid.update(finder(snapshots, out_dir=out_dir))
    return invalid
def get_duplicate_folders(snapshots: QuerySet, out_dir: Path = DATA_DIR) -> Dict[str, Optional['Snapshot']]:
    """dirs that conflict with other directories that have the same URL or timestamp

    Scans both the indexed snapshots and any on-disk archive/ dirs with no
    matching DB row, flagging every path whose timestamp or URL has already
    been seen once before in the scan.
    """
    from core.models import Snapshot as SnapshotModel
    # occurrence counters keyed by url / timestamp
    by_url: Dict[str, int] = {}
    by_timestamp: Dict[str, int] = {}
    duplicate_folders: Dict[str, Optional['Snapshot']] = {}
    # on-disk dirs that are NOT already covered by an indexed snapshot
    data_folders = (
        str(entry)
        for entry in CONSTANTS.ARCHIVE_DIR.iterdir()
        if entry.is_dir() and not snapshots.filter(timestamp=entry.name).exists()
    )
    # iterate indexed snapshots first, then the unindexed on-disk dirs
    for item in chain(snapshots.iterator(chunk_size=500), data_folders):
        snapshot = None
        if isinstance(item, str):
            # unindexed dir: try to resolve it back to a Snapshot by timestamp
            path = item
            timestamp = Path(path).name
            try:
                snapshot = SnapshotModel.objects.get(timestamp=timestamp)
            except SnapshotModel.DoesNotExist:
                pass
        else:
            snapshot = item
            path = snapshot.output_dir
        if snapshot:
            # the 2nd+ sighting of a timestamp or URL marks this path a duplicate;
            # the first sighting is never flagged
            by_timestamp[snapshot.timestamp] = by_timestamp.get(snapshot.timestamp, 0) + 1
            if by_timestamp[snapshot.timestamp] > 1:
                duplicate_folders[path] = snapshot
            by_url[snapshot.url] = by_url.get(snapshot.url, 0) + 1
            if by_url[snapshot.url] > 1:
                duplicate_folders[path] = snapshot
    return duplicate_folders
def get_orphaned_folders(snapshots: QuerySet, out_dir: Path = DATA_DIR) -> Dict[str, Optional['Snapshot']]:
    """dirs that contain a valid index but aren't listed in the main index"""
    orphaned: Dict[str, Optional['Snapshot']] = {}
    for entry in CONSTANTS.ARCHIVE_DIR.iterdir():
        if not entry.is_dir():
            continue
        has_index = (entry / "index.json").exists()
        if has_index and not snapshots.filter(timestamp=entry.name).exists():
            orphaned[str(entry)] = None
    return orphaned
def get_corrupted_folders(snapshots: QuerySet, out_dir: Path = DATA_DIR) -> Dict[str, 'Snapshot']:
    """dirs that exist but have corrupted/invalid index files"""
    return {
        snapshot.output_dir: snapshot
        for snapshot in snapshots.iterator(chunk_size=500)
        if _is_corrupt_snapshot(snapshot)
    }
def get_unrecognized_folders(snapshots: QuerySet, out_dir: Path = DATA_DIR) -> Dict[str, None]:
    """dirs that don't contain recognizable archive data and aren't listed in the main index"""
    unrecognized: Dict[str, None] = {}
    for entry in (Path(out_dir) / CONSTANTS.ARCHIVE_DIR_NAME).iterdir():
        if not entry.is_dir():
            continue
        index_file = entry / "index.json"
        if index_file.exists():
            # has an index.json, but it fails to parse => unrecognized
            try:
                with open(index_file, 'r') as f:
                    json.load(f)
            except Exception:
                unrecognized[str(entry)] = None
        elif not snapshots.filter(timestamp=entry.name).exists():
            # no index.json and no matching DB row => unrecognized
            unrecognized[str(entry)] = None
    return unrecognized
@enforce_types
def fix_invalid_folder_locations(out_dir: Path = DATA_DIR) -> Tuple[List[str], List[str]]:
    """Move folders to their correct timestamp-named locations based on index.json

    Reads each archive/<dir>/index.json, and when the dir's name disagrees
    with the recorded timestamp, moves it to archive/<timestamp>.

    Returns:
        (fixed, cant_fix): destination paths successfully moved, and source
        paths whose correct destination already exists.
    """
    fixed: List[str] = []
    cant_fix: List[str] = []
    for entry in os.scandir(out_dir / CONSTANTS.ARCHIVE_DIR_NAME):
        if not entry.is_dir(follow_symlinks=True):
            continue
        index_path = Path(entry.path) / 'index.json'
        if not index_path.exists():
            continue
        try:
            with open(index_path, 'r') as f:
                timestamp = json.load(f).get('timestamp')
        except Exception:
            # unreadable/corrupt index.json: nothing we can safely do
            continue
        if not timestamp:
            continue
        # coerce to str: a numeric timestamp in the JSON would otherwise crash
        # the Path join below
        timestamp = str(timestamp)
        # compare the final path component directly instead of the old
        # endswith(f'/{timestamp}') check, which never matched on Windows
        # (backslash separators)
        if Path(entry.path).name != timestamp:
            dest = out_dir / CONSTANTS.ARCHIVE_DIR_NAME / timestamp
            if dest.exists():
                cant_fix.append(entry.path)
            else:
                shutil.move(entry.path, str(dest))
                fixed.append(str(dest))
    return fixed, cant_fix

View File

@@ -4,71 +4,65 @@ from functools import lru_cache
from pathlib import Path
from typing import Callable
from datetime import datetime
import blake3 # pip install blake3
@lru_cache(maxsize=1024)
def _cached_file_hashes(filepath: str, size: int, mtime: float) -> tuple[str, str]:
"""Internal function to calculate file hashes with cache key based on path, size and mtime."""
def _cached_file_hash(filepath: str, size: int, mtime: float) -> str:
"""Internal function to calculate file hash with cache key based on path, size and mtime."""
sha256_hash = hashlib.sha256()
blake3_hash = blake3.blake3()
with open(filepath, 'rb') as f:
# Read file once and update both hashes simultaneously
for chunk in iter(lambda: f.read(4096), b''):
sha256_hash.update(chunk)
blake3_hash.update(chunk)
return sha256_hash.hexdigest(), blake3_hash.hexdigest()
return sha256_hash.hexdigest()
@lru_cache(maxsize=10)
def hash_file(file_path: Path, pwd: Path | None = None) -> tuple[str, str]:
"""Calculate SHA256 and BLAKE3 hashes of a file with caching based on path, size and mtime."""
def hash_file(file_path: Path, pwd: Path | None = None) -> str:
"""Calculate SHA256 hash of a file with caching based on path, size and mtime."""
pwd = Path(pwd) if pwd else None
file_path = Path(file_path)
if not file_path.is_absolute():
file_path = pwd / file_path if pwd else file_path.absolute()
abs_path = file_path.resolve()
stat_info = abs_path.stat()
return _cached_file_hashes(
return _cached_file_hash(
str(abs_path),
stat_info.st_size,
stat_info.st_mtime
)
@lru_cache(maxsize=10)
def get_dir_hashes(dir_path: Path, pwd: Path | None = None, filter_func: Callable | None = None, max_depth: int = -1) -> dict[str, tuple[str, str]]:
"""Calculate SHA256 and BLAKE3 hashes for all files and directories recursively."""
def get_dir_hashes(dir_path: Path, pwd: Path | None = None, filter_func: Callable | None = None, max_depth: int = -1) -> dict[str, str]:
"""Calculate SHA256 hashes for all files and directories recursively."""
pwd = Path(pwd) if pwd else None
dir_path = Path(dir_path)
if not dir_path.is_absolute():
dir_path = pwd / dir_path if pwd else dir_path.absolute()
if not dir_path.is_dir():
raise ValueError(f"Not a directory: {dir_path}")
if max_depth < -1:
raise ValueError(f"max_depth must be >= -1, got {max_depth}")
# Get all files recursively
all_files = get_dir_entries(
dir_path, pwd=pwd, recursive=True,
include_files=True, include_dirs=False,
filter_func=filter_func
)
hashes: dict[str, tuple[str, str]] = {}
hashable_summary_sha256 = []
hashable_summary_blake3 = []
hashes: dict[str, str] = {}
hashable_summary = []
# Calculate hashes for all files
for subfile in all_files:
subfile_path = dir_path / subfile
sha256_hash, blake3_hash = hash_file(subfile_path)
hashes[subfile] = (sha256_hash, blake3_hash)
hashable_summary_sha256.append(f"{sha256_hash} ./{subfile}")
hashable_summary_blake3.append(f"{blake3_hash} ./{subfile}")
sha256_hash = hash_file(subfile_path)
hashes[subfile] = sha256_hash
hashable_summary.append(f"{sha256_hash} ./{subfile}")
# Calculate hashes for all directories
subdirs = get_dir_entries(
dir_path, pwd=pwd, recursive=True,
@@ -76,7 +70,7 @@ def get_dir_hashes(dir_path: Path, pwd: Path | None = None, filter_func: Callabl
include_hidden=False, filter_func=filter_func,
max_depth=max_depth
)
for subdir in subdirs:
subdir_path = dir_path / subdir
subdir_hashes = get_dir_hashes(
@@ -84,36 +78,34 @@ def get_dir_hashes(dir_path: Path, pwd: Path | None = None, filter_func: Callabl
max_depth=0
)
hashes[subdir] = subdir_hashes['.']
# Filter results by max_depth
if max_depth >= 0:
hashes = {
path: value for path, value in hashes.items()
if len(Path(path).parts) <= max_depth + 1
}
# Calculate root directory hashes
hashable_summary_sha256.sort()
hashable_summary_blake3.sort()
root_sha256 = hashlib.sha256('\n'.join(hashable_summary_sha256).encode()).hexdigest()
root_blake3 = blake3.blake3('\n'.join(hashable_summary_blake3).encode()).hexdigest()
hashes['.'] = (root_sha256, root_blake3)
# Calculate root directory hash
hashable_summary.sort()
root_sha256 = hashlib.sha256('\n'.join(hashable_summary).encode()).hexdigest()
hashes['.'] = root_sha256
return hashes
@lru_cache(maxsize=128)
def get_dir_entries(dir_path: Path, pwd: Path | None = None, recursive: bool = True,
include_files: bool = True, include_dirs: bool = True, include_hidden: bool = False,
def get_dir_entries(dir_path: Path, pwd: Path | None = None, recursive: bool = True,
include_files: bool = True, include_dirs: bool = True, include_hidden: bool = False,
filter_func: Callable | None = None, max_depth: int = -1) -> tuple[str, ...]:
"""Get filtered list of directory entries."""
pwd = Path(pwd) if pwd else None
dir_path = Path(dir_path)
if not dir_path.is_absolute():
dir_path = pwd / dir_path if pwd else dir_path.absolute()
results = []
def process_path(path: Path, depth: int):
if not include_hidden and path.name.startswith('.'):
return False
@@ -127,18 +119,18 @@ def get_dir_entries(dir_path: Path, pwd: Path | None = None, recursive: bool = T
if not filter_func(info):
return False
return True
for path in dir_path.rglob('*') if recursive else dir_path.glob('*'):
current_depth = len(path.relative_to(dir_path).parts)
if path.is_file() and include_files and process_path(path, current_depth):
results.append(str(path.relative_to(dir_path)))
elif path.is_dir() and include_dirs and process_path(path, current_depth):
results.append(str(path.relative_to(dir_path)))
if not recursive:
break
return tuple(sorted(results)) # Make immutable for caching
@lru_cache(maxsize=1024)
@@ -147,7 +139,7 @@ def get_dir_sizes(dir_path: Path, pwd: Path | None = None, **kwargs) -> dict[str
sizes: dict[str, int] = {}
hashes = get_dir_hashes(dir_path, pwd=pwd, **kwargs)
dir_path = Path(dir_path)
for path_key in hashes:
full_path = dir_path / path_key
if full_path.is_file():
@@ -158,25 +150,25 @@ def get_dir_sizes(dir_path: Path, pwd: Path | None = None, **kwargs) -> dict[str
if file_path.is_file() and not file_path.name.startswith('.'):
total += file_path.stat().st_size
sizes[path_key + '/'] = total
return sizes
@lru_cache(maxsize=10)
def get_dir_info(dir_path: Path, pwd: Path | None = None, filter_func: Callable | None = None, max_depth: int = -1) -> dict:
"""Get detailed information about directory contents including both hash types and sizes."""
"""Get detailed information about directory contents including hashes and sizes."""
pwd = Path(pwd) if pwd else None
dir_path = Path(dir_path)
if not dir_path.is_absolute():
dir_path = pwd / dir_path if pwd else dir_path.absolute()
hashes = get_dir_hashes(dir_path, pwd=pwd, filter_func=filter_func, max_depth=max_depth)
sizes = get_dir_sizes(str(dir_path), pwd=pwd, filter_func=filter_func, max_depth=max_depth)
num_total_subpaths = sum(1 for name in hashes if name != '.')
details = {}
for filename, (sha256_hash, blake3_hash) in sorted(hashes.items()):
for filename, sha256_hash in sorted(hashes.items()):
abs_path = (dir_path / filename).resolve()
stat_info = abs_path.stat()
num_subpaths = sum(1 for p in hashes if p.startswith(filename + '/'))
@@ -197,7 +189,7 @@ def get_dir_info(dir_path: Path, pwd: Path | None = None, filter_func: Callable
extension = abs_path.suffix
basename = abs_path.name.rsplit(extension, 1)[0]
num_bytes = sizes[filename]
details[filename] = {
'basename': basename,
'mime_type': mime_type,
@@ -205,14 +197,13 @@ def get_dir_info(dir_path: Path, pwd: Path | None = None, filter_func: Callable
'num_subpaths': num_subpaths,
'num_bytes': num_bytes,
'hash_sha256': sha256_hash,
'hash_blake3': blake3_hash,
'created_at': datetime.fromtimestamp(stat_info.st_ctime).isoformat(),
'modified_at': datetime.fromtimestamp(stat_info.st_mtime).isoformat(),
}
if filter_func and not filter_func(details[filename]):
del details[filename]
return details
@@ -221,7 +212,7 @@ if __name__ == '__main__':
dir_info = get_dir_info(Path('.'), max_depth=6)
with open('.hashes.json', 'w') as f:
json.dump(dir_info, f, indent=4)
print('Wrote .hashes.json')
print('Wrote .hashes.json')
# Example output:
# {
@@ -232,7 +223,6 @@ if __name__ == '__main__':
# "num_subpaths": 25,
# "num_bytes": 214677,
# "hash_sha256": "addfacf88b2ff6b564846415fb7b21dcb7e63ee4e911bc0aec255ee354958530",
# "hash_blake3": "3403a1f876453c7749f17ee3502769eff05cff20b5d6c2f2cf458e6353a380db",
# "created_at": "2024-12-04T00:08:38.537449",
# "modified_at": "2024-12-04T00:08:38.537449"
# },
@@ -243,31 +233,8 @@ if __name__ == '__main__':
# "num_subpaths": null,
# "num_bytes": 32,
# "hash_sha256": "b0e5e7ff17db3b60535cf664282787767c336e3e203a43e21b6326c6fe457551",
# "hash_blake3": "4a801eb2a4cdde8d3422be1e2074b78574a5890afb3027cbe6f3b3cf4d113fd1",
# "created_at": "2024-10-08T00:51:41.001359",
# "modified_at": "2024-10-08T00:51:41.001359"
# },
# "__pycache__/": {
# "basename": "__pycache__",
# "mime_type": "inode/directory",
# "extension": "",
# "num_subpaths": 8,
# "num_bytes": 107593,
# "hash_sha256": "9e917a438be774ffc7ea9125de71008c29a7d9003b6f5e09e2085aa1ef3157b3",
# "hash_blake3": "e87184485bd67bd9b723a9ee4d472e8c1d24a4388d373046a27e5a1e10467a06",
# "created_at": "2024-12-04T00:00:16.149390",
# "modified_at": "2024-12-04T00:00:16.149390"
# },
# "__pycache__/__init__.cpython-313.pyc": {
# "basename": "__init__.cpython-313",
# "mime_type": "application/x-python-code",
# "extension": ".pyc",
# "num_subpaths": null,
# "num_bytes": 223,
# "hash_sha256": "d29e3ee5e6b9b564422d9ef2c7325d28cf759b9fb868f59551ba43cd991d51be",
# "hash_blake3": "279a6dc4c8161d6ddb18fa72c882f375324ed152dc6c7c7eac9ef5fdd066f2fd",
# "created_at": "2024-12-03T03:13:43.257430",
# "modified_at": "2024-12-03T03:13:43.257308"
# },
# ...
# }

343
archivebox/misc/jsonl.py Normal file
View File

@@ -0,0 +1,343 @@
"""
JSONL (JSON Lines) utilities for ArchiveBox.
Provides functions for reading, writing, and processing typed JSONL records.
All CLI commands that accept stdin can read both plain URLs and typed JSONL.
Typed JSONL Format:
{"type": "Snapshot", "url": "https://example.com", "title": "...", "tags": "..."}
{"type": "ArchiveResult", "snapshot_id": "...", "extractor": "wget", ...}
{"type": "Tag", "name": "..."}
Plain URLs (also supported):
https://example.com
https://foo.com
"""
__package__ = 'archivebox.misc'
import sys
import json
from typing import Iterator, Dict, Any, Optional, TextIO, Callable, Union, List
from pathlib import Path
# Type constants for JSONL records
TYPE_SNAPSHOT = 'Snapshot'
TYPE_ARCHIVERESULT = 'ArchiveResult'
TYPE_TAG = 'Tag'
TYPE_CRAWL = 'Crawl'
TYPE_SEED = 'Seed'
TYPE_INSTALLEDBINARY = 'InstalledBinary'
VALID_TYPES = {TYPE_SNAPSHOT, TYPE_ARCHIVERESULT, TYPE_TAG, TYPE_CRAWL, TYPE_SEED, TYPE_INSTALLEDBINARY}


def parse_line(line: str) -> Optional[Dict[str, Any]]:
    """
    Parse a single line of input as either JSONL or plain URL.
    Returns a dict with at minimum {'type': '...', 'url': '...'} or None if invalid.

    Accepted formats:
      - a JSON object (typed JSONL record; records with a 'url' but no
        'type' are assumed to be Snapshots)
      - a plain http://, https://, or file:// URL
      - a bare snapshot UUID
    Blank lines and '#' comments yield None.
    """
    import uuid

    line = line.strip()
    if not line or line.startswith('#'):
        return None

    # Try to parse as JSON first
    if line.startswith('{'):
        try:
            record = json.loads(line)
            # If it has url but no type, assume Snapshot
            if 'url' in record and 'type' not in record:
                record['type'] = TYPE_SNAPSHOT
            return record
        except json.JSONDecodeError:
            pass

    # Treat as plain URL if it looks like one
    if line.startswith(('http://', 'https://', 'file://')):
        return {'type': TYPE_SNAPSHOT, 'url': line}

    # Could be a snapshot ID: must be a real hyphenated UUID, not merely any
    # 36-char/4-dash string (the old heuristic accepted arbitrary garbage)
    if len(line) == 36 and line.count('-') == 4:
        try:
            uuid.UUID(line)
        except ValueError:
            return None
        return {'type': TYPE_SNAPSHOT, 'id': line}

    # Unknown format, skip
    return None
def read_stdin(stream: Optional[TextIO] = None) -> Iterator[Dict[str, Any]]:
    """
    Yield parsed records from stdin (or the given stream).

    Accepts both JSONL and plain one-URL-per-line input. Yields nothing when
    the stream is an interactive tty, to avoid blocking on absent input.
    """
    source = stream or sys.stdin
    if source.isatty():
        return
    for raw_line in source:
        parsed = parse_line(raw_line)
        if parsed:
            yield parsed
def read_file(path: Path) -> Iterator[Dict[str, Any]]:
    """Yield parsed records (JSONL or plain URLs) from a text file, one per line."""
    with open(path, 'r') as handle:
        for raw_line in handle:
            parsed = parse_line(raw_line)
            if parsed:
                yield parsed
def read_args_or_stdin(args: tuple, stream: Optional[TextIO] = None) -> Iterator[Dict[str, Any]]:
    """
    Yield records from CLI args when given, otherwise fall back to stdin.

    Each arg may be a path to a file of records, or a single URL/JSONL record.
    """
    if not args:
        yield from read_stdin(stream)
        return
    for arg in args:
        candidate = Path(arg)
        # Path.is_file() already implies existence
        if candidate.is_file():
            yield from read_file(candidate)
        else:
            parsed = parse_line(arg)
            if parsed:
                yield parsed
def write_record(record: Dict[str, Any], stream: Optional[TextIO] = None) -> None:
    """Serialize one record as a JSON line to stdout (or the given stream) and flush."""
    out = stream if stream else sys.stdout
    serialized = json.dumps(record)
    out.write(f'{serialized}\n')
    out.flush()
def write_records(records: Iterator[Dict[str, Any]], stream: Optional[TextIO] = None) -> int:
    """Write every record as a JSON line; returns how many were written."""
    written = 0
    for written, rec in enumerate(records, start=1):
        write_record(rec, stream)
    return written
def filter_by_type(records: Iterator[Dict[str, Any]], record_type: str) -> Iterator[Dict[str, Any]]:
    """Yield only the records whose 'type' field equals record_type."""
    return (record for record in records if record.get('type') == record_type)
def snapshot_to_jsonl(snapshot) -> Dict[str, Any]:
    """Serialize a Snapshot model instance into a typed JSONL record dict.

    Datetime fields become ISO-8601 strings (or None); optional model
    attributes (tags_str, depth, status) degrade gracefully when absent.
    """
    record: Dict[str, Any] = {'type': TYPE_SNAPSHOT}
    record['id'] = str(snapshot.id)
    record['url'] = snapshot.url
    record['title'] = snapshot.title
    record['tags'] = snapshot.tags_str() if hasattr(snapshot, 'tags_str') else ''
    record['bookmarked_at'] = snapshot.bookmarked_at.isoformat() if snapshot.bookmarked_at else None
    record['created_at'] = snapshot.created_at.isoformat() if snapshot.created_at else None
    record['timestamp'] = snapshot.timestamp
    record['depth'] = getattr(snapshot, 'depth', 0)
    record['status'] = snapshot.status if hasattr(snapshot, 'status') else None
    return record
def archiveresult_to_jsonl(result) -> Dict[str, Any]:
    """Serialize an ArchiveResult model instance into a typed JSONL record dict.

    Timestamps become ISO-8601 strings (or None when unset).
    """
    record: Dict[str, Any] = {'type': TYPE_ARCHIVERESULT}
    record['id'] = str(result.id)
    record['snapshot_id'] = str(result.snapshot_id)
    record['extractor'] = result.extractor
    record['status'] = result.status
    record['output'] = result.output
    record['start_ts'] = result.start_ts.isoformat() if result.start_ts else None
    record['end_ts'] = result.end_ts.isoformat() if result.end_ts else None
    return record
def tag_to_jsonl(tag) -> Dict[str, Any]:
    """Serialize a Tag model instance into a typed JSONL record dict."""
    record: Dict[str, Any] = {'type': TYPE_TAG}
    record['id'] = str(tag.id)
    record['name'] = tag.name
    record['slug'] = tag.slug
    return record
def crawl_to_jsonl(crawl) -> Dict[str, Any]:
    """Serialize a Crawl model instance into a typed JSONL record dict."""
    record: Dict[str, Any] = {'type': TYPE_CRAWL}
    record['id'] = str(crawl.id)
    record['seed_id'] = str(crawl.seed_id)
    record['status'] = crawl.status
    record['max_depth'] = crawl.max_depth
    record['created_at'] = crawl.created_at.isoformat() if crawl.created_at else None
    return record
def process_records(
    records: Iterator[Dict[str, Any]],
    handlers: Dict[str, Callable[[Dict[str, Any]], Optional[Dict[str, Any]]]]
) -> Iterator[Dict[str, Any]]:
    """
    Route each record to the handler registered for its 'type'.

    Records with no registered handler are dropped, and handlers may return
    None (or any falsy value) to drop a record. Yields the handler outputs.
    """
    for rec in records:
        handle = handlers.get(rec.get('type'))
        if handle is None:
            continue
        output = handle(rec)
        if output:
            yield output
def get_or_create_snapshot(record: Dict[str, Any], created_by_id: Optional[int] = None):
    """
    Get or create a Snapshot from a JSONL record.
    Returns the Snapshot instance.

    Required record field: 'url' (raises ValueError if missing/empty).
    Optional fields: 'title', 'tags', 'bookmarked_at' (ISO string or datetime),
    'depth', 'crawl_id'.
    """
    from core.models import Snapshot
    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.misc.util import parse_date
    # Fall back to the internal 'system' user when no creator is given
    created_by_id = created_by_id or get_or_create_system_user_pk()
    # Extract fields from record
    url = record.get('url')
    if not url:
        raise ValueError("Record missing required 'url' field")
    title = record.get('title')
    tags_str = record.get('tags', '')
    bookmarked_at = record.get('bookmarked_at')
    depth = record.get('depth', 0)
    crawl_id = record.get('crawl_id')
    # Parse bookmarked_at if string (datetime values pass through unchanged)
    if bookmarked_at and isinstance(bookmarked_at, str):
        bookmarked_at = parse_date(bookmarked_at)
    # Use the manager's create_or_update_from_dict method for the core fields
    snapshot = Snapshot.objects.create_or_update_from_dict(
        {'url': url, 'title': title, 'tags': tags_str},
        created_by_id=created_by_id
    )
    # Update the remaining fields only when a truthy value was provided AND it
    # actually differs from what's stored, to avoid needless writes
    update_fields = []
    if depth and snapshot.depth != depth:
        snapshot.depth = depth
        update_fields.append('depth')
    if bookmarked_at and snapshot.bookmarked_at != bookmarked_at:
        snapshot.bookmarked_at = bookmarked_at
        update_fields.append('bookmarked_at')
    if crawl_id and str(snapshot.crawl_id) != str(crawl_id):
        # str() comparison because crawl_id may arrive as a UUID or a string
        snapshot.crawl_id = crawl_id
        update_fields.append('crawl_id')
    if update_fields:
        # 'modified_at' is saved too — presumably to bump the change timestamp;
        # TODO(review): confirm the model auto-populates it on save
        snapshot.save(update_fields=update_fields + ['modified_at'])
    return snapshot
def get_or_create_tag(record: Dict[str, Any]):
    """
    Look up (or create) a Tag by the record's 'name' field.
    Returns the Tag instance; raises ValueError when 'name' is missing/empty.
    """
    from core.models import Tag

    tag_name = record.get('name')
    if not tag_name:
        raise ValueError("Record missing required 'name' field")
    tag, _created = Tag.objects.get_or_create(name=tag_name)
    return tag
def process_jsonl_records(records: Iterator[Dict[str, Any]], created_by_id: Optional[int] = None) -> Dict[str, List]:
    """
    Create Tags and Snapshots from a stream of JSONL records.

    Tag records create/fetch Tags; Snapshot records (or any record carrying a
    'url') create/fetch Snapshots. Records missing required fields are
    silently skipped. Untyped records default to Snapshot.

    Returns {'tags': [...], 'snapshots': [...]} of created/fetched objects.
    """
    from archivebox.base_models.models import get_or_create_system_user_pk

    created_by_id = created_by_id or get_or_create_system_user_pk()
    results: Dict[str, List] = {'tags': [], 'snapshots': []}
    for rec in records:
        rec_type = rec.get('type', TYPE_SNAPSHOT)
        if rec_type == TYPE_TAG:
            try:
                results['tags'].append(get_or_create_tag(rec))
            except ValueError:
                continue
        elif rec_type == TYPE_SNAPSHOT or 'url' in rec:
            try:
                results['snapshots'].append(get_or_create_snapshot(rec, created_by_id=created_by_id))
            except ValueError:
                continue
    return results

90
archivebox/misc/legacy.py Normal file
View File

@@ -0,0 +1,90 @@
"""
Legacy archive import utilities.
These functions are used to import data from old ArchiveBox archive formats
(JSON indexes, archive directory structures) into the new database.
This is separate from the hooks-based parser system which handles importing
new URLs from bookmark files, RSS feeds, etc.
"""
__package__ = 'archivebox.misc'
import os
import json
from pathlib import Path
from datetime import datetime, timezone
from typing import Iterator, TypedDict, List
class SnapshotDict(TypedDict, total=False):
    """
    Dictionary type representing a snapshot/link, compatible with Snapshot model fields.

    All keys are optional at the type level (total=False); 'url' is the only
    field callers are expected to always provide.
    """
    url: str  # Required: the URL to archive
    timestamp: str  # Optional: unix timestamp string
    title: str  # Optional: page title
    tags: str  # Optional: comma-separated tags string
    sources: List[str]  # Optional: list of source file paths
def parse_json_main_index(out_dir: Path) -> Iterator[SnapshotDict]:
    """
    Parse links from the main JSON index file (archive/index.json).
    This is used to recover links from old archive formats.
    Yields one SnapshotDict per link; a missing or corrupt index yields nothing.
    """
    from archivebox.config import CONSTANTS

    index_path = out_dir / CONSTANTS.JSON_INDEX_FILENAME
    if not index_path.exists():
        return
    try:
        with open(index_path, 'r', encoding='utf-8') as f:
            index = json.load(f)
        for link in index.get('links', []):
            yield {
                'url': link.get('url', ''),
                # fall back to "now" when the legacy entry has no timestamp
                'timestamp': link.get('timestamp', str(datetime.now(timezone.utc).timestamp())),
                'title': link.get('title'),
                'tags': link.get('tags', ''),
            }
    except (json.JSONDecodeError, KeyError, TypeError):
        return
def parse_json_links_details(out_dir: Path) -> Iterator[SnapshotDict]:
    """
    Parse links from individual snapshot index.json files in archive directories.
    Walks archive/*/index.json to discover orphaned snapshots; unreadable
    index files are skipped.
    """
    from archivebox.config import CONSTANTS

    archive_dir = out_dir / CONSTANTS.ARCHIVE_DIR_NAME
    if not archive_dir.exists():
        return
    for entry in os.scandir(archive_dir):
        if not entry.is_dir():
            continue
        index_file = Path(entry.path) / 'index.json'
        if not index_file.exists():
            continue
        try:
            with open(index_file, 'r', encoding='utf-8') as f:
                link = json.load(f)
        except (json.JSONDecodeError, KeyError, TypeError):
            continue
        yield {
            'url': link.get('url', ''),
            # the dir name is the timestamp when the index lacks one
            'timestamp': link.get('timestamp', entry.name),
            'title': link.get('title'),
            'tags': link.get('tags', ''),
        }

View File

@@ -1,7 +1,7 @@
__package__ = 'archivebox.misc'
# TODO: merge/dedupe this file with archivebox/logging_util.py
# Low-level logging primitives (Rich console, ANSI colors, stdout/stderr helpers)
# Higher-level logging functions are in logging_util.py
import sys
from typing import Optional, Union, Tuple, List

View File

@@ -1,9 +1,11 @@
__package__ = 'archivebox'
# High-level logging functions for CLI output and progress tracking
# Low-level primitives (Rich console, ANSI colors) are in logging.py
import re
import os
import sys
import stat
import time
from math import log
@@ -15,7 +17,7 @@ from dataclasses import dataclass
from typing import Any, Optional, List, Dict, Union, Iterable, IO, TYPE_CHECKING
if TYPE_CHECKING:
from ..index.schema import Link, ArchiveResult
from core.models import Snapshot
from rich import print
from rich.panel import Panel
@@ -48,77 +50,6 @@ class RuntimeStats:
_LAST_RUN_STATS = RuntimeStats()
def debug_dict_summary(obj: Dict[Any, Any]) -> None:
stderr(' '.join(f'{key}={str(val).ljust(6)}' for key, val in obj.items()))
def get_fd_info(fd) -> Dict[str, Any]:
NAME = fd.name[1:-1]
FILENO = fd.fileno()
MODE = os.fstat(FILENO).st_mode
IS_TTY = hasattr(fd, 'isatty') and fd.isatty()
IS_PIPE = stat.S_ISFIFO(MODE)
IS_FILE = stat.S_ISREG(MODE)
IS_TERMINAL = not (IS_PIPE or IS_FILE)
IS_LINE_BUFFERED = fd.line_buffering
IS_READABLE = fd.readable()
return {
'NAME': NAME, 'FILENO': FILENO, 'MODE': MODE,
'IS_TTY': IS_TTY, 'IS_PIPE': IS_PIPE, 'IS_FILE': IS_FILE,
'IS_TERMINAL': IS_TERMINAL, 'IS_LINE_BUFFERED': IS_LINE_BUFFERED,
'IS_READABLE': IS_READABLE,
}
# # Log debug information about stdin, stdout, and stderr
# sys.stdout.write('[>&1] this is python stdout\n')
# sys.stderr.write('[>&2] this is python stderr\n')
# debug_dict_summary(get_fd_info(sys.stdin))
# debug_dict_summary(get_fd_info(sys.stdout))
# debug_dict_summary(get_fd_info(sys.stderr))
def reject_stdin(caller: str, stdin: Optional[IO]=sys.stdin) -> None:
    """Warn the user when they pipe stdin into a command that ignores it."""
    if not stdin:
        return None

    # When TTY is disabled in docker we can't tell if stdin is piped or not;
    # reading un-piped stdin would hang forever, so skip the check entirely.
    if os.environ.get('IN_DOCKER') in ('1', 'true', 'True', 'TRUE', 'yes'):
        return None

    if stdin.isatty():
        # interactive terminal, nothing was piped in
        return None

    # stderr('READING STDIN TO REJECT...')
    piped_text = stdin.read()
    if piped_text.strip():
        # stderr('GOT STDIN!', len(stdin_str))
        stderr(f'[!] The "{caller}" command does not accept stdin (ignoring).', color='red')
        stderr(f'    Run archivebox "{caller} --help" to see usage and examples.')
        stderr()
        # raise SystemExit(1)
    return None
def accept_stdin(stdin: Optional[IO]=sys.stdin) -> Optional[str]:
    """Read and return any piped standard input, or None if nothing was piped."""
    if stdin and not stdin.isatty():
        # stderr('READING STDIN TO ACCEPT...')
        piped_text = stdin.read()
        if piped_text:
            # stderr('GOT STDIN...', len(stdin_str))
            return piped_text
    return None
class TimedProgress:
"""Show a progress bar and measure elapsed time until .end() is called"""
@@ -353,7 +284,7 @@ def log_archiving_finished(num_links: int):
print(' archivebox server 0.0.0.0:8000')
def log_link_archiving_started(link: "Link", link_dir: str, is_new: bool):
def log_snapshot_archiving_started(snapshot: "Snapshot", out_dir: str, is_new: bool):
# [*] [2019-03-22 13:46:45] "Log Structured Merge Trees - ben stopford"
# http://www.benstopford.com/2015/02/14/log-structured-merge-trees/
@@ -363,15 +294,15 @@ def log_link_archiving_started(link: "Link", link_dir: str, is_new: bool):
symbol_color='green' if is_new else 'bright_black',
symbol='+' if is_new else '',
now=datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S'),
title=link.title or link.base_url,
title=snapshot.title or snapshot.base_url,
))
print(f' [sky_blue1]{link.url}[/]')
print(f' [sky_blue1]{snapshot.url}[/]')
print(' {} {}'.format(
'>' if is_new else '',
pretty_path(link_dir),
pretty_path(out_dir),
))
def log_link_archiving_finished(link: "Link", link_dir: str, is_new: bool, stats: dict, start_ts: datetime):
def log_snapshot_archiving_finished(snapshot: "Snapshot", out_dir: str, is_new: bool, stats: dict, start_ts: datetime):
total = sum(stats.values())
if stats['failed'] > 0 :
@@ -382,7 +313,7 @@ def log_link_archiving_finished(link: "Link", link_dir: str, is_new: bool, stats
_LAST_RUN_STATS.succeeded += 1
try:
size = get_dir_size(link_dir)
size = get_dir_size(out_dir)
except FileNotFoundError:
size = (0, None, '0')
@@ -391,38 +322,38 @@ def log_link_archiving_finished(link: "Link", link_dir: str, is_new: bool, stats
print(' [bright_black]{} files ({}) in {}s [/]'.format(size[2], printable_filesize(size[0]), duration))
def log_archive_method_started(method: str):
    """Announce which archive-method extractor is about to run."""
    print(f'    > {method}')
def log_archive_method_finished(result: "ArchiveResult"):
def log_archive_method_finished(result: dict):
"""
quote the argument with whitespace in a command so the user can
quote the argument with whitespace in a command so the user can
copy-paste the outputted string directly to run the cmd
"""
# Prettify CMD string and make it safe to copy-paste by quoting arguments
quoted_cmd = ' '.join(
'"{}"'.format(arg) if (' ' in arg) or (':' in arg) else arg
for arg in result.cmd
for arg in result['cmd']
)
if result.status == 'failed':
if result.output.__class__.__name__ == 'TimeoutExpired':
duration = (result.end_ts - result.start_ts).seconds
if result['status'] == 'failed':
output = result.get('output')
if output and output.__class__.__name__ == 'TimeoutExpired':
duration = (result['end_ts'] - result['start_ts']).seconds
hint_header = [
f'[yellow3]Extractor timed out after {duration}s.[/]',
]
else:
error_name = result.output.__class__.__name__.replace('ArchiveError', '')
error_name = output.__class__.__name__.replace('ArchiveError', '') if output else 'Error'
hint_header = [
'[yellow3]Extractor failed:[/]',
f' {error_name} [red1]{result.output}[/]',
f' {error_name} [red1]{output}[/]',
]
# import pudb; pudb.set_trace()
# Prettify error output hints string and limit to five lines
hints = getattr(result.output, 'hints', None) or ()
hints = getattr(output, 'hints', None) or () if output else ()
if hints:
if isinstance(hints, (list, tuple, type(_ for _ in ()))):
hints = [hint.decode() if isinstance(hint, bytes) else str(hint) for hint in hints]
@@ -448,7 +379,7 @@ def log_archive_method_finished(result: "ArchiveResult"):
*hints,
'[violet]Run to see full output:[/]',
*docker_hints,
*([' cd {};'.format(result.pwd)] if result.pwd else []),
*([' cd {};'.format(result.get('pwd'))] if result.get('pwd') else []),
' {}'.format(quoted_cmd),
]
print('\n'.join(
@@ -463,21 +394,22 @@ def log_list_started(filter_patterns: Optional[List[str]], filter_type: str):
print(f'[green][*] Finding links in the archive index matching these {filter_type} patterns:[/]')
print(' {}'.format(' '.join(filter_patterns or ())))
def log_list_finished(links):
from archivebox.index.csv import links_to_csv
def log_list_finished(snapshots):
from core.models import Snapshot
print()
print('---------------------------------------------------------------------------------------------------')
print(links_to_csv(links, cols=['timestamp', 'is_archived', 'num_outputs', 'url'], header=True, ljust=16, separator=' | '))
print(Snapshot.objects.filter(pk__in=[s.pk for s in snapshots]).to_csv(cols=['timestamp', 'is_archived', 'num_outputs', 'url'], header=True, ljust=16, separator=' | '))
print('---------------------------------------------------------------------------------------------------')
print()
def log_removal_started(links: List["Link"], yes: bool, delete: bool):
print(f'[yellow3][i] Found {len(links)} matching URLs to remove.[/]')
def log_removal_started(snapshots, yes: bool, delete: bool):
count = snapshots.count() if hasattr(snapshots, 'count') else len(snapshots)
print(f'[yellow3][i] Found {count} matching URLs to remove.[/]')
if delete:
file_counts = [link.num_outputs for link in links if os.access(link.link_dir, os.R_OK)]
file_counts = [s.num_outputs for s in snapshots if os.access(s.output_dir, os.R_OK)]
print(
f' {len(links)} Links will be de-listed from the main index, and their archived content folders will be deleted from disk.\n'
f' {count} Links will be de-listed from the main index, and their archived content folders will be deleted from disk.\n'
f' ({len(file_counts)} data folders with {sum(file_counts)} archived files will be deleted!)'
)
else:
@@ -488,7 +420,7 @@ def log_removal_started(links: List["Link"], yes: bool, delete: bool):
if not yes:
print()
print(f'[yellow3][?] Do you want to proceed with removing these {len(links)} links?[/]')
print(f'[yellow3][?] Do you want to proceed with removing these {count} links?[/]')
try:
assert input(' y/[n]: ').lower() == 'y'
except (KeyboardInterrupt, EOFError, AssertionError):
@@ -504,6 +436,13 @@ def log_removal_finished(all_links: int, to_remove: int):
print(f' Index now contains {all_links - to_remove} links.')
### Search Indexing Stage
def log_index_started(url: str):
print('[green][*] Indexing url: {} in the search index[/]'.format(url))
print()
### Helpers
@enforce_types
@@ -542,10 +481,10 @@ def printable_filesize(num_bytes: Union[int, float]) -> str:
@enforce_types
def printable_folders(folders: Dict[str, Optional["Link"]], with_headers: bool=False) -> str:
def printable_folders(folders: Dict[str, Optional["Snapshot"]], with_headers: bool=False) -> str:
return '\n'.join(
f'{folder} {link and link.url} "{link and link.title}"'
for folder, link in folders.items()
f'{folder} {snapshot and snapshot.url} "{snapshot and snapshot.title}"'
for folder, snapshot in folders.items()
)

View File

@@ -13,7 +13,6 @@ import pydantic # noqa
import requests # noqa
import subprocess # noqa
import archivebox # noqa
import abx # noqa
from benedict import benedict # noqa
from django.utils import timezone # noqa
from datetime import datetime, timedelta # noqa
@@ -21,8 +20,9 @@ from django.conf import settings # noqa
from archivebox import CONSTANTS # noqa
from archivebox.cli import * # noqa
from archivebox.config.configset import get_config
CONFIG = archivebox.pm.hook.get_FLAT_CONFIG()
CONFIG = get_config()
if __name__ == '__main__':
# load the rich extension for ipython for pretty printing
@@ -35,7 +35,7 @@ if __name__ == '__main__':
# print the welcome message
prnt('[green]import re, os, sys, psutil, subprocess, reqiests, json, pydantic, benedict, django, abx[/]')
prnt('[green]import re, os, sys, psutil, subprocess, requests, json, pydantic, benedict, django[/]')
prnt('[yellow4]# ArchiveBox Imports[/]')
prnt('[yellow4]import archivebox[/]')
prnt('[yellow4]from archivebox.cli import *[/]')

View File

@@ -345,19 +345,41 @@ class ExtendedEncoder(pyjson.JSONEncoder):
elif isinstance(obj, Exception):
return '{}: {}'.format(obj.__class__.__name__, obj)
elif isinstance(obj, Path):
return str(obj)
elif cls_name in ('dict_items', 'dict_keys', 'dict_values'):
return tuple(obj)
return list(obj)
elif isinstance(obj, Callable):
return str(obj)
# Try dict/list conversion as fallback
try:
return dict(obj)
except Exception:
pass
try:
return list(obj)
except Exception:
pass
try:
return str(obj)
except Exception:
pass
return pyjson.JSONEncoder.default(self, obj)
@enforce_types
def to_json(obj: Any, indent: Optional[int]=4, sort_keys: bool=True) -> str:
"""Serialize object to JSON string with extended type support"""
return pyjson.dumps(obj, indent=indent, sort_keys=sort_keys, cls=ExtendedEncoder)
### URL PARSING TESTS / ASSERTIONS
# Check that plain text regex URL parsing works as expected
@@ -452,3 +474,78 @@ _test_url_strs = {
for url_str, num_urls in _test_url_strs.items():
assert len(list(find_all_urls(url_str))) == num_urls, (
f'{url_str} does not contain {num_urls} urls')
### Chrome Helpers
def chrome_args(**options) -> List[str]:
    """Build the argv list for launching Chrome/Chromium with ArchiveBox's settings.

    Keyword options override the corresponding values loaded from
    archivebox.config (CHROME_BINARY, CHROME_HEADLESS, CHROME_SANDBOX,
    CHECK_SSL_VALIDITY, CHROME_USER_AGENT, RESOLUTION, CHROME_TIMEOUT,
    CHROME_USER_DATA_DIR).

    Returns:
        List[str]: full command line, starting with the chrome binary path.

    Raises:
        Exception: if no CHROME_BINARY is configured/installed.
    """
    # NOTE: removed unused `import shutil` that was dead code in this function
    from archivebox.config import CHECK_SSL_VALIDITY, RESOLUTION, USER_AGENT, CHROME_BINARY

    chrome_binary = options.get('CHROME_BINARY', CHROME_BINARY)
    chrome_headless = options.get('CHROME_HEADLESS', True)
    chrome_sandbox = options.get('CHROME_SANDBOX', True)
    check_ssl = options.get('CHECK_SSL_VALIDITY', CHECK_SSL_VALIDITY)
    user_agent = options.get('CHROME_USER_AGENT', USER_AGENT)
    resolution = options.get('RESOLUTION', RESOLUTION)
    timeout = options.get('CHROME_TIMEOUT', 0)    # seconds (0 disables the flag)
    user_data_dir = options.get('CHROME_USER_DATA_DIR', None)

    if not chrome_binary:
        raise Exception('Could not find any CHROME_BINARY installed on your system')

    cmd_args = [chrome_binary]

    if chrome_headless:
        cmd_args += ("--headless=new",)

    if not chrome_sandbox:
        # running in docker or other sandboxed environment
        cmd_args += (
            "--no-sandbox",
            "--no-zygote",
            "--disable-dev-shm-usage",
            "--disable-software-rasterizer",
            "--run-all-compositor-stages-before-draw",
            "--hide-scrollbars",
            "--autoplay-policy=no-user-gesture-required",
            "--no-first-run",
            "--use-fake-ui-for-media-stream",
            "--use-fake-device-for-media-stream",
            "--disable-sync",
        )

    if not check_ssl:
        cmd_args += ('--disable-web-security', '--ignore-certificate-errors')

    if user_agent:
        cmd_args += (f'--user-agent={user_agent}',)

    if resolution:
        cmd_args += (f'--window-size={resolution}',)

    if timeout:
        # chrome expects milliseconds, config stores seconds
        cmd_args += (f'--timeout={timeout * 1000}',)

    if user_data_dir:
        cmd_args += (f'--user-data-dir={user_data_dir}',)

    return cmd_args
def chrome_cleanup():
    """
    Remove stale runtime files that Chrome leaves behind when it is killed
    by a timeout or other error.
    """
    import os
    from archivebox.config.permissions import IN_DOCKER

    if not IN_DOCKER:
        return

    singleton_lock = "/home/archivebox/.config/chromium/SingletonLock"
    # lexists: the lock is usually a dangling symlink, which os.path.exists misses
    if os.path.lexists(singleton_lock):
        try:
            os.remove(singleton_lock)
        except OSError:
            # best-effort cleanup; ignore races / permission issues
            pass