mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-04 23:07:56 +10:00
wip major changes
This commit is contained in:
@@ -54,7 +54,7 @@ def check_data_folder() -> None:
|
||||
|
||||
def check_migrations():
|
||||
from archivebox import DATA_DIR
|
||||
from ..index.sql import list_migrations
|
||||
from archivebox.misc.db import list_migrations
|
||||
|
||||
pending_migrations = [name for status, name in list_migrations() if not status]
|
||||
is_migrating = any(arg in sys.argv for arg in ['makemigrations', 'migrate', 'init'])
|
||||
@@ -210,7 +210,7 @@ def check_lib_dir(lib_dir: Path | None = None, throw=False, quiet=False, must_ex
|
||||
|
||||
lib_dir = lib_dir or STORAGE_CONFIG.LIB_DIR
|
||||
|
||||
assert lib_dir == archivebox.pm.hook.get_LIB_DIR(), "lib_dir is not the same as the one in the flat config"
|
||||
# assert lib_dir == STORAGE_CONFIG.LIB_DIR, "lib_dir is not the same as the one in the flat config"
|
||||
|
||||
if not must_exist and not os.path.isdir(lib_dir):
|
||||
return True
|
||||
|
||||
57
archivebox/misc/db.py
Normal file
57
archivebox/misc/db.py
Normal file
@@ -0,0 +1,57 @@
|
||||
"""
|
||||
Database utility functions for ArchiveBox.
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.misc'
|
||||
|
||||
from io import StringIO
|
||||
from pathlib import Path
|
||||
from typing import List, Tuple
|
||||
|
||||
from archivebox.config import DATA_DIR
|
||||
from archivebox.misc.util import enforce_types
|
||||
|
||||
|
||||
@enforce_types
def list_migrations(out_dir: Path = DATA_DIR) -> List[Tuple[bool, str]]:
    """Return every Django migration as an (is_applied, name) tuple.

    Parses the output of `manage.py showmigrations --list`, where applied
    migrations are printed as "[X] name" and pending ones as "[ ] name".
    Lines without a ']' (e.g. app-name headers) are ignored.
    """
    from django.core.management import call_command

    buffer = StringIO()
    call_command("showmigrations", list=True, stdout=buffer)
    buffer.seek(0)

    results: List[Tuple[bool, str]] = []
    for raw_line in buffer.readlines():
        stripped = raw_line.strip()
        if not stripped or ']' not in stripped:
            continue
        # "[X] 0001_initial" -> checkbox="[X", remainder=" 0001_initial"
        checkbox, _, remainder = stripped.partition(']')
        results.append(('X' in checkbox, remainder.strip()))
    return results
|
||||
|
||||
|
||||
@enforce_types
def apply_migrations(out_dir: Path = DATA_DIR) -> List[str]:
    """Apply pending Django migrations and return their non-empty output lines.

    Runs all migrations on the 'default' database, then the huey_monitor
    app's migrations on the separate 'queue' database.
    """
    from django.core.management import call_command

    main_out, queue_out = StringIO(), StringIO()

    call_command("migrate", interactive=False, database='default', stdout=main_out)
    main_out.seek(0)
    call_command("migrate", "huey_monitor", interactive=False, database='queue', stdout=queue_out)
    queue_out.seek(0)

    # Collect every non-blank line of output from both migration runs
    output_lines: List[str] = []
    for stream in (main_out, queue_out):
        for raw_line in stream.readlines():
            stripped = raw_line.strip()
            if stripped:
                output_lines.append(stripped)
    return output_lines
|
||||
|
||||
|
||||
@enforce_types
def get_admins(out_dir: Path = DATA_DIR) -> List:
    """Return the queryset of superuser accounts, excluding the internal 'system' user."""
    from django.contrib.auth.models import User

    superusers = User.objects.filter(is_superuser=True)
    return superusers.exclude(username='system')
|
||||
215
archivebox/misc/folders.py
Normal file
215
archivebox/misc/folders.py
Normal file
@@ -0,0 +1,215 @@
|
||||
"""
|
||||
Folder status and integrity checking utilities for ArchiveBox.
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.misc'
|
||||
|
||||
import os
|
||||
import json
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from itertools import chain
|
||||
from typing import Dict, Optional, List, Tuple, TYPE_CHECKING
|
||||
|
||||
from django.db.models import QuerySet
|
||||
|
||||
from archivebox.config import DATA_DIR, CONSTANTS
|
||||
from archivebox.misc.util import enforce_types
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from core.models import Snapshot
|
||||
|
||||
|
||||
def _is_valid_snapshot(snapshot: 'Snapshot') -> bool:
|
||||
"""Check if a snapshot's data directory is valid"""
|
||||
dir_exists = Path(snapshot.output_dir).exists()
|
||||
index_exists = (Path(snapshot.output_dir) / "index.json").exists()
|
||||
if not dir_exists:
|
||||
return False
|
||||
if dir_exists and not index_exists:
|
||||
return False
|
||||
if dir_exists and index_exists:
|
||||
try:
|
||||
with open(Path(snapshot.output_dir) / "index.json", 'r') as f:
|
||||
data = json.load(f)
|
||||
return snapshot.url == data.get('url')
|
||||
except Exception:
|
||||
pass
|
||||
return False
|
||||
|
||||
|
||||
def _is_corrupt_snapshot(snapshot: 'Snapshot') -> bool:
    """A snapshot is corrupt when its data dir exists on disk but fails validation.

    A missing data dir is NOT corrupt (that's just "unarchived").
    """
    data_dir_present = Path(snapshot.output_dir).exists()
    return data_dir_present and not _is_valid_snapshot(snapshot)
|
||||
|
||||
|
||||
def get_indexed_folders(snapshots: QuerySet, out_dir: Path = DATA_DIR) -> Dict[str, 'Snapshot']:
    """indexed snapshots without checking archive status or data directory validity"""
    folders: Dict[str, 'Snapshot'] = {}
    # chunk_size keeps memory bounded on large archives
    for snapshot in snapshots.iterator(chunk_size=500):
        folders[snapshot.output_dir] = snapshot
    return folders
|
||||
|
||||
|
||||
def get_archived_folders(snapshots: QuerySet, out_dir: Path = DATA_DIR) -> Dict[str, 'Snapshot']:
    """indexed snapshots that are archived with a valid data directory"""
    archived: Dict[str, 'Snapshot'] = {}
    for snapshot in snapshots.iterator(chunk_size=500):
        if not snapshot.is_archived:
            continue
        archived[snapshot.output_dir] = snapshot
    return archived
|
||||
|
||||
|
||||
def get_unarchived_folders(snapshots: QuerySet, out_dir: Path = DATA_DIR) -> Dict[str, 'Snapshot']:
    """indexed snapshots that are unarchived with no data directory or an empty data directory"""
    unarchived: Dict[str, 'Snapshot'] = {}
    for snapshot in snapshots.iterator(chunk_size=500):
        if snapshot.is_archived:
            continue
        unarchived[snapshot.output_dir] = snapshot
    return unarchived
|
||||
|
||||
|
||||
def get_present_folders(snapshots: QuerySet, out_dir: Path = DATA_DIR) -> Dict[str, Optional['Snapshot']]:
    """dirs that actually exist in the archive/ folder

    Maps each on-disk dir name (a timestamp) to its matching Snapshot row,
    or None when no row with that timestamp exists.
    """
    from core.models import Snapshot

    present: Dict[str, Optional['Snapshot']] = {}
    for entry in (out_dir / CONSTANTS.ARCHIVE_DIR_NAME).iterdir():
        if not entry.is_dir():
            continue
        try:
            matched = Snapshot.objects.get(timestamp=entry.name)
        except Snapshot.DoesNotExist:
            matched = None
        present[entry.name] = matched
    return present
|
||||
|
||||
|
||||
def get_valid_folders(snapshots: QuerySet, out_dir: Path = DATA_DIR) -> Dict[str, 'Snapshot']:
    """dirs with a valid index matched to the main index and archived content"""
    valid: Dict[str, 'Snapshot'] = {}
    for snapshot in snapshots.iterator(chunk_size=500):
        if _is_valid_snapshot(snapshot):
            valid[snapshot.output_dir] = snapshot
    return valid
|
||||
|
||||
|
||||
def get_invalid_folders(snapshots: QuerySet, out_dir: Path = DATA_DIR) -> Dict[str, Optional['Snapshot']]:
    """dirs that are invalid for any reason: corrupted/duplicate/orphaned/unrecognized"""
    invalid: Dict[str, Optional['Snapshot']] = {}
    # Later finders overwrite earlier entries for the same path (same order
    # as the original dict merge: duplicate, orphaned, corrupted, unrecognized).
    for finder in (get_duplicate_folders, get_orphaned_folders, get_corrupted_folders, get_unrecognized_folders):
        invalid.update(finder(snapshots, out_dir=out_dir))
    return invalid
|
||||
|
||||
|
||||
def get_duplicate_folders(snapshots: QuerySet, out_dir: Path = DATA_DIR) -> Dict[str, Optional['Snapshot']]:
    """dirs that conflict with other directories that have the same URL or timestamp

    Walks the DB snapshots plus any archive/ dirs with no matching DB row,
    counting occurrences of each URL and timestamp. Every occurrence AFTER
    the first is recorded as a duplicate; the first folder seen for a given
    URL/timestamp is not flagged.
    """
    from core.models import Snapshot as SnapshotModel

    # Occurrence counters keyed by url / timestamp
    by_url: Dict[str, int] = {}
    by_timestamp: Dict[str, int] = {}
    duplicate_folders: Dict[str, Optional['Snapshot']] = {}

    # Lazily yields archive/ dirs whose folder name (a timestamp) has no row
    # in the given snapshots queryset
    data_folders = (
        str(entry)
        for entry in CONSTANTS.ARCHIVE_DIR.iterdir()
        if entry.is_dir() and not snapshots.filter(timestamp=entry.name).exists()
    )

    # DB snapshots first, then the extra on-disk folders
    for item in chain(snapshots.iterator(chunk_size=500), data_folders):
        snapshot = None
        if isinstance(item, str):
            # On-disk folder: try to resolve it back to a Snapshot by timestamp.
            # NOTE(review): data_folders only yields dirs absent from the
            # `snapshots` queryset, so this global lookup only hits when
            # `snapshots` is a filtered subset of all rows — confirm intent.
            path = item
            timestamp = Path(path).name
            try:
                snapshot = SnapshotModel.objects.get(timestamp=timestamp)
            except SnapshotModel.DoesNotExist:
                pass
        else:
            snapshot = item
            path = snapshot.output_dir

        if snapshot:
            # Flag this folder if its timestamp was already seen
            by_timestamp[snapshot.timestamp] = by_timestamp.get(snapshot.timestamp, 0) + 1
            if by_timestamp[snapshot.timestamp] > 1:
                duplicate_folders[path] = snapshot

            # Flag this folder if its URL was already seen
            by_url[snapshot.url] = by_url.get(snapshot.url, 0) + 1
            if by_url[snapshot.url] > 1:
                duplicate_folders[path] = snapshot
    return duplicate_folders
|
||||
|
||||
|
||||
def get_orphaned_folders(snapshots: QuerySet, out_dir: Path = DATA_DIR) -> Dict[str, Optional['Snapshot']]:
    """dirs that contain a valid index but aren't listed in the main index"""
    orphans: Dict[str, Optional['Snapshot']] = {}

    for entry in CONSTANTS.ARCHIVE_DIR.iterdir():
        if not entry.is_dir():
            continue
        # Orphan = has an index.json on disk but no DB row for this timestamp.
        # The DB query is only issued when the index file exists.
        if (entry / "index.json").exists() and not snapshots.filter(timestamp=entry.name).exists():
            orphans[str(entry)] = None
    return orphans
|
||||
|
||||
|
||||
def get_corrupted_folders(snapshots: QuerySet, out_dir: Path = DATA_DIR) -> Dict[str, 'Snapshot']:
    """dirs that exist but have corrupted/invalid index files"""
    return {
        snap.output_dir: snap
        for snap in snapshots.iterator(chunk_size=500)
        if _is_corrupt_snapshot(snap)
    }
|
||||
|
||||
|
||||
def get_unrecognized_folders(snapshots: QuerySet, out_dir: Path = DATA_DIR) -> Dict[str, None]:
    """dirs that don't contain recognizable archive data and aren't listed in the main index"""
    unrecognized: Dict[str, None] = {}
    archive_root = Path(out_dir) / CONSTANTS.ARCHIVE_DIR_NAME

    for entry in archive_root.iterdir():
        if not entry.is_dir():
            continue

        index_file = entry / "index.json"
        if index_file.exists():
            # Present but unparseable index.json -> unrecognizable folder
            try:
                with open(index_file, 'r') as fh:
                    json.load(fh)
            except Exception:
                unrecognized[str(entry)] = None
        elif not snapshots.filter(timestamp=entry.name).exists():
            # No index.json and no DB row matching the folder name
            unrecognized[str(entry)] = None
    return unrecognized
|
||||
|
||||
|
||||
@enforce_types
def fix_invalid_folder_locations(out_dir: Path = DATA_DIR) -> Tuple[List[str], List[str]]:
    """Move folders to their correct timestamp-named locations based on index.json.

    Scans every directory under archive/, reads its index.json, and when the
    folder's name does not match the timestamp recorded in its index, moves
    it to archive/<timestamp>.

    Returns:
        (fixed, cant_fix): paths of folders that were moved, and paths that
        could not be moved because the destination already exists.
    """
    fixed: List[str] = []
    cant_fix: List[str] = []

    for entry in os.scandir(out_dir / CONSTANTS.ARCHIVE_DIR_NAME):
        # Guard clauses flatten the original nesting; behavior is unchanged.
        if not entry.is_dir(follow_symlinks=True):
            continue
        index_path = Path(entry.path) / 'index.json'
        if not index_path.exists():
            continue

        try:
            with open(index_path, 'r') as f:
                # Fix: the original also read the unused 'url' field here;
                # only the timestamp is needed to relocate the folder.
                timestamp = json.load(f).get('timestamp')
        except Exception:
            # Corrupt/unreadable index.json: leave the folder where it is
            continue

        if not timestamp:
            continue

        if not entry.path.endswith(f'/{timestamp}'):
            dest = out_dir / CONSTANTS.ARCHIVE_DIR_NAME / timestamp
            if dest.exists():
                # A folder with the correct name already exists; needs manual merge
                cant_fix.append(entry.path)
            else:
                shutil.move(entry.path, str(dest))
                fixed.append(str(dest))
    return fixed, cant_fix
|
||||
@@ -4,71 +4,65 @@ from functools import lru_cache
|
||||
from pathlib import Path
|
||||
from typing import Callable
|
||||
from datetime import datetime
|
||||
import blake3 # pip install blake3
|
||||
|
||||
@lru_cache(maxsize=1024)
|
||||
def _cached_file_hashes(filepath: str, size: int, mtime: float) -> tuple[str, str]:
|
||||
"""Internal function to calculate file hashes with cache key based on path, size and mtime."""
|
||||
def _cached_file_hash(filepath: str, size: int, mtime: float) -> str:
|
||||
"""Internal function to calculate file hash with cache key based on path, size and mtime."""
|
||||
sha256_hash = hashlib.sha256()
|
||||
blake3_hash = blake3.blake3()
|
||||
|
||||
|
||||
with open(filepath, 'rb') as f:
|
||||
# Read file once and update both hashes simultaneously
|
||||
for chunk in iter(lambda: f.read(4096), b''):
|
||||
sha256_hash.update(chunk)
|
||||
blake3_hash.update(chunk)
|
||||
|
||||
return sha256_hash.hexdigest(), blake3_hash.hexdigest()
|
||||
|
||||
return sha256_hash.hexdigest()
|
||||
|
||||
@lru_cache(maxsize=10)
|
||||
def hash_file(file_path: Path, pwd: Path | None = None) -> tuple[str, str]:
|
||||
"""Calculate SHA256 and BLAKE3 hashes of a file with caching based on path, size and mtime."""
|
||||
def hash_file(file_path: Path, pwd: Path | None = None) -> str:
|
||||
"""Calculate SHA256 hash of a file with caching based on path, size and mtime."""
|
||||
pwd = Path(pwd) if pwd else None
|
||||
file_path = Path(file_path)
|
||||
if not file_path.is_absolute():
|
||||
file_path = pwd / file_path if pwd else file_path.absolute()
|
||||
|
||||
|
||||
abs_path = file_path.resolve()
|
||||
stat_info = abs_path.stat()
|
||||
|
||||
return _cached_file_hashes(
|
||||
|
||||
return _cached_file_hash(
|
||||
str(abs_path),
|
||||
stat_info.st_size,
|
||||
stat_info.st_mtime
|
||||
)
|
||||
|
||||
@lru_cache(maxsize=10)
|
||||
def get_dir_hashes(dir_path: Path, pwd: Path | None = None, filter_func: Callable | None = None, max_depth: int = -1) -> dict[str, tuple[str, str]]:
|
||||
"""Calculate SHA256 and BLAKE3 hashes for all files and directories recursively."""
|
||||
def get_dir_hashes(dir_path: Path, pwd: Path | None = None, filter_func: Callable | None = None, max_depth: int = -1) -> dict[str, str]:
|
||||
"""Calculate SHA256 hashes for all files and directories recursively."""
|
||||
pwd = Path(pwd) if pwd else None
|
||||
dir_path = Path(dir_path)
|
||||
if not dir_path.is_absolute():
|
||||
dir_path = pwd / dir_path if pwd else dir_path.absolute()
|
||||
|
||||
|
||||
if not dir_path.is_dir():
|
||||
raise ValueError(f"Not a directory: {dir_path}")
|
||||
if max_depth < -1:
|
||||
raise ValueError(f"max_depth must be >= -1, got {max_depth}")
|
||||
|
||||
|
||||
# Get all files recursively
|
||||
all_files = get_dir_entries(
|
||||
dir_path, pwd=pwd, recursive=True,
|
||||
include_files=True, include_dirs=False,
|
||||
filter_func=filter_func
|
||||
)
|
||||
|
||||
hashes: dict[str, tuple[str, str]] = {}
|
||||
hashable_summary_sha256 = []
|
||||
hashable_summary_blake3 = []
|
||||
|
||||
|
||||
hashes: dict[str, str] = {}
|
||||
hashable_summary = []
|
||||
|
||||
# Calculate hashes for all files
|
||||
for subfile in all_files:
|
||||
subfile_path = dir_path / subfile
|
||||
sha256_hash, blake3_hash = hash_file(subfile_path)
|
||||
hashes[subfile] = (sha256_hash, blake3_hash)
|
||||
hashable_summary_sha256.append(f"{sha256_hash} ./{subfile}")
|
||||
hashable_summary_blake3.append(f"{blake3_hash} ./{subfile}")
|
||||
|
||||
sha256_hash = hash_file(subfile_path)
|
||||
hashes[subfile] = sha256_hash
|
||||
hashable_summary.append(f"{sha256_hash} ./{subfile}")
|
||||
|
||||
# Calculate hashes for all directories
|
||||
subdirs = get_dir_entries(
|
||||
dir_path, pwd=pwd, recursive=True,
|
||||
@@ -76,7 +70,7 @@ def get_dir_hashes(dir_path: Path, pwd: Path | None = None, filter_func: Callabl
|
||||
include_hidden=False, filter_func=filter_func,
|
||||
max_depth=max_depth
|
||||
)
|
||||
|
||||
|
||||
for subdir in subdirs:
|
||||
subdir_path = dir_path / subdir
|
||||
subdir_hashes = get_dir_hashes(
|
||||
@@ -84,36 +78,34 @@ def get_dir_hashes(dir_path: Path, pwd: Path | None = None, filter_func: Callabl
|
||||
max_depth=0
|
||||
)
|
||||
hashes[subdir] = subdir_hashes['.']
|
||||
|
||||
|
||||
# Filter results by max_depth
|
||||
if max_depth >= 0:
|
||||
hashes = {
|
||||
path: value for path, value in hashes.items()
|
||||
if len(Path(path).parts) <= max_depth + 1
|
||||
}
|
||||
|
||||
# Calculate root directory hashes
|
||||
hashable_summary_sha256.sort()
|
||||
hashable_summary_blake3.sort()
|
||||
root_sha256 = hashlib.sha256('\n'.join(hashable_summary_sha256).encode()).hexdigest()
|
||||
root_blake3 = blake3.blake3('\n'.join(hashable_summary_blake3).encode()).hexdigest()
|
||||
hashes['.'] = (root_sha256, root_blake3)
|
||||
|
||||
|
||||
# Calculate root directory hash
|
||||
hashable_summary.sort()
|
||||
root_sha256 = hashlib.sha256('\n'.join(hashable_summary).encode()).hexdigest()
|
||||
hashes['.'] = root_sha256
|
||||
|
||||
return hashes
|
||||
|
||||
|
||||
@lru_cache(maxsize=128)
|
||||
def get_dir_entries(dir_path: Path, pwd: Path | None = None, recursive: bool = True,
|
||||
include_files: bool = True, include_dirs: bool = True, include_hidden: bool = False,
|
||||
def get_dir_entries(dir_path: Path, pwd: Path | None = None, recursive: bool = True,
|
||||
include_files: bool = True, include_dirs: bool = True, include_hidden: bool = False,
|
||||
filter_func: Callable | None = None, max_depth: int = -1) -> tuple[str, ...]:
|
||||
"""Get filtered list of directory entries."""
|
||||
pwd = Path(pwd) if pwd else None
|
||||
dir_path = Path(dir_path)
|
||||
if not dir_path.is_absolute():
|
||||
dir_path = pwd / dir_path if pwd else dir_path.absolute()
|
||||
|
||||
|
||||
results = []
|
||||
|
||||
|
||||
def process_path(path: Path, depth: int):
|
||||
if not include_hidden and path.name.startswith('.'):
|
||||
return False
|
||||
@@ -127,18 +119,18 @@ def get_dir_entries(dir_path: Path, pwd: Path | None = None, recursive: bool = T
|
||||
if not filter_func(info):
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
for path in dir_path.rglob('*') if recursive else dir_path.glob('*'):
|
||||
current_depth = len(path.relative_to(dir_path).parts)
|
||||
|
||||
|
||||
if path.is_file() and include_files and process_path(path, current_depth):
|
||||
results.append(str(path.relative_to(dir_path)))
|
||||
elif path.is_dir() and include_dirs and process_path(path, current_depth):
|
||||
results.append(str(path.relative_to(dir_path)))
|
||||
|
||||
|
||||
if not recursive:
|
||||
break
|
||||
|
||||
|
||||
return tuple(sorted(results)) # Make immutable for caching
|
||||
|
||||
@lru_cache(maxsize=1024)
|
||||
@@ -147,7 +139,7 @@ def get_dir_sizes(dir_path: Path, pwd: Path | None = None, **kwargs) -> dict[str
|
||||
sizes: dict[str, int] = {}
|
||||
hashes = get_dir_hashes(dir_path, pwd=pwd, **kwargs)
|
||||
dir_path = Path(dir_path)
|
||||
|
||||
|
||||
for path_key in hashes:
|
||||
full_path = dir_path / path_key
|
||||
if full_path.is_file():
|
||||
@@ -158,25 +150,25 @@ def get_dir_sizes(dir_path: Path, pwd: Path | None = None, **kwargs) -> dict[str
|
||||
if file_path.is_file() and not file_path.name.startswith('.'):
|
||||
total += file_path.stat().st_size
|
||||
sizes[path_key + '/'] = total
|
||||
|
||||
|
||||
return sizes
|
||||
|
||||
|
||||
@lru_cache(maxsize=10)
|
||||
def get_dir_info(dir_path: Path, pwd: Path | None = None, filter_func: Callable | None = None, max_depth: int = -1) -> dict:
|
||||
"""Get detailed information about directory contents including both hash types and sizes."""
|
||||
"""Get detailed information about directory contents including hashes and sizes."""
|
||||
pwd = Path(pwd) if pwd else None
|
||||
dir_path = Path(dir_path)
|
||||
if not dir_path.is_absolute():
|
||||
dir_path = pwd / dir_path if pwd else dir_path.absolute()
|
||||
|
||||
|
||||
hashes = get_dir_hashes(dir_path, pwd=pwd, filter_func=filter_func, max_depth=max_depth)
|
||||
sizes = get_dir_sizes(str(dir_path), pwd=pwd, filter_func=filter_func, max_depth=max_depth)
|
||||
|
||||
|
||||
num_total_subpaths = sum(1 for name in hashes if name != '.')
|
||||
details = {}
|
||||
|
||||
for filename, (sha256_hash, blake3_hash) in sorted(hashes.items()):
|
||||
|
||||
for filename, sha256_hash in sorted(hashes.items()):
|
||||
abs_path = (dir_path / filename).resolve()
|
||||
stat_info = abs_path.stat()
|
||||
num_subpaths = sum(1 for p in hashes if p.startswith(filename + '/'))
|
||||
@@ -197,7 +189,7 @@ def get_dir_info(dir_path: Path, pwd: Path | None = None, filter_func: Callable
|
||||
extension = abs_path.suffix
|
||||
basename = abs_path.name.rsplit(extension, 1)[0]
|
||||
num_bytes = sizes[filename]
|
||||
|
||||
|
||||
details[filename] = {
|
||||
'basename': basename,
|
||||
'mime_type': mime_type,
|
||||
@@ -205,14 +197,13 @@ def get_dir_info(dir_path: Path, pwd: Path | None = None, filter_func: Callable
|
||||
'num_subpaths': num_subpaths,
|
||||
'num_bytes': num_bytes,
|
||||
'hash_sha256': sha256_hash,
|
||||
'hash_blake3': blake3_hash,
|
||||
'created_at': datetime.fromtimestamp(stat_info.st_ctime).isoformat(),
|
||||
'modified_at': datetime.fromtimestamp(stat_info.st_mtime).isoformat(),
|
||||
}
|
||||
|
||||
|
||||
if filter_func and not filter_func(details[filename]):
|
||||
del details[filename]
|
||||
|
||||
|
||||
return details
|
||||
|
||||
|
||||
@@ -221,7 +212,7 @@ if __name__ == '__main__':
|
||||
dir_info = get_dir_info(Path('.'), max_depth=6)
|
||||
with open('.hashes.json', 'w') as f:
|
||||
json.dump(dir_info, f, indent=4)
|
||||
print('√ Wrote .hashes.json')
|
||||
print('Wrote .hashes.json')
|
||||
|
||||
# Example output:
|
||||
# {
|
||||
@@ -232,7 +223,6 @@ if __name__ == '__main__':
|
||||
# "num_subpaths": 25,
|
||||
# "num_bytes": 214677,
|
||||
# "hash_sha256": "addfacf88b2ff6b564846415fb7b21dcb7e63ee4e911bc0aec255ee354958530",
|
||||
# "hash_blake3": "3403a1f876453c7749f17ee3502769eff05cff20b5d6c2f2cf458e6353a380db",
|
||||
# "created_at": "2024-12-04T00:08:38.537449",
|
||||
# "modified_at": "2024-12-04T00:08:38.537449"
|
||||
# },
|
||||
@@ -243,31 +233,8 @@ if __name__ == '__main__':
|
||||
# "num_subpaths": null,
|
||||
# "num_bytes": 32,
|
||||
# "hash_sha256": "b0e5e7ff17db3b60535cf664282787767c336e3e203a43e21b6326c6fe457551",
|
||||
# "hash_blake3": "4a801eb2a4cdde8d3422be1e2074b78574a5890afb3027cbe6f3b3cf4d113fd1",
|
||||
# "created_at": "2024-10-08T00:51:41.001359",
|
||||
# "modified_at": "2024-10-08T00:51:41.001359"
|
||||
# },
|
||||
# "__pycache__/": {
|
||||
# "basename": "__pycache__",
|
||||
# "mime_type": "inode/directory",
|
||||
# "extension": "",
|
||||
# "num_subpaths": 8,
|
||||
# "num_bytes": 107593,
|
||||
# "hash_sha256": "9e917a438be774ffc7ea9125de71008c29a7d9003b6f5e09e2085aa1ef3157b3",
|
||||
# "hash_blake3": "e87184485bd67bd9b723a9ee4d472e8c1d24a4388d373046a27e5a1e10467a06",
|
||||
# "created_at": "2024-12-04T00:00:16.149390",
|
||||
# "modified_at": "2024-12-04T00:00:16.149390"
|
||||
# },
|
||||
# "__pycache__/__init__.cpython-313.pyc": {
|
||||
# "basename": "__init__.cpython-313",
|
||||
# "mime_type": "application/x-python-code",
|
||||
# "extension": ".pyc",
|
||||
# "num_subpaths": null,
|
||||
# "num_bytes": 223,
|
||||
# "hash_sha256": "d29e3ee5e6b9b564422d9ef2c7325d28cf759b9fb868f59551ba43cd991d51be",
|
||||
# "hash_blake3": "279a6dc4c8161d6ddb18fa72c882f375324ed152dc6c7c7eac9ef5fdd066f2fd",
|
||||
# "created_at": "2024-12-03T03:13:43.257430",
|
||||
# "modified_at": "2024-12-03T03:13:43.257308"
|
||||
# },
|
||||
# ...
|
||||
# }
|
||||
|
||||
343
archivebox/misc/jsonl.py
Normal file
343
archivebox/misc/jsonl.py
Normal file
@@ -0,0 +1,343 @@
|
||||
"""
|
||||
JSONL (JSON Lines) utilities for ArchiveBox.
|
||||
|
||||
Provides functions for reading, writing, and processing typed JSONL records.
|
||||
All CLI commands that accept stdin can read both plain URLs and typed JSONL.
|
||||
|
||||
Typed JSONL Format:
|
||||
{"type": "Snapshot", "url": "https://example.com", "title": "...", "tags": "..."}
|
||||
{"type": "ArchiveResult", "snapshot_id": "...", "extractor": "wget", ...}
|
||||
{"type": "Tag", "name": "..."}
|
||||
|
||||
Plain URLs (also supported):
|
||||
https://example.com
|
||||
https://foo.com
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.misc'
|
||||
|
||||
import sys
|
||||
import json
|
||||
from typing import Iterator, Dict, Any, Optional, TextIO, Callable, Union, List
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
# Type constants for JSONL records
|
||||
TYPE_SNAPSHOT = 'Snapshot'
|
||||
TYPE_ARCHIVERESULT = 'ArchiveResult'
|
||||
TYPE_TAG = 'Tag'
|
||||
TYPE_CRAWL = 'Crawl'
|
||||
TYPE_SEED = 'Seed'
|
||||
TYPE_INSTALLEDBINARY = 'InstalledBinary'
|
||||
|
||||
VALID_TYPES = {TYPE_SNAPSHOT, TYPE_ARCHIVERESULT, TYPE_TAG, TYPE_CRAWL, TYPE_SEED, TYPE_INSTALLEDBINARY}
|
||||
|
||||
|
||||
def parse_line(line: str) -> Optional[Dict[str, Any]]:
    """
    Parse one line of input as typed JSONL, a plain URL, or a snapshot UUID.

    Returns a dict with at minimum {'type': '...', 'url': '...'} or None for
    blank lines, comments, and unrecognized input.
    """
    text = line.strip()

    # Skip blanks and comment lines
    if not text or text.startswith('#'):
        return None

    # JSON object: parse it, defaulting the type to Snapshot when only a url
    # is present. Records with unknown 'type' values are passed through as-is.
    if text.startswith('{'):
        try:
            record = json.loads(text)
        except json.JSONDecodeError:
            record = None
        if record is not None:
            if 'url' in record and 'type' not in record:
                record['type'] = TYPE_SNAPSHOT
            return record

    # Bare URL -> Snapshot record
    if text.startswith(('http://', 'https://', 'file://')):
        return {'type': TYPE_SNAPSHOT, 'url': text}

    # UUID-shaped string (36 chars, 4 dashes) -> snapshot id reference
    if len(text) == 36 and text.count('-') == 4:
        return {'type': TYPE_SNAPSHOT, 'id': text}

    # Unknown format, skip
    return None
|
||||
|
||||
|
||||
def read_stdin(stream: Optional[TextIO] = None) -> Iterator[Dict[str, Any]]:
    """
    Yield parsed records from stdin (or the given stream).

    Accepts both typed JSONL and plain URLs, one per line. Produces nothing
    when the stream is an interactive tty, to avoid blocking on input.
    """
    source = stream or sys.stdin

    # Interactive terminal with nothing piped in: don't block waiting for input
    if source.isatty():
        return

    for raw_line in source:
        parsed = parse_line(raw_line)
        if parsed:
            yield parsed
|
||||
|
||||
|
||||
def read_file(path: Path) -> Iterator[Dict[str, Any]]:
    """
    Yield parsed records from a file of JSONL lines and/or plain URLs.

    Unparseable lines are skipped silently.
    """
    with open(path, 'r') as fh:
        for raw_line in fh:
            parsed = parse_line(raw_line)
            if parsed:
                yield parsed
|
||||
|
||||
|
||||
def read_args_or_stdin(args: tuple, stream: Optional[TextIO] = None) -> Iterator[Dict[str, Any]]:
    """
    Yield records from CLI arguments when given, otherwise from stdin.

    Each arg may be a path to a file of URLs/JSONL, or a single URL/JSONL
    value itself. Handles both formats from either source.
    """
    if not args:
        yield from read_stdin(stream)
        return

    for arg in args:
        candidate = Path(arg)
        # An existing regular file is read line-by-line; anything else is
        # parsed as a single URL/JSONL value.
        if candidate.is_file():
            yield from read_file(candidate)
            continue
        parsed = parse_line(arg)
        if parsed:
            yield parsed
|
||||
|
||||
|
||||
def write_record(record: Dict[str, Any], stream: Optional[TextIO] = None) -> None:
    """Serialize one record as a JSON line and flush it to the stream (stdout by default)."""
    target = stream or sys.stdout
    payload = json.dumps(record)
    target.write(f"{payload}\n")
    target.flush()
|
||||
|
||||
|
||||
def write_records(records: Iterator[Dict[str, Any]], stream: Optional[TextIO] = None) -> int:
    """
    Write each record as one JSONL line to stdout (or the provided stream).

    Returns the number of records written.
    """
    total = 0
    for rec in records:
        write_record(rec, stream)
        total += 1
    return total
|
||||
|
||||
|
||||
def filter_by_type(records: Iterator[Dict[str, Any]], record_type: str) -> Iterator[Dict[str, Any]]:
    """Yield only the records whose 'type' field equals record_type."""
    yield from (rec for rec in records if rec.get('type') == record_type)
|
||||
|
||||
|
||||
def snapshot_to_jsonl(snapshot) -> Dict[str, Any]:
    """Serialize a Snapshot model instance into a typed JSONL record dict."""
    bookmarked = snapshot.bookmarked_at
    created = snapshot.created_at
    return {
        'type': TYPE_SNAPSHOT,
        'id': str(snapshot.id),
        'url': snapshot.url,
        'title': snapshot.title,
        # tags_str may not exist on all model versions
        'tags': snapshot.tags_str() if hasattr(snapshot, 'tags_str') else '',
        'bookmarked_at': bookmarked.isoformat() if bookmarked else None,
        'created_at': created.isoformat() if created else None,
        'timestamp': snapshot.timestamp,
        'depth': getattr(snapshot, 'depth', 0),
        'status': snapshot.status if hasattr(snapshot, 'status') else None,
    }
|
||||
|
||||
|
||||
def archiveresult_to_jsonl(result) -> Dict[str, Any]:
    """Serialize an ArchiveResult model instance into a typed JSONL record dict."""
    record: Dict[str, Any] = {
        'type': TYPE_ARCHIVERESULT,
        'id': str(result.id),
        'snapshot_id': str(result.snapshot_id),
        'extractor': result.extractor,
        'status': result.status,
        'output': result.output,
    }
    # Timestamps serialize to ISO-8601 strings, or None when unset
    record['start_ts'] = result.start_ts.isoformat() if result.start_ts else None
    record['end_ts'] = result.end_ts.isoformat() if result.end_ts else None
    return record
|
||||
|
||||
|
||||
def tag_to_jsonl(tag) -> Dict[str, Any]:
    """Serialize a Tag model instance into a typed JSONL record dict."""
    return dict(
        type=TYPE_TAG,
        id=str(tag.id),
        name=tag.name,
        slug=tag.slug,
    )
|
||||
|
||||
|
||||
def crawl_to_jsonl(crawl) -> Dict[str, Any]:
    """Serialize a Crawl model instance into a typed JSONL record dict."""
    created = crawl.created_at
    return {
        'type': TYPE_CRAWL,
        'id': str(crawl.id),
        'seed_id': str(crawl.seed_id),
        'status': crawl.status,
        'max_depth': crawl.max_depth,
        'created_at': created.isoformat() if created else None,
    }
|
||||
|
||||
|
||||
def process_records(
    records: Iterator[Dict[str, Any]],
    handlers: Dict[str, Callable[[Dict[str, Any]], Optional[Dict[str, Any]]]]
) -> Iterator[Dict[str, Any]]:
    """
    Dispatch each record to the handler registered for its 'type'.

    Args:
        records: Input record iterator
        handlers: Dict mapping type names to handler functions; a handler
            returns an output record, or a falsy value to suppress output

    Yields truthy handler results; records without a registered handler are
    skipped.
    """
    for rec in records:
        handler = handlers.get(rec.get('type'))
        if handler is None:
            continue
        output = handler(rec)
        if output:
            yield output
|
||||
|
||||
|
||||
def get_or_create_snapshot(record: Dict[str, Any], created_by_id: Optional[int] = None):
    """
    Get or create a Snapshot from a JSONL record.

    The record must contain 'url'; optional fields ('title', 'tags',
    'bookmarked_at', 'depth', 'crawl_id') are applied when present.

    Raises:
        ValueError: if the record has no 'url' field.

    Returns the Snapshot instance.
    """
    from core.models import Snapshot
    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.misc.util import parse_date

    # Fall back to the internal system user when no creator is given
    created_by_id = created_by_id or get_or_create_system_user_pk()

    # Extract fields from record
    url = record.get('url')
    if not url:
        raise ValueError("Record missing required 'url' field")

    title = record.get('title')
    tags_str = record.get('tags', '')
    bookmarked_at = record.get('bookmarked_at')
    depth = record.get('depth', 0)
    crawl_id = record.get('crawl_id')

    # Parse bookmarked_at if string (JSONL carries it as ISO text)
    if bookmarked_at and isinstance(bookmarked_at, str):
        bookmarked_at = parse_date(bookmarked_at)

    # Use the manager's create_or_update_from_dict method to dedupe by URL
    snapshot = Snapshot.objects.create_or_update_from_dict(
        {'url': url, 'title': title, 'tags': tags_str},
        created_by_id=created_by_id
    )

    # Update additional fields if provided, saving only what actually changed.
    # NOTE(review): `if depth` / `if bookmarked_at` / `if crawl_id` skip falsy
    # values, so a record can't reset depth back to 0 — presumably intended.
    update_fields = []
    if depth and snapshot.depth != depth:
        snapshot.depth = depth
        update_fields.append('depth')
    if bookmarked_at and snapshot.bookmarked_at != bookmarked_at:
        snapshot.bookmarked_at = bookmarked_at
        update_fields.append('bookmarked_at')
    if crawl_id and str(snapshot.crawl_id) != str(crawl_id):
        snapshot.crawl_id = crawl_id
        update_fields.append('crawl_id')

    if update_fields:
        # modified_at is bumped alongside any real field change
        snapshot.save(update_fields=update_fields + ['modified_at'])

    return snapshot
|
||||
|
||||
|
||||
def get_or_create_tag(record: Dict[str, Any]):
    """
    Get or create a Tag from a JSONL record.

    Args:
        record: JSONL record dict; must contain a non-empty 'name'.

    Returns:
        The Tag instance.

    Raises:
        ValueError: If the record has no 'name' field.
    """
    from core.models import Tag

    tag_name = record.get('name')
    if not tag_name:
        raise ValueError("Record missing required 'name' field")

    tag, _created = Tag.objects.get_or_create(name=tag_name)
    return tag
|
||||
|
||||
|
||||
def process_jsonl_records(records: Iterator[Dict[str, Any]], created_by_id: Optional[int] = None) -> Dict[str, List]:
    """
    Process JSONL records, creating Tags and Snapshots as needed.

    Args:
        records: Iterator of JSONL record dicts
        created_by_id: User ID for created objects (defaults to system user)

    Returns:
        Dict with 'tags' and 'snapshots' lists of created objects
    """
    from archivebox.base_models.models import get_or_create_system_user_pk

    owner_id = created_by_id or get_or_create_system_user_pk()

    results: Dict[str, List] = {'tags': [], 'snapshots': []}

    for record in records:
        record_type = record.get('type', TYPE_SNAPSHOT)

        if record_type == TYPE_TAG:
            try:
                results['tags'].append(get_or_create_tag(record))
            except ValueError:
                # skip malformed tag records (missing 'name')
                continue
        elif record_type == TYPE_SNAPSHOT or 'url' in record:
            try:
                results['snapshots'].append(get_or_create_snapshot(record, created_by_id=owner_id))
            except ValueError:
                # skip malformed snapshot records (missing 'url')
                continue

    return results
|
||||
90
archivebox/misc/legacy.py
Normal file
90
archivebox/misc/legacy.py
Normal file
@@ -0,0 +1,90 @@
|
||||
"""
|
||||
Legacy archive import utilities.
|
||||
|
||||
These functions are used to import data from old ArchiveBox archive formats
|
||||
(JSON indexes, archive directory structures) into the new database.
|
||||
|
||||
This is separate from the hooks-based parser system which handles importing
|
||||
new URLs from bookmark files, RSS feeds, etc.
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.misc'
|
||||
|
||||
import os
|
||||
import json
|
||||
from pathlib import Path
|
||||
from datetime import datetime, timezone
|
||||
from typing import Iterator, TypedDict, List
|
||||
|
||||
|
||||
class SnapshotDict(TypedDict, total=False):
    """
    Dictionary type representing a snapshot/link, compatible with Snapshot model fields.

    All keys are optional at the type level (``total=False``); producers in
    this module always provide at least ``url``.
    """
    url: str  # Required in practice: the URL to archive
    timestamp: str  # Optional: unix timestamp string
    title: str  # Optional: page title
    tags: str  # Optional: comma-separated tags string
    sources: List[str]  # Optional: list of source file paths
|
||||
|
||||
|
||||
def parse_json_main_index(out_dir: Path) -> Iterator[SnapshotDict]:
    """
    Parse links from the main JSON index file (archive/index.json).

    Used to recover links from old archive formats. Yields nothing if the
    index file is missing or cannot be parsed.
    """
    from archivebox.config import CONSTANTS

    index_path = out_dir / CONSTANTS.JSON_INDEX_FILENAME
    if not index_path.exists():
        return

    try:
        with open(index_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        for link in data.get('links', []):
            yield {
                'url': link.get('url', ''),
                # fall back to "now" when the legacy entry has no timestamp
                'timestamp': link.get('timestamp', str(datetime.now(timezone.utc).timestamp())),
                'title': link.get('title'),
                'tags': link.get('tags', ''),
            }
    except (json.JSONDecodeError, KeyError, TypeError):
        # treat a corrupt/unreadable index the same as a missing one
        return
|
||||
|
||||
|
||||
def parse_json_links_details(out_dir: Path) -> Iterator[SnapshotDict]:
    """
    Parse links from individual snapshot index.json files in archive directories.

    Walks through archive/*/index.json files to discover orphaned snapshots;
    directories with missing or unparseable indexes are skipped.
    """
    from archivebox.config import CONSTANTS

    archive_dir = out_dir / CONSTANTS.ARCHIVE_DIR_NAME
    if not archive_dir.exists():
        return

    for entry in os.scandir(archive_dir):
        if not entry.is_dir():
            continue

        index_file = Path(entry.path) / 'index.json'
        if not index_file.exists():
            continue

        try:
            with open(index_file, 'r', encoding='utf-8') as f:
                link = json.load(f)

            yield {
                'url': link.get('url', ''),
                # the directory name is the snapshot timestamp in legacy layouts
                'timestamp': link.get('timestamp', entry.name),
                'title': link.get('title'),
                'tags': link.get('tags', ''),
            }
        except (json.JSONDecodeError, KeyError, TypeError):
            continue
|
||||
@@ -1,7 +1,7 @@
|
||||
__package__ = 'archivebox.misc'
|
||||
|
||||
# TODO: merge/dedupe this file with archivebox/logging_util.py
|
||||
|
||||
# Low-level logging primitives (Rich console, ANSI colors, stdout/stderr helpers)
|
||||
# Higher-level logging functions are in logging_util.py
|
||||
|
||||
import sys
|
||||
from typing import Optional, Union, Tuple, List
|
||||
|
||||
@@ -1,9 +1,11 @@
|
||||
__package__ = 'archivebox'
|
||||
|
||||
# High-level logging functions for CLI output and progress tracking
|
||||
# Low-level primitives (Rich console, ANSI colors) are in logging.py
|
||||
|
||||
import re
|
||||
import os
|
||||
import sys
|
||||
import stat
|
||||
import time
|
||||
|
||||
from math import log
|
||||
@@ -15,7 +17,7 @@ from dataclasses import dataclass
|
||||
from typing import Any, Optional, List, Dict, Union, Iterable, IO, TYPE_CHECKING
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from ..index.schema import Link, ArchiveResult
|
||||
from core.models import Snapshot
|
||||
|
||||
from rich import print
|
||||
from rich.panel import Panel
|
||||
@@ -48,77 +50,6 @@ class RuntimeStats:
|
||||
_LAST_RUN_STATS = RuntimeStats()
|
||||
|
||||
|
||||
def debug_dict_summary(obj: Dict[Any, Any]) -> None:
|
||||
stderr(' '.join(f'{key}={str(val).ljust(6)}' for key, val in obj.items()))
|
||||
|
||||
|
||||
def get_fd_info(fd) -> Dict[str, Any]:
|
||||
NAME = fd.name[1:-1]
|
||||
FILENO = fd.fileno()
|
||||
MODE = os.fstat(FILENO).st_mode
|
||||
IS_TTY = hasattr(fd, 'isatty') and fd.isatty()
|
||||
IS_PIPE = stat.S_ISFIFO(MODE)
|
||||
IS_FILE = stat.S_ISREG(MODE)
|
||||
IS_TERMINAL = not (IS_PIPE or IS_FILE)
|
||||
IS_LINE_BUFFERED = fd.line_buffering
|
||||
IS_READABLE = fd.readable()
|
||||
return {
|
||||
'NAME': NAME, 'FILENO': FILENO, 'MODE': MODE,
|
||||
'IS_TTY': IS_TTY, 'IS_PIPE': IS_PIPE, 'IS_FILE': IS_FILE,
|
||||
'IS_TERMINAL': IS_TERMINAL, 'IS_LINE_BUFFERED': IS_LINE_BUFFERED,
|
||||
'IS_READABLE': IS_READABLE,
|
||||
}
|
||||
|
||||
|
||||
# # Log debug information about stdin, stdout, and stderr
|
||||
# sys.stdout.write('[>&1] this is python stdout\n')
|
||||
# sys.stderr.write('[>&2] this is python stderr\n')
|
||||
|
||||
# debug_dict_summary(get_fd_info(sys.stdin))
|
||||
# debug_dict_summary(get_fd_info(sys.stdout))
|
||||
# debug_dict_summary(get_fd_info(sys.stderr))
|
||||
|
||||
|
||||
def reject_stdin(caller: str, stdin: Optional[IO]=sys.stdin) -> None:
|
||||
"""Tell the user they passed stdin to a command that doesn't accept it"""
|
||||
|
||||
if not stdin:
|
||||
return None
|
||||
|
||||
if os.environ.get('IN_DOCKER') in ('1', 'true', 'True', 'TRUE', 'yes'):
|
||||
# when TTY is disabled in docker we cant tell if stdin is being piped in or not
|
||||
# if we try to read stdin when its not piped we will hang indefinitely waiting for it
|
||||
return None
|
||||
|
||||
if not stdin.isatty():
|
||||
# stderr('READING STDIN TO REJECT...')
|
||||
stdin_raw_text = stdin.read()
|
||||
if stdin_raw_text.strip():
|
||||
# stderr('GOT STDIN!', len(stdin_str))
|
||||
stderr(f'[!] The "{caller}" command does not accept stdin (ignoring).', color='red')
|
||||
stderr(f' Run archivebox "{caller} --help" to see usage and examples.')
|
||||
stderr()
|
||||
# raise SystemExit(1)
|
||||
return None
|
||||
|
||||
|
||||
def accept_stdin(stdin: Optional[IO]=sys.stdin) -> Optional[str]:
|
||||
"""accept any standard input and return it as a string or None"""
|
||||
|
||||
if not stdin:
|
||||
return None
|
||||
|
||||
if not stdin.isatty():
|
||||
# stderr('READING STDIN TO ACCEPT...')
|
||||
stdin_str = stdin.read()
|
||||
|
||||
if stdin_str:
|
||||
# stderr('GOT STDIN...', len(stdin_str))
|
||||
return stdin_str
|
||||
|
||||
return None
|
||||
|
||||
|
||||
class TimedProgress:
|
||||
"""Show a progress bar and measure elapsed time until .end() is called"""
|
||||
|
||||
@@ -353,7 +284,7 @@ def log_archiving_finished(num_links: int):
|
||||
print(' archivebox server 0.0.0.0:8000')
|
||||
|
||||
|
||||
def log_link_archiving_started(link: "Link", link_dir: str, is_new: bool):
|
||||
def log_snapshot_archiving_started(snapshot: "Snapshot", out_dir: str, is_new: bool):
|
||||
|
||||
# [*] [2019-03-22 13:46:45] "Log Structured Merge Trees - ben stopford"
|
||||
# http://www.benstopford.com/2015/02/14/log-structured-merge-trees/
|
||||
@@ -363,15 +294,15 @@ def log_link_archiving_started(link: "Link", link_dir: str, is_new: bool):
|
||||
symbol_color='green' if is_new else 'bright_black',
|
||||
symbol='+' if is_new else '√',
|
||||
now=datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S'),
|
||||
title=link.title or link.base_url,
|
||||
title=snapshot.title or snapshot.base_url,
|
||||
))
|
||||
print(f' [sky_blue1]{link.url}[/]')
|
||||
print(f' [sky_blue1]{snapshot.url}[/]')
|
||||
print(' {} {}'.format(
|
||||
'>' if is_new else '√',
|
||||
pretty_path(link_dir),
|
||||
pretty_path(out_dir),
|
||||
))
|
||||
|
||||
def log_link_archiving_finished(link: "Link", link_dir: str, is_new: bool, stats: dict, start_ts: datetime):
|
||||
def log_snapshot_archiving_finished(snapshot: "Snapshot", out_dir: str, is_new: bool, stats: dict, start_ts: datetime):
|
||||
total = sum(stats.values())
|
||||
|
||||
if stats['failed'] > 0 :
|
||||
@@ -382,7 +313,7 @@ def log_link_archiving_finished(link: "Link", link_dir: str, is_new: bool, stats
|
||||
_LAST_RUN_STATS.succeeded += 1
|
||||
|
||||
try:
|
||||
size = get_dir_size(link_dir)
|
||||
size = get_dir_size(out_dir)
|
||||
except FileNotFoundError:
|
||||
size = (0, None, '0')
|
||||
|
||||
@@ -391,38 +322,38 @@ def log_link_archiving_finished(link: "Link", link_dir: str, is_new: bool, stats
|
||||
print(' [bright_black]{} files ({}) in {}s [/]'.format(size[2], printable_filesize(size[0]), duration))
|
||||
|
||||
|
||||
|
||||
def log_archive_method_started(method: str):
|
||||
print(' > {}'.format(method))
|
||||
|
||||
|
||||
def log_archive_method_finished(result: "ArchiveResult"):
|
||||
def log_archive_method_finished(result: dict):
|
||||
"""
|
||||
quote the argument with whitespace in a command so the user can
|
||||
quote the argument with whitespace in a command so the user can
|
||||
copy-paste the outputted string directly to run the cmd
|
||||
"""
|
||||
# Prettify CMD string and make it safe to copy-paste by quoting arguments
|
||||
quoted_cmd = ' '.join(
|
||||
'"{}"'.format(arg) if (' ' in arg) or (':' in arg) else arg
|
||||
for arg in result.cmd
|
||||
for arg in result['cmd']
|
||||
)
|
||||
|
||||
if result.status == 'failed':
|
||||
if result.output.__class__.__name__ == 'TimeoutExpired':
|
||||
duration = (result.end_ts - result.start_ts).seconds
|
||||
if result['status'] == 'failed':
|
||||
output = result.get('output')
|
||||
if output and output.__class__.__name__ == 'TimeoutExpired':
|
||||
duration = (result['end_ts'] - result['start_ts']).seconds
|
||||
hint_header = [
|
||||
f'[yellow3]Extractor timed out after {duration}s.[/]',
|
||||
]
|
||||
else:
|
||||
error_name = result.output.__class__.__name__.replace('ArchiveError', '')
|
||||
error_name = output.__class__.__name__.replace('ArchiveError', '') if output else 'Error'
|
||||
hint_header = [
|
||||
'[yellow3]Extractor failed:[/]',
|
||||
f' {error_name} [red1]{result.output}[/]',
|
||||
f' {error_name} [red1]{output}[/]',
|
||||
]
|
||||
|
||||
# import pudb; pudb.set_trace()
|
||||
|
||||
# Prettify error output hints string and limit to five lines
|
||||
hints = getattr(result.output, 'hints', None) or ()
|
||||
hints = getattr(output, 'hints', None) or () if output else ()
|
||||
if hints:
|
||||
if isinstance(hints, (list, tuple, type(_ for _ in ()))):
|
||||
hints = [hint.decode() if isinstance(hint, bytes) else str(hint) for hint in hints]
|
||||
@@ -448,7 +379,7 @@ def log_archive_method_finished(result: "ArchiveResult"):
|
||||
*hints,
|
||||
'[violet]Run to see full output:[/]',
|
||||
*docker_hints,
|
||||
*([' cd {};'.format(result.pwd)] if result.pwd else []),
|
||||
*([' cd {};'.format(result.get('pwd'))] if result.get('pwd') else []),
|
||||
' {}'.format(quoted_cmd),
|
||||
]
|
||||
print('\n'.join(
|
||||
@@ -463,21 +394,22 @@ def log_list_started(filter_patterns: Optional[List[str]], filter_type: str):
|
||||
print(f'[green][*] Finding links in the archive index matching these {filter_type} patterns:[/]')
|
||||
print(' {}'.format(' '.join(filter_patterns or ())))
|
||||
|
||||
def log_list_finished(links):
|
||||
from archivebox.index.csv import links_to_csv
|
||||
def log_list_finished(snapshots):
|
||||
from core.models import Snapshot
|
||||
print()
|
||||
print('---------------------------------------------------------------------------------------------------')
|
||||
print(links_to_csv(links, cols=['timestamp', 'is_archived', 'num_outputs', 'url'], header=True, ljust=16, separator=' | '))
|
||||
print(Snapshot.objects.filter(pk__in=[s.pk for s in snapshots]).to_csv(cols=['timestamp', 'is_archived', 'num_outputs', 'url'], header=True, ljust=16, separator=' | '))
|
||||
print('---------------------------------------------------------------------------------------------------')
|
||||
print()
|
||||
|
||||
|
||||
def log_removal_started(links: List["Link"], yes: bool, delete: bool):
|
||||
print(f'[yellow3][i] Found {len(links)} matching URLs to remove.[/]')
|
||||
def log_removal_started(snapshots, yes: bool, delete: bool):
|
||||
count = snapshots.count() if hasattr(snapshots, 'count') else len(snapshots)
|
||||
print(f'[yellow3][i] Found {count} matching URLs to remove.[/]')
|
||||
if delete:
|
||||
file_counts = [link.num_outputs for link in links if os.access(link.link_dir, os.R_OK)]
|
||||
file_counts = [s.num_outputs for s in snapshots if os.access(s.output_dir, os.R_OK)]
|
||||
print(
|
||||
f' {len(links)} Links will be de-listed from the main index, and their archived content folders will be deleted from disk.\n'
|
||||
f' {count} Links will be de-listed from the main index, and their archived content folders will be deleted from disk.\n'
|
||||
f' ({len(file_counts)} data folders with {sum(file_counts)} archived files will be deleted!)'
|
||||
)
|
||||
else:
|
||||
@@ -488,7 +420,7 @@ def log_removal_started(links: List["Link"], yes: bool, delete: bool):
|
||||
|
||||
if not yes:
|
||||
print()
|
||||
print(f'[yellow3][?] Do you want to proceed with removing these {len(links)} links?[/]')
|
||||
print(f'[yellow3][?] Do you want to proceed with removing these {count} links?[/]')
|
||||
try:
|
||||
assert input(' y/[n]: ').lower() == 'y'
|
||||
except (KeyboardInterrupt, EOFError, AssertionError):
|
||||
@@ -504,6 +436,13 @@ def log_removal_finished(all_links: int, to_remove: int):
|
||||
print(f' Index now contains {all_links - to_remove} links.')
|
||||
|
||||
|
||||
### Search Indexing Stage
|
||||
|
||||
def log_index_started(url: str):
|
||||
print('[green][*] Indexing url: {} in the search index[/]'.format(url))
|
||||
print()
|
||||
|
||||
|
||||
### Helpers
|
||||
|
||||
@enforce_types
|
||||
@@ -542,10 +481,10 @@ def printable_filesize(num_bytes: Union[int, float]) -> str:
|
||||
|
||||
|
||||
@enforce_types
|
||||
def printable_folders(folders: Dict[str, Optional["Link"]], with_headers: bool=False) -> str:
|
||||
def printable_folders(folders: Dict[str, Optional["Snapshot"]], with_headers: bool=False) -> str:
|
||||
return '\n'.join(
|
||||
f'{folder} {link and link.url} "{link and link.title}"'
|
||||
for folder, link in folders.items()
|
||||
f'{folder} {snapshot and snapshot.url} "{snapshot and snapshot.title}"'
|
||||
for folder, snapshot in folders.items()
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -13,7 +13,6 @@ import pydantic # noqa
|
||||
import requests # noqa
|
||||
import subprocess # noqa
|
||||
import archivebox # noqa
|
||||
import abx # noqa
|
||||
from benedict import benedict # noqa
|
||||
from django.utils import timezone # noqa
|
||||
from datetime import datetime, timedelta # noqa
|
||||
@@ -21,8 +20,9 @@ from django.conf import settings # noqa
|
||||
|
||||
from archivebox import CONSTANTS # noqa
|
||||
from archivebox.cli import * # noqa
|
||||
from archivebox.config.configset import get_config
|
||||
|
||||
CONFIG = archivebox.pm.hook.get_FLAT_CONFIG()
|
||||
CONFIG = get_config()
|
||||
|
||||
if __name__ == '__main__':
|
||||
# load the rich extension for ipython for pretty printing
|
||||
@@ -35,7 +35,7 @@ if __name__ == '__main__':
|
||||
|
||||
|
||||
# print the welcome message
|
||||
prnt('[green]import re, os, sys, psutil, subprocess, reqiests, json, pydantic, benedict, django, abx[/]')
|
||||
prnt('[green]import re, os, sys, psutil, subprocess, requests, json, pydantic, benedict, django[/]')
|
||||
prnt('[yellow4]# ArchiveBox Imports[/]')
|
||||
prnt('[yellow4]import archivebox[/]')
|
||||
prnt('[yellow4]from archivebox.cli import *[/]')
|
||||
|
||||
@@ -345,19 +345,41 @@ class ExtendedEncoder(pyjson.JSONEncoder):
|
||||
|
||||
elif isinstance(obj, Exception):
|
||||
return '{}: {}'.format(obj.__class__.__name__, obj)
|
||||
|
||||
|
||||
elif isinstance(obj, Path):
|
||||
return str(obj)
|
||||
|
||||
|
||||
elif cls_name in ('dict_items', 'dict_keys', 'dict_values'):
|
||||
return tuple(obj)
|
||||
|
||||
return list(obj)
|
||||
|
||||
elif isinstance(obj, Callable):
|
||||
return str(obj)
|
||||
|
||||
# Try dict/list conversion as fallback
|
||||
try:
|
||||
return dict(obj)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
try:
|
||||
return list(obj)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
try:
|
||||
return str(obj)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return pyjson.JSONEncoder.default(self, obj)
|
||||
|
||||
|
||||
@enforce_types
def to_json(obj: Any, indent: Optional[int]=4, sort_keys: bool=True) -> str:
    """Serialize *obj* to a JSON string, handling extra types via ExtendedEncoder."""
    return pyjson.dumps(obj, cls=ExtendedEncoder, indent=indent, sort_keys=sort_keys)
||||
|
||||
|
||||
### URL PARSING TESTS / ASSERTIONS
|
||||
|
||||
# Check that plain text regex URL parsing works as expected
|
||||
@@ -452,3 +474,78 @@ _test_url_strs = {
|
||||
for url_str, num_urls in _test_url_strs.items():
|
||||
assert len(list(find_all_urls(url_str))) == num_urls, (
|
||||
f'{url_str} does not contain {num_urls} urls')
|
||||
|
||||
|
||||
### Chrome Helpers
|
||||
|
||||
def chrome_args(**options) -> List[str]:
    """
    Helper to build up a chrome shell command with arguments.

    Recognized options (each falls back to global config when omitted):
        CHROME_BINARY, CHROME_HEADLESS, CHROME_SANDBOX, CHECK_SSL_VALIDITY,
        CHROME_USER_AGENT, RESOLUTION, CHROME_TIMEOUT, CHROME_USER_DATA_DIR

    Returns:
        argv list starting with the chrome binary path, suitable for subprocess.

    Raises:
        Exception: If no chrome binary is configured or installed.
    """
    # NOTE: removed unused `import shutil` from the original implementation
    from archivebox.config import CHECK_SSL_VALIDITY, RESOLUTION, USER_AGENT, CHROME_BINARY

    chrome_binary = options.get('CHROME_BINARY', CHROME_BINARY)
    chrome_headless = options.get('CHROME_HEADLESS', True)
    chrome_sandbox = options.get('CHROME_SANDBOX', True)
    check_ssl = options.get('CHECK_SSL_VALIDITY', CHECK_SSL_VALIDITY)
    user_agent = options.get('CHROME_USER_AGENT', USER_AGENT)
    resolution = options.get('RESOLUTION', RESOLUTION)
    timeout = options.get('CHROME_TIMEOUT', 0)
    user_data_dir = options.get('CHROME_USER_DATA_DIR', None)

    if not chrome_binary:
        raise Exception('Could not find any CHROME_BINARY installed on your system')

    cmd_args = [chrome_binary]

    if chrome_headless:
        cmd_args += ("--headless=new",)

    if not chrome_sandbox:
        # running in docker or other sandboxed environment
        cmd_args += (
            "--no-sandbox",
            "--no-zygote",
            "--disable-dev-shm-usage",
            "--disable-software-rasterizer",
            "--run-all-compositor-stages-before-draw",
            "--hide-scrollbars",
            "--autoplay-policy=no-user-gesture-required",
            "--no-first-run",
            "--use-fake-ui-for-media-stream",
            "--use-fake-device-for-media-stream",
            "--disable-sync",
        )

    if not check_ssl:
        cmd_args += ('--disable-web-security', '--ignore-certificate-errors')

    if user_agent:
        cmd_args += (f'--user-agent={user_agent}',)

    if resolution:
        cmd_args += (f'--window-size={resolution}',)

    if timeout:
        # chrome expects milliseconds; config stores seconds
        cmd_args += (f'--timeout={timeout * 1000}',)

    if user_data_dir:
        cmd_args += (f'--user-data-dir={user_data_dir}',)

    return cmd_args
|
||||
|
||||
|
||||
def chrome_cleanup():
    """
    Cleans up any state or runtime files that chrome leaves behind when killed by
    a timeout or other error
    """
    import os
    from archivebox.config.permissions import IN_DOCKER

    if not IN_DOCKER:
        return

    # A stale profile lock left by an uncleanly-killed chrome blocks the next
    # launch from reusing the same profile directory, so remove it best-effort.
    singleton_lock = "/home/archivebox/.config/chromium/SingletonLock"
    if os.path.lexists(singleton_lock):
        try:
            os.remove(singleton_lock)
        except OSError:
            # lock may have vanished or be unremovable; ignore either way
            pass
|
||||
|
||||
Reference in New Issue
Block a user