Improve scheduling, runtime paths, and API behavior

This commit is contained in:
Nick Sweeting
2026-03-15 18:31:56 -07:00
parent 7d42c6c8b5
commit 70c9358cf9
37 changed files with 1058 additions and 398 deletions

View File

@@ -131,6 +131,7 @@ def check_data_dir_permissions():
from archivebox import DATA_DIR
from archivebox.misc.logging import STDERR
from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, DEFAULT_PUID, DEFAULT_PGID, IS_ROOT, USER
from archivebox.config.paths import get_or_create_working_tmp_dir, get_or_create_working_lib_dir
data_dir_stat = Path(DATA_DIR).stat()
data_dir_uid, data_dir_gid = data_dir_stat.st_uid, data_dir_stat.st_gid
@@ -156,11 +157,21 @@ def check_data_dir_permissions():
from archivebox.config.common import STORAGE_CONFIG
try:
tmp_dir = get_or_create_working_tmp_dir(autofix=True, quiet=True) or STORAGE_CONFIG.TMP_DIR
except Exception:
tmp_dir = STORAGE_CONFIG.TMP_DIR
try:
lib_dir = get_or_create_working_lib_dir(autofix=True, quiet=True) or STORAGE_CONFIG.LIB_DIR
except Exception:
lib_dir = STORAGE_CONFIG.LIB_DIR
# Check /tmp dir permissions
check_tmp_dir(STORAGE_CONFIG.TMP_DIR, throw=False, must_exist=True)
check_tmp_dir(tmp_dir, throw=False, must_exist=True)
# Check /lib dir permissions
check_lib_dir(STORAGE_CONFIG.LIB_DIR, throw=False, must_exist=True)
check_lib_dir(lib_dir, throw=False, must_exist=True)
os.umask(0o777 - int(STORAGE_CONFIG.DIR_OUTPUT_PERMISSIONS, base=8)) # noqa: F821

View File

@@ -426,14 +426,15 @@ def log_removal_started(snapshots, yes: bool, delete: bool):
except (KeyboardInterrupt, EOFError, AssertionError):
raise SystemExit(0)
def log_removal_finished(all_links: int, to_remove: int):
if all_links == 0:
def log_removal_finished(remaining_links: int, removed_links: int):
if remaining_links == 0 and removed_links == 0:
print()
print('[red1][X] No matching links found.[/]')
else:
total_before = remaining_links + removed_links
print()
print(f'[red1][√] Removed {to_remove} out of {all_links} links from the archive index.[/]')
print(f' Index now contains {all_links - to_remove} links.')
print(f'[red1][√] Removed {removed_links} out of {total_before} links from the archive index.[/]')
print(f' Index now contains {remaining_links} links.')
### Search Indexing Stage

View File

@@ -10,7 +10,6 @@ from pathlib import Path
from typing import Optional, Union, Set, Tuple
from subprocess import _mswindows, PIPE, Popen, CalledProcessError, CompletedProcess, TimeoutExpired
from crontab import CronTab
from atomicwrites import atomic_write as lib_atomic_write
from archivebox.config.common import STORAGE_CONFIG
@@ -170,28 +169,6 @@ def get_dir_size(path: Union[str, Path], recursive: bool=True, pattern: Optional
pass
return num_bytes, num_dirs, num_files
# Comment string attached to every cron job that dedupe_cron_jobs() re-creates.
CRON_COMMENT = 'archivebox_schedule'
@enforce_types
def dedupe_cron_jobs(cron: CronTab) -> CronTab:
    """Collapse duplicate jobs so each (schedule, command) pair appears once.

    Every existing job is removed from ``cron`` and one fresh job is created
    per distinct ``(str(job.slices), str(job.command))`` pair. Re-created jobs
    are tagged with ``CRON_COMMENT`` and enabled, so a job's original comment
    or disabled state is not carried over.

    Unique pairs are re-added in the order they were first seen, so repeated
    runs do not reshuffle the crontab.

    Args:
        cron: The crontab to dedupe; it is modified in place.

    Returns:
        The same ``cron`` object that was passed in.
    """
    # Snapshot the jobs first, because they are removed from `cron` below.
    existing_jobs = list(cron)

    # dict keys keep insertion order (a set does not), which gives a stable,
    # first-seen ordering for the jobs that get written back.
    unique_jobs = dict.fromkeys(
        (str(job.slices), str(job.command)) for job in existing_jobs
    )

    for job in existing_jobs:
        cron.remove(job)

    for schedule, command in unique_jobs:
        job = cron.new(command=command, comment=CRON_COMMENT)
        job.setall(schedule)
        job.enable()

    return cron
class suppress_output(object):
"""
A context manager for doing a "deep suppression" of stdout and stderr in