Files
ArchiveBox/archivebox/misc/system.py
2026-03-15 20:12:27 -07:00

217 lines
8.7 KiB
Python

# Explicitly pin the package this module belongs to.
# NOTE(review): presumably so package-relative imports keep resolving when the
# file is loaded outside the normal import path -- confirm before removing.
__package__ = 'archivebox.misc'
import os
import signal
import shutil
import sys
from json import dump
from pathlib import Path
from typing import Optional, Union, Tuple
from subprocess import PIPE, Popen, CalledProcessError, CompletedProcess, TimeoutExpired
from atomicwrites import atomic_write as lib_atomic_write
from archivebox.config.common import STORAGE_CONFIG
from archivebox.misc.util import enforce_types, ExtendedEncoder
# True when running on Windows (os.name == 'nt'). run() consults this after a
# timeout to decide how the child's captured output has to be collected.
IS_WINDOWS = os.name == 'nt'
def run(cmd, *args, input=None, capture_output=True, timeout=None, check=False, text=False, start_new_session=True, **kwargs):
    """Patched version of subprocess.run() that also cleans up forked grandchildren.

    Mostly copied from https://github.com/python/cpython/blob/master/Lib/subprocess.py
    with the following differences:

    * every element of an argv-style ``cmd`` is coerced with ``str()``; a plain
      ``str``/``bytes`` command is handed to ``Popen`` unchanged,
    * a command whose first element ends in ``.py`` is prefixed with
      ``sys.executable``,
    * ``capture_output`` and ``start_new_session`` default to ``True``,
    * on timeout the child is killed and reaped before ``TimeoutExpired`` is
      re-raised, so a blocked pipe cannot make the timeout ineffective,
    * once the child is done, SIGINT is sent to the child's process group to
      stop stragglers it forked -- but only if that group is not our own.

    Args:
        cmd: argv sequence (or a single command string) to execute.
        *args: extra positional arguments forwarded to ``Popen``.
        input: data sent to the child's stdin; mutually exclusive with ``stdin=``.
        capture_output: pipe stdout and stderr back to the caller; mutually
            exclusive with ``stdout=`` / ``stderr=``.
        timeout: seconds to wait in ``communicate()`` before killing the child.
        check: raise ``CalledProcessError`` on a non-zero exit status.
        text: forwarded to ``Popen`` (text-mode pipes).
        start_new_session: forwarded to ``Popen``.
        **kwargs: forwarded to ``Popen``.

    Returns:
        ``CompletedProcess(args, returncode, stdout, stderr)``.

    Raises:
        ValueError: on conflicting stdin/input or stdout/stderr/capture_output.
        TimeoutExpired: if the child did not finish within ``timeout``.
        CalledProcessError: if ``check`` is true and the exit status is non-zero.
    """
    # Coerce argv elements (e.g. Path objects) to str, but do not explode a
    # single command string into a list of characters.
    if not isinstance(cmd, (str, bytes)):
        cmd = [str(arg) for arg in cmd]

    if input is not None:
        if kwargs.get('stdin') is not None:
            raise ValueError('stdin and input arguments may not both be used.')
        kwargs['stdin'] = PIPE

    if capture_output:
        if ('stdout' in kwargs) or ('stderr' in kwargs):
            raise ValueError('stdout and stderr arguments may not be used with capture_output.')
        kwargs['stdout'] = PIPE
        kwargs['stderr'] = PIPE

    # Python scripts are launched with the interpreter running this process.
    if isinstance(cmd, list) and cmd and cmd[0].endswith('.py'):
        cmd = (sys.executable, *cmd)

    # Process groups are a POSIX concept: os.getpgid() is missing on Windows.
    getpgid = getattr(os, 'getpgid', None)
    pgid = None
    try:
        with Popen(cmd, *args, start_new_session=start_new_session, text=text, **kwargs) as process:
            if getpgid is not None:
                try:
                    child_pgid = getpgid(process.pid)
                except OSError:
                    child_pgid = None
                # Only remember the group if it belongs to the child alone.
                # With start_new_session=False the child lives in OUR group,
                # and signalling it would interrupt the caller as well.
                if child_pgid is not None and child_pgid != os.getpgrp():
                    pgid = child_pgid

            try:
                stdout, stderr = process.communicate(input, timeout=timeout)
            except TimeoutExpired as exc:
                process.kill()
                if IS_WINDOWS:
                    # Windows accumulates the output in a single blocking
                    # read() call run on child threads, with the timeout
                    # being done in a join() on those threads.  communicate()
                    # _after_ kill() is required to collect that and add it
                    # to the exception.
                    timed_out_stdout, timed_out_stderr = process.communicate()
                    exc.stdout = timed_out_stdout.encode() if isinstance(timed_out_stdout, str) else timed_out_stdout
                    exc.stderr = timed_out_stderr.encode() if isinstance(timed_out_stderr, str) else timed_out_stderr
                else:
                    # POSIX _communicate already populated the output so
                    # far into the TimeoutExpired exception.
                    process.wait()
                raise
            except BaseException:  # Including KeyboardInterrupt, communicate handled that.
                process.kill()
                # We don't call process.wait() as .__exit__ does that for us.
                raise

            retcode = process.poll()
            if check and retcode:
                raise CalledProcessError(retcode, process.args,
                                         output=stdout, stderr=stderr)
    finally:
        # Best-effort: interrupt any straggler subprocesses that were forked
        # from the main proc and are still alive in its process group.
        if pgid is not None:
            try:
                os.killpg(pgid, signal.SIGINT)
            except OSError:
                # e.g. ProcessLookupError: the group is already gone.
                pass

    return CompletedProcess(process.args, retcode or 0, stdout, stderr)
@enforce_types
def atomic_write(path: Union[Path, str], contents: Union[dict, str, bytes], overwrite: bool=True) -> None:
    """Safe atomic write to filesystem by writing to temp file + atomic rename.

    ``dict`` contents are serialized as indented, key-sorted JSON using
    ``ExtendedEncoder``; ``str`` is written as UTF-8 text and ``bytes`` in
    binary mode.  Afterwards the file is chmod-ed to
    ``STORAGE_CONFIG.OUTPUT_PERMISSIONS``.

    If the atomic write fails with ``OSError`` and
    ``STORAGE_CONFIG.ENFORCE_ATOMIC_WRITES`` is false, the write is retried
    with a plain (non-atomic) ``open()``.

    Raises:
        FileExistsError: ``overwrite`` is false and ``path`` already exists.
        SystemExit: the atomic write failed and atomic writes are enforced.
    """
    is_binary = isinstance(contents, bytes)
    mode = 'wb+' if is_binary else 'w'
    encoding = None if is_binary else 'utf-8'  # enforce utf-8 on all text writes

    def write_contents(f) -> None:
        # shared by the atomic attempt and the non-atomic fallback
        if isinstance(contents, dict):
            dump(contents, f, indent=4, sort_keys=True, cls=ExtendedEncoder)
        elif isinstance(contents, (bytes, str)):
            f.write(contents)

    try:
        with lib_atomic_write(path, mode=mode, overwrite=overwrite, encoding=encoding) as f:
            write_contents(f)
    except OSError as e:
        if isinstance(e, FileExistsError) and not overwrite:
            # The caller asked us not to clobber an existing file: that is a
            # legitimate refusal, not an FSYNC problem, and must neither be
            # reported as one nor be "fixed" by the overwriting fallback below.
            raise

        if STORAGE_CONFIG.ENFORCE_ATOMIC_WRITES:
            print(f"[X] OSError: Failed to write {path} with fcntl.F_FULLFSYNC. ({e})")
            print("    You can store the archive/ subfolder on a hard drive or network share that doesn't support support syncronous writes,")
            print("    but the main folder containing the index.sqlite3 and ArchiveBox.conf files must be on a filesystem that supports FSYNC.")
            raise SystemExit(1)

        # retry the write without forcing FSYNC (aka atomic mode); exclusive
        # creation ('x') keeps the overwrite=False guarantee in the fallback
        fallback_mode = mode if overwrite else mode.replace('w', 'x', 1)
        with open(path, mode=fallback_mode, encoding=encoding) as f:
            write_contents(f)

    # set file permissions
    os.chmod(path, int(STORAGE_CONFIG.OUTPUT_PERMISSIONS, base=8))
@enforce_types
def chmod_file(path: str, cwd: str='') -> None:
    """Apply the configured output permissions to <cwd>/<path>.

    A plain file gets ``STORAGE_CONFIG.OUTPUT_PERMISSIONS``.  For a directory,
    everything *below* it is walked recursively: sub-directories get
    ``STORAGE_CONFIG.DIR_OUTPUT_PERMISSIONS`` and all other entries get
    ``STORAGE_CONFIG.OUTPUT_PERMISSIONS``; the directory itself is left as is.

    Args:
        path: file or directory to chmod, resolved relative to ``cwd``.
        cwd: base directory; defaults to the process's working directory.

    Raises:
        Exception: if <cwd>/<path> is not readable (e.g. does not exist).
    """
    root = Path(cwd or os.getcwd()) / path
    if not os.access(root, os.R_OK):
        raise Exception('Failed to chmod: {} does not exist (did the previous step fail?)'.format(path))

    file_mode = int(STORAGE_CONFIG.OUTPUT_PERMISSIONS, base=8)

    if not root.is_dir():
        # path is just a plain file
        os.chmod(root, file_mode)
        return

    # directories need execute permissions to be able to list contents
    dir_mode = int(STORAGE_CONFIG.DIR_OUTPUT_PERMISSIONS, base=8)

    # Walk the resolved root (not the bare `path`), otherwise a relative path
    # combined with an explicit cwd would be globbed in the wrong directory.
    for subpath in root.glob('**/*'):
        os.chmod(subpath, dir_mode if subpath.is_dir() else file_mode)
@enforce_types
def copy_and_overwrite(from_path: Union[str, Path], to_path: Union[str, Path]):
    """Copy a file or directory tree to to_path, replacing what is there.

    Directories are copied by removing the destination tree (errors ignored)
    and copying the source tree in its place.  Anything else is read fully
    into memory as bytes and written to the destination via atomic_write().
    The source must be readable.
    """
    assert os.access(from_path, os.R_OK)

    source = Path(from_path)
    if not source.is_dir():
        # single file: load the bytes, then let atomic_write() place them
        atomic_write(to_path, source.read_bytes())
        return

    # directory: clear out the old destination before copying the new tree
    shutil.rmtree(to_path, ignore_errors=True)
    shutil.copytree(from_path, to_path)
@enforce_types
def get_dir_size(path: Union[str, Path], recursive: bool=True, pattern: Optional[str]=None) -> Tuple[int, int, int]:
    """Get the total disk size of a given directory.

    Args:
        path: directory to scan.
        recursive: when false, sub-directories are skipped entirely (neither
            counted nor descended into).
        pattern: if given, only direct entries whose path contains this
            substring are considered; a matching sub-directory is then
            counted in full (the filter is not re-applied below it).

    Returns:
        ``(num_bytes, num_dirs, num_files)``.  Sizes are taken from
        ``stat(follow_symlinks=False)``, so a symlink contributes the size of
        the link itself and symlinked directories are counted as files.
        If an ``OSError`` occurs while scanning, whatever was counted up to
        that point is returned.
    """
    num_bytes, num_dirs, num_files = 0, 0, 0
    try:
        # `with` guarantees the scandir handle is released even when an
        # OSError interrupts the loop before the iterator is exhausted.
        with os.scandir(path) as entries:
            for entry in entries:
                if (pattern is not None) and (pattern not in entry.path):
                    continue
                if entry.is_dir(follow_symlinks=False):
                    if not recursive:
                        continue
                    num_dirs += 1
                    bytes_inside, dirs_inside, files_inside = get_dir_size(entry.path)
                    num_bytes += bytes_inside
                    num_dirs += dirs_inside
                    num_files += files_inside
                else:
                    num_bytes += entry.stat(follow_symlinks=False).st_size
                    num_files += 1
    except OSError:
        # e.g. FileNameTooLong or other error while trying to read dir
        pass
    return num_bytes, num_dirs, num_files
class suppress_output(object):
    """
    A context manager for doing a "deep suppression" of stdout and stderr in
    Python, i.e. will suppress all print, even if the print originates in a
    compiled C/Fortran sub-function.

    It works at the file-descriptor level: while active, descriptors 1 and/or
    2 point at os.devnull; on exit the original descriptors are restored and
    every descriptor opened by the manager is closed again.

    This will not suppress raised exceptions, since exceptions are printed
    to stderr just before a script exits, and after the context manager has
    exited.

        with suppress_output():
            rogue_function()

    Args:
        stdout: redirect file descriptor 1 while inside the block.
        stderr: redirect file descriptor 2 while inside the block.
    """

    def __init__(self, stdout=True, stderr=True):
        # Only remember which streams to silence; descriptors are acquired in
        # __enter__ so an instance that is never entered leaks nothing and a
        # single instance can be entered more than once.
        self.stdout, self.stderr = stdout, stderr

    @staticmethod
    def _flush(stream):
        """Best-effort flush of a Python-level stream (may be None or closed)."""
        try:
            stream.flush()
        except (AttributeError, ValueError, OSError):
            pass

    def __enter__(self):
        if self.stdout:
            # push already-buffered text to the real stdout before swapping
            self._flush(sys.stdout)
            self.null_stdout = os.open(os.devnull, os.O_RDWR)
            self.real_stdout = os.dup(1)    # save the actual stdout (1)
            os.dup2(self.null_stdout, 1)
        if self.stderr:
            self._flush(sys.stderr)
            self.null_stderr = os.open(os.devnull, os.O_RDWR)
            self.real_stderr = os.dup(2)    # save the actual stderr (2)
            os.dup2(self.null_stderr, 2)
        return self

    def __exit__(self, *_):
        # Re-assign the real stdout/stderr back to (1) and (2), then close
        # BOTH helper descriptors of each stream (the saved duplicate too).
        if self.stdout:
            # text buffered inside the block must land in devnull, not leak
            # out after the real stdout is restored
            self._flush(sys.stdout)
            os.dup2(self.real_stdout, 1)
            os.close(self.real_stdout)
            os.close(self.null_stdout)
        if self.stderr:
            self._flush(sys.stderr)
            os.dup2(self.real_stderr, 2)
            os.close(self.real_stderr)
            os.close(self.null_stderr)