more consistent crawl, snapshot, hook cleanup and Process tracking

This commit is contained in:
Nick Sweeting
2026-01-02 04:27:38 -08:00
parent dd77511026
commit 3da523fc74
4 changed files with 151 additions and 45 deletions

View File

@@ -1413,27 +1413,49 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
def cleanup(self):
    """
    Clean up background ArchiveResult hooks and empty results.

    Called by the state machine when entering the 'sealed' state.
    Uses Process records to kill background hooks, then deletes empty
    ArchiveResults.

    Steps, in order:
      1. Kill every RUNNING hook Process linked to one of this snapshot's
         ArchiveResults via ``Process.kill_tree()``.
      2. Remove leftover ``*.pid`` files under ``self.OUTPUT_DIR`` (only if
         that directory exists).
      3. Refresh every STARTED ArchiveResult with ``update_from_output()``.
      4. Delete ArchiveResults that are in a final state and whose
         ``output_files`` is an empty dict.
    """
    # Function-scope import, as in the original block
    # (presumably to avoid a circular import between apps -- confirm).
    from archivebox.machine.models import Process

    # Kill any background ArchiveResult hooks using Process records:
    # find all running hook Processes linked to this snapshot's ArchiveResults.
    # .distinct() guards against duplicate rows produced by the relation join.
    running_hooks = Process.objects.filter(
        archiveresult__snapshot=self,
        process_type=Process.TypeChoices.HOOK,
        status=Process.StatusChoices.RUNNING,
    ).distinct()

    for process in running_hooks:
        # Use Process.kill_tree() to gracefully kill parent + children
        killed_count = process.kill_tree(graceful_timeout=2.0)
        if killed_count > 0:
            print(f'[yellow]🔪 Killed {killed_count} process(es) for hook {process.pid}[/yellow]')

    # Clean up .pid files from the output directory.
    # The existence check guards ONLY this filesystem step: a missing output
    # directory must not short-circuit the process kill above or the
    # database cleanup below.
    if self.OUTPUT_DIR.exists():
        for pid_file in self.OUTPUT_DIR.glob('**/*.pid'):
            pid_file.unlink(missing_ok=True)

    # Update all STARTED ArchiveResults from the filesystem so that results
    # whose hooks were just killed get refreshed before the emptiness check.
    started_results = self.archiveresult_set.filter(
        status=ArchiveResult.StatusChoices.STARTED,
    )
    for ar in started_results:
        ar.update_from_output()

    # Delete ArchiveResults that produced no output files.
    # Only finished ones are eligible, so anything not yet in a final state is kept.
    empty_ars = self.archiveresult_set.filter(
        output_files={},
        status__in=ArchiveResult.FINAL_STATES,
    )
    deleted_count = empty_ars.count()
    if deleted_count > 0:
        empty_ars.delete()
        print(f'[yellow]🗑️ Deleted {deleted_count} empty ArchiveResults for {self.url}[/yellow]')
def has_running_background_hooks(self) -> bool:
"""
Check if any ArchiveResult background hooks are still running.