From 88d7906033874be6a644e9a754eaed7b8274b57a Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 29 Dec 2025 20:20:50 +0000 Subject: [PATCH] Add MAX_URL_ATTEMPTS config option to stop retries after too many failures Adds a new MAX_URL_ATTEMPTS configuration option (default: 50) that stops retrying ArchiveResult hooks for a snapshot once that many FAILED ArchiveResults (of any plugin type) have been recorded for it. This prevents infinite retry loops for problematic URLs. Once the limit is reached, each pending ArchiveResult for that snapshot is marked as SKIPPED with an explanatory message when its can_start check next runs. --- archivebox/config/common.py | 1 + archivebox/core/models.py | 28 ++++++++++++++++++++++++++-- 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/archivebox/config/common.py b/archivebox/config/common.py index f1844219..b7f6b1e7 100644 --- a/archivebox/config/common.py +++ b/archivebox/config/common.py @@ -123,6 +123,7 @@ class ArchivingConfig(BaseConfigSet): OVERWRITE: bool = Field(default=False) TIMEOUT: int = Field(default=60) + MAX_URL_ATTEMPTS: int = Field(default=50) RESOLUTION: str = Field(default="1440,2000") CHECK_SSL_VALIDITY: bool = Field(default=True) diff --git a/archivebox/core/models.py b/archivebox/core/models.py index 6c940126..90542bed 100755 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -2594,8 +2594,32 @@ class ArchiveResultMachine(BaseStateMachine, strict_states=True): ) def can_start(self) -> bool: - can_start = bool(self.archiveresult.snapshot.url) - return can_start + if not self.archiveresult.snapshot.url: + return False + + # Check if snapshot has exceeded MAX_URL_ATTEMPTS failed results + from archivebox.config.configset import get_config + + config = get_config( + crawl=self.archiveresult.snapshot.crawl, + snapshot=self.archiveresult.snapshot, + ) + max_attempts = config.get('MAX_URL_ATTEMPTS', 50) + + # Count failed ArchiveResults for this snapshot (any plugin type) + failed_count = self.archiveresult.snapshot.archiveresult_set.filter(
status=ArchiveResult.StatusChoices.FAILED + ).count() + + if failed_count >= max_attempts: + # Mark this result as skipped since we've hit the limit + self.archiveresult.status = ArchiveResult.StatusChoices.SKIPPED + self.archiveresult.output_str = f'Skipped: snapshot exceeded MAX_URL_ATTEMPTS ({max_attempts} failures)' + self.archiveresult.retry_at = None + self.archiveresult.save() + return False + + return True def is_succeeded(self) -> bool: """Check if extractor plugin succeeded (status was set by run())."""