diff --git a/archivebox/config/common.py b/archivebox/config/common.py
index f1844219..b7f6b1e7 100644
--- a/archivebox/config/common.py
+++ b/archivebox/config/common.py
@@ -123,6 +123,7 @@ class ArchivingConfig(BaseConfigSet):
     OVERWRITE: bool = Field(default=False)
 
     TIMEOUT: int = Field(default=60)
+    MAX_URL_ATTEMPTS: int = Field(default=50)
 
     RESOLUTION: str = Field(default="1440,2000")
     CHECK_SSL_VALIDITY: bool = Field(default=True)
diff --git a/archivebox/core/models.py b/archivebox/core/models.py
index 9a160773..c30061c2 100755
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -2580,8 +2580,41 @@ class ArchiveResultMachine(BaseStateMachine, strict_states=True):
     )
 
     def can_start(self) -> bool:
-        can_start = bool(self.archiveresult.snapshot.url)
-        return can_start
+        """Return whether this ArchiveResult is allowed to start.
+
+        Returns False when the snapshot has no URL, or when the snapshot has
+        already accumulated MAX_URL_ATTEMPTS failed ArchiveResults (counted
+        across all plugins). In the second case this result is also marked
+        SKIPPED, its retry_at is cleared, and it is saved.
+        """
+        archiveresult = self.archiveresult
+        snapshot = archiveresult.snapshot
+        if not snapshot.url:
+            return False
+
+        # Local import (presumably avoids a circular import - TODO confirm).
+        from archivebox.config.configset import get_config
+
+        config = get_config(crawl=snapshot.crawl, snapshot=snapshot)
+        # Coerce to int so the comparison below cannot raise TypeError if the
+        # value arrives as a string (assumed possible for overrides - TODO confirm).
+        max_attempts = int(config.get('MAX_URL_ATTEMPTS', 50))
+
+        # Count failed ArchiveResults for this snapshot (any plugin type)
+        failed_count = snapshot.archiveresult_set.filter(
+            status=ArchiveResult.StatusChoices.FAILED,
+        ).count()
+        if failed_count < max_attempts:
+            return True
+
+        # Limit hit: mark this result as skipped and clear its retry_at.
+        # NOTE(review): this writes to the DB from inside a guard; confirm the
+        # state machine tolerates status being set outside a transition.
+        archiveresult.status = ArchiveResult.StatusChoices.SKIPPED
+        archiveresult.output_str = f'Skipped: snapshot exceeded MAX_URL_ATTEMPTS ({max_attempts} failures)'
+        archiveresult.retry_at = None
+        archiveresult.save()
+        return False
 
     def is_succeeded(self) -> bool:
         """Check if extractor plugin succeeded (status was set by run())."""