Add MAX_URL_ATTEMPTS config option to stop retries after too many failures

Adds a new MAX_URL_ATTEMPTS configuration option (default: 50) that stops
retrying ArchiveResult hooks for a snapshot once that many failures have
been recorded. This prevents infinite retry loops for problematic URLs.

When the limit is reached, any pending ArchiveResults for that snapshot
are marked as SKIPPED with an explanatory message.
This commit is contained in:
Claude
2025-12-29 20:20:50 +00:00
parent e38ddf3a25
commit 88d7906033
2 changed files with 27 additions and 2 deletions

View File

@@ -123,6 +123,7 @@ class ArchivingConfig(BaseConfigSet):
# Re-archive even if prior results exist for the snapshot — TODO confirm against extractor code
OVERWRITE: bool = Field(default=False)
# Per-extractor timeout; presumably seconds — verify against hook runner
TIMEOUT: int = Field(default=60)
# Max failed ArchiveResults allowed per snapshot before further hooks are
# skipped instead of retried (prevents infinite retry loops for bad URLs)
MAX_URL_ATTEMPTS: int = Field(default=50)
# "width,height" string; presumably the browser viewport for screenshots — TODO confirm
RESOLUTION: str = Field(default="1440,2000")
# Whether to verify TLS certificates when fetching URLs
CHECK_SSL_VALIDITY: bool = Field(default=True)

View File

@@ -2594,8 +2594,32 @@ class ArchiveResultMachine(BaseStateMachine, strict_states=True):
)
def can_start(self) -> bool:
    """Gate the START transition for this ArchiveResult.

    Returns:
        False if the parent snapshot has no URL (nothing to archive).
        False if the snapshot has already accumulated MAX_URL_ATTEMPTS
        failed ArchiveResults — in that case this result is marked SKIPPED
        with an explanatory message and its retry is cancelled, so the
        scheduler stops re-queueing work for a problematic URL.
        True otherwise.
    """
    # Guard clause: a snapshot without a URL can never be archived.
    if not self.archiveresult.snapshot.url:
        return False

    # Resolve effective config for this snapshot/crawl so per-crawl
    # overrides of MAX_URL_ATTEMPTS are respected.
    from archivebox.config.configset import get_config
    config = get_config(
        crawl=self.archiveresult.snapshot.crawl,
        snapshot=self.archiveresult.snapshot,
    )
    max_attempts = config.get('MAX_URL_ATTEMPTS', 50)

    # Count failed ArchiveResults for this snapshot (any plugin type).
    failed_count = self.archiveresult.snapshot.archiveresult_set.filter(
        status=ArchiveResult.StatusChoices.FAILED
    ).count()

    if failed_count >= max_attempts:
        # Retry ceiling reached: permanently skip this result.
        # retry_at=None presumably disables future scheduling — confirm
        # against the scheduler's retry_at handling.
        self.archiveresult.status = ArchiveResult.StatusChoices.SKIPPED
        self.archiveresult.output_str = f'Skipped: snapshot exceeded MAX_URL_ATTEMPTS ({max_attempts} failures)'
        self.archiveresult.retry_at = None
        self.archiveresult.save()
        return False

    return True
def is_succeeded(self) -> bool:
"""Check if extractor plugin succeeded (status was set by run())."""