mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-03 14:27:55 +10:00
Add MAX_URL_ATTEMPTS config option to stop retries after too many failures
Adds a new MAX_URL_ATTEMPTS configuration option (default: 50) that stops retrying ArchiveResult hooks for a snapshot once that many failures have been recorded. This prevents infinite retry loops for problematic URLs. When the limit is reached, any pending ArchiveResults for that snapshot are marked as SKIPPED with an explanatory message.
This commit is contained in:
@@ -123,6 +123,7 @@ class ArchivingConfig(BaseConfigSet):
|
||||
OVERWRITE: bool = Field(default=False)
|
||||
|
||||
TIMEOUT: int = Field(default=60)
|
||||
MAX_URL_ATTEMPTS: int = Field(default=50)
|
||||
|
||||
RESOLUTION: str = Field(default="1440,2000")
|
||||
CHECK_SSL_VALIDITY: bool = Field(default=True)
|
||||
|
||||
@@ -2594,8 +2594,32 @@ class ArchiveResultMachine(BaseStateMachine, strict_states=True):
|
||||
)
|
||||
|
||||
def can_start(self) -> bool:
    """Return whether this ArchiveResult is allowed to start running.

    The result may start only when its snapshot has a URL and the snapshot
    has recorded fewer than ``MAX_URL_ATTEMPTS`` failed ArchiveResults
    (counted across all plugins, not just this one).

    Side effect: when the failure limit has been reached, this
    ArchiveResult is marked ``SKIPPED`` with an explanatory ``output_str``,
    its ``retry_at`` is cleared, and it is saved before ``False`` is
    returned.

    Returns:
        True if the result may start, False otherwise.
    """
    snapshot = self.archiveresult.snapshot

    # Nothing to archive without a URL.
    if not snapshot.url:
        return False

    # Check if snapshot has exceeded MAX_URL_ATTEMPTS failed results.
    # Kept as a function-scope import, matching the original block.
    from archivebox.config.configset import get_config

    config = get_config(
        crawl=snapshot.crawl,
        snapshot=snapshot,
    )
    max_attempts = config.get('MAX_URL_ATTEMPTS', 50)

    # Count failed ArchiveResults for this snapshot (any plugin type).
    failed_count = snapshot.archiveresult_set.filter(
        status=ArchiveResult.StatusChoices.FAILED,
    ).count()

    if failed_count >= max_attempts:
        # Mark this result as skipped since we've hit the limit; clearing
        # retry_at stops it from being scheduled again.
        self.archiveresult.status = ArchiveResult.StatusChoices.SKIPPED
        self.archiveresult.output_str = f'Skipped: snapshot exceeded MAX_URL_ATTEMPTS ({max_attempts} failures)'
        self.archiveresult.retry_at = None
        self.archiveresult.save()
        return False

    return True
|
||||
|
||||
def is_succeeded(self) -> bool:
|
||||
"""Check if extractor plugin succeeded (status was set by run())."""
|
||||
|
||||
Reference in New Issue
Block a user