Add MAX_URL_ATTEMPTS option to ArchiveBox (#1723)

…failures

Adds a new MAX_URL_ATTEMPTS configuration option (default: 50) that
stops retrying ArchiveResult hooks for a snapshot once that many
failures have been recorded. This prevents infinite retry loops for
problematic URLs.

When the limit is reached, any pending ArchiveResults for that snapshot
are marked as SKIPPED with an explanatory message.

<!-- IMPORTANT: Do not submit PRs with only formatting / PEP8 / line
length changes. -->

# Summary

<!--e.g. This PR fixes ABC or adds the ability to do XYZ...-->

# Related issues

<!-- e.g. #123 or Roadmap goal #
https://github.com/pirate/ArchiveBox/wiki/Roadmap -->

# Changes to these areas

- [ ] Bugfixes
- [ ] Feature behavior
- [ ] Command line interface
- [ ] Configuration options
- [ ] Internal architecture
- [ ] Snapshot data layout on disk
This commit is contained in:
Nick Sweeting
2025-12-29 13:32:11 -08:00
committed by GitHub
2 changed files with 27 additions and 2 deletions

View File

@@ -123,6 +123,7 @@ class ArchivingConfig(BaseConfigSet):
OVERWRITE: bool = Field(default=False)
TIMEOUT: int = Field(default=60)
MAX_URL_ATTEMPTS: int = Field(default=50)
RESOLUTION: str = Field(default="1440,2000")
CHECK_SSL_VALIDITY: bool = Field(default=True)

View File

@@ -2580,8 +2580,32 @@ class ArchiveResultMachine(BaseStateMachine, strict_states=True):
)
def can_start(self) -> bool:
    """Decide whether this ArchiveResult is allowed to start.

    Returns:
        False when the snapshot has no URL, or when the snapshot already
        has at least MAX_URL_ATTEMPTS failed ArchiveResults; True otherwise.

    Side effects:
        When the failure limit has been reached, this ArchiveResult is
        marked SKIPPED with an explanatory ``output_str``, its ``retry_at``
        is cleared, and it is saved before False is returned.
    """
    snapshot = self.archiveresult.snapshot
    if not snapshot.url:
        return False

    # Kept as a function-scope import, matching the original change
    # (presumably to avoid an import cycle at module load -- TODO confirm).
    from archivebox.config.configset import get_config

    config = get_config(crawl=snapshot.crawl, snapshot=snapshot)
    max_attempts = config.get('MAX_URL_ATTEMPTS', 50)

    # Failures are counted across every ArchiveResult of the snapshot,
    # regardless of which plugin produced them.
    failed_count = snapshot.archiveresult_set.filter(
        status=ArchiveResult.StatusChoices.FAILED,
    ).count()
    if failed_count < max_attempts:
        return True

    # Limit reached: park this result as skipped so it is not retried.
    self.archiveresult.status = ArchiveResult.StatusChoices.SKIPPED
    self.archiveresult.output_str = f'Skipped: snapshot exceeded MAX_URL_ATTEMPTS ({max_attempts} failures)'
    self.archiveresult.retry_at = None
    self.archiveresult.save()
    return False
def is_succeeded(self) -> bool:
"""Check if extractor plugin succeeded (status was set by run())."""