mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-01-03 01:15:57 +10:00
Add MAX_URL_ATTEMPTS option to ArchiveBox (#1723)
…lures

Adds a new MAX_URL_ATTEMPTS configuration option (default: 50) that stops retrying ArchiveResult hooks for a snapshot once that many failures have been recorded. This prevents infinite retry loops for problematic URLs. When the limit is reached, any pending ArchiveResults for that snapshot are marked as SKIPPED with an explanatory message.

<!-- IMPORTANT: Do not submit PRs with only formatting / PEP8 / line length changes. -->

# Summary

<!--e.g. This PR fixes ABC or adds the ability to do XYZ...-->

# Related issues

<!-- e.g. #123 or Roadmap goal # https://github.com/pirate/ArchiveBox/wiki/Roadmap -->

# Changes these areas

- [ ] Bugfixes
- [ ] Feature behavior
- [ ] Command line interface
- [ ] Configuration options
- [ ] Internal architecture
- [ ] Snapshot data layout on disk
This commit is contained in:
@@ -123,6 +123,7 @@ class ArchivingConfig(BaseConfigSet):
|
||||
OVERWRITE: bool = Field(default=False)
|
||||
|
||||
TIMEOUT: int = Field(default=60)
|
||||
MAX_URL_ATTEMPTS: int = Field(default=50)
|
||||
|
||||
RESOLUTION: str = Field(default="1440,2000")
|
||||
CHECK_SSL_VALIDITY: bool = Field(default=True)
|
||||
|
||||
@@ -2580,8 +2580,32 @@ class ArchiveResultMachine(BaseStateMachine, strict_states=True):
|
||||
)
|
||||
|
||||
def can_start(self) -> bool:
|
||||
can_start = bool(self.archiveresult.snapshot.url)
|
||||
return can_start
|
||||
if not self.archiveresult.snapshot.url:
|
||||
return False
|
||||
|
||||
# Check if snapshot has exceeded MAX_URL_ATTEMPTS failed results
|
||||
from archivebox.config.configset import get_config
|
||||
|
||||
config = get_config(
|
||||
crawl=self.archiveresult.snapshot.crawl,
|
||||
snapshot=self.archiveresult.snapshot,
|
||||
)
|
||||
max_attempts = config.get('MAX_URL_ATTEMPTS', 50)
|
||||
|
||||
# Count failed ArchiveResults for this snapshot (any plugin type)
|
||||
failed_count = self.archiveresult.snapshot.archiveresult_set.filter(
|
||||
status=ArchiveResult.StatusChoices.FAILED
|
||||
).count()
|
||||
|
||||
if failed_count >= max_attempts:
|
||||
# Mark this result as skipped since we've hit the limit
|
||||
self.archiveresult.status = ArchiveResult.StatusChoices.SKIPPED
|
||||
self.archiveresult.output_str = f'Skipped: snapshot exceeded MAX_URL_ATTEMPTS ({max_attempts} failures)'
|
||||
self.archiveresult.retry_at = None
|
||||
self.archiveresult.save()
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
def is_succeeded(self) -> bool:
|
||||
"""Check if extractor plugin succeeded (status was set by run())."""
|
||||
|
||||
Reference in New Issue
Block a user