__package__ = 'archivebox.core'

from typing import Optional, Dict, Iterable, Any, List, TYPE_CHECKING
from uuid import uuid7
from datetime import datetime, timedelta
from django_stubs_ext.db.models import TypedModelMeta
import os
import json
from pathlib import Path

from django.db import models
from django.db.models import QuerySet, Value, Case, When, IntegerField
from django.utils.functional import cached_property
from django.utils.text import slugify
from django.utils import timezone
from django.core.cache import cache
from django.urls import reverse, reverse_lazy
from django.contrib import admin
from django.conf import settings

from archivebox.config import CONSTANTS
from archivebox.misc.system import get_dir_size, atomic_write
from archivebox.misc.util import parse_date, base_url, domain as url_domain, to_json, ts_to_date_str, urlencode, htmlencode, urldecode
from archivebox.misc.hashing import get_dir_info
from archivebox.hooks import ARCHIVE_METHODS_INDEXING_PRECEDENCE
from archivebox.base_models.models import (
    ModelWithUUID,
    ModelWithSerializers,
    ModelWithOutputDir,
    ModelWithConfig,
    ModelWithNotes,
    ModelWithHealthStats,
    get_or_create_system_user_pk,
)
from workers.models import ModelWithStateMachine
from workers.tasks import bg_archive_snapshot
from crawls.models import Crawl
from machine.models import NetworkInterface


class Tag(ModelWithSerializers):
    id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False, related_name='tag_set')
    created_at = models.DateTimeField(default=timezone.now, db_index=True)
    modified_at = models.DateTimeField(auto_now=True)

    name = models.CharField(unique=True, blank=False, max_length=100)
    slug = models.SlugField(unique=True, blank=False, max_length=100, editable=False)

    snapshot_set: models.Manager['Snapshot']

    class Meta(TypedModelMeta):
        verbose_name = "Tag"
        verbose_name_plural = "Tags"

    def __str__(self):
        return self.name

    def save(self, *args, **kwargs):
        if self._state.adding:
            self.slug = slugify(self.name)
            existing = set(Tag.objects.filter(slug__startswith=self.slug).values_list("slug", flat=True))
            i = None
            while True:
                slug = f"{slugify(self.name)}_{i}" if i else slugify(self.name)
                if slug not in existing:
                    self.slug = slug
                    break
                i = (i or 0) + 1
        super().save(*args, **kwargs)

    @property
    def api_url(self) -> str:
        return reverse_lazy('api-1:get_tag', args=[self.id])
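
# Example (illustrative): Tag.save() de-duplicates slugs, so two names that
# slugify to the same value get numeric suffixes instead of violating the
# unique slug constraint:
#
#   Tag.objects.create(name='To Read')     # slug == 'to-read'
#   Tag.objects.create(name='To   Read!')  # slug == 'to-read_1'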


class SnapshotTag(models.Model):
    id = models.AutoField(primary_key=True)

    snapshot = models.ForeignKey('Snapshot', db_column='snapshot_id', on_delete=models.CASCADE, to_field='id')
    tag = models.ForeignKey(Tag, db_column='tag_id', on_delete=models.CASCADE, to_field='id')

    class Meta:
        db_table = 'core_snapshot_tags'
        unique_together = [('snapshot', 'tag')]


class SnapshotManager(models.Manager):
    def filter(self, *args, **kwargs):
        domain = kwargs.pop('domain', None)
        qs = super().filter(*args, **kwargs)
        if domain:
            qs = qs.filter(url__icontains=f'://{domain}')
        return qs

    def get_queryset(self):
        return super().get_queryset().prefetch_related('tags', 'archiveresult_set')

    # =========================================================================
    # Filtering Methods
    # =========================================================================

    FILTER_TYPES = {
        'exact': lambda pattern: models.Q(url=pattern),
        'substring': lambda pattern: models.Q(url__icontains=pattern),
        'regex': lambda pattern: models.Q(url__iregex=pattern),
        'domain': lambda pattern: (
            models.Q(url__istartswith=f"http://{pattern}")
            | models.Q(url__istartswith=f"https://{pattern}")
            | models.Q(url__istartswith=f"ftp://{pattern}")
        ),
        'tag': lambda pattern: models.Q(tags__name=pattern),
        'timestamp': lambda pattern: models.Q(timestamp=pattern),
    }

    def filter_by_patterns(self, patterns: List[str], filter_type: str = 'exact') -> QuerySet:
        """Filter snapshots by URL patterns using specified filter type"""
        from archivebox.misc.logging import stderr

        q_filter = models.Q()
        for pattern in patterns:
            try:
                q_filter = q_filter | self.FILTER_TYPES[filter_type](pattern)
            except KeyError:
                stderr()
                stderr(f'[X] Got invalid pattern for --filter-type={filter_type}:', color='red')
                stderr(f'    {pattern}')
                raise SystemExit(2)
        return self.filter(q_filter)
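
    # Usage sketch (illustrative, assumes some Snapshots already exist): these
    # filters mirror the CLI's --filter-type options, plus the custom `domain`
    # kwarg handled by SnapshotManager.filter():
    #
    #   Snapshot.objects.filter_by_patterns(['example.com'], filter_type='domain')
    #   Snapshot.objects.filter_by_patterns([r'.*\.pdf$'], filter_type='regex')
    #   Snapshot.objects.filter(domain='example.com')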

    def search(self, patterns: List[str]) -> QuerySet:
        """Search snapshots using the configured search backend"""
        from archivebox.config.common import SEARCH_BACKEND_CONFIG
        from archivebox.search import query_search_index
        from archivebox.misc.logging import stderr

        if not SEARCH_BACKEND_CONFIG.USE_SEARCHING_BACKEND:
            stderr()
            stderr('[X] The search backend is not enabled, set config.USE_SEARCHING_BACKEND = True', color='red')
            raise SystemExit(2)

        qsearch = self.none()
        for pattern in patterns:
            try:
                qsearch |= query_search_index(pattern)
            except:
                raise SystemExit(2)

        return self.all() & qsearch

    # =========================================================================
    # Export Methods
    # =========================================================================

    def to_json(self, with_headers: bool = False) -> str:
        """Generate JSON index from snapshots"""
        import sys
        from datetime import datetime, timezone as tz
        from archivebox.config import VERSION
        from archivebox.config.common import SERVER_CONFIG

        MAIN_INDEX_HEADER = {
            'info': 'This is an index of site data archived by ArchiveBox: The self-hosted web archive.',
            'schema': 'archivebox.index.json',
            'copyright_info': SERVER_CONFIG.FOOTER_INFO,
            'meta': {
                'project': 'ArchiveBox',
                'version': VERSION,
                'git_sha': VERSION,
                'website': 'https://ArchiveBox.io',
                'docs': 'https://github.com/ArchiveBox/ArchiveBox/wiki',
                'source': 'https://github.com/ArchiveBox/ArchiveBox',
                'issues': 'https://github.com/ArchiveBox/ArchiveBox/issues',
                'dependencies': {},
            },
        } if with_headers else {}

        snapshot_dicts = [s.to_dict(extended=True) for s in self.iterator(chunk_size=500)]

        if with_headers:
            output = {
                **MAIN_INDEX_HEADER,
                'num_links': len(snapshot_dicts),
                'updated': datetime.now(tz.utc),
                'last_run_cmd': sys.argv,
                'links': snapshot_dicts,
            }
        else:
            output = snapshot_dicts
        return to_json(output, indent=4, sort_keys=True)

    def to_csv(self, cols: Optional[List[str]] = None, header: bool = True, separator: str = ',', ljust: int = 0) -> str:
        """Generate CSV output from snapshots"""
        cols = cols or ['timestamp', 'is_archived', 'url']
        header_str = separator.join(col.ljust(ljust) for col in cols) if header else ''
        row_strs = (s.to_csv(cols=cols, ljust=ljust, separator=separator) for s in self.iterator(chunk_size=500))
        return '\n'.join((header_str, *row_strs))

    def to_html(self, with_headers: bool = True) -> str:
        """Generate main index HTML from snapshots"""
        from datetime import datetime, timezone as tz
        from django.template.loader import render_to_string
        from archivebox.config import VERSION
        from archivebox.config.common import SERVER_CONFIG
        from archivebox.config.version import get_COMMIT_HASH

        template = 'static_index.html' if with_headers else 'minimal_index.html'
        snapshot_list = list(self.iterator(chunk_size=500))

        return render_to_string(template, {
            'version': VERSION,
            'git_sha': get_COMMIT_HASH() or VERSION,
            'num_links': str(len(snapshot_list)),
            'date_updated': datetime.now(tz.utc).strftime('%Y-%m-%d'),
            'time_updated': datetime.now(tz.utc).strftime('%Y-%m-%d %H:%M'),
            'links': snapshot_list,
            'FOOTER_INFO': SERVER_CONFIG.FOOTER_INFO,
        })
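
    # Usage sketch (illustrative): the exporters above render the main index in
    # CSV/JSON/HTML form:
    #
    #   Snapshot.objects.to_csv(cols=['timestamp', 'url'], header=True)
    #   Snapshot.objects.to_json(with_headers=True)   # includes the MAIN_INDEX_HEADER block
    #   Snapshot.objects.to_html(with_headers=False)  # renders minimal_index.html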

    # =========================================================================
    # Import Methods
    # =========================================================================

    def create_or_update_from_dict(self, link_dict: Dict[str, Any], created_by_id: Optional[int] = None) -> 'Snapshot':
        """Create or update a Snapshot from a SnapshotDict (parser output)"""
        import re
        from archivebox.config.common import GENERAL_CONFIG

        url = link_dict['url']
        timestamp = link_dict.get('timestamp')
        title = link_dict.get('title')
        tags_str = link_dict.get('tags')

        tag_list = []
        if tags_str:
            tag_list = list(dict.fromkeys(
                tag.strip()
                for tag in re.split(GENERAL_CONFIG.TAG_SEPARATOR_PATTERN, tags_str)
                if tag.strip()
            ))

        try:
            snapshot = self.get(url=url)
            if title and (not snapshot.title or len(title) > len(snapshot.title or '')):
                snapshot.title = title
                snapshot.save(update_fields=['title', 'modified_at'])
        except self.model.DoesNotExist:
            if timestamp:
                while self.filter(timestamp=timestamp).exists():
                    timestamp = str(float(timestamp) + 1.0)
            snapshot = self.create(
                url=url,
                timestamp=timestamp,
                title=title,
                created_by_id=created_by_id or get_or_create_system_user_pk(),
            )

        if tag_list:
            existing_tags = set(snapshot.tags.values_list('name', flat=True))
            new_tags = set(tag_list) | existing_tags
            snapshot.save_tags(new_tags)

        return snapshot

    def create_from_dicts(self, link_dicts: List[Dict[str, Any]], created_by_id: Optional[int] = None) -> List['Snapshot']:
        """Create or update multiple Snapshots from a list of SnapshotDicts"""
        return [self.create_or_update_from_dict(d, created_by_id=created_by_id) for d in link_dicts]

    def remove(self, atomic: bool = False) -> tuple:
        """Remove snapshots from the database"""
        from django.db import transaction
        if atomic:
            with transaction.atomic():
                return self.delete()
        return self.delete()
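
# Example (illustrative) of the parser-output dict accepted by
# create_or_update_from_dict() / create_from_dicts(); only 'url' is required, and
# the tags string is split on GENERAL_CONFIG.TAG_SEPARATOR_PATTERN (',' assumed here):
#
#   Snapshot.objects.create_or_update_from_dict({
#       'url': 'https://example.com/article',
#       'timestamp': '1712345678.0',
#       'title': 'Example Article',
#       'tags': 'news,longread',
#   })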
"archive_org": exists = extractor in extractor_outputs and extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output output += '{} '.format(canon.get("archive_org", ""), str(exists), "archive_org", icons.get("archive_org", "?")) return format_html('{}', mark_safe(output)) cache_result = cache.get(cache_key) if cache_result: return cache_result fresh_result = calc_icons() cache.set(cache_key, fresh_result, timeout=60 * 60 * 24) return fresh_result @property def api_url(self) -> str: return reverse_lazy('api-1:get_snapshot', args=[self.id]) def get_absolute_url(self): return f'/{self.archive_path}' @cached_property def domain(self) -> str: return url_domain(self.url) @cached_property def output_dir(self): """The filesystem path to the snapshot's output directory.""" return str(CONSTANTS.ARCHIVE_DIR / self.timestamp) @cached_property def archive_path(self): return f'{CONSTANTS.ARCHIVE_DIR_NAME}/{self.timestamp}' @cached_property def archive_size(self): try: return get_dir_size(self.output_dir)[0] except Exception: return 0 def save_tags(self, tags: Iterable[str] = ()) -> None: tags_id = [Tag.objects.get_or_create(name=tag)[0].pk for tag in tags if tag.strip()] self.tags.clear() self.tags.add(*tags_id) def pending_archiveresults(self) -> QuerySet['ArchiveResult']: return self.archiveresult_set.exclude(status__in=ArchiveResult.FINAL_OR_ACTIVE_STATES) def run(self) -> list['ArchiveResult']: """ Execute this Snapshot by creating ArchiveResults for all enabled extractors. Called by the state machine when entering the 'started' state. """ return self.create_pending_archiveresults() def create_pending_archiveresults(self) -> list['ArchiveResult']: """ Create ArchiveResult records for all enabled extractors. Uses the hooks system to discover available extractors from: - archivebox/plugins/*/on_Snapshot__*.{py,sh,js} - data/plugins/*/on_Snapshot__*.{py,sh,js} """ from archivebox.hooks import get_enabled_extractors extractors = get_enabled_extractors() archiveresults = [] for extractor in extractors: if ArchiveResult.objects.filter(snapshot=self, extractor=extractor).exists(): continue archiveresult, _ = ArchiveResult.objects.get_or_create( snapshot=self, extractor=extractor, defaults={ 'status': ArchiveResult.INITIAL_STATE, 'retry_at': timezone.now(), 'created_by_id': self.created_by_id, }, ) if archiveresult.status == ArchiveResult.INITIAL_STATE: archiveresults.append(archiveresult) return archiveresults def retry_failed_archiveresults(self, retry_at: Optional['timezone.datetime'] = None) -> int: """ Reset failed/skipped ArchiveResults to queued for retry. This enables seamless retry of the entire extraction pipeline: - Resets FAILED and SKIPPED results to QUEUED - Sets retry_at so workers pick them up - Extractors run in order (numeric prefix) - Each extractor checks its dependencies at runtime Dependency handling (e.g., chrome_session → screenshot): - Extractors check if required outputs exist before running - If dependency output missing → extractor returns 'skipped' - On retry, if dependency now succeeds → dependent can run Returns count of ArchiveResults reset. 
""" retry_at = retry_at or timezone.now() count = self.archiveresult_set.filter( status__in=[ ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED, ] ).update( status=ArchiveResult.StatusChoices.QUEUED, retry_at=retry_at, output=None, start_ts=None, end_ts=None, ) # Also reset the snapshot so it gets re-checked if count > 0: self.status = self.StatusChoices.STARTED self.retry_at = retry_at self.save(update_fields=['status', 'retry_at', 'modified_at']) return count # ========================================================================= # URL Helper Properties (migrated from Link schema) # ========================================================================= @cached_property def url_hash(self) -> str: from hashlib import sha256 return sha256(self.url.encode()).hexdigest()[:8] @cached_property def scheme(self) -> str: return self.url.split('://')[0] @cached_property def path(self) -> str: parts = self.url.split('://', 1) return '/' + parts[1].split('/', 1)[1] if len(parts) > 1 and '/' in parts[1] else '/' @cached_property def basename(self) -> str: return self.path.split('/')[-1] @cached_property def extension(self) -> str: basename = self.basename return basename.split('.')[-1] if '.' in basename else '' @cached_property def base_url(self) -> str: return f'{self.scheme}://{self.domain}' @cached_property def is_static(self) -> bool: static_extensions = {'.pdf', '.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg', '.mp4', '.mp3', '.wav', '.webm'} return any(self.url.lower().endswith(ext) for ext in static_extensions) @cached_property def is_archived(self) -> bool: output_paths = ( self.domain, 'output.html', 'output.pdf', 'screenshot.png', 'singlefile.html', 'readability/content.html', 'mercury/content.html', 'htmltotext.txt', 'media', 'git', ) return any((Path(self.output_dir) / path).exists() for path in output_paths) # ========================================================================= # Date/Time Properties (migrated from Link schema) # ========================================================================= @cached_property def bookmarked_date(self) -> Optional[str]: max_ts = (timezone.now() + timedelta(days=30)).timestamp() if self.timestamp and self.timestamp.replace('.', '').isdigit(): if 0 < float(self.timestamp) < max_ts: return self._ts_to_date_str(datetime.fromtimestamp(float(self.timestamp))) return str(self.timestamp) return None @cached_property def downloaded_datestr(self) -> Optional[str]: return self._ts_to_date_str(self.downloaded_at) if self.downloaded_at else None @cached_property def archive_dates(self) -> List[datetime]: return [ result.start_ts for result in self.archiveresult_set.all() if result.start_ts ] @cached_property def oldest_archive_date(self) -> Optional[datetime]: dates = self.archive_dates return min(dates) if dates else None @cached_property def newest_archive_date(self) -> Optional[datetime]: dates = self.archive_dates return max(dates) if dates else None @cached_property def num_outputs(self) -> int: return self.archiveresult_set.filter(status='succeeded').count() @cached_property def num_failures(self) -> int: return self.archiveresult_set.filter(status='failed').count() # ========================================================================= # Output Path Methods (migrated from Link schema) # ========================================================================= def canonical_outputs(self) -> Dict[str, Optional[str]]: """Predict the expected output paths that should be present after archiving""" FAVICON_PROVIDER = 

    # =========================================================================
    # URL Helper Properties (migrated from Link schema)
    # =========================================================================

    @cached_property
    def url_hash(self) -> str:
        from hashlib import sha256
        return sha256(self.url.encode()).hexdigest()[:8]

    @cached_property
    def scheme(self) -> str:
        return self.url.split('://')[0]

    @cached_property
    def path(self) -> str:
        parts = self.url.split('://', 1)
        return '/' + parts[1].split('/', 1)[1] if len(parts) > 1 and '/' in parts[1] else '/'

    @cached_property
    def basename(self) -> str:
        return self.path.split('/')[-1]

    @cached_property
    def extension(self) -> str:
        basename = self.basename
        return basename.split('.')[-1] if '.' in basename else ''

    @cached_property
    def base_url(self) -> str:
        return f'{self.scheme}://{self.domain}'

    @cached_property
    def is_static(self) -> bool:
        static_extensions = {'.pdf', '.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg', '.mp4', '.mp3', '.wav', '.webm'}
        return any(self.url.lower().endswith(ext) for ext in static_extensions)

    @cached_property
    def is_archived(self) -> bool:
        output_paths = (
            self.domain,
            'output.html',
            'output.pdf',
            'screenshot.png',
            'singlefile.html',
            'readability/content.html',
            'mercury/content.html',
            'htmltotext.txt',
            'media',
            'git',
        )
        return any((Path(self.output_dir) / path).exists() for path in output_paths)

    # =========================================================================
    # Date/Time Properties (migrated from Link schema)
    # =========================================================================

    @cached_property
    def bookmarked_date(self) -> Optional[str]:
        max_ts = (timezone.now() + timedelta(days=30)).timestamp()
        if self.timestamp and self.timestamp.replace('.', '').isdigit():
            if 0 < float(self.timestamp) < max_ts:
                return self._ts_to_date_str(datetime.fromtimestamp(float(self.timestamp)))
            return str(self.timestamp)
        return None

    @cached_property
    def downloaded_datestr(self) -> Optional[str]:
        return self._ts_to_date_str(self.downloaded_at) if self.downloaded_at else None

    @cached_property
    def archive_dates(self) -> List[datetime]:
        return [
            result.start_ts
            for result in self.archiveresult_set.all()
            if result.start_ts
        ]

    @cached_property
    def oldest_archive_date(self) -> Optional[datetime]:
        dates = self.archive_dates
        return min(dates) if dates else None

    @cached_property
    def newest_archive_date(self) -> Optional[datetime]:
        dates = self.archive_dates
        return max(dates) if dates else None

    @cached_property
    def num_outputs(self) -> int:
        return self.archiveresult_set.filter(status='succeeded').count()

    @cached_property
    def num_failures(self) -> int:
        return self.archiveresult_set.filter(status='failed').count()

    # =========================================================================
    # Output Path Methods (migrated from Link schema)
    # =========================================================================

    def canonical_outputs(self) -> Dict[str, Optional[str]]:
        """Predict the expected output paths that should be present after archiving"""
        FAVICON_PROVIDER = 'https://www.google.com/s2/favicons?domain={}'

        canonical = {
            'index_path': 'index.html',
            'favicon_path': 'favicon.ico',
            'google_favicon_path': FAVICON_PROVIDER.format(self.domain),
            'wget_path': f'warc/{self.timestamp}',
            'warc_path': 'warc/',
            'singlefile_path': 'singlefile.html',
            'readability_path': 'readability/content.html',
            'mercury_path': 'mercury/content.html',
            'htmltotext_path': 'htmltotext.txt',
            'pdf_path': 'output.pdf',
            'screenshot_path': 'screenshot.png',
            'dom_path': 'output.html',
            'archive_org_path': f'https://web.archive.org/web/{self.base_url}',
            'git_path': 'git/',
            'media_path': 'media/',
            'headers_path': 'headers.json',
        }

        if self.is_static:
            static_path = f'warc/{self.timestamp}'
            canonical.update({
                'title': self.basename,
                'wget_path': static_path,
                'pdf_path': static_path,
                'screenshot_path': static_path,
                'dom_path': static_path,
                'singlefile_path': static_path,
                'readability_path': static_path,
                'mercury_path': static_path,
                'htmltotext_path': static_path,
            })

        return canonical

    def latest_outputs(self, status: Optional[str] = None) -> Dict[str, Any]:
        """Get the latest output that each archive method produced"""
        from archivebox.hooks import get_extractors

        latest: Dict[str, Any] = {}
        for archive_method in get_extractors():
            results = self.archiveresult_set.filter(extractor=archive_method)
            if status is not None:
                results = results.filter(status=status)
            results = results.filter(output__isnull=False).order_by('-start_ts')
            latest[archive_method] = results.first().output if results.exists() else None
        return latest
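
    # Example (illustrative) of what canonical_outputs() predicts for a typical
    # non-static page; values are relative to the snapshot's output_dir unless
    # they are absolute URLs (google_favicon_path, archive_org_path):
    #
    #   snap.canonical_outputs()
    #   # -> {'index_path': 'index.html', 'singlefile_path': 'singlefile.html',
    #   #     'screenshot_path': 'screenshot.png', 'dom_path': 'output.html',
    #   #     'wget_path': f'warc/{snap.timestamp}', ...}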

    # =========================================================================
    # Serialization Methods
    # =========================================================================

    def to_dict(self, extended: bool = False) -> Dict[str, Any]:
        """Convert Snapshot to a dictionary (replacement for Link._asdict())"""
        from archivebox.misc.util import ts_to_date_str

        result = {
            'TYPE': 'core.models.Snapshot',
            'id': str(self.id),
            'url': self.url,
            'timestamp': self.timestamp,
            'title': self.title,
            'tags': self.tags_str(),
            'downloaded_at': self.downloaded_at.isoformat() if self.downloaded_at else None,
            'bookmarked_at': self.bookmarked_at.isoformat() if self.bookmarked_at else None,
            'created_at': self.created_at.isoformat() if self.created_at else None,
            # Computed properties
            'domain': self.domain,
            'scheme': self.scheme,
            'base_url': self.base_url,
            'path': self.path,
            'basename': self.basename,
            'extension': self.extension,
            'is_static': self.is_static,
            'is_archived': self.is_archived,
            'archive_path': self.archive_path,
            'output_dir': self.output_dir,
            'link_dir': self.output_dir,  # backwards compatibility alias
            'archive_size': self.archive_size,
            'bookmarked_date': self.bookmarked_date,
            'downloaded_datestr': self.downloaded_datestr,
            'num_outputs': self.num_outputs,
            'num_failures': self.num_failures,
        }
        if extended:
            result['canonical'] = self.canonical_outputs()
        return result

    def to_json(self, indent: int = 4) -> str:
        """Convert to JSON string"""
        return to_json(self.to_dict(extended=True), indent=indent)

    def to_csv(self, cols: Optional[List[str]] = None, separator: str = ',', ljust: int = 0) -> str:
        """Convert to CSV string"""
        data = self.to_dict()
        cols = cols or ['timestamp', 'is_archived', 'url']
        return separator.join(to_json(data.get(col, ''), indent=None).ljust(ljust) for col in cols)

    def write_json_details(self, out_dir: Optional[str] = None) -> None:
        """Write JSON index file for this snapshot to its output directory"""
        out_dir = out_dir or self.output_dir
        path = Path(out_dir) / CONSTANTS.JSON_INDEX_FILENAME
        atomic_write(str(path), self.to_dict(extended=True))

    def write_html_details(self, out_dir: Optional[str] = None) -> None:
        """Write HTML detail page for this snapshot to its output directory"""
        from django.template.loader import render_to_string
        from archivebox.config.common import SERVER_CONFIG
        from archivebox.config.configset import get_config
        from archivebox.misc.logging_util import printable_filesize

        out_dir = out_dir or self.output_dir
        config = get_config()
        SAVE_ARCHIVE_DOT_ORG = config.get('SAVE_ARCHIVE_DOT_ORG', True)
        TITLE_LOADING_MSG = 'Not yet archived...'

        canonical = self.canonical_outputs()
        context = {
            **self.to_dict(extended=True),
            **canonical,
            'canonical': canonical,
            'title': htmlencode(self.title or (self.base_url if self.is_archived else TITLE_LOADING_MSG)),
            'url_str': htmlencode(urldecode(self.base_url)),
            'archive_url': urlencode(f'warc/{self.timestamp}' or (self.domain if self.is_archived else '')) or 'about:blank',
            'extension': self.extension or 'html',
            'tags': self.tags_str() or 'untagged',
            'size': printable_filesize(self.archive_size) if self.archive_size else 'pending',
            'status': 'archived' if self.is_archived else 'not yet archived',
            'status_color': 'success' if self.is_archived else 'danger',
            'oldest_archive_date': ts_to_date_str(self.oldest_archive_date),
            'SAVE_ARCHIVE_DOT_ORG': SAVE_ARCHIVE_DOT_ORG,
            'PREVIEW_ORIGINALS': SERVER_CONFIG.PREVIEW_ORIGINALS,
        }
        rendered_html = render_to_string('snapshot.html', context)
        atomic_write(str(Path(out_dir) / CONSTANTS.HTML_INDEX_FILENAME), rendered_html)

    # =========================================================================
    # Helper Methods
    # =========================================================================

    @staticmethod
    def _ts_to_date_str(dt: Optional[datetime]) -> Optional[str]:
        return dt.strftime('%Y-%m-%d %H:%M:%S') if dt else None
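
# Example (illustrative): per-snapshot serialization; Snapshot.to_csv() JSON-encodes
# each cell, so strings come out quoted:
#
#   snap.to_csv(cols=['timestamp', 'is_archived', 'url'])
#   # -> '"1712345678.0",true,"https://example.com/article"'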


class ArchiveResultManager(models.Manager):
    def indexable(self, sorted: bool = True):
        INDEXABLE_METHODS = [r[0] for r in ARCHIVE_METHODS_INDEXING_PRECEDENCE]
        qs = self.get_queryset().filter(extractor__in=INDEXABLE_METHODS, status='succeeded')
        if sorted:
            precedence = [When(extractor=method, then=Value(p)) for method, p in ARCHIVE_METHODS_INDEXING_PRECEDENCE]
            qs = qs.annotate(indexing_precedence=Case(*precedence, default=Value(1000), output_field=IntegerField())).order_by('indexing_precedence')
        return qs


class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine):
    class StatusChoices(models.TextChoices):
        QUEUED = 'queued', 'Queued'
        STARTED = 'started', 'Started'
        BACKOFF = 'backoff', 'Waiting to retry'
        SUCCEEDED = 'succeeded', 'Succeeded'
        FAILED = 'failed', 'Failed'
        SKIPPED = 'skipped', 'Skipped'

    EXTRACTOR_CHOICES = (
        ('htmltotext', 'htmltotext'),
        ('git', 'git'),
        ('singlefile', 'singlefile'),
        ('media', 'media'),
        ('archive_org', 'archive_org'),
        ('readability', 'readability'),
        ('mercury', 'mercury'),
        ('favicon', 'favicon'),
        ('pdf', 'pdf'),
        ('headers', 'headers'),
        ('screenshot', 'screenshot'),
        ('dom', 'dom'),
        ('title', 'title'),
        ('wget', 'wget'),
    )

    id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False, related_name='archiveresult_set', db_index=True)
    created_at = models.DateTimeField(default=timezone.now, db_index=True)
    modified_at = models.DateTimeField(auto_now=True)

    snapshot: Snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE)  # type: ignore
    extractor = models.CharField(choices=EXTRACTOR_CHOICES, max_length=32, blank=False, null=False, db_index=True)

    pwd = models.CharField(max_length=256, default=None, null=True, blank=True)
    cmd = models.JSONField(default=None, null=True, blank=True)
    cmd_version = models.CharField(max_length=128, default=None, null=True, blank=True)
    output = models.CharField(max_length=1024, default=None, null=True, blank=True)
    start_ts = models.DateTimeField(default=None, null=True, blank=True)
    end_ts = models.DateTimeField(default=None, null=True, blank=True)

    status = ModelWithStateMachine.StatusField(choices=StatusChoices.choices, default=StatusChoices.QUEUED)
    retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)

    notes = models.TextField(blank=True, null=False, default='')
    output_dir = models.CharField(max_length=256, default=None, null=True, blank=True)
    iface = models.ForeignKey(NetworkInterface, on_delete=models.SET_NULL, null=True, blank=True)

    state_machine_name = 'core.statemachines.ArchiveResultMachine'
    retry_at_field_name = 'retry_at'
    state_field_name = 'status'
    active_state = StatusChoices.STARTED

    objects = ArchiveResultManager()

    class Meta(TypedModelMeta):
        verbose_name = 'Archive Result'
        verbose_name_plural = 'Archive Results Log'

    def __str__(self):
        return f'[{self.id}] {self.snapshot.url[:64]} -> {self.extractor}'

    @cached_property
    def snapshot_dir(self):
        return Path(self.snapshot.output_dir)

    @cached_property
    def url(self):
        return self.snapshot.url

    @property
    def api_url(self) -> str:
        return reverse_lazy('api-1:get_archiveresult', args=[self.id])

    def get_absolute_url(self):
        return f'/{self.snapshot.archive_path}/{self.extractor}'

    @property
    def extractor_module(self) -> Any | None:
        # Hook scripts are now used instead of Python extractor modules
        # The extractor name maps to hooks in archivebox/plugins/{extractor}/
        return None

    def output_exists(self) -> bool:
        return os.path.exists(Path(self.snapshot_dir) / self.extractor)

    def create_output_dir(self):
        output_dir = Path(self.snapshot_dir) / self.extractor
        output_dir.mkdir(parents=True, exist_ok=True)
        return output_dir

    @property
    def output_dir_name(self) -> str:
        return self.extractor

    @property
    def output_dir_parent(self) -> str:
        return str(self.snapshot.OUTPUT_DIR.relative_to(CONSTANTS.DATA_DIR))

    def write_indexes(self):
        super().write_indexes()

    def save_search_index(self):
        pass
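
    # Layout note (illustrative): each extractor writes into its own subdirectory of
    # the snapshot's output dir, e.g. .../archive/<timestamp>/<extractor>/;
    # output_exists(), create_output_dir(), and output_dir all point at that path:
    #
    #   ar = snapshot.archiveresult_set.get(extractor='wget')
    #   ar.output_dir       # Path(snapshot.output_dir) / 'wget'
    #   ar.output_exists()  # True once that directory exists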
""" from django.utils import timezone from archivebox.hooks import discover_hooks, run_hook extractor_dir = Path(self.snapshot.output_dir) / self.extractor config_objects = [self.snapshot.crawl, self.snapshot] if self.snapshot.crawl else [self.snapshot] # Discover hook for this extractor hooks = discover_hooks(f'Snapshot__{self.extractor}') if not hooks: self.status = self.StatusChoices.FAILED self.output = f'No hook found for: {self.extractor}' self.retry_at = None self.save() return # Run the hook start_ts = timezone.now() result = run_hook( hooks[0], output_dir=extractor_dir, config_objects=config_objects, url=self.snapshot.url, ) end_ts = timezone.now() # Determine status from return code and JSON output output_json = result.get('output_json') or {} json_status = output_json.get('status') if json_status == 'skipped': status = 'skipped' elif json_status == 'failed': status = 'failed' elif result['returncode'] == 0: status = 'succeeded' else: status = 'failed' # Update self from result status_map = { 'succeeded': self.StatusChoices.SUCCEEDED, 'failed': self.StatusChoices.FAILED, 'skipped': self.StatusChoices.SKIPPED, } self.status = status_map.get(status, self.StatusChoices.FAILED) self.output = output_json.get('output') or result['stdout'][:1024] or result['stderr'][:1024] or None self.start_ts = start_ts self.end_ts = end_ts self.retry_at = None self.save() # Queue any discovered URLs for crawling (parser extractors write urls.jsonl) self._queue_urls_for_crawl(extractor_dir) # Trigger search indexing if succeeded if self.status == self.StatusChoices.SUCCEEDED: self.trigger_search_indexing() def _queue_urls_for_crawl(self, extractor_dir: Path): """ Read urls.jsonl and queue discovered URLs for crawling. Parser extractors output urls.jsonl with discovered URLs and Tags. - Tag records: {"type": "Tag", "name": "..."} - Snapshot records: {"type": "Snapshot", "url": "...", ...} Tags are created in the database. URLs get added to the parent Crawl's queue with metadata (depth, via_snapshot, via_extractor) for recursive crawling. 

    def trigger_search_indexing(self):
        """Run any ArchiveResult__index hooks to update search indexes."""
        from archivebox.hooks import discover_hooks, run_hook

        # Pass config objects in priority order (later overrides earlier)
        config_objects = [self.snapshot.crawl, self.snapshot] if self.snapshot.crawl else [self.snapshot]

        for hook in discover_hooks('ArchiveResult__index'):
            run_hook(
                hook,
                output_dir=self.output_dir,
                config_objects=config_objects,
                snapshot_id=str(self.snapshot.id),
                extractor=self.extractor,
            )

    @property
    def output_dir(self) -> Path:
        """Get the output directory for this extractor's results."""
        return Path(self.snapshot.output_dir) / self.extractor