wip major changes
@@ -1,7 +1,8 @@
__package__ = 'archivebox.core'

from typing import Optional, Dict, Iterable, Any
from typing import Optional, Dict, Iterable, Any, List, TYPE_CHECKING
from uuid import uuid7
from datetime import datetime, timedelta
from django_stubs_ext.db.models import TypedModelMeta

import os
@@ -18,15 +19,11 @@ from django.urls import reverse, reverse_lazy
from django.contrib import admin
from django.conf import settings

import abx

from archivebox.config import CONSTANTS
from archivebox.misc.system import get_dir_size
from archivebox.misc.util import parse_date, base_url, domain as url_domain
from archivebox.misc.system import get_dir_size, atomic_write
from archivebox.misc.util import parse_date, base_url, domain as url_domain, to_json, ts_to_date_str, urlencode, htmlencode, urldecode
from archivebox.misc.hashing import get_dir_info
from archivebox.index.schema import Link
from archivebox.index.html import snapshot_icons
from archivebox.extractors import ARCHIVE_METHODS_INDEXING_PRECEDENCE
from archivebox.hooks import ARCHIVE_METHODS_INDEXING_PRECEDENCE
from archivebox.base_models.models import (
    ModelWithUUID, ModelWithSerializers, ModelWithOutputDir,
    ModelWithConfig, ModelWithNotes, ModelWithHealthStats,
@@ -38,6 +35,7 @@ from crawls.models import Crawl
from machine.models import NetworkInterface


class Tag(ModelWithSerializers):
    id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False, related_name='tag_set')
@@ -94,8 +92,181 @@ class SnapshotManager(models.Manager):
    def get_queryset(self):
        return super().get_queryset().prefetch_related('tags', 'archiveresult_set')

    # =========================================================================
    # Filtering Methods
    # =========================================================================

class Snapshot(ModelWithSerializers, ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine):
    FILTER_TYPES = {
        'exact': lambda pattern: models.Q(url=pattern),
        'substring': lambda pattern: models.Q(url__icontains=pattern),
        'regex': lambda pattern: models.Q(url__iregex=pattern),
        'domain': lambda pattern: models.Q(url__istartswith=f"http://{pattern}") | models.Q(url__istartswith=f"https://{pattern}") | models.Q(url__istartswith=f"ftp://{pattern}"),
        'tag': lambda pattern: models.Q(tags__name=pattern),
        'timestamp': lambda pattern: models.Q(timestamp=pattern),
    }

    def filter_by_patterns(self, patterns: List[str], filter_type: str = 'exact') -> QuerySet:
        """Filter snapshots by URL patterns using specified filter type"""
        from archivebox.misc.logging import stderr

        q_filter = models.Q()
        for pattern in patterns:
            try:
                q_filter = q_filter | self.FILTER_TYPES[filter_type](pattern)
            except KeyError:
                stderr()
                stderr(f'[X] Got invalid pattern for --filter-type={filter_type}:', color='red')
                stderr(f' {pattern}')
                raise SystemExit(2)
        return self.filter(q_filter)
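
    # Illustrative usage sketch (added for clarity; not part of this commit). Assumes
    # Snapshot.objects is this SnapshotManager and some Snapshot rows already exist:
    #
    #   Snapshot.objects.filter_by_patterns(['https://example.com/page'], filter_type='exact')
    #   Snapshot.objects.filter_by_patterns(['example.com'], filter_type='domain')   # matches http/https/ftp prefixes
    #   Snapshot.objects.filter_by_patterns(['news'], filter_type='tag')             # matches tags__name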

    def search(self, patterns: List[str]) -> QuerySet:
        """Search snapshots using the configured search backend"""
        from archivebox.config.common import SEARCH_BACKEND_CONFIG
        from archivebox.search import query_search_index
        from archivebox.misc.logging import stderr

        if not SEARCH_BACKEND_CONFIG.USE_SEARCHING_BACKEND:
            stderr()
            stderr('[X] The search backend is not enabled, set config.USE_SEARCHING_BACKEND = True', color='red')
            raise SystemExit(2)

        qsearch = self.none()
        for pattern in patterns:
            try:
                qsearch |= query_search_index(pattern)
            except Exception:
                raise SystemExit(2)
        return self.all() & qsearch

    # =========================================================================
    # Export Methods
    # =========================================================================

    def to_json(self, with_headers: bool = False) -> str:
        """Generate JSON index from snapshots"""
        import sys
        from datetime import datetime, timezone as tz
        from archivebox.config import VERSION
        from archivebox.config.common import SERVER_CONFIG

        MAIN_INDEX_HEADER = {
            'info': 'This is an index of site data archived by ArchiveBox: The self-hosted web archive.',
            'schema': 'archivebox.index.json',
            'copyright_info': SERVER_CONFIG.FOOTER_INFO,
            'meta': {
                'project': 'ArchiveBox',
                'version': VERSION,
                'git_sha': VERSION,
                'website': 'https://ArchiveBox.io',
                'docs': 'https://github.com/ArchiveBox/ArchiveBox/wiki',
                'source': 'https://github.com/ArchiveBox/ArchiveBox',
                'issues': 'https://github.com/ArchiveBox/ArchiveBox/issues',
                'dependencies': {},
            },
        } if with_headers else {}

        snapshot_dicts = [s.to_dict(extended=True) for s in self.iterator(chunk_size=500)]

        if with_headers:
            output = {
                **MAIN_INDEX_HEADER,
                'num_links': len(snapshot_dicts),
                'updated': datetime.now(tz.utc),
                'last_run_cmd': sys.argv,
                'links': snapshot_dicts,
            }
        else:
            output = snapshot_dicts
        return to_json(output, indent=4, sort_keys=True)

    def to_csv(self, cols: Optional[List[str]] = None, header: bool = True, separator: str = ',', ljust: int = 0) -> str:
        """Generate CSV output from snapshots"""
        cols = cols or ['timestamp', 'is_archived', 'url']
        header_str = separator.join(col.ljust(ljust) for col in cols) if header else ''
        row_strs = (s.to_csv(cols=cols, ljust=ljust, separator=separator) for s in self.iterator(chunk_size=500))
        return '\n'.join((header_str, *row_strs))

    def to_html(self, with_headers: bool = True) -> str:
        """Generate main index HTML from snapshots"""
        from datetime import datetime, timezone as tz
        from django.template.loader import render_to_string
        from archivebox.config import VERSION
        from archivebox.config.common import SERVER_CONFIG
        from archivebox.config.version import get_COMMIT_HASH

        template = 'static_index.html' if with_headers else 'minimal_index.html'
        snapshot_list = list(self.iterator(chunk_size=500))

        return render_to_string(template, {
            'version': VERSION,
            'git_sha': get_COMMIT_HASH() or VERSION,
            'num_links': str(len(snapshot_list)),
            'date_updated': datetime.now(tz.utc).strftime('%Y-%m-%d'),
            'time_updated': datetime.now(tz.utc).strftime('%Y-%m-%d %H:%M'),
            'links': snapshot_list,
            'FOOTER_INFO': SERVER_CONFIG.FOOTER_INFO,
        })
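
    # Illustrative sketch (added for clarity; not part of this commit): writing top-level
    # index files from these export helpers. Assumes Snapshot.objects is this manager;
    # the index.* filenames are example names, not constants defined in this module.
    #
    #   atomic_write('./index.json', Snapshot.objects.to_json(with_headers=True))
    #   atomic_write('./index.html', Snapshot.objects.to_html(with_headers=True))
    #   atomic_write('./index.csv',  Snapshot.objects.to_csv(cols=['timestamp', 'is_archived', 'url']))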

    # =========================================================================
    # Import Methods
    # =========================================================================

    def create_or_update_from_dict(self, link_dict: Dict[str, Any], created_by_id: Optional[int] = None) -> 'Snapshot':
        """Create or update a Snapshot from a SnapshotDict (parser output)"""
        import re
        from archivebox.config.common import GENERAL_CONFIG

        url = link_dict['url']
        timestamp = link_dict.get('timestamp')
        title = link_dict.get('title')
        tags_str = link_dict.get('tags')

        tag_list = []
        if tags_str:
            tag_list = list(dict.fromkeys(
                tag.strip() for tag in re.split(GENERAL_CONFIG.TAG_SEPARATOR_PATTERN, tags_str)
                if tag.strip()
            ))

        try:
            snapshot = self.get(url=url)
            if title and (not snapshot.title or len(title) > len(snapshot.title or '')):
                snapshot.title = title
                snapshot.save(update_fields=['title', 'modified_at'])
        except self.model.DoesNotExist:
            if timestamp:
                while self.filter(timestamp=timestamp).exists():
                    timestamp = str(float(timestamp) + 1.0)

            snapshot = self.create(
                url=url,
                timestamp=timestamp,
                title=title,
                created_by_id=created_by_id or get_or_create_system_user_pk(),
            )

        if tag_list:
            existing_tags = set(snapshot.tags.values_list('name', flat=True))
            new_tags = set(tag_list) | existing_tags
            snapshot.save_tags(new_tags)

        return snapshot

    def create_from_dicts(self, link_dicts: List[Dict[str, Any]], created_by_id: Optional[int] = None) -> List['Snapshot']:
        """Create or update multiple Snapshots from a list of SnapshotDicts"""
        return [self.create_or_update_from_dict(d, created_by_id=created_by_id) for d in link_dicts]
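
    # Illustrative sketch (added for clarity; not part of this commit): the dict shape these
    # import helpers expect from parser output. Only 'url' is required; 'timestamp', 'title',
    # and 'tags' are optional, and 'tags' is split on GENERAL_CONFIG.TAG_SEPARATOR_PATTERN.
    # All values below are hypothetical:
    #
    #   example_link_dict = {
    #       'url': 'https://example.com/article',
    #       'timestamp': '1712000000.0',
    #       'title': 'Example Article',
    #       'tags': 'news,example',
    #   }
    #   snapshot = Snapshot.objects.create_or_update_from_dict(example_link_dict)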

    def remove(self, atomic: bool = False) -> tuple:
        """Remove snapshots from the database"""
        from django.db import transaction
        if atomic:
            with transaction.atomic():
                return self.delete()
        return self.delete()


class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine):
    id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False, related_name='snapshot_set', db_index=True)
    created_at = models.DateTimeField(default=timezone.now, db_index=True)
@@ -108,6 +279,7 @@ class Snapshot(ModelWithSerializers, ModelWithOutputDir, ModelWithConfig, ModelW

    title = models.CharField(max_length=512, null=True, blank=True, db_index=True)
    downloaded_at = models.DateTimeField(default=None, null=True, editable=False, db_index=True, blank=True)
    depth = models.PositiveSmallIntegerField(default=0, db_index=True)  # 0 for root snapshot, 1+ for discovered URLs

    retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
    status = ModelWithStateMachine.StatusField(choices=ModelWithStateMachine.StatusChoices, default=ModelWithStateMachine.StatusChoices.QUEUED)
@@ -152,9 +324,6 @@ class Snapshot(ModelWithSerializers, ModelWithOutputDir, ModelWithConfig, ModelW
    def archive(self, overwrite=False, methods=None):
        return bg_archive_snapshot(self, overwrite=overwrite, methods=methods)

    def as_link(self) -> Link:
        return Link.from_json(self.as_json())

    @admin.display(description='Tags')
    def tags_str(self, nocache=True) -> str | None:
        calc_tags_str = lambda: ','.join(sorted(tag.name for tag in self.tags.all()))
@@ -164,7 +333,55 @@ class Snapshot(ModelWithSerializers, ModelWithOutputDir, ModelWithConfig, ModelW
        return cache.get_or_set(cache_key, calc_tags_str) if not nocache else calc_tags_str()

    def icons(self) -> str:
        return snapshot_icons(self)
        """Generate HTML icons showing which extractors have succeeded for this snapshot"""
        from django.utils.html import format_html, mark_safe
        from collections import defaultdict

        cache_key = f'result_icons:{self.pk}:{(self.downloaded_at or self.modified_at or self.created_at or self.bookmarked_at).timestamp()}'

        def calc_icons():
            if hasattr(self, '_prefetched_objects_cache') and 'archiveresult_set' in self._prefetched_objects_cache:
                archive_results = [r for r in self.archiveresult_set.all() if r.status == "succeeded" and r.output]
            else:
                archive_results = self.archiveresult_set.filter(status="succeeded", output__isnull=False)

            path = self.archive_path
            canon = self.canonical_outputs()
            output = ""
            output_template = '<a href="/{}/{}" class="exists-{}" title="{}">{}</a> '
            icons = {
                "singlefile": "❶", "wget": "🆆", "dom": "🅷", "pdf": "📄",
                "screenshot": "💻", "media": "📼", "git": "🅶", "archive_org": "🏛",
                "readability": "🆁", "mercury": "🅼", "warc": "📦"
            }
            exclude = ["favicon", "title", "headers", "htmltotext", "archive_org"]

            extractor_outputs = defaultdict(lambda: None)
            for extractor, _ in ArchiveResult.EXTRACTOR_CHOICES:
                for result in archive_results:
                    if result.extractor == extractor:
                        extractor_outputs[extractor] = result

            for extractor, _ in ArchiveResult.EXTRACTOR_CHOICES:
                if extractor not in exclude:
                    existing = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
                    output += format_html(output_template, path, canon.get(extractor, ''), str(bool(existing)), extractor, icons.get(extractor, "?"))
                if extractor == "wget":
                    exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
                    output += format_html(output_template, path, canon.get("warc", "warc/"), str(bool(exists)), "warc", icons.get("warc", "?"))
                if extractor == "archive_org":
                    exists = extractor in extractor_outputs and extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
                    output += '<a href="{}" class="exists-{}" title="{}">{}</a> '.format(canon.get("archive_org", ""), str(exists), "archive_org", icons.get("archive_org", "?"))

            return format_html('<span class="files-icons" style="font-size: 1.1em; opacity: 0.8; min-width: 240px; display: inline-block">{}</span>', mark_safe(output))

        cache_result = cache.get(cache_key)
        if cache_result:
            return cache_result

        fresh_result = calc_icons()
        cache.set(cache_key, fresh_result, timeout=60 * 60 * 24)
        return fresh_result

    @property
    def api_url(self) -> str:
@@ -178,7 +395,8 @@ class Snapshot(ModelWithSerializers, ModelWithOutputDir, ModelWithConfig, ModelW
        return url_domain(self.url)

    @cached_property
    def link_dir(self):
    def output_dir(self):
        """The filesystem path to the snapshot's output directory."""
        return str(CONSTANTS.ARCHIVE_DIR / self.timestamp)

    @cached_property
@@ -188,7 +406,7 @@ class Snapshot(ModelWithSerializers, ModelWithOutputDir, ModelWithConfig, ModelW
    @cached_property
    def archive_size(self):
        try:
            return get_dir_size(self.link_dir)[0]
            return get_dir_size(self.output_dir)[0]
        except Exception:
            return 0

@@ -200,20 +418,327 @@ class Snapshot(ModelWithSerializers, ModelWithOutputDir, ModelWithConfig, ModelW
    def pending_archiveresults(self) -> QuerySet['ArchiveResult']:
        return self.archiveresult_set.exclude(status__in=ArchiveResult.FINAL_OR_ACTIVE_STATES)

    def run(self) -> list['ArchiveResult']:
        """
        Execute this Snapshot by creating ArchiveResults for all enabled extractors.

        Called by the state machine when entering the 'started' state.
        """
        return self.create_pending_archiveresults()

    def create_pending_archiveresults(self) -> list['ArchiveResult']:
        ALL_EXTRACTORS = ['favicon', 'title', 'screenshot', 'headers', 'singlefile', 'dom', 'git', 'archive_org', 'readability', 'mercury', 'pdf', 'wget']
        """
        Create ArchiveResult records for all enabled extractors.

        Uses the hooks system to discover available extractors from:
        - archivebox/plugins/*/on_Snapshot__*.{py,sh,js}
        - data/plugins/*/on_Snapshot__*.{py,sh,js}
        """
        from archivebox.hooks import get_enabled_extractors

        extractors = get_enabled_extractors()
        archiveresults = []
        for extractor in ALL_EXTRACTORS:

        for extractor in extractors:
            if ArchiveResult.objects.filter(snapshot=self, extractor=extractor).exists():
                continue
            archiveresult, _ = ArchiveResult.objects.get_or_create(
                snapshot=self, extractor=extractor,
                defaults={'status': ArchiveResult.INITIAL_STATE, 'retry_at': timezone.now()},
                defaults={
                    'status': ArchiveResult.INITIAL_STATE,
                    'retry_at': timezone.now(),
                    'created_by_id': self.created_by_id,
                },
            )
            if archiveresult.status == ArchiveResult.INITIAL_STATE:
                archiveresults.append(archiveresult)
        return archiveresults
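
    # Illustrative layout sketch (added for clarity; not part of this commit), based on the
    # docstring above. The filename-to-extractor mapping shown here is an assumption about
    # how get_enabled_extractors() derives extractor names from hook scripts:
    #
    #   archivebox/plugins/wget/on_Snapshot__wget.sh              ->  extractor 'wget'
    #   archivebox/plugins/screenshot/on_Snapshot__screenshot.py  ->  extractor 'screenshot'
    #   data/plugins/myplugin/on_Snapshot__myextractor.js         ->  extractor 'myextractor'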

    def retry_failed_archiveresults(self, retry_at: Optional['timezone.datetime'] = None) -> int:
        """
        Reset failed/skipped ArchiveResults to queued for retry.

        This enables seamless retry of the entire extraction pipeline:
        - Resets FAILED and SKIPPED results to QUEUED
        - Sets retry_at so workers pick them up
        - Extractors run in order (numeric prefix)
        - Each extractor checks its dependencies at runtime

        Dependency handling (e.g., chrome_session → screenshot):
        - Extractors check if required outputs exist before running
        - If dependency output missing → extractor returns 'skipped'
        - On retry, if dependency now succeeds → dependent can run

        Returns count of ArchiveResults reset.
        """
        retry_at = retry_at or timezone.now()

        count = self.archiveresult_set.filter(
            status__in=[
                ArchiveResult.StatusChoices.FAILED,
                ArchiveResult.StatusChoices.SKIPPED,
            ]
        ).update(
            status=ArchiveResult.StatusChoices.QUEUED,
            retry_at=retry_at,
            output=None,
            start_ts=None,
            end_ts=None,
        )

        # Also reset the snapshot so it gets re-checked
        if count > 0:
            self.status = self.StatusChoices.STARTED
            self.retry_at = retry_at
            self.save(update_fields=['status', 'retry_at', 'modified_at'])

        return count
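
    # Illustrative usage sketch (added for clarity; not part of this commit): re-queue
    # everything that failed or was skipped, optionally delaying worker pickup.
    #
    #   num_requeued = snapshot.retry_failed_archiveresults()
    #   num_requeued = snapshot.retry_failed_archiveresults(retry_at=timezone.now() + timedelta(minutes=5))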

    # =========================================================================
    # URL Helper Properties (migrated from Link schema)
    # =========================================================================

    @cached_property
    def url_hash(self) -> str:
        from hashlib import sha256
        return sha256(self.url.encode()).hexdigest()[:8]

    @cached_property
    def scheme(self) -> str:
        return self.url.split('://')[0]

    @cached_property
    def path(self) -> str:
        parts = self.url.split('://', 1)
        return '/' + parts[1].split('/', 1)[1] if len(parts) > 1 and '/' in parts[1] else '/'

    @cached_property
    def basename(self) -> str:
        return self.path.split('/')[-1]

    @cached_property
    def extension(self) -> str:
        basename = self.basename
        return basename.split('.')[-1] if '.' in basename else ''

    @cached_property
    def base_url(self) -> str:
        return f'{self.scheme}://{self.domain}'

    @cached_property
    def is_static(self) -> bool:
        static_extensions = {'.pdf', '.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg', '.mp4', '.mp3', '.wav', '.webm'}
        return any(self.url.lower().endswith(ext) for ext in static_extensions)

    @cached_property
    def is_archived(self) -> bool:
        output_paths = (
            self.domain,
            'output.html',
            'output.pdf',
            'screenshot.png',
            'singlefile.html',
            'readability/content.html',
            'mercury/content.html',
            'htmltotext.txt',
            'media',
            'git',
        )
        return any((Path(self.output_dir) / path).exists() for path in output_paths)
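
    # Worked example (added for clarity; not part of this commit) of the URL helpers above
    # for a hypothetical snapshot with url='https://example.com/docs/guide.pdf':
    #
    #   scheme    -> 'https'
    #   path      -> '/docs/guide.pdf'
    #   basename  -> 'guide.pdf'
    #   extension -> 'pdf'
    #   base_url  -> 'https://example.com'   (assuming self.domain == 'example.com')
    #   is_static -> True                    ('.pdf' is in static_extensions)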

    # =========================================================================
    # Date/Time Properties (migrated from Link schema)
    # =========================================================================

    @cached_property
    def bookmarked_date(self) -> Optional[str]:
        max_ts = (timezone.now() + timedelta(days=30)).timestamp()
        if self.timestamp and self.timestamp.replace('.', '').isdigit():
            if 0 < float(self.timestamp) < max_ts:
                return self._ts_to_date_str(datetime.fromtimestamp(float(self.timestamp)))
            return str(self.timestamp)
        return None

    @cached_property
    def downloaded_datestr(self) -> Optional[str]:
        return self._ts_to_date_str(self.downloaded_at) if self.downloaded_at else None

    @cached_property
    def archive_dates(self) -> List[datetime]:
        return [
            result.start_ts
            for result in self.archiveresult_set.all()
            if result.start_ts
        ]

    @cached_property
    def oldest_archive_date(self) -> Optional[datetime]:
        dates = self.archive_dates
        return min(dates) if dates else None

    @cached_property
    def newest_archive_date(self) -> Optional[datetime]:
        dates = self.archive_dates
        return max(dates) if dates else None

    @cached_property
    def num_outputs(self) -> int:
        return self.archiveresult_set.filter(status='succeeded').count()

    @cached_property
    def num_failures(self) -> int:
        return self.archiveresult_set.filter(status='failed').count()

    # =========================================================================
    # Output Path Methods (migrated from Link schema)
    # =========================================================================

    def canonical_outputs(self) -> Dict[str, Optional[str]]:
        """Predict the expected output paths that should be present after archiving"""
        FAVICON_PROVIDER = 'https://www.google.com/s2/favicons?domain={}'
        canonical = {
            'index_path': 'index.html',
            'favicon_path': 'favicon.ico',
            'google_favicon_path': FAVICON_PROVIDER.format(self.domain),
            'wget_path': f'warc/{self.timestamp}',
            'warc_path': 'warc/',
            'singlefile_path': 'singlefile.html',
            'readability_path': 'readability/content.html',
            'mercury_path': 'mercury/content.html',
            'htmltotext_path': 'htmltotext.txt',
            'pdf_path': 'output.pdf',
            'screenshot_path': 'screenshot.png',
            'dom_path': 'output.html',
            'archive_org_path': f'https://web.archive.org/web/{self.base_url}',
            'git_path': 'git/',
            'media_path': 'media/',
            'headers_path': 'headers.json',
        }

        if self.is_static:
            static_path = f'warc/{self.timestamp}'
            canonical.update({
                'title': self.basename,
                'wget_path': static_path,
                'pdf_path': static_path,
                'screenshot_path': static_path,
                'dom_path': static_path,
                'singlefile_path': static_path,
                'readability_path': static_path,
                'mercury_path': static_path,
                'htmltotext_path': static_path,
            })
        return canonical

    def latest_outputs(self, status: Optional[str] = None) -> Dict[str, Any]:
        """Get the latest output that each archive method produced"""
        from archivebox.hooks import get_extractors

        latest: Dict[str, Any] = {}
        for archive_method in get_extractors():
            results = self.archiveresult_set.filter(extractor=archive_method)
            if status is not None:
                results = results.filter(status=status)
            results = results.filter(output__isnull=False).order_by('-start_ts')
            latest[archive_method] = results.first().output if results.exists() else None
        return latest

    # =========================================================================
    # Serialization Methods
    # =========================================================================

    def to_dict(self, extended: bool = False) -> Dict[str, Any]:
        """Convert Snapshot to a dictionary (replacement for Link._asdict())"""
        from archivebox.misc.util import ts_to_date_str

        result = {
            'TYPE': 'core.models.Snapshot',
            'id': str(self.id),
            'url': self.url,
            'timestamp': self.timestamp,
            'title': self.title,
            'tags': self.tags_str(),
            'downloaded_at': self.downloaded_at.isoformat() if self.downloaded_at else None,
            'bookmarked_at': self.bookmarked_at.isoformat() if self.bookmarked_at else None,
            'created_at': self.created_at.isoformat() if self.created_at else None,
            # Computed properties
            'domain': self.domain,
            'scheme': self.scheme,
            'base_url': self.base_url,
            'path': self.path,
            'basename': self.basename,
            'extension': self.extension,
            'is_static': self.is_static,
            'is_archived': self.is_archived,
            'archive_path': self.archive_path,
            'output_dir': self.output_dir,
            'link_dir': self.output_dir,  # backwards compatibility alias
            'archive_size': self.archive_size,
            'bookmarked_date': self.bookmarked_date,
            'downloaded_datestr': self.downloaded_datestr,
            'num_outputs': self.num_outputs,
            'num_failures': self.num_failures,
        }
        if extended:
            result['canonical'] = self.canonical_outputs()
        return result

    def to_json(self, indent: int = 4) -> str:
        """Convert to JSON string"""
        return to_json(self.to_dict(extended=True), indent=indent)

    def to_csv(self, cols: Optional[List[str]] = None, separator: str = ',', ljust: int = 0) -> str:
        """Convert to CSV string"""
        data = self.to_dict()
        cols = cols or ['timestamp', 'is_archived', 'url']
        return separator.join(to_json(data.get(col, ''), indent=None).ljust(ljust) for col in cols)

    def write_json_details(self, out_dir: Optional[str] = None) -> None:
        """Write JSON index file for this snapshot to its output directory"""
        out_dir = out_dir or self.output_dir
        path = Path(out_dir) / CONSTANTS.JSON_INDEX_FILENAME
        atomic_write(str(path), self.to_dict(extended=True))

    def write_html_details(self, out_dir: Optional[str] = None) -> None:
        """Write HTML detail page for this snapshot to its output directory"""
        from django.template.loader import render_to_string
        from archivebox.config.common import SERVER_CONFIG
        from archivebox.config.configset import get_config
        from archivebox.misc.logging_util import printable_filesize

        out_dir = out_dir or self.output_dir
        config = get_config()
        SAVE_ARCHIVE_DOT_ORG = config.get('SAVE_ARCHIVE_DOT_ORG', True)
        TITLE_LOADING_MSG = 'Not yet archived...'

        canonical = self.canonical_outputs()
        context = {
            **self.to_dict(extended=True),
            **{f'{k}_path': v for k, v in canonical.items()},
            'canonical': {f'{k}_path': v for k, v in canonical.items()},
            'title': htmlencode(self.title or (self.base_url if self.is_archived else TITLE_LOADING_MSG)),
            'url_str': htmlencode(urldecode(self.base_url)),
            'archive_url': urlencode(f'warc/{self.timestamp}' or (self.domain if self.is_archived else '')) or 'about:blank',
            'extension': self.extension or 'html',
            'tags': self.tags_str() or 'untagged',
            'size': printable_filesize(self.archive_size) if self.archive_size else 'pending',
            'status': 'archived' if self.is_archived else 'not yet archived',
            'status_color': 'success' if self.is_archived else 'danger',
            'oldest_archive_date': ts_to_date_str(self.oldest_archive_date),
            'SAVE_ARCHIVE_DOT_ORG': SAVE_ARCHIVE_DOT_ORG,
            'PREVIEW_ORIGINALS': SERVER_CONFIG.PREVIEW_ORIGINALS,
        }
        rendered_html = render_to_string('snapshot.html', context)
        atomic_write(str(Path(out_dir) / CONSTANTS.HTML_INDEX_FILENAME), rendered_html)

    # =========================================================================
    # Helper Methods
    # =========================================================================

    @staticmethod
    def _ts_to_date_str(dt: Optional[datetime]) -> Optional[str]:
        return dt.strftime('%Y-%m-%d %H:%M:%S') if dt else None


class ArchiveResultManager(models.Manager):
    def indexable(self, sorted: bool = True):
@@ -225,7 +750,7 @@ class ArchiveResultManager(models.Manager):
        return qs


class ArchiveResult(ModelWithSerializers, ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine):
class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine):
    class StatusChoices(models.TextChoices):
        QUEUED = 'queued', 'Queued'
        STARTED = 'started', 'Started'
@@ -277,7 +802,7 @@ class ArchiveResult(ModelWithSerializers, ModelWithOutputDir, ModelWithConfig, M

    @cached_property
    def snapshot_dir(self):
        return Path(self.snapshot.link_dir)
        return Path(self.snapshot.output_dir)

    @cached_property
    def url(self):
@@ -292,7 +817,9 @@ class ArchiveResult(ModelWithSerializers, ModelWithOutputDir, ModelWithConfig, M

    @property
    def extractor_module(self) -> Any | None:
        return abx.as_dict(abx.pm.hook.get_EXTRACTORS()).get(self.extractor, None)
        # Hook scripts are now used instead of Python extractor modules
        # The extractor name maps to hooks in archivebox/plugins/{extractor}/
        return None

    def output_exists(self) -> bool:
        return os.path.exists(Path(self.snapshot_dir) / self.extractor)
@@ -315,3 +842,150 @@ class ArchiveResult(ModelWithSerializers, ModelWithOutputDir, ModelWithConfig, M

    def save_search_index(self):
        pass

    def run(self):
        """
        Execute this ArchiveResult's extractor and update status.

        Discovers and runs the hook script for self.extractor,
        updates status/output fields, queues discovered URLs, and triggers indexing.
        """
        from django.utils import timezone
        from archivebox.hooks import discover_hooks, run_hook

        extractor_dir = Path(self.snapshot.output_dir) / self.extractor
        config_objects = [self.snapshot.crawl, self.snapshot] if self.snapshot.crawl else [self.snapshot]

        # Discover hook for this extractor
        hooks = discover_hooks(f'Snapshot__{self.extractor}')
        if not hooks:
            self.status = self.StatusChoices.FAILED
            self.output = f'No hook found for: {self.extractor}'
            self.retry_at = None
            self.save()
            return

        # Run the hook
        start_ts = timezone.now()
        result = run_hook(
            hooks[0],
            output_dir=extractor_dir,
            config_objects=config_objects,
            url=self.snapshot.url,
        )
        end_ts = timezone.now()

        # Determine status from return code and JSON output
        output_json = result.get('output_json') or {}
        json_status = output_json.get('status')

        if json_status == 'skipped':
            status = 'skipped'
        elif json_status == 'failed':
            status = 'failed'
        elif result['returncode'] == 0:
            status = 'succeeded'
        else:
            status = 'failed'

        # Update self from result
        status_map = {
            'succeeded': self.StatusChoices.SUCCEEDED,
            'failed': self.StatusChoices.FAILED,
            'skipped': self.StatusChoices.SKIPPED,
        }
        self.status = status_map.get(status, self.StatusChoices.FAILED)
        self.output = output_json.get('output') or result['stdout'][:1024] or result['stderr'][:1024] or None
        self.start_ts = start_ts
        self.end_ts = end_ts
        self.retry_at = None
        self.save()

        # Queue any discovered URLs for crawling (parser extractors write urls.jsonl)
        self._queue_urls_for_crawl(extractor_dir)

        # Trigger search indexing if succeeded
        if self.status == self.StatusChoices.SUCCEEDED:
            self.trigger_search_indexing()
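
    # Illustrative sketch (added for clarity; not part of this commit) of the result shape
    # this method assumes run_hook() returns, based only on the keys accessed above:
    #
    #   result = {
    #       'returncode': 0,
    #       'stdout': '...',
    #       'stderr': '',
    #       'output_json': {'status': 'succeeded', 'output': 'screenshot.png'},  # optional
    #   }
    #
    # A hook-reported 'skipped'/'failed' status takes precedence over the return code, so an
    # extractor can decline work (e.g., a missing dependency output) without being marked failed.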

    def _queue_urls_for_crawl(self, extractor_dir: Path):
        """
        Read urls.jsonl and queue discovered URLs for crawling.

        Parser extractors output urls.jsonl with discovered URLs and Tags.
        - Tag records: {"type": "Tag", "name": "..."}
        - Snapshot records: {"type": "Snapshot", "url": "...", ...}

        Tags are created in the database.
        URLs get added to the parent Crawl's queue with metadata
        (depth, via_snapshot, via_extractor) for recursive crawling.

        Used at all depths:
        - depth=0: Initial source file (e.g., bookmarks.html) parsed for URLs
        - depth>0: Crawled pages parsed for outbound links
        """
        import json

        if not self.snapshot.crawl:
            return

        urls_file = extractor_dir / 'urls.jsonl'
        if not urls_file.exists():
            return

        urls_added = 0
        tags_created = 0
        with open(urls_file, 'r') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    entry = json.loads(line)
                    record_type = entry.get('type', 'Snapshot')

                    # Handle Tag records
                    if record_type == 'Tag':
                        tag_name = entry.get('name')
                        if tag_name:
                            Tag.objects.get_or_create(name=tag_name)
                            tags_created += 1
                        continue

                    # Handle Snapshot records (or records without type)
                    if not entry.get('url'):
                        continue

                    # Add crawl metadata
                    entry['depth'] = self.snapshot.depth + 1
                    entry['via_snapshot'] = str(self.snapshot.id)
                    entry['via_extractor'] = self.extractor

                    if self.snapshot.crawl.add_url(entry):
                        urls_added += 1
                except json.JSONDecodeError:
                    continue

        if urls_added > 0:
            self.snapshot.crawl.create_snapshots_from_urls()
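
    # Illustrative urls.jsonl content (added for clarity; not part of this commit), following
    # the record shapes described in the docstring above; all values are hypothetical:
    #
    #   {"type": "Tag", "name": "bookmarks"}
    #   {"type": "Snapshot", "url": "https://example.com/linked-page", "title": "Linked Page"}
    #   {"url": "https://example.com/untyped-records-default-to-snapshot"}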

    def trigger_search_indexing(self):
        """Run any ArchiveResult__index hooks to update search indexes."""
        from archivebox.hooks import discover_hooks, run_hook

        # Pass config objects in priority order (later overrides earlier)
        config_objects = [self.snapshot.crawl, self.snapshot] if self.snapshot.crawl else [self.snapshot]

        for hook in discover_hooks('ArchiveResult__index'):
            run_hook(
                hook,
                output_dir=self.output_dir,
                config_objects=config_objects,
                snapshot_id=str(self.snapshot.id),
                extractor=self.extractor,
            )

    @property
    def output_dir(self) -> Path:
        """Get the output directory for this extractor's results."""
        return Path(self.snapshot.output_dir) / self.extractor