mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
3383 lines
130 KiB
Python
Executable File
3383 lines
130 KiB
Python
Executable File
__package__ = "archivebox.core"
|
|
|
|
from typing import Optional, Any, cast
|
|
from collections.abc import Iterable, Sequence
|
|
import uuid
|
|
from archivebox.uuid_compat import uuid7
|
|
from datetime import datetime, timedelta
|
|
|
|
import os
|
|
import json
|
|
from pathlib import Path
|
|
|
|
from statemachine import State, registry
|
|
|
|
from django.db import models
|
|
from django.db.models import QuerySet
|
|
from django.utils.functional import cached_property
|
|
from django.utils.text import slugify
|
|
from django.utils import timezone
|
|
from django.core.cache import cache
|
|
from django.urls import reverse_lazy
|
|
from django.contrib import admin
|
|
from django.conf import settings
|
|
from django.utils.safestring import mark_safe
|
|
|
|
from archivebox.config import CONSTANTS
|
|
from archivebox.misc.system import get_dir_size, atomic_write
|
|
from archivebox.misc.util import parse_date, domain as url_domain, to_json, ts_to_date_str, urlencode, htmlencode, urldecode
|
|
from archivebox.hooks import (
|
|
get_plugins,
|
|
get_plugin_name,
|
|
get_plugin_icon,
|
|
)
|
|
from archivebox.base_models.models import (
|
|
ModelWithUUID,
|
|
ModelWithOutputDir,
|
|
ModelWithConfig,
|
|
ModelWithNotes,
|
|
ModelWithHealthStats,
|
|
get_or_create_system_user_pk,
|
|
)
|
|
from archivebox.workers.models import ModelWithStateMachine, BaseStateMachine
|
|
from archivebox.workers.tasks import bg_archive_snapshot
|
|
from archivebox.crawls.models import Crawl
|
|
from archivebox.machine.models import Binary
|
|
|
|
|
|
class Tag(ModelWithUUID):
|
|
# Keep AutoField for compatibility with main branch migrations
|
|
# Don't use UUIDField here - requires complex FK transformation
|
|
id = models.AutoField(primary_key=True, serialize=False, verbose_name="ID")
|
|
created_by = models.ForeignKey(
|
|
settings.AUTH_USER_MODEL,
|
|
on_delete=models.CASCADE,
|
|
default=get_or_create_system_user_pk,
|
|
null=True,
|
|
related_name="tag_set",
|
|
)
|
|
created_at = models.DateTimeField(default=timezone.now, db_index=True, null=True)
|
|
modified_at = models.DateTimeField(auto_now=True)
|
|
name = models.CharField(unique=True, blank=False, max_length=100)
|
|
slug = models.SlugField(unique=True, blank=False, max_length=100, editable=False)
|
|
|
|
snapshot_set: models.Manager["Snapshot"]
|
|
|
|
class Meta(ModelWithUUID.Meta):
|
|
app_label = "core"
|
|
verbose_name = "Tag"
|
|
verbose_name_plural = "Tags"
|
|
|
|
def __str__(self):
|
|
return self.name
|
|
|
|
def _generate_unique_slug(self) -> str:
|
|
base_slug = slugify(self.name) or "tag"
|
|
existing = Tag.objects.filter(slug__startswith=base_slug)
|
|
if self.pk:
|
|
existing = existing.exclude(pk=self.pk)
|
|
existing_slugs = set(existing.values_list("slug", flat=True))
|
|
|
|
slug = base_slug
|
|
i = 1
|
|
while slug in existing_slugs:
|
|
slug = f"{base_slug}_{i}"
|
|
i += 1
|
|
return slug
|
|
|
|
def save(self, *args, **kwargs):
|
|
existing_name = None
|
|
if self.pk:
|
|
existing_name = Tag.objects.filter(pk=self.pk).values_list("name", flat=True).first()
|
|
|
|
if not self.slug or existing_name != self.name:
|
|
self.slug = self._generate_unique_slug()
|
|
super().save(*args, **kwargs)
|
|
|
|
# if is_new:
|
|
# from archivebox.misc.logging_util import log_worker_event
|
|
# log_worker_event(
|
|
# worker_type='DB',
|
|
# event='Created Tag',
|
|
# indent_level=0,
|
|
# metadata={
|
|
# 'id': self.id,
|
|
# 'name': self.name,
|
|
# 'slug': self.slug,
|
|
# },
|
|
# )
|
|
|
|
@property
|
|
def api_url(self) -> str:
|
|
return str(reverse_lazy("api-1:get_tag", args=[self.id]))
|
|
|
|
def to_json(self) -> dict:
|
|
"""
|
|
Convert Tag model instance to a JSON-serializable dict.
|
|
"""
|
|
from archivebox.config import VERSION
|
|
|
|
return {
|
|
"type": "Tag",
|
|
"schema_version": VERSION,
|
|
"id": str(self.id),
|
|
"name": self.name,
|
|
"slug": self.slug,
|
|
}
|
|
|
|
@staticmethod
|
|
def from_json(record: dict[str, Any], overrides: dict[str, Any] | None = None):
|
|
"""
|
|
Create/update Tag from JSON dict.
|
|
|
|
Args:
|
|
record: JSON dict with 'name' field
|
|
overrides: Optional dict with 'snapshot' to auto-attach tag
|
|
|
|
Returns:
|
|
Tag instance or None
|
|
"""
|
|
name = record.get("name")
|
|
if not name:
|
|
return None
|
|
|
|
tag, _ = Tag.objects.get_or_create(name=name)
|
|
|
|
# Auto-attach to snapshot if in overrides
|
|
if overrides and "snapshot" in overrides and tag:
|
|
overrides["snapshot"].tags.add(tag)
|
|
|
|
return tag
|
|
|
|
|
|
class SnapshotTag(models.Model):
|
|
id = models.AutoField(primary_key=True)
|
|
snapshot = models.ForeignKey("Snapshot", db_column="snapshot_id", on_delete=models.CASCADE, to_field="id")
|
|
tag = models.ForeignKey(Tag, db_column="tag_id", on_delete=models.CASCADE, to_field="id")
|
|
|
|
class Meta:
|
|
app_label = "core"
|
|
db_table = "core_snapshot_tags"
|
|
unique_together = [("snapshot", "tag")]
|
|
|
|
|
|
class SnapshotQuerySet(models.QuerySet):
|
|
"""Custom QuerySet for Snapshot model with export methods that persist through .filter() etc."""
|
|
|
|
# =========================================================================
|
|
# Filtering Methods
|
|
# =========================================================================
|
|
|
|
FILTER_TYPES = {
|
|
"exact": lambda pattern: models.Q(url=pattern),
|
|
"substring": lambda pattern: models.Q(url__icontains=pattern),
|
|
"regex": lambda pattern: models.Q(url__iregex=pattern),
|
|
"domain": lambda pattern: (
|
|
models.Q(url__istartswith=f"http://{pattern}")
|
|
| models.Q(url__istartswith=f"https://{pattern}")
|
|
| models.Q(url__istartswith=f"ftp://{pattern}")
|
|
),
|
|
"tag": lambda pattern: models.Q(tags__name=pattern),
|
|
"timestamp": lambda pattern: models.Q(timestamp=pattern),
|
|
}
|
|
|
|
def filter_by_patterns(self, patterns: list[str], filter_type: str = "exact") -> "SnapshotQuerySet":
|
|
"""Filter snapshots by URL patterns using specified filter type"""
|
|
from archivebox.misc.logging import stderr
|
|
|
|
q_filter = models.Q()
|
|
for pattern in patterns:
|
|
try:
|
|
q_filter = q_filter | self.FILTER_TYPES[filter_type](pattern)
|
|
except KeyError:
|
|
stderr()
|
|
stderr(f"[X] Got invalid pattern for --filter-type={filter_type}:", color="red")
|
|
stderr(f" {pattern}")
|
|
raise SystemExit(2)
|
|
return self.filter(q_filter)
|
|
|
|
def search(self, patterns: list[str]) -> "SnapshotQuerySet":
|
|
"""Search snapshots using the configured search backend"""
|
|
from archivebox.config.common import SEARCH_BACKEND_CONFIG
|
|
from archivebox.search import query_search_index
|
|
from archivebox.misc.logging import stderr
|
|
|
|
if not SEARCH_BACKEND_CONFIG.USE_SEARCHING_BACKEND:
|
|
stderr()
|
|
stderr("[X] The search backend is not enabled, set config.USE_SEARCHING_BACKEND = True", color="red")
|
|
raise SystemExit(2)
|
|
|
|
qsearch = self.none()
|
|
for pattern in patterns:
|
|
try:
|
|
qsearch |= query_search_index(pattern)
|
|
except BaseException:
|
|
raise SystemExit(2)
|
|
return self.all() & qsearch
|
|
|
|
# =========================================================================
|
|
# Export Methods
|
|
# =========================================================================
|
|
|
|
def to_json(self, with_headers: bool = False) -> str:
|
|
"""Generate JSON index from snapshots"""
|
|
import sys
|
|
from datetime import datetime, timezone as tz
|
|
from archivebox.config import VERSION
|
|
from archivebox.config.common import SERVER_CONFIG
|
|
|
|
MAIN_INDEX_HEADER = (
|
|
{
|
|
"info": "This is an index of site data archived by ArchiveBox: The self-hosted web archive.",
|
|
"schema": "archivebox.index.json",
|
|
"copyright_info": SERVER_CONFIG.FOOTER_INFO,
|
|
"meta": {
|
|
"project": "ArchiveBox",
|
|
"version": VERSION,
|
|
"git_sha": VERSION,
|
|
"website": "https://ArchiveBox.io",
|
|
"docs": "https://github.com/ArchiveBox/ArchiveBox/wiki",
|
|
"source": "https://github.com/ArchiveBox/ArchiveBox",
|
|
"issues": "https://github.com/ArchiveBox/ArchiveBox/issues",
|
|
"dependencies": {},
|
|
},
|
|
}
|
|
if with_headers
|
|
else {}
|
|
)
|
|
|
|
snapshot_dicts = [s.to_dict(extended=True) for s in self.iterator(chunk_size=500)]
|
|
|
|
if with_headers:
|
|
output = {
|
|
**MAIN_INDEX_HEADER,
|
|
"num_links": len(snapshot_dicts),
|
|
"updated": datetime.now(tz.utc),
|
|
"last_run_cmd": sys.argv,
|
|
"links": snapshot_dicts,
|
|
}
|
|
else:
|
|
output = snapshot_dicts
|
|
return to_json(output, indent=4, sort_keys=True)
|
|
|
|
def to_csv(self, cols: list[str] | None = None, header: bool = True, separator: str = ",", ljust: int = 0) -> str:
|
|
"""Generate CSV output from snapshots"""
|
|
cols = cols or ["timestamp", "is_archived", "url"]
|
|
header_str = separator.join(col.ljust(ljust) for col in cols) if header else ""
|
|
row_strs = (s.to_csv(cols=cols, ljust=ljust, separator=separator) for s in self.iterator(chunk_size=500))
|
|
return "\n".join((header_str, *row_strs))
|
|
|
|
def to_html(self, with_headers: bool = True) -> str:
|
|
"""Generate main index HTML from snapshots"""
|
|
from datetime import datetime, timezone as tz
|
|
from django.template.loader import render_to_string
|
|
from archivebox.config import VERSION
|
|
from archivebox.config.common import SERVER_CONFIG
|
|
from archivebox.config.version import get_COMMIT_HASH
|
|
|
|
template = "static_index.html" if with_headers else "minimal_index.html"
|
|
snapshot_list = list(self.iterator(chunk_size=500))
|
|
|
|
return render_to_string(
|
|
template,
|
|
{
|
|
"version": VERSION,
|
|
"git_sha": get_COMMIT_HASH() or VERSION,
|
|
"num_links": str(len(snapshot_list)),
|
|
"date_updated": datetime.now(tz.utc).strftime("%Y-%m-%d"),
|
|
"time_updated": datetime.now(tz.utc).strftime("%Y-%m-%d %H:%M"),
|
|
"links": snapshot_list,
|
|
"FOOTER_INFO": SERVER_CONFIG.FOOTER_INFO,
|
|
},
|
|
)
|
|
|
|
|
|
class SnapshotManager(models.Manager.from_queryset(SnapshotQuerySet)): # ty: ignore[unsupported-base]
|
|
"""Manager for Snapshot model - uses SnapshotQuerySet for chainable methods"""
|
|
|
|
def filter(self, *args, **kwargs):
|
|
domain = kwargs.pop("domain", None)
|
|
qs = super().filter(*args, **kwargs)
|
|
if domain:
|
|
qs = qs.filter(url__icontains=f"://{domain}")
|
|
return qs
|
|
|
|
def get_queryset(self):
|
|
# Don't prefetch by default - it causes "too many open files" during bulk operations
|
|
# Views/templates can add .prefetch_related('tags', 'archiveresult_set') where needed
|
|
return super().get_queryset()
|
|
|
|
# =========================================================================
|
|
# Import Methods
|
|
# =========================================================================
|
|
|
|
def remove(self, atomic: bool = False) -> tuple:
|
|
"""Remove snapshots from the database"""
|
|
from django.db import transaction
|
|
|
|
if atomic:
|
|
with transaction.atomic():
|
|
return self.get_queryset().delete()
|
|
return self.get_queryset().delete()
|
|
|
|
|
|
class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine):
|
|
id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
|
|
created_at = models.DateTimeField(default=timezone.now, db_index=True)
|
|
modified_at = models.DateTimeField(auto_now=True)
|
|
|
|
url = models.URLField(unique=False, db_index=True) # URLs can appear in multiple crawls
|
|
timestamp = models.CharField(max_length=32, unique=True, db_index=True, editable=False)
|
|
bookmarked_at = models.DateTimeField(default=timezone.now, db_index=True)
|
|
crawl: Crawl = models.ForeignKey(Crawl, on_delete=models.CASCADE, null=False, related_name="snapshot_set", db_index=True) # type: ignore[assignment]
|
|
parent_snapshot = models.ForeignKey(
|
|
"self",
|
|
on_delete=models.SET_NULL,
|
|
null=True,
|
|
blank=True,
|
|
related_name="child_snapshots",
|
|
db_index=True,
|
|
help_text="Parent snapshot that discovered this URL (for recursive crawling)",
|
|
)
|
|
|
|
title = models.CharField(max_length=512, null=True, blank=True, db_index=True)
|
|
downloaded_at = models.DateTimeField(default=None, null=True, editable=False, db_index=True, blank=True)
|
|
depth = models.PositiveSmallIntegerField(default=0, db_index=True) # 0 for root snapshot, 1+ for discovered URLs
|
|
fs_version = models.CharField(
|
|
max_length=10,
|
|
default="0.9.0",
|
|
help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().',
|
|
)
|
|
current_step = models.PositiveSmallIntegerField(
|
|
default=0,
|
|
db_index=True,
|
|
help_text="Current hook step being executed (0-9). Used for sequential hook execution.",
|
|
)
|
|
|
|
retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
|
|
status = ModelWithStateMachine.StatusField(
|
|
choices=ModelWithStateMachine.StatusChoices,
|
|
default=ModelWithStateMachine.StatusChoices.QUEUED,
|
|
)
|
|
config = models.JSONField(default=dict, null=False, blank=False, editable=True)
|
|
notes = models.TextField(blank=True, null=False, default="")
|
|
# output_dir is computed via @cached_property from fs_version and get_storage_path_for_version()
|
|
|
|
tags = models.ManyToManyField(Tag, blank=True, through=SnapshotTag, related_name="snapshot_set", through_fields=("snapshot", "tag"))
|
|
|
|
state_machine_name = "archivebox.core.models.SnapshotMachine"
|
|
state_field_name = "status"
|
|
retry_at_field_name = "retry_at"
|
|
StatusChoices = ModelWithStateMachine.StatusChoices
|
|
active_state = StatusChoices.STARTED
|
|
|
|
crawl_id: uuid.UUID
|
|
parent_snapshot_id: uuid.UUID | None
|
|
_prefetched_objects_cache: dict[str, Any]
|
|
|
|
objects = SnapshotManager()
|
|
archiveresult_set: models.Manager["ArchiveResult"]
|
|
|
|
class Meta(
|
|
ModelWithOutputDir.Meta,
|
|
ModelWithConfig.Meta,
|
|
ModelWithNotes.Meta,
|
|
ModelWithHealthStats.Meta,
|
|
ModelWithStateMachine.Meta,
|
|
):
|
|
app_label = "core"
|
|
verbose_name = "Snapshot"
|
|
verbose_name_plural = "Snapshots"
|
|
constraints = [
|
|
# Allow same URL in different crawls, but not duplicates within same crawl
|
|
models.UniqueConstraint(fields=["url", "crawl"], name="unique_url_per_crawl"),
|
|
# Global timestamp uniqueness for 1:1 symlink mapping
|
|
models.UniqueConstraint(fields=["timestamp"], name="unique_timestamp"),
|
|
]
|
|
|
|
def __str__(self):
|
|
return f"[{self.id}] {self.url[:64]}"
|
|
|
|
@property
|
|
def created_by(self):
|
|
"""Convenience property to access the user who created this snapshot via its crawl."""
|
|
return self.crawl.created_by
|
|
|
|
@property
|
|
def process_set(self):
|
|
"""Get all Process objects related to this snapshot's ArchiveResults."""
|
|
from archivebox.machine.models import Process
|
|
|
|
return Process.objects.filter(archiveresult__snapshot_id=self.id)
|
|
|
|
@property
|
|
def binary_set(self):
|
|
"""Get all Binary objects used by processes related to this snapshot."""
|
|
from archivebox.machine.models import Binary
|
|
|
|
return Binary.objects.filter(process_set__archiveresult__snapshot_id=self.id).distinct()
|
|
|
|
def save(self, *args, **kwargs):
|
|
if not self.bookmarked_at:
|
|
self.bookmarked_at = self.created_at or timezone.now()
|
|
if not self.timestamp:
|
|
self.timestamp = str(self.bookmarked_at.timestamp())
|
|
|
|
# Migrate filesystem if needed (happens automatically on save)
|
|
if self.pk and self.fs_migration_needed:
|
|
print(
|
|
f"[DEBUG save()] Triggering filesystem migration for {str(self.id)[:8]}: {self.fs_version} → {self._fs_current_version()}",
|
|
)
|
|
# Walk through migration chain automatically
|
|
current = self.fs_version
|
|
target = self._fs_current_version()
|
|
|
|
while current != target:
|
|
next_ver = self._fs_next_version(current)
|
|
method = f"_fs_migrate_from_{current.replace('.', '_')}_to_{next_ver.replace('.', '_')}"
|
|
|
|
# Only run if method exists (most are no-ops)
|
|
if hasattr(self, method):
|
|
print(f"[DEBUG save()] Running {method}()")
|
|
getattr(self, method)()
|
|
|
|
current = next_ver
|
|
|
|
# Update version
|
|
self.fs_version = target
|
|
|
|
super().save(*args, **kwargs)
|
|
self.ensure_legacy_archive_symlink()
|
|
existing_urls = {url for _raw_line, url in self.crawl._iter_url_lines() if url}
|
|
if self.crawl.url_passes_filters(self.url, snapshot=self) and self.url not in existing_urls:
|
|
self.crawl.urls += f"\n{self.url}"
|
|
self.crawl.save()
|
|
|
|
# if is_new:
|
|
# from archivebox.misc.logging_util import log_worker_event
|
|
# log_worker_event(
|
|
# worker_type='DB',
|
|
# event='Created Snapshot',
|
|
# indent_level=2,
|
|
# url=self.url,
|
|
# metadata={
|
|
# 'id': str(self.id),
|
|
# 'crawl_id': str(self.crawl_id),
|
|
# 'depth': self.depth,
|
|
# 'status': self.status,
|
|
# },
|
|
# )
|
|
|
|
# =========================================================================
|
|
# Filesystem Migration Methods
|
|
# =========================================================================
|
|
|
|
@staticmethod
|
|
def _fs_current_version() -> str:
|
|
"""Get current ArchiveBox filesystem version (normalized to x.x.0 format)"""
|
|
from archivebox.config import VERSION
|
|
|
|
# Normalize version to x.x.0 format (e.g., "0.9.0rc1" -> "0.9.0")
|
|
parts = VERSION.split(".")
|
|
if len(parts) >= 2:
|
|
major, minor = parts[0], parts[1]
|
|
# Strip any non-numeric suffix from minor version
|
|
minor = "".join(c for c in minor if c.isdigit())
|
|
return f"{major}.{minor}.0"
|
|
return "0.9.0" # Fallback if version parsing fails
|
|
|
|
@property
|
|
def fs_migration_needed(self) -> bool:
|
|
"""Check if snapshot needs filesystem migration"""
|
|
return self.fs_version != self._fs_current_version()
|
|
|
|
def _fs_next_version(self, version: str) -> str:
|
|
"""Get next version in migration chain (0.7/0.8 had same layout, only 0.8→0.9 migration needed)"""
|
|
# Treat 0.7.0 and 0.8.0 as equivalent (both used archive/{timestamp})
|
|
if version in ("0.7.0", "0.8.0"):
|
|
return "0.9.0"
|
|
return self._fs_current_version()
|
|
|
|
def _fs_migrate_from_0_8_0_to_0_9_0(self):
|
|
"""
|
|
Migrate from flat to nested structure.
|
|
|
|
0.8.x: archive/{timestamp}/
|
|
0.9.x: users/{user}/snapshots/YYYYMMDD/{domain}/{uuid}/
|
|
|
|
Transaction handling:
|
|
1. Copy files INSIDE transaction
|
|
2. Convert index.json to index.jsonl INSIDE transaction
|
|
3. Create symlink INSIDE transaction
|
|
4. Update fs_version INSIDE transaction (done by save())
|
|
5. Exit transaction (DB commit)
|
|
6. Delete old files OUTSIDE transaction (after commit)
|
|
"""
|
|
import shutil
|
|
from django.db import transaction
|
|
|
|
old_dir = self.get_storage_path_for_version("0.8.0")
|
|
new_dir = self.get_storage_path_for_version("0.9.0")
|
|
|
|
print(
|
|
f"[DEBUG _fs_migrate] {self.timestamp}: old_exists={old_dir.exists()}, same={old_dir == new_dir}, new_exists={new_dir.exists()}",
|
|
)
|
|
|
|
if not old_dir.exists() or old_dir == new_dir:
|
|
# No migration needed
|
|
print("[DEBUG _fs_migrate] Returning None (early return)")
|
|
return None
|
|
|
|
if new_dir.exists():
|
|
# New directory already exists (files already copied), but we still need cleanup
|
|
# Return cleanup info so old directory can be cleaned up
|
|
print("[DEBUG _fs_migrate] Returning cleanup info (new_dir exists)")
|
|
return (old_dir, new_dir)
|
|
|
|
new_dir.mkdir(parents=True, exist_ok=True)
|
|
|
|
# Copy all files (idempotent), skipping index.json (will be converted to jsonl)
|
|
for old_file in old_dir.rglob("*"):
|
|
if not old_file.is_file():
|
|
continue
|
|
|
|
rel_path = old_file.relative_to(old_dir)
|
|
new_file = new_dir / rel_path
|
|
|
|
# Skip if already copied
|
|
if new_file.exists() and new_file.stat().st_size == old_file.stat().st_size:
|
|
continue
|
|
|
|
new_file.parent.mkdir(parents=True, exist_ok=True)
|
|
shutil.copy2(old_file, new_file)
|
|
|
|
# Verify all copied
|
|
old_files = {f.relative_to(old_dir): f.stat().st_size for f in old_dir.rglob("*") if f.is_file()}
|
|
new_files = {f.relative_to(new_dir): f.stat().st_size for f in new_dir.rglob("*") if f.is_file()}
|
|
|
|
if old_files.keys() != new_files.keys():
|
|
missing = old_files.keys() - new_files.keys()
|
|
raise Exception(f"Migration incomplete: missing {missing}")
|
|
|
|
# Convert index.json to index.jsonl in the new directory
|
|
self.convert_index_json_to_jsonl()
|
|
|
|
# Schedule cleanup AFTER transaction commits successfully
|
|
# This ensures DB changes are committed before we delete old files
|
|
transaction.on_commit(lambda: self._cleanup_old_migration_dir(old_dir, new_dir))
|
|
|
|
# Return cleanup info for manual cleanup if needed (when called directly)
|
|
return (old_dir, new_dir)
|
|
|
|
def _cleanup_old_migration_dir(self, old_dir: Path, new_dir: Path):
|
|
"""
|
|
Delete old directory and create symlink after successful migration.
|
|
"""
|
|
import shutil
|
|
import logging
|
|
|
|
# Delete old directory
|
|
if old_dir.exists() and not old_dir.is_symlink():
|
|
try:
|
|
shutil.rmtree(old_dir)
|
|
except Exception as e:
|
|
logging.getLogger("archivebox.migration").warning(
|
|
f"Could not remove old migration directory {old_dir}: {e}",
|
|
)
|
|
return # Don't create symlink if cleanup failed
|
|
|
|
# Create backwards-compat symlink (after old dir is deleted)
|
|
symlink_path = old_dir # Same path as old_dir
|
|
if symlink_path.is_symlink():
|
|
symlink_path.unlink()
|
|
|
|
if not symlink_path.exists():
|
|
try:
|
|
symlink_path.symlink_to(new_dir, target_is_directory=True)
|
|
except Exception as e:
|
|
logging.getLogger("archivebox.migration").warning(
|
|
f"Could not create symlink from {symlink_path} to {new_dir}: {e}",
|
|
)
|
|
|
|
# =========================================================================
|
|
# Path Calculation and Migration Helpers
|
|
# =========================================================================
|
|
|
|
@staticmethod
|
|
def extract_domain_from_url(url: str) -> str:
|
|
"""
|
|
Extract domain from URL for 0.9.x path structure.
|
|
Uses full hostname with sanitized special chars.
|
|
|
|
Examples:
|
|
https://example.com:8080 → example.com_8080
|
|
https://sub.example.com → sub.example.com
|
|
file:///path → localhost
|
|
data:text/html → data
|
|
"""
|
|
from urllib.parse import urlparse
|
|
|
|
try:
|
|
parsed = urlparse(url)
|
|
|
|
if parsed.scheme in ("http", "https"):
|
|
if parsed.port:
|
|
return f"{parsed.hostname}_{parsed.port}".replace(":", "_")
|
|
return parsed.hostname or "unknown"
|
|
elif parsed.scheme == "file":
|
|
return "localhost"
|
|
elif parsed.scheme:
|
|
return parsed.scheme
|
|
else:
|
|
return "unknown"
|
|
except Exception:
|
|
return "unknown"
|
|
|
|
def get_storage_path_for_version(self, version: str) -> Path:
|
|
"""
|
|
Calculate storage path for specific filesystem version.
|
|
Centralizes path logic so it's reusable.
|
|
|
|
0.7.x/0.8.x: archive/{timestamp}
|
|
0.9.x: users/{username}/snapshots/YYYYMMDD/{domain}/{uuid}/
|
|
"""
|
|
from datetime import datetime
|
|
|
|
if version in ("0.7.0", "0.8.0"):
|
|
return CONSTANTS.ARCHIVE_DIR / self.timestamp
|
|
|
|
elif version in ("0.9.0", "1.0.0"):
|
|
username = self.created_by.username
|
|
|
|
# Use created_at for date grouping (fallback to timestamp)
|
|
if self.created_at:
|
|
date_str = self.created_at.strftime("%Y%m%d")
|
|
else:
|
|
date_str = datetime.fromtimestamp(float(self.timestamp)).strftime("%Y%m%d")
|
|
|
|
domain = self.extract_domain_from_url(self.url)
|
|
|
|
return CONSTANTS.DATA_DIR / "users" / username / "snapshots" / date_str / domain / str(self.id)
|
|
else:
|
|
# Unknown version - use current
|
|
return self.get_storage_path_for_version(self._fs_current_version())
|
|
|
|
# =========================================================================
|
|
# Loading and Creation from Filesystem (Used by archivebox update ONLY)
|
|
# =========================================================================
|
|
|
|
@classmethod
|
|
def load_from_directory(cls, snapshot_dir: Path) -> Optional["Snapshot"]:
|
|
"""
|
|
Load existing Snapshot from DB by reading index.jsonl or index.json.
|
|
|
|
Reads index file, extracts url+timestamp, queries DB.
|
|
Returns existing Snapshot or None if not found/invalid.
|
|
Does NOT create new snapshots.
|
|
|
|
ONLY used by: archivebox update (for orphan detection)
|
|
"""
|
|
from archivebox.machine.models import Process
|
|
|
|
# Try index.jsonl first (new format), then index.json (legacy)
|
|
jsonl_path = snapshot_dir / CONSTANTS.JSONL_INDEX_FILENAME
|
|
json_path = snapshot_dir / CONSTANTS.JSON_INDEX_FILENAME
|
|
|
|
data = None
|
|
if jsonl_path.exists():
|
|
try:
|
|
records = Process.parse_records_from_text(jsonl_path.read_text())
|
|
for record in records:
|
|
if record.get("type") == "Snapshot":
|
|
data = record
|
|
break
|
|
except OSError:
|
|
pass
|
|
elif json_path.exists():
|
|
try:
|
|
with open(json_path) as f:
|
|
data = json.load(f)
|
|
except (json.JSONDecodeError, OSError):
|
|
pass
|
|
|
|
if not data:
|
|
return None
|
|
|
|
url = data.get("url")
|
|
if not url:
|
|
return None
|
|
|
|
# Get timestamp - prefer index file, fallback to folder name
|
|
timestamp = cls._select_best_timestamp(
|
|
index_timestamp=data.get("timestamp"),
|
|
folder_name=snapshot_dir.name,
|
|
)
|
|
|
|
if not timestamp:
|
|
return None
|
|
|
|
# Look up existing (try exact match first, then fuzzy match for truncated timestamps)
|
|
try:
|
|
snapshot = cls.objects.get(url=url, timestamp=timestamp)
|
|
print(f"[DEBUG load_from_directory] Found existing snapshot for {url} @ {timestamp}: {str(snapshot.id)[:8]}")
|
|
return snapshot
|
|
except cls.DoesNotExist:
|
|
print(f"[DEBUG load_from_directory] NOT FOUND (exact): {url} @ {timestamp}")
|
|
# Try fuzzy match - index.json may have truncated timestamp
|
|
# e.g., index has "1767000340" but DB has "1767000340.624737"
|
|
candidates = cls.objects.filter(url=url, timestamp__startswith=timestamp)
|
|
if candidates.count() == 1:
|
|
snapshot = candidates.first()
|
|
if snapshot is None:
|
|
return None
|
|
print(f"[DEBUG load_from_directory] Found via fuzzy match: {snapshot.timestamp}")
|
|
return snapshot
|
|
elif candidates.count() > 1:
|
|
print("[DEBUG load_from_directory] Multiple fuzzy matches, using first")
|
|
return candidates.first()
|
|
print(f"[DEBUG load_from_directory] NOT FOUND (fuzzy): {url} @ {timestamp}")
|
|
return None
|
|
except cls.MultipleObjectsReturned:
|
|
# Should not happen with unique constraint
|
|
print(f"[DEBUG load_from_directory] Multiple snapshots found for {url} @ {timestamp}")
|
|
return cls.objects.filter(url=url, timestamp=timestamp).first()
|
|
|
|
@classmethod
|
|
def create_from_directory(cls, snapshot_dir: Path) -> Optional["Snapshot"]:
|
|
"""
|
|
Create new Snapshot from orphaned directory.
|
|
|
|
Validates timestamp, ensures uniqueness.
|
|
Returns new UNSAVED Snapshot or None if invalid.
|
|
|
|
ONLY used by: archivebox update (for orphan import)
|
|
"""
|
|
from archivebox.machine.models import Process
|
|
|
|
# Try index.jsonl first (new format), then index.json (legacy)
|
|
jsonl_path = snapshot_dir / CONSTANTS.JSONL_INDEX_FILENAME
|
|
json_path = snapshot_dir / CONSTANTS.JSON_INDEX_FILENAME
|
|
|
|
data = None
|
|
if jsonl_path.exists():
|
|
try:
|
|
records = Process.parse_records_from_text(jsonl_path.read_text())
|
|
for record in records:
|
|
if record.get("type") == "Snapshot":
|
|
data = record
|
|
break
|
|
except OSError:
|
|
pass
|
|
elif json_path.exists():
|
|
try:
|
|
with open(json_path) as f:
|
|
data = json.load(f)
|
|
except (json.JSONDecodeError, OSError):
|
|
pass
|
|
|
|
if not data:
|
|
return None
|
|
|
|
url = data.get("url")
|
|
if not url:
|
|
return None
|
|
|
|
# Get and validate timestamp
|
|
timestamp = cls._select_best_timestamp(
|
|
index_timestamp=data.get("timestamp"),
|
|
folder_name=snapshot_dir.name,
|
|
)
|
|
|
|
if not timestamp:
|
|
return None
|
|
|
|
# Ensure uniqueness (reuses existing logic from create_or_update_from_dict)
|
|
timestamp = cls._ensure_unique_timestamp(url, timestamp)
|
|
|
|
# Detect version
|
|
fs_version = cls._detect_fs_version_from_index(data)
|
|
|
|
# Get or create catchall crawl for orphaned snapshots
|
|
from archivebox.crawls.models import Crawl
|
|
|
|
system_user_id = get_or_create_system_user_pk()
|
|
catchall_crawl, _ = Crawl.objects.get_or_create(
|
|
label="[migration] orphaned snapshots",
|
|
defaults={
|
|
"urls": f"# Orphaned snapshot: {url}",
|
|
"max_depth": 0,
|
|
"created_by_id": system_user_id,
|
|
},
|
|
)
|
|
|
|
return cls(
|
|
url=url,
|
|
timestamp=timestamp,
|
|
title=data.get("title", ""),
|
|
fs_version=fs_version,
|
|
crawl=catchall_crawl,
|
|
)
|
|
|
|
@staticmethod
|
|
def _select_best_timestamp(index_timestamp: object | None, folder_name: str) -> str | None:
|
|
"""
|
|
Select best timestamp from index.json vs folder name.
|
|
|
|
Validates range (1995-2035).
|
|
Prefers index.json if valid.
|
|
"""
|
|
|
|
def is_valid_timestamp(ts: object | None) -> bool:
|
|
if not isinstance(ts, (str, int, float)):
|
|
return False
|
|
try:
|
|
ts_int = int(float(ts))
|
|
# 1995-01-01 to 2035-12-31
|
|
return 788918400 <= ts_int <= 2082758400
|
|
except (TypeError, ValueError, OverflowError):
|
|
return False
|
|
|
|
index_valid = is_valid_timestamp(index_timestamp) if index_timestamp else False
|
|
folder_valid = is_valid_timestamp(folder_name)
|
|
|
|
if index_valid and index_timestamp is not None:
|
|
return str(int(float(str(index_timestamp))))
|
|
if folder_valid:
|
|
return str(int(float(str(folder_name))))
|
|
return None
|
|
|
|
@classmethod
|
|
def _ensure_unique_timestamp(cls, url: str, timestamp: str) -> str:
|
|
"""
|
|
Ensure timestamp is globally unique.
|
|
If collision with different URL, increment by 1 until unique.
|
|
|
|
NOTE: Logic already exists in create_or_update_from_dict (line 266-267)
|
|
This is just an extracted, reusable version.
|
|
"""
|
|
while cls.objects.filter(timestamp=timestamp).exclude(url=url).exists():
|
|
timestamp = str(int(float(timestamp)) + 1)
|
|
return timestamp
|
|
|
|
@staticmethod
|
|
def _detect_fs_version_from_index(data: dict) -> str:
|
|
"""
|
|
Detect fs_version from index.json structure.
|
|
|
|
- Has fs_version field: use it
|
|
- Has history dict: 0.7.0
|
|
- Has archive_results list: 0.8.0
|
|
- Default: 0.7.0
|
|
"""
|
|
if "fs_version" in data:
|
|
return data["fs_version"]
|
|
if "history" in data and "archive_results" not in data:
|
|
return "0.7.0"
|
|
if "archive_results" in data:
|
|
return "0.8.0"
|
|
return "0.7.0"
|
|
|
|
# =========================================================================
|
|
# Index.json Reconciliation
|
|
# =========================================================================
|
|
|
|
def reconcile_with_index(self):
|
|
"""
|
|
Merge index.json/index.jsonl with DB. DB is source of truth.
|
|
|
|
- Title: longest non-URL
|
|
- Tags: union
|
|
- ArchiveResults: keep both (by plugin+start_ts)
|
|
|
|
Converts index.json to index.jsonl if needed, then writes back in JSONL format.
|
|
|
|
Used by: archivebox update (to sync index with DB)
|
|
"""
|
|
import json
|
|
|
|
# Try to convert index.json to index.jsonl first
|
|
self.convert_index_json_to_jsonl()
|
|
|
|
# Check for index.jsonl (preferred) or index.json (legacy)
|
|
jsonl_path = Path(self.output_dir) / CONSTANTS.JSONL_INDEX_FILENAME
|
|
json_path = Path(self.output_dir) / CONSTANTS.JSON_INDEX_FILENAME
|
|
|
|
index_data = {}
|
|
|
|
if jsonl_path.exists():
|
|
# Read from JSONL format
|
|
jsonl_data = self.read_index_jsonl()
|
|
if jsonl_data["snapshot"]:
|
|
index_data = jsonl_data["snapshot"]
|
|
# Convert archive_results list to expected format
|
|
index_data["archive_results"] = jsonl_data["archive_results"]
|
|
elif json_path.exists():
|
|
# Fallback to legacy JSON format
|
|
try:
|
|
with open(json_path) as f:
|
|
index_data = json.load(f)
|
|
except (OSError, TypeError, ValueError, json.JSONDecodeError):
|
|
pass
|
|
|
|
# Merge title
|
|
self._merge_title_from_index(index_data)
|
|
|
|
# Merge tags
|
|
self._merge_tags_from_index(index_data)
|
|
|
|
# Merge ArchiveResults
|
|
self._merge_archive_results_from_index(index_data)
|
|
|
|
# Write back in JSONL format
|
|
self.write_index_jsonl()
|
|
|
|
def reconcile_with_index_json(self):
|
|
"""Deprecated: use reconcile_with_index() instead."""
|
|
return self.reconcile_with_index()
|
|
|
|
def _merge_title_from_index(self, index_data: dict):
|
|
"""Merge title - prefer longest non-URL title."""
|
|
index_title = (index_data.get("title") or "").strip()
|
|
db_title = self.title or ""
|
|
|
|
candidates = [t for t in [index_title, db_title] if t and t != self.url]
|
|
if candidates:
|
|
best_title = max(candidates, key=len)
|
|
if self.title != best_title:
|
|
self.title = best_title
|
|
|
|
def _merge_tags_from_index(self, index_data: dict):
|
|
"""Merge tags - union of both sources."""
|
|
from django.db import transaction
|
|
|
|
index_tags = set(index_data.get("tags", "").split(",")) if index_data.get("tags") else set()
|
|
index_tags = {t.strip() for t in index_tags if t.strip()}
|
|
|
|
db_tags = set(self.tags.values_list("name", flat=True))
|
|
|
|
new_tags = index_tags - db_tags
|
|
if new_tags:
|
|
with transaction.atomic():
|
|
for tag_name in new_tags:
|
|
tag, _ = Tag.objects.get_or_create(name=tag_name)
|
|
self.tags.add(tag)
|
|
|
|
def _merge_archive_results_from_index(self, index_data: dict):
|
|
"""Merge ArchiveResults - keep both (by plugin+start_ts)."""
|
|
existing = {(ar.plugin, ar.start_ts): ar for ar in ArchiveResult.objects.filter(snapshot=self)}
|
|
|
|
# Handle 0.8.x format (archive_results list)
|
|
for result_data in index_data.get("archive_results", []):
|
|
self._create_archive_result_if_missing(result_data, existing)
|
|
|
|
# Handle 0.7.x format (history dict)
|
|
if "history" in index_data and isinstance(index_data["history"], dict):
|
|
for plugin, result_list in index_data["history"].items():
|
|
if isinstance(result_list, list):
|
|
for result_data in result_list:
|
|
# Support both old 'extractor' and new 'plugin' keys for backwards compat
|
|
result_data["plugin"] = result_data.get("plugin") or result_data.get("extractor") or plugin
|
|
self._create_archive_result_if_missing(result_data, existing)
|
|
|
|
def _create_archive_result_if_missing(self, result_data: dict, existing: dict):
|
|
"""Create ArchiveResult if not already in DB."""
|
|
from dateutil import parser
|
|
|
|
# Support both old 'extractor' and new 'plugin' keys for backwards compat
|
|
plugin = result_data.get("plugin") or result_data.get("extractor", "")
|
|
if not plugin:
|
|
return
|
|
|
|
start_ts = None
|
|
if result_data.get("start_ts"):
|
|
try:
|
|
start_ts = parser.parse(result_data["start_ts"])
|
|
except (TypeError, ValueError, OverflowError):
|
|
pass
|
|
|
|
if (plugin, start_ts) in existing:
|
|
return
|
|
|
|
try:
|
|
end_ts = None
|
|
if result_data.get("end_ts"):
|
|
try:
|
|
end_ts = parser.parse(result_data["end_ts"])
|
|
except (TypeError, ValueError, OverflowError):
|
|
pass
|
|
|
|
# Support both 'output' (legacy) and 'output_str' (new JSONL) field names
|
|
output_str = result_data.get("output_str") or result_data.get("output", "")
|
|
|
|
ArchiveResult.objects.create(
|
|
snapshot=self,
|
|
plugin=plugin,
|
|
hook_name=result_data.get("hook_name", ""),
|
|
status=result_data.get("status", "failed"),
|
|
output_str=output_str,
|
|
cmd=result_data.get("cmd", []),
|
|
pwd=result_data.get("pwd", str(self.output_dir)),
|
|
start_ts=start_ts,
|
|
end_ts=end_ts,
|
|
)
|
|
except Exception:
|
|
pass
|
|
|
|
def write_index_json(self):
|
|
"""Write index.json in 0.9.x format (deprecated, use write_index_jsonl)."""
|
|
import json
|
|
|
|
index_path = Path(self.output_dir) / "index.json"
|
|
|
|
data = {
|
|
"url": self.url,
|
|
"timestamp": self.timestamp,
|
|
"title": self.title or "",
|
|
"tags": ",".join(sorted(self.tags.values_list("name", flat=True))),
|
|
"fs_version": self.fs_version,
|
|
"bookmarked_at": self.bookmarked_at.isoformat() if self.bookmarked_at else None,
|
|
"created_at": self.created_at.isoformat() if self.created_at else None,
|
|
"archive_results": [
|
|
{
|
|
"plugin": ar.plugin,
|
|
"status": ar.status,
|
|
"start_ts": ar.start_ts.isoformat() if ar.start_ts else None,
|
|
"end_ts": ar.end_ts.isoformat() if ar.end_ts else None,
|
|
"output": ar.output_str or "",
|
|
"cmd": ar.cmd if isinstance(ar.cmd, list) else [],
|
|
"pwd": ar.pwd,
|
|
}
|
|
for ar in ArchiveResult.objects.filter(snapshot=self).order_by("start_ts")
|
|
],
|
|
}
|
|
|
|
index_path.parent.mkdir(parents=True, exist_ok=True)
|
|
with open(index_path, "w") as f:
|
|
json.dump(data, f, indent=2, sort_keys=True)
|
|
|
|
def write_index_jsonl(self):
|
|
"""
|
|
Write index.jsonl in flat JSONL format.
|
|
|
|
Each line is a JSON record with a 'type' field:
|
|
- Snapshot: snapshot metadata (crawl_id, url, tags, etc.)
|
|
- ArchiveResult: extractor results (plugin, status, output, etc.)
|
|
- Binary: binary info used for the extraction
|
|
- Process: process execution details (cmd, exit_code, timing, etc.)
|
|
"""
|
|
import json
|
|
|
|
index_path = Path(self.output_dir) / CONSTANTS.JSONL_INDEX_FILENAME
|
|
index_path.parent.mkdir(parents=True, exist_ok=True)
|
|
|
|
# Track unique binaries and processes to avoid duplicates
|
|
binaries_seen = set()
|
|
processes_seen = set()
|
|
|
|
with open(index_path, "w") as f:
|
|
# Write Snapshot record first (to_json includes crawl_id, fs_version)
|
|
f.write(json.dumps(self.to_json()) + "\n")
|
|
|
|
# Write ArchiveResult records with their associated Binary and Process
|
|
# Use select_related to optimize queries
|
|
for ar in self.archiveresult_set.select_related("process__binary").order_by("start_ts"):
|
|
# Write Binary record if not already written
|
|
if ar.process and ar.process.binary and ar.process.binary_id not in binaries_seen:
|
|
binaries_seen.add(ar.process.binary_id)
|
|
f.write(json.dumps(ar.process.binary.to_json()) + "\n")
|
|
|
|
# Write Process record if not already written
|
|
if ar.process and ar.process_id not in processes_seen:
|
|
processes_seen.add(ar.process_id)
|
|
f.write(json.dumps(ar.process.to_json()) + "\n")
|
|
|
|
# Write ArchiveResult record
|
|
f.write(json.dumps(ar.to_json()) + "\n")
|
|
|
|
def read_index_jsonl(self) -> dict:
|
|
"""
|
|
Read index.jsonl and return parsed records grouped by type.
|
|
|
|
Returns dict with keys: 'snapshot', 'archive_results', 'binaries', 'processes'
|
|
"""
|
|
from archivebox.machine.models import Process
|
|
from archivebox.misc.jsonl import (
|
|
TYPE_SNAPSHOT,
|
|
TYPE_ARCHIVERESULT,
|
|
TYPE_BINARY,
|
|
TYPE_PROCESS,
|
|
)
|
|
|
|
index_path = Path(self.output_dir) / CONSTANTS.JSONL_INDEX_FILENAME
|
|
result: dict[str, Any] = {
|
|
"snapshot": None,
|
|
"archive_results": [],
|
|
"binaries": [],
|
|
"processes": [],
|
|
}
|
|
|
|
if not index_path.exists():
|
|
return result
|
|
|
|
records = Process.parse_records_from_text(index_path.read_text())
|
|
for record in records:
|
|
record_type = record.get("type")
|
|
if record_type == TYPE_SNAPSHOT:
|
|
result["snapshot"] = record
|
|
elif record_type == TYPE_ARCHIVERESULT:
|
|
result["archive_results"].append(record)
|
|
elif record_type == TYPE_BINARY:
|
|
result["binaries"].append(record)
|
|
elif record_type == TYPE_PROCESS:
|
|
result["processes"].append(record)
|
|
|
|
return result
|
|
|
|
def convert_index_json_to_jsonl(self) -> bool:
|
|
"""
|
|
Convert index.json to index.jsonl format.
|
|
|
|
Reads existing index.json, creates index.jsonl, and removes index.json.
|
|
Returns True if conversion was performed, False if no conversion needed.
|
|
"""
|
|
import json
|
|
|
|
json_path = Path(self.output_dir) / CONSTANTS.JSON_INDEX_FILENAME
|
|
jsonl_path = Path(self.output_dir) / CONSTANTS.JSONL_INDEX_FILENAME
|
|
|
|
# Skip if already converted or no json file exists
|
|
if jsonl_path.exists() or not json_path.exists():
|
|
return False
|
|
|
|
try:
|
|
with open(json_path) as f:
|
|
data = json.load(f)
|
|
except (json.JSONDecodeError, OSError):
|
|
return False
|
|
|
|
# Detect format version and extract records
|
|
fs_version = data.get("fs_version", "0.7.0")
|
|
|
|
jsonl_path.parent.mkdir(parents=True, exist_ok=True)
|
|
with open(jsonl_path, "w") as f:
|
|
# Write Snapshot record
|
|
snapshot_record = {
|
|
"type": "Snapshot",
|
|
"id": str(self.id),
|
|
"crawl_id": str(self.crawl_id) if self.crawl_id else None,
|
|
"url": data.get("url", self.url),
|
|
"timestamp": data.get("timestamp", self.timestamp),
|
|
"title": data.get("title", self.title or ""),
|
|
"tags": data.get("tags", ""),
|
|
"fs_version": fs_version,
|
|
"bookmarked_at": data.get("bookmarked_at"),
|
|
"created_at": data.get("created_at"),
|
|
}
|
|
f.write(json.dumps(snapshot_record) + "\n")
|
|
|
|
# Handle 0.8.x/0.9.x format (archive_results list)
|
|
for result_data in data.get("archive_results", []):
|
|
ar_record = {
|
|
"type": "ArchiveResult",
|
|
"snapshot_id": str(self.id),
|
|
"plugin": result_data.get("plugin", ""),
|
|
"status": result_data.get("status", ""),
|
|
"output_str": result_data.get("output", ""),
|
|
"start_ts": result_data.get("start_ts"),
|
|
"end_ts": result_data.get("end_ts"),
|
|
}
|
|
if result_data.get("cmd"):
|
|
ar_record["cmd"] = result_data["cmd"]
|
|
f.write(json.dumps(ar_record) + "\n")
|
|
|
|
# Handle 0.7.x format (history dict)
|
|
if "history" in data and isinstance(data["history"], dict):
|
|
for plugin, result_list in data["history"].items():
|
|
if not isinstance(result_list, list):
|
|
continue
|
|
for result_data in result_list:
|
|
ar_record = {
|
|
"type": "ArchiveResult",
|
|
"snapshot_id": str(self.id),
|
|
"plugin": result_data.get("plugin") or result_data.get("extractor") or plugin,
|
|
"status": result_data.get("status", ""),
|
|
"output_str": result_data.get("output", ""),
|
|
"start_ts": result_data.get("start_ts"),
|
|
"end_ts": result_data.get("end_ts"),
|
|
}
|
|
if result_data.get("cmd"):
|
|
ar_record["cmd"] = result_data["cmd"]
|
|
f.write(json.dumps(ar_record) + "\n")
|
|
|
|
# Remove old index.json after successful conversion
|
|
try:
|
|
json_path.unlink()
|
|
except OSError:
|
|
pass
|
|
|
|
return True
|
|
|
|
# =========================================================================
|
|
# Snapshot Utilities
|
|
# =========================================================================
|
|
|
|
@staticmethod
|
|
def move_directory_to_invalid(snapshot_dir: Path):
|
|
"""
|
|
Move invalid directory to data/invalid/YYYYMMDD/.
|
|
|
|
Used by: archivebox update (when encountering invalid directories)
|
|
"""
|
|
from datetime import datetime
|
|
import shutil
|
|
|
|
invalid_dir = CONSTANTS.DATA_DIR / "invalid" / datetime.now().strftime("%Y%m%d")
|
|
invalid_dir.mkdir(parents=True, exist_ok=True)
|
|
|
|
dest = invalid_dir / snapshot_dir.name
|
|
counter = 1
|
|
while dest.exists():
|
|
dest = invalid_dir / f"{snapshot_dir.name}_{counter}"
|
|
counter += 1
|
|
|
|
try:
|
|
shutil.move(str(snapshot_dir), str(dest))
|
|
except Exception:
|
|
pass
|
|
|
|
@classmethod
|
|
def find_and_merge_duplicates(cls) -> int:
|
|
"""
|
|
Find and merge snapshots with same url:timestamp.
|
|
Returns count of duplicate sets merged.
|
|
|
|
Used by: archivebox update (Phase 3: deduplication)
|
|
"""
|
|
from django.db.models import Count
|
|
|
|
duplicates = cls.objects.values("url", "timestamp").annotate(count=Count("id")).filter(count__gt=1)
|
|
|
|
merged = 0
|
|
for dup in duplicates.iterator(chunk_size=500):
|
|
snapshots = list(
|
|
cls.objects.filter(url=dup["url"], timestamp=dup["timestamp"]).order_by("created_at"), # Keep oldest
|
|
)
|
|
|
|
if len(snapshots) > 1:
|
|
try:
|
|
cls._merge_snapshots(snapshots)
|
|
merged += 1
|
|
except Exception:
|
|
pass
|
|
|
|
return merged
|
|
|
|
@classmethod
|
|
def _merge_snapshots(cls, snapshots: Sequence["Snapshot"]):
|
|
"""
|
|
Merge exact duplicates.
|
|
Keep oldest, union files + ArchiveResults.
|
|
"""
|
|
import shutil
|
|
|
|
keeper = snapshots[0]
|
|
duplicates = snapshots[1:]
|
|
|
|
keeper_dir = Path(keeper.output_dir)
|
|
|
|
for dup in duplicates:
|
|
dup_dir = Path(dup.output_dir)
|
|
|
|
# Merge files
|
|
if dup_dir.exists() and dup_dir != keeper_dir:
|
|
for dup_file in dup_dir.rglob("*"):
|
|
if not dup_file.is_file():
|
|
continue
|
|
|
|
rel = dup_file.relative_to(dup_dir)
|
|
keeper_file = keeper_dir / rel
|
|
|
|
if not keeper_file.exists():
|
|
keeper_file.parent.mkdir(parents=True, exist_ok=True)
|
|
shutil.copy2(dup_file, keeper_file)
|
|
|
|
try:
|
|
shutil.rmtree(dup_dir)
|
|
except Exception:
|
|
pass
|
|
|
|
# Merge tags
|
|
for tag in dup.tags.all():
|
|
keeper.tags.add(tag)
|
|
|
|
# Move ArchiveResults
|
|
ArchiveResult.objects.filter(snapshot=dup).update(snapshot=keeper)
|
|
|
|
# Delete
|
|
dup.delete()
|
|
|
|
# =========================================================================
|
|
# Output Directory Properties
|
|
# =========================================================================
|
|
|
|
@property
|
|
def output_dir_parent(self) -> str:
|
|
return "archive"
|
|
|
|
@property
|
|
def output_dir_name(self) -> str:
|
|
return str(self.timestamp)
|
|
|
|
def archive(self, overwrite=False, methods=None):
|
|
return bg_archive_snapshot(self, overwrite=overwrite, methods=methods)
|
|
|
|
@admin.display(description="Tags")
|
|
def tags_str(self, nocache=True) -> str | None:
|
|
calc_tags_str = lambda: ",".join(sorted(tag.name for tag in self.tags.all()))
|
|
prefetched_cache = getattr(self, "_prefetched_objects_cache", {})
|
|
if "tags" in prefetched_cache:
|
|
return calc_tags_str()
|
|
cache_key = f"{self.pk}-tags"
|
|
return cache.get_or_set(cache_key, calc_tags_str) if not nocache else calc_tags_str()
|
|
|
|
def icons(self, path: str | None = None) -> str:
|
|
"""Generate HTML icons showing which extractor plugins have succeeded for this snapshot"""
|
|
from django.utils.html import format_html
|
|
|
|
cache_key = (
|
|
f"result_icons:{self.pk}:{(self.downloaded_at or self.modified_at or self.created_at or self.bookmarked_at).timestamp()}"
|
|
)
|
|
|
|
def calc_icons():
|
|
prefetched_cache = getattr(self, "_prefetched_objects_cache", {})
|
|
if "archiveresult_set" in prefetched_cache:
|
|
archive_results = {
|
|
r.plugin: r for r in self.archiveresult_set.all() if r.status == "succeeded" and (r.output_files or r.output_str)
|
|
}
|
|
else:
|
|
# Filter for results that have either output_files or output_str
|
|
from django.db.models import Q
|
|
|
|
archive_results = {
|
|
r.plugin: r
|
|
for r in self.archiveresult_set.filter(
|
|
Q(status="succeeded") & (Q(output_files__isnull=False) | ~Q(output_str="")),
|
|
)
|
|
}
|
|
|
|
archive_path = path or self.archive_path
|
|
output = ""
|
|
output_template = '<a href="/{}/{}" class="exists-{}" title="{}">{}</a>'
|
|
|
|
# Get all plugins from hooks system (sorted by numeric prefix)
|
|
all_plugins = [get_plugin_name(e) for e in get_plugins()]
|
|
|
|
for plugin in all_plugins:
|
|
result = archive_results.get(plugin)
|
|
existing = result and result.status == "succeeded" and (result.output_files or result.output_str)
|
|
icon = mark_safe(get_plugin_icon(plugin))
|
|
|
|
# Skip plugins with empty icons that have no output
|
|
# (e.g., staticfile only shows when there's actual output)
|
|
if not icon.strip() and not existing:
|
|
continue
|
|
|
|
embed_path = result.embed_path() if result else f"{plugin}/"
|
|
output += format_html(
|
|
output_template,
|
|
archive_path,
|
|
embed_path,
|
|
str(bool(existing)),
|
|
plugin,
|
|
icon,
|
|
)
|
|
|
|
return format_html(
|
|
'<span class="files-icons" style="font-size: 1em; opacity: 0.8; display: inline-grid; grid-auto-flow: column; grid-auto-columns: auto; grid-template-rows: repeat(4, auto); gap: 0 0; justify-content: start; align-content: start;">{}</span>',
|
|
mark_safe(output),
|
|
)
|
|
|
|
cache_result = cache.get(cache_key)
|
|
if cache_result:
|
|
return cache_result
|
|
|
|
fresh_result = calc_icons()
|
|
cache.set(cache_key, fresh_result, timeout=60 * 60 * 24)
|
|
return fresh_result
|
|
|
|
@property
|
|
def api_url(self) -> str:
|
|
return str(reverse_lazy("api-1:get_snapshot", args=[self.id]))
|
|
|
|
def get_absolute_url(self):
|
|
return f"/{self.archive_path}"
|
|
|
|
@cached_property
|
|
def domain(self) -> str:
|
|
return url_domain(self.url)
|
|
|
|
@property
|
|
def title_stripped(self) -> str:
|
|
return (self.title or "").strip()
|
|
|
|
@staticmethod
|
|
def _normalize_title_candidate(candidate: str | None, *, snapshot_url: str) -> str:
|
|
title = " ".join(line.strip() for line in str(candidate or "").splitlines() if line.strip()).strip()
|
|
if not title:
|
|
return ""
|
|
if title.lower() in {"pending...", "no title found"}:
|
|
return ""
|
|
if title == snapshot_url:
|
|
return ""
|
|
if title.startswith(("http://", "https://")):
|
|
return ""
|
|
if "/" in title and title.lower().endswith(".txt"):
|
|
return ""
|
|
return title
|
|
|
|
@property
|
|
def resolved_title(self) -> str:
|
|
stored_title = self._normalize_title_candidate(self.title, snapshot_url=self.url)
|
|
if stored_title:
|
|
return stored_title
|
|
|
|
title_result = (
|
|
self.archiveresult_set.filter(plugin="title").exclude(output_str="").order_by("-start_ts", "-end_ts", "-created_at").first()
|
|
)
|
|
if title_result:
|
|
result_title = self._normalize_title_candidate(title_result.output_str, snapshot_url=self.url)
|
|
if result_title:
|
|
return result_title
|
|
|
|
title_file = self.output_dir / "title" / "title.txt"
|
|
if title_file.exists():
|
|
try:
|
|
file_title = self._normalize_title_candidate(title_file.read_text(encoding="utf-8"), snapshot_url=self.url)
|
|
except OSError:
|
|
file_title = ""
|
|
if file_title:
|
|
return file_title
|
|
|
|
return ""
|
|
|
|
@cached_property
|
|
def hashes_index(self) -> dict[str, dict[str, Any]]:
|
|
hashes_path = self.output_dir / "hashes" / "hashes.json"
|
|
if not hashes_path.exists():
|
|
return {}
|
|
|
|
try:
|
|
data = json.loads(hashes_path.read_text(encoding="utf-8"))
|
|
except Exception:
|
|
return {}
|
|
|
|
index: dict[str, dict[str, Any]] = {}
|
|
if isinstance(data, dict) and isinstance(data.get("files"), list):
|
|
for entry in data["files"]:
|
|
if not isinstance(entry, dict):
|
|
continue
|
|
path = str(entry.get("path") or "").strip().rstrip("/")
|
|
if not path:
|
|
continue
|
|
index[path] = {
|
|
"size": entry.get("size") or entry.get("num_bytes") or entry.get("bytes") or 0,
|
|
"is_dir": bool(entry.get("is_dir")) or str(entry.get("path") or "").endswith("/"),
|
|
"hash": entry.get("hash") or entry.get("hash_sha256"),
|
|
}
|
|
elif isinstance(data, dict):
|
|
for path, entry in data.items():
|
|
if not isinstance(entry, dict) or path == ".":
|
|
continue
|
|
clean_path = str(path).rstrip("/")
|
|
if not clean_path:
|
|
continue
|
|
index[clean_path] = {
|
|
"size": entry.get("size") or entry.get("num_bytes") or 0,
|
|
"is_dir": bool(entry.get("mime_type") == "inode/directory" or str(path).endswith("/")),
|
|
"hash": entry.get("hash") or entry.get("hash_sha256"),
|
|
}
|
|
return index
|
|
|
|
@property
|
|
def output_dir(self) -> Path:
|
|
"""The filesystem path to the snapshot's output directory."""
|
|
import os
|
|
|
|
current_path = self.get_storage_path_for_version(self.fs_version)
|
|
|
|
if current_path.exists():
|
|
return current_path
|
|
|
|
# Check for backwards-compat symlink
|
|
old_path = CONSTANTS.ARCHIVE_DIR / self.timestamp
|
|
if old_path.is_symlink():
|
|
link_target = Path(os.readlink(old_path))
|
|
return (old_path.parent / link_target).resolve() if not link_target.is_absolute() else link_target.resolve()
|
|
elif old_path.exists():
|
|
return old_path
|
|
|
|
return current_path
|
|
|
|
def ensure_legacy_archive_symlink(self) -> None:
|
|
"""Ensure the legacy archive/<timestamp> path resolves to this snapshot."""
|
|
import os
|
|
|
|
legacy_path = CONSTANTS.ARCHIVE_DIR / self.timestamp
|
|
target = Path(self.get_storage_path_for_version(self._fs_current_version()))
|
|
|
|
if target == legacy_path:
|
|
return
|
|
|
|
legacy_path.parent.mkdir(parents=True, exist_ok=True)
|
|
|
|
if legacy_path.exists() or legacy_path.is_symlink():
|
|
if legacy_path.is_symlink():
|
|
try:
|
|
if legacy_path.resolve() == target.resolve():
|
|
return
|
|
except OSError:
|
|
pass
|
|
legacy_path.unlink(missing_ok=True)
|
|
else:
|
|
return
|
|
|
|
rel_target = os.path.relpath(target, legacy_path.parent)
|
|
try:
|
|
legacy_path.symlink_to(rel_target, target_is_directory=True)
|
|
except OSError:
|
|
return
|
|
|
|
def ensure_crawl_symlink(self) -> None:
|
|
"""Ensure snapshot is symlinked under its crawl output directory."""
|
|
import os
|
|
from pathlib import Path
|
|
from django.utils import timezone
|
|
from archivebox import DATA_DIR
|
|
from archivebox.crawls.models import Crawl
|
|
|
|
if not self.crawl_id:
|
|
return
|
|
crawl = Crawl.objects.filter(id=self.crawl_id).select_related("created_by").first()
|
|
if not crawl:
|
|
return
|
|
|
|
date_base = crawl.created_at or self.created_at or timezone.now()
|
|
date_str = date_base.strftime("%Y%m%d")
|
|
domain = self.extract_domain_from_url(self.url)
|
|
username = crawl.created_by.username if getattr(crawl, "created_by_id", None) else "system"
|
|
|
|
crawl_dir = DATA_DIR / "users" / username / "crawls" / date_str / domain / str(crawl.id)
|
|
link_path = crawl_dir / "snapshots" / domain / str(self.id)
|
|
link_parent = link_path.parent
|
|
link_parent.mkdir(parents=True, exist_ok=True)
|
|
|
|
target = Path(self.output_dir)
|
|
if link_path.exists() or link_path.is_symlink():
|
|
if link_path.is_symlink():
|
|
if link_path.resolve() == target.resolve():
|
|
return
|
|
link_path.unlink(missing_ok=True)
|
|
else:
|
|
return
|
|
|
|
rel_target = os.path.relpath(target, link_parent)
|
|
try:
|
|
link_path.symlink_to(rel_target, target_is_directory=True)
|
|
except OSError:
|
|
return
|
|
|
|
@cached_property
|
|
def legacy_archive_path(self) -> str:
|
|
return f"{CONSTANTS.ARCHIVE_DIR_NAME}/{self.timestamp}"
|
|
|
|
@cached_property
|
|
def archive_path_from_db(self) -> str:
|
|
"""Best-effort public URL path derived from DB fields only."""
|
|
if self.fs_version in ("0.7.0", "0.8.0"):
|
|
return self.legacy_archive_path
|
|
|
|
if self.fs_version in ("0.9.0", "1.0.0"):
|
|
username = "web"
|
|
crawl = getattr(self, "crawl", None)
|
|
if crawl and getattr(crawl, "created_by_id", None):
|
|
username = crawl.created_by.username
|
|
if username == "system":
|
|
username = "web"
|
|
|
|
date_base = self.created_at or self.bookmarked_at
|
|
if date_base:
|
|
date_str = date_base.strftime("%Y%m%d")
|
|
else:
|
|
try:
|
|
date_str = datetime.fromtimestamp(float(self.timestamp)).strftime("%Y%m%d")
|
|
except (TypeError, ValueError, OSError):
|
|
return self.legacy_archive_path
|
|
|
|
domain = self.extract_domain_from_url(self.url)
|
|
return f"{username}/{date_str}/{domain}/{self.id}"
|
|
|
|
return self.legacy_archive_path
|
|
|
|
@cached_property
|
|
def url_path(self) -> str:
|
|
"""URL path matching the current snapshot output_dir layout."""
|
|
try:
|
|
rel_path = Path(self.output_dir).resolve().relative_to(CONSTANTS.DATA_DIR)
|
|
except Exception:
|
|
return self.legacy_archive_path
|
|
|
|
parts = rel_path.parts
|
|
# New layout: users/<username>/snapshots/<YYYYMMDD>/<domain>/<uuid>/
|
|
if len(parts) >= 6 and parts[0] == "users" and parts[2] == "snapshots":
|
|
username = parts[1]
|
|
if username == "system":
|
|
username = "web"
|
|
date_str = parts[3]
|
|
domain = parts[4]
|
|
snapshot_id = parts[5]
|
|
return f"{username}/{date_str}/{domain}/{snapshot_id}"
|
|
|
|
# Legacy layout: archive/<timestamp>/
|
|
if len(parts) >= 2 and parts[0] == CONSTANTS.ARCHIVE_DIR_NAME:
|
|
return f"{parts[0]}/{parts[1]}"
|
|
|
|
return "/".join(parts)
|
|
|
|
@cached_property
|
|
def archive_path(self):
|
|
return self.url_path
|
|
|
|
@cached_property
|
|
def archive_size(self):
|
|
if hasattr(self, "output_size_sum"):
|
|
return int(self.output_size_sum or 0)
|
|
|
|
prefetched_results = None
|
|
if hasattr(self, "_prefetched_objects_cache"):
|
|
prefetched_results = self._prefetched_objects_cache.get("archiveresult_set")
|
|
if prefetched_results is not None:
|
|
return sum(result.output_size or result.output_size_from_files() for result in prefetched_results)
|
|
|
|
stats = self.archiveresult_set.aggregate(result_count=models.Count("id"), total_size=models.Sum("output_size"))
|
|
if stats["result_count"]:
|
|
return int(stats["total_size"] or 0)
|
|
try:
|
|
return get_dir_size(self.output_dir)[0]
|
|
except Exception:
|
|
return 0
|
|
|
|
def save_tags(self, tags: Iterable[str] = ()) -> None:
|
|
tags_id = [Tag.objects.get_or_create(name=tag)[0].pk for tag in tags if tag.strip()]
|
|
self.tags.clear()
|
|
self.tags.add(*tags_id)
|
|
|
|
def pending_archiveresults(self) -> QuerySet["ArchiveResult"]:
|
|
return self.archiveresult_set.exclude(status__in=ArchiveResult.FINAL_OR_ACTIVE_STATES)
|
|
|
|
def run(self) -> list["ArchiveResult"]:
|
|
"""
|
|
Execute snapshot by creating pending ArchiveResults for all enabled hooks.
|
|
|
|
Returns:
|
|
list[ArchiveResult]: Newly created pending results
|
|
"""
|
|
return self.create_pending_archiveresults()
|
|
|
|
def cleanup(self):
|
|
"""
|
|
Clean up background ArchiveResult hooks and empty results.
|
|
|
|
Called by the state machine when entering the 'sealed' state.
|
|
Uses Process records to kill background hooks, then deletes empty ArchiveResults.
|
|
"""
|
|
from archivebox.machine.models import Process
|
|
|
|
# Kill any background ArchiveResult hooks using Process records
|
|
# Find all running hook Processes linked to this snapshot's ArchiveResults
|
|
running_hooks = Process.objects.filter(
|
|
archiveresult__snapshot=self,
|
|
process_type=Process.TypeChoices.HOOK,
|
|
status=Process.StatusChoices.RUNNING,
|
|
).distinct()
|
|
|
|
for process in running_hooks:
|
|
# Use Process.kill_tree() to gracefully kill parent + children
|
|
killed_count = process.kill_tree(graceful_timeout=2.0)
|
|
if killed_count > 0:
|
|
print(f"[yellow]🔪 Killed {killed_count} process(es) for hook {process.pid}[/yellow]")
|
|
|
|
# Clean up .pid files from output directory
|
|
if Path(self.output_dir).exists():
|
|
for pid_file in Path(self.output_dir).glob("**/*.pid"):
|
|
pid_file.unlink(missing_ok=True)
|
|
|
|
# Update all background ArchiveResults from filesystem (in case output arrived late)
|
|
results = self.archiveresult_set.filter(hook_name__contains=".bg.")
|
|
for ar in results:
|
|
ar.update_from_output()
|
|
|
|
# Delete ArchiveResults that produced no output files
|
|
empty_ars = self.archiveresult_set.filter(
|
|
output_files={}, # No output files
|
|
).filter(
|
|
status__in=ArchiveResult.FINAL_STATES, # Only delete finished ones
|
|
)
|
|
|
|
deleted_count = empty_ars.count()
|
|
if deleted_count > 0:
|
|
empty_ars.delete()
|
|
print(f"[yellow]🗑️ Deleted {deleted_count} empty ArchiveResults for {self.url}[/yellow]")
|
|
|
|
def to_json(self) -> dict:
|
|
"""
|
|
Convert Snapshot model instance to a JSON-serializable dict.
|
|
Includes all fields needed to fully reconstruct/identify this snapshot.
|
|
"""
|
|
from archivebox.config import VERSION
|
|
|
|
archive_size = self.archive_size
|
|
|
|
return {
|
|
"type": "Snapshot",
|
|
"schema_version": VERSION,
|
|
"id": str(self.id),
|
|
"crawl_id": str(self.crawl_id),
|
|
"url": self.url,
|
|
"title": self.title,
|
|
"tags": self.tags_str(),
|
|
"bookmarked_at": self.bookmarked_at.isoformat() if self.bookmarked_at else None,
|
|
"created_at": self.created_at.isoformat() if self.created_at else None,
|
|
"timestamp": self.timestamp,
|
|
"depth": self.depth,
|
|
"status": self.status,
|
|
"fs_version": self.fs_version,
|
|
"archive_size": archive_size,
|
|
"output_size": archive_size,
|
|
}
|
|
|
|
@staticmethod
|
|
def from_json(record: dict[str, Any], overrides: dict[str, Any] | None = None, queue_for_extraction: bool = True):
|
|
"""
|
|
Create/update Snapshot from JSON dict.
|
|
|
|
Unified method that handles:
|
|
- ID-based patching: {"id": "...", "title": "new title"}
|
|
- URL-based create/update: {"url": "...", "title": "...", "tags": "..."}
|
|
- Auto-creates Crawl if not provided
|
|
- Optionally queues for extraction
|
|
|
|
Args:
|
|
record: Dict with 'url' (for create) or 'id' (for patch), plus other fields
|
|
overrides: Dict with 'crawl', 'snapshot' (parent), 'created_by_id'
|
|
queue_for_extraction: If True, sets status=QUEUED and retry_at (default: True)
|
|
|
|
Returns:
|
|
Snapshot instance or None
|
|
"""
|
|
import re
|
|
from django.utils import timezone
|
|
from archivebox.base_models.models import get_or_create_system_user_pk
|
|
from archivebox.config.common import GENERAL_CONFIG
|
|
|
|
overrides = overrides or {}
|
|
|
|
# If 'id' is provided, lookup and patch that specific snapshot
|
|
snapshot_id = record.get("id")
|
|
if snapshot_id:
|
|
try:
|
|
snapshot = Snapshot.objects.get(id=snapshot_id)
|
|
|
|
# Generically update all fields present in record
|
|
update_fields = []
|
|
for field_name, value in record.items():
|
|
# Skip internal fields
|
|
if field_name in ("id", "type"):
|
|
continue
|
|
|
|
# Skip if field doesn't exist on model
|
|
if not hasattr(snapshot, field_name):
|
|
continue
|
|
|
|
# Special parsing for date fields
|
|
if field_name in ("bookmarked_at", "retry_at", "created_at", "modified_at"):
|
|
if value and isinstance(value, str):
|
|
value = parse_date(value)
|
|
|
|
# Update field if value is provided and different
|
|
if value is not None and getattr(snapshot, field_name) != value:
|
|
setattr(snapshot, field_name, value)
|
|
update_fields.append(field_name)
|
|
|
|
if update_fields:
|
|
snapshot.save(update_fields=update_fields + ["modified_at"])
|
|
|
|
return snapshot
|
|
except Snapshot.DoesNotExist:
|
|
# ID not found, fall through to create-by-URL logic
|
|
pass
|
|
|
|
from archivebox.misc.util import fix_url_from_markdown, sanitize_extracted_url
|
|
|
|
url = sanitize_extracted_url(fix_url_from_markdown(str(record.get("url") or "").strip()))
|
|
if not url:
|
|
return None
|
|
|
|
# Determine or create crawl (every snapshot must have a crawl)
|
|
crawl = overrides.get("crawl")
|
|
parent_snapshot = overrides.get("snapshot") # Parent snapshot
|
|
created_by_id = overrides.get("created_by_id") or (
|
|
parent_snapshot.created_by.pk if parent_snapshot else get_or_create_system_user_pk()
|
|
)
|
|
|
|
# DEBUG: Check if crawl_id in record matches overrides crawl
|
|
import sys
|
|
|
|
record_crawl_id = record.get("crawl_id")
|
|
if record_crawl_id and crawl and str(crawl.id) != str(record_crawl_id):
|
|
print(
|
|
f"[yellow]⚠️ Snapshot.from_json crawl mismatch: record has crawl_id={record_crawl_id}, overrides has crawl={crawl.id}[/yellow]",
|
|
file=sys.stderr,
|
|
)
|
|
|
|
# If no crawl provided, inherit from parent or auto-create one
|
|
if not crawl:
|
|
if parent_snapshot:
|
|
# Inherit crawl from parent snapshot
|
|
crawl = parent_snapshot.crawl
|
|
else:
|
|
# Auto-create a single-URL crawl
|
|
from archivebox.crawls.models import Crawl
|
|
from archivebox.config import CONSTANTS
|
|
|
|
timestamp_str = timezone.now().strftime("%Y-%m-%d__%H-%M-%S")
|
|
sources_file = CONSTANTS.SOURCES_DIR / f"{timestamp_str}__auto_crawl.txt"
|
|
sources_file.parent.mkdir(parents=True, exist_ok=True)
|
|
sources_file.write_text(url)
|
|
|
|
crawl = Crawl.objects.create(
|
|
urls=url,
|
|
max_depth=0,
|
|
label=f"auto-created for {url[:50]}",
|
|
created_by_id=created_by_id,
|
|
)
|
|
print(f"[red]⚠️ Snapshot.from_json auto-created new crawl {crawl.id} for url={url}[/red]", file=sys.stderr)
|
|
|
|
# Parse tags (accept either a list ["tag1", "tag2"] or a comma-separated string "tag1,tag2")
|
|
tags_raw = record.get("tags", "")
|
|
tag_list = []
|
|
if isinstance(tags_raw, list):
|
|
tag_list = list(dict.fromkeys(tag.strip() for tag in tags_raw if tag.strip()))
|
|
elif tags_raw:
|
|
tag_list = list(
|
|
dict.fromkeys(tag.strip() for tag in re.split(GENERAL_CONFIG.TAG_SEPARATOR_PATTERN, tags_raw) if tag.strip()),
|
|
)
|
|
|
|
# Check for existing snapshot with same URL in same crawl
|
|
# (URLs can exist in multiple crawls, but should be unique within a crawl)
|
|
snapshot = Snapshot.objects.filter(url=url, crawl=crawl).order_by("-created_at").first()
|
|
|
|
title = record.get("title")
|
|
timestamp = record.get("timestamp")
|
|
|
|
if snapshot:
|
|
# Update existing snapshot
|
|
if title and (not snapshot.title or len(title) > len(snapshot.title or "")):
|
|
snapshot.title = title
|
|
snapshot.save(update_fields=["title", "modified_at"])
|
|
else:
|
|
# Create new snapshot
|
|
if timestamp:
|
|
while Snapshot.objects.filter(timestamp=timestamp).exists():
|
|
timestamp = str(float(timestamp) + 1.0)
|
|
|
|
snapshot = Snapshot.objects.create(
|
|
url=url,
|
|
timestamp=timestamp,
|
|
title=title,
|
|
crawl=crawl,
|
|
)
|
|
|
|
# Update tags
|
|
if tag_list:
|
|
existing_tags = set(snapshot.tags.values_list("name", flat=True))
|
|
new_tags = set(tag_list) | existing_tags
|
|
snapshot.save_tags(new_tags)
|
|
|
|
# Queue for extraction and update additional fields
|
|
update_fields = []
|
|
|
|
if queue_for_extraction:
|
|
snapshot.status = Snapshot.StatusChoices.QUEUED
|
|
snapshot.retry_at = timezone.now()
|
|
update_fields.extend(["status", "retry_at"])
|
|
|
|
# Update additional fields if provided
|
|
for field_name in ("depth", "parent_snapshot_id", "crawl_id", "bookmarked_at"):
|
|
value = record.get(field_name)
|
|
if value is not None and getattr(snapshot, field_name) != value:
|
|
setattr(snapshot, field_name, value)
|
|
update_fields.append(field_name)
|
|
|
|
if update_fields:
|
|
snapshot.save(update_fields=update_fields + ["modified_at"])
|
|
|
|
snapshot.ensure_crawl_symlink()
|
|
|
|
return snapshot
|
|
|
|
def create_pending_archiveresults(self) -> list["ArchiveResult"]:
|
|
"""
|
|
Create ArchiveResult records for all enabled hooks.
|
|
|
|
Uses the hooks system to discover available hooks from:
|
|
- abx_plugins/plugins/*/on_Snapshot__*.{py,sh,js}
|
|
- data/custom_plugins/*/on_Snapshot__*.{py,sh,js}
|
|
|
|
Creates one ArchiveResult per hook (not per plugin), with hook_name set.
|
|
This enables step-based execution where all hooks in a step can run in parallel.
|
|
"""
|
|
from archivebox.hooks import discover_hooks
|
|
from archivebox.config.configset import get_config
|
|
|
|
# Get merged config with crawl-specific PLUGINS filter
|
|
config = get_config(crawl=self.crawl, snapshot=self)
|
|
hooks = discover_hooks("Snapshot", config=config)
|
|
archiveresults = []
|
|
|
|
for hook_path in hooks:
|
|
hook_name = hook_path.name # e.g., 'on_Snapshot__50_wget.py'
|
|
plugin = hook_path.parent.name # e.g., 'wget'
|
|
|
|
# Check if AR already exists for this specific hook
|
|
if ArchiveResult.objects.filter(snapshot=self, hook_name=hook_name).exists():
|
|
continue
|
|
|
|
archiveresult, created = ArchiveResult.objects.get_or_create(
|
|
snapshot=self,
|
|
hook_name=hook_name,
|
|
defaults={
|
|
"plugin": plugin,
|
|
"status": ArchiveResult.INITIAL_STATE,
|
|
},
|
|
)
|
|
if archiveresult.status == ArchiveResult.INITIAL_STATE:
|
|
archiveresults.append(archiveresult)
|
|
|
|
return archiveresults
|
|
|
|
def is_finished_processing(self) -> bool:
|
|
"""
|
|
Check if all ArchiveResults are finished.
|
|
|
|
Note: This is only called for observability/progress tracking.
|
|
The shared runner owns execution and does not poll this.
|
|
"""
|
|
# Check if any ARs are still pending/started
|
|
pending = self.archiveresult_set.exclude(
|
|
status__in=ArchiveResult.FINAL_STATES,
|
|
).exists()
|
|
|
|
return not pending
|
|
|
|
def get_progress_stats(self) -> dict:
|
|
"""
|
|
Get progress statistics for this snapshot's archiving process.
|
|
|
|
Returns dict with:
|
|
- total: Total number of archive results
|
|
- succeeded: Number of succeeded results
|
|
- failed: Number of failed results
|
|
- running: Number of currently running results
|
|
- pending: Number of pending/queued results
|
|
- percent: Completion percentage (0-100)
|
|
- output_size: Total output size in bytes
|
|
- is_sealed: Whether the snapshot is in a final state
|
|
"""
|
|
from django.db.models import Sum
|
|
|
|
results = self.archiveresult_set.all()
|
|
|
|
# Count by status
|
|
succeeded = results.filter(status="succeeded").count()
|
|
failed = results.filter(status="failed").count()
|
|
running = results.filter(status="started").count()
|
|
skipped = results.filter(status="skipped").count()
|
|
noresults = results.filter(status="noresults").count()
|
|
total = results.count()
|
|
pending = total - succeeded - failed - running - skipped - noresults
|
|
|
|
# Calculate percentage (succeeded + failed + skipped + noresults as completed)
|
|
completed = succeeded + failed + skipped + noresults
|
|
percent = int((completed / total * 100) if total > 0 else 0)
|
|
|
|
# Sum output sizes
|
|
output_size = results.aggregate(total_size=Sum("output_size"))["total_size"] or 0
|
|
|
|
# Check if sealed
|
|
is_sealed = self.status not in (self.StatusChoices.QUEUED, self.StatusChoices.STARTED)
|
|
|
|
return {
|
|
"total": total,
|
|
"succeeded": succeeded,
|
|
"failed": failed,
|
|
"running": running,
|
|
"pending": pending,
|
|
"skipped": skipped,
|
|
"noresults": noresults,
|
|
"percent": percent,
|
|
"output_size": output_size,
|
|
"is_sealed": is_sealed,
|
|
}
|
|
|
|
def retry_failed_archiveresults(self) -> int:
|
|
"""
|
|
Reset failed/skipped ArchiveResults to queued for retry.
|
|
|
|
Returns count of ArchiveResults reset.
|
|
"""
|
|
count = self.archiveresult_set.filter(
|
|
status__in=[
|
|
ArchiveResult.StatusChoices.FAILED,
|
|
ArchiveResult.StatusChoices.SKIPPED,
|
|
ArchiveResult.StatusChoices.NORESULTS,
|
|
],
|
|
).update(
|
|
status=ArchiveResult.StatusChoices.QUEUED,
|
|
output_str="",
|
|
output_json=None,
|
|
output_files={},
|
|
output_size=0,
|
|
output_mimetypes="",
|
|
start_ts=None,
|
|
end_ts=None,
|
|
)
|
|
|
|
if count > 0:
|
|
self.status = self.StatusChoices.QUEUED
|
|
self.retry_at = timezone.now()
|
|
self.current_step = 0 # Reset to step 0 for retry
|
|
self.save(update_fields=["status", "retry_at", "current_step", "modified_at"])
|
|
|
|
return count
|
|
|
|
# =========================================================================
|
|
# URL Helper Properties (migrated from Link schema)
|
|
# =========================================================================
|
|
|
|
@cached_property
|
|
def url_hash(self) -> str:
|
|
from hashlib import sha256
|
|
|
|
return sha256(self.url.encode()).hexdigest()[:8]
|
|
|
|
@cached_property
|
|
def scheme(self) -> str:
|
|
return self.url.split("://")[0]
|
|
|
|
@cached_property
|
|
def path(self) -> str:
|
|
parts = self.url.split("://", 1)
|
|
return "/" + parts[1].split("/", 1)[1] if len(parts) > 1 and "/" in parts[1] else "/"
|
|
|
|
@cached_property
|
|
def basename(self) -> str:
|
|
return self.path.split("/")[-1]
|
|
|
|
@cached_property
|
|
def extension(self) -> str:
|
|
basename = self.basename
|
|
return basename.split(".")[-1] if "." in basename else ""
|
|
|
|
@cached_property
|
|
def base_url(self) -> str:
|
|
return f"{self.scheme}://{self.domain}"
|
|
|
|
@cached_property
|
|
def is_static(self) -> bool:
|
|
static_extensions = {".pdf", ".jpg", ".jpeg", ".png", ".gif", ".webp", ".svg", ".mp4", ".mp3", ".wav", ".webm"}
|
|
return any(self.url.lower().endswith(ext) for ext in static_extensions)
|
|
|
|
@cached_property
|
|
def is_archived(self) -> bool:
|
|
if self.downloaded_at or self.status == self.StatusChoices.SEALED:
|
|
return True
|
|
|
|
output_paths = (
|
|
self.domain,
|
|
"output.html",
|
|
"output.pdf",
|
|
"screenshot.png",
|
|
"singlefile.html",
|
|
"readability/content.html",
|
|
"mercury/content.html",
|
|
"htmltotext.txt",
|
|
"media",
|
|
"git",
|
|
)
|
|
return any((Path(self.output_dir) / path).exists() for path in output_paths)
|
|
|
|
# =========================================================================
|
|
# Date/Time Properties (migrated from Link schema)
|
|
# =========================================================================
|
|
|
|
@cached_property
|
|
def bookmarked_date(self) -> str | None:
|
|
max_ts = (timezone.now() + timedelta(days=30)).timestamp()
|
|
if self.timestamp and self.timestamp.replace(".", "").isdigit():
|
|
if 0 < float(self.timestamp) < max_ts:
|
|
return self._ts_to_date_str(datetime.fromtimestamp(float(self.timestamp)))
|
|
return str(self.timestamp)
|
|
return None
|
|
|
|
@cached_property
|
|
def downloaded_datestr(self) -> str | None:
|
|
return self._ts_to_date_str(self.downloaded_at) if self.downloaded_at else None
|
|
|
|
@cached_property
|
|
def archive_dates(self) -> list[datetime]:
|
|
return [result.start_ts for result in self.archiveresult_set.all() if result.start_ts]
|
|
|
|
@cached_property
|
|
def oldest_archive_date(self) -> datetime | None:
|
|
dates = self.archive_dates
|
|
return min(dates) if dates else None
|
|
|
|
@cached_property
|
|
def newest_archive_date(self) -> datetime | None:
|
|
dates = self.archive_dates
|
|
return max(dates) if dates else None
|
|
|
|
@cached_property
|
|
def num_outputs(self) -> int:
|
|
return self.archiveresult_set.filter(status="succeeded").count()
|
|
|
|
@cached_property
|
|
def num_failures(self) -> int:
|
|
return self.archiveresult_set.filter(status="failed").count()
|
|
|
|
# =========================================================================
|
|
# Output Path Methods (migrated from Link schema)
|
|
# =========================================================================
|
|
|
|
def latest_outputs(self, status: str | None = None) -> dict[str, Any]:
|
|
"""Get the latest output that each plugin produced"""
|
|
from archivebox.hooks import get_plugins
|
|
from django.db.models import Q
|
|
|
|
latest: dict[str, Any] = {}
|
|
for plugin in get_plugins():
|
|
results = self.archiveresult_set.filter(plugin=plugin)
|
|
if status is not None:
|
|
results = results.filter(status=status)
|
|
# Filter for results with output_files or output_str
|
|
results = results.filter(Q(output_files__isnull=False) | ~Q(output_str="")).order_by("-start_ts")
|
|
result = results.first()
|
|
# Return embed_path() for backwards compatibility
|
|
latest[plugin] = result.embed_path() if result else None
|
|
return latest
|
|
|
|
def discover_outputs(self, include_filesystem_fallback: bool = True) -> list[dict]:
|
|
"""Discover output files from ArchiveResults and filesystem."""
|
|
from archivebox.misc.util import ts_to_date_str
|
|
|
|
ArchiveResult = self.archiveresult_set.model
|
|
snap_dir = Path(self.output_dir)
|
|
outputs: list[dict] = []
|
|
seen: set[str] = set()
|
|
|
|
text_exts = (".json", ".jsonl", ".txt", ".csv", ".tsv", ".xml", ".yml", ".yaml", ".md", ".log")
|
|
|
|
def is_metadata_path(path: str | None) -> bool:
|
|
lower = (path or "").lower()
|
|
return lower.endswith(text_exts)
|
|
|
|
def is_compact_path(path: str | None) -> bool:
|
|
lower = (path or "").lower()
|
|
return lower.endswith(text_exts)
|
|
|
|
for result in self.archiveresult_set.all().order_by("start_ts"):
|
|
embed_path = result.embed_path_db()
|
|
if not embed_path and include_filesystem_fallback:
|
|
embed_path = result.embed_path()
|
|
if not embed_path or embed_path.strip() in (".", "/", "./"):
|
|
continue
|
|
size = result.output_size or result.output_size_from_files() or self.hashes_index.get(embed_path, {}).get("size") or 0
|
|
if not size and include_filesystem_fallback:
|
|
abs_path = snap_dir / embed_path
|
|
if not abs_path.exists():
|
|
continue
|
|
if abs_path.is_dir():
|
|
if not any(p.is_file() for p in abs_path.rglob("*")):
|
|
continue
|
|
size = sum(p.stat().st_size for p in abs_path.rglob("*") if p.is_file())
|
|
else:
|
|
size = abs_path.stat().st_size
|
|
plugin_lower = (result.plugin or "").lower()
|
|
if plugin_lower in ("ytdlp", "yt-dlp", "youtube-dl"):
|
|
plugin_dir = snap_dir / result.plugin
|
|
if plugin_dir.exists():
|
|
try:
|
|
size = sum(p.stat().st_size for p in plugin_dir.rglob("*") if p.is_file())
|
|
except OSError:
|
|
pass
|
|
outputs.append(
|
|
{
|
|
"name": result.plugin,
|
|
"path": embed_path,
|
|
"ts": ts_to_date_str(result.end_ts),
|
|
"size": size or 0,
|
|
"is_metadata": is_metadata_path(embed_path),
|
|
"is_compact": is_compact_path(embed_path),
|
|
"result": result,
|
|
},
|
|
)
|
|
seen.add(result.plugin)
|
|
|
|
hashes_index = self.hashes_index
|
|
if hashes_index:
|
|
grouped_hash_outputs: dict[str, dict[str, dict[str, Any]]] = {}
|
|
ignored_roots = {"index.html", "index.json", "index.jsonl", "favicon.ico", "warc", "hashes"}
|
|
for rel_path, meta in hashes_index.items():
|
|
parts = Path(rel_path).parts
|
|
if len(parts) < 2:
|
|
continue
|
|
root = parts[0]
|
|
if root.startswith(".") or root in seen or root in ignored_roots:
|
|
continue
|
|
child_path = str(Path(*parts[1:]))
|
|
grouped_hash_outputs.setdefault(root, {})[child_path] = meta
|
|
|
|
fallback_ts = ts_to_date_str(self.downloaded_at or self.created_at)
|
|
for root, root_entries in grouped_hash_outputs.items():
|
|
fallback_path = ArchiveResult._fallback_output_file_path(list(root_entries.keys()), root, root_entries)
|
|
if not fallback_path:
|
|
continue
|
|
fallback_meta = root_entries.get(fallback_path, {})
|
|
outputs.append(
|
|
{
|
|
"name": root,
|
|
"path": f"{root}/{fallback_path}",
|
|
"ts": fallback_ts,
|
|
"size": int(fallback_meta.get("size") or 0),
|
|
"is_metadata": is_metadata_path(fallback_path),
|
|
"is_compact": is_compact_path(fallback_path),
|
|
"result": None,
|
|
},
|
|
)
|
|
seen.add(root)
|
|
|
|
if not include_filesystem_fallback:
|
|
return outputs
|
|
|
|
embeddable_exts = {
|
|
"html",
|
|
"htm",
|
|
"pdf",
|
|
"txt",
|
|
"md",
|
|
"json",
|
|
"jsonl",
|
|
"csv",
|
|
"tsv",
|
|
"png",
|
|
"jpg",
|
|
"jpeg",
|
|
"gif",
|
|
"webp",
|
|
"svg",
|
|
"ico",
|
|
"mp4",
|
|
"webm",
|
|
"mp3",
|
|
"opus",
|
|
"ogg",
|
|
"wav",
|
|
}
|
|
|
|
for entry in snap_dir.iterdir():
|
|
if entry.name in ("index.html", "index.json", "favicon.ico", "warc"):
|
|
continue
|
|
if entry.is_dir():
|
|
plugin = entry.name
|
|
if plugin in seen:
|
|
continue
|
|
best_file = ArchiveResult._find_best_output_file(entry, plugin)
|
|
if not best_file:
|
|
continue
|
|
best_file_stat = best_file.stat()
|
|
rel_path = str(best_file.relative_to(snap_dir))
|
|
outputs.append(
|
|
{
|
|
"name": plugin,
|
|
"path": rel_path,
|
|
"ts": ts_to_date_str(best_file_stat.st_mtime or 0),
|
|
"size": best_file_stat.st_size or 0,
|
|
"is_metadata": is_metadata_path(rel_path),
|
|
"is_compact": is_compact_path(rel_path),
|
|
"result": None,
|
|
},
|
|
)
|
|
seen.add(plugin)
|
|
elif entry.is_file():
|
|
ext = entry.suffix.lstrip(".").lower()
|
|
if ext not in embeddable_exts:
|
|
continue
|
|
plugin = entry.stem
|
|
if plugin in seen:
|
|
continue
|
|
entry_stat = entry.stat()
|
|
outputs.append(
|
|
{
|
|
"name": plugin,
|
|
"path": entry.name,
|
|
"ts": ts_to_date_str(entry_stat.st_mtime or 0),
|
|
"size": entry_stat.st_size or 0,
|
|
"is_metadata": is_metadata_path(entry.name),
|
|
"is_compact": is_compact_path(entry.name),
|
|
"result": None,
|
|
},
|
|
)
|
|
seen.add(plugin)
|
|
|
|
return outputs
|
|
|
|
# =========================================================================
|
|
# Serialization Methods
|
|
# =========================================================================
|
|
|
|
def to_dict(self, extended: bool = False) -> dict[str, Any]:
|
|
"""Convert Snapshot to a dictionary (replacement for Link._asdict())"""
|
|
from archivebox.core.host_utils import build_snapshot_url
|
|
|
|
archive_size = self.archive_size
|
|
|
|
result = {
|
|
"TYPE": "core.models.Snapshot",
|
|
"id": str(self.id),
|
|
"crawl_id": str(self.crawl_id),
|
|
"url": self.url,
|
|
"timestamp": self.timestamp,
|
|
"title": self.title,
|
|
"tags": sorted(tag.name for tag in self.tags.all()),
|
|
"downloaded_at": self.downloaded_at.isoformat() if self.downloaded_at else None,
|
|
"bookmarked_at": self.bookmarked_at.isoformat() if self.bookmarked_at else None,
|
|
"created_at": self.created_at.isoformat() if self.created_at else None,
|
|
"modified_at": self.modified_at.isoformat() if self.modified_at else None,
|
|
"retry_at": self.retry_at.isoformat() if self.retry_at else None,
|
|
"depth": self.depth,
|
|
"status": self.status,
|
|
"fs_version": self.fs_version,
|
|
# Computed properties
|
|
"domain": self.domain,
|
|
"scheme": self.scheme,
|
|
"base_url": self.base_url,
|
|
"path": self.path,
|
|
"basename": self.basename,
|
|
"extension": self.extension,
|
|
"is_static": self.is_static,
|
|
"is_archived": self.is_archived,
|
|
"archive_path": self.archive_path,
|
|
"archive_url": build_snapshot_url(str(self.id), "index.html"),
|
|
"output_dir": self.output_dir,
|
|
"link_dir": self.output_dir, # backwards compatibility alias
|
|
"archive_size": archive_size,
|
|
"output_size": archive_size,
|
|
"bookmarked_date": self.bookmarked_date,
|
|
"downloaded_datestr": self.downloaded_datestr,
|
|
"num_outputs": self.num_outputs,
|
|
"num_failures": self.num_failures,
|
|
}
|
|
return result
|
|
|
|
def to_json_str(self, indent: int = 4) -> str:
|
|
"""Convert to JSON string (legacy method, use to_json() for dict)"""
|
|
return to_json(self.to_dict(extended=True), indent=indent)
|
|
|
|
def to_csv(self, cols: list[str] | None = None, separator: str = ",", ljust: int = 0) -> str:
|
|
"""Convert to CSV string"""
|
|
data = self.to_dict()
|
|
cols = cols or ["timestamp", "is_archived", "url"]
|
|
return separator.join(to_json(data.get(col, ""), indent=None).ljust(ljust) for col in cols)
|
|
|
|
def write_json_details(self, out_dir: Path | str | None = None) -> None:
|
|
"""Write JSON index file for this snapshot to its output directory"""
|
|
output_dir = Path(out_dir) if out_dir is not None else self.output_dir
|
|
path = output_dir / CONSTANTS.JSON_INDEX_FILENAME
|
|
atomic_write(str(path), self.to_dict(extended=True))
|
|
|
|
def write_html_details(self, out_dir: Path | str | None = None) -> None:
|
|
"""Write HTML detail page for this snapshot to its output directory"""
|
|
from django.template.loader import render_to_string
|
|
from archivebox.config.common import SERVER_CONFIG
|
|
from archivebox.config.configset import get_config
|
|
from archivebox.core.widgets import TagEditorWidget
|
|
from archivebox.misc.logging_util import printable_filesize
|
|
|
|
output_dir = Path(out_dir) if out_dir is not None else self.output_dir
|
|
config = get_config()
|
|
SAVE_ARCHIVE_DOT_ORG = config.get("SAVE_ARCHIVE_DOT_ORG", True)
|
|
TITLE_LOADING_MSG = "Not yet archived..."
|
|
|
|
preview_priority = [
|
|
"singlefile",
|
|
"screenshot",
|
|
"wget",
|
|
"dom",
|
|
"pdf",
|
|
"readability",
|
|
]
|
|
|
|
outputs = self.discover_outputs(include_filesystem_fallback=True)
|
|
loose_items, failed_items = self.get_detail_page_auxiliary_items(outputs)
|
|
outputs_by_plugin = {out["name"]: out for out in outputs}
|
|
output_size = sum(int(out.get("size") or 0) for out in outputs)
|
|
is_archived = bool(outputs or self.downloaded_at or self.status == self.StatusChoices.SEALED)
|
|
|
|
best_preview_path = "about:blank"
|
|
best_result = {"path": "about:blank", "result": None}
|
|
for plugin in preview_priority:
|
|
out = outputs_by_plugin.get(plugin)
|
|
if out and out.get("path"):
|
|
best_preview_path = str(out["path"])
|
|
best_result = out
|
|
break
|
|
|
|
if best_preview_path == "about:blank" and outputs:
|
|
best_preview_path = str(outputs[0].get("path") or "about:blank")
|
|
best_result = outputs[0]
|
|
tag_widget = TagEditorWidget()
|
|
context = {
|
|
**self.to_dict(extended=True),
|
|
"snapshot": self,
|
|
"title": htmlencode(self.resolved_title or (self.base_url if is_archived else TITLE_LOADING_MSG)),
|
|
"url_str": htmlencode(urldecode(self.base_url)),
|
|
"archive_url": urlencode(f"warc/{self.timestamp}" or (self.domain if is_archived else "")) or "about:blank",
|
|
"extension": self.extension or "html",
|
|
"tags": self.tags_str() or "untagged",
|
|
"size": printable_filesize(output_size) if output_size else "pending",
|
|
"status": "archived" if is_archived else "not yet archived",
|
|
"status_color": "success" if is_archived else "danger",
|
|
"oldest_archive_date": ts_to_date_str(self.oldest_archive_date),
|
|
"SAVE_ARCHIVE_DOT_ORG": SAVE_ARCHIVE_DOT_ORG,
|
|
"PREVIEW_ORIGINALS": SERVER_CONFIG.PREVIEW_ORIGINALS,
|
|
"best_preview_path": best_preview_path,
|
|
"best_result": best_result,
|
|
"archiveresults": outputs,
|
|
"loose_items": loose_items,
|
|
"failed_items": failed_items,
|
|
"related_snapshots": [],
|
|
"related_years": [],
|
|
"title_tags": [{"name": tag.name, "style": tag_widget._tag_style(tag.name)} for tag in self.tags.all().order_by("name")],
|
|
}
|
|
rendered_html = render_to_string("core/snapshot.html", context)
|
|
atomic_write(str(output_dir / CONSTANTS.HTML_INDEX_FILENAME), rendered_html)
|
|
|
|
# =========================================================================
|
|
# Helper Methods
|
|
# =========================================================================
|
|
|
|
def get_detail_page_auxiliary_items(
|
|
self,
|
|
outputs: list[dict] | None = None,
|
|
hidden_card_plugins: set[str] | None = None,
|
|
) -> tuple[list[dict[str, object]], list[dict[str, object]]]:
|
|
outputs = outputs or self.discover_outputs(include_filesystem_fallback=True)
|
|
hidden_card_plugins = hidden_card_plugins or set()
|
|
accounted_entries: set[str] = set()
|
|
for output in outputs:
|
|
output_name = str(output.get("name") or "")
|
|
if output_name:
|
|
accounted_entries.add(output_name)
|
|
output_path = str(output.get("path") or "")
|
|
if not output_path:
|
|
continue
|
|
parts = Path(output_path).parts
|
|
if parts:
|
|
accounted_entries.add(parts[0])
|
|
|
|
ignore_names = {".DS_Store", "index.html", "index.json", "index.jsonl", "favicon.ico"}
|
|
loose_items: list[dict[str, object]] = []
|
|
if self.hashes_index:
|
|
grouped: dict[str, dict[str, object]] = {}
|
|
for rel_path, meta in self.hashes_index.items():
|
|
parts = Path(rel_path).parts
|
|
if not parts:
|
|
continue
|
|
root = parts[0]
|
|
if root.startswith(".") or root in ignore_names or root in accounted_entries:
|
|
continue
|
|
entry = grouped.setdefault(
|
|
root,
|
|
{
|
|
"name": root,
|
|
"path": root,
|
|
"is_dir": len(parts) > 1 or bool(meta.get("is_dir")),
|
|
"size": 0,
|
|
},
|
|
)
|
|
entry["is_dir"] = bool(entry.get("is_dir")) or len(parts) > 1 or bool(meta.get("is_dir"))
|
|
entry["size"] = int(entry.get("size") or 0) + int(meta.get("size") or 0)
|
|
loose_items = sorted(grouped.values(), key=lambda item: str(item["name"]).lower())
|
|
|
|
ArchiveResult = self.archiveresult_set.model
|
|
failed_items: list[dict[str, object]] = []
|
|
seen_failed: set[str] = set()
|
|
for result in self.archiveresult_set.all().order_by("start_ts"):
|
|
if result.status != ArchiveResult.StatusChoices.FAILED:
|
|
continue
|
|
root = str(result.plugin or "").strip()
|
|
if not root or root in seen_failed:
|
|
continue
|
|
seen_failed.add(root)
|
|
failed_items.append(
|
|
{
|
|
"name": f"{get_plugin_name(root)} ({result.status})",
|
|
"path": root,
|
|
"is_dir": True,
|
|
"size": int(result.output_size or 0),
|
|
},
|
|
)
|
|
|
|
return loose_items, failed_items
|
|
|
|
@staticmethod
|
|
def _ts_to_date_str(dt: datetime | None) -> str | None:
|
|
return dt.strftime("%Y-%m-%d %H:%M:%S") if dt else None
|
|
|
|
|
|
# =============================================================================
|
|
# Snapshot State Machine
|
|
# =============================================================================
|
|
|
|
|
|
class SnapshotMachine(BaseStateMachine):
|
|
"""
|
|
State machine for managing Snapshot lifecycle.
|
|
|
|
Hook Lifecycle:
|
|
┌─────────────────────────────────────────────────────────────┐
|
|
│ QUEUED State │
|
|
│ • Waiting for snapshot to be ready │
|
|
└─────────────────────────────────────────────────────────────┘
|
|
↓ tick() when can_start()
|
|
┌─────────────────────────────────────────────────────────────┐
|
|
│ STARTED State → enter_started() │
|
|
│ 1. snapshot.run() │
|
|
│ • discover_hooks('Snapshot') → finds all plugin hooks │
|
|
│ • create_pending_archiveresults() → creates ONE │
|
|
│ ArchiveResult per hook (NO execution yet) │
|
|
│ 2. The shared abx-dl runner executes hooks and the │
|
|
│ projector updates ArchiveResult rows from events │
|
|
│ 3. Advance through steps 0-9 as foreground hooks complete │
|
|
└─────────────────────────────────────────────────────────────┘
|
|
↓ tick() when is_finished()
|
|
┌─────────────────────────────────────────────────────────────┐
|
|
│ SEALED State → enter_sealed() │
|
|
│ • cleanup() → kills any background hooks still running │
|
|
│ • Set retry_at=None (no more processing) │
|
|
└─────────────────────────────────────────────────────────────┘
|
|
|
|
https://github.com/ArchiveBox/ArchiveBox/wiki/ArchiveBox-Architecture-Diagrams
|
|
"""
|
|
|
|
model_attr_name = "snapshot"
|
|
|
|
# States
|
|
queued = State(value=Snapshot.StatusChoices.QUEUED, initial=True)
|
|
started = State(value=Snapshot.StatusChoices.STARTED)
|
|
sealed = State(value=Snapshot.StatusChoices.SEALED, final=True)
|
|
|
|
# Tick Event (polled by workers)
|
|
tick = queued.to.itself(unless="can_start") | queued.to(started, cond="can_start") | started.to(sealed, cond="is_finished")
|
|
|
|
# Manual event (can also be triggered by last ArchiveResult finishing)
|
|
seal = started.to(sealed)
|
|
|
|
snapshot: Snapshot
|
|
|
|
def can_start(self) -> bool:
|
|
can_start = bool(self.snapshot.url)
|
|
return can_start
|
|
|
|
def is_finished(self) -> bool:
|
|
"""Check if all ArchiveResults for this snapshot are finished."""
|
|
return self.snapshot.is_finished_processing()
|
|
|
|
@queued.enter
|
|
def enter_queued(self):
|
|
self.snapshot.update_and_requeue(
|
|
retry_at=timezone.now(),
|
|
status=Snapshot.StatusChoices.QUEUED,
|
|
)
|
|
|
|
@started.enter
|
|
def enter_started(self):
|
|
"""Just mark as started. The shared runner creates ArchiveResults and runs hooks."""
|
|
self.snapshot.status = Snapshot.StatusChoices.STARTED
|
|
self.snapshot.retry_at = None # No more polling
|
|
self.snapshot.save(update_fields=["status", "retry_at", "modified_at"])
|
|
|
|
@sealed.enter
|
|
def enter_sealed(self):
|
|
import sys
|
|
|
|
# Clean up background hooks
|
|
self.snapshot.cleanup()
|
|
|
|
self.snapshot.update_and_requeue(
|
|
retry_at=None,
|
|
status=Snapshot.StatusChoices.SEALED,
|
|
)
|
|
|
|
print(f"[cyan] ✅ SnapshotMachine.enter_sealed() - sealed {self.snapshot.url}[/cyan]", file=sys.stderr)
|
|
|
|
# Check if this is the last snapshot for the parent crawl - if so, seal the crawl
|
|
if self.snapshot.crawl:
|
|
crawl = self.snapshot.crawl
|
|
remaining_active = Snapshot.objects.filter(
|
|
crawl=crawl,
|
|
status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED],
|
|
).count()
|
|
|
|
if remaining_active == 0 and crawl.status == crawl.StatusChoices.STARTED:
|
|
print(f"[cyan]🔒 All snapshots sealed for crawl {crawl.id}, sealing crawl[/cyan]", file=sys.stderr)
|
|
# Seal the parent crawl
|
|
cast(Any, crawl).sm.seal()
|
|
|
|
|
|
class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes):
|
|
class StatusChoices(models.TextChoices):
|
|
QUEUED = "queued", "Queued"
|
|
STARTED = "started", "Started"
|
|
BACKOFF = "backoff", "Waiting to retry"
|
|
SUCCEEDED = "succeeded", "Succeeded"
|
|
FAILED = "failed", "Failed"
|
|
SKIPPED = "skipped", "Skipped"
|
|
NORESULTS = "noresults", "No Results"
|
|
|
|
INITIAL_STATE = StatusChoices.QUEUED
|
|
ACTIVE_STATE = StatusChoices.STARTED
|
|
FINAL_STATES = (
|
|
StatusChoices.SUCCEEDED,
|
|
StatusChoices.FAILED,
|
|
StatusChoices.SKIPPED,
|
|
StatusChoices.NORESULTS,
|
|
)
|
|
FINAL_OR_ACTIVE_STATES = (*FINAL_STATES, ACTIVE_STATE)
|
|
|
|
@classmethod
|
|
def get_plugin_choices(cls):
|
|
"""Get plugin choices from discovered hooks (for forms/admin)."""
|
|
plugins = [get_plugin_name(e) for e in get_plugins()]
|
|
return tuple((e, e) for e in plugins)
|
|
|
|
# UUID primary key (migrated from integer in 0029)
|
|
id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
|
|
created_at = models.DateTimeField(default=timezone.now, db_index=True)
|
|
modified_at = models.DateTimeField(auto_now=True)
|
|
|
|
snapshot: Snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE) # type: ignore
|
|
# No choices= constraint - plugin names come from plugin system and can be any string
|
|
plugin = models.CharField(max_length=32, blank=False, null=False, db_index=True, default="")
|
|
hook_name = models.CharField(
|
|
max_length=255,
|
|
blank=True,
|
|
default="",
|
|
db_index=True,
|
|
help_text="Full filename of the hook that executed (e.g., on_Snapshot__50_wget.py)",
|
|
)
|
|
|
|
# Process FK - tracks execution details (cmd, pwd, stdout, stderr, etc.)
|
|
# Added POST-v0.9.0, will be added in a separate migration
|
|
process = models.OneToOneField(
|
|
"machine.Process",
|
|
on_delete=models.PROTECT,
|
|
null=True,
|
|
blank=True,
|
|
related_name="archiveresult",
|
|
help_text="Process execution details for this archive result",
|
|
)
|
|
|
|
# New output fields (replacing old 'output' field)
|
|
output_str = models.TextField(blank=True, default="", help_text="Human-readable output summary")
|
|
output_json = models.JSONField(null=True, blank=True, default=None, help_text="Structured metadata (headers, redirects, etc.)")
|
|
output_files = models.JSONField(default=dict, help_text="Dict of {relative_path: {metadata}}")
|
|
output_size = models.BigIntegerField(default=0, help_text="Total bytes of all output files")
|
|
output_mimetypes = models.CharField(max_length=512, blank=True, default="", help_text="CSV of mimetypes sorted by size")
|
|
|
|
start_ts = models.DateTimeField(default=None, null=True, blank=True)
|
|
end_ts = models.DateTimeField(default=None, null=True, blank=True)
|
|
|
|
status = models.CharField(max_length=16, choices=StatusChoices.choices, default=StatusChoices.QUEUED, db_index=True)
|
|
notes = models.TextField(blank=True, null=False, default="")
|
|
# output_dir is computed via @property from snapshot.output_dir / plugin
|
|
|
|
snapshot_id: uuid.UUID
|
|
process_id: uuid.UUID | None
|
|
|
|
class Meta(
|
|
ModelWithOutputDir.Meta,
|
|
ModelWithConfig.Meta,
|
|
ModelWithNotes.Meta,
|
|
):
|
|
app_label = "core"
|
|
verbose_name = "Archive Result"
|
|
verbose_name_plural = "Archive Results Log"
|
|
indexes = [
|
|
models.Index(fields=["snapshot", "status"], name="archiveresult_snap_status_idx"),
|
|
]
|
|
|
|
def __str__(self):
|
|
return f"[{self.id}] {self.snapshot.url[:64]} -> {self.plugin}"
|
|
|
|
@property
|
|
def created_by(self):
|
|
"""Convenience property to access the user who created this archive result via its snapshot's crawl."""
|
|
return self.snapshot.crawl.created_by
|
|
|
|
def to_json(self) -> dict:
|
|
"""
|
|
Convert ArchiveResult model instance to a JSON-serializable dict.
|
|
"""
|
|
from archivebox.config import VERSION
|
|
|
|
record = {
|
|
"type": "ArchiveResult",
|
|
"schema_version": VERSION,
|
|
"id": str(self.id),
|
|
"snapshot_id": str(self.snapshot_id),
|
|
"plugin": self.plugin,
|
|
"hook_name": self.hook_name,
|
|
"status": self.status,
|
|
"output_str": self.output_str,
|
|
"start_ts": self.start_ts.isoformat() if self.start_ts else None,
|
|
"end_ts": self.end_ts.isoformat() if self.end_ts else None,
|
|
}
|
|
# Include optional fields if set
|
|
if self.output_json:
|
|
record["output_json"] = self.output_json
|
|
if self.output_files:
|
|
record["output_files"] = self.output_files
|
|
if self.output_size:
|
|
record["output_size"] = self.output_size
|
|
if self.output_mimetypes:
|
|
record["output_mimetypes"] = self.output_mimetypes
|
|
if self.cmd:
|
|
record["cmd"] = self.cmd
|
|
if self.cmd_version:
|
|
record["cmd_version"] = self.cmd_version
|
|
if self.process_id:
|
|
record["process_id"] = str(self.process_id)
|
|
return record
|
|
|
|
@staticmethod
|
|
def from_json(record: dict[str, Any], overrides: dict[str, Any] | None = None):
|
|
"""
|
|
Create/update ArchiveResult from JSON dict.
|
|
|
|
Args:
|
|
record: JSON dict with 'snapshot_id', 'plugin', etc.
|
|
overrides: Optional dict of field overrides
|
|
|
|
Returns:
|
|
ArchiveResult instance or None
|
|
"""
|
|
snapshot_id = record.get("snapshot_id")
|
|
plugin = record.get("plugin")
|
|
|
|
if not snapshot_id or not plugin:
|
|
return None
|
|
|
|
# Try to get existing by ID first
|
|
result_id = record.get("id")
|
|
if result_id:
|
|
try:
|
|
return ArchiveResult.objects.get(id=result_id)
|
|
except ArchiveResult.DoesNotExist:
|
|
pass
|
|
|
|
# Get or create by snapshot_id + plugin
|
|
try:
|
|
snapshot = Snapshot.objects.get(id=snapshot_id)
|
|
|
|
result, _ = ArchiveResult.objects.get_or_create(
|
|
snapshot=snapshot,
|
|
plugin=plugin,
|
|
defaults={
|
|
"hook_name": record.get("hook_name", ""),
|
|
"status": record.get("status", "queued"),
|
|
"output_str": record.get("output_str", ""),
|
|
},
|
|
)
|
|
return result
|
|
except Snapshot.DoesNotExist:
|
|
return None
|
|
|
|
def save(self, *args, **kwargs):
|
|
# Skip ModelWithOutputDir.save() to avoid creating index.json in plugin directories
|
|
# Call the Django Model.save() directly instead
|
|
models.Model.save(self, *args, **kwargs)
|
|
|
|
# if is_new:
|
|
# from archivebox.misc.logging_util import log_worker_event
|
|
# log_worker_event(
|
|
# worker_type='DB',
|
|
# event='Created ArchiveResult',
|
|
# indent_level=3,
|
|
# plugin=self.plugin,
|
|
# metadata={
|
|
# 'id': str(self.id),
|
|
# 'snapshot_id': str(self.snapshot_id),
|
|
# 'snapshot_url': str(self.snapshot.url)[:64],
|
|
# 'status': self.status,
|
|
# },
|
|
# )
|
|
|
|
@cached_property
|
|
def snapshot_dir(self):
|
|
return Path(self.snapshot.output_dir)
|
|
|
|
@cached_property
|
|
def url(self):
|
|
return self.snapshot.url
|
|
|
|
@property
|
|
def api_url(self) -> str:
|
|
return str(reverse_lazy("api-1:get_archiveresult", args=[self.id]))
|
|
|
|
def get_absolute_url(self):
|
|
return f"/{self.snapshot.archive_path}/{self.plugin}"
|
|
|
|
def reset_for_retry(self, *, save: bool = True) -> None:
|
|
self.status = self.StatusChoices.QUEUED
|
|
self.output_str = ""
|
|
self.output_json = None
|
|
self.output_files = {}
|
|
self.output_size = 0
|
|
self.output_mimetypes = ""
|
|
self.start_ts = None
|
|
self.end_ts = None
|
|
if save:
|
|
self.save(
|
|
update_fields=[
|
|
"status",
|
|
"output_str",
|
|
"output_json",
|
|
"output_files",
|
|
"output_size",
|
|
"output_mimetypes",
|
|
"start_ts",
|
|
"end_ts",
|
|
"modified_at",
|
|
],
|
|
)
|
|
|
|
@property
|
|
def plugin_module(self) -> Any | None:
|
|
# Hook scripts are now used instead of Python plugin modules
|
|
# The plugin name maps to hooks in abx_plugins/plugins/{plugin}/
|
|
return None
|
|
|
|
@staticmethod
|
|
def _normalize_output_files(raw_output_files: Any) -> dict[str, dict[str, Any]]:
|
|
from abx_dl.output_files import guess_mimetype
|
|
|
|
def _enrich_metadata(path: str, metadata: dict[str, Any]) -> dict[str, Any]:
|
|
normalized = dict(metadata)
|
|
if "extension" not in normalized:
|
|
normalized["extension"] = Path(path).suffix.lower().lstrip(".")
|
|
if "mimetype" not in normalized:
|
|
guessed = guess_mimetype(path)
|
|
if guessed:
|
|
normalized["mimetype"] = guessed
|
|
return normalized
|
|
|
|
if raw_output_files is None:
|
|
return {}
|
|
if isinstance(raw_output_files, str):
|
|
try:
|
|
raw_output_files = json.loads(raw_output_files)
|
|
except json.JSONDecodeError:
|
|
return {}
|
|
if isinstance(raw_output_files, dict):
|
|
normalized: dict[str, dict[str, Any]] = {}
|
|
for path, metadata in raw_output_files.items():
|
|
if not path:
|
|
continue
|
|
metadata_dict = dict(metadata) if isinstance(metadata, dict) else {}
|
|
metadata_dict.pop("path", None)
|
|
normalized[str(path)] = _enrich_metadata(str(path), metadata_dict)
|
|
return normalized
|
|
if isinstance(raw_output_files, (list, tuple, set)):
|
|
normalized: dict[str, dict[str, Any]] = {}
|
|
for item in raw_output_files:
|
|
if isinstance(item, str):
|
|
normalized[item] = _enrich_metadata(item, {})
|
|
continue
|
|
if not isinstance(item, dict):
|
|
continue
|
|
path = str(item.get("path") or "").strip()
|
|
if not path:
|
|
continue
|
|
normalized[path] = _enrich_metadata(
|
|
path,
|
|
{key: value for key, value in item.items() if key != "path" and value not in (None, "")},
|
|
)
|
|
return normalized
|
|
return {}
|
|
|
|
@staticmethod
|
|
def _coerce_output_file_size(value: Any) -> int:
|
|
try:
|
|
return max(int(value or 0), 0)
|
|
except (TypeError, ValueError):
|
|
return 0
|
|
|
|
def output_file_map(self) -> dict[str, dict[str, Any]]:
|
|
return self._normalize_output_files(self.output_files)
|
|
|
|
def output_file_paths(self) -> list[str]:
|
|
return list(self.output_file_map().keys())
|
|
|
|
def output_file_count(self) -> int:
|
|
return len(self.output_file_paths())
|
|
|
|
def output_size_from_files(self) -> int:
|
|
return sum(self._coerce_output_file_size(metadata.get("size")) for metadata in self.output_file_map().values())
|
|
|
|
def output_exists(self) -> bool:
|
|
return os.path.exists(Path(self.snapshot_dir) / self.plugin)
|
|
|
|
@staticmethod
|
|
def _looks_like_output_path(raw_output: str | None, plugin_name: str | None = None) -> bool:
|
|
value = str(raw_output or "").strip()
|
|
if value in ("", ".", "./", "/"):
|
|
return False
|
|
if plugin_name and value.startswith(f"{plugin_name}/"):
|
|
return True
|
|
if Path(value).is_absolute():
|
|
return True
|
|
if Path(value).suffix:
|
|
return True
|
|
if "/" in value and "\\" not in value and " " not in value:
|
|
left, _, right = value.partition("/")
|
|
if left and right and all(ch.isalnum() or ch in "+-." for ch in left + right):
|
|
return False
|
|
return False
|
|
|
|
def _existing_output_path(self, raw_output: str | None) -> str | None:
|
|
value = str(raw_output or "").strip()
|
|
if not value:
|
|
return None
|
|
|
|
output_path = Path(value)
|
|
snapshot_dir = Path(self.snapshot_dir).resolve(strict=False)
|
|
candidates: list[str] = []
|
|
|
|
if output_path.is_absolute():
|
|
try:
|
|
candidates.append(str(output_path.resolve(strict=False).relative_to(snapshot_dir)))
|
|
except (OSError, ValueError):
|
|
return None
|
|
elif value.startswith(f"{self.plugin}/"):
|
|
candidates.append(value)
|
|
elif len(output_path.parts) == 1:
|
|
candidates.append(f"{self.plugin}/{value}")
|
|
else:
|
|
candidates.append(value)
|
|
|
|
output_file_map = self.output_file_map()
|
|
hashes_index = self.snapshot.hashes_index
|
|
for relative_path in candidates:
|
|
if relative_path in hashes_index:
|
|
return relative_path
|
|
|
|
if relative_path in output_file_map:
|
|
return relative_path
|
|
|
|
plugin_relative = relative_path.removeprefix(f"{self.plugin}/")
|
|
if plugin_relative in output_file_map:
|
|
return relative_path
|
|
|
|
candidate = snapshot_dir / relative_path
|
|
try:
|
|
if candidate.is_file():
|
|
return relative_path
|
|
except OSError:
|
|
continue
|
|
|
|
return None
|
|
|
|
@staticmethod
|
|
def _fallback_output_file_path(
|
|
output_file_paths: Sequence[str],
|
|
plugin_name: str | None = None,
|
|
output_file_map: dict[str, dict[str, Any]] | None = None,
|
|
) -> str | None:
|
|
ignored = {"stdout.log", "stderr.log", "hook.pid", "listener.pid", "cmd.sh"}
|
|
candidates = [
|
|
path
|
|
for path in output_file_paths
|
|
if Path(path).name not in ignored and Path(path).suffix.lower() not in (".pid", ".log", ".sh")
|
|
]
|
|
if not candidates:
|
|
return None
|
|
|
|
output_file_map = output_file_map or {}
|
|
preferred_names = [
|
|
"index.html",
|
|
"index.htm",
|
|
"output.html",
|
|
"content.html",
|
|
"article.html",
|
|
"output.pdf",
|
|
"index.pdf",
|
|
"content.txt",
|
|
"output.txt",
|
|
"index.txt",
|
|
"index.md",
|
|
"index.json",
|
|
"article.json",
|
|
]
|
|
for preferred_name in preferred_names:
|
|
for candidate in candidates:
|
|
if Path(candidate).name.lower() == preferred_name:
|
|
return candidate
|
|
|
|
ext_groups = (
|
|
(".html", ".htm", ".pdf"),
|
|
(".png", ".jpg", ".jpeg", ".gif", ".webp", ".svg", ".ico"),
|
|
(".json", ".jsonl", ".txt", ".md", ".csv", ".tsv"),
|
|
(".mp4", ".webm", ".mp3", ".opus", ".ogg", ".wav"),
|
|
)
|
|
for ext_group in ext_groups:
|
|
group_candidates = [candidate for candidate in candidates if Path(candidate).suffix.lower() in ext_group]
|
|
if group_candidates:
|
|
return max(
|
|
group_candidates,
|
|
key=lambda path: ArchiveResult._coerce_output_file_size(output_file_map.get(path, {}).get("size")),
|
|
)
|
|
|
|
return None
|
|
|
|
@staticmethod
|
|
def _find_best_output_file(dir_path: Path, plugin_name: str | None = None) -> Path | None:
|
|
if not dir_path.exists() or not dir_path.is_dir():
|
|
return None
|
|
file_map: dict[str, dict[str, Any]] = {}
|
|
file_count = 0
|
|
max_scan = 500
|
|
for file_path in dir_path.rglob("*"):
|
|
file_count += 1
|
|
if file_count > max_scan:
|
|
break
|
|
if file_path.is_dir() or file_path.name.startswith("."):
|
|
continue
|
|
rel_path = str(file_path.relative_to(dir_path))
|
|
try:
|
|
size = file_path.stat().st_size
|
|
except OSError:
|
|
size = 0
|
|
file_map[rel_path] = {"size": size}
|
|
|
|
fallback_path = ArchiveResult._fallback_output_file_path(list(file_map.keys()), plugin_name, file_map)
|
|
if not fallback_path:
|
|
return None
|
|
return dir_path / fallback_path
|
|
|
|
def embed_path_db(self) -> str | None:
|
|
output_file_map = self.output_file_map()
|
|
|
|
if self.output_str:
|
|
raw_output = str(self.output_str).strip()
|
|
if self._looks_like_output_path(raw_output, self.plugin):
|
|
existing_output = self._existing_output_path(raw_output)
|
|
if existing_output:
|
|
return existing_output
|
|
|
|
output_file_paths = list(output_file_map.keys())
|
|
if output_file_paths:
|
|
fallback_path = self._fallback_output_file_path(output_file_paths, self.plugin, output_file_map)
|
|
if fallback_path:
|
|
return f"{self.plugin}/{fallback_path}"
|
|
|
|
return None
|
|
|
|
def embed_path(self) -> str | None:
|
|
"""
|
|
Get the relative path to the embeddable output file for this result.
|
|
|
|
This is intentionally DB-backed only so snapshot/admin rendering stays
|
|
fast and predictable without filesystem probes.
|
|
"""
|
|
return self.embed_path_db()
|
|
|
|
@property
|
|
def output_dir_name(self) -> str:
|
|
return self.plugin
|
|
|
|
@property
|
|
def output_dir_parent(self) -> str:
|
|
return str(Path(self.snapshot.output_dir).relative_to(CONSTANTS.DATA_DIR))
|
|
|
|
# Properties that delegate to Process model (for backwards compatibility)
|
|
# These properties will replace the direct fields after migration is complete
|
|
# They allow existing code to continue using archiveresult.pwd, .cmd, etc.
|
|
|
|
# Note: After migration 3 creates Process records and migration 5 removes the old fields,
|
|
# these properties provide seamless access to Process data through ArchiveResult
|
|
|
|
# Uncommented after migration 3 completed - properties now active
|
|
@property
|
|
def pwd(self) -> str:
|
|
"""Working directory (from Process)."""
|
|
return self.process.pwd if self.process_id else ""
|
|
|
|
@property
|
|
def cmd(self) -> list:
|
|
"""Command array (from Process)."""
|
|
return self.process.cmd if self.process_id else []
|
|
|
|
@property
|
|
def cmd_version(self) -> str:
|
|
"""Command version (from Process.binary)."""
|
|
return self.process.cmd_version if self.process_id else ""
|
|
|
|
@property
|
|
def binary(self):
|
|
"""Binary FK (from Process)."""
|
|
return self.process.binary if self.process_id else None
|
|
|
|
@property
|
|
def iface(self):
|
|
"""Network interface FK (from Process)."""
|
|
return self.process.iface if self.process_id else None
|
|
|
|
@property
|
|
def machine(self):
|
|
"""Machine FK (from Process)."""
|
|
return self.process.machine if self.process_id else None
|
|
|
|
@property
|
|
def timeout(self) -> int:
|
|
"""Timeout in seconds (from Process)."""
|
|
return self.process.timeout if self.process_id else 120
|
|
|
|
def save_search_index(self):
|
|
pass
|
|
|
|
def update_from_output(self):
|
|
"""
|
|
Update this ArchiveResult from filesystem logs and output files.
|
|
|
|
Used for Snapshot cleanup / orphan recovery when a hook's output exists
|
|
on disk but the projector did not finalize the row in the database.
|
|
|
|
Updates:
|
|
- status, output_str, output_json from ArchiveResult JSONL record
|
|
- output_files, output_size, output_mimetypes by walking filesystem
|
|
- end_ts, cmd, cmd_version, binary FK
|
|
- Processes side-effect records (Snapshot, Tag, etc.) via process_hook_records()
|
|
"""
|
|
from collections import defaultdict
|
|
from pathlib import Path
|
|
from django.utils import timezone
|
|
from abx_dl.output_files import guess_mimetype
|
|
from archivebox.hooks import process_hook_records, extract_records_from_process
|
|
from archivebox.machine.models import Process
|
|
|
|
plugin_dir = Path(self.pwd) if self.pwd else None
|
|
if not plugin_dir or not plugin_dir.exists():
|
|
self.status = self.StatusChoices.FAILED
|
|
self.output_str = "Output directory not found"
|
|
self.end_ts = timezone.now()
|
|
self.save()
|
|
return
|
|
|
|
# Read and parse JSONL output from stdout.log
|
|
stdout_file = plugin_dir / "stdout.log"
|
|
records = []
|
|
if self.process_id and self.process:
|
|
records = extract_records_from_process(self.process)
|
|
|
|
if not records:
|
|
stdout = stdout_file.read_text() if stdout_file.exists() else ""
|
|
records = Process.parse_records_from_text(stdout)
|
|
|
|
# Find ArchiveResult record and update status/output from it
|
|
ar_records = [r for r in records if r.get("type") == "ArchiveResult"]
|
|
if ar_records:
|
|
hook_data = ar_records[0]
|
|
|
|
# Update status
|
|
status_map = {
|
|
"succeeded": self.StatusChoices.SUCCEEDED,
|
|
"failed": self.StatusChoices.FAILED,
|
|
"skipped": self.StatusChoices.SKIPPED,
|
|
"noresults": self.StatusChoices.NORESULTS,
|
|
}
|
|
self.status = status_map.get(hook_data.get("status", "failed"), self.StatusChoices.FAILED)
|
|
|
|
# Update output fields
|
|
self.output_str = hook_data.get("output_str") or hook_data.get("output") or ""
|
|
self.output_json = hook_data.get("output_json")
|
|
|
|
# Update cmd fields
|
|
if hook_data.get("cmd"):
|
|
if self.process_id:
|
|
self.process.cmd = hook_data["cmd"]
|
|
self.process.save()
|
|
self._set_binary_from_cmd(hook_data["cmd"])
|
|
# Note: cmd_version is derived from binary.version, not stored on Process
|
|
else:
|
|
# No ArchiveResult record: treat background hooks or clean exits as skipped
|
|
is_background = False
|
|
try:
|
|
from archivebox.hooks import is_background_hook
|
|
|
|
is_background = bool(self.hook_name and is_background_hook(self.hook_name))
|
|
except Exception:
|
|
pass
|
|
|
|
if is_background or (self.process_id and self.process and self.process.exit_code == 0):
|
|
self.status = self.StatusChoices.SKIPPED
|
|
self.output_str = "Hook did not output ArchiveResult record"
|
|
else:
|
|
self.status = self.StatusChoices.FAILED
|
|
self.output_str = "Hook did not output ArchiveResult record"
|
|
|
|
# Walk filesystem and populate output_files, output_size, output_mimetypes
|
|
exclude_names = {"stdout.log", "stderr.log", "process.pid", "hook.pid", "listener.pid", "cmd.sh"}
|
|
mime_sizes = defaultdict(int)
|
|
total_size = 0
|
|
output_files = {}
|
|
|
|
for file_path in plugin_dir.rglob("*"):
|
|
if not file_path.is_file():
|
|
continue
|
|
if ".hooks" in file_path.parts:
|
|
continue
|
|
if file_path.name in exclude_names:
|
|
continue
|
|
|
|
try:
|
|
stat = file_path.stat()
|
|
mime_type = guess_mimetype(file_path) or "application/octet-stream"
|
|
|
|
relative_path = str(file_path.relative_to(plugin_dir))
|
|
output_files[relative_path] = {
|
|
"extension": file_path.suffix.lower().lstrip("."),
|
|
"mimetype": mime_type,
|
|
"size": stat.st_size,
|
|
}
|
|
mime_sizes[mime_type] += stat.st_size
|
|
total_size += stat.st_size
|
|
except OSError:
|
|
continue
|
|
|
|
self.output_files = output_files
|
|
self.output_size = total_size
|
|
sorted_mimes = sorted(mime_sizes.items(), key=lambda x: x[1], reverse=True)
|
|
self.output_mimetypes = ",".join(mime for mime, _ in sorted_mimes)
|
|
|
|
# Update timestamps
|
|
self.end_ts = timezone.now()
|
|
|
|
self.save()
|
|
|
|
# Process side-effect records (filter Snapshots for depth/URL)
|
|
filtered_records = []
|
|
for record in records:
|
|
record_type = record.get("type")
|
|
|
|
# Skip ArchiveResult records (already processed above)
|
|
if record_type == "ArchiveResult":
|
|
continue
|
|
|
|
# Filter Snapshot records for depth/URL constraints
|
|
if record_type == "Snapshot":
|
|
url = record.get("url")
|
|
if not url:
|
|
continue
|
|
|
|
depth = record.get("depth", self.snapshot.depth + 1)
|
|
if depth > self.snapshot.crawl.max_depth:
|
|
continue
|
|
|
|
if not self._url_passes_filters(url):
|
|
continue
|
|
|
|
filtered_records.append(record)
|
|
|
|
# Process filtered records with unified dispatcher
|
|
overrides = {
|
|
"snapshot": self.snapshot,
|
|
"crawl": self.snapshot.crawl,
|
|
"created_by_id": self.created_by.pk,
|
|
}
|
|
process_hook_records(filtered_records, overrides=overrides)
|
|
|
|
# Cleanup PID files (keep logs even if empty so they can be tailed)
|
|
pid_file = plugin_dir / "hook.pid"
|
|
pid_file.unlink(missing_ok=True)
|
|
|
|
def _set_binary_from_cmd(self, cmd: list) -> None:
|
|
"""
|
|
Find Binary for command and set binary FK.
|
|
|
|
Tries matching by absolute path first, then by binary name.
|
|
Only matches binaries on the current machine.
|
|
"""
|
|
if not cmd:
|
|
return
|
|
|
|
from archivebox.machine.models import Machine
|
|
|
|
bin_path_or_name = cmd[0] if isinstance(cmd, list) else cmd
|
|
machine = Machine.current()
|
|
|
|
# Try matching by absolute path first
|
|
binary = Binary.objects.filter(
|
|
abspath=bin_path_or_name,
|
|
machine=machine,
|
|
).first()
|
|
|
|
if binary:
|
|
if self.process_id:
|
|
self.process.binary = binary
|
|
self.process.save()
|
|
return
|
|
|
|
# Fallback: match by binary name
|
|
bin_name = Path(bin_path_or_name).name
|
|
binary = Binary.objects.filter(
|
|
name=bin_name,
|
|
machine=machine,
|
|
).first()
|
|
|
|
if binary:
|
|
if self.process_id:
|
|
self.process.binary = binary
|
|
self.process.save()
|
|
|
|
def _url_passes_filters(self, url: str) -> bool:
|
|
"""Check if URL passes URL_ALLOWLIST and URL_DENYLIST config filters.
|
|
|
|
Uses proper config hierarchy: defaults -> file -> env -> machine -> user -> crawl -> snapshot
|
|
"""
|
|
return self.snapshot.crawl.url_passes_filters(url, snapshot=self.snapshot)
|
|
|
|
@property
|
|
def output_dir(self) -> Path:
|
|
"""Get the output directory for this plugin's results."""
|
|
return Path(self.snapshot.output_dir) / self.plugin
|
|
|
|
|
|
# =============================================================================
|
|
# State Machine Registration
|
|
# =============================================================================
|
|
|
|
# Manually register state machines with python-statemachine registry
|
|
# (normally auto-discovered from statemachines.py, but we define them here for clarity)
|
|
registry.register(SnapshotMachine)
|