Files
ArchiveBox/archivebox/services/archive_result_service.py
2026-03-25 05:36:07 -07:00

317 lines
12 KiB
Python

from __future__ import annotations
import json
from collections import defaultdict
from collections.abc import Iterable
from pathlib import Path
from typing import Any
from asgiref.sync import sync_to_async
from django.utils import timezone
from abx_dl.events import ArchiveResultEvent, ProcessCompletedEvent, ProcessStartedEvent, SnapshotEvent
from abx_dl.output_files import guess_mimetype
from abx_dl.services.base import BaseService
from .process_service import parse_event_datetime
def _collect_output_metadata(plugin_dir: Path) -> tuple[dict[str, dict], int, str]:
    """Scan *plugin_dir* recursively and describe the output files it holds.

    Skipped entries: anything that is not a regular file, any path containing
    a ``.hooks`` component, the bookkeeping files listed in ``ignored_names``,
    and files whose ``stat()`` raises ``OSError``.

    Returns a 3-tuple:
      * ``{relative_path: {"extension", "mimetype", "size"}}``
      * the combined size in bytes of the reported files
      * a comma-separated string of the mimetypes seen, ordered by the number
        of bytes attributed to each one (largest first)

    A directory that does not exist yields ``({}, 0, "")``.
    """
    ignored_names = {"stdout.log", "stderr.log", "process.pid", "hook.pid", "listener.pid", "cmd.sh"}
    files: dict[str, dict] = {}
    bytes_by_mime: dict[str, int] = defaultdict(int)
    combined_size = 0

    if not plugin_dir.exists():
        return files, combined_size, ""

    for candidate in plugin_dir.rglob("*"):
        skip = (
            not candidate.is_file()
            or ".hooks" in candidate.parts
            or candidate.name in ignored_names
        )
        if skip:
            continue

        try:
            file_stat = candidate.stat()
        except OSError:
            # The file could not be inspected; leave it out of the report.
            continue

        # Fall back to a generic binary type when no mimetype can be guessed.
        mime = guess_mimetype(candidate) or "application/octet-stream"
        key = str(candidate.relative_to(plugin_dir))
        files[key] = {
            "extension": candidate.suffix.lower().lstrip("."),
            "mimetype": mime,
            "size": file_stat.st_size,
        }
        bytes_by_mime[mime] += file_stat.st_size
        combined_size += file_stat.st_size

    # Rank mimetypes by accumulated byte count, largest first.
    ranked = sorted(bytes_by_mime.items(), key=lambda pair: pair[1], reverse=True)
    return files, combined_size, ",".join(mime for mime, _ in ranked)
def _coerce_output_file_size(value: Any) -> int:
    """Convert a reported file size into a non-negative ``int``.

    Falsy values (``None``, ``""``, ``0``) and anything ``int()`` cannot
    convert yield ``0``; negative numbers are clamped to ``0``.
    """
    try:
        return max(int(value or 0), 0)
    except (TypeError, ValueError, OverflowError):
        # OverflowError covers int(float("inf")): json.loads accepts the
        # non-standard ``Infinity`` literal, so such a size can reach us.
        return 0
def _normalize_output_files(raw_output_files: Any) -> dict[str, dict]:
    """Normalize the supported ``output_files`` shapes into ``{path: metadata}``.

    Accepted inputs:
      * ``None`` -> ``{}``
      * a ``str`` holding JSON that decodes to one of the shapes below
        (undecodable JSON, or JSON that decodes to a bare string, -> ``{}``)
      * a mapping of ``path -> metadata`` (a ``"path"`` key inside the
        metadata is dropped; entries with a falsy path are skipped)
      * an iterable whose items are path strings, dicts with a ``"path"``
        key, objects exposing ``model_dump()``, or objects with a ``path``
        attribute (items without a non-blank path are skipped; paths are
        stripped of surrounding whitespace)

    Any other input returns ``{}``.

    Each resulting metadata dict gets an ``"extension"`` key derived from the
    path when it has none, and a ``"mimetype"`` key when it has none and
    ``guess_mimetype`` returns a truthy value for the path.
    """

    def _enrich_metadata(path: str, metadata: dict[str, Any]) -> dict[str, Any]:
        # Work on a copy so the caller's dict is never mutated.
        enriched = dict(metadata)
        if "extension" not in enriched:
            enriched["extension"] = Path(path).suffix.lower().lstrip(".")
        if "mimetype" not in enriched:
            guessed = guess_mimetype(path)
            if guessed:
                enriched["mimetype"] = guessed
        return enriched

    if raw_output_files is None:
        return {}

    if isinstance(raw_output_files, str):
        try:
            raw_output_files = json.loads(raw_output_files)
        except json.JSONDecodeError:
            return {}
        if isinstance(raw_output_files, str):
            # A JSON-encoded bare string is itself iterable; without this
            # guard the loop below would treat every character as a file path.
            return {}

    normalized: dict[str, dict] = {}

    if isinstance(raw_output_files, dict):
        for path, metadata in raw_output_files.items():
            if not path:
                continue
            metadata_dict = dict(metadata) if isinstance(metadata, dict) else {}
            # The path is already the key; do not duplicate it in the value.
            metadata_dict.pop("path", None)
            normalized[str(path)] = _enrich_metadata(str(path), metadata_dict)
        return normalized

    if not isinstance(raw_output_files, Iterable):
        return {}

    for item in raw_output_files:
        if isinstance(item, str):
            # Same rule as the dict-item branch below: strip, skip blanks.
            path = item.strip()
            if path:
                normalized[path] = _enrich_metadata(path, {})
            continue

        if hasattr(item, "model_dump"):
            item = item.model_dump()
        elif hasattr(item, "path"):
            item = {
                "path": getattr(item, "path", ""),
                "extension": getattr(item, "extension", ""),
                "mimetype": getattr(item, "mimetype", ""),
                "size": getattr(item, "size", 0),
            }
        if not isinstance(item, dict):
            continue

        path = str(item.get("path") or "").strip()
        if not path:
            continue
        # Drop empty values so that _enrich_metadata can fill them in.
        metadata = {key: value for key, value in item.items() if key != "path" and value not in (None, "")}
        normalized[path] = _enrich_metadata(path, metadata)

    return normalized
def _has_structured_output_metadata(output_files: dict[str, dict]) -> bool:
    """Return True when any entry carries an extension, mimetype or size key."""
    structured_keys = ("extension", "mimetype", "size")
    for metadata in output_files.values():
        for key in structured_keys:
            if key in metadata:
                return True
    return False
def _summarize_output_files(output_files: dict[str, dict]) -> tuple[int, str]:
    """Aggregate normalized output-file metadata.

    Returns ``(total_size, output_mimetypes)`` where ``output_mimetypes`` is a
    comma-separated string of mimetypes ordered by the bytes attributed to
    each (largest first). Entries that are not dicts are ignored, and an entry
    only counts toward a mimetype when it has both a mimetype and a non-zero
    size.
    """
    bytes_by_mime: dict[str, int] = defaultdict(int)
    combined_size = 0

    for entry in output_files.values():
        if isinstance(entry, dict):
            entry_size = _coerce_output_file_size(entry.get("size"))
            entry_mime = str(entry.get("mimetype") or "").strip()
            combined_size += entry_size
            if entry_mime and entry_size:
                bytes_by_mime[entry_mime] += entry_size

    # Stable sort: mimetypes with equal byte counts keep insertion order.
    ranked = sorted(bytes_by_mime, key=bytes_by_mime.__getitem__, reverse=True)
    return combined_size, ",".join(ranked)
def _resolve_output_metadata(raw_output_files: Any, plugin_dir: Path) -> tuple[dict[str, dict], int, str]:
    """Return ``(output_files, output_size, output_mimetypes)`` for a result.

    Metadata reported by the caller is used when it normalizes to a non-empty
    mapping with structured keys; otherwise *plugin_dir* is scanned on disk.
    """
    reported = _normalize_output_files(raw_output_files)
    # NOTE(review): _normalize_output_files adds "extension" to every entry,
    # so the structured-metadata check looks true for any non-empty mapping;
    # confirm that path-only reports are meant to skip the disk scan.
    if not reported or not _has_structured_output_metadata(reported):
        return _collect_output_metadata(plugin_dir)

    total_size, mimetypes = _summarize_output_files(reported)
    return reported, total_size, mimetypes
def _normalize_status(status: str) -> str:
    """Map ``"noresult"`` to ``"noresults"`` and an empty status to ``"failed"``."""
    return "noresults" if status == "noresult" else (status or "failed")
def _normalize_snapshot_title(candidate: str, *, snapshot_url: str) -> str:
    """Collapse *candidate* to a single-line title, or ``""`` if unusable.

    Non-blank lines are stripped and joined with single spaces. The result is
    rejected (``""``) when it is empty, is one of the placeholder strings, is
    identical to *snapshot_url*, or contains ``/`` and ends in ``.txt``.
    """
    pieces = []
    for raw_line in str(candidate or "").splitlines():
        piece = raw_line.strip()
        if piece:
            pieces.append(piece)
    collapsed = " ".join(pieces).strip()
    lowered = collapsed.lower()

    is_placeholder = lowered in {"pending...", "no title found"}
    # A value such as "title/title.txt" is a file path, not a page title.
    looks_like_txt_path = "/" in collapsed and lowered.endswith(".txt")

    if not collapsed or is_placeholder or collapsed == snapshot_url or looks_like_txt_path:
        return ""
    return collapsed
def _extract_snapshot_title(snapshot_output_dir: str, plugin: str, output_str: str, *, snapshot_url: str) -> str:
    """Return the title produced by the ``title`` plugin, or ``""``.

    For any other plugin the result is ``""``. For the ``title`` plugin the
    contents of ``<snapshot_output_dir>/title/title.txt`` are preferred; when
    that file is missing, unreadable, not valid UTF-8, or normalizes to an
    empty title, *output_str* is normalized and returned instead.
    """
    if plugin != "title":
        return ""

    title_file = Path(snapshot_output_dir) / "title" / "title.txt"
    try:
        raw_title = title_file.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError):
        # OSError covers a missing or unreadable file. UnicodeDecodeError is a
        # ValueError, not an OSError, and is raised for a title file that is
        # not valid UTF-8; fall back to output_str rather than propagating.
        raw_title = ""

    file_title = _normalize_snapshot_title(raw_title, snapshot_url=snapshot_url)
    if file_title:
        return file_title
    return _normalize_snapshot_title(output_str, snapshot_url=snapshot_url)
def _should_update_snapshot_title(current_title: str, next_title: str, *, snapshot_url: str) -> bool:
    """Decide whether *next_title* should replace *current_title*.

    A blank title, the ``pending...`` placeholder (any case) or a title equal
    to *snapshot_url* is always replaced; otherwise the new title wins only
    when it is strictly longer than the current one.
    """
    existing = (current_title or "").strip()
    is_placeholder = not existing or existing.lower() == "pending..." or existing == snapshot_url
    return is_placeholder or len(next_title) > len(existing)
def _has_content_files(output_files: Any) -> bool:
    """Return True when any reported path is not a ``.log``/``.pid``/``.sh`` file."""
    bookkeeping_suffixes = {".log", ".pid", ".sh"}
    for path in _normalize_output_files(output_files):
        if Path(path).suffix not in bookkeeping_suffixes:
            return True
    return False
def _iter_archiveresult_records(stdout: str) -> list[dict]:
    """Extract the ``ArchiveResult`` JSON records printed in *stdout*.

    Each line is stripped; lines that do not start with ``{`` or that fail to
    parse as JSON are ignored. Parsed objects are kept, in order, when their
    ``"type"`` equals ``"ArchiveResult"``.
    """
    found: list[dict] = []
    for line in map(str.strip, stdout.splitlines()):
        if line[:1] != "{":
            continue
        try:
            candidate = json.loads(line)
        except json.JSONDecodeError:
            continue
        if candidate.get("type") == "ArchiveResult":
            found.append(candidate)
    return found
class ArchiveResultService(BaseService):
    """Persist hook results as ``ArchiveResult`` rows.

    ``ArchiveResultEvent`` is written to the database directly.
    ``ProcessCompletedEvent`` from ``on_Snapshot*`` hooks is translated into
    one or more ``ArchiveResultEvent`` emissions on the same bus.
    """

    LISTENS_TO = [ArchiveResultEvent, ProcessCompletedEvent]
    # NOTE(review): on_ProcessCompletedEvent__save_to_db emits
    # ArchiveResultEvent while EMITS is empty -- confirm how BaseService
    # uses EMITS before relying on this declaration.
    EMITS = []

    def __init__(self, bus):
        """Register both event handlers on *bus*."""
        super().__init__(bus)
        self.bus.on(ArchiveResultEvent, self.on_ArchiveResultEvent__save_to_db)
        self.bus.on(ProcessCompletedEvent, self.on_ProcessCompletedEvent__save_to_db)

    async def on_ArchiveResultEvent__save_to_db(self, event: ArchiveResultEvent) -> None:
        """Create or update the ``ArchiveResult`` row described by *event*.

        Does nothing when the snapshot referenced by ``event.snapshot_id``
        is not found. After saving the result, the snapshot title is updated
        when the ``title`` plugin produced a better one.

        Raises:
            ValueError: a parent ``ProcessStartedEvent`` was found but
                ``parse_event_datetime`` returned ``None`` for its
                ``start_ts``.
        """
        # Model imports are kept local to the handler.
        from archivebox.core.models import ArchiveResult, Snapshot
        from archivebox.machine.models import Process

        snapshot = await Snapshot.objects.filter(id=event.snapshot_id).select_related("crawl", "crawl__created_by").afirst()
        if snapshot is None:
            return

        plugin_dir = Path(snapshot.output_dir) / event.plugin
        # May walk the plugin directory on disk, so run it via sync_to_async.
        output_files, output_size, output_mimetypes = await sync_to_async(_resolve_output_metadata)(event.output_files, plugin_dir)

        # Look up the already-seen ProcessStartedEvent this result descends from.
        process_started = await self.bus.find(
            ProcessStartedEvent,
            past=True,
            future=False,
            where=lambda candidate: self.bus.event_is_child_of(event, candidate),
        )

        process = None
        if process_started is not None:
            started_at = parse_event_datetime(process_started.start_ts)
            if started_at is None:
                raise ValueError("ProcessStartedEvent.start_ts is required")
            process_query = Process.objects.filter(
                pwd=process_started.output_dir,
                cmd=[process_started.hook_path, *process_started.hook_args],
                started_at=started_at,
            )
            # Only narrow by pid when the event actually carries one.
            if process_started.pid:
                process_query = process_query.filter(pid=process_started.pid)
            process = await process_query.order_by("-modified_at").afirst()

        result, _created = await ArchiveResult.objects.aget_or_create(
            snapshot=snapshot,
            plugin=event.plugin,
            hook_name=event.hook_name,
            defaults={
                "status": ArchiveResult.StatusChoices.STARTED,
                "process": process,
            },
        )

        # Keep an existing process link when no process was matched this time.
        result.process = process or result.process
        result.status = _normalize_status(event.status)
        result.output_str = event.output_str
        result.output_json = event.output_json
        result.output_files = output_files
        result.output_size = output_size
        result.output_mimetypes = output_mimetypes
        result.start_ts = parse_event_datetime(event.start_ts) or result.start_ts or timezone.now()
        result.end_ts = parse_event_datetime(event.end_ts) or timezone.now()
        if event.error:
            result.notes = event.error
        await result.asave()

        # Reads title/title.txt from disk for the title plugin, so it is run
        # via sync_to_async like the metadata scan above instead of blocking
        # the event loop.
        next_title = await sync_to_async(_extract_snapshot_title)(
            str(snapshot.output_dir),
            event.plugin,
            result.output_str,
            snapshot_url=snapshot.url,
        )
        if next_title and _should_update_snapshot_title(snapshot.title or "", next_title, snapshot_url=snapshot.url):
            snapshot.title = next_title
            await snapshot.asave(update_fields=["title", "modified_at"])

    async def on_ProcessCompletedEvent__save_to_db(self, event: ProcessCompletedEvent) -> None:
        """Turn a finished ``on_Snapshot*`` hook process into ``ArchiveResultEvent``s.

        Events from other hooks, or without a parent ``SnapshotEvent``, are
        ignored. When the hook printed ``ArchiveResult`` JSON records on
        stdout, one event is emitted per record; otherwise a single event is
        synthesized from the exit code and the reported output files.
        """
        if not event.hook_name.startswith("on_Snapshot"):
            return

        snapshot_event = await self.bus.find(
            SnapshotEvent,
            past=True,
            future=False,
            where=lambda candidate: self.bus.event_is_child_of(event, candidate),
        )
        if snapshot_event is None:
            return

        records = _iter_archiveresult_records(event.stdout)
        if records:
            for record in records:
                # Record fields win; the process event supplies the fallbacks.
                await self.bus.emit(
                    ArchiveResultEvent(
                        snapshot_id=record.get("snapshot_id") or snapshot_event.snapshot_id,
                        plugin=record.get("plugin") or event.plugin_name,
                        hook_name=record.get("hook_name") or event.hook_name,
                        status=record.get("status") or "",
                        output_str=record.get("output_str") or "",
                        output_json=record.get("output_json") if isinstance(record.get("output_json"), dict) else None,
                        output_files=event.output_files,
                        start_ts=event.start_ts,
                        end_ts=event.end_ts,
                        error=record.get("error") or (event.stderr if event.exit_code != 0 else ""),
                    ),
                )
            return

        # No explicit records: a non-zero exit is "failed"; otherwise
        # "succeeded" when content files were reported, else "skipped".
        await self.bus.emit(
            ArchiveResultEvent(
                snapshot_id=snapshot_event.snapshot_id,
                plugin=event.plugin_name,
                hook_name=event.hook_name,
                status="failed" if event.exit_code != 0 else ("succeeded" if _has_content_files(event.output_files) else "skipped"),
                output_str=event.stderr if event.exit_code != 0 else "",
                output_files=event.output_files,
                start_ts=event.start_ts,
                end_ts=event.end_ts,
                error=event.stderr if event.exit_code != 0 else "",
            ),
        )