ArchiveBox/archivebox/misc/legacy.py
claude[bot] 762cddc8c5 fix: address PR review comments from cubic-dev-ai
- Add JSONL_INDEX_FILENAME to ALLOWED_IN_DATA_DIR for consistency
- Fix fallback logic in legacy.py to try JSON when JSONL parsing fails
- Replace bare except clauses with specific exception types
- Fix stdin double-consumption in archivebox_crawl.py
- Merge CLI --tag option with crawl tags in archivebox_snapshot.py
- Remove tautological mock tests (covered by integration tests)

Co-authored-by: Nick Sweeting <pirate@users.noreply.github.com>
2025-12-30 20:09:51 +00:00

"""
Legacy archive import utilities.
These functions are used to import data from old ArchiveBox archive formats
(JSON indexes, archive directory structures) into the new database.
This is separate from the hooks-based parser system which handles importing
new URLs from bookmark files, RSS feeds, etc.
"""
__package__ = 'archivebox.misc'

import os
import json
from pathlib import Path
from datetime import datetime, timezone
from typing import Iterator, TypedDict, List


class SnapshotDict(TypedDict, total=False):
    """
    Dictionary type representing a snapshot/link, compatible with Snapshot model fields.
    """
    url: str            # Required: the URL to archive
    timestamp: str      # Optional: unix timestamp string
    title: str          # Optional: page title
    tags: str           # Optional: comma-separated tags string
    sources: List[str]  # Optional: list of source file paths
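
# Illustrative example only (not part of the original module): a SnapshotDict
# as yielded by the parsers below. All field values are hypothetical.
_EXAMPLE_SNAPSHOT: SnapshotDict = {
    'url': 'https://example.com',
    'timestamp': '1700000000.0',
    'title': 'Example Domain',
    'tags': 'docs,example',
    'sources': ['sources/bookmarks.html'],
}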


def parse_json_main_index(out_dir: Path) -> Iterator[SnapshotDict]:
    """
    Parse links from the main JSON index file (index.json at the data directory root).
    This is used to recover links from old archive formats.
    """
    from archivebox.config import CONSTANTS

    index_path = out_dir / CONSTANTS.JSON_INDEX_FILENAME
    if not index_path.exists():
        return

    try:
        with open(index_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        links = data.get('links', [])
        for link in links:
            yield {
                'url': link.get('url', ''),
                'timestamp': link.get('timestamp', str(datetime.now(timezone.utc).timestamp())),
                'title': link.get('title'),
                'tags': link.get('tags', ''),
            }
    except (json.JSONDecodeError, KeyError, TypeError):
        return
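
# Illustrative sketch only (not part of the original module): the rough shape of
# the legacy main index that parse_json_main_index() above expects. The keys are
# taken from the .get() calls above; all values are hypothetical.
#
# {
#     "links": [
#         {
#             "url": "https://example.com",
#             "timestamp": "1700000000.0",
#             "title": "Example Domain",
#             "tags": "docs,example"
#         }
#     ]
# }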


def parse_json_links_details(out_dir: Path) -> Iterator[SnapshotDict]:
    """
    Parse links from individual snapshot index.jsonl/index.json files in archive directories.
    Walks through archive/*/index.jsonl and archive/*/index.json files to discover orphaned snapshots.
    Prefers index.jsonl (new format) over index.json (legacy format).
    """
    from archivebox.config import CONSTANTS

    archive_dir = out_dir / CONSTANTS.ARCHIVE_DIR_NAME
    if not archive_dir.exists():
        return

    for entry in os.scandir(archive_dir):
        if not entry.is_dir():
            continue

        # Try index.jsonl first (new format)
        jsonl_file = Path(entry.path) / CONSTANTS.JSONL_INDEX_FILENAME
        json_file = Path(entry.path) / CONSTANTS.JSON_INDEX_FILENAME

        link = None
        if jsonl_file.exists():
            try:
                with open(jsonl_file, 'r', encoding='utf-8') as f:
                    for line in f:
                        line = line.strip()
                        if line.startswith('{'):
                            record = json.loads(line)
                            if record.get('type') == 'Snapshot':
                                link = record
                                break
            except (json.JSONDecodeError, KeyError, TypeError):
                pass

        # Fall back to index.json (legacy format) if index.jsonl was missing or unparseable
        if link is None and json_file.exists():
            try:
                with open(json_file, 'r', encoding='utf-8') as f:
                    link = json.load(f)
            except (json.JSONDecodeError, KeyError, TypeError):
                pass

        if link:
            yield {
                'url': link.get('url', ''),
                'timestamp': link.get('timestamp', entry.name),
                'title': link.get('title'),
                'tags': link.get('tags', ''),
            }
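

# ---------------------------------------------------------------------------
# Illustrative usage sketch only (not part of the original module): shows how
# the two parsers above might be combined to list recoverable snapshots,
# de-duplicated by URL. The data-dir argument handling is a hypothetical example.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    import sys
    from itertools import chain

    data_dir = Path(sys.argv[1]) if len(sys.argv) > 1 else Path('.')

    seen: dict[str, SnapshotDict] = {}
    for snapshot in chain(parse_json_main_index(data_dir), parse_json_links_details(data_dir)):
        url = snapshot.get('url')
        if url:
            seen.setdefault(url, snapshot)

    for url, snapshot in seen.items():
        print(snapshot.get('timestamp', ''), url)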