Fix #1139: Return tags as a JSON list in Snapshot.to_dict() for LLM/RAG integration

Previously, `archivebox search --json` exported tags as a comma-separated
string (e.g. "tag1,tag2"), which required manual parsing by consumers like
LlamaIndex, LangChain, and other RAG frameworks.

Now `to_dict()` returns tags as a proper JSON array of tag names in sorted
order (e.g. ["tag1", "tag2"]), making the export deterministic and directly
usable as structured metadata in LLM/RAG pipelines without additional
preprocessing.

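`from_json()` is updated to accept `tags` either as a list of strings or as
the legacy separator-delimited string (split with `TAG_SEPARATOR_PATTERN`),
for backward compatibility with existing JSON imports. In both cases each tag
is stripped of surrounding whitespace, and empty or duplicate tags are dropped.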

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Your Name
2026-02-20 21:21:38 -08:00
parent a0be8fe771
commit 08b0dfaf12

View File

@@ -1635,12 +1635,14 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
) )
print(f"[red]⚠️ Snapshot.from_json auto-created new crawl {crawl.id} for url={url}[/red]", file=sys.stderr) print(f"[red]⚠️ Snapshot.from_json auto-created new crawl {crawl.id} for url={url}[/red]", file=sys.stderr)
# Parse tags # Parse tags (accept either a list ["tag1", "tag2"] or a comma-separated string "tag1,tag2")
tags_str = record.get('tags', '') tags_raw = record.get('tags', '')
tag_list = [] tag_list = []
if tags_str: if isinstance(tags_raw, list):
tag_list = list(dict.fromkeys(tag.strip() for tag in tags_raw if tag.strip()))
elif tags_raw:
tag_list = list(dict.fromkeys( tag_list = list(dict.fromkeys(
tag.strip() for tag in re.split(GENERAL_CONFIG.TAG_SEPARATOR_PATTERN, tags_str) tag.strip() for tag in re.split(GENERAL_CONFIG.TAG_SEPARATOR_PATTERN, tags_raw)
if tag.strip() if tag.strip()
)) ))
@@ -2073,7 +2075,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
'url': self.url, 'url': self.url,
'timestamp': self.timestamp, 'timestamp': self.timestamp,
'title': self.title, 'title': self.title,
'tags': self.tags_str(), 'tags': sorted(tag.name for tag in self.tags.all()),
'downloaded_at': self.downloaded_at.isoformat() if self.downloaded_at else None, 'downloaded_at': self.downloaded_at.isoformat() if self.downloaded_at else None,
'bookmarked_at': self.bookmarked_at.isoformat() if self.bookmarked_at else None, 'bookmarked_at': self.bookmarked_at.isoformat() if self.bookmarked_at else None,
'created_at': self.created_at.isoformat() if self.created_at else None, 'created_at': self.created_at.isoformat() if self.created_at else None,