mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
Fix #1139: Return tags as a JSON list in Snapshot.to_dict() for LLM/RAG integration
Previously, `archivebox search --json` exported tags as a comma-separated string (e.g. "tag1,tag2"), which required manual parsing by consumers like LlamaIndex, LangChain, and other RAG frameworks. Now `to_dict()` returns tags as a proper JSON array (e.g. ["tag1", "tag2"]), making the export directly usable as structured metadata in LLM/RAG pipelines without additional preprocessing. `from_json()` is updated to accept both list and string formats for backward compatibility with existing JSON imports.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1635,12 +1635,14 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
)
|
||||
print(f"[red]⚠️ Snapshot.from_json auto-created new crawl {crawl.id} for url={url}[/red]", file=sys.stderr)
|
||||
|
||||
# Parse tags
|
||||
tags_str = record.get('tags', '')
|
||||
# Parse tags (accept either a list ["tag1", "tag2"] or a comma-separated string "tag1,tag2")
|
||||
tags_raw = record.get('tags', '')
|
||||
tag_list = []
|
||||
if tags_str:
|
||||
if isinstance(tags_raw, list):
|
||||
tag_list = list(dict.fromkeys(tag.strip() for tag in tags_raw if tag.strip()))
|
||||
elif tags_raw:
|
||||
tag_list = list(dict.fromkeys(
|
||||
tag.strip() for tag in re.split(GENERAL_CONFIG.TAG_SEPARATOR_PATTERN, tags_str)
|
||||
tag.strip() for tag in re.split(GENERAL_CONFIG.TAG_SEPARATOR_PATTERN, tags_raw)
|
||||
if tag.strip()
|
||||
))
|
||||
|
||||
@@ -2073,7 +2075,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
'url': self.url,
|
||||
'timestamp': self.timestamp,
|
||||
'title': self.title,
|
||||
'tags': self.tags_str(),
|
||||
'tags': sorted(tag.name for tag in self.tags.all()),
|
||||
'downloaded_at': self.downloaded_at.isoformat() if self.downloaded_at else None,
|
||||
'bookmarked_at': self.bookmarked_at.isoformat() if self.bookmarked_at else None,
|
||||
'created_at': self.created_at.isoformat() if self.created_at else None,
|
||||
|
||||
Reference in New Issue
Block a user