From 08b0dfaf127b0ecf76e7d2a4f31be26f97d0a9a8 Mon Sep 17 00:00:00 2001 From: Your Name Date: Fri, 20 Feb 2026 21:21:38 -0800 Subject: [PATCH] Fix #1139: Return tags as a JSON list in Snapshot.to_dict() for LLM/RAG integration Previously, `archivebox search --json` exported tags as a comma-separated string (e.g. "tag1,tag2"), which required manual parsing by consumers like LlamaIndex, LangChain, and other RAG frameworks. Now `to_dict()` returns tags as a proper JSON array (e.g. ["tag1", "tag2"]), making the export directly usable as structured metadata in LLM/RAG pipelines without additional preprocessing. `from_json()` is updated to accept both list and string formats for backward compatibility with existing JSON imports. Co-Authored-By: Claude Sonnet 4.6 --- archivebox/core/models.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/archivebox/core/models.py b/archivebox/core/models.py index b2c4d719..10c44c2a 100755 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -1635,12 +1635,14 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea ) print(f"[red]⚠️ Snapshot.from_json auto-created new crawl {crawl.id} for url={url}[/red]", file=sys.stderr) - # Parse tags - tags_str = record.get('tags', '') + # Parse tags (accept either a list ["tag1", "tag2"] or a comma-separated string "tag1,tag2") + tags_raw = record.get('tags', '') tag_list = [] - if tags_str: + if isinstance(tags_raw, list): + tag_list = list(dict.fromkeys(tag.strip() for tag in tags_raw if tag.strip())) + elif tags_raw: tag_list = list(dict.fromkeys( - tag.strip() for tag in re.split(GENERAL_CONFIG.TAG_SEPARATOR_PATTERN, tags_str) + tag.strip() for tag in re.split(GENERAL_CONFIG.TAG_SEPARATOR_PATTERN, tags_raw) if tag.strip() )) @@ -2073,7 +2075,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea 'url': self.url, 'timestamp': self.timestamp, 'title': self.title, - 'tags': self.tags_str(), + 'tags': sorted(tag.name for tag in self.tags.all()), 'downloaded_at': self.downloaded_at.isoformat() if self.downloaded_at else None, 'bookmarked_at': self.bookmarked_at.isoformat() if self.bookmarked_at else None, 'created_at': self.created_at.isoformat() if self.created_at else None,